597 files changed, 27238 insertions, 17075 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533 b/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533
new file mode 100644
index 000000000000..1b62230b33b9
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-i2c-devices-lm3533
@@ -0,0 +1,15 @@
+What:		/sys/bus/i2c/devices/.../output_hvled[n]
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Set the controlling backlight device for high-voltage current
+		sink HVLED[n] (n = 1, 2) (0, 1).
+
+What:		/sys/bus/i2c/devices/.../output_lvled[n]
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Set the controlling led device for low-voltage current sink
+		LVLED[n] (n = 1..5) (0..3).
diff --git a/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533 b/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533
new file mode 100644
index 000000000000..77cf7ac949af
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533
@@ -0,0 +1,48 @@
+What:		/sys/class/backlight/<backlight>/als_channel
+Date:		May 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Get the ALS output channel used as input in
+		ALS-current-control mode (0, 1), where
+
+		0 - out_current0 (backlight 0)
+		1 - out_current1 (backlight 1)
+
+What:		/sys/class/backlight/<backlight>/als_en
+Date:		May 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Enable ALS-current-control mode (0, 1).
+
+What:		/sys/class/backlight/<backlight>/id
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Get the id of this backlight (0, 1).
+
+What:		/sys/class/backlight/<backlight>/linear
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Set the brightness-mapping mode (0, 1), where
+
+		0 - exponential mode
+		1 - linear mode
+
+What:		/sys/class/backlight/<backlight>/pwm
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Set the PWM-input control mask (5 bits), where
+
+		bit 5 - PWM-input enabled in Zone 4
+		bit 4 - PWM-input enabled in Zone 3
+		bit 3 - PWM-input enabled in Zone 2
+		bit 2 - PWM-input enabled in Zone 1
+		bit 1 - PWM-input enabled in Zone 0
+		bit 0 - PWM-input enabled
diff --git a/Documentation/ABI/testing/sysfs-class-led-driver-lm3533 b/Documentation/ABI/testing/sysfs-class-led-driver-lm3533
new file mode 100644
index 000000000000..620ebb3b9baa
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-led-driver-lm3533
@@ -0,0 +1,65 @@
+What:		/sys/class/leds/<led>/als_channel
+Date:		May 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Set the ALS output channel to use as input in
+		ALS-current-control mode (1, 2), where
+
+		1 - out_current1
+		2 - out_current2
+
+What:		/sys/class/leds/<led>/als_en
+Date:		May 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Enable ALS-current-control mode (0, 1).
+
+What:		/sys/class/leds/<led>/falltime
+What:		/sys/class/leds/<led>/risetime
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Set the pattern generator fall and rise times (0..7), where
+
+		0 - 2048 us
+		1 - 262 ms
+		2 - 524 ms
+		3 - 1.049 s
+		4 - 2.097 s
+		5 - 4.194 s
+		6 - 8.389 s
+		7 - 16.78 s
+
+What:		/sys/class/leds/<led>/id
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Get the id of this led (0..3).
+
+What:		/sys/class/leds/<led>/linear
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Set the brightness-mapping mode (0, 1), where
+
+		0 - exponential mode
+		1 - linear mode
+
+What:		/sys/class/leds/<led>/pwm
+Date:		April 2012
+KernelVersion:	3.5
+Contact:	Johan Hovold <[email protected]>
+Description:
+		Set the PWM-input control mask (5 bits), where
+
+		bit 5 - PWM-input enabled in Zone 4
+		bit 4 - PWM-input enabled in Zone 3
+		bit 3 - PWM-input enabled in Zone 2
+		bit 2 - PWM-input enabled in Zone 1
+		bit 1 - PWM-input enabled in Zone 0
+		bit 0 - PWM-input enabled
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 9b1067afb224..dd88540bb995 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -184,12 +184,14 @@ behind this approach is that a cgroup that aggressively uses a shared
 page will eventually get charged for it (once it is uncharged from
 the cgroup that brought it in -- this will happen on memory pressure).
 
+But see section 8.2: when moving a task to another cgroup, its pages may
+be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
+
 Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used.
 When you do swapoff and make swapped-out pages of shmem(tmpfs) to
 be backed into memory in force, charges for pages are accounted against the
 caller of swapoff rather than the users of shmem.
 
-
 2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP)
 
 Swap Extension allows you to record charge for swap. A swapped-in page is
@@ -374,14 +376,15 @@ cgroup might have some charge associated with it, even though all
 tasks have migrated away from it. (because we charge against pages, not
 against tasks.)
 
-Such charges are freed or moved to their parent. At moving, both of RSS
-and CACHES are moved to parent.
-rmdir() may return -EBUSY if freeing/moving fails. See 5.1 also.
+We move the stats to root (if use_hierarchy==0) or parent (if
+use_hierarchy==1), and no change on the charge except uncharging
+from the child.
 
 Charges recorded in swap information is not updated at removal of cgroup.
 Recorded information is discarded and a cgroup which uses swap (swapcache)
 will be charged as a new owner of it.
 
+About use_hierarchy, see Section 6.
 
 5. Misc. interfaces.
 
@@ -394,13 +397,15 @@ will be charged as a new owner of it.
 
   Almost all pages tracked by this memory cgroup will be unmapped and freed.
   Some pages cannot be freed because they are locked or in-use. Such pages are
-  moved to parent and this cgroup will be empty. This may return -EBUSY if
-  VM is too busy to free/move all pages immediately.
+  moved to parent(if use_hierarchy==1) or root (if use_hierarchy==0) and this
+  cgroup will be empty.
 
   Typical use case of this interface is that calling this before rmdir().
   Because rmdir() moves all pages to parent, some out-of-use page caches can be
   moved to the parent. If you want to avoid that, force_empty will be useful.
 
+  About use_hierarchy, see Section 6.
+
 5.2 stat file
 
 memory.stat file includes following statistics
@@ -430,17 +435,10 @@ hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy
 hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to
 			hierarchy under which memory cgroup is.
 
-total_cache		- sum of all children's "cache"
-total_rss		- sum of all children's "rss"
-total_mapped_file	- sum of all children's "cache"
-total_pgpgin		- sum of all children's "pgpgin"
-total_pgpgout		- sum of all children's "pgpgout"
-total_swap		- sum of all children's "swap"
-total_inactive_anon	- sum of all children's "inactive_anon"
-total_active_anon	- sum of all children's "active_anon"
-total_inactive_file	- sum of all children's "inactive_file"
-total_active_file	- sum of all children's "active_file"
-total_unevictable	- sum of all children's "unevictable"
+total_<counter>		- # hierarchical version of <counter>, which in
+			addition to the cgroup's own value includes the
+			sum of all hierarchical children's values of
+			<counter>, i.e. total_cache
 
 # The following additional stats are dependent on CONFIG_DEBUG_VM.
 
@@ -622,8 +620,7 @@ memory cgroup.
   bit | what type of charges would be moved ?
  -----+------------------------------------------------------------------------
    0  | A charge of an anonymous page(or swap of it) used by the target task.
-      | Those pages and swaps must be used only by the target task. You must
-      | enable Swap Extension(see 2.4) to enable move of swap charges.
+      | You must enable Swap Extension(see 2.4) to enable move of swap charges.
  -----+------------------------------------------------------------------------
    1  | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory)
       | and swaps of tmpfs file) mmapped by the target task. Unlike the case of
@@ -636,8 +633,6 @@ memory cgroup.
 
 8.3 TODO
 
-- Implement madvise(2) to let users decide the vma to be moved or not to be
-  moved.
 - All of moving charge operations are done under cgroup_mutex. It's not good
   behavior to hold the mutex too long, so we may need some trick.
 
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
index f3c4ec3626a2..0c4a344e78fa 100644
--- a/Documentation/cgroups/resource_counter.txt
+++ b/Documentation/cgroups/resource_counter.txt
@@ -92,6 +92,14 @@ to work with it.
 
 	The _locked routines imply that the res_counter->lock is taken.
 
+ f. void res_counter_uncharge_until
+		(struct res_counter *rc, struct res_counter *top,
+		 unsinged long val)
+
+	Almost same as res_cunter_uncharge() but propagation of uncharge
+	stops when rc == top. This is useful when kill a res_coutner in
+	child cgroup.
+
  2.1 Other accounting routines
 
     There are more routines that may help you with common needs, like
diff --git a/Documentation/devicetree/bindings/gpio/gpio-mm-lantiq.txt b/Documentation/devicetree/bindings/gpio/gpio-mm-lantiq.txt
new file mode 100644
index 000000000000..f93d51478d5a
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/gpio-mm-lantiq.txt
@@ -0,0 +1,38 @@
+Lantiq SoC External Bus memory mapped GPIO controller
+
+By attaching hardware latches to the EBU it is possible to create output
+only gpios. This driver configures a special memory address, which when
+written to outputs 16 bit to the latches.
+
+The node describing the memory mapped GPIOs needs to be a child of the node
+describing the "lantiq,localbus".
+
+Required properties:
+- compatible : Should be "lantiq,gpio-mm-lantiq"
+- reg : Address and length of the register set for the device
+- #gpio-cells : Should be two.  The first cell is the pin number and
+  the second cell is used to specify optional parameters (currently
+  unused).
+- gpio-controller : Marks the device node as a gpio controller.
+
+Optional properties:
+- lantiq,shadow : The default value that we shall assume as already set on the
+  shift register cascade.
+
+Example:
+
+localbus@0 {
+	#address-cells = <2>;
+	#size-cells = <1>;
+	ranges = <0 0 0x0 0x3ffffff /* addrsel0 */
+		1 0 0x4000000 0x4000010>; /* addsel1 */
+	compatible = "lantiq,localbus", "simple-bus";
+
+	gpio_mm0: gpio@4000000 {
+		compatible = "lantiq,gpio-mm";
+		reg = <1 0x0 0x10>;
+		gpio-controller;
+		#gpio-cells = <2>;
+		lantiq,shadow = <0x77f>
+	};
+}
diff --git a/Documentation/devicetree/bindings/gpio/gpio-stp-xway.txt b/Documentation/devicetree/bindings/gpio/gpio-stp-xway.txt
new file mode 100644
index 000000000000..854de130a971
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/gpio-stp-xway.txt
@@ -0,0 +1,42 @@
+Lantiq SoC Serial To Parallel (STP) GPIO controller
+
+The Serial To Parallel (STP) is found on MIPS based Lantiq socs. It is a
+peripheral controller used to drive external shift register cascades. At most
+3 groups of 8 bits can be driven. The hardware is able to allow the DSL modem
+to drive the 2 LSBs of the cascade automatically.
+
+
+Required properties:
+- compatible : Should be "lantiq,gpio-stp-xway"
+- reg : Address and length of the register set for the device
+- #gpio-cells : Should be two.  The first cell is the pin number and
+  the second cell is used to specify optional parameters (currently
+  unused).
+- gpio-controller : Marks the device node as a gpio controller.
+
+Optional properties:
+- lantiq,shadow : The default value that we shall assume as already set on the
+  shift register cascade.
+- lantiq,groups : Set the 3 bit mask to select which of the 3 groups are enabled
+  in the shift register cascade.
+- lantiq,dsl : The dsl core can control the 2 LSBs of the gpio cascade. This 2 bit
+  property can enable this feature.
+- lantiq,phy1 : The gphy1 core can control 3 bits of the gpio cascade.
+- lantiq,phy2 : The gphy2 core can control 3 bits of the gpio cascade.
+- lantiq,rising : use rising instead of falling edge for the shift register
+
+Example:
+
+gpio1: stp@E100BB0 {
+	compatible = "lantiq,gpio-stp-xway";
+	reg = <0xE100BB0 0x40>;
+	#gpio-cells = <2>;
+	gpio-controller;
+
+	lantiq,shadow = <0xffff>;
+	lantiq,groups = <0x7>;
+	lantiq,dsl = <0x3>;
+	lantiq,phy1 = <0x7>;
+	lantiq,phy2 = <0x7>;
+	/* lantiq,rising; */
+};
diff --git a/Documentation/devicetree/bindings/iommu/nvidia,tegra20-gart.txt b/Documentation/devicetree/bindings/iommu/nvidia,tegra20-gart.txt
new file mode 100644
index 000000000000..099d9362ebc1
--- /dev/null
+++ b/Documentation/devicetree/bindings/iommu/nvidia,tegra20-gart.txt
@@ -0,0 +1,14 @@
+NVIDIA Tegra 20 GART
+
+Required properties:
+- compatible: "nvidia,tegra20-gart"
+- reg: Two pairs of cells specifying the physical address and size of
+  the memory controller registers and the GART aperture respectively.
+
+Example:
+
+	gart {
+		compatible = "nvidia,tegra20-gart";
+		reg = <0x7000f024 0x00000018	/* controller registers */
+		       0x58000000 0x02000000>;	/* GART aperture */
+	};
diff --git a/Documentation/devicetree/bindings/mfd/da9052-i2c.txt b/Documentation/devicetree/bindings/mfd/da9052-i2c.txt
new file mode 100644
index 000000000000..1857f4a6b9a9
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/da9052-i2c.txt
@@ -0,0 +1,60 @@
+* Dialog DA9052/53 Power Management Integrated Circuit (PMIC)
+
+Required properties:
+- compatible : Should be "dlg,da9052", "dlg,da9053-aa",
+			 "dlg,da9053-ab", or "dlg,da9053-bb"
+
+Sub-nodes:
+- regulators : Contain the regulator nodes. The DA9052/53 regulators are
+  bound using their names as listed below:
+
+    buck0     : regulator BUCK0
+    buck1     : regulator BUCK1
+    buck2     : regulator BUCK2
+    buck3     : regulator BUCK3
+    ldo4      : regulator LDO4
+    ldo5      : regulator LDO5
+    ldo6      : regulator LDO6
+    ldo7      : regulator LDO7
+    ldo8      : regulator LDO8
+    ldo9      : regulator LDO9
+    ldo10     : regulator LDO10
+    ldo11     : regulator LDO11
+    ldo12     : regulator LDO12
+    ldo13     : regulator LDO13
+
+  The bindings details of individual regulator device can be found in:
+  Documentation/devicetree/bindings/regulator/regulator.txt
+
+Examples:
+
+i2c@63fc8000 { /* I2C1 */
+	status = "okay";
+
+	pmic: dialog@48 {
+		compatible = "dlg,da9053-aa";
+		reg = <0x48>;
+
+		regulators {
+			buck0 {
+				regulator-min-microvolt = <500000>;
+				regulator-max-microvolt = <2075000>;
+			};
+
+			buck1 {
+				regulator-min-microvolt = <500000>;
+				regulator-max-microvolt = <2075000>;
+			};
+
+			buck2 {
+				regulator-min-microvolt = <925000>;
+				regulator-max-microvolt = <2500000>;
+			};
+
+			buck3 {
+				regulator-min-microvolt = <925000>;
+				regulator-max-microvolt = <2500000>;
+			};
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/mfd/tps65910.txt b/Documentation/devicetree/bindings/mfd/tps65910.txt
new file mode 100644
index 000000000000..645f5eaadb3f
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/tps65910.txt
@@ -0,0 +1,133 @@
+TPS65910 Power Management Integrated Circuit
+
+Required properties:
+- compatible: "ti,tps65910" or "ti,tps65911"
+- reg: I2C slave address
+- interrupts: the interrupt outputs of the controller
+- #gpio-cells: number of cells to describe a GPIO, this should be 2.
+  The first cell is the GPIO number.
+  The second cell is used to specify additional options <unused>.
+- gpio-controller: mark the device as a GPIO controller
+- #interrupt-cells: the number of cells to describe an IRQ, this should be 2.
+  The first cell is the IRQ number.
+  The second cell is the flags, encoded as the trigger masks from
+  Documentation/devicetree/bindings/interrupts.txt
+- regulators: This is the list of child nodes that specify the regulator
+  initialization data for defined regulators. Not all regulators for the given
+  device need to be present. The definition for each of these nodes is defined
+  using the standard binding for regulators found at
+  Documentation/devicetree/bindings/regulator/regulator.txt.
+
+  The valid names for regulators are:
+  tps65910: vrtc, vio, vdd1, vdd2, vdd3, vdig1, vdig2, vpll, vdac, vaux1,
+            vaux2, vaux33, vmmc
+  tps65911: vrtc, vio, vdd1, vdd3, vddctrl, ldo1, ldo2, ldo3, ldo4, ldo5,
+            ldo6, ldo7, ldo8
+
+Optional properties:
+- ti,vmbch-threshold: (tps65911) main battery charged threshold
+  comparator. (see VMBCH_VSEL in TPS65910 datasheet)
+- ti,vmbch2-threshold: (tps65911) main battery discharged threshold
+  comparator. (see VMBCH_VSEL in TPS65910 datasheet)
+- ti,en-gpio-sleep: enable sleep control for gpios
+  There should be 9 entries here, one for each gpio.
+
+Regulator Optional properties:
+- ti,regulator-ext-sleep-control: enable external sleep
+  control through external inputs [0 (not enabled), 1 (EN1), 2 (EN2) or 4(EN3)]
+  If this property is not defined, it defaults to 0 (not enabled).
+
+Example:
+
+	pmu: tps65910@d2 {
+		compatible = "ti,tps65910";
+		reg = <0xd2>;
+		interrupt-parent = <&intc>;
+		interrupts = < 0 118 0x04 >;
+
+		#gpio-cells = <2>;
+		gpio-controller;
+
+		#interrupt-cells = <2>;
+		interrupt-controller;
+
+		ti,vmbch-threshold = 0;
+		ti,vmbch2-threshold = 0;
+
+		ti,en-gpio-sleep = <0 0 1 0 0 0 0 0 0>;
+
+		regulators {
+			vdd1_reg: vdd1 {
+				regulator-min-microvolt = < 600000>;
+				regulator-max-microvolt = <1500000>;
+				regulator-always-on;
+				regulator-boot-on;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			vdd2_reg: vdd2 {
+				regulator-min-microvolt = < 600000>;
+				regulator-max-microvolt = <1500000>;
+				regulator-always-on;
+				regulator-boot-on;
+				ti,regulator-ext-sleep-control = <4>;
+			};
+			vddctrl_reg: vddctrl {
+				regulator-min-microvolt = < 600000>;
+				regulator-max-microvolt = <1400000>;
+				regulator-always-on;
+				regulator-boot-on;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			vio_reg: vio {
+				regulator-min-microvolt = <1500000>;
+				regulator-max-microvolt = <1800000>;
+				regulator-always-on;
+				regulator-boot-on;
+				ti,regulator-ext-sleep-control = <1>;
+			};
+			ldo1_reg: ldo1 {
+				regulator-min-microvolt = <1000000>;
+				regulator-max-microvolt = <3300000>;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			ldo2_reg: ldo2 {
+				regulator-min-microvolt = <1050000>;
+				regulator-max-microvolt = <1050000>;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			ldo3_reg: ldo3 {
+				regulator-min-microvolt = <1000000>;
+				regulator-max-microvolt = <3300000>;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			ldo4_reg: ldo4 {
+				regulator-min-microvolt = <1000000>;
+				regulator-max-microvolt = <3300000>;
+				regulator-always-on;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			ldo5_reg: ldo5 {
+				regulator-min-microvolt = <1000000>;
+				regulator-max-microvolt = <3300000>;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			ldo6_reg: ldo6 {
+				regulator-min-microvolt = <1200000>;
+				regulator-max-microvolt = <1200000>;
+				ti,regulator-ext-sleep-control = <0>;
+			};
+			ldo7_reg: ldo7 {
+				regulator-min-microvolt = <1200000>;
+				regulator-max-microvolt = <1200000>;
+				regulator-always-on;
+				regulator-boot-on;
+				ti,regulator-ext-sleep-control = <1>;
+			};
+			ldo8_reg: ldo8 {
+				regulator-min-microvolt = <1000000>;
+				regulator-max-microvolt = <3300000>;
+				regulator-always-on;
+				ti,regulator-ext-sleep-control = <1>;
+			};
+		};
+	};
diff --git a/Documentation/devicetree/bindings/mfd/twl6040.txt b/Documentation/devicetree/bindings/mfd/twl6040.txt
new file mode 100644
index 000000000000..bc67c6f424aa
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/twl6040.txt
@@ -0,0 +1,62 @@
+Texas Instruments TWL6040 family
+
+The TWL6040s are 8-channel high quality low-power audio codecs providing audio
+and vibra functionality on OMAP4+ platforms.
+They are connected ot the host processor via i2c for commands, McPDM for audio
+data and commands.
+
+Required properties:
+- compatible : Must be "ti,twl6040";
+- reg: must be 0x4b for i2c address
+- interrupts: twl6040 has one interrupt line connecteded to the main SoC
+- interrupt-parent: The parent interrupt controller
+- twl6040,audpwron-gpio: Power on GPIO line for the twl6040
+
+- vio-supply: Regulator for the twl6040 VIO supply
+- v2v1-supply: Regulator for the twl6040 V2V1 supply
+
+Optional properties, nodes:
+- enable-active-high: To power on the twl6040 during boot.
+
+Vibra functionality
+Required properties:
+- vddvibl-supply: Regulator for the left vibra motor
+- vddvibr-supply: Regulator for the right vibra motor
+- vibra { }: Configuration section for vibra parameters containing the following
+	     properties:
+- ti,vibldrv-res: Resistance parameter for left driver
+- ti,vibrdrv-res: Resistance parameter for right driver
+- ti,viblmotor-res: Resistance parameter for left motor
+- ti,viblmotor-res: Resistance parameter for right motor
+
+Optional properties within vibra { } section:
+- vddvibl_uV: If the vddvibl default voltage need to be changed
+- vddvibr_uV: If the vddvibr default voltage need to be changed
+
+Example:
+&i2c1 {
+	twl6040: twl@4b {
+		compatible = "ti,twl6040";
+		reg = <0x4b>;
+
+		interrupts = <0 119 4>;
+		interrupt-parent = <&gic>;
+		twl6040,audpwron-gpio = <&gpio4 31 0>;
+
+		vio-supply = <&v1v8>;
+		v2v1-supply = <&v2v1>;
+		enable-active-high;
+
+		/* regulators for vibra motor */
+		vddvibl-supply = <&vbat>;
+		vddvibr-supply = <&vbat>;
+
+		vibra {
+			/* Vibra driver, motor resistance parameters */
+			ti,vibldrv-res = <8>;
+			ti,vibrdrv-res = <3>;
+			ti,viblmotor-res = <10>;
+			ti,vibrmotor-res = <10>;
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/rtc/lpc32xx-rtc.txt b/Documentation/devicetree/bindings/rtc/lpc32xx-rtc.txt
new file mode 100644
index 000000000000..a87a1e9bc060
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/lpc32xx-rtc.txt
@@ -0,0 +1,15 @@
+* NXP LPC32xx SoC Real Time Clock controller
+
+Required properties:
+- compatible: must be "nxp,lpc3220-rtc"
+- reg: physical base address of the controller and length of memory mapped
+  region.
+- interrupts: The RTC interrupt
+
+Example:
+
+	rtc@40024000 {
+		compatible = "nxp,lpc3220-rtc";
+		reg = <0x40024000 0x1000>;
+		interrupts = <52 0>;
+	};
diff --git a/Documentation/devicetree/bindings/rtc/spear-rtc.txt b/Documentation/devicetree/bindings/rtc/spear-rtc.txt
new file mode 100644
index 000000000000..ca67ac62108e
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/spear-rtc.txt
@@ -0,0 +1,17 @@
+* SPEAr RTC
+
+Required properties:
+- compatible : "st,spear600-rtc"
+- reg : Address range of the rtc registers
+- interrupt-parent: Should be the phandle for the interrupt controller
+  that services interrupts for this device
+- interrupt: Should contain the rtc interrupt number
+
+Example:
+
+	rtc@fc000000 {
+		compatible = "st,spear600-rtc";
+		reg = <0xfc000000 0x1000>;
+		interrupt-parent = <&vic1>;
+		interrupts = <12>;
+	};
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 4fca82e5276e..d449e632e6a0 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -60,7 +60,6 @@ ata *);
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
-	void (*truncate_range)(struct inode *, loff_t, loff_t);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
 
 locking rules:
@@ -87,7 +86,6 @@ setxattr:	yes
 getxattr:	no
 listxattr:	no
 removexattr:	yes
-truncate_range:	yes
 fiemap:		no
 	Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
 victim.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index ef088e55ab2e..912af6ce5626 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -743,6 +743,7 @@ Committed_AS:   100056 kB
 VmallocTotal:   112216 kB
 VmallocUsed:       428 kB
 VmallocChunk:   111088 kB
+AnonHugePages:   49152 kB
 
     MemTotal: Total usable ram (i.e. physical ram minus a few reserved
               bits and the kernel binary code)
@@ -776,6 +777,7 @@ VmallocChunk:   111088 kB
        Dirty: Memory which is waiting to get written back to the disk
    Writeback: Memory which is actively being written back to the disk
    AnonPages: Non-file backed pages mapped into userspace page tables
+AnonHugePages: Non-file backed huge pages mapped into userspace page tables
       Mapped: files which have been mmaped, such as libraries
         Slab: in-kernel data structures cache
 SReclaimable: Part of Slab, that might be reclaimed, such as caches
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 0d0492028082..ef19f91a0f12 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -363,7 +363,6 @@ struct inode_operations {
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
-	void (*truncate_range)(struct inode *, loff_t, loff_t);
 };
 
 Again, all methods are called without any locks being held, unless
@@ -472,9 +471,6 @@ otherwise noted.
   removexattr: called by the VFS to remove an extended attribute from
   	a file. This method is called by removexattr(2) system call.
 
-  truncate_range: a method provided by the underlying filesystem to truncate a
-  	range of blocks , i.e. punch a hole somewhere in a file.
-
 
 The Address Space Object
 ========================
@@ -760,7 +756,7 @@ struct file_operations
 ----------------------
 
 This describes how the VFS can manipulate an open file. As of kernel
-2.6.22, the following members are defined:
+3.5, the following members are defined:
 
 struct file_operations {
 	struct module *owner;
@@ -790,6 +786,8 @@ struct file_operations {
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
+	int (*setlease)(struct file *, long arg, struct file_lock **);
+	long (*fallocate)(struct file *, int mode, loff_t offset, loff_t len);
 };
 
 Again, all methods are called without any locks being held, unless
@@ -858,6 +856,11 @@ otherwise noted.
   splice_read: called by the VFS to splice data from file to a pipe. This
 	       method is used by the splice(2) system call
 
+  setlease: called by the VFS to set or release a file lock lease.
+	    setlease has the file_lock_lock held and must not sleep.
+
+  fallocate: called by the VFS to preallocate blocks or punch a hole.
+
 Note that the file operations are implemented by the specific
 filesystem in which the inode resides. When opening a device node
 (character or block special) most filesystems will call special
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index b40b413db88e..c45513d806ab 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -335,6 +335,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 					  requirements as needed. This option
 					  does not override iommu=pt
 
+	amd_iommu_dump=	[HW,X86-64]
+			Enable AMD IOMMU driver option to dump the ACPI table
+			for AMD IOMMU. With this option enabled, AMD IOMMU
+			driver will print ACPI tables for AMD IOMMU during
+			IOMMU initialization.
+
 	amijoy.map=	[HW,JOY] Amiga joystick support
 			Map of devices attached to JOY0DAT and JOY1DAT
 			Format: <a>,<b>
diff --git a/Documentation/leds/ledtrig-transient.txt b/Documentation/leds/ledtrig-transient.txt
new file mode 100644
index 000000000000..3bd38b487df1
--- /dev/null
+++ b/Documentation/leds/ledtrig-transient.txt
@@ -0,0 +1,152 @@
+LED Transient Trigger
+=====================
+
+The leds timer trigger does not currently have an interface to activate
+a one shot timer. The current support allows for setting two timers, one for
+specifying how long a state to be on, and the second for how long the state
+to be off. The delay_on value specifies the time period an LED should stay
+in on state, followed by a delay_off value that specifies how long the LED
+should stay in off state. The on and off cycle repeats until the trigger
+gets deactivated. There is no provision for one time activation to implement
+features that require an on or off state to be held just once and then stay in
+the original state forever.
+
+Without one shot timer interface, user space can still use timer trigger to
+set a timer to hold a state, however when user space application crashes or
+goes away without deactivating the timer, the hardware will be left in that
+state permanently.
+
+As a specific example of this use-case, let's look at vibrate feature on
+phones. Vibrate function on phones is implemented using PWM pins on SoC or
+PMIC. There is a need to activate one shot timer to control the vibrate
+feature, to prevent user space crashes leaving the phone in vibrate mode
+permanently causing the battery to drain.
+
+Transient trigger addresses the need for one shot timer activation. The
+transient trigger can be enabled and disabled just like the other leds
+triggers.
+
+When an led class device driver registers itself, it can specify all leds
+triggers it supports and a default trigger. During registration, activation
+routine for the default trigger gets called. During registration of an led
+class device, the LED state does not change.
+
+When the driver unregisters, deactivation routine for the currently active
+trigger will be called, and LED state is changed to LED_OFF.
+
+Driver suspend changes the LED state to LED_OFF and resume doesn't change
+the state. Please note that there is no explicit interaction between the
+suspend and resume actions and the currently enabled trigger. LED state
+changes are suspended while the driver is in suspend state. Any timers
+that are active at the time driver gets suspended, continue to run, without
+being able to actually change the LED state. Once driver is resumed, triggers
+start functioning again.
+
+LED state changes are controlled using brightness which is a common led
+class device property. When brightness is set to 0 from user space via
+echo 0 > brightness, it will result in deactivating the current trigger.
+
+Transient trigger uses standard register and unregister interfaces. During
+trigger registration, for each led class device that specifies this trigger
+as its default trigger, trigger activation routine will get called. During
+registration, the LED state does not change, unless there is another trigger
+active, in which case LED state changes to LED_OFF.
+
+During trigger unregistration, LED state gets changed to LED_OFF.
+
+Transient trigger activation routine doesn't change the LED state. It
+creates its properties and does its initialization. Transient trigger
+deactivation routine, will cancel any timer that is active before it cleans
+up and removes the properties it created. It will restore the LED state to
+non-transient state. When driver gets suspended, irrespective of the transient
+state, the LED state changes to LED_OFF.
+
+Transient trigger can be enabled and disabled from user space on led class
+devices, that support this trigger as shown below:
+
+echo transient > trigger
+echo none > trigger
+
+NOTE: Add a new property trigger state to control the state.
+
+This trigger exports three properties, activate, state, and duration. When
+transient trigger is activated these properties are set to default values.
+
+- duration allows setting timer value in msecs. The initial value is 0.
+- activate allows activating and deactivating the timer specified by
+  duration as needed. The initial and default value is 0.  This will allow
+  duration to be set after trigger activation.
+- state allows user to specify a transient state to be held for the specified
+  duration.
+
+	activate - one shot timer activate mechanism.
+		1 when activated, 0 when deactivated.
+		default value is zero when transient trigger is enabled,
+		to allow duration to be set.
+
+		activate state indicates a timer with a value of specified
+		duration running.
+		deactivated state indicates that there is no active timer
+		running.
+
+	duration - one shot timer value. When activate is set, duration value
+		is used to start a timer that runs once. This value doesn't
+		get changed by the trigger unless user does a set via
+		echo new_value > duration
+
+	state - transient state to be held. It has two values 0 or 1. 0 maps
+		to LED_OFF and 1 maps to LED_FULL. The specified state is
+		held for the duration of the one shot timer and then the
+		state gets changed to the non-transient state which is the
+		inverse of transient state.
+		If state = LED_FULL, when the timer runs out the state will
+		go back to LED_OFF.
+		If state = LED_OFF, when the timer runs out the state will
+		go back to LED_FULL.
+		Please note that current LED state is not checked prior to
+		changing the state to the specified state.
+		Driver could map these values to inverted depending on the
+		default states it defines for the LED in its brightness_set()
+		interface which is called from the led brightness_set()
+		interfaces to control the LED state.
+
+When timer expires activate goes back to deactivated state, duration is left
+at the set value to be used when activate is set at a future time. This will
+allow user app to set the time once and activate it to run it once for the
+specified value as needed. When timer expires, state is restored to the
+non-transient state which is the inverse of the transient state.
+
+	echo 1 > activate - starts timer = duration when duration is not 0.
+	echo 0 > activate - cancels currently running timer.
+	echo n > duration - stores timer value to be used upon next
+                            activate. Currently active timer if
+                            any, continues to run for the specified time.
+	echo 0 > duration - stores timer value to be used upon next
+                            activate. Currently active timer if any,
+                            continues to run for the specified time.
+	echo 1 > state    - stores desired transient state LED_FULL to be
+			    held for the specified duration.
+	echo 0 > state    - stores desired transient state LED_OFF to be
+			    held for the specified duration.
+
+What is not supported:
+======================
+- Timer activation is one shot and extending and/or shortening the timer
+  is not supported.
+
+Example use-case 1:
+	echo transient > trigger
+	echo n > duration
+	echo 1 > state
+repeat the following step as needed:
+	echo 1 > activate - start timer = duration to run once
+	echo 1 > activate - start timer = duration to run once
+	echo none > trigger
+
+This trigger is intended to be used for for the following example use cases:
+ - Control of vibrate (phones, tablets etc.) hardware by user space app.
+ - Use of LED by user space app as activity indicator.
+ - Use of LED by user space app as a kind of watchdog indicator -- as
+       long as the app is alive, it can keep the LED illuminated, if it dies
+       the LED will be extinguished automatically.
+ - Use by any user space app that needs a transient GPIO output.
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 29bdf62aac09..f734bb2a78dc 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -166,6 +166,68 @@ behavior. So to make them effective you need to restart any
 application that could have been using hugepages. This also applies to
 the regions registered in khugepaged.
 
+== Monitoring usage ==
+
+The number of transparent huge pages currently used by the system is
+available by reading the AnonHugePages field in /proc/meminfo. To
+identify what applications are using transparent huge pages, it is
+necessary to read /proc/PID/smaps and count the AnonHugePages fields
+for each mapping. Note that reading the smaps file is expensive and
+reading it frequently will incur overhead.
+
+There are a number of counters in /proc/vmstat that may be used to
+monitor how successfully the system is providing huge pages for use.
+
+thp_fault_alloc is incremented every time a huge page is successfully
+	allocated to handle a page fault. This applies to both the
+	first time a page is faulted and for COW faults.
+
+thp_collapse_alloc is incremented by khugepaged when it has found
+	a range of pages to collapse into one huge page and has
+	successfully allocated a new huge page to store the data.
+
+thp_fault_fallback is incremented if a page fault fails to allocate
+	a huge page and instead falls back to using small pages.
+
+thp_collapse_alloc_failed is incremented if khugepaged found a range
+	of pages that should be collapsed into one huge page but failed
+	the allocation.
+
+thp_split is incremented every time a huge page is split into base
+	pages. This can happen for a variety of reasons but a common
+	reason is that a huge page is old and is being reclaimed.
+
+As the system ages, allocating huge pages may be expensive as the
+system uses memory compaction to copy data around memory to free a
+huge page for use. There are some counters in /proc/vmstat to help
+monitor this overhead.
+
+compact_stall is incremented every time a process stalls to run
+	memory compaction so that a huge page is free for use.
+
+compact_success is incremented if the system compacted memory and
+	freed a huge page for use.
+
+compact_fail is incremented if the system tries to compact memory
+	but failed.
+
+compact_pages_moved is incremented each time a page is moved. If
+	this value is increasing rapidly, it implies that the system
+	is copying a lot of data to satisfy the huge page allocation.
+	It is possible that the cost of copying exceeds any savings
+	from reduced TLB misses.
+
+compact_pagemigrate_failed is incremented when the underlying mechanism
+	for moving a page failed.
+
+compact_blocks_moved is incremented each time memory compaction examines
+	a huge page aligned range of pages.
+
+It is possible to establish how long the stalls were using the function
+tracer to record how long was spent in __alloc_pages_nodemask and
+using the mm_page_alloc tracepoint to identify which allocations were
+for huge pages.
+
 == get_user_pages and follow_page ==
 
 get_user_pages and follow_page if run on a hugepage, will return the
diff --git a/MAINTAINERS b/MAINTAINERS
index 6f90c64a2539..64e675d6d478 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2818,6 +2818,12 @@ F:	Documentation/firmware_class/
 F:	drivers/base/firmware*.c
 F:	include/linux/firmware.h
 
+FLOPPY DRIVER
+M:	Jiri Kosina <[email protected]>
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/floppy.git
+S:	Odd fixes
+F:	drivers/block/floppy.c
+
 FPU EMULATOR
 M:	Bill Metzenthen <[email protected]>
 W:	http://floatingpoint.sourceforge.net/emulator/index.html
@@ -3232,10 +3238,8 @@ F:	include/linux/clockchips.h
 F:	include/linux/hrtimer.h
 
 HIGH-SPEED SCC DRIVER FOR AX.25
-M:	Klaus Kudielka <[email protected]>
 L:	[email protected]
-W:	http://www.nt.tuwien.ac.at/~kkudielk/Linux/
-S:	Maintained
+S:	Orphan
 F:	drivers/net/hamradio/dmascc.c
 F:	drivers/net/hamradio/scc.c
 
@@ -3382,6 +3386,12 @@ W:	http://www.developer.ibm.com/welcome/netfinity/serveraid.html
 S:	Supported
 F:	drivers/scsi/ips.*
 
+ICH LPC AND GPIO DRIVER
+M:	Peter Tyser <[email protected]>
+S:	Maintained
+F:	drivers/mfd/lpc_ich.c
+F:	drivers/gpio/gpio-ich.c
+
 IDE SUBSYSTEM
 M:	"David S. Miller" <[email protected]>
 L:	[email protected]
@@ -4505,12 +4515,6 @@ L:	[email protected] (moderated for non-subscribers)
 S:	Maintained
 F:	drivers/mmc/host/imxmmc.*
 
-MOUSE AND MISC DEVICES [GENERAL]
-M:	Alessandro Rubini <[email protected]>
-S:	Maintained
-F:	drivers/input/mouse/
-F:	include/linux/gpio_mouse.h
-
 MOXA SMARTIO/INDUSTIO/INTELLIO SERIAL CARD
 M:	Jiri Slaby <[email protected]>
 S:	Maintained
diff --git a/arch/arm/configs/imx_v4_v5_defconfig b/arch/arm/configs/imx_v4_v5_defconfig
index ebda45bd5763..e05a2f1665a7 100644
--- a/arch/arm/configs/imx_v4_v5_defconfig
+++ b/arch/arm/configs/imx_v4_v5_defconfig
@@ -173,7 +173,7 @@ CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_PCF8563=y
 CONFIG_RTC_DRV_IMXDI=y
-CONFIG_RTC_MXC=y
+CONFIG_RTC_DRV_MXC=y
 CONFIG_DMADEVICES=y
 CONFIG_IMX_SDMA=y
 CONFIG_IMX_DMA=y
diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig
index 12617f7296e6..b1d3675df72c 100644
--- a/arch/arm/configs/imx_v6_v7_defconfig
+++ b/arch/arm/configs/imx_v6_v7_defconfig
@@ -178,7 +178,7 @@ CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_INTF_DEV_UIE_EMUL=y
-CONFIG_RTC_MXC=y
+CONFIG_RTC_DRV_MXC=y
 CONFIG_DMADEVICES=y
 CONFIG_IMX_SDMA=y
 CONFIG_EXT2_FS=y
diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h
index 9af5563dd3eb..815c669fec0a 100644
--- a/arch/arm/include/asm/io.h
+++ b/arch/arm/include/asm/io.h
@@ -47,9 +47,9 @@ extern void __raw_readsb(const void __iomem *addr, void *data, int bytelen);
 extern void __raw_readsw(const void __iomem *addr, void *data, int wordlen);
 extern void __raw_readsl(const void __iomem *addr, void *data, int longlen);
 
-#define __raw_writeb(v,a)	(__chk_io_ptr(a), *(volatile unsigned char __force  *)(a) = (v))
-#define __raw_writew(v,a)	(__chk_io_ptr(a), *(volatile unsigned short __force *)(a) = (v))
-#define __raw_writel(v,a)	(__chk_io_ptr(a), *(volatile unsigned int __force   *)(a) = (v))
+#define __raw_writeb(v,a)	((void)(__chk_io_ptr(a), *(volatile unsigned char __force  *)(a) = (v)))
+#define __raw_writew(v,a)	((void)(__chk_io_ptr(a), *(volatile unsigned short __force *)(a) = (v)))
+#define __raw_writel(v,a)	((void)(__chk_io_ptr(a), *(volatile unsigned int __force   *)(a) = (v)))
 
 #define __raw_readb(a)		(__chk_io_ptr(a), *(volatile unsigned char __force  *)(a))
 #define __raw_readw(a)		(__chk_io_ptr(a), *(volatile unsigned short __force *)(a))
@@ -229,11 +229,9 @@ extern void _memset_io(volatile void __iomem *, int, size_t);
 #define readl_relaxed(c) ({ u32 __r = le32_to_cpu((__force __le32) \
 					__raw_readl(c)); __r; })
 
-#define writeb_relaxed(v,c)	((void)__raw_writeb(v,c))
-#define writew_relaxed(v,c)	((void)__raw_writew((__force u16) \
-					cpu_to_le16(v),c))
-#define writel_relaxed(v,c)	((void)__raw_writel((__force u32) \
-					cpu_to_le32(v),c))
+#define writeb_relaxed(v,c)	__raw_writeb(v,c)
+#define writew_relaxed(v,c)	__raw_writew((__force u16) cpu_to_le16(v),c)
+#define writel_relaxed(v,c)	__raw_writel((__force u32) cpu_to_le32(v),c)
 
 #define readb(c)		({ u8  __v = readb_relaxed(c); __iormb(); __v; })
 #define readw(c)		({ u16 __v = readw_relaxed(c); __iormb(); __v; })
@@ -281,12 +279,12 @@ extern void _memset_io(volatile void __iomem *, int, size_t);
 #define ioread16be(p)	({ unsigned int __v = be16_to_cpu((__force __be16)__raw_readw(p)); __iormb(); __v; })
 #define ioread32be(p)	({ unsigned int __v = be32_to_cpu((__force __be32)__raw_readl(p)); __iormb(); __v; })
 
-#define iowrite8(v,p)	({ __iowmb(); (void)__raw_writeb(v, p); })
-#define iowrite16(v,p)	({ __iowmb(); (void)__raw_writew((__force __u16)cpu_to_le16(v), p); })
-#define iowrite32(v,p)	({ __iowmb(); (void)__raw_writel((__force __u32)cpu_to_le32(v), p); })
+#define iowrite8(v,p)	({ __iowmb(); __raw_writeb(v, p); })
+#define iowrite16(v,p)	({ __iowmb(); __raw_writew((__force __u16)cpu_to_le16(v), p); })
+#define iowrite32(v,p)	({ __iowmb(); __raw_writel((__force __u32)cpu_to_le32(v), p); })
 
-#define iowrite16be(v,p) ({ __iowmb(); (void)__raw_writew((__force __u16)cpu_to_be16(v), p); })
-#define iowrite32be(v,p) ({ __iowmb(); (void)__raw_writel((__force __u32)cpu_to_be32(v), p); })
+#define iowrite16be(v,p) ({ __iowmb(); __raw_writew((__force __u16)cpu_to_be16(v), p); })
+#define iowrite32be(v,p) ({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); })
 
 #define ioread8_rep(p,d,c)	__raw_readsb(p,d,c)
 #define ioread16_rep(p,d,c)	__raw_readsw(p,d,c)
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 68388eb4946b..b79f8e97f775 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -148,6 +148,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
 #define TIF_SYSCALL_TRACE	8
 #define TIF_SYSCALL_AUDIT	9
+#define TIF_SYSCALL_RESTARTSYS	10
 #define TIF_POLLING_NRFLAG	16
 #define TIF_USING_IWMMXT	17
 #define TIF_MEMDIE		18	/* is terminating due to OOM killer */
@@ -162,16 +163,17 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_USING_IWMMXT	(1 << TIF_USING_IWMMXT)
-#define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
+#define _TIF_SYSCALL_RESTARTSYS	(1 << TIF_SYSCALL_RESTARTSYS)
 
 /* Checks for any syscall work in entry-common.S */
-#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT)
+#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
+			   _TIF_SYSCALL_RESTARTSYS)
 
 /*
  * Change these and you break ASM code in entry-common.S
  */
-#define _TIF_WORK_MASK		0x000000ff
+#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_RESUME)
 
 #endif /* __KERNEL__ */
 #endif /* __ASM_ARM_THREAD_INFO_H */
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index 7bd2d3cb8957..4afed88d250a 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -53,9 +53,13 @@ fast_work_pending:
 work_pending:
 	tst	r1, #_TIF_NEED_RESCHED
 	bne	work_resched
-	tst	r1, #_TIF_SIGPENDING|_TIF_NOTIFY_RESUME
-	beq	no_work_pending
+	/*
+	 * TIF_SIGPENDING or TIF_NOTIFY_RESUME must've been set if we got here
+	 */
+	ldr	r2, [sp, #S_PSR]
 	mov	r0, sp				@ 'regs'
+	tst	r2, #15				@ are we returning to user mode?
+	bne	no_work_pending			@ no?  just leave, then...
 	mov	r2, why				@ 'syscall'
 	tst	r1, #_TIF_SIGPENDING		@ delivering a signal?
 	movne	why, #0				@ prevent further restarts
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 14e38261cd31..5700a7ae7f0b 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -25,6 +25,7 @@
 #include <linux/regset.h>
 #include <linux/audit.h>
 #include <linux/tracehook.h>
+#include <linux/unistd.h>
 
 #include <asm/pgtable.h>
 #include <asm/traps.h>
@@ -917,6 +918,8 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno)
 		audit_syscall_entry(AUDIT_ARCH_ARM, scno, regs->ARM_r0,
 				    regs->ARM_r1, regs->ARM_r2, regs->ARM_r3);
 
+	if (why == 0 && test_and_clear_thread_flag(TIF_SYSCALL_RESTARTSYS))
+		scno = __NR_restart_syscall - __NR_SYSCALL_BASE;
 	if (!test_thread_flag(TIF_SYSCALL_TRACE))
 		return scno;
 
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 4e5fdd9bd9e3..17fc36c41cff 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -29,7 +29,6 @@
  */
 #define SWI_SYS_SIGRETURN	(0xef000000|(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE))
 #define SWI_SYS_RT_SIGRETURN	(0xef000000|(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE))
-#define SWI_SYS_RESTART		(0xef000000|__NR_restart_syscall|__NR_OABI_SYSCALL_BASE)
 
 /*
  * With EABI, the syscall number has to be loaded into r7.
@@ -50,18 +49,6 @@ const unsigned long sigreturn_codes[7] = {
 };
 
 /*
- * Either we support OABI only, or we have EABI with the OABI
- * compat layer enabled.  In the later case we don't know if
- * user space is EABI or not, and if not we must not clobber r7.
- * Always using the OABI syscall solves that issue and works for
- * all those cases.
- */
-const unsigned long syscall_restart_code[2] = {
-	SWI_SYS_RESTART,	/* swi	__NR_restart_syscall */
-	0xe49df004,		/* ldr	pc, [sp], #4 */
-};
-
-/*
  * atomically swap in the new signal mask, and wait for a signal.
  */
 asmlinkage int sys_sigsuspend(int restart, unsigned long oldmask, old_sigset_t mask)
@@ -82,10 +69,10 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
 		old_sigset_t mask;
 		if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
 		    __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
-		    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
+		    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
+		    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+		    __get_user(mask, &act->sa_mask))
 			return -EFAULT;
-		__get_user(new_ka.sa.sa_flags, &act->sa_flags);
-		__get_user(mask, &act->sa_mask);
 		siginitset(&new_ka.sa.sa_mask, mask);
 	}
 
@@ -94,10 +81,10 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
 	if (!ret && oact) {
 		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
 		    __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
-		    __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
+		    __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
+		    __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+		    __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
 			return -EFAULT;
-		__put_user(old_ka.sa.sa_flags, &oact->sa_flags);
-		__put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
 	}
 
 	return ret;
@@ -602,15 +589,6 @@ static void do_signal(struct pt_regs *regs, int syscall)
 	int signr;
 
 	/*
-	 * We want the common case to go fast, which
-	 * is why we may in certain cases get here from
-	 * kernel mode. Just return without doing anything
-	 * if so.
-	 */
-	if (!user_mode(regs))
-		return;
-
-	/*
 	 * If we were from a system call, check for system call restarting...
 	 */
 	if (syscall) {
@@ -626,18 +604,13 @@ static void do_signal(struct pt_regs *regs, int syscall)
 		case -ERESTARTNOHAND:
 		case -ERESTARTSYS:
 		case -ERESTARTNOINTR:
+		case -ERESTART_RESTARTBLOCK:
 			regs->ARM_r0 = regs->ARM_ORIG_r0;
 			regs->ARM_pc = restart_addr;
 			break;
-		case -ERESTART_RESTARTBLOCK:
-			regs->ARM_r0 = -EINTR;
-			break;
 		}
 	}
 
-	if (try_to_freeze())
-		goto no_signal;
-
 	/*
 	 * Get the signal to deliver.  When running under ptrace, at this
 	 * point the debugger may change all our registers ...
@@ -652,12 +625,14 @@ static void do_signal(struct pt_regs *regs, int syscall)
 		 * debugger has chosen to restart at a different PC.
 		 */
 		if (regs->ARM_pc == restart_addr) {
-			if (retval == -ERESTARTNOHAND
+			if (retval == -ERESTARTNOHAND ||
+			    retval == -ERESTART_RESTARTBLOCK
 			    || (retval == -ERESTARTSYS
 				&& !(ka.sa.sa_flags & SA_RESTART))) {
 				regs->ARM_r0 = -EINTR;
 				regs->ARM_pc = continue_addr;
 			}
+			clear_thread_flag(TIF_SYSCALL_RESTARTSYS);
 		}
 
 		if (test_thread_flag(TIF_RESTORE_SIGMASK))
@@ -677,7 +652,6 @@ static void do_signal(struct pt_regs *regs, int syscall)
 		return;
 	}
 
- no_signal:
 	if (syscall) {
 		/*
 		 * Handle restarting a different system call.  As above,
@@ -685,38 +659,15 @@ static void do_signal(struct pt_regs *regs, int syscall)
 		 * ignore the restart.
 		 */
 		if (retval == -ERESTART_RESTARTBLOCK
-		    && regs->ARM_pc == continue_addr) {
-			if (thumb_mode(regs)) {
-				regs->ARM_r7 = __NR_restart_syscall - __NR_SYSCALL_BASE;
-				regs->ARM_pc -= 2;
-			} else {
-#if defined(CONFIG_AEABI) && !defined(CONFIG_OABI_COMPAT)
-				regs->ARM_r7 = __NR_restart_syscall;
-				regs->ARM_pc -= 4;
-#else
-				u32 __user *usp;
-
-				regs->ARM_sp -= 4;
-				usp = (u32 __user *)regs->ARM_sp;
-
-				if (put_user(regs->ARM_pc, usp) == 0) {
-					regs->ARM_pc = KERN_RESTART_CODE;
-				} else {
-					regs->ARM_sp += 4;
-					force_sigsegv(0, current);
-				}
-#endif
-			}
-		}
-
-		/* If there's no signal to deliver, we just put the saved sigmask
-		 * back.
-		 */
-		if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-			clear_thread_flag(TIF_RESTORE_SIGMASK);
-			sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-		}
+		    && regs->ARM_pc == restart_addr)
+			set_thread_flag(TIF_SYSCALL_RESTARTSYS);
 	}
+
+	/* If there's no signal to deliver, we just put the saved sigmask
+	 * back.
+	 */
+	if (test_and_clear_thread_flag(TIF_RESTORE_SIGMASK))
+		set_current_blocked(&current->saved_sigmask);
 }
 
 asmlinkage void
diff --git a/arch/arm/kernel/signal.h b/arch/arm/kernel/signal.h
index 6fcfe8398aa4..5ff067b7c752 100644
--- a/arch/arm/kernel/signal.h
+++ b/arch/arm/kernel/signal.h
@@ -8,7 +8,5 @@
  * published by the Free Software Foundation.
  */
 #define KERN_SIGRETURN_CODE	(CONFIG_VECTORS_BASE + 0x00000500)
-#define KERN_RESTART_CODE	(KERN_SIGRETURN_CODE + sizeof(sigreturn_codes))
 
 extern const unsigned long sigreturn_codes[7];
-extern const unsigned long syscall_restart_code[2];
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 3647170e9a16..4928d89758f4 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -820,8 +820,6 @@ void __init early_trap_init(void *vectors_base)
 	 */
 	memcpy((void *)(vectors + KERN_SIGRETURN_CODE - CONFIG_VECTORS_BASE),
 	       sigreturn_codes, sizeof(sigreturn_codes));
-	memcpy((void *)(vectors + KERN_RESTART_CODE - CONFIG_VECTORS_BASE),
-	       syscall_restart_code, sizeof(syscall_restart_code));
 
 	flush_icache_range(vectors, vectors + PAGE_SIZE);
 	modify_domain(DOMAIN_USER, DOMAIN_CLIENT);
diff --git a/arch/arm/mach-sa1100/neponset.c b/arch/arm/mach-sa1100/neponset.c
index 6c58f01b358a..266db873a4e4 100644
--- a/arch/arm/mach-sa1100/neponset.c
+++ b/arch/arm/mach-sa1100/neponset.c
@@ -89,6 +89,7 @@ void neponset_ncr_frob(unsigned int mask, unsigned int val)
 		WARN(1, "nep_base unset\n");
 	}
 }
+EXPORT_SYMBOL(neponset_ncr_frob);
 
 static void neponset_set_mctrl(struct uart_port *port, u_int mctrl)
 {
diff --git a/arch/arm/mach-ux500/board-mop500.c b/arch/arm/mach-ux500/board-mop500.c
index b23a643f03f4..fba8adea421e 100644
--- a/arch/arm/mach-ux500/board-mop500.c
+++ b/arch/arm/mach-ux500/board-mop500.c
@@ -206,7 +206,7 @@ static struct resource ab8500_resources[] = {
 };
 
 struct platform_device ab8500_device = {
-	.name = "ab8500-i2c",
+	.name = "ab8500-core",
 	.id = 0,
 	.dev = {
 		.platform_data = &ab8500_platdata,
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 22d34d64cc81..bb344650a14f 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -40,6 +40,7 @@ config CRIS
 	bool
 	default y
 	select HAVE_IDE
+	select GENERIC_ATOMIC64
 	select HAVE_GENERIC_HARDIRQS
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IOMAP
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 77050671eeef..09ab87ee6fef 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -233,8 +233,9 @@ config LANTIQ
 	select ARCH_REQUIRE_GPIOLIB
 	select SWAP_IO_SPACE
 	select BOOT_RAW
-	select HAVE_CLK
-	select MIPS_MACHINE
+	select HAVE_MACH_CLKDEV
+	select CLKDEV_LOOKUP
+	select USE_OF
 
 config LASAT
 	bool "LASAT Networks platforms"
@@ -1783,10 +1784,12 @@ endchoice
 
 config FORCE_MAX_ZONEORDER
 	int "Maximum zone order"
-	range 13 64 if SYS_SUPPORTS_HUGETLBFS && PAGE_SIZE_32KB
-	default "13" if SYS_SUPPORTS_HUGETLBFS && PAGE_SIZE_32KB
-	range 12 64 if SYS_SUPPORTS_HUGETLBFS && PAGE_SIZE_16KB
-	default "12" if SYS_SUPPORTS_HUGETLBFS && PAGE_SIZE_16KB
+	range 14 64 if HUGETLB_PAGE && PAGE_SIZE_64KB
+	default "14" if HUGETLB_PAGE && PAGE_SIZE_64KB
+	range 13 64 if HUGETLB_PAGE && PAGE_SIZE_32KB
+	default "13" if HUGETLB_PAGE && PAGE_SIZE_32KB
+	range 12 64 if HUGETLB_PAGE && PAGE_SIZE_16KB
+	default "12" if HUGETLB_PAGE && PAGE_SIZE_16KB
 	range 11 64
 	default "11"
 	help
diff --git a/arch/mips/alchemy/devboards/db1200.c b/arch/mips/alchemy/devboards/db1200.c
index a83302b96c01..7dde01642d6b 100644
--- a/arch/mips/alchemy/devboards/db1200.c
+++ b/arch/mips/alchemy/devboards/db1200.c
@@ -22,6 +22,7 @@
 #include <linux/gpio.h>
 #include <linux/i2c.h>
 #include <linux/init.h>
+#include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/leds.h>
diff --git a/arch/mips/ath79/Kconfig b/arch/mips/ath79/Kconfig
index e0fae8f4442b..f44feee2d67f 100644
--- a/arch/mips/ath79/Kconfig
+++ b/arch/mips/ath79/Kconfig
@@ -26,6 +26,18 @@ config ATH79_MACH_AP81
 	  Say 'Y' here if you want your kernel to support the
 	  Atheros AP81 reference board.
 
+config ATH79_MACH_DB120
+	bool "Atheros DB120 reference board"
+	select SOC_AR934X
+	select ATH79_DEV_GPIO_BUTTONS
+	select ATH79_DEV_LEDS_GPIO
+	select ATH79_DEV_SPI
+	select ATH79_DEV_USB
+	select ATH79_DEV_WMAC
+	help
+	  Say 'Y' here if you want your kernel to support the
+	  Atheros DB120 reference board.
+
 config ATH79_MACH_PB44
 	bool "Atheros PB44 reference board"
 	select SOC_AR71XX
@@ -52,12 +64,14 @@ endmenu
 config SOC_AR71XX
 	select USB_ARCH_HAS_EHCI
 	select USB_ARCH_HAS_OHCI
+	select HW_HAS_PCI
 	def_bool n
 
 config SOC_AR724X
 	select USB_ARCH_HAS_EHCI
 	select USB_ARCH_HAS_OHCI
 	select HW_HAS_PCI
+	select PCI_AR724X if PCI
 	def_bool n
 
 config SOC_AR913X
@@ -68,6 +82,15 @@ config SOC_AR933X
 	select USB_ARCH_HAS_EHCI
 	def_bool n
 
+config SOC_AR934X
+	select USB_ARCH_HAS_EHCI
+	select HW_HAS_PCI
+	select PCI_AR724X if PCI
+	def_bool n
+
+config PCI_AR724X
+	def_bool n
+
 config ATH79_DEV_GPIO_BUTTONS
 	def_bool n
 
@@ -81,7 +104,7 @@ config ATH79_DEV_USB
 	def_bool n
 
 config ATH79_DEV_WMAC
-	depends on (SOC_AR913X || SOC_AR933X)
+	depends on (SOC_AR913X || SOC_AR933X || SOC_AR934X)
 	def_bool n
 
 endif
diff --git a/arch/mips/ath79/Makefile b/arch/mips/ath79/Makefile
index 3b911e09dbec..2b54d98263f3 100644
--- a/arch/mips/ath79/Makefile
+++ b/arch/mips/ath79/Makefile
@@ -11,6 +11,7 @@
 obj-y	:= prom.o setup.o irq.o common.o clock.o gpio.o
 
 obj-$(CONFIG_EARLY_PRINTK)		+= early_printk.o
+obj-$(CONFIG_PCI)			+= pci.o
 
 #
 # Devices
@@ -27,5 +28,6 @@ obj-$(CONFIG_ATH79_DEV_WMAC)		+= dev-wmac.o
 #
 obj-$(CONFIG_ATH79_MACH_AP121)		+= mach-ap121.o
 obj-$(CONFIG_ATH79_MACH_AP81)		+= mach-ap81.o
+obj-$(CONFIG_ATH79_MACH_DB120)		+= mach-db120.o
 obj-$(CONFIG_ATH79_MACH_PB44)		+= mach-pb44.o
 obj-$(CONFIG_ATH79_MACH_UBNT_XM)	+= mach-ubnt-xm.o
diff --git a/arch/mips/ath79/clock.c b/arch/mips/ath79/clock.c
index 54d0eb4db987..b91ad3efe29e 100644
--- a/arch/mips/ath79/clock.c
+++ b/arch/mips/ath79/clock.c
@@ -1,8 +1,11 @@
 /*
  *  Atheros AR71XX/AR724X/AR913X common routines
  *
+ *  Copyright (C) 2010-2011 Jaiganesh Narayanan <[email protected]>
  *  Copyright (C) 2011 Gabor Juhos <[email protected]>
  *
+ *  Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP
+ *
  *  This program is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License version 2 as published
  *  by the Free Software Foundation.
@@ -163,6 +166,82 @@ static void __init ar933x_clocks_init(void)
 	ath79_uart_clk.rate = ath79_ref_clk.rate;
 }
 
+static void __init ar934x_clocks_init(void)
+{
+	u32 pll, out_div, ref_div, nint, frac, clk_ctrl, postdiv;
+	u32 cpu_pll, ddr_pll;
+	u32 bootstrap;
+
+	bootstrap = ath79_reset_rr(AR934X_RESET_REG_BOOTSTRAP);
+	if (bootstrap &	AR934X_BOOTSTRAP_REF_CLK_40)
+		ath79_ref_clk.rate = 40 * 1000 * 1000;
+	else
+		ath79_ref_clk.rate = 25 * 1000 * 1000;
+
+	pll = ath79_pll_rr(AR934X_PLL_CPU_CONFIG_REG);
+	out_div = (pll >> AR934X_PLL_CPU_CONFIG_OUTDIV_SHIFT) &
+		  AR934X_PLL_CPU_CONFIG_OUTDIV_MASK;
+	ref_div = (pll >> AR934X_PLL_CPU_CONFIG_REFDIV_SHIFT) &
+		  AR934X_PLL_CPU_CONFIG_REFDIV_MASK;
+	nint = (pll >> AR934X_PLL_CPU_CONFIG_NINT_SHIFT) &
+	       AR934X_PLL_CPU_CONFIG_NINT_MASK;
+	frac = (pll >> AR934X_PLL_CPU_CONFIG_NFRAC_SHIFT) &
+	       AR934X_PLL_CPU_CONFIG_NFRAC_MASK;
+
+	cpu_pll = nint * ath79_ref_clk.rate / ref_div;
+	cpu_pll += frac * ath79_ref_clk.rate / (ref_div * (2 << 6));
+	cpu_pll /= (1 << out_div);
+
+	pll = ath79_pll_rr(AR934X_PLL_DDR_CONFIG_REG);
+	out_div = (pll >> AR934X_PLL_DDR_CONFIG_OUTDIV_SHIFT) &
+		  AR934X_PLL_DDR_CONFIG_OUTDIV_MASK;
+	ref_div = (pll >> AR934X_PLL_DDR_CONFIG_REFDIV_SHIFT) &
+		  AR934X_PLL_DDR_CONFIG_REFDIV_MASK;
+	nint = (pll >> AR934X_PLL_DDR_CONFIG_NINT_SHIFT) &
+	       AR934X_PLL_DDR_CONFIG_NINT_MASK;
+	frac = (pll >> AR934X_PLL_DDR_CONFIG_NFRAC_SHIFT) &
+	       AR934X_PLL_DDR_CONFIG_NFRAC_MASK;
+
+	ddr_pll = nint * ath79_ref_clk.rate / ref_div;
+	ddr_pll += frac * ath79_ref_clk.rate / (ref_div * (2 << 10));
+	ddr_pll /= (1 << out_div);
+
+	clk_ctrl = ath79_pll_rr(AR934X_PLL_CPU_DDR_CLK_CTRL_REG);
+
+	postdiv = (clk_ctrl >> AR934X_PLL_CPU_DDR_CLK_CTRL_CPU_POST_DIV_SHIFT) &
+		  AR934X_PLL_CPU_DDR_CLK_CTRL_CPU_POST_DIV_MASK;
+
+	if (clk_ctrl & AR934X_PLL_CPU_DDR_CLK_CTRL_CPU_PLL_BYPASS)
+		ath79_cpu_clk.rate = ath79_ref_clk.rate;
+	else if (clk_ctrl & AR934X_PLL_CPU_DDR_CLK_CTRL_CPUCLK_FROM_CPUPLL)
+		ath79_cpu_clk.rate = cpu_pll / (postdiv + 1);
+	else
+		ath79_cpu_clk.rate = ddr_pll / (postdiv + 1);
+
+	postdiv = (clk_ctrl >> AR934X_PLL_CPU_DDR_CLK_CTRL_DDR_POST_DIV_SHIFT) &
+		  AR934X_PLL_CPU_DDR_CLK_CTRL_DDR_POST_DIV_MASK;
+
+	if (clk_ctrl & AR934X_PLL_CPU_DDR_CLK_CTRL_DDR_PLL_BYPASS)
+		ath79_ddr_clk.rate = ath79_ref_clk.rate;
+	else if (clk_ctrl & AR934X_PLL_CPU_DDR_CLK_CTRL_DDRCLK_FROM_DDRPLL)
+		ath79_ddr_clk.rate = ddr_pll / (postdiv + 1);
+	else
+		ath79_ddr_clk.rate = cpu_pll / (postdiv + 1);
+
+	postdiv = (clk_ctrl >> AR934X_PLL_CPU_DDR_CLK_CTRL_AHB_POST_DIV_SHIFT) &
+		  AR934X_PLL_CPU_DDR_CLK_CTRL_AHB_POST_DIV_MASK;
+
+	if (clk_ctrl & AR934X_PLL_CPU_DDR_CLK_CTRL_AHB_PLL_BYPASS)
+		ath79_ahb_clk.rate = ath79_ref_clk.rate;
+	else if (clk_ctrl & AR934X_PLL_CPU_DDR_CLK_CTRL_AHBCLK_FROM_DDRPLL)
+		ath79_ahb_clk.rate = ddr_pll / (postdiv + 1);
+	else
+		ath79_ahb_clk.rate = cpu_pll / (postdiv + 1);
+
+	ath79_wdt_clk.rate = ath79_ref_clk.rate;
+	ath79_uart_clk.rate = ath79_ref_clk.rate;
+}
+
 void __init ath79_clocks_init(void)
 {
 	if (soc_is_ar71xx())
@@ -173,6 +252,8 @@ void __init ath79_clocks_init(void)
 		ar913x_clocks_init();
 	else if (soc_is_ar933x())
 		ar933x_clocks_init();
+	else if (soc_is_ar934x())
+		ar934x_clocks_init();
 	else
 		BUG();
 
diff --git a/arch/mips/ath79/common.c b/arch/mips/ath79/common.c
index f0fda982b965..5a4adfc9d79d 100644
--- a/arch/mips/ath79/common.c
+++ b/arch/mips/ath79/common.c
@@ -1,9 +1,12 @@
 /*
  *  Atheros AR71XX/AR724X/AR913X common routines
  *
- *  Copyright (C) 2008-2010 Gabor Juhos <[email protected]>
+ *  Copyright (C) 2010-2011 Jaiganesh Narayanan <[email protected]>
+ *  Copyright (C) 2008-2011 Gabor Juhos <[email protected]>
  *  Copyright (C) 2008 Imre Kaloz <[email protected]>
  *
+ *  Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP
+ *
  *  This program is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License version 2 as published
  *  by the Free Software Foundation.
@@ -67,6 +70,8 @@ void ath79_device_reset_set(u32 mask)
 		reg = AR913X_RESET_REG_RESET_MODULE;
 	else if (soc_is_ar933x())
 		reg = AR933X_RESET_REG_RESET_MODULE;
+	else if (soc_is_ar934x())
+		reg = AR934X_RESET_REG_RESET_MODULE;
 	else
 		BUG();
 
@@ -91,6 +96,8 @@ void ath79_device_reset_clear(u32 mask)
 		reg = AR913X_RESET_REG_RESET_MODULE;
 	else if (soc_is_ar933x())
 		reg = AR933X_RESET_REG_RESET_MODULE;
+	else if (soc_is_ar934x())
+		reg = AR934X_RESET_REG_RESET_MODULE;
 	else
 		BUG();
 
diff --git a/arch/mips/ath79/dev-common.c b/arch/mips/ath79/dev-common.c
index f4956f809072..45efc63b08b6 100644
--- a/arch/mips/ath79/dev-common.c
+++ b/arch/mips/ath79/dev-common.c
@@ -89,7 +89,8 @@ void __init ath79_register_uart(void)
 
 	if (soc_is_ar71xx() ||
 	    soc_is_ar724x() ||
-	    soc_is_ar913x()) {
+	    soc_is_ar913x() ||
+	    soc_is_ar934x()) {
 		ath79_uart_data[0].uartclk = clk_get_rate(clk);
 		platform_device_register(&ath79_uart_device);
 	} else if (soc_is_ar933x()) {
diff --git a/arch/mips/ath79/dev-gpio-buttons.c b/arch/mips/ath79/dev-gpio-buttons.c
index 4b0168a11c01..366b35fb164d 100644
--- a/arch/mips/ath79/dev-gpio-buttons.c
+++ b/arch/mips/ath79/dev-gpio-buttons.c
@@ -25,12 +25,10 @@ void __init ath79_register_gpio_keys_polled(int id,
 	struct gpio_keys_button *p;
 	int err;
 
-	p = kmalloc(nbuttons * sizeof(*p), GFP_KERNEL);
+	p = kmemdup(buttons, nbuttons * sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return;
 
-	memcpy(p, buttons, nbuttons * sizeof(*p));
-
 	pdev = platform_device_alloc("gpio-keys-polled", id);
 	if (!pdev)
 		goto err_free_buttons;
diff --git a/arch/mips/ath79/dev-leds-gpio.c b/arch/mips/ath79/dev-leds-gpio.c
index cdade68dcd17..dcb1debcefb8 100644
--- a/arch/mips/ath79/dev-leds-gpio.c
+++ b/arch/mips/ath79/dev-leds-gpio.c
@@ -24,12 +24,10 @@ void __init ath79_register_leds_gpio(int id,
 	struct gpio_led *p;
 	int err;
 
-	p = kmalloc(num_leds * sizeof(*p), GFP_KERNEL);
+	p = kmemdup(leds, num_leds * sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return;
 
-	memcpy(p, leds, num_leds * sizeof(*p));
-
 	pdev = platform_device_alloc("leds-gpio", id);
 	if (!pdev)
 		goto err_free_leds;
diff --git a/arch/mips/ath79/dev-wmac.c b/arch/mips/ath79/dev-wmac.c
index 9c717bf98ffe..d6d893c16ad4 100644
--- a/arch/mips/ath79/dev-wmac.c
+++ b/arch/mips/ath79/dev-wmac.c
@@ -1,9 +1,12 @@
 /*
  *  Atheros AR913X/AR933X SoC built-in WMAC device support
  *
+ *  Copyright (C) 2010-2011 Jaiganesh Narayanan <[email protected]>
  *  Copyright (C) 2008-2011 Gabor Juhos <[email protected]>
  *  Copyright (C) 2008 Imre Kaloz <[email protected]>
  *
+ *  Parts of this file are based on Atheros 2.6.15/2.6.31 BSP
+ *
  *  This program is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License version 2 as published
  *  by the Free Software Foundation.
@@ -26,8 +29,7 @@ static struct resource ath79_wmac_resources[] = {
 		/* .start and .end fields are filled dynamically */
 		.flags	= IORESOURCE_MEM,
 	}, {
-		.start	= ATH79_CPU_IRQ_IP2,
-		.end	= ATH79_CPU_IRQ_IP2,
+		/* .start and .end fields are filled dynamically */
 		.flags	= IORESOURCE_IRQ,
 	},
 };
@@ -53,6 +55,8 @@ static void __init ar913x_wmac_setup(void)
 
 	ath79_wmac_resources[0].start = AR913X_WMAC_BASE;
 	ath79_wmac_resources[0].end = AR913X_WMAC_BASE + AR913X_WMAC_SIZE - 1;
+	ath79_wmac_resources[1].start = ATH79_CPU_IRQ_IP2;
+	ath79_wmac_resources[1].end = ATH79_CPU_IRQ_IP2;
 }
 
 
@@ -79,6 +83,8 @@ static void __init ar933x_wmac_setup(void)
 
 	ath79_wmac_resources[0].start = AR933X_WMAC_BASE;
 	ath79_wmac_resources[0].end = AR933X_WMAC_BASE + AR933X_WMAC_SIZE - 1;
+	ath79_wmac_resources[1].start = ATH79_CPU_IRQ_IP2;
+	ath79_wmac_resources[1].end = ATH79_CPU_IRQ_IP2;
 
 	t = ath79_reset_rr(AR933X_RESET_REG_BOOTSTRAP);
 	if (t & AR933X_BOOTSTRAP_REF_CLK_40)
@@ -92,12 +98,32 @@ static void __init ar933x_wmac_setup(void)
 	ath79_wmac_data.external_reset = ar933x_wmac_reset;
 }
 
+static void ar934x_wmac_setup(void)
+{
+	u32 t;
+
+	ath79_wmac_device.name = "ar934x_wmac";
+
+	ath79_wmac_resources[0].start = AR934X_WMAC_BASE;
+	ath79_wmac_resources[0].end = AR934X_WMAC_BASE + AR934X_WMAC_SIZE - 1;
+	ath79_wmac_resources[1].start = ATH79_IP2_IRQ(1);
+	ath79_wmac_resources[1].start = ATH79_IP2_IRQ(1);
+
+	t = ath79_reset_rr(AR934X_RESET_REG_BOOTSTRAP);
+	if (t & AR934X_BOOTSTRAP_REF_CLK_40)
+		ath79_wmac_data.is_clk_25mhz = false;
+	else
+		ath79_wmac_data.is_clk_25mhz = true;
+}
+
 void __init ath79_register_wmac(u8 *cal_data)
 {
 	if (soc_is_ar913x())
 		ar913x_wmac_setup();
 	else if (soc_is_ar933x())
 		ar933x_wmac_setup();
+	else if (soc_is_ar934x())
+		ar934x_wmac_setup();
 	else
 		BUG();
 
diff --git a/arch/mips/ath79/early_printk.c b/arch/mips/ath79/early_printk.c
index 6a51ced7a293..dc938cb2ba58 100644
--- a/arch/mips/ath79/early_printk.c
+++ b/arch/mips/ath79/early_printk.c
@@ -71,6 +71,9 @@ static void prom_putchar_init(void)
 	case REV_ID_MAJOR_AR7241:
 	case REV_ID_MAJOR_AR7242:
 	case REV_ID_MAJOR_AR913X:
+	case REV_ID_MAJOR_AR9341:
+	case REV_ID_MAJOR_AR9342:
+	case REV_ID_MAJOR_AR9344:
 		_prom_putchar = prom_putchar_ar71xx;
 		break;
 
diff --git a/arch/mips/ath79/gpio.c b/arch/mips/ath79/gpio.c
index a2f8ca630ed6..29054f211832 100644
--- a/arch/mips/ath79/gpio.c
+++ b/arch/mips/ath79/gpio.c
@@ -1,9 +1,12 @@
 /*
  *  Atheros AR71XX/AR724X/AR913X GPIO API support
  *
- *  Copyright (C) 2008-2010 Gabor Juhos <[email protected]>
+ *  Copyright (C) 2010-2011 Jaiganesh Narayanan <[email protected]>
+ *  Copyright (C) 2008-2011 Gabor Juhos <[email protected]>
  *  Copyright (C) 2008 Imre Kaloz <[email protected]>
  *
+ *  Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP
+ *
  *  This program is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License version 2 as published
  *  by the Free Software Foundation.
@@ -89,6 +92,42 @@ static int ath79_gpio_direction_output(struct gpio_chip *chip,
 	return 0;
 }
 
+static int ar934x_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
+{
+	void __iomem *base = ath79_gpio_base;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ath79_gpio_lock, flags);
+
+	__raw_writel(__raw_readl(base + AR71XX_GPIO_REG_OE) | (1 << offset),
+		     base + AR71XX_GPIO_REG_OE);
+
+	spin_unlock_irqrestore(&ath79_gpio_lock, flags);
+
+	return 0;
+}
+
+static int ar934x_gpio_direction_output(struct gpio_chip *chip, unsigned offset,
+					int value)
+{
+	void __iomem *base = ath79_gpio_base;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ath79_gpio_lock, flags);
+
+	if (value)
+		__raw_writel(1 << offset, base + AR71XX_GPIO_REG_SET);
+	else
+		__raw_writel(1 << offset, base + AR71XX_GPIO_REG_CLEAR);
+
+	__raw_writel(__raw_readl(base + AR71XX_GPIO_REG_OE) & ~(1 << offset),
+		     base + AR71XX_GPIO_REG_OE);
+
+	spin_unlock_irqrestore(&ath79_gpio_lock, flags);
+
+	return 0;
+}
+
 static struct gpio_chip ath79_gpio_chip = {
 	.label			= "ath79",
 	.get			= ath79_gpio_get_value,
@@ -155,11 +194,17 @@ void __init ath79_gpio_init(void)
 		ath79_gpio_count = AR913X_GPIO_COUNT;
 	else if (soc_is_ar933x())
 		ath79_gpio_count = AR933X_GPIO_COUNT;
+	else if (soc_is_ar934x())
+		ath79_gpio_count = AR934X_GPIO_COUNT;
 	else
 		BUG();
 
 	ath79_gpio_base = ioremap_nocache(AR71XX_GPIO_BASE, AR71XX_GPIO_SIZE);
 	ath79_gpio_chip.ngpio = ath79_gpio_count;
+	if (soc_is_ar934x()) {
+		ath79_gpio_chip.direction_input = ar934x_gpio_direction_input;
+		ath79_gpio_chip.direction_output = ar934x_gpio_direction_output;
+	}
 
 	err = gpiochip_add(&ath79_gpio_chip);
 	if (err)
diff --git a/arch/mips/ath79/irq.c b/arch/mips/ath79/irq.c
index 1b073de44680..90d09fc15398 100644
--- a/arch/mips/ath79/irq.c
+++ b/arch/mips/ath79/irq.c
@@ -1,10 +1,11 @@
 /*
  *  Atheros AR71xx/AR724x/AR913x specific interrupt handling
  *
- *  Copyright (C) 2008-2010 Gabor Juhos <[email protected]>
+ *  Copyright (C) 2010-2011 Jaiganesh Narayanan <[email protected]>
+ *  Copyright (C) 2008-2011 Gabor Juhos <[email protected]>
  *  Copyright (C) 2008 Imre Kaloz <[email protected]>
  *
- *  Parts of this file are based on Atheros' 2.6.15 BSP
+ *  Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP
  *
  *  This program is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License version 2 as published
@@ -23,8 +24,8 @@
 #include <asm/mach-ath79/ar71xx_regs.h>
 #include "common.h"
 
-static unsigned int ath79_ip2_flush_reg;
-static unsigned int ath79_ip3_flush_reg;
+static void (*ath79_ip2_handler)(void);
+static void (*ath79_ip3_handler)(void);
 
 static void ath79_misc_irq_handler(unsigned int irq, struct irq_desc *desc)
 {
@@ -129,7 +130,7 @@ static void __init ath79_misc_irq_init(void)
 
 	if (soc_is_ar71xx() || soc_is_ar913x())
 		ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask;
-	else if (soc_is_ar724x() || soc_is_ar933x())
+	else if (soc_is_ar724x() || soc_is_ar933x() || soc_is_ar934x())
 		ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack;
 	else
 		BUG();
@@ -143,6 +144,39 @@ static void __init ath79_misc_irq_init(void)
 	irq_set_chained_handler(ATH79_CPU_IRQ_MISC, ath79_misc_irq_handler);
 }
 
+static void ar934x_ip2_irq_dispatch(unsigned int irq, struct irq_desc *desc)
+{
+	u32 status;
+
+	disable_irq_nosync(irq);
+
+	status = ath79_reset_rr(AR934X_RESET_REG_PCIE_WMAC_INT_STATUS);
+
+	if (status & AR934X_PCIE_WMAC_INT_PCIE_ALL) {
+		ath79_ddr_wb_flush(AR934X_DDR_REG_FLUSH_PCIE);
+		generic_handle_irq(ATH79_IP2_IRQ(0));
+	} else if (status & AR934X_PCIE_WMAC_INT_WMAC_ALL) {
+		ath79_ddr_wb_flush(AR934X_DDR_REG_FLUSH_WMAC);
+		generic_handle_irq(ATH79_IP2_IRQ(1));
+	} else {
+		spurious_interrupt();
+	}
+
+	enable_irq(irq);
+}
+
+static void ar934x_ip2_irq_init(void)
+{
+	int i;
+
+	for (i = ATH79_IP2_IRQ_BASE;
+	     i < ATH79_IP2_IRQ_BASE + ATH79_IP2_IRQ_COUNT; i++)
+		irq_set_chip_and_handler(i, &dummy_irq_chip,
+					 handle_level_irq);
+
+	irq_set_chained_handler(ATH79_CPU_IRQ_IP2, ar934x_ip2_irq_dispatch);
+}
+
 asmlinkage void plat_irq_dispatch(void)
 {
 	unsigned long pending;
@@ -152,10 +186,8 @@ asmlinkage void plat_irq_dispatch(void)
 	if (pending & STATUSF_IP7)
 		do_IRQ(ATH79_CPU_IRQ_TIMER);
 
-	else if (pending & STATUSF_IP2) {
-		ath79_ddr_wb_flush(ath79_ip2_flush_reg);
-		do_IRQ(ATH79_CPU_IRQ_IP2);
-	}
+	else if (pending & STATUSF_IP2)
+		ath79_ip2_handler();
 
 	else if (pending & STATUSF_IP4)
 		do_IRQ(ATH79_CPU_IRQ_GE0);
@@ -163,10 +195,8 @@ asmlinkage void plat_irq_dispatch(void)
 	else if (pending & STATUSF_IP5)
 		do_IRQ(ATH79_CPU_IRQ_GE1);
 
-	else if (pending & STATUSF_IP3) {
-		ath79_ddr_wb_flush(ath79_ip3_flush_reg);
-		do_IRQ(ATH79_CPU_IRQ_USB);
-	}
+	else if (pending & STATUSF_IP3)
+		ath79_ip3_handler();
 
 	else if (pending & STATUSF_IP6)
 		do_IRQ(ATH79_CPU_IRQ_MISC);
@@ -175,24 +205,97 @@ asmlinkage void plat_irq_dispatch(void)
 		spurious_interrupt();
 }
 
+/*
+ * The IP2/IP3 lines are tied to a PCI/WMAC/USB device. Drivers for
+ * these devices typically allocate coherent DMA memory, however the
+ * DMA controller may still have some unsynchronized data in the FIFO.
+ * Issue a flush in the handlers to ensure that the driver sees
+ * the update.
+ */
+static void ar71xx_ip2_handler(void)
+{
+	ath79_ddr_wb_flush(AR71XX_DDR_REG_FLUSH_PCI);
+	do_IRQ(ATH79_CPU_IRQ_IP2);
+}
+
+static void ar724x_ip2_handler(void)
+{
+	ath79_ddr_wb_flush(AR724X_DDR_REG_FLUSH_PCIE);
+	do_IRQ(ATH79_CPU_IRQ_IP2);
+}
+
+static void ar913x_ip2_handler(void)
+{
+	ath79_ddr_wb_flush(AR913X_DDR_REG_FLUSH_WMAC);
+	do_IRQ(ATH79_CPU_IRQ_IP2);
+}
+
+static void ar933x_ip2_handler(void)
+{
+	ath79_ddr_wb_flush(AR933X_DDR_REG_FLUSH_WMAC);
+	do_IRQ(ATH79_CPU_IRQ_IP2);
+}
+
+static void ar934x_ip2_handler(void)
+{
+	do_IRQ(ATH79_CPU_IRQ_IP2);
+}
+
+static void ar71xx_ip3_handler(void)
+{
+	ath79_ddr_wb_flush(AR71XX_DDR_REG_FLUSH_USB);
+	do_IRQ(ATH79_CPU_IRQ_USB);
+}
+
+static void ar724x_ip3_handler(void)
+{
+	ath79_ddr_wb_flush(AR724X_DDR_REG_FLUSH_USB);
+	do_IRQ(ATH79_CPU_IRQ_USB);
+}
+
+static void ar913x_ip3_handler(void)
+{
+	ath79_ddr_wb_flush(AR913X_DDR_REG_FLUSH_USB);
+	do_IRQ(ATH79_CPU_IRQ_USB);
+}
+
+static void ar933x_ip3_handler(void)
+{
+	ath79_ddr_wb_flush(AR933X_DDR_REG_FLUSH_USB);
+	do_IRQ(ATH79_CPU_IRQ_USB);
+}
+
+static void ar934x_ip3_handler(void)
+{
+	ath79_ddr_wb_flush(AR934X_DDR_REG_FLUSH_USB);
+	do_IRQ(ATH79_CPU_IRQ_USB);
+}
+
 void __init arch_init_irq(void)
 {
 	if (soc_is_ar71xx()) {
-		ath79_ip2_flush_reg = AR71XX_DDR_REG_FLUSH_PCI;
-		ath79_ip3_flush_reg = AR71XX_DDR_REG_FLUSH_USB;
+		ath79_ip2_handler = ar71xx_ip2_handler;
+		ath79_ip3_handler = ar71xx_ip3_handler;
 	} else if (soc_is_ar724x()) {
-		ath79_ip2_flush_reg = AR724X_DDR_REG_FLUSH_PCIE;
-		ath79_ip3_flush_reg = AR724X_DDR_REG_FLUSH_USB;
+		ath79_ip2_handler = ar724x_ip2_handler;
+		ath79_ip3_handler = ar724x_ip3_handler;
 	} else if (soc_is_ar913x()) {
-		ath79_ip2_flush_reg = AR913X_DDR_REG_FLUSH_WMAC;
-		ath79_ip3_flush_reg = AR913X_DDR_REG_FLUSH_USB;
+		ath79_ip2_handler = ar913x_ip2_handler;
+		ath79_ip3_handler = ar913x_ip3_handler;
 	} else if (soc_is_ar933x()) {
-		ath79_ip2_flush_reg = AR933X_DDR_REG_FLUSH_WMAC;
-		ath79_ip3_flush_reg = AR933X_DDR_REG_FLUSH_USB;
-	} else
+		ath79_ip2_handler = ar933x_ip2_handler;
+		ath79_ip3_handler = ar933x_ip3_handler;
+	} else if (soc_is_ar934x()) {
+		ath79_ip2_handler = ar934x_ip2_handler;
+		ath79_ip3_handler = ar934x_ip3_handler;
+	} else {
 		BUG();
+	}
 
 	cp0_perfcount_irq = ATH79_MISC_IRQ_PERFC;
 	mips_cpu_irq_init();
 	ath79_misc_irq_init();
+
+	if (soc_is_ar934x())
+		ar934x_ip2_irq_init();
 }
diff --git a/arch/mips/ath79/mach-db120.c b/arch/mips/ath79/mach-db120.c
new file mode 100644
index 000000000000..1983e4d2af4b
--- /dev/null
+++ b/arch/mips/ath79/mach-db120.c
@@ -0,0 +1,134 @@
+/*
+ * Atheros DB120 reference board support
+ *
+ * Copyright (c) 2011 Qualcomm Atheros
+ * Copyright (c) 2011 Gabor Juhos <[email protected]>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/ath9k_platform.h>
+
+#include "machtypes.h"
+#include "dev-gpio-buttons.h"
+#include "dev-leds-gpio.h"
+#include "dev-spi.h"
+#include "dev-wmac.h"
+#include "pci.h"
+
+#define DB120_GPIO_LED_WLAN_5G		12
+#define DB120_GPIO_LED_WLAN_2G		13
+#define DB120_GPIO_LED_STATUS		14
+#define DB120_GPIO_LED_WPS		15
+
+#define DB120_GPIO_BTN_WPS		16
+
+#define DB120_KEYS_POLL_INTERVAL	20	/* msecs */
+#define DB120_KEYS_DEBOUNCE_INTERVAL	(3 * DB120_KEYS_POLL_INTERVAL)
+
+#define DB120_WMAC_CALDATA_OFFSET 0x1000
+#define DB120_PCIE_CALDATA_OFFSET 0x5000
+
+static struct gpio_led db120_leds_gpio[] __initdata = {
+	{
+		.name		= "db120:green:status",
+		.gpio		= DB120_GPIO_LED_STATUS,
+		.active_low	= 1,
+	},
+	{
+		.name		= "db120:green:wps",
+		.gpio		= DB120_GPIO_LED_WPS,
+		.active_low	= 1,
+	},
+	{
+		.name		= "db120:green:wlan-5g",
+		.gpio		= DB120_GPIO_LED_WLAN_5G,
+		.active_low	= 1,
+	},
+	{
+		.name		= "db120:green:wlan-2g",
+		.gpio		= DB120_GPIO_LED_WLAN_2G,
+		.active_low	= 1,
+	},
+};
+
+static struct gpio_keys_button db120_gpio_keys[] __initdata = {
+	{
+		.desc		= "WPS button",
+		.type		= EV_KEY,
+		.code		= KEY_WPS_BUTTON,
+		.debounce_interval = DB120_KEYS_DEBOUNCE_INTERVAL,
+		.gpio		= DB120_GPIO_BTN_WPS,
+		.active_low	= 1,
+	},
+};
+
+static struct spi_board_info db120_spi_info[] = {
+	{
+		.bus_num	= 0,
+		.chip_select	= 0,
+		.max_speed_hz	= 25000000,
+		.modalias	= "s25sl064a",
+	}
+};
+
+static struct ath79_spi_platform_data db120_spi_data = {
+	.bus_num	= 0,
+	.num_chipselect	= 1,
+};
+
+#ifdef CONFIG_PCI
+static struct ath9k_platform_data db120_ath9k_data;
+
+static int db120_pci_plat_dev_init(struct pci_dev *dev)
+{
+	switch (PCI_SLOT(dev->devfn)) {
+	case 0:
+		dev->dev.platform_data = &db120_ath9k_data;
+		break;
+	}
+
+	return 0;
+}
+
+static void __init db120_pci_init(u8 *eeprom)
+{
+	memcpy(db120_ath9k_data.eeprom_data, eeprom,
+	       sizeof(db120_ath9k_data.eeprom_data));
+
+	ath79_pci_set_plat_dev_init(db120_pci_plat_dev_init);
+	ath79_register_pci();
+}
+#else
+static inline void db120_pci_init(void) {}
+#endif /* CONFIG_PCI */
+
+static void __init db120_setup(void)
+{
+	u8 *art = (u8 *) KSEG1ADDR(0x1fff0000);
+
+	ath79_register_leds_gpio(-1, ARRAY_SIZE(db120_leds_gpio),
+				 db120_leds_gpio);
+	ath79_register_gpio_keys_polled(-1, DB120_KEYS_POLL_INTERVAL,
+					ARRAY_SIZE(db120_gpio_keys),
+					db120_gpio_keys);
+	ath79_register_spi(&db120_spi_data, db120_spi_info,
+			   ARRAY_SIZE(db120_spi_info));
+	ath79_register_wmac(art + DB120_WMAC_CALDATA_OFFSET);
+	db120_pci_init(art + DB120_PCIE_CALDATA_OFFSET);
+}
+
+MIPS_MACHINE(ATH79_MACH_DB120, "DB120", "Atheros DB120 reference board",
+	     db120_setup);
diff --git a/arch/mips/ath79/mach-pb44.c b/arch/mips/ath79/mach-pb44.c
index fe9701a32291..c5f0ea5e00c3 100644
--- a/arch/mips/ath79/mach-pb44.c
+++ b/arch/mips/ath79/mach-pb44.c
@@ -19,6 +19,7 @@
 #include "dev-leds-gpio.h"
 #include "dev-spi.h"
 #include "dev-usb.h"
+#include "pci.h"
 
 #define PB44_GPIO_I2C_SCL	0
 #define PB44_GPIO_I2C_SDA	1
@@ -114,6 +115,7 @@ static void __init pb44_init(void)
 	ath79_register_spi(&pb44_spi_data, pb44_spi_info,
 			   ARRAY_SIZE(pb44_spi_info));
 	ath79_register_usb();
+	ath79_register_pci();
 }
 
 MIPS_MACHINE(ATH79_MACH_PB44, "PB44", "Atheros PB44 reference board",
diff --git a/arch/mips/ath79/mach-ubnt-xm.c b/arch/mips/ath79/mach-ubnt-xm.c
index 3c311a539347..4a3c60694c75 100644
--- a/arch/mips/ath79/mach-ubnt-xm.c
+++ b/arch/mips/ath79/mach-ubnt-xm.c
@@ -12,16 +12,15 @@
 
 #include <linux/init.h>
 #include <linux/pci.h>
-
-#ifdef CONFIG_PCI
 #include <linux/ath9k_platform.h>
-#include <asm/mach-ath79/pci-ath724x.h>
-#endif /* CONFIG_PCI */
+
+#include <asm/mach-ath79/irq.h>
 
 #include "machtypes.h"
 #include "dev-gpio-buttons.h"
 #include "dev-leds-gpio.h"
 #include "dev-spi.h"
+#include "pci.h"
 
 #define UBNT_XM_GPIO_LED_L1		0
 #define UBNT_XM_GPIO_LED_L2		1
@@ -33,7 +32,6 @@
 #define UBNT_XM_KEYS_POLL_INTERVAL	20
 #define UBNT_XM_KEYS_DEBOUNCE_INTERVAL	(3 * UBNT_XM_KEYS_POLL_INTERVAL)
 
-#define UBNT_XM_PCI_IRQ			48
 #define UBNT_XM_EEPROM_ADDR		(u8 *) KSEG1ADDR(0x1fff1000)
 
 static struct gpio_led ubnt_xm_leds_gpio[] __initdata = {
@@ -84,12 +82,27 @@ static struct ath79_spi_platform_data ubnt_xm_spi_data = {
 #ifdef CONFIG_PCI
 static struct ath9k_platform_data ubnt_xm_eeprom_data;
 
-static struct ath724x_pci_data ubnt_xm_pci_data[] = {
-	{
-		.irq	= UBNT_XM_PCI_IRQ,
-		.pdata	= &ubnt_xm_eeprom_data,
-	},
-};
+static int ubnt_xm_pci_plat_dev_init(struct pci_dev *dev)
+{
+	switch (PCI_SLOT(dev->devfn)) {
+	case 0:
+		dev->dev.platform_data = &ubnt_xm_eeprom_data;
+		break;
+	}
+
+	return 0;
+}
+
+static void __init ubnt_xm_pci_init(void)
+{
+	memcpy(ubnt_xm_eeprom_data.eeprom_data, UBNT_XM_EEPROM_ADDR,
+	       sizeof(ubnt_xm_eeprom_data.eeprom_data));
+
+	ath79_pci_set_plat_dev_init(ubnt_xm_pci_plat_dev_init);
+	ath79_register_pci();
+}
+#else
+static inline void ubnt_xm_pci_init(void) {}
 #endif /* CONFIG_PCI */
 
 static void __init ubnt_xm_init(void)
@@ -104,13 +117,7 @@ static void __init ubnt_xm_init(void)
 	ath79_register_spi(&ubnt_xm_spi_data, ubnt_xm_spi_info,
 			   ARRAY_SIZE(ubnt_xm_spi_info));
 
-#ifdef CONFIG_PCI
-	memcpy(ubnt_xm_eeprom_data.eeprom_data, UBNT_XM_EEPROM_ADDR,
-	       sizeof(ubnt_xm_eeprom_data.eeprom_data));
-
-	ath724x_pci_add_data(ubnt_xm_pci_data, ARRAY_SIZE(ubnt_xm_pci_data));
-#endif /* CONFIG_PCI */
-
+	ubnt_xm_pci_init();
 }
 
 MIPS_MACHINE(ATH79_MACH_UBNT_XM,
diff --git a/arch/mips/ath79/machtypes.h b/arch/mips/ath79/machtypes.h
index 9a1f3826626e..af92e5c30d66 100644
--- a/arch/mips/ath79/machtypes.h
+++ b/arch/mips/ath79/machtypes.h
@@ -18,6 +18,7 @@ enum ath79_mach_type {
 	ATH79_MACH_GENERIC = 0,
 	ATH79_MACH_AP121,		/* Atheros AP121 reference board */
 	ATH79_MACH_AP81,		/* Atheros AP81 reference board */
+	ATH79_MACH_DB120,		/* Atheros DB120 reference board */
 	ATH79_MACH_PB44,		/* Atheros PB44 reference board */
 	ATH79_MACH_UBNT_XM,		/* Ubiquiti Networks XM board rev 1.0 */
 };
diff --git a/arch/mips/ath79/pci.c b/arch/mips/ath79/pci.c
new file mode 100644
index 000000000000..ca83abd9d31e
--- /dev/null
+++ b/arch/mips/ath79/pci.c
@@ -0,0 +1,130 @@
+/*
+ *  Atheros AR71XX/AR724X specific PCI setup code
+ *
+ *  Copyright (C) 2011 René Bolldorf <[email protected]>
+ *  Copyright (C) 2008-2011 Gabor Juhos <[email protected]>
+ *  Copyright (C) 2008 Imre Kaloz <[email protected]>
+ *
+ *  Parts of this file are based on Atheros' 2.6.15 BSP
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <asm/mach-ath79/ar71xx_regs.h>
+#include <asm/mach-ath79/ath79.h>
+#include <asm/mach-ath79/irq.h>
+#include <asm/mach-ath79/pci.h>
+#include "pci.h"
+
+static int (*ath79_pci_plat_dev_init)(struct pci_dev *dev);
+static const struct ath79_pci_irq *ath79_pci_irq_map __initdata;
+static unsigned ath79_pci_nr_irqs __initdata;
+
+static const struct ath79_pci_irq ar71xx_pci_irq_map[] __initconst = {
+	{
+		.slot	= 17,
+		.pin	= 1,
+		.irq	= ATH79_PCI_IRQ(0),
+	}, {
+		.slot	= 18,
+		.pin	= 1,
+		.irq	= ATH79_PCI_IRQ(1),
+	}, {
+		.slot	= 19,
+		.pin	= 1,
+		.irq	= ATH79_PCI_IRQ(2),
+	}
+};
+
+static const struct ath79_pci_irq ar724x_pci_irq_map[] __initconst = {
+	{
+		.slot	= 0,
+		.pin	= 1,
+		.irq	= ATH79_PCI_IRQ(0),
+	}
+};
+
+int __init pcibios_map_irq(const struct pci_dev *dev, uint8_t slot, uint8_t pin)
+{
+	int irq = -1;
+	int i;
+
+	if (ath79_pci_nr_irqs == 0 ||
+	    ath79_pci_irq_map == NULL) {
+		if (soc_is_ar71xx()) {
+			ath79_pci_irq_map = ar71xx_pci_irq_map;
+			ath79_pci_nr_irqs = ARRAY_SIZE(ar71xx_pci_irq_map);
+		} else if (soc_is_ar724x() ||
+			   soc_is_ar9342() ||
+			   soc_is_ar9344()) {
+			ath79_pci_irq_map = ar724x_pci_irq_map;
+			ath79_pci_nr_irqs = ARRAY_SIZE(ar724x_pci_irq_map);
+		} else {
+			pr_crit("pci %s: invalid irq map\n",
+				pci_name((struct pci_dev *) dev));
+			return irq;
+		}
+	}
+
+	for (i = 0; i < ath79_pci_nr_irqs; i++) {
+		const struct ath79_pci_irq *entry;
+
+		entry = &ath79_pci_irq_map[i];
+		if (entry->slot == slot && entry->pin == pin) {
+			irq = entry->irq;
+			break;
+		}
+	}
+
+	if (irq < 0)
+		pr_crit("pci %s: no irq found for pin %u\n",
+			pci_name((struct pci_dev *) dev), pin);
+	else
+		pr_info("pci %s: using irq %d for pin %u\n",
+			pci_name((struct pci_dev *) dev), irq, pin);
+
+	return irq;
+}
+
+int pcibios_plat_dev_init(struct pci_dev *dev)
+{
+	if (ath79_pci_plat_dev_init)
+		return ath79_pci_plat_dev_init(dev);
+
+	return 0;
+}
+
+void __init ath79_pci_set_irq_map(unsigned nr_irqs,
+				  const struct ath79_pci_irq *map)
+{
+	ath79_pci_nr_irqs = nr_irqs;
+	ath79_pci_irq_map = map;
+}
+
+void __init ath79_pci_set_plat_dev_init(int (*func)(struct pci_dev *dev))
+{
+	ath79_pci_plat_dev_init = func;
+}
+
+int __init ath79_register_pci(void)
+{
+	if (soc_is_ar71xx())
+		return ar71xx_pcibios_init();
+
+	if (soc_is_ar724x())
+		return ar724x_pcibios_init(ATH79_CPU_IRQ_IP2);
+
+	if (soc_is_ar9342() || soc_is_ar9344()) {
+		u32 bootstrap;
+
+		bootstrap = ath79_reset_rr(AR934X_RESET_REG_BOOTSTRAP);
+		if (bootstrap & AR934X_BOOTSTRAP_PCIE_RC)
+			return ar724x_pcibios_init(ATH79_IP2_IRQ(0));
+	}
+
+	return -ENODEV;
+}
diff --git a/arch/mips/ath79/pci.h b/arch/mips/ath79/pci.h
new file mode 100644
index 000000000000..51c6625dcc6d
--- /dev/null
+++ b/arch/mips/ath79/pci.h
@@ -0,0 +1,34 @@
+/*
+ *  Atheros AR71XX/AR724X PCI support
+ *
+ *  Copyright (C) 2011 René Bolldorf <[email protected]>
+ *  Copyright (C) 2008-2011 Gabor Juhos <[email protected]>
+ *  Copyright (C) 2008 Imre Kaloz <[email protected]>
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ */
+
+#ifndef _ATH79_PCI_H
+#define _ATH79_PCI_H
+
+struct ath79_pci_irq {
+	u8	slot;
+	u8	pin;
+	int	irq;
+};
+
+#ifdef CONFIG_PCI
+void ath79_pci_set_irq_map(unsigned nr_irqs, const struct ath79_pci_irq *map);
+void ath79_pci_set_plat_dev_init(int (*func)(struct pci_dev *dev));
+int ath79_register_pci(void);
+#else
+static inline void
+ath79_pci_set_irq_map(unsigned nr_irqs, const struct ath79_pci_irq *map) {}
+static inline void
+ath79_pci_set_plat_dev_init(int (*func)(struct pci_dev *)) {}
+static inline int ath79_register_pci(void) { return 0; }
+#endif
+
+#endif /* _ATH79_PCI_H */
diff --git a/arch/mips/ath79/setup.c b/arch/mips/ath79/setup.c
index 80a7d4023d7f..60d212ef8629 100644
--- a/arch/mips/ath79/setup.c
+++ b/arch/mips/ath79/setup.c
@@ -1,10 +1,11 @@
 /*
  *  Atheros AR71XX/AR724X/AR913X specific setup
  *
+ *  Copyright (C) 2010-2011 Jaiganesh Narayanan <[email protected]>
  *  Copyright (C) 2008-2011 Gabor Juhos <[email protected]>
  *  Copyright (C) 2008 Imre Kaloz <[email protected]>
  *
- *  Parts of this file are based on Atheros' 2.6.15 BSP
+ *  Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP
  *
  *  This program is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License version 2 as published
@@ -116,18 +117,6 @@ static void __init ath79_detect_sys_type(void)
 		rev = id & AR724X_REV_ID_REVISION_MASK;
 		break;
 
-	case REV_ID_MAJOR_AR9330:
-		ath79_soc = ATH79_SOC_AR9330;
-		chip = "9330";
-		rev = id & AR933X_REV_ID_REVISION_MASK;
-		break;
-
-	case REV_ID_MAJOR_AR9331:
-		ath79_soc = ATH79_SOC_AR9331;
-		chip = "9331";
-		rev = id & AR933X_REV_ID_REVISION_MASK;
-		break;
-
 	case REV_ID_MAJOR_AR913X:
 		minor = id & AR913X_REV_ID_MINOR_MASK;
 		rev = id >> AR913X_REV_ID_REVISION_SHIFT;
@@ -145,6 +134,36 @@ static void __init ath79_detect_sys_type(void)
 		}
 		break;
 
+	case REV_ID_MAJOR_AR9330:
+		ath79_soc = ATH79_SOC_AR9330;
+		chip = "9330";
+		rev = id & AR933X_REV_ID_REVISION_MASK;
+		break;
+
+	case REV_ID_MAJOR_AR9331:
+		ath79_soc = ATH79_SOC_AR9331;
+		chip = "9331";
+		rev = id & AR933X_REV_ID_REVISION_MASK;
+		break;
+
+	case REV_ID_MAJOR_AR9341:
+		ath79_soc = ATH79_SOC_AR9341;
+		chip = "9341";
+		rev = id & AR934X_REV_ID_REVISION_MASK;
+		break;
+
+	case REV_ID_MAJOR_AR9342:
+		ath79_soc = ATH79_SOC_AR9342;
+		chip = "9342";
+		rev = id & AR934X_REV_ID_REVISION_MASK;
+		break;
+
+	case REV_ID_MAJOR_AR9344:
+		ath79_soc = ATH79_SOC_AR9344;
+		chip = "9344";
+		rev = id & AR934X_REV_ID_REVISION_MASK;
+		break;
+
 	default:
 		panic("ath79: unknown SoC, id:0x%08x", id);
 	}
diff --git a/arch/mips/bcm63xx/boards/Makefile b/arch/mips/bcm63xx/boards/Makefile
index 9f64fb414077..af07c1aa202f 100644
--- a/arch/mips/bcm63xx/boards/Makefile
+++ b/arch/mips/bcm63xx/boards/Makefile
@@ -1,3 +1 @@
 obj-$(CONFIG_BOARD_BCM963XX)		+= board_bcm963xx.o
-
-ccflags-y := -Werror
diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c
index d3a9f012aa0a..260dc247c052 100644
--- a/arch/mips/cavium-octeon/setup.c
+++ b/arch/mips/cavium-octeon/setup.c
@@ -9,6 +9,7 @@
 #include <linux/init.h>
 #include <linux/console.h>
 #include <linux/delay.h>
+#include <linux/export.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/serial.h>
diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c
index 97e7ce9b50ed..4b93048044eb 100644
--- a/arch/mips/cavium-octeon/smp.c
+++ b/arch/mips/cavium-octeon/smp.c
@@ -257,8 +257,6 @@ DEFINE_PER_CPU(int, cpu_state);
 
 extern void fixup_irqs(void);
 
-static DEFINE_SPINLOCK(smp_reserve_lock);
-
 static int octeon_cpu_disable(void)
 {
 	unsigned int cpu = smp_processor_id();
@@ -266,8 +264,6 @@ static int octeon_cpu_disable(void)
 	if (cpu == 0)
 		return -EBUSY;
 
-	spin_lock(&smp_reserve_lock);
-
 	set_cpu_online(cpu, false);
 	cpu_clear(cpu, cpu_callin_map);
 	local_irq_disable();
@@ -277,8 +273,6 @@ static int octeon_cpu_disable(void)
 	flush_cache_all();
 	local_flush_tlb_all();
 
-	spin_unlock(&smp_reserve_lock);
-
 	return 0;
 }
 
diff --git a/arch/mips/fw/arc/Makefile b/arch/mips/fw/arc/Makefile
index 5314b37aff2c..4f349ec1ea2d 100644
--- a/arch/mips/fw/arc/Makefile
+++ b/arch/mips/fw/arc/Makefile
@@ -8,5 +8,3 @@ lib-y				+= cmdline.o env.o file.o identify.o init.o \
 lib-$(CONFIG_ARC_MEMORY)	+= memory.o
 lib-$(CONFIG_ARC_CONSOLE)	+= arc_con.o
 lib-$(CONFIG_ARC_PROMLIB)	+= promlib.o
-
-ccflags-y			:= -Werror
diff --git a/arch/mips/include/asm/clkdev.h b/arch/mips/include/asm/clkdev.h
new file mode 100644
index 000000000000..262475414e5f
--- /dev/null
+++ b/arch/mips/include/asm/clkdev.h
@@ -0,0 +1,25 @@
+/*
+ *  based on arch/arm/include/asm/clkdev.h
+ *
+ *  Copyright (C) 2008 Russell King.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Helper for the clk API to assist looking up a struct clk.
+ */
+#ifndef __ASM_CLKDEV_H
+#define __ASM_CLKDEV_H
+
+#include <linux/slab.h>
+
+#define __clk_get(clk)	({ 1; })
+#define __clk_put(clk)	do { } while (0)
+
+static inline struct clk_lookup_alloc *__clkdev_alloc(size_t size)
+{
+	return kzalloc(size, GFP_KERNEL);
+}
+
+#endif
diff --git a/arch/mips/include/asm/mach-ath79/ar71xx_regs.h b/arch/mips/include/asm/mach-ath79/ar71xx_regs.h
index 2f0becb4ec8f..1caa78ad06d5 100644
--- a/arch/mips/include/asm/mach-ath79/ar71xx_regs.h
+++ b/arch/mips/include/asm/mach-ath79/ar71xx_regs.h
@@ -1,10 +1,11 @@
 /*
  *  Atheros AR71XX/AR724X/AR913X SoC register definitions
  *
+ *  Copyright (C) 2010-2011 Jaiganesh Narayanan <[email protected]>
  *  Copyright (C) 2008-2010 Gabor Juhos <[email protected]>
  *  Copyright (C) 2008 Imre Kaloz <[email protected]>
  *
- *  Parts of this file are based on Atheros' 2.6.15 BSP
+ *  Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP
  *
  *  This program is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License version 2 as published
@@ -60,6 +61,9 @@
 #define AR933X_EHCI_BASE	0x1b000000
 #define AR933X_EHCI_SIZE	0x1000
 
+#define AR934X_WMAC_BASE	(AR71XX_APB_BASE + 0x00100000)
+#define AR934X_WMAC_SIZE	0x20000
+
 /*
  * DDR_CTRL block
  */
@@ -91,6 +95,12 @@
 #define AR933X_DDR_REG_FLUSH_USB	0x84
 #define AR933X_DDR_REG_FLUSH_WMAC	0x88
 
+#define AR934X_DDR_REG_FLUSH_GE0	0x9c
+#define AR934X_DDR_REG_FLUSH_GE1	0xa0
+#define AR934X_DDR_REG_FLUSH_USB	0xa4
+#define AR934X_DDR_REG_FLUSH_PCIE	0xa8
+#define AR934X_DDR_REG_FLUSH_WMAC	0xac
+
 /*
  * PLL block
  */
@@ -150,6 +160,41 @@
 #define AR933X_PLL_CLOCK_CTRL_AHB_DIV_SHIFT	15
 #define AR933X_PLL_CLOCK_CTRL_AHB_DIV_MASK	0x7
 
+#define AR934X_PLL_CPU_CONFIG_REG		0x00
+#define AR934X_PLL_DDR_CONFIG_REG		0x04
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_REG		0x08
+
+#define AR934X_PLL_CPU_CONFIG_NFRAC_SHIFT	0
+#define AR934X_PLL_CPU_CONFIG_NFRAC_MASK	0x3f
+#define AR934X_PLL_CPU_CONFIG_NINT_SHIFT	6
+#define AR934X_PLL_CPU_CONFIG_NINT_MASK		0x3f
+#define AR934X_PLL_CPU_CONFIG_REFDIV_SHIFT	12
+#define AR934X_PLL_CPU_CONFIG_REFDIV_MASK	0x1f
+#define AR934X_PLL_CPU_CONFIG_OUTDIV_SHIFT	19
+#define AR934X_PLL_CPU_CONFIG_OUTDIV_MASK	0x3
+
+#define AR934X_PLL_DDR_CONFIG_NFRAC_SHIFT	0
+#define AR934X_PLL_DDR_CONFIG_NFRAC_MASK	0x3ff
+#define AR934X_PLL_DDR_CONFIG_NINT_SHIFT	10
+#define AR934X_PLL_DDR_CONFIG_NINT_MASK		0x3f
+#define AR934X_PLL_DDR_CONFIG_REFDIV_SHIFT	16
+#define AR934X_PLL_DDR_CONFIG_REFDIV_MASK	0x1f
+#define AR934X_PLL_DDR_CONFIG_OUTDIV_SHIFT	23
+#define AR934X_PLL_DDR_CONFIG_OUTDIV_MASK	0x7
+
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_CPU_PLL_BYPASS	BIT(2)
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_DDR_PLL_BYPASS	BIT(3)
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_AHB_PLL_BYPASS	BIT(4)
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_CPU_POST_DIV_SHIFT	5
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_CPU_POST_DIV_MASK	0x1f
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_DDR_POST_DIV_SHIFT	10
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_DDR_POST_DIV_MASK	0x1f
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_AHB_POST_DIV_SHIFT	15
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_AHB_POST_DIV_MASK	0x1f
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_CPUCLK_FROM_CPUPLL	BIT(20)
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_DDRCLK_FROM_DDRPLL	BIT(21)
+#define AR934X_PLL_CPU_DDR_CLK_CTRL_AHBCLK_FROM_DDRPLL	BIT(24)
+
 /*
  * USB_CONFIG block
  */
@@ -185,6 +230,10 @@
 #define AR933X_RESET_REG_RESET_MODULE		0x1c
 #define AR933X_RESET_REG_BOOTSTRAP		0xac
 
+#define AR934X_RESET_REG_RESET_MODULE		0x1c
+#define AR934X_RESET_REG_BOOTSTRAP		0xb0
+#define AR934X_RESET_REG_PCIE_WMAC_INT_STATUS	0xac
+
 #define MISC_INT_ETHSW			BIT(12)
 #define MISC_INT_TIMER4			BIT(10)
 #define MISC_INT_TIMER3			BIT(9)
@@ -241,6 +290,40 @@
 
 #define AR933X_BOOTSTRAP_REF_CLK_40	BIT(0)
 
+#define AR934X_BOOTSTRAP_SW_OPTION8	BIT(23)
+#define AR934X_BOOTSTRAP_SW_OPTION7	BIT(22)
+#define AR934X_BOOTSTRAP_SW_OPTION6	BIT(21)
+#define AR934X_BOOTSTRAP_SW_OPTION5	BIT(20)
+#define AR934X_BOOTSTRAP_SW_OPTION4	BIT(19)
+#define AR934X_BOOTSTRAP_SW_OPTION3	BIT(18)
+#define AR934X_BOOTSTRAP_SW_OPTION2	BIT(17)
+#define AR934X_BOOTSTRAP_SW_OPTION1	BIT(16)
+#define AR934X_BOOTSTRAP_USB_MODE_DEVICE BIT(7)
+#define AR934X_BOOTSTRAP_PCIE_RC	BIT(6)
+#define AR934X_BOOTSTRAP_EJTAG_MODE	BIT(5)
+#define AR934X_BOOTSTRAP_REF_CLK_40	BIT(4)
+#define AR934X_BOOTSTRAP_BOOT_FROM_SPI	BIT(2)
+#define AR934X_BOOTSTRAP_SDRAM_DISABLED	BIT(1)
+#define AR934X_BOOTSTRAP_DDR1		BIT(0)
+
+#define AR934X_PCIE_WMAC_INT_WMAC_MISC		BIT(0)
+#define AR934X_PCIE_WMAC_INT_WMAC_TX		BIT(1)
+#define AR934X_PCIE_WMAC_INT_WMAC_RXLP		BIT(2)
+#define AR934X_PCIE_WMAC_INT_WMAC_RXHP		BIT(3)
+#define AR934X_PCIE_WMAC_INT_PCIE_RC		BIT(4)
+#define AR934X_PCIE_WMAC_INT_PCIE_RC0		BIT(5)
+#define AR934X_PCIE_WMAC_INT_PCIE_RC1		BIT(6)
+#define AR934X_PCIE_WMAC_INT_PCIE_RC2		BIT(7)
+#define AR934X_PCIE_WMAC_INT_PCIE_RC3		BIT(8)
+#define AR934X_PCIE_WMAC_INT_WMAC_ALL \
+	(AR934X_PCIE_WMAC_INT_WMAC_MISC | AR934X_PCIE_WMAC_INT_WMAC_TX | \
+	 AR934X_PCIE_WMAC_INT_WMAC_RXLP | AR934X_PCIE_WMAC_INT_WMAC_RXHP)
+
+#define AR934X_PCIE_WMAC_INT_PCIE_ALL \
+	(AR934X_PCIE_WMAC_INT_PCIE_RC | AR934X_PCIE_WMAC_INT_PCIE_RC0 | \
+	 AR934X_PCIE_WMAC_INT_PCIE_RC1 | AR934X_PCIE_WMAC_INT_PCIE_RC2 | \
+	 AR934X_PCIE_WMAC_INT_PCIE_RC3)
+
 #define REV_ID_MAJOR_MASK		0xfff0
 #define REV_ID_MAJOR_AR71XX		0x00a0
 #define REV_ID_MAJOR_AR913X		0x00b0
@@ -249,6 +332,9 @@
 #define REV_ID_MAJOR_AR7242		0x1100
 #define REV_ID_MAJOR_AR9330		0x0110
 #define REV_ID_MAJOR_AR9331		0x1110
+#define REV_ID_MAJOR_AR9341		0x0120
+#define REV_ID_MAJOR_AR9342		0x1120
+#define REV_ID_MAJOR_AR9344		0x2120
 
 #define AR71XX_REV_ID_MINOR_MASK	0x3
 #define AR71XX_REV_ID_MINOR_AR7130	0x0
@@ -267,6 +353,8 @@
 
 #define AR724X_REV_ID_REVISION_MASK	0x3
 
+#define AR934X_REV_ID_REVISION_MASK     0xf
+
 /*
  * SPI block
  */
@@ -308,5 +396,6 @@
 #define AR724X_GPIO_COUNT		18
 #define AR913X_GPIO_COUNT		22
 #define AR933X_GPIO_COUNT		30
+#define AR934X_GPIO_COUNT		23
 
 #endif /* __ASM_MACH_AR71XX_REGS_H */
diff --git a/arch/mips/include/asm/mach-ath79/ath79.h b/arch/mips/include/asm/mach-ath79/ath79.h
index 6d0c6c9d5622..4f248c3d7b23 100644
--- a/arch/mips/include/asm/mach-ath79/ath79.h
+++ b/arch/mips/include/asm/mach-ath79/ath79.h
@@ -29,6 +29,9 @@ enum ath79_soc_type {
 	ATH79_SOC_AR9132,
 	ATH79_SOC_AR9330,
 	ATH79_SOC_AR9331,
+	ATH79_SOC_AR9341,
+	ATH79_SOC_AR9342,
+	ATH79_SOC_AR9344,
 };
 
 extern enum ath79_soc_type ath79_soc;
@@ -75,6 +78,26 @@ static inline int soc_is_ar933x(void)
 		ath79_soc == ATH79_SOC_AR9331);
 }
 
+static inline int soc_is_ar9341(void)
+{
+	return (ath79_soc == ATH79_SOC_AR9341);
+}
+
+static inline int soc_is_ar9342(void)
+{
+	return (ath79_soc == ATH79_SOC_AR9342);
+}
+
+static inline int soc_is_ar9344(void)
+{
+	return (ath79_soc == ATH79_SOC_AR9344);
+}
+
+static inline int soc_is_ar934x(void)
+{
+	return soc_is_ar9341() || soc_is_ar9342() || soc_is_ar9344();
+}
+
 extern void __iomem *ath79_ddr_base;
 extern void __iomem *ath79_pll_base;
 extern void __iomem *ath79_reset_base;
diff --git a/arch/mips/include/asm/mach-ath79/irq.h b/arch/mips/include/asm/mach-ath79/irq.h
index 519958fe4e3c..0968f69e2018 100644
--- a/arch/mips/include/asm/mach-ath79/irq.h
+++ b/arch/mips/include/asm/mach-ath79/irq.h
@@ -10,11 +10,19 @@
 #define __ASM_MACH_ATH79_IRQ_H
 
 #define MIPS_CPU_IRQ_BASE	0
-#define NR_IRQS			40
+#define NR_IRQS			48
 
 #define ATH79_MISC_IRQ_BASE	8
 #define ATH79_MISC_IRQ_COUNT	32
 
+#define ATH79_PCI_IRQ_BASE	(ATH79_MISC_IRQ_BASE + ATH79_MISC_IRQ_COUNT)
+#define ATH79_PCI_IRQ_COUNT	6
+#define ATH79_PCI_IRQ(_x)	(ATH79_PCI_IRQ_BASE + (_x))
+
+#define ATH79_IP2_IRQ_BASE	(ATH79_PCI_IRQ_BASE + ATH79_PCI_IRQ_COUNT)
+#define ATH79_IP2_IRQ_COUNT	2
+#define ATH79_IP2_IRQ(_x)	(ATH79_IP2_IRQ_BASE + (_x))
+
 #define ATH79_CPU_IRQ_IP2	(MIPS_CPU_IRQ_BASE + 2)
 #define ATH79_CPU_IRQ_USB	(MIPS_CPU_IRQ_BASE + 3)
 #define ATH79_CPU_IRQ_GE0	(MIPS_CPU_IRQ_BASE + 4)
diff --git a/arch/mips/include/asm/mach-ath79/pci-ath724x.h b/arch/mips/include/asm/mach-ath79/pci-ath724x.h
deleted file mode 100644
index 454885fa30c3..000000000000
--- a/arch/mips/include/asm/mach-ath79/pci-ath724x.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- *  Atheros 724x PCI support
- *
- *  Copyright (C) 2011 René Bolldorf <[email protected]>
- *
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- */
-
-#ifndef __ASM_MACH_ATH79_PCI_ATH724X_H
-#define __ASM_MACH_ATH79_PCI_ATH724X_H
-
-struct ath724x_pci_data {
-	int irq;
-	void *pdata;
-};
-
-void ath724x_pci_add_data(struct ath724x_pci_data *data, int size);
-
-#endif /* __ASM_MACH_ATH79_PCI_ATH724X_H */
diff --git a/arch/mips/include/asm/mach-ath79/pci.h b/arch/mips/include/asm/mach-ath79/pci.h
new file mode 100644
index 000000000000..7868f7fa028f
--- /dev/null
+++ b/arch/mips/include/asm/mach-ath79/pci.h
@@ -0,0 +1,28 @@
+/*
+ *  Atheros AR71XX/AR724X PCI support
+ *
+ *  Copyright (C) 2011 René Bolldorf <[email protected]>
+ *  Copyright (C) 2008-2011 Gabor Juhos <[email protected]>
+ *  Copyright (C) 2008 Imre Kaloz <[email protected]>
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ */
+
+#ifndef __ASM_MACH_ATH79_PCI_H
+#define __ASM_MACH_ATH79_PCI_H
+
+#if defined(CONFIG_PCI) && defined(CONFIG_SOC_AR71XX)
+int ar71xx_pcibios_init(void);
+#else
+static inline int ar71xx_pcibios_init(void) { return 0; }
+#endif
+
+#if defined(CONFIG_PCI_AR724X)
+int ar724x_pcibios_init(int irq);
+#else
+static inline int ar724x_pcibios_init(int irq) { return 0; }
+#endif
+
+#endif /* __ASM_MACH_ATH79_PCI_H */
diff --git a/arch/mips/include/asm/mach-bcm63xx/bcm63xx_gpio.h b/arch/mips/include/asm/mach-bcm63xx/bcm63xx_gpio.h
index 3d5de96d4036..1d7dd96aa460 100644
--- a/arch/mips/include/asm/mach-bcm63xx/bcm63xx_gpio.h
+++ b/arch/mips/include/asm/mach-bcm63xx/bcm63xx_gpio.h
@@ -2,6 +2,7 @@
 #define BCM63XX_GPIO_H
 
 #include <linux/init.h>
+#include <bcm63xx_cpu.h>
 
 int __init bcm63xx_gpio_init(void);
 
diff --git a/arch/mips/include/asm/mach-lantiq/falcon/falcon_irq.h b/arch/mips/include/asm/mach-lantiq/falcon/falcon_irq.h
new file mode 100644
index 000000000000..318f982f04ff
--- /dev/null
+++ b/arch/mips/include/asm/mach-lantiq/falcon/falcon_irq.h
@@ -0,0 +1,23 @@
+/*
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ *
+ *  Copyright (C) 2010 Thomas Langer <[email protected]>
+ */
+
+#ifndef _FALCON_IRQ__
+#define _FALCON_IRQ__
+
+#define INT_NUM_IRQ0			8
+#define INT_NUM_IM0_IRL0		(INT_NUM_IRQ0 + 0)
+#define INT_NUM_IM1_IRL0		(INT_NUM_IM0_IRL0 + 32)
+#define INT_NUM_IM2_IRL0		(INT_NUM_IM1_IRL0 + 32)
+#define INT_NUM_IM3_IRL0		(INT_NUM_IM2_IRL0 + 32)
+#define INT_NUM_IM4_IRL0		(INT_NUM_IM3_IRL0 + 32)
+#define INT_NUM_EXTRA_START		(INT_NUM_IM4_IRL0 + 32)
+#define INT_NUM_IM_OFFSET		(INT_NUM_IM1_IRL0 - INT_NUM_IM0_IRL0)
+
+#define MIPS_CPU_TIMER_IRQ			7
+
+#endif /* _FALCON_IRQ__ */
diff --git a/arch/mips/include/asm/mach-lantiq/falcon/irq.h b/arch/mips/include/asm/mach-lantiq/falcon/irq.h
new file mode 100644
index 000000000000..2caccd9f9dbc
--- /dev/null
+++ b/arch/mips/include/asm/mach-lantiq/falcon/irq.h
@@ -0,0 +1,18 @@
+/*
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ *
+ *  Copyright (C) 2011 Thomas Langer <[email protected]>
+ */
+
+#ifndef __FALCON_IRQ_H
+#define __FALCON_IRQ_H
+
+#include <falcon_irq.h>
+
+#define NR_IRQS 328
+
+#include_next <irq.h>
+
+#endif
diff --git a/arch/mips/include/asm/mach-lantiq/falcon/lantiq_soc.h b/arch/mips/include/asm/mach-lantiq/falcon/lantiq_soc.h
new file mode 100644
index 000000000000..b385252584ee
--- /dev/null
+++ b/arch/mips/include/asm/mach-lantiq/falcon/lantiq_soc.h
@@ -0,0 +1,67 @@
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Copyright (C) 2010 John Crispin <[email protected]>
+ */
+
+#ifndef _LTQ_FALCON_H__
+#define _LTQ_FALCON_H__
+
+#ifdef CONFIG_SOC_FALCON
+
+#include <linux/pinctrl/pinctrl.h>
+#include <lantiq.h>
+
+/* Chip IDs */
+#define SOC_ID_FALCON		0x01B8
+
+/* SoC Types */
+#define SOC_TYPE_FALCON		0x01
+
+/*
+ * during early_printk no ioremap possible at this early stage
+ * lets use KSEG1 instead
+ */
+#define LTQ_ASC0_BASE_ADDR	0x1E100C00
+#define LTQ_EARLY_ASC		KSEG1ADDR(LTQ_ASC0_BASE_ADDR)
+
+/* WDT */
+#define LTQ_RST_CAUSE_WDTRST	0x0002
+
+/* CHIP ID */
+#define LTQ_STATUS_BASE_ADDR	0x1E802000
+
+#define FALCON_CHIPID		((u32 *)(KSEG1 + LTQ_STATUS_BASE_ADDR + 0x0c))
+#define FALCON_CHIPTYPE		((u32 *)(KSEG1 + LTQ_STATUS_BASE_ADDR + 0x38))
+#define FALCON_CHIPCONF		((u32 *)(KSEG1 + LTQ_STATUS_BASE_ADDR + 0x40))
+
+/* SYSCTL - start/stop/restart/configure/... different parts of the Soc */
+#define SYSCTL_SYS1		0
+#define SYSCTL_SYSETH		1
+#define SYSCTL_SYSGPE		2
+
+/* BOOT_SEL - find what boot media we have */
+#define BS_FLASH		0x1
+#define BS_SPI                  0x4
+
+/* global register ranges */
+extern __iomem void *ltq_ebu_membase;
+extern __iomem void *ltq_sys1_membase;
+#define ltq_ebu_w32(x, y)	ltq_w32((x), ltq_ebu_membase + (y))
+#define ltq_ebu_r32(x)		ltq_r32(ltq_ebu_membase + (x))
+
+#define ltq_sys1_w32(x, y)	ltq_w32((x), ltq_sys1_membase + (y))
+#define ltq_sys1_r32(x)		ltq_r32(ltq_sys1_membase + (x))
+#define ltq_sys1_w32_mask(clear, set, reg)   \
+	ltq_sys1_w32((ltq_sys1_r32(reg) & ~(clear)) | (set), reg)
+
+/*
+ * to keep the irq code generic we need to define this to 0 as falcon
+ * has no EIU/EBU
+ */
+#define LTQ_EBU_PCC_ISTAT	0
+
+#endif /* CONFIG_SOC_FALCON */
+#endif /* _LTQ_XWAY_H__ */
diff --git a/arch/mips/include/asm/mach-lantiq/gpio.h b/arch/mips/include/asm/mach-lantiq/gpio.h
new file mode 100644
index 000000000000..f79505b43609
--- /dev/null
+++ b/arch/mips/include/asm/mach-lantiq/gpio.h
@@ -0,0 +1,16 @@
+#ifndef __ASM_MIPS_MACH_LANTIQ_GPIO_H
+#define __ASM_MIPS_MACH_LANTIQ_GPIO_H
+
+static inline int gpio_to_irq(unsigned int gpio)
+{
+	return -1;
+}
+
+#define gpio_get_value __gpio_get_value
+#define gpio_set_value __gpio_set_value
+
+#define gpio_cansleep __gpio_cansleep
+
+#include <asm-generic/gpio.h>
+
+#endif
diff --git a/arch/mips/include/asm/mach-lantiq/lantiq.h b/arch/mips/include/asm/mach-lantiq/lantiq.h
index ce2f02929d22..5e8a6e965756 100644
--- a/arch/mips/include/asm/mach-lantiq/lantiq.h
+++ b/arch/mips/include/asm/mach-lantiq/lantiq.h
@@ -9,6 +9,8 @@
 #define _LANTIQ_H__
 
 #include <linux/irq.h>
+#include <linux/device.h>
+#include <linux/clk.h>
 
 /* generic reg access functions */
 #define ltq_r32(reg)		__raw_readl(reg)
@@ -21,25 +23,9 @@
 /* register access macros for EBU and CGU */
 #define ltq_ebu_w32(x, y)	ltq_w32((x), ltq_ebu_membase + (y))
 #define ltq_ebu_r32(x)		ltq_r32(ltq_ebu_membase + (x))
-#define ltq_cgu_w32(x, y)	ltq_w32((x), ltq_cgu_membase + (y))
-#define ltq_cgu_r32(x)		ltq_r32(ltq_cgu_membase + (x))
-
+#define ltq_ebu_w32_mask(x, y, z) \
+	ltq_w32_mask(x, y, ltq_ebu_membase + (z))
 extern __iomem void *ltq_ebu_membase;
-extern __iomem void *ltq_cgu_membase;
-
-extern unsigned int ltq_get_cpu_ver(void);
-extern unsigned int ltq_get_soc_type(void);
-
-/* clock speeds */
-#define CLOCK_60M	60000000
-#define CLOCK_83M	83333333
-#define CLOCK_111M	111111111
-#define CLOCK_133M	133333333
-#define CLOCK_167M	166666667
-#define CLOCK_200M	200000000
-#define CLOCK_266M	266666666
-#define CLOCK_333M	333333333
-#define CLOCK_400M	400000000
 
 /* spinlock all ebu i/o */
 extern spinlock_t ebu_lock;
@@ -49,15 +35,21 @@ extern void ltq_disable_irq(struct irq_data *data);
 extern void ltq_mask_and_ack_irq(struct irq_data *data);
 extern void ltq_enable_irq(struct irq_data *data);
 
+/* clock handling */
+extern int clk_activate(struct clk *clk);
+extern void clk_deactivate(struct clk *clk);
+extern struct clk *clk_get_cpu(void);
+extern struct clk *clk_get_fpi(void);
+extern struct clk *clk_get_io(void);
+
+/* find out what bootsource we have */
+extern unsigned char ltq_boot_select(void);
 /* find out what caused the last cpu reset */
 extern int ltq_reset_cause(void);
-#define LTQ_RST_CAUSE_WDTRST	0x20
 
 #define IOPORT_RESOURCE_START	0x10000000
 #define IOPORT_RESOURCE_END	0xffffffff
 #define IOMEM_RESOURCE_START	0x10000000
 #define IOMEM_RESOURCE_END	0xffffffff
-#define LTQ_FLASH_START		0x10000000
-#define LTQ_FLASH_MAX		0x04000000
 
 #endif
diff --git a/arch/mips/include/asm/mach-lantiq/lantiq_platform.h b/arch/mips/include/asm/mach-lantiq/lantiq_platform.h
index a305f1d0259e..e23bf7c9a2d0 100644
--- a/arch/mips/include/asm/mach-lantiq/lantiq_platform.h
+++ b/arch/mips/include/asm/mach-lantiq/lantiq_platform.h
@@ -9,41 +9,8 @@
 #ifndef _LANTIQ_PLATFORM_H__
 #define _LANTIQ_PLATFORM_H__
 
-#include <linux/mtd/partitions.h>
 #include <linux/socket.h>
 
-/* struct used to pass info to the pci core */
-enum {
-	PCI_CLOCK_INT = 0,
-	PCI_CLOCK_EXT
-};
-
-#define PCI_EXIN0	0x0001
-#define PCI_EXIN1	0x0002
-#define PCI_EXIN2	0x0004
-#define PCI_EXIN3	0x0008
-#define PCI_EXIN4	0x0010
-#define PCI_EXIN5	0x0020
-#define PCI_EXIN_MAX	6
-
-#define PCI_GNT1	0x0040
-#define PCI_GNT2	0x0080
-#define PCI_GNT3	0x0100
-#define PCI_GNT4	0x0200
-
-#define PCI_REQ1	0x0400
-#define PCI_REQ2	0x0800
-#define PCI_REQ3	0x1000
-#define PCI_REQ4	0x2000
-#define PCI_REQ_SHIFT	10
-#define PCI_REQ_MASK	0xf
-
-struct ltq_pci_data {
-	int clock;
-	int gpio;
-	int irq[16];
-};
-
 /* struct used to pass info to network drivers */
 struct ltq_eth_data {
 	struct sockaddr mac;
diff --git a/arch/mips/include/asm/mach-lantiq/xway/lantiq_irq.h b/arch/mips/include/asm/mach-lantiq/xway/lantiq_irq.h
index b4465a888e20..aa0b3b866f84 100644
--- a/arch/mips/include/asm/mach-lantiq/xway/lantiq_irq.h
+++ b/arch/mips/include/asm/mach-lantiq/xway/lantiq_irq.h
@@ -17,50 +17,8 @@
 #define INT_NUM_IM4_IRL0	(INT_NUM_IRQ0 + 128)
 #define INT_NUM_IM_OFFSET	(INT_NUM_IM1_IRL0 - INT_NUM_IM0_IRL0)
 
-#define LTQ_ASC_TIR(x)		(INT_NUM_IM3_IRL0 + (x * 8))
-#define LTQ_ASC_RIR(x)		(INT_NUM_IM3_IRL0 + (x * 8) + 1)
-#define LTQ_ASC_EIR(x)		(INT_NUM_IM3_IRL0 + (x * 8) + 2)
-
-#define LTQ_ASC_ASE_TIR		INT_NUM_IM2_IRL0
-#define LTQ_ASC_ASE_RIR		(INT_NUM_IM2_IRL0 + 2)
-#define LTQ_ASC_ASE_EIR		(INT_NUM_IM2_IRL0 + 3)
-
-#define LTQ_SSC_TIR		(INT_NUM_IM0_IRL0 + 15)
-#define LTQ_SSC_RIR		(INT_NUM_IM0_IRL0 + 14)
-#define LTQ_SSC_EIR		(INT_NUM_IM0_IRL0 + 16)
-
-#define LTQ_MEI_DYING_GASP_INT	(INT_NUM_IM1_IRL0 + 21)
-#define LTQ_MEI_INT		(INT_NUM_IM1_IRL0 + 23)
-
-#define LTQ_TIMER6_INT		(INT_NUM_IM1_IRL0 + 23)
-#define LTQ_USB_INT		(INT_NUM_IM1_IRL0 + 22)
-#define LTQ_USB_OC_INT		(INT_NUM_IM4_IRL0 + 23)
-
-#define MIPS_CPU_TIMER_IRQ		7
-
 #define LTQ_DMA_CH0_INT		(INT_NUM_IM2_IRL0)
-#define LTQ_DMA_CH1_INT		(INT_NUM_IM2_IRL0 + 1)
-#define LTQ_DMA_CH2_INT		(INT_NUM_IM2_IRL0 + 2)
-#define LTQ_DMA_CH3_INT		(INT_NUM_IM2_IRL0 + 3)
-#define LTQ_DMA_CH4_INT		(INT_NUM_IM2_IRL0 + 4)
-#define LTQ_DMA_CH5_INT		(INT_NUM_IM2_IRL0 + 5)
-#define LTQ_DMA_CH6_INT		(INT_NUM_IM2_IRL0 + 6)
-#define LTQ_DMA_CH7_INT		(INT_NUM_IM2_IRL0 + 7)
-#define LTQ_DMA_CH8_INT		(INT_NUM_IM2_IRL0 + 8)
-#define LTQ_DMA_CH9_INT		(INT_NUM_IM2_IRL0 + 9)
-#define LTQ_DMA_CH10_INT	(INT_NUM_IM2_IRL0 + 10)
-#define LTQ_DMA_CH11_INT	(INT_NUM_IM2_IRL0 + 11)
-#define LTQ_DMA_CH12_INT	(INT_NUM_IM2_IRL0 + 25)
-#define LTQ_DMA_CH13_INT	(INT_NUM_IM2_IRL0 + 26)
-#define LTQ_DMA_CH14_INT	(INT_NUM_IM2_IRL0 + 27)
-#define LTQ_DMA_CH15_INT	(INT_NUM_IM2_IRL0 + 28)
-#define LTQ_DMA_CH16_INT	(INT_NUM_IM2_IRL0 + 29)
-#define LTQ_DMA_CH17_INT	(INT_NUM_IM2_IRL0 + 30)
-#define LTQ_DMA_CH18_INT	(INT_NUM_IM2_IRL0 + 16)
-#define LTQ_DMA_CH19_INT	(INT_NUM_IM2_IRL0 + 21)
-
-#define LTQ_PPE_MBOX_INT	(INT_NUM_IM2_IRL0 + 24)
 
-#define INT_NUM_IM4_IRL14	(INT_NUM_IM4_IRL0 + 14)
+#define MIPS_CPU_TIMER_IRQ	7
 
 #endif
diff --git a/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h b/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h
index 8a3c6be669d2..6a2df709c576 100644
--- a/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h
+++ b/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h
@@ -17,38 +17,56 @@
 #define SOC_ID_DANUBE1		0x129
 #define SOC_ID_DANUBE2		0x12B
 #define SOC_ID_TWINPASS		0x12D
-#define SOC_ID_AMAZON_SE	0x152
+#define SOC_ID_AMAZON_SE_1	0x152 /* 50601 */
+#define SOC_ID_AMAZON_SE_2	0x153 /* 50600 */
 #define SOC_ID_ARX188		0x16C
-#define SOC_ID_ARX168		0x16D
+#define SOC_ID_ARX168_1		0x16D
+#define SOC_ID_ARX168_2		0x16E
 #define SOC_ID_ARX182		0x16F
-
-/* SoC Types */
+#define SOC_ID_GRX188		0x170
+#define SOC_ID_GRX168		0x171
+
+#define SOC_ID_VRX288		0x1C0 /* v1.1 */
+#define SOC_ID_VRX282		0x1C1 /* v1.1 */
+#define SOC_ID_VRX268		0x1C2 /* v1.1 */
+#define SOC_ID_GRX268		0x1C8 /* v1.1 */
+#define SOC_ID_GRX288		0x1C9 /* v1.1 */
+#define SOC_ID_VRX288_2		0x00B /* v1.2 */
+#define SOC_ID_VRX268_2		0x00C /* v1.2 */
+#define SOC_ID_GRX288_2		0x00D /* v1.2 */
+#define SOC_ID_GRX282_2		0x00E /* v1.2 */
+
+ /* SoC Types */
 #define SOC_TYPE_DANUBE		0x01
 #define SOC_TYPE_TWINPASS	0x02
 #define SOC_TYPE_AR9		0x03
-#define SOC_TYPE_VR9		0x04
-#define SOC_TYPE_AMAZON_SE	0x05
+#define SOC_TYPE_VR9		0x04 /* v1.1 */
+#define SOC_TYPE_VR9_2		0x05 /* v1.2 */
+#define SOC_TYPE_AMAZON_SE	0x06
+
+/* BOOT_SEL - find what boot media we have */
+#define BS_EXT_ROM		0x0
+#define BS_FLASH		0x1
+#define BS_MII0			0x2
+#define BS_PCI			0x3
+#define BS_UART1		0x4
+#define BS_SPI			0x5
+#define BS_NAND			0x6
+#define BS_RMII0		0x7
+
+/* helpers used to access the cgu */
+#define ltq_cgu_w32(x, y)	ltq_w32((x), ltq_cgu_membase + (y))
+#define ltq_cgu_r32(x)		ltq_r32(ltq_cgu_membase + (x))
+extern __iomem void *ltq_cgu_membase;
 
-/* ASC0/1 - serial port */
-#define LTQ_ASC0_BASE_ADDR	0x1E100400
+/*
+ * during early_printk no ioremap is possible
+ * lets use KSEG1 instead
+ */
 #define LTQ_ASC1_BASE_ADDR	0x1E100C00
-#define LTQ_ASC_SIZE		0x400
-
-/* RCU - reset control unit */
-#define LTQ_RCU_BASE_ADDR	0x1F203000
-#define LTQ_RCU_SIZE		0x1000
-
-/* GPTU - general purpose timer unit */
-#define LTQ_GPTU_BASE_ADDR	0x18000300
-#define LTQ_GPTU_SIZE		0x100
+#define LTQ_EARLY_ASC		KSEG1ADDR(LTQ_ASC1_BASE_ADDR)
 
 /* EBU - external bus unit */
-#define LTQ_EBU_GPIO_START	0x14000000
-#define LTQ_EBU_GPIO_SIZE	0x1000
-
-#define LTQ_EBU_BASE_ADDR	0x1E105300
-#define LTQ_EBU_SIZE		0x100
-
 #define LTQ_EBU_BUSCON0		0x0060
 #define LTQ_EBU_PCC_CON		0x0090
 #define LTQ_EBU_PCC_IEN		0x00A4
@@ -57,85 +75,17 @@
 #define LTQ_EBU_ADDRSEL1	0x0024
 #define EBU_WRDIS		0x80000000
 
-/* CGU - clock generation unit */
-#define LTQ_CGU_BASE_ADDR	0x1F103000
-#define LTQ_CGU_SIZE		0x1000
-
-/* ICU - interrupt control unit */
-#define LTQ_ICU_BASE_ADDR	0x1F880200
-#define LTQ_ICU_SIZE		0x100
-
-/* EIU - external interrupt unit */
-#define LTQ_EIU_BASE_ADDR	0x1F101000
-#define LTQ_EIU_SIZE		0x1000
-
-/* PMU - power management unit */
-#define LTQ_PMU_BASE_ADDR	0x1F102000
-#define LTQ_PMU_SIZE		0x1000
-
-#define PMU_DMA			0x0020
-#define PMU_USB			0x8041
-#define PMU_LED			0x0800
-#define PMU_GPT			0x1000
-#define PMU_PPE			0x2000
-#define PMU_FPI			0x4000
-#define PMU_SWITCH		0x10000000
-
-/* ETOP - ethernet */
-#define LTQ_ETOP_BASE_ADDR	0x1E180000
-#define LTQ_ETOP_SIZE		0x40000
-
-/* DMA */
-#define LTQ_DMA_BASE_ADDR	0x1E104100
-#define LTQ_DMA_SIZE		0x800
-
-/* PCI */
-#define PCI_CR_BASE_ADDR	0x1E105400
-#define PCI_CR_SIZE		0x400
-
 /* WDT */
-#define LTQ_WDT_BASE_ADDR	0x1F8803F0
-#define LTQ_WDT_SIZE		0x10
-
-/* STP - serial to parallel conversion unit */
-#define LTQ_STP_BASE_ADDR	0x1E100BB0
-#define LTQ_STP_SIZE		0x40
-
-/* GPIO */
-#define LTQ_GPIO0_BASE_ADDR	0x1E100B10
-#define LTQ_GPIO1_BASE_ADDR	0x1E100B40
-#define LTQ_GPIO2_BASE_ADDR	0x1E100B70
-#define LTQ_GPIO_SIZE		0x30
-
-/* SSC */
-#define LTQ_SSC_BASE_ADDR	0x1e100800
-#define LTQ_SSC_SIZE		0x100
-
-/* MEI - dsl core */
-#define LTQ_MEI_BASE_ADDR	0x1E116000
-
-/* DEU - data encryption unit */
-#define LTQ_DEU_BASE_ADDR	0x1E103100
+#define LTQ_RST_CAUSE_WDTRST	0x20
 
 /* MPS - multi processor unit (voice) */
 #define LTQ_MPS_BASE_ADDR	(KSEG1 + 0x1F107000)
 #define LTQ_MPS_CHIPID		((u32 *)(LTQ_MPS_BASE_ADDR + 0x0344))
 
 /* request a non-gpio and set the PIO config */
-extern int  ltq_gpio_request(unsigned int pin, unsigned int alt0,
-	unsigned int alt1, unsigned int dir, const char *name);
+#define PMU_PPE			 BIT(13)
 extern void ltq_pmu_enable(unsigned int module);
 extern void ltq_pmu_disable(unsigned int module);
 
-static inline int ltq_is_ar9(void)
-{
-	return (ltq_get_soc_type() == SOC_TYPE_AR9);
-}
-
-static inline int ltq_is_vr9(void)
-{
-	return (ltq_get_soc_type() == SOC_TYPE_VR9);
-}
-
 #endif /* CONFIG_SOC_TYPE_XWAY */
 #endif /* _LTQ_XWAY_H__ */
diff --git a/arch/mips/include/asm/mips-boards/generic.h b/arch/mips/include/asm/mips-boards/generic.h
index 46c08563e532..6e23ceb0ba8c 100644
--- a/arch/mips/include/asm/mips-boards/generic.h
+++ b/arch/mips/include/asm/mips-boards/generic.h
@@ -93,8 +93,4 @@ extern void mips_pcibios_init(void);
 #define mips_pcibios_init() do { } while (0)
 #endif
 
-#ifdef CONFIG_KGDB
-extern void kgdb_config(void);
-#endif
-
 #endif  /* __ASM_MIPS_BOARDS_GENERIC_H */
diff --git a/arch/mips/include/asm/module.h b/arch/mips/include/asm/module.h
index 7467d1d933d5..530008048c62 100644
--- a/arch/mips/include/asm/module.h
+++ b/arch/mips/include/asm/module.h
@@ -2,6 +2,7 @@
 #define _ASM_MODULE_H
 
 #include <linux/list.h>
+#include <linux/elf.h>
 #include <asm/uaccess.h>
 
 struct mod_arch_specific {
diff --git a/arch/mips/include/asm/octeon/cvmx-pcieep-defs.h b/arch/mips/include/asm/octeon/cvmx-pcieep-defs.h
deleted file mode 100644
index d553f8e88df6..000000000000
--- a/arch/mips/include/asm/octeon/cvmx-pcieep-defs.h
+++ /dev/null
@@ -1,1365 +0,0 @@
-/***********************license start***************
- * Author: Cavium Networks
- *
- * Contact: [email protected]
- * This file is part of the OCTEON SDK
- *
- * Copyright (c) 2003-2008 Cavium Networks
- *
- * This file is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, Version 2, as
- * published by the Free Software Foundation.
- *
- * This file is distributed in the hope that it will be useful, but
- * AS-IS and WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, TITLE, or
- * NONINFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this file; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- * or visit http://www.gnu.org/licenses/.
- *
- * This file may also be available under a different license from Cavium.
- * Contact Cavium Networks for more information
- ***********************license end**************************************/
-
-#ifndef __CVMX_PCIEEP_DEFS_H__
-#define __CVMX_PCIEEP_DEFS_H__
-
-#define CVMX_PCIEEP_CFG000 \
-	 (0x0000000000000000ull)
-#define CVMX_PCIEEP_CFG001 \
-	 (0x0000000000000004ull)
-#define CVMX_PCIEEP_CFG002 \
-	 (0x0000000000000008ull)
-#define CVMX_PCIEEP_CFG003 \
-	 (0x000000000000000Cull)
-#define CVMX_PCIEEP_CFG004 \
-	 (0x0000000000000010ull)
-#define CVMX_PCIEEP_CFG004_MASK \
-	 (0x0000000080000010ull)
-#define CVMX_PCIEEP_CFG005 \
-	 (0x0000000000000014ull)
-#define CVMX_PCIEEP_CFG005_MASK \
-	 (0x0000000080000014ull)
-#define CVMX_PCIEEP_CFG006 \
-	 (0x0000000000000018ull)
-#define CVMX_PCIEEP_CFG006_MASK \
-	 (0x0000000080000018ull)
-#define CVMX_PCIEEP_CFG007 \
-	 (0x000000000000001Cull)
-#define CVMX_PCIEEP_CFG007_MASK \
-	 (0x000000008000001Cull)
-#define CVMX_PCIEEP_CFG008 \
-	 (0x0000000000000020ull)
-#define CVMX_PCIEEP_CFG008_MASK \
-	 (0x0000000080000020ull)
-#define CVMX_PCIEEP_CFG009 \
-	 (0x0000000000000024ull)
-#define CVMX_PCIEEP_CFG009_MASK \
-	 (0x0000000080000024ull)
-#define CVMX_PCIEEP_CFG010 \
-	 (0x0000000000000028ull)
-#define CVMX_PCIEEP_CFG011 \
-	 (0x000000000000002Cull)
-#define CVMX_PCIEEP_CFG012 \
-	 (0x0000000000000030ull)
-#define CVMX_PCIEEP_CFG012_MASK \
-	 (0x0000000080000030ull)
-#define CVMX_PCIEEP_CFG013 \
-	 (0x0000000000000034ull)
-#define CVMX_PCIEEP_CFG015 \
-	 (0x000000000000003Cull)
-#define CVMX_PCIEEP_CFG016 \
-	 (0x0000000000000040ull)
-#define CVMX_PCIEEP_CFG017 \
-	 (0x0000000000000044ull)
-#define CVMX_PCIEEP_CFG020 \
-	 (0x0000000000000050ull)
-#define CVMX_PCIEEP_CFG021 \
-	 (0x0000000000000054ull)
-#define CVMX_PCIEEP_CFG022 \
-	 (0x0000000000000058ull)
-#define CVMX_PCIEEP_CFG023 \
-	 (0x000000000000005Cull)
-#define CVMX_PCIEEP_CFG028 \
-	 (0x0000000000000070ull)
-#define CVMX_PCIEEP_CFG029 \
-	 (0x0000000000000074ull)
-#define CVMX_PCIEEP_CFG030 \
-	 (0x0000000000000078ull)
-#define CVMX_PCIEEP_CFG031 \
-	 (0x000000000000007Cull)
-#define CVMX_PCIEEP_CFG032 \
-	 (0x0000000000000080ull)
-#define CVMX_PCIEEP_CFG033 \
-	 (0x0000000000000084ull)
-#define CVMX_PCIEEP_CFG034 \
-	 (0x0000000000000088ull)
-#define CVMX_PCIEEP_CFG037 \
-	 (0x0000000000000094ull)
-#define CVMX_PCIEEP_CFG038 \
-	 (0x0000000000000098ull)
-#define CVMX_PCIEEP_CFG039 \
-	 (0x000000000000009Cull)
-#define CVMX_PCIEEP_CFG040 \
-	 (0x00000000000000A0ull)
-#define CVMX_PCIEEP_CFG041 \
-	 (0x00000000000000A4ull)
-#define CVMX_PCIEEP_CFG042 \
-	 (0x00000000000000A8ull)
-#define CVMX_PCIEEP_CFG064 \
-	 (0x0000000000000100ull)
-#define CVMX_PCIEEP_CFG065 \
-	 (0x0000000000000104ull)
-#define CVMX_PCIEEP_CFG066 \
-	 (0x0000000000000108ull)
-#define CVMX_PCIEEP_CFG067 \
-	 (0x000000000000010Cull)
-#define CVMX_PCIEEP_CFG068 \
-	 (0x0000000000000110ull)
-#define CVMX_PCIEEP_CFG069 \
-	 (0x0000000000000114ull)
-#define CVMX_PCIEEP_CFG070 \
-	 (0x0000000000000118ull)
-#define CVMX_PCIEEP_CFG071 \
-	 (0x000000000000011Cull)
-#define CVMX_PCIEEP_CFG072 \
-	 (0x0000000000000120ull)
-#define CVMX_PCIEEP_CFG073 \
-	 (0x0000000000000124ull)
-#define CVMX_PCIEEP_CFG074 \
-	 (0x0000000000000128ull)
-#define CVMX_PCIEEP_CFG448 \
-	 (0x0000000000000700ull)
-#define CVMX_PCIEEP_CFG449 \
-	 (0x0000000000000704ull)
-#define CVMX_PCIEEP_CFG450 \
-	 (0x0000000000000708ull)
-#define CVMX_PCIEEP_CFG451 \
-	 (0x000000000000070Cull)
-#define CVMX_PCIEEP_CFG452 \
-	 (0x0000000000000710ull)
-#define CVMX_PCIEEP_CFG453 \
-	 (0x0000000000000714ull)
-#define CVMX_PCIEEP_CFG454 \
-	 (0x0000000000000718ull)
-#define CVMX_PCIEEP_CFG455 \
-	 (0x000000000000071Cull)
-#define CVMX_PCIEEP_CFG456 \
-	 (0x0000000000000720ull)
-#define CVMX_PCIEEP_CFG458 \
-	 (0x0000000000000728ull)
-#define CVMX_PCIEEP_CFG459 \
-	 (0x000000000000072Cull)
-#define CVMX_PCIEEP_CFG460 \
-	 (0x0000000000000730ull)
-#define CVMX_PCIEEP_CFG461 \
-	 (0x0000000000000734ull)
-#define CVMX_PCIEEP_CFG462 \
-	 (0x0000000000000738ull)
-#define CVMX_PCIEEP_CFG463 \
-	 (0x000000000000073Cull)
-#define CVMX_PCIEEP_CFG464 \
-	 (0x0000000000000740ull)
-#define CVMX_PCIEEP_CFG465 \
-	 (0x0000000000000744ull)
-#define CVMX_PCIEEP_CFG466 \
-	 (0x0000000000000748ull)
-#define CVMX_PCIEEP_CFG467 \
-	 (0x000000000000074Cull)
-#define CVMX_PCIEEP_CFG468 \
-	 (0x0000000000000750ull)
-#define CVMX_PCIEEP_CFG490 \
-	 (0x00000000000007A8ull)
-#define CVMX_PCIEEP_CFG491 \
-	 (0x00000000000007ACull)
-#define CVMX_PCIEEP_CFG492 \
-	 (0x00000000000007B0ull)
-#define CVMX_PCIEEP_CFG516 \
-	 (0x0000000000000810ull)
-#define CVMX_PCIEEP_CFG517 \
-	 (0x0000000000000814ull)
-
-union cvmx_pcieep_cfg000 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg000_s {
-		uint32_t devid:16;
-		uint32_t vendid:16;
-	} s;
-	struct cvmx_pcieep_cfg000_s cn52xx;
-	struct cvmx_pcieep_cfg000_s cn52xxp1;
-	struct cvmx_pcieep_cfg000_s cn56xx;
-	struct cvmx_pcieep_cfg000_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg001 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg001_s {
-		uint32_t dpe:1;
-		uint32_t sse:1;
-		uint32_t rma:1;
-		uint32_t rta:1;
-		uint32_t sta:1;
-		uint32_t devt:2;
-		uint32_t mdpe:1;
-		uint32_t fbb:1;
-		uint32_t reserved_22_22:1;
-		uint32_t m66:1;
-		uint32_t cl:1;
-		uint32_t i_stat:1;
-		uint32_t reserved_11_18:8;
-		uint32_t i_dis:1;
-		uint32_t fbbe:1;
-		uint32_t see:1;
-		uint32_t ids_wcc:1;
-		uint32_t per:1;
-		uint32_t vps:1;
-		uint32_t mwice:1;
-		uint32_t scse:1;
-		uint32_t me:1;
-		uint32_t msae:1;
-		uint32_t isae:1;
-	} s;
-	struct cvmx_pcieep_cfg001_s cn52xx;
-	struct cvmx_pcieep_cfg001_s cn52xxp1;
-	struct cvmx_pcieep_cfg001_s cn56xx;
-	struct cvmx_pcieep_cfg001_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg002 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg002_s {
-		uint32_t bcc:8;
-		uint32_t sc:8;
-		uint32_t pi:8;
-		uint32_t rid:8;
-	} s;
-	struct cvmx_pcieep_cfg002_s cn52xx;
-	struct cvmx_pcieep_cfg002_s cn52xxp1;
-	struct cvmx_pcieep_cfg002_s cn56xx;
-	struct cvmx_pcieep_cfg002_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg003 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg003_s {
-		uint32_t bist:8;
-		uint32_t mfd:1;
-		uint32_t chf:7;
-		uint32_t lt:8;
-		uint32_t cls:8;
-	} s;
-	struct cvmx_pcieep_cfg003_s cn52xx;
-	struct cvmx_pcieep_cfg003_s cn52xxp1;
-	struct cvmx_pcieep_cfg003_s cn56xx;
-	struct cvmx_pcieep_cfg003_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg004 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg004_s {
-		uint32_t lbab:18;
-		uint32_t reserved_4_13:10;
-		uint32_t pf:1;
-		uint32_t typ:2;
-		uint32_t mspc:1;
-	} s;
-	struct cvmx_pcieep_cfg004_s cn52xx;
-	struct cvmx_pcieep_cfg004_s cn52xxp1;
-	struct cvmx_pcieep_cfg004_s cn56xx;
-	struct cvmx_pcieep_cfg004_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg004_mask {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg004_mask_s {
-		uint32_t lmask:31;
-		uint32_t enb:1;
-	} s;
-	struct cvmx_pcieep_cfg004_mask_s cn52xx;
-	struct cvmx_pcieep_cfg004_mask_s cn52xxp1;
-	struct cvmx_pcieep_cfg004_mask_s cn56xx;
-	struct cvmx_pcieep_cfg004_mask_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg005 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg005_s {
-		uint32_t ubab:32;
-	} s;
-	struct cvmx_pcieep_cfg005_s cn52xx;
-	struct cvmx_pcieep_cfg005_s cn52xxp1;
-	struct cvmx_pcieep_cfg005_s cn56xx;
-	struct cvmx_pcieep_cfg005_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg005_mask {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg005_mask_s {
-		uint32_t umask:32;
-	} s;
-	struct cvmx_pcieep_cfg005_mask_s cn52xx;
-	struct cvmx_pcieep_cfg005_mask_s cn52xxp1;
-	struct cvmx_pcieep_cfg005_mask_s cn56xx;
-	struct cvmx_pcieep_cfg005_mask_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg006 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg006_s {
-		uint32_t lbab:6;
-		uint32_t reserved_4_25:22;
-		uint32_t pf:1;
-		uint32_t typ:2;
-		uint32_t mspc:1;
-	} s;
-	struct cvmx_pcieep_cfg006_s cn52xx;
-	struct cvmx_pcieep_cfg006_s cn52xxp1;
-	struct cvmx_pcieep_cfg006_s cn56xx;
-	struct cvmx_pcieep_cfg006_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg006_mask {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg006_mask_s {
-		uint32_t lmask:31;
-		uint32_t enb:1;
-	} s;
-	struct cvmx_pcieep_cfg006_mask_s cn52xx;
-	struct cvmx_pcieep_cfg006_mask_s cn52xxp1;
-	struct cvmx_pcieep_cfg006_mask_s cn56xx;
-	struct cvmx_pcieep_cfg006_mask_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg007 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg007_s {
-		uint32_t ubab:32;
-	} s;
-	struct cvmx_pcieep_cfg007_s cn52xx;
-	struct cvmx_pcieep_cfg007_s cn52xxp1;
-	struct cvmx_pcieep_cfg007_s cn56xx;
-	struct cvmx_pcieep_cfg007_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg007_mask {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg007_mask_s {
-		uint32_t umask:32;
-	} s;
-	struct cvmx_pcieep_cfg007_mask_s cn52xx;
-	struct cvmx_pcieep_cfg007_mask_s cn52xxp1;
-	struct cvmx_pcieep_cfg007_mask_s cn56xx;
-	struct cvmx_pcieep_cfg007_mask_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg008 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg008_s {
-		uint32_t reserved_4_31:28;
-		uint32_t pf:1;
-		uint32_t typ:2;
-		uint32_t mspc:1;
-	} s;
-	struct cvmx_pcieep_cfg008_s cn52xx;
-	struct cvmx_pcieep_cfg008_s cn52xxp1;
-	struct cvmx_pcieep_cfg008_s cn56xx;
-	struct cvmx_pcieep_cfg008_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg008_mask {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg008_mask_s {
-		uint32_t lmask:31;
-		uint32_t enb:1;
-	} s;
-	struct cvmx_pcieep_cfg008_mask_s cn52xx;
-	struct cvmx_pcieep_cfg008_mask_s cn52xxp1;
-	struct cvmx_pcieep_cfg008_mask_s cn56xx;
-	struct cvmx_pcieep_cfg008_mask_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg009 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg009_s {
-		uint32_t ubab:25;
-		uint32_t reserved_0_6:7;
-	} s;
-	struct cvmx_pcieep_cfg009_s cn52xx;
-	struct cvmx_pcieep_cfg009_s cn52xxp1;
-	struct cvmx_pcieep_cfg009_s cn56xx;
-	struct cvmx_pcieep_cfg009_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg009_mask {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg009_mask_s {
-		uint32_t umask:32;
-	} s;
-	struct cvmx_pcieep_cfg009_mask_s cn52xx;
-	struct cvmx_pcieep_cfg009_mask_s cn52xxp1;
-	struct cvmx_pcieep_cfg009_mask_s cn56xx;
-	struct cvmx_pcieep_cfg009_mask_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg010 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg010_s {
-		uint32_t cisp:32;
-	} s;
-	struct cvmx_pcieep_cfg010_s cn52xx;
-	struct cvmx_pcieep_cfg010_s cn52xxp1;
-	struct cvmx_pcieep_cfg010_s cn56xx;
-	struct cvmx_pcieep_cfg010_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg011 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg011_s {
-		uint32_t ssid:16;
-		uint32_t ssvid:16;
-	} s;
-	struct cvmx_pcieep_cfg011_s cn52xx;
-	struct cvmx_pcieep_cfg011_s cn52xxp1;
-	struct cvmx_pcieep_cfg011_s cn56xx;
-	struct cvmx_pcieep_cfg011_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg012 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg012_s {
-		uint32_t eraddr:16;
-		uint32_t reserved_1_15:15;
-		uint32_t er_en:1;
-	} s;
-	struct cvmx_pcieep_cfg012_s cn52xx;
-	struct cvmx_pcieep_cfg012_s cn52xxp1;
-	struct cvmx_pcieep_cfg012_s cn56xx;
-	struct cvmx_pcieep_cfg012_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg012_mask {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg012_mask_s {
-		uint32_t mask:31;
-		uint32_t enb:1;
-	} s;
-	struct cvmx_pcieep_cfg012_mask_s cn52xx;
-	struct cvmx_pcieep_cfg012_mask_s cn52xxp1;
-	struct cvmx_pcieep_cfg012_mask_s cn56xx;
-	struct cvmx_pcieep_cfg012_mask_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg013 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg013_s {
-		uint32_t reserved_8_31:24;
-		uint32_t cp:8;
-	} s;
-	struct cvmx_pcieep_cfg013_s cn52xx;
-	struct cvmx_pcieep_cfg013_s cn52xxp1;
-	struct cvmx_pcieep_cfg013_s cn56xx;
-	struct cvmx_pcieep_cfg013_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg015 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg015_s {
-		uint32_t ml:8;
-		uint32_t mg:8;
-		uint32_t inta:8;
-		uint32_t il:8;
-	} s;
-	struct cvmx_pcieep_cfg015_s cn52xx;
-	struct cvmx_pcieep_cfg015_s cn52xxp1;
-	struct cvmx_pcieep_cfg015_s cn56xx;
-	struct cvmx_pcieep_cfg015_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg016 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg016_s {
-		uint32_t pmes:5;
-		uint32_t d2s:1;
-		uint32_t d1s:1;
-		uint32_t auxc:3;
-		uint32_t dsi:1;
-		uint32_t reserved_20_20:1;
-		uint32_t pme_clock:1;
-		uint32_t pmsv:3;
-		uint32_t ncp:8;
-		uint32_t pmcid:8;
-	} s;
-	struct cvmx_pcieep_cfg016_s cn52xx;
-	struct cvmx_pcieep_cfg016_s cn52xxp1;
-	struct cvmx_pcieep_cfg016_s cn56xx;
-	struct cvmx_pcieep_cfg016_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg017 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg017_s {
-		uint32_t pmdia:8;
-		uint32_t bpccee:1;
-		uint32_t bd3h:1;
-		uint32_t reserved_16_21:6;
-		uint32_t pmess:1;
-		uint32_t pmedsia:2;
-		uint32_t pmds:4;
-		uint32_t pmeens:1;
-		uint32_t reserved_4_7:4;
-		uint32_t nsr:1;
-		uint32_t reserved_2_2:1;
-		uint32_t ps:2;
-	} s;
-	struct cvmx_pcieep_cfg017_s cn52xx;
-	struct cvmx_pcieep_cfg017_s cn52xxp1;
-	struct cvmx_pcieep_cfg017_s cn56xx;
-	struct cvmx_pcieep_cfg017_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg020 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg020_s {
-		uint32_t reserved_24_31:8;
-		uint32_t m64:1;
-		uint32_t mme:3;
-		uint32_t mmc:3;
-		uint32_t msien:1;
-		uint32_t ncp:8;
-		uint32_t msicid:8;
-	} s;
-	struct cvmx_pcieep_cfg020_s cn52xx;
-	struct cvmx_pcieep_cfg020_s cn52xxp1;
-	struct cvmx_pcieep_cfg020_s cn56xx;
-	struct cvmx_pcieep_cfg020_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg021 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg021_s {
-		uint32_t lmsi:30;
-		uint32_t reserved_0_1:2;
-	} s;
-	struct cvmx_pcieep_cfg021_s cn52xx;
-	struct cvmx_pcieep_cfg021_s cn52xxp1;
-	struct cvmx_pcieep_cfg021_s cn56xx;
-	struct cvmx_pcieep_cfg021_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg022 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg022_s {
-		uint32_t umsi:32;
-	} s;
-	struct cvmx_pcieep_cfg022_s cn52xx;
-	struct cvmx_pcieep_cfg022_s cn52xxp1;
-	struct cvmx_pcieep_cfg022_s cn56xx;
-	struct cvmx_pcieep_cfg022_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg023 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg023_s {
-		uint32_t reserved_16_31:16;
-		uint32_t msimd:16;
-	} s;
-	struct cvmx_pcieep_cfg023_s cn52xx;
-	struct cvmx_pcieep_cfg023_s cn52xxp1;
-	struct cvmx_pcieep_cfg023_s cn56xx;
-	struct cvmx_pcieep_cfg023_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg028 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg028_s {
-		uint32_t reserved_30_31:2;
-		uint32_t imn:5;
-		uint32_t si:1;
-		uint32_t dpt:4;
-		uint32_t pciecv:4;
-		uint32_t ncp:8;
-		uint32_t pcieid:8;
-	} s;
-	struct cvmx_pcieep_cfg028_s cn52xx;
-	struct cvmx_pcieep_cfg028_s cn52xxp1;
-	struct cvmx_pcieep_cfg028_s cn56xx;
-	struct cvmx_pcieep_cfg028_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg029 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg029_s {
-		uint32_t reserved_28_31:4;
-		uint32_t cspls:2;
-		uint32_t csplv:8;
-		uint32_t reserved_16_17:2;
-		uint32_t rber:1;
-		uint32_t reserved_12_14:3;
-		uint32_t el1al:3;
-		uint32_t el0al:3;
-		uint32_t etfs:1;
-		uint32_t pfs:2;
-		uint32_t mpss:3;
-	} s;
-	struct cvmx_pcieep_cfg029_s cn52xx;
-	struct cvmx_pcieep_cfg029_s cn52xxp1;
-	struct cvmx_pcieep_cfg029_s cn56xx;
-	struct cvmx_pcieep_cfg029_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg030 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg030_s {
-		uint32_t reserved_22_31:10;
-		uint32_t tp:1;
-		uint32_t ap_d:1;
-		uint32_t ur_d:1;
-		uint32_t fe_d:1;
-		uint32_t nfe_d:1;
-		uint32_t ce_d:1;
-		uint32_t reserved_15_15:1;
-		uint32_t mrrs:3;
-		uint32_t ns_en:1;
-		uint32_t ap_en:1;
-		uint32_t pf_en:1;
-		uint32_t etf_en:1;
-		uint32_t mps:3;
-		uint32_t ro_en:1;
-		uint32_t ur_en:1;
-		uint32_t fe_en:1;
-		uint32_t nfe_en:1;
-		uint32_t ce_en:1;
-	} s;
-	struct cvmx_pcieep_cfg030_s cn52xx;
-	struct cvmx_pcieep_cfg030_s cn52xxp1;
-	struct cvmx_pcieep_cfg030_s cn56xx;
-	struct cvmx_pcieep_cfg030_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg031 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg031_s {
-		uint32_t pnum:8;
-		uint32_t reserved_22_23:2;
-		uint32_t lbnc:1;
-		uint32_t dllarc:1;
-		uint32_t sderc:1;
-		uint32_t cpm:1;
-		uint32_t l1el:3;
-		uint32_t l0el:3;
-		uint32_t aslpms:2;
-		uint32_t mlw:6;
-		uint32_t mls:4;
-	} s;
-	struct cvmx_pcieep_cfg031_s cn52xx;
-	struct cvmx_pcieep_cfg031_s cn52xxp1;
-	struct cvmx_pcieep_cfg031_s cn56xx;
-	struct cvmx_pcieep_cfg031_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg032 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg032_s {
-		uint32_t reserved_30_31:2;
-		uint32_t dlla:1;
-		uint32_t scc:1;
-		uint32_t lt:1;
-		uint32_t reserved_26_26:1;
-		uint32_t nlw:6;
-		uint32_t ls:4;
-		uint32_t reserved_10_15:6;
-		uint32_t hawd:1;
-		uint32_t ecpm:1;
-		uint32_t es:1;
-		uint32_t ccc:1;
-		uint32_t rl:1;
-		uint32_t ld:1;
-		uint32_t rcb:1;
-		uint32_t reserved_2_2:1;
-		uint32_t aslpc:2;
-	} s;
-	struct cvmx_pcieep_cfg032_s cn52xx;
-	struct cvmx_pcieep_cfg032_s cn52xxp1;
-	struct cvmx_pcieep_cfg032_s cn56xx;
-	struct cvmx_pcieep_cfg032_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg033 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg033_s {
-		uint32_t ps_num:13;
-		uint32_t nccs:1;
-		uint32_t emip:1;
-		uint32_t sp_ls:2;
-		uint32_t sp_lv:8;
-		uint32_t hp_c:1;
-		uint32_t hp_s:1;
-		uint32_t pip:1;
-		uint32_t aip:1;
-		uint32_t mrlsp:1;
-		uint32_t pcp:1;
-		uint32_t abp:1;
-	} s;
-	struct cvmx_pcieep_cfg033_s cn52xx;
-	struct cvmx_pcieep_cfg033_s cn52xxp1;
-	struct cvmx_pcieep_cfg033_s cn56xx;
-	struct cvmx_pcieep_cfg033_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg034 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg034_s {
-		uint32_t reserved_25_31:7;
-		uint32_t dlls_c:1;
-		uint32_t emis:1;
-		uint32_t pds:1;
-		uint32_t mrlss:1;
-		uint32_t ccint_d:1;
-		uint32_t pd_c:1;
-		uint32_t mrls_c:1;
-		uint32_t pf_d:1;
-		uint32_t abp_d:1;
-		uint32_t reserved_13_15:3;
-		uint32_t dlls_en:1;
-		uint32_t emic:1;
-		uint32_t pcc:1;
-		uint32_t pic:2;
-		uint32_t aic:2;
-		uint32_t hpint_en:1;
-		uint32_t ccint_en:1;
-		uint32_t pd_en:1;
-		uint32_t mrls_en:1;
-		uint32_t pf_en:1;
-		uint32_t abp_en:1;
-	} s;
-	struct cvmx_pcieep_cfg034_s cn52xx;
-	struct cvmx_pcieep_cfg034_s cn52xxp1;
-	struct cvmx_pcieep_cfg034_s cn56xx;
-	struct cvmx_pcieep_cfg034_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg037 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg037_s {
-		uint32_t reserved_5_31:27;
-		uint32_t ctds:1;
-		uint32_t ctrs:4;
-	} s;
-	struct cvmx_pcieep_cfg037_s cn52xx;
-	struct cvmx_pcieep_cfg037_s cn52xxp1;
-	struct cvmx_pcieep_cfg037_s cn56xx;
-	struct cvmx_pcieep_cfg037_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg038 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg038_s {
-		uint32_t reserved_5_31:27;
-		uint32_t ctd:1;
-		uint32_t ctv:4;
-	} s;
-	struct cvmx_pcieep_cfg038_s cn52xx;
-	struct cvmx_pcieep_cfg038_s cn52xxp1;
-	struct cvmx_pcieep_cfg038_s cn56xx;
-	struct cvmx_pcieep_cfg038_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg039 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg039_s {
-		uint32_t reserved_0_31:32;
-	} s;
-	struct cvmx_pcieep_cfg039_s cn52xx;
-	struct cvmx_pcieep_cfg039_s cn52xxp1;
-	struct cvmx_pcieep_cfg039_s cn56xx;
-	struct cvmx_pcieep_cfg039_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg040 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg040_s {
-		uint32_t reserved_0_31:32;
-	} s;
-	struct cvmx_pcieep_cfg040_s cn52xx;
-	struct cvmx_pcieep_cfg040_s cn52xxp1;
-	struct cvmx_pcieep_cfg040_s cn56xx;
-	struct cvmx_pcieep_cfg040_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg041 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg041_s {
-		uint32_t reserved_0_31:32;
-	} s;
-	struct cvmx_pcieep_cfg041_s cn52xx;
-	struct cvmx_pcieep_cfg041_s cn52xxp1;
-	struct cvmx_pcieep_cfg041_s cn56xx;
-	struct cvmx_pcieep_cfg041_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg042 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg042_s {
-		uint32_t reserved_0_31:32;
-	} s;
-	struct cvmx_pcieep_cfg042_s cn52xx;
-	struct cvmx_pcieep_cfg042_s cn52xxp1;
-	struct cvmx_pcieep_cfg042_s cn56xx;
-	struct cvmx_pcieep_cfg042_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg064 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg064_s {
-		uint32_t nco:12;
-		uint32_t cv:4;
-		uint32_t pcieec:16;
-	} s;
-	struct cvmx_pcieep_cfg064_s cn52xx;
-	struct cvmx_pcieep_cfg064_s cn52xxp1;
-	struct cvmx_pcieep_cfg064_s cn56xx;
-	struct cvmx_pcieep_cfg064_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg065 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg065_s {
-		uint32_t reserved_21_31:11;
-		uint32_t ures:1;
-		uint32_t ecrces:1;
-		uint32_t mtlps:1;
-		uint32_t ros:1;
-		uint32_t ucs:1;
-		uint32_t cas:1;
-		uint32_t cts:1;
-		uint32_t fcpes:1;
-		uint32_t ptlps:1;
-		uint32_t reserved_6_11:6;
-		uint32_t sdes:1;
-		uint32_t dlpes:1;
-		uint32_t reserved_0_3:4;
-	} s;
-	struct cvmx_pcieep_cfg065_s cn52xx;
-	struct cvmx_pcieep_cfg065_s cn52xxp1;
-	struct cvmx_pcieep_cfg065_s cn56xx;
-	struct cvmx_pcieep_cfg065_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg066 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg066_s {
-		uint32_t reserved_21_31:11;
-		uint32_t urem:1;
-		uint32_t ecrcem:1;
-		uint32_t mtlpm:1;
-		uint32_t rom:1;
-		uint32_t ucm:1;
-		uint32_t cam:1;
-		uint32_t ctm:1;
-		uint32_t fcpem:1;
-		uint32_t ptlpm:1;
-		uint32_t reserved_6_11:6;
-		uint32_t sdem:1;
-		uint32_t dlpem:1;
-		uint32_t reserved_0_3:4;
-	} s;
-	struct cvmx_pcieep_cfg066_s cn52xx;
-	struct cvmx_pcieep_cfg066_s cn52xxp1;
-	struct cvmx_pcieep_cfg066_s cn56xx;
-	struct cvmx_pcieep_cfg066_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg067 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg067_s {
-		uint32_t reserved_21_31:11;
-		uint32_t ures:1;
-		uint32_t ecrces:1;
-		uint32_t mtlps:1;
-		uint32_t ros:1;
-		uint32_t ucs:1;
-		uint32_t cas:1;
-		uint32_t cts:1;
-		uint32_t fcpes:1;
-		uint32_t ptlps:1;
-		uint32_t reserved_6_11:6;
-		uint32_t sdes:1;
-		uint32_t dlpes:1;
-		uint32_t reserved_0_3:4;
-	} s;
-	struct cvmx_pcieep_cfg067_s cn52xx;
-	struct cvmx_pcieep_cfg067_s cn52xxp1;
-	struct cvmx_pcieep_cfg067_s cn56xx;
-	struct cvmx_pcieep_cfg067_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg068 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg068_s {
-		uint32_t reserved_14_31:18;
-		uint32_t anfes:1;
-		uint32_t rtts:1;
-		uint32_t reserved_9_11:3;
-		uint32_t rnrs:1;
-		uint32_t bdllps:1;
-		uint32_t btlps:1;
-		uint32_t reserved_1_5:5;
-		uint32_t res:1;
-	} s;
-	struct cvmx_pcieep_cfg068_s cn52xx;
-	struct cvmx_pcieep_cfg068_s cn52xxp1;
-	struct cvmx_pcieep_cfg068_s cn56xx;
-	struct cvmx_pcieep_cfg068_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg069 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg069_s {
-		uint32_t reserved_14_31:18;
-		uint32_t anfem:1;
-		uint32_t rttm:1;
-		uint32_t reserved_9_11:3;
-		uint32_t rnrm:1;
-		uint32_t bdllpm:1;
-		uint32_t btlpm:1;
-		uint32_t reserved_1_5:5;
-		uint32_t rem:1;
-	} s;
-	struct cvmx_pcieep_cfg069_s cn52xx;
-	struct cvmx_pcieep_cfg069_s cn52xxp1;
-	struct cvmx_pcieep_cfg069_s cn56xx;
-	struct cvmx_pcieep_cfg069_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg070 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg070_s {
-		uint32_t reserved_9_31:23;
-		uint32_t ce:1;
-		uint32_t cc:1;
-		uint32_t ge:1;
-		uint32_t gc:1;
-		uint32_t fep:5;
-	} s;
-	struct cvmx_pcieep_cfg070_s cn52xx;
-	struct cvmx_pcieep_cfg070_s cn52xxp1;
-	struct cvmx_pcieep_cfg070_s cn56xx;
-	struct cvmx_pcieep_cfg070_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg071 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg071_s {
-		uint32_t dword1:32;
-	} s;
-	struct cvmx_pcieep_cfg071_s cn52xx;
-	struct cvmx_pcieep_cfg071_s cn52xxp1;
-	struct cvmx_pcieep_cfg071_s cn56xx;
-	struct cvmx_pcieep_cfg071_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg072 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg072_s {
-		uint32_t dword2:32;
-	} s;
-	struct cvmx_pcieep_cfg072_s cn52xx;
-	struct cvmx_pcieep_cfg072_s cn52xxp1;
-	struct cvmx_pcieep_cfg072_s cn56xx;
-	struct cvmx_pcieep_cfg072_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg073 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg073_s {
-		uint32_t dword3:32;
-	} s;
-	struct cvmx_pcieep_cfg073_s cn52xx;
-	struct cvmx_pcieep_cfg073_s cn52xxp1;
-	struct cvmx_pcieep_cfg073_s cn56xx;
-	struct cvmx_pcieep_cfg073_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg074 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg074_s {
-		uint32_t dword4:32;
-	} s;
-	struct cvmx_pcieep_cfg074_s cn52xx;
-	struct cvmx_pcieep_cfg074_s cn52xxp1;
-	struct cvmx_pcieep_cfg074_s cn56xx;
-	struct cvmx_pcieep_cfg074_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg448 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg448_s {
-		uint32_t rtl:16;
-		uint32_t rtltl:16;
-	} s;
-	struct cvmx_pcieep_cfg448_s cn52xx;
-	struct cvmx_pcieep_cfg448_s cn52xxp1;
-	struct cvmx_pcieep_cfg448_s cn56xx;
-	struct cvmx_pcieep_cfg448_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg449 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg449_s {
-		uint32_t omr:32;
-	} s;
-	struct cvmx_pcieep_cfg449_s cn52xx;
-	struct cvmx_pcieep_cfg449_s cn52xxp1;
-	struct cvmx_pcieep_cfg449_s cn56xx;
-	struct cvmx_pcieep_cfg449_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg450 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg450_s {
-		uint32_t lpec:8;
-		uint32_t reserved_22_23:2;
-		uint32_t link_state:6;
-		uint32_t force_link:1;
-		uint32_t reserved_8_14:7;
-		uint32_t link_num:8;
-	} s;
-	struct cvmx_pcieep_cfg450_s cn52xx;
-	struct cvmx_pcieep_cfg450_s cn52xxp1;
-	struct cvmx_pcieep_cfg450_s cn56xx;
-	struct cvmx_pcieep_cfg450_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg451 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg451_s {
-		uint32_t reserved_30_31:2;
-		uint32_t l1el:3;
-		uint32_t l0el:3;
-		uint32_t n_fts_cc:8;
-		uint32_t n_fts:8;
-		uint32_t ack_freq:8;
-	} s;
-	struct cvmx_pcieep_cfg451_s cn52xx;
-	struct cvmx_pcieep_cfg451_s cn52xxp1;
-	struct cvmx_pcieep_cfg451_s cn56xx;
-	struct cvmx_pcieep_cfg451_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg452 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg452_s {
-		uint32_t reserved_26_31:6;
-		uint32_t eccrc:1;
-		uint32_t reserved_22_24:3;
-		uint32_t lme:6;
-		uint32_t reserved_8_15:8;
-		uint32_t flm:1;
-		uint32_t reserved_6_6:1;
-		uint32_t dllle:1;
-		uint32_t reserved_4_4:1;
-		uint32_t ra:1;
-		uint32_t le:1;
-		uint32_t sd:1;
-		uint32_t omr:1;
-	} s;
-	struct cvmx_pcieep_cfg452_s cn52xx;
-	struct cvmx_pcieep_cfg452_s cn52xxp1;
-	struct cvmx_pcieep_cfg452_s cn56xx;
-	struct cvmx_pcieep_cfg452_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg453 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg453_s {
-		uint32_t dlld:1;
-		uint32_t reserved_26_30:5;
-		uint32_t ack_nak:1;
-		uint32_t fcd:1;
-		uint32_t ilst:24;
-	} s;
-	struct cvmx_pcieep_cfg453_s cn52xx;
-	struct cvmx_pcieep_cfg453_s cn52xxp1;
-	struct cvmx_pcieep_cfg453_s cn56xx;
-	struct cvmx_pcieep_cfg453_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg454 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg454_s {
-		uint32_t reserved_29_31:3;
-		uint32_t tmfcwt:5;
-		uint32_t tmanlt:5;
-		uint32_t tmrt:5;
-		uint32_t reserved_11_13:3;
-		uint32_t nskps:3;
-		uint32_t reserved_4_7:4;
-		uint32_t ntss:4;
-	} s;
-	struct cvmx_pcieep_cfg454_s cn52xx;
-	struct cvmx_pcieep_cfg454_s cn52xxp1;
-	struct cvmx_pcieep_cfg454_s cn56xx;
-	struct cvmx_pcieep_cfg454_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg455 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg455_s {
-		uint32_t m_cfg0_filt:1;
-		uint32_t m_io_filt:1;
-		uint32_t msg_ctrl:1;
-		uint32_t m_cpl_ecrc_filt:1;
-		uint32_t m_ecrc_filt:1;
-		uint32_t m_cpl_len_err:1;
-		uint32_t m_cpl_attr_err:1;
-		uint32_t m_cpl_tc_err:1;
-		uint32_t m_cpl_fun_err:1;
-		uint32_t m_cpl_rid_err:1;
-		uint32_t m_cpl_tag_err:1;
-		uint32_t m_lk_filt:1;
-		uint32_t m_cfg1_filt:1;
-		uint32_t m_bar_match:1;
-		uint32_t m_pois_filt:1;
-		uint32_t m_fun:1;
-		uint32_t dfcwt:1;
-		uint32_t reserved_11_14:4;
-		uint32_t skpiv:11;
-	} s;
-	struct cvmx_pcieep_cfg455_s cn52xx;
-	struct cvmx_pcieep_cfg455_s cn52xxp1;
-	struct cvmx_pcieep_cfg455_s cn56xx;
-	struct cvmx_pcieep_cfg455_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg456 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg456_s {
-		uint32_t reserved_2_31:30;
-		uint32_t m_vend1_drp:1;
-		uint32_t m_vend0_drp:1;
-	} s;
-	struct cvmx_pcieep_cfg456_s cn52xx;
-	struct cvmx_pcieep_cfg456_s cn52xxp1;
-	struct cvmx_pcieep_cfg456_s cn56xx;
-	struct cvmx_pcieep_cfg456_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg458 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg458_s {
-		uint32_t dbg_info_l32:32;
-	} s;
-	struct cvmx_pcieep_cfg458_s cn52xx;
-	struct cvmx_pcieep_cfg458_s cn52xxp1;
-	struct cvmx_pcieep_cfg458_s cn56xx;
-	struct cvmx_pcieep_cfg458_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg459 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg459_s {
-		uint32_t dbg_info_u32:32;
-	} s;
-	struct cvmx_pcieep_cfg459_s cn52xx;
-	struct cvmx_pcieep_cfg459_s cn52xxp1;
-	struct cvmx_pcieep_cfg459_s cn56xx;
-	struct cvmx_pcieep_cfg459_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg460 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg460_s {
-		uint32_t reserved_20_31:12;
-		uint32_t tphfcc:8;
-		uint32_t tpdfcc:12;
-	} s;
-	struct cvmx_pcieep_cfg460_s cn52xx;
-	struct cvmx_pcieep_cfg460_s cn52xxp1;
-	struct cvmx_pcieep_cfg460_s cn56xx;
-	struct cvmx_pcieep_cfg460_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg461 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg461_s {
-		uint32_t reserved_20_31:12;
-		uint32_t tchfcc:8;
-		uint32_t tcdfcc:12;
-	} s;
-	struct cvmx_pcieep_cfg461_s cn52xx;
-	struct cvmx_pcieep_cfg461_s cn52xxp1;
-	struct cvmx_pcieep_cfg461_s cn56xx;
-	struct cvmx_pcieep_cfg461_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg462 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg462_s {
-		uint32_t reserved_20_31:12;
-		uint32_t tchfcc:8;
-		uint32_t tcdfcc:12;
-	} s;
-	struct cvmx_pcieep_cfg462_s cn52xx;
-	struct cvmx_pcieep_cfg462_s cn52xxp1;
-	struct cvmx_pcieep_cfg462_s cn56xx;
-	struct cvmx_pcieep_cfg462_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg463 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg463_s {
-		uint32_t reserved_3_31:29;
-		uint32_t rqne:1;
-		uint32_t trbne:1;
-		uint32_t rtlpfccnr:1;
-	} s;
-	struct cvmx_pcieep_cfg463_s cn52xx;
-	struct cvmx_pcieep_cfg463_s cn52xxp1;
-	struct cvmx_pcieep_cfg463_s cn56xx;
-	struct cvmx_pcieep_cfg463_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg464 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg464_s {
-		uint32_t wrr_vc3:8;
-		uint32_t wrr_vc2:8;
-		uint32_t wrr_vc1:8;
-		uint32_t wrr_vc0:8;
-	} s;
-	struct cvmx_pcieep_cfg464_s cn52xx;
-	struct cvmx_pcieep_cfg464_s cn52xxp1;
-	struct cvmx_pcieep_cfg464_s cn56xx;
-	struct cvmx_pcieep_cfg464_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg465 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg465_s {
-		uint32_t wrr_vc7:8;
-		uint32_t wrr_vc6:8;
-		uint32_t wrr_vc5:8;
-		uint32_t wrr_vc4:8;
-	} s;
-	struct cvmx_pcieep_cfg465_s cn52xx;
-	struct cvmx_pcieep_cfg465_s cn52xxp1;
-	struct cvmx_pcieep_cfg465_s cn56xx;
-	struct cvmx_pcieep_cfg465_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg466 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg466_s {
-		uint32_t rx_queue_order:1;
-		uint32_t type_ordering:1;
-		uint32_t reserved_24_29:6;
-		uint32_t queue_mode:3;
-		uint32_t reserved_20_20:1;
-		uint32_t header_credits:8;
-		uint32_t data_credits:12;
-	} s;
-	struct cvmx_pcieep_cfg466_s cn52xx;
-	struct cvmx_pcieep_cfg466_s cn52xxp1;
-	struct cvmx_pcieep_cfg466_s cn56xx;
-	struct cvmx_pcieep_cfg466_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg467 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg467_s {
-		uint32_t reserved_24_31:8;
-		uint32_t queue_mode:3;
-		uint32_t reserved_20_20:1;
-		uint32_t header_credits:8;
-		uint32_t data_credits:12;
-	} s;
-	struct cvmx_pcieep_cfg467_s cn52xx;
-	struct cvmx_pcieep_cfg467_s cn52xxp1;
-	struct cvmx_pcieep_cfg467_s cn56xx;
-	struct cvmx_pcieep_cfg467_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg468 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg468_s {
-		uint32_t reserved_24_31:8;
-		uint32_t queue_mode:3;
-		uint32_t reserved_20_20:1;
-		uint32_t header_credits:8;
-		uint32_t data_credits:12;
-	} s;
-	struct cvmx_pcieep_cfg468_s cn52xx;
-	struct cvmx_pcieep_cfg468_s cn52xxp1;
-	struct cvmx_pcieep_cfg468_s cn56xx;
-	struct cvmx_pcieep_cfg468_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg490 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg490_s {
-		uint32_t reserved_26_31:6;
-		uint32_t header_depth:10;
-		uint32_t reserved_14_15:2;
-		uint32_t data_depth:14;
-	} s;
-	struct cvmx_pcieep_cfg490_s cn52xx;
-	struct cvmx_pcieep_cfg490_s cn52xxp1;
-	struct cvmx_pcieep_cfg490_s cn56xx;
-	struct cvmx_pcieep_cfg490_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg491 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg491_s {
-		uint32_t reserved_26_31:6;
-		uint32_t header_depth:10;
-		uint32_t reserved_14_15:2;
-		uint32_t data_depth:14;
-	} s;
-	struct cvmx_pcieep_cfg491_s cn52xx;
-	struct cvmx_pcieep_cfg491_s cn52xxp1;
-	struct cvmx_pcieep_cfg491_s cn56xx;
-	struct cvmx_pcieep_cfg491_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg492 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg492_s {
-		uint32_t reserved_26_31:6;
-		uint32_t header_depth:10;
-		uint32_t reserved_14_15:2;
-		uint32_t data_depth:14;
-	} s;
-	struct cvmx_pcieep_cfg492_s cn52xx;
-	struct cvmx_pcieep_cfg492_s cn52xxp1;
-	struct cvmx_pcieep_cfg492_s cn56xx;
-	struct cvmx_pcieep_cfg492_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg516 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg516_s {
-		uint32_t phy_stat:32;
-	} s;
-	struct cvmx_pcieep_cfg516_s cn52xx;
-	struct cvmx_pcieep_cfg516_s cn52xxp1;
-	struct cvmx_pcieep_cfg516_s cn56xx;
-	struct cvmx_pcieep_cfg516_s cn56xxp1;
-};
-
-union cvmx_pcieep_cfg517 {
-	uint32_t u32;
-	struct cvmx_pcieep_cfg517_s {
-		uint32_t phy_ctrl:32;
-	} s;
-	struct cvmx_pcieep_cfg517_s cn52xx;
-	struct cvmx_pcieep_cfg517_s cn52xxp1;
-	struct cvmx_pcieep_cfg517_s cn56xx;
-	struct cvmx_pcieep_cfg517_s cn56xxp1;
-};
-
-#endif
diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h
index fcd4060f6421..90bf3b3fce19 100644
--- a/arch/mips/include/asm/pci.h
+++ b/arch/mips/include/asm/pci.h
@@ -17,6 +17,7 @@
  */
 
 #include <linux/ioport.h>
+#include <linux/of.h>
 
 /*
  * Each pci channel is a top-level PCI bus seem by CPU.  A machine  with
@@ -26,6 +27,7 @@
 struct pci_controller {
 	struct pci_controller *next;
 	struct pci_bus *bus;
+	struct device_node *of_node;
 
 	struct pci_ops *pci_ops;
 	struct resource *mem_resource;
@@ -142,4 +144,8 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 
 extern char * (*pcibios_plat_setup)(char *str);
 
+/* this function parses memory ranges from a device node */
+extern void __devinit pci_load_of_ranges(struct pci_controller *hose,
+					 struct device_node *node);
+
 #endif /* _ASM_PCI_H */
diff --git a/arch/mips/include/asm/prom.h b/arch/mips/include/asm/prom.h
index 7a6e82ef449b..7206d445bab8 100644
--- a/arch/mips/include/asm/prom.h
+++ b/arch/mips/include/asm/prom.h
@@ -12,6 +12,9 @@
 #define __ASM_PROM_H
 
 #ifdef CONFIG_OF
+#include <linux/bug.h>
+#include <linux/io.h>
+#include <linux/types.h>
 #include <asm/bootinfo.h>
 
 extern int early_init_dt_scan_memory_arch(unsigned long node,
@@ -21,6 +24,29 @@ extern int reserve_mem_mach(unsigned long addr, unsigned long size);
 extern void free_mem_mach(unsigned long addr, unsigned long size);
 
 extern void device_tree_init(void);
+
+static inline unsigned long pci_address_to_pio(phys_addr_t address)
+{
+	/*
+	 * The ioport address can be directly used by inX() / outX()
+	 */
+	BUG_ON(address > IO_SPACE_LIMIT);
+
+	return (unsigned long) address;
+}
+#define pci_address_to_pio pci_address_to_pio
+
+struct boot_param_header;
+
+extern void __dt_setup_arch(struct boot_param_header *bph);
+
+#define dt_setup_arch(sym)						\
+({									\
+	extern struct boot_param_header __dtb_##sym##_begin;		\
+									\
+	__dt_setup_arch(&__dtb_##sym##_begin);				\
+})
+
 #else /* CONFIG_OF */
 static inline void device_tree_init(void) { }
 #endif /* CONFIG_OF */
diff --git a/arch/mips/include/asm/setup.h b/arch/mips/include/asm/setup.h
index 6dce6d8d09ab..2560b6b6a7d8 100644
--- a/arch/mips/include/asm/setup.h
+++ b/arch/mips/include/asm/setup.h
@@ -14,7 +14,8 @@ extern void *set_vi_handler(int n, vi_handler_t addr);
 
 extern void *set_except_vector(int n, void *addr);
 extern unsigned long ebase;
-extern void per_cpu_trap_init(void);
+extern void per_cpu_trap_init(bool);
+extern void cpu_cache_init(void);
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/mips/include/asm/sparsemem.h b/arch/mips/include/asm/sparsemem.h
index 7165333ad043..4461198361c9 100644
--- a/arch/mips/include/asm/sparsemem.h
+++ b/arch/mips/include/asm/sparsemem.h
@@ -6,7 +6,11 @@
  * SECTION_SIZE_BITS		2^N: how big each section will be
  * MAX_PHYSMEM_BITS		2^N: how much memory we can have in that space
  */
-#define SECTION_SIZE_BITS       28
+#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_PAGE_SIZE_64KB)
+# define SECTION_SIZE_BITS	29
+#else
+# define SECTION_SIZE_BITS	28
+#endif
 #define MAX_PHYSMEM_BITS        35
 
 #endif /* CONFIG_SPARSEMEM */
diff --git a/arch/mips/include/asm/termios.h b/arch/mips/include/asm/termios.h
index 8f77f774a2a0..abdd87aaf609 100644
--- a/arch/mips/include/asm/termios.h
+++ b/arch/mips/include/asm/termios.h
@@ -60,7 +60,7 @@ struct termio {
 };
 
 #ifdef __KERNEL__
-#include <linux/module.h>
+#include <asm/uaccess.h>
 
 /*
  *	intr=^C		quit=^\		erase=del	kill=^U
diff --git a/arch/mips/include/asm/traps.h b/arch/mips/include/asm/traps.h
index ff74aec3561a..420ca06b2f42 100644
--- a/arch/mips/include/asm/traps.h
+++ b/arch/mips/include/asm/traps.h
@@ -25,6 +25,7 @@ extern void (*board_nmi_handler_setup)(void);
 extern void (*board_ejtag_handler_setup)(void);
 extern void (*board_bind_eic_interrupt)(int irq, int regset);
 extern void (*board_ebase_setup)(void);
+extern void (*board_cache_error_setup)(void);
 
 extern int register_nmi_notifier(struct notifier_block *nb);
 
diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h
index 504d40aedfae..440a21dab575 100644
--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -11,7 +11,7 @@
 #include <linux/types.h>
 
 #ifdef CONFIG_EXPORT_UASM
-#include <linux/module.h>
+#include <linux/export.h>
 #define __uasminit
 #define __uasminitdata
 #define UASM_EXPORT_SYMBOL(sym) EXPORT_SYMBOL(sym)
diff --git a/arch/mips/jz4740/Makefile b/arch/mips/jz4740/Makefile
index a9dff3321251..e44abea9c209 100644
--- a/arch/mips/jz4740/Makefile
+++ b/arch/mips/jz4740/Makefile
@@ -16,5 +16,3 @@ obj-$(CONFIG_JZ4740_QI_LB60)	+= board-qi_lb60.o
 # PM support
 
 obj-$(CONFIG_PM) += pm.o
-
-ccflags-y := -Werror -Wall
diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c
index 5099201fb7bc..6ae7ce4ac63e 100644
--- a/arch/mips/kernel/cpu-probe.c
+++ b/arch/mips/kernel/cpu-probe.c
@@ -340,7 +340,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "R2000";
 		c->isa_level = MIPS_CPU_ISA_I;
 		c->options = MIPS_CPU_TLB | MIPS_CPU_3K_CACHE |
-		             MIPS_CPU_NOFPUEX;
+			     MIPS_CPU_NOFPUEX;
 		if (__cpu_has_fpu())
 			c->options |= MIPS_CPU_FPU;
 		c->tlbsize = 64;
@@ -361,7 +361,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		}
 		c->isa_level = MIPS_CPU_ISA_I;
 		c->options = MIPS_CPU_TLB | MIPS_CPU_3K_CACHE |
-		             MIPS_CPU_NOFPUEX;
+			     MIPS_CPU_NOFPUEX;
 		if (__cpu_has_fpu())
 			c->options |= MIPS_CPU_FPU;
 		c->tlbsize = 64;
@@ -387,8 +387,8 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 
 		c->isa_level = MIPS_CPU_ISA_III;
 		c->options = R4K_OPTS | MIPS_CPU_FPU | MIPS_CPU_32FPR |
-		             MIPS_CPU_WATCH | MIPS_CPU_VCE |
-		             MIPS_CPU_LLSC;
+			     MIPS_CPU_WATCH | MIPS_CPU_VCE |
+			     MIPS_CPU_LLSC;
 		c->tlbsize = 48;
 		break;
 	case PRID_IMP_VR41XX:
@@ -434,7 +434,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "R4300";
 		c->isa_level = MIPS_CPU_ISA_III;
 		c->options = R4K_OPTS | MIPS_CPU_FPU | MIPS_CPU_32FPR |
-		             MIPS_CPU_LLSC;
+			     MIPS_CPU_LLSC;
 		c->tlbsize = 32;
 		break;
 	case PRID_IMP_R4600:
@@ -446,7 +446,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		c->tlbsize = 48;
 		break;
 	#if 0
- 	case PRID_IMP_R4650:
+	case PRID_IMP_R4650:
 		/*
 		 * This processor doesn't have an MMU, so it's not
 		 * "real easy" to run Linux on it. It is left purely
@@ -455,9 +455,9 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		 */
 		c->cputype = CPU_R4650;
 		__cpu_name[cpu] = "R4650";
-	 	c->isa_level = MIPS_CPU_ISA_III;
+		c->isa_level = MIPS_CPU_ISA_III;
 		c->options = R4K_OPTS | MIPS_CPU_FPU | MIPS_CPU_LLSC;
-	        c->tlbsize = 48;
+		c->tlbsize = 48;
 		break;
 	#endif
 	case PRID_IMP_TX39:
@@ -488,7 +488,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "R4700";
 		c->isa_level = MIPS_CPU_ISA_III;
 		c->options = R4K_OPTS | MIPS_CPU_FPU | MIPS_CPU_32FPR |
-		             MIPS_CPU_LLSC;
+			     MIPS_CPU_LLSC;
 		c->tlbsize = 48;
 		break;
 	case PRID_IMP_TX49:
@@ -505,7 +505,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "R5000";
 		c->isa_level = MIPS_CPU_ISA_IV;
 		c->options = R4K_OPTS | MIPS_CPU_FPU | MIPS_CPU_32FPR |
-		             MIPS_CPU_LLSC;
+			     MIPS_CPU_LLSC;
 		c->tlbsize = 48;
 		break;
 	case PRID_IMP_R5432:
@@ -513,7 +513,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "R5432";
 		c->isa_level = MIPS_CPU_ISA_IV;
 		c->options = R4K_OPTS | MIPS_CPU_FPU | MIPS_CPU_32FPR |
-		             MIPS_CPU_WATCH | MIPS_CPU_LLSC;
+			     MIPS_CPU_WATCH | MIPS_CPU_LLSC;
 		c->tlbsize = 48;
 		break;
 	case PRID_IMP_R5500:
@@ -521,7 +521,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "R5500";
 		c->isa_level = MIPS_CPU_ISA_IV;
 		c->options = R4K_OPTS | MIPS_CPU_FPU | MIPS_CPU_32FPR |
-		             MIPS_CPU_WATCH | MIPS_CPU_LLSC;
+			     MIPS_CPU_WATCH | MIPS_CPU_LLSC;
 		c->tlbsize = 48;
 		break;
 	case PRID_IMP_NEVADA:
@@ -529,7 +529,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "Nevada";
 		c->isa_level = MIPS_CPU_ISA_IV;
 		c->options = R4K_OPTS | MIPS_CPU_FPU | MIPS_CPU_32FPR |
-		             MIPS_CPU_DIVEC | MIPS_CPU_LLSC;
+			     MIPS_CPU_DIVEC | MIPS_CPU_LLSC;
 		c->tlbsize = 48;
 		break;
 	case PRID_IMP_R6000:
@@ -537,7 +537,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "R6000";
 		c->isa_level = MIPS_CPU_ISA_II;
 		c->options = MIPS_CPU_TLB | MIPS_CPU_FPU |
-		             MIPS_CPU_LLSC;
+			     MIPS_CPU_LLSC;
 		c->tlbsize = 32;
 		break;
 	case PRID_IMP_R6000A:
@@ -545,7 +545,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "R6000A";
 		c->isa_level = MIPS_CPU_ISA_II;
 		c->options = MIPS_CPU_TLB | MIPS_CPU_FPU |
-		             MIPS_CPU_LLSC;
+			     MIPS_CPU_LLSC;
 		c->tlbsize = 32;
 		break;
 	case PRID_IMP_RM7000:
@@ -553,7 +553,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "RM7000";
 		c->isa_level = MIPS_CPU_ISA_IV;
 		c->options = R4K_OPTS | MIPS_CPU_FPU | MIPS_CPU_32FPR |
-		             MIPS_CPU_LLSC;
+			     MIPS_CPU_LLSC;
 		/*
 		 * Undocumented RM7000:  Bit 29 in the info register of
 		 * the RM7000 v2.0 indicates if the TLB has 48 or 64
@@ -569,7 +569,7 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "RM9000";
 		c->isa_level = MIPS_CPU_ISA_IV;
 		c->options = R4K_OPTS | MIPS_CPU_FPU | MIPS_CPU_32FPR |
-		             MIPS_CPU_LLSC;
+			     MIPS_CPU_LLSC;
 		/*
 		 * Bit 29 in the info register of the RM9000
 		 * indicates if the TLB has 48 or 64 entries.
@@ -584,8 +584,8 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "RM8000";
 		c->isa_level = MIPS_CPU_ISA_IV;
 		c->options = MIPS_CPU_TLB | MIPS_CPU_4KEX |
-		             MIPS_CPU_FPU | MIPS_CPU_32FPR |
-		             MIPS_CPU_LLSC;
+			     MIPS_CPU_FPU | MIPS_CPU_32FPR |
+			     MIPS_CPU_LLSC;
 		c->tlbsize = 384;      /* has weird TLB: 3-way x 128 */
 		break;
 	case PRID_IMP_R10000:
@@ -593,9 +593,9 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "R10000";
 		c->isa_level = MIPS_CPU_ISA_IV;
 		c->options = MIPS_CPU_TLB | MIPS_CPU_4K_CACHE | MIPS_CPU_4KEX |
-		             MIPS_CPU_FPU | MIPS_CPU_32FPR |
+			     MIPS_CPU_FPU | MIPS_CPU_32FPR |
 			     MIPS_CPU_COUNTER | MIPS_CPU_WATCH |
-		             MIPS_CPU_LLSC;
+			     MIPS_CPU_LLSC;
 		c->tlbsize = 64;
 		break;
 	case PRID_IMP_R12000:
@@ -603,9 +603,9 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "R12000";
 		c->isa_level = MIPS_CPU_ISA_IV;
 		c->options = MIPS_CPU_TLB | MIPS_CPU_4K_CACHE | MIPS_CPU_4KEX |
-		             MIPS_CPU_FPU | MIPS_CPU_32FPR |
+			     MIPS_CPU_FPU | MIPS_CPU_32FPR |
 			     MIPS_CPU_COUNTER | MIPS_CPU_WATCH |
-		             MIPS_CPU_LLSC;
+			     MIPS_CPU_LLSC;
 		c->tlbsize = 64;
 		break;
 	case PRID_IMP_R14000:
@@ -613,9 +613,9 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu)
 		__cpu_name[cpu] = "R14000";
 		c->isa_level = MIPS_CPU_ISA_IV;
 		c->options = MIPS_CPU_TLB | MIPS_CPU_4K_CACHE | MIPS_CPU_4KEX |
-		             MIPS_CPU_FPU | MIPS_CPU_32FPR |
+			     MIPS_CPU_FPU | MIPS_CPU_32FPR |
 			     MIPS_CPU_COUNTER | MIPS_CPU_WATCH |
-		             MIPS_CPU_LLSC;
+			     MIPS_CPU_LLSC;
 		c->tlbsize = 64;
 		break;
 	case PRID_IMP_LOONGSON2:
@@ -739,7 +739,7 @@ static inline unsigned int decode_config3(struct cpuinfo_mips *c)
 	if (config3 & MIPS_CONF3_VEIC)
 		c->options |= MIPS_CPU_VEIC;
 	if (config3 & MIPS_CONF3_MT)
-	        c->ases |= MIPS_ASE_MIPSMT;
+		c->ases |= MIPS_ASE_MIPSMT;
 	if (config3 & MIPS_CONF3_ULRI)
 		c->options |= MIPS_CPU_ULRI;
 
@@ -767,7 +767,7 @@ static void __cpuinit decode_configs(struct cpuinfo_mips *c)
 
 	/* MIPS32 or MIPS64 compliant CPU.  */
 	c->options = MIPS_CPU_4KEX | MIPS_CPU_4K_CACHE | MIPS_CPU_COUNTER |
-	             MIPS_CPU_DIVEC | MIPS_CPU_LLSC | MIPS_CPU_MCHECK;
+		     MIPS_CPU_DIVEC | MIPS_CPU_LLSC | MIPS_CPU_MCHECK;
 
 	c->scache.flags = MIPS_CACHE_NOT_PRESENT;
 
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index ab73fa2fb9b5..f29099b104c4 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -1532,7 +1532,8 @@ init_hw_perf_events(void)
 		irq = MSC01E_INT_BASE + MSC01E_INT_PERFCTR;
 	} else {
 #endif
-		if (cp0_perfcount_irq >= 0)
+		if ((cp0_perfcount_irq >= 0) &&
+				(cp0_compare_irq != cp0_perfcount_irq))
 			irq = MIPS_CPU_IRQ_BASE + cp0_perfcount_irq;
 		else
 			irq = -1;
diff --git a/arch/mips/kernel/proc.c b/arch/mips/kernel/proc.c
index f8b2c592514d..5542817c1b49 100644
--- a/arch/mips/kernel/proc.c
+++ b/arch/mips/kernel/proc.c
@@ -41,27 +41,27 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 
 	seq_printf(m, "processor\t\t: %ld\n", n);
 	sprintf(fmt, "cpu model\t\t: %%s V%%d.%%d%s\n",
-	        cpu_data[n].options & MIPS_CPU_FPU ? "  FPU V%d.%d" : "");
+		      cpu_data[n].options & MIPS_CPU_FPU ? "  FPU V%d.%d" : "");
 	seq_printf(m, fmt, __cpu_name[n],
-	                           (version >> 4) & 0x0f, version & 0x0f,
-	                           (fp_vers >> 4) & 0x0f, fp_vers & 0x0f);
+		      (version >> 4) & 0x0f, version & 0x0f,
+		      (fp_vers >> 4) & 0x0f, fp_vers & 0x0f);
 	seq_printf(m, "BogoMIPS\t\t: %u.%02u\n",
-	              cpu_data[n].udelay_val / (500000/HZ),
-	              (cpu_data[n].udelay_val / (5000/HZ)) % 100);
+		      cpu_data[n].udelay_val / (500000/HZ),
+		      (cpu_data[n].udelay_val / (5000/HZ)) % 100);
 	seq_printf(m, "wait instruction\t: %s\n", cpu_wait ? "yes" : "no");
 	seq_printf(m, "microsecond timers\t: %s\n",
-	              cpu_has_counter ? "yes" : "no");
+		      cpu_has_counter ? "yes" : "no");
 	seq_printf(m, "tlb_entries\t\t: %d\n", cpu_data[n].tlbsize);
 	seq_printf(m, "extra interrupt vector\t: %s\n",
-	              cpu_has_divec ? "yes" : "no");
+		      cpu_has_divec ? "yes" : "no");
 	seq_printf(m, "hardware watchpoint\t: %s",
-		   cpu_has_watch ? "yes, " : "no\n");
+		      cpu_has_watch ? "yes, " : "no\n");
 	if (cpu_has_watch) {
 		seq_printf(m, "count: %d, address/irw mask: [",
-			   cpu_data[n].watch_reg_count);
+		      cpu_data[n].watch_reg_count);
 		for (i = 0; i < cpu_data[n].watch_reg_count; i++)
 			seq_printf(m, "%s0x%04x", i ? ", " : "" ,
-				   cpu_data[n].watch_reg_masks[i]);
+				cpu_data[n].watch_reg_masks[i]);
 		seq_printf(m, "]\n");
 	}
 	seq_printf(m, "ASEs implemented\t:%s%s%s%s%s%s\n",
@@ -73,13 +73,13 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		      cpu_has_mipsmt ? " mt" : ""
 		);
 	seq_printf(m, "shadow register sets\t: %d\n",
-		       cpu_data[n].srsets);
+		      cpu_data[n].srsets);
 	seq_printf(m, "kscratch registers\t: %d\n",
-		   hweight8(cpu_data[n].kscratch_mask));
+		      hweight8(cpu_data[n].kscratch_mask));
 	seq_printf(m, "core\t\t\t: %d\n", cpu_data[n].core);
 
 	sprintf(fmt, "VCE%%c exceptions\t\t: %s\n",
-	        cpu_has_vce ? "%u" : "not available");
+		      cpu_has_vce ? "%u" : "not available");
 	seq_printf(m, fmt, 'D', vced_count);
 	seq_printf(m, fmt, 'I', vcei_count);
 	seq_printf(m, "\n");
diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c
index 558b5395795d..f11b2bbb826d 100644
--- a/arch/mips/kernel/prom.c
+++ b/arch/mips/kernel/prom.c
@@ -95,3 +95,16 @@ void __init device_tree_init(void)
 	/* free the space reserved for the dt blob */
 	free_mem_mach(base, size);
 }
+
+void __init __dt_setup_arch(struct boot_param_header *bph)
+{
+	if (be32_to_cpu(bph->magic) != OF_DT_HEADER) {
+		pr_err("DTB has bad magic, ignoring builtin OF DTB\n");
+
+		return;
+	}
+
+	initial_boot_params = bph;
+
+	early_init_devtree(initial_boot_params);
+}
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index c504b212f8f3..a53f8ec37aac 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -605,6 +605,8 @@ void __init setup_arch(char **cmdline_p)
 
 	resource_init();
 	plat_smp_setup();
+
+	cpu_cache_init();
 }
 
 unsigned long kernelsp[NR_CPUS];
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 71a95f55a649..48650c818040 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -106,7 +106,7 @@ asmlinkage __cpuinit void start_secondary(void)
 #endif /* CONFIG_MIPS_MT_SMTC */
 	cpu_probe();
 	cpu_report();
-	per_cpu_trap_init();
+	per_cpu_trap_init(false);
 	mips_clockevent_init();
 	mp_ops->init_secondary();
 
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index cfdaaa4cffc0..2d0c2a277f52 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -15,6 +15,7 @@
 #include <linux/compiler.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
@@ -91,7 +92,7 @@ void (*board_nmi_handler_setup)(void);
 void (*board_ejtag_handler_setup)(void);
 void (*board_bind_eic_interrupt)(int irq, int regset);
 void (*board_ebase_setup)(void);
-
+void __cpuinitdata(*board_cache_error_setup)(void);
 
 static void show_raw_backtrace(unsigned long reg29)
 {
@@ -1490,7 +1491,6 @@ void *set_vi_handler(int n, vi_handler_t addr)
 	return set_vi_srs_handler(n, addr, 0);
 }
 
-extern void cpu_cache_init(void);
 extern void tlb_init(void);
 extern void flush_tlb_handlers(void);
 
@@ -1517,7 +1517,7 @@ static int __init ulri_disable(char *s)
 }
 __setup("noulri", ulri_disable);
 
-void __cpuinit per_cpu_trap_init(void)
+void __cpuinit per_cpu_trap_init(bool is_boot_cpu)
 {
 	unsigned int cpu = smp_processor_id();
 	unsigned int status_set = ST0_CU0;
@@ -1616,7 +1616,9 @@ void __cpuinit per_cpu_trap_init(void)
 #ifdef CONFIG_MIPS_MT_SMTC
 	if (bootTC) {
 #endif /* CONFIG_MIPS_MT_SMTC */
-		cpu_cache_init();
+		/* Boot CPU's cache setup in setup_arch(). */
+		if (!is_boot_cpu)
+			cpu_cache_init();
 		tlb_init();
 #ifdef CONFIG_MIPS_MT_SMTC
 	} else if (!secondaryTC) {
@@ -1632,7 +1634,7 @@ void __cpuinit per_cpu_trap_init(void)
 }
 
 /* Install CPU exception handler */
-void __init set_handler(unsigned long offset, void *addr, unsigned long size)
+void __cpuinit set_handler(unsigned long offset, void *addr, unsigned long size)
 {
 	memcpy((void *)(ebase + offset), addr, size);
 	local_flush_icache_range(ebase + offset, ebase + offset + size);
@@ -1693,7 +1695,7 @@ void __init trap_init(void)
 
 	if (board_ebase_setup)
 		board_ebase_setup();
-	per_cpu_trap_init();
+	per_cpu_trap_init(true);
 
 	/*
 	 * Copy the generic exception handlers to their final destination.
@@ -1797,6 +1799,9 @@ void __init trap_init(void)
 
 	set_except_vector(26, handle_dsp);
 
+	if (board_cache_error_setup)
+		board_cache_error_setup();
+
 	if (cpu_has_vce)
 		/* Special exception: R4[04]00 uses also the divec space. */
 		memcpy((void *)(ebase + 0x180), &except_vec3_r4000, 0x100);
diff --git a/arch/mips/lantiq/Kconfig b/arch/mips/lantiq/Kconfig
index 3fccf2104513..20bdf40b3efa 100644
--- a/arch/mips/lantiq/Kconfig
+++ b/arch/mips/lantiq/Kconfig
@@ -16,8 +16,22 @@ config SOC_XWAY
 	bool "XWAY"
 	select SOC_TYPE_XWAY
 	select HW_HAS_PCI
+
+config SOC_FALCON
+	bool "FALCON"
+
+endchoice
+
+choice
+	prompt "Devicetree"
+
+config DT_EASY50712
+	bool "Easy50712"
+	depends on SOC_XWAY
 endchoice
 
-source "arch/mips/lantiq/xway/Kconfig"
+config PCI_LANTIQ
+	bool "PCI Support"
+	depends on SOC_XWAY && PCI
 
 endif
diff --git a/arch/mips/lantiq/Makefile b/arch/mips/lantiq/Makefile
index e5dae0e24b00..d6bdc579419f 100644
--- a/arch/mips/lantiq/Makefile
+++ b/arch/mips/lantiq/Makefile
@@ -4,8 +4,11 @@
 # under the terms of the GNU General Public License version 2 as published
 # by the Free Software Foundation.
 
-obj-y := irq.o setup.o clk.o prom.o devices.o
+obj-y := irq.o clk.o prom.o
+
+obj-y += dts/
 
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 
 obj-$(CONFIG_SOC_TYPE_XWAY) += xway/
+obj-$(CONFIG_SOC_FALCON) += falcon/
diff --git a/arch/mips/lantiq/Platform b/arch/mips/lantiq/Platform
index f3dff05722de..b3ec49838fd7 100644
--- a/arch/mips/lantiq/Platform
+++ b/arch/mips/lantiq/Platform
@@ -6,3 +6,4 @@ platform-$(CONFIG_LANTIQ)	+= lantiq/
 cflags-$(CONFIG_LANTIQ)		+= -I$(srctree)/arch/mips/include/asm/mach-lantiq
 load-$(CONFIG_LANTIQ)		= 0xffffffff80002000
 cflags-$(CONFIG_SOC_TYPE_XWAY)	+= -I$(srctree)/arch/mips/include/asm/mach-lantiq/xway
+cflags-$(CONFIG_SOC_FALCON)	+= -I$(srctree)/arch/mips/include/asm/mach-lantiq/falcon
diff --git a/arch/mips/lantiq/clk.c b/arch/mips/lantiq/clk.c
index 412814fdd3ee..d3bcc33f4699 100644
--- a/arch/mips/lantiq/clk.c
+++ b/arch/mips/lantiq/clk.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/clk.h>
+#include <linux/clkdev.h>
 #include <linux/err.h>
 #include <linux/list.h>
 
@@ -22,44 +23,32 @@
 #include <lantiq_soc.h>
 
 #include "clk.h"
+#include "prom.h"
 
-struct clk {
-	const char *name;
-	unsigned long rate;
-	unsigned long (*get_rate) (void);
-};
+/* lantiq socs have 3 static clocks */
+static struct clk cpu_clk_generic[3];
 
-static struct clk *cpu_clk;
-static int cpu_clk_cnt;
+void clkdev_add_static(unsigned long cpu, unsigned long fpi, unsigned long io)
+{
+	cpu_clk_generic[0].rate = cpu;
+	cpu_clk_generic[1].rate = fpi;
+	cpu_clk_generic[2].rate = io;
+}
 
-/* lantiq socs have 3 static clocks */
-static struct clk cpu_clk_generic[] = {
-	{
-		.name = "cpu",
-		.get_rate = ltq_get_cpu_hz,
-	}, {
-		.name = "fpi",
-		.get_rate = ltq_get_fpi_hz,
-	}, {
-		.name = "io",
-		.get_rate = ltq_get_io_region_clock,
-	},
-};
-
-static struct resource ltq_cgu_resource = {
-	.name	= "cgu",
-	.start	= LTQ_CGU_BASE_ADDR,
-	.end	= LTQ_CGU_BASE_ADDR + LTQ_CGU_SIZE - 1,
-	.flags	= IORESOURCE_MEM,
-};
-
-/* remapped clock register range */
-void __iomem *ltq_cgu_membase;
-
-void clk_init(void)
+struct clk *clk_get_cpu(void)
+{
+	return &cpu_clk_generic[0];
+}
+
+struct clk *clk_get_fpi(void)
+{
+	return &cpu_clk_generic[1];
+}
+EXPORT_SYMBOL_GPL(clk_get_fpi);
+
+struct clk *clk_get_io(void)
 {
-	cpu_clk = cpu_clk_generic;
-	cpu_clk_cnt = ARRAY_SIZE(cpu_clk_generic);
+	return &cpu_clk_generic[2];
 }
 
 static inline int clk_good(struct clk *clk)
@@ -82,38 +71,71 @@ unsigned long clk_get_rate(struct clk *clk)
 }
 EXPORT_SYMBOL(clk_get_rate);
 
-struct clk *clk_get(struct device *dev, const char *id)
+int clk_set_rate(struct clk *clk, unsigned long rate)
 {
-	int i;
-
-	for (i = 0; i < cpu_clk_cnt; i++)
-		if (!strcmp(id, cpu_clk[i].name))
-			return &cpu_clk[i];
-	BUG();
-	return ERR_PTR(-ENOENT);
-}
-EXPORT_SYMBOL(clk_get);
-
-void clk_put(struct clk *clk)
-{
-	/* not used */
+	if (unlikely(!clk_good(clk)))
+		return 0;
+	if (clk->rates && *clk->rates) {
+		unsigned long *r = clk->rates;
+
+		while (*r && (*r != rate))
+			r++;
+		if (!*r) {
+			pr_err("clk %s.%s: trying to set invalid rate %ld\n",
+				clk->cl.dev_id, clk->cl.con_id, rate);
+			return -1;
+		}
+	}
+	clk->rate = rate;
+	return 0;
 }
-EXPORT_SYMBOL(clk_put);
+EXPORT_SYMBOL(clk_set_rate);
 
 int clk_enable(struct clk *clk)
 {
-	/* not used */
-	return 0;
+	if (unlikely(!clk_good(clk)))
+		return -1;
+
+	if (clk->enable)
+		return clk->enable(clk);
+
+	return -1;
 }
 EXPORT_SYMBOL(clk_enable);
 
 void clk_disable(struct clk *clk)
 {
-	/* not used */
+	if (unlikely(!clk_good(clk)))
+		return;
+
+	if (clk->disable)
+		clk->disable(clk);
 }
 EXPORT_SYMBOL(clk_disable);
 
-static inline u32 ltq_get_counter_resolution(void)
+int clk_activate(struct clk *clk)
+{
+	if (unlikely(!clk_good(clk)))
+		return -1;
+
+	if (clk->activate)
+		return clk->activate(clk);
+
+	return -1;
+}
+EXPORT_SYMBOL(clk_activate);
+
+void clk_deactivate(struct clk *clk)
+{
+	if (unlikely(!clk_good(clk)))
+		return;
+
+	if (clk->deactivate)
+		clk->deactivate(clk);
+}
+EXPORT_SYMBOL(clk_deactivate);
+
+static inline u32 get_counter_resolution(void)
 {
 	u32 res;
 
@@ -133,21 +155,11 @@ void __init plat_time_init(void)
 {
 	struct clk *clk;
 
-	if (insert_resource(&iomem_resource, &ltq_cgu_resource) < 0)
-		panic("Failed to insert cgu memory");
+	ltq_soc_init();
 
-	if (request_mem_region(ltq_cgu_resource.start,
-			resource_size(&ltq_cgu_resource), "cgu") < 0)
-		panic("Failed to request cgu memory");
-
-	ltq_cgu_membase = ioremap_nocache(ltq_cgu_resource.start,
-				resource_size(&ltq_cgu_resource));
-	if (!ltq_cgu_membase) {
-		pr_err("Failed to remap cgu memory\n");
-		unreachable();
-	}
-	clk = clk_get(0, "cpu");
-	mips_hpt_frequency = clk_get_rate(clk) / ltq_get_counter_resolution();
+	clk = clk_get_cpu();
+	mips_hpt_frequency = clk_get_rate(clk) / get_counter_resolution();
 	write_c0_compare(read_c0_count());
+	pr_info("CPU Clock: %ldMHz\n", clk_get_rate(clk) / 1000000);
 	clk_put(clk);
 }
diff --git a/arch/mips/lantiq/clk.h b/arch/mips/lantiq/clk.h
index 3328925f2c3f..fa670602b91b 100644
--- a/arch/mips/lantiq/clk.h
+++ b/arch/mips/lantiq/clk.h
@@ -9,10 +9,70 @@
 #ifndef _LTQ_CLK_H__
 #define _LTQ_CLK_H__
 
-extern void clk_init(void);
+#include <linux/clkdev.h>
 
-extern unsigned long ltq_get_cpu_hz(void);
-extern unsigned long ltq_get_fpi_hz(void);
-extern unsigned long ltq_get_io_region_clock(void);
+/* clock speeds */
+#define CLOCK_33M	33333333
+#define CLOCK_60M	60000000
+#define CLOCK_62_5M	62500000
+#define CLOCK_83M	83333333
+#define CLOCK_83_5M	83500000
+#define CLOCK_98_304M	98304000
+#define CLOCK_100M	100000000
+#define CLOCK_111M	111111111
+#define CLOCK_125M	125000000
+#define CLOCK_133M	133333333
+#define CLOCK_150M	150000000
+#define CLOCK_166M	166666666
+#define CLOCK_167M	166666667
+#define CLOCK_196_608M	196608000
+#define CLOCK_200M	200000000
+#define CLOCK_250M	250000000
+#define CLOCK_266M	266666666
+#define CLOCK_300M	300000000
+#define CLOCK_333M	333333333
+#define CLOCK_393M	393215332
+#define CLOCK_400M	400000000
+#define CLOCK_500M	500000000
+#define CLOCK_600M	600000000
+
+/* clock out speeds */
+#define CLOCK_32_768K	32768
+#define CLOCK_1_536M	1536000
+#define CLOCK_2_5M	2500000
+#define CLOCK_12M	12000000
+#define CLOCK_24M	24000000
+#define CLOCK_25M	25000000
+#define CLOCK_30M	30000000
+#define CLOCK_40M	40000000
+#define CLOCK_48M	48000000
+#define CLOCK_50M	50000000
+#define CLOCK_60M	60000000
+
+struct clk {
+	struct clk_lookup cl;
+	unsigned long rate;
+	unsigned long *rates;
+	unsigned int module;
+	unsigned int bits;
+	unsigned long (*get_rate) (void);
+	int (*enable) (struct clk *clk);
+	void (*disable) (struct clk *clk);
+	int (*activate) (struct clk *clk);
+	void (*deactivate) (struct clk *clk);
+	void (*reboot) (struct clk *clk);
+};
+
+extern void clkdev_add_static(unsigned long cpu, unsigned long fpi,
+				unsigned long io);
+
+extern unsigned long ltq_danube_cpu_hz(void);
+extern unsigned long ltq_danube_fpi_hz(void);
+
+extern unsigned long ltq_ar9_cpu_hz(void);
+extern unsigned long ltq_ar9_fpi_hz(void);
+
+extern unsigned long ltq_vr9_cpu_hz(void);
+extern unsigned long ltq_vr9_fpi_hz(void);
 
 #endif
diff --git a/arch/mips/lantiq/devices.c b/arch/mips/lantiq/devices.c
deleted file mode 100644
index de1cb2bcd79a..000000000000
--- a/arch/mips/lantiq/devices.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/reboot.h>
-#include <linux/platform_device.h>
-#include <linux/leds.h>
-#include <linux/etherdevice.h>
-#include <linux/time.h>
-#include <linux/io.h>
-#include <linux/gpio.h>
-
-#include <asm/bootinfo.h>
-#include <asm/irq.h>
-
-#include <lantiq_soc.h>
-
-#include "devices.h"
-
-/* nor flash */
-static struct resource ltq_nor_resource = {
-	.name	= "nor",
-	.start	= LTQ_FLASH_START,
-	.end	= LTQ_FLASH_START + LTQ_FLASH_MAX - 1,
-	.flags  = IORESOURCE_MEM,
-};
-
-static struct platform_device ltq_nor = {
-	.name		= "ltq_nor",
-	.resource	= &ltq_nor_resource,
-	.num_resources	= 1,
-};
-
-void __init ltq_register_nor(struct physmap_flash_data *data)
-{
-	ltq_nor.dev.platform_data = data;
-	platform_device_register(&ltq_nor);
-}
-
-/* watchdog */
-static struct resource ltq_wdt_resource = {
-	.name	= "watchdog",
-	.start  = LTQ_WDT_BASE_ADDR,
-	.end    = LTQ_WDT_BASE_ADDR + LTQ_WDT_SIZE - 1,
-	.flags  = IORESOURCE_MEM,
-};
-
-void __init ltq_register_wdt(void)
-{
-	platform_device_register_simple("ltq_wdt", 0, &ltq_wdt_resource, 1);
-}
-
-/* asc ports */
-static struct resource ltq_asc0_resources[] = {
-	{
-		.name	= "asc0",
-		.start  = LTQ_ASC0_BASE_ADDR,
-		.end    = LTQ_ASC0_BASE_ADDR + LTQ_ASC_SIZE - 1,
-		.flags  = IORESOURCE_MEM,
-	},
-	IRQ_RES(tx, LTQ_ASC_TIR(0)),
-	IRQ_RES(rx, LTQ_ASC_RIR(0)),
-	IRQ_RES(err, LTQ_ASC_EIR(0)),
-};
-
-static struct resource ltq_asc1_resources[] = {
-	{
-		.name	= "asc1",
-		.start  = LTQ_ASC1_BASE_ADDR,
-		.end    = LTQ_ASC1_BASE_ADDR + LTQ_ASC_SIZE - 1,
-		.flags  = IORESOURCE_MEM,
-	},
-	IRQ_RES(tx, LTQ_ASC_TIR(1)),
-	IRQ_RES(rx, LTQ_ASC_RIR(1)),
-	IRQ_RES(err, LTQ_ASC_EIR(1)),
-};
-
-void __init ltq_register_asc(int port)
-{
-	switch (port) {
-	case 0:
-		platform_device_register_simple("ltq_asc", 0,
-			ltq_asc0_resources, ARRAY_SIZE(ltq_asc0_resources));
-		break;
-	case 1:
-		platform_device_register_simple("ltq_asc", 1,
-			ltq_asc1_resources, ARRAY_SIZE(ltq_asc1_resources));
-		break;
-	default:
-		break;
-	}
-}
-
-#ifdef CONFIG_PCI
-/* pci */
-static struct platform_device ltq_pci = {
-	.name		= "ltq_pci",
-	.num_resources	= 0,
-};
-
-void __init ltq_register_pci(struct ltq_pci_data *data)
-{
-	ltq_pci.dev.platform_data = data;
-	platform_device_register(&ltq_pci);
-}
-#else
-void __init ltq_register_pci(struct ltq_pci_data *data)
-{
-	pr_err("kernel is compiled without PCI support\n");
-}
-#endif
diff --git a/arch/mips/lantiq/devices.h b/arch/mips/lantiq/devices.h
deleted file mode 100644
index 2947bb19a528..000000000000
--- a/arch/mips/lantiq/devices.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#ifndef _LTQ_DEVICES_H__
-#define _LTQ_DEVICES_H__
-
-#include <lantiq_platform.h>
-#include <linux/mtd/physmap.h>
-
-#define IRQ_RES(resname, irq) \
-	{.name = #resname, .start = (irq), .flags = IORESOURCE_IRQ}
-
-extern void ltq_register_nor(struct physmap_flash_data *data);
-extern void ltq_register_wdt(void);
-extern void ltq_register_asc(int port);
-extern void ltq_register_pci(struct ltq_pci_data *data);
-
-#endif
diff --git a/arch/mips/lantiq/dts/Makefile b/arch/mips/lantiq/dts/Makefile
new file mode 100644
index 000000000000..674fca45f72d
--- /dev/null
+++ b/arch/mips/lantiq/dts/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_DT_EASY50712) := easy50712.dtb.o
+
+$(obj)/%.dtb: $(obj)/%.dts
+	$(call if_changed,dtc)
diff --git a/arch/mips/lantiq/dts/danube.dtsi b/arch/mips/lantiq/dts/danube.dtsi
new file mode 100644
index 000000000000..3a4520f009cf
--- /dev/null
+++ b/arch/mips/lantiq/dts/danube.dtsi
@@ -0,0 +1,105 @@
+/ {
+	#address-cells = <1>;
+	#size-cells = <1>;
+	compatible = "lantiq,xway", "lantiq,danube";
+
+	cpus {
+		cpu@0 {
+			compatible = "mips,mips24Kc";
+		};
+	};
+
+	biu@1F800000 {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		compatible = "lantiq,biu", "simple-bus";
+		reg = <0x1F800000 0x800000>;
+		ranges = <0x0 0x1F800000 0x7FFFFF>;
+
+		icu0: icu@80200 {
+			#interrupt-cells = <1>;
+			interrupt-controller;
+			compatible = "lantiq,icu";
+			reg = <0x80200 0x120>;
+		};
+
+		watchdog@803F0 {
+			compatible = "lantiq,wdt";
+			reg = <0x803F0 0x10>;
+		};
+	};
+
+	sram@1F000000 {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		compatible = "lantiq,sram";
+		reg = <0x1F000000 0x800000>;
+		ranges = <0x0 0x1F000000 0x7FFFFF>;
+
+		eiu0: eiu@101000 {
+			#interrupt-cells = <1>;
+			interrupt-controller;
+			interrupt-parent;
+			compatible = "lantiq,eiu-xway";
+			reg = <0x101000 0x1000>;
+		};
+
+		pmu0: pmu@102000 {
+			compatible = "lantiq,pmu-xway";
+			reg = <0x102000 0x1000>;
+		};
+
+		cgu0: cgu@103000 {
+			compatible = "lantiq,cgu-xway";
+			reg = <0x103000 0x1000>;
+			#clock-cells = <1>;
+		};
+
+		rcu0: rcu@203000 {
+			compatible = "lantiq,rcu-xway";
+			reg = <0x203000 0x1000>;
+		};
+	};
+
+	fpi@10000000 {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		compatible = "lantiq,fpi", "simple-bus";
+		ranges = <0x0 0x10000000 0xEEFFFFF>;
+		reg = <0x10000000 0xEF00000>;
+
+		gptu@E100A00 {
+			compatible = "lantiq,gptu-xway";
+			reg = <0xE100A00 0x100>;
+		};
+
+		serial@E100C00 {
+			compatible = "lantiq,asc";
+			reg = <0xE100C00 0x400>;
+			interrupt-parent = <&icu0>;
+			interrupts = <112 113 114>;
+		};
+
+		dma0: dma@E104100 {
+			compatible = "lantiq,dma-xway";
+			reg = <0xE104100 0x800>;
+		};
+
+		ebu0: ebu@E105300 {
+			compatible = "lantiq,ebu-xway";
+			reg = <0xE105300 0x100>;
+		};
+
+		pci0: pci@E105400 {
+			#address-cells = <3>;
+			#size-cells = <2>;
+			#interrupt-cells = <1>;
+			compatible = "lantiq,pci-xway";
+			bus-range = <0x0 0x0>;
+			ranges = <0x2000000 0 0x8000000 0x8000000 0 0x2000000	/* pci memory */
+				  0x1000000 0 0x00000000 0xAE00000 0 0x200000>;	/* io space */
+			reg = <0x7000000 0x8000		/* config space */
+				0xE105400 0x400>;	/* pci bridge */
+		};
+	};
+};
diff --git a/arch/mips/lantiq/dts/easy50712.dts b/arch/mips/lantiq/dts/easy50712.dts
new file mode 100644
index 000000000000..68c17310bc82
--- /dev/null
+++ b/arch/mips/lantiq/dts/easy50712.dts
@@ -0,0 +1,113 @@
+/dts-v1/;
+
+/include/ "danube.dtsi"
+
+/ {
+	chosen {
+		bootargs = "console=ttyLTQ0,115200 init=/etc/preinit";
+	};
+
+	memory@0 {
+		reg = <0x0 0x2000000>;
+	};
+
+	fpi@10000000 {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		localbus@0 {
+			#address-cells = <2>;
+			#size-cells = <1>;
+			ranges = <0 0 0x0 0x3ffffff /* addrsel0 */
+				1 0 0x4000000 0x4000010>; /* addsel1 */
+			compatible = "lantiq,localbus", "simple-bus";
+
+			nor-boot@0 {
+				compatible = "lantiq,nor";
+				bank-width = <2>;
+				reg = <0 0x0 0x2000000>;
+				#address-cells = <1>;
+				#size-cells = <1>;
+
+				partition@0 {
+					label = "uboot";
+					reg = <0x00000 0x10000>; /* 64 KB */
+				};
+
+				partition@10000 {
+					label = "uboot_env";
+					reg = <0x10000 0x10000>; /* 64 KB */
+				};
+
+				partition@20000 {
+					label = "linux";
+					reg = <0x20000 0x3d0000>;
+				};
+
+				partition@400000 {
+					label = "rootfs";
+					reg = <0x400000 0x400000>;
+				};
+			};
+		};
+
+		gpio: pinmux@E100B10 {
+			compatible = "lantiq,pinctrl-xway";
+			pinctrl-names = "default";
+			pinctrl-0 = <&state_default>;
+
+			#gpio-cells = <2>;
+			gpio-controller;
+			reg = <0xE100B10 0xA0>;
+
+			state_default: pinmux {
+				stp {
+					lantiq,groups = "stp";
+					lantiq,function = "stp";
+				};
+				exin {
+					lantiq,groups = "exin1";
+					lantiq,function = "exin";
+				};
+				pci {
+					lantiq,groups = "gnt1";
+					lantiq,function = "pci";
+				};
+				conf_out {
+					lantiq,pins = "io4", "io5", "io6"; /* stp */
+					lantiq,open-drain;
+					lantiq,pull = <0>;
+				};
+			};
+		};
+
+		etop@E180000 {
+			compatible = "lantiq,etop-xway";
+			reg = <0xE180000 0x40000>;
+			interrupt-parent = <&icu0>;
+			interrupts = <73 78>;
+			phy-mode = "rmii";
+			mac-address = [ 00 11 22 33 44 55 ];
+		};
+
+		stp0: stp@E100BB0 {
+			#gpio-cells = <2>;
+			compatible = "lantiq,gpio-stp-xway";
+			gpio-controller;
+			reg = <0xE100BB0 0x40>;
+
+			lantiq,shadow = <0xfff>;
+			lantiq,groups = <0x3>;
+		};
+
+		pci@E105400 {
+			lantiq,bus-clock = <33333333>;
+			interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
+			interrupt-map = <
+                                0x7000 0 0 1 &icu0 29 1 // slot 14, irq 29
+			>;
+			gpios-reset = <&gpio 21 0>;
+			req-mask = <0x1>;		/* GNT1 */
+		};
+
+	};
+};
diff --git a/arch/mips/lantiq/early_printk.c b/arch/mips/lantiq/early_printk.c
index 972e05f87631..9b28d0940ef4 100644
--- a/arch/mips/lantiq/early_printk.c
+++ b/arch/mips/lantiq/early_printk.c
@@ -6,17 +6,16 @@
  *  Copyright (C) 2010 John Crispin <[email protected]>
  */
 
-#include <linux/init.h>
 #include <linux/cpu.h>
-
-#include <lantiq.h>
 #include <lantiq_soc.h>
 
-/* no ioremap possible at this early stage, lets use KSEG1 instead  */
-#define LTQ_ASC_BASE	KSEG1ADDR(LTQ_ASC1_BASE_ADDR)
 #define ASC_BUF		1024
-#define LTQ_ASC_FSTAT	((u32 *)(LTQ_ASC_BASE + 0x0048))
-#define LTQ_ASC_TBUF	((u32 *)(LTQ_ASC_BASE + 0x0020))
+#define LTQ_ASC_FSTAT	((u32 *)(LTQ_EARLY_ASC + 0x0048))
+#ifdef __BIG_ENDIAN
+#define LTQ_ASC_TBUF	((u32 *)(LTQ_EARLY_ASC + 0x0020 + 3))
+#else
+#define LTQ_ASC_TBUF	((u32 *)(LTQ_EARLY_ASC + 0x0020))
+#endif
 #define TXMASK		0x3F00
 #define TXOFFSET	8
 
@@ -27,7 +26,7 @@ void prom_putchar(char c)
 	local_irq_save(flags);
 	do { } while ((ltq_r32(LTQ_ASC_FSTAT) & TXMASK) >> TXOFFSET);
 	if (c == '\n')
-		ltq_w32('\r', LTQ_ASC_TBUF);
-	ltq_w32(c, LTQ_ASC_TBUF);
+		ltq_w8('\r', LTQ_ASC_TBUF);
+	ltq_w8(c, LTQ_ASC_TBUF);
 	local_irq_restore(flags);
 }
diff --git a/arch/mips/lantiq/falcon/Makefile b/arch/mips/lantiq/falcon/Makefile
new file mode 100644
index 000000000000..ff220f97693d
--- /dev/null
+++ b/arch/mips/lantiq/falcon/Makefile
@@ -0,0 +1 @@
+obj-y := prom.o reset.o sysctrl.o
diff --git a/arch/mips/lantiq/falcon/prom.c b/arch/mips/lantiq/falcon/prom.c
new file mode 100644
index 000000000000..c1d278f05a3a
--- /dev/null
+++ b/arch/mips/lantiq/falcon/prom.c
@@ -0,0 +1,87 @@
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Copyright (C) 2012 Thomas Langer <[email protected]>
+ * Copyright (C) 2012 John Crispin <[email protected]>
+ */
+
+#include <linux/kernel.h>
+#include <asm/io.h>
+
+#include <lantiq_soc.h>
+
+#include "../prom.h"
+
+#define SOC_FALCON	"Falcon"
+#define SOC_FALCON_D	"Falcon-D"
+#define SOC_FALCON_V	"Falcon-V"
+#define SOC_FALCON_M	"Falcon-M"
+
+#define COMP_FALCON	"lantiq,falcon"
+
+#define PART_SHIFT	12
+#define PART_MASK	0x0FFFF000
+#define REV_SHIFT	28
+#define REV_MASK	0xF0000000
+#define SREV_SHIFT	22
+#define SREV_MASK	0x03C00000
+#define TYPE_SHIFT	26
+#define TYPE_MASK	0x3C000000
+
+/* reset, nmi and ejtag exception vectors */
+#define BOOT_REG_BASE	(KSEG1 | 0x1F200000)
+#define BOOT_RVEC	(BOOT_REG_BASE | 0x00)
+#define BOOT_NVEC	(BOOT_REG_BASE | 0x04)
+#define BOOT_EVEC	(BOOT_REG_BASE | 0x08)
+
+void __init ltq_soc_nmi_setup(void)
+{
+	extern void (*nmi_handler)(void);
+
+	ltq_w32((unsigned long)&nmi_handler, (void *)BOOT_NVEC);
+}
+
+void __init ltq_soc_ejtag_setup(void)
+{
+	extern void (*ejtag_debug_handler)(void);
+
+	ltq_w32((unsigned long)&ejtag_debug_handler, (void *)BOOT_EVEC);
+}
+
+void __init ltq_soc_detect(struct ltq_soc_info *i)
+{
+	u32 type;
+	i->partnum = (ltq_r32(FALCON_CHIPID) & PART_MASK) >> PART_SHIFT;
+	i->rev = (ltq_r32(FALCON_CHIPID) & REV_MASK) >> REV_SHIFT;
+	i->srev = ((ltq_r32(FALCON_CHIPCONF) & SREV_MASK) >> SREV_SHIFT);
+	i->compatible = COMP_FALCON;
+	i->type = SOC_TYPE_FALCON;
+	sprintf(i->rev_type, "%c%d%d", (i->srev & 0x4) ? ('B') : ('A'),
+		i->rev & 0x7, (i->srev & 0x3) + 1);
+
+	switch (i->partnum) {
+	case SOC_ID_FALCON:
+		type = (ltq_r32(FALCON_CHIPTYPE) & TYPE_MASK) >> TYPE_SHIFT;
+		switch (type) {
+		case 0:
+			i->name = SOC_FALCON_D;
+			break;
+		case 1:
+			i->name = SOC_FALCON_V;
+			break;
+		case 2:
+			i->name = SOC_FALCON_M;
+			break;
+		default:
+			i->name = SOC_FALCON;
+			break;
+		}
+		break;
+
+	default:
+		unreachable();
+		break;
+	}
+}
diff --git a/arch/mips/lantiq/falcon/reset.c b/arch/mips/lantiq/falcon/reset.c
new file mode 100644
index 000000000000..568248253426
--- /dev/null
+++ b/arch/mips/lantiq/falcon/reset.c
@@ -0,0 +1,90 @@
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Copyright (C) 2012 Thomas Langer <[email protected]>
+ * Copyright (C) 2012 John Crispin <[email protected]>
+ */
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/pm.h>
+#include <asm/reboot.h>
+#include <linux/export.h>
+
+#include <lantiq_soc.h>
+
+/* CPU0 Reset Source Register */
+#define SYS1_CPU0RS		0x0040
+/* reset cause mask */
+#define CPU0RS_MASK		0x0003
+/* CPU0 Boot Mode Register */
+#define SYS1_BM			0x00a0
+/* boot mode mask */
+#define BM_MASK			0x0005
+
+/* allow platform code to find out what surce we booted from */
+unsigned char ltq_boot_select(void)
+{
+	return ltq_sys1_r32(SYS1_BM) & BM_MASK;
+}
+
+/* allow the watchdog driver to find out what the boot reason was */
+int ltq_reset_cause(void)
+{
+	return ltq_sys1_r32(SYS1_CPU0RS) & CPU0RS_MASK;
+}
+EXPORT_SYMBOL_GPL(ltq_reset_cause);
+
+#define BOOT_REG_BASE	(KSEG1 | 0x1F200000)
+#define BOOT_PW1_REG	(BOOT_REG_BASE | 0x20)
+#define BOOT_PW2_REG	(BOOT_REG_BASE | 0x24)
+#define BOOT_PW1	0x4C545100
+#define BOOT_PW2	0x0051544C
+
+#define WDT_REG_BASE	(KSEG1 | 0x1F8803F0)
+#define WDT_PW1		0x00BE0000
+#define WDT_PW2		0x00DC0000
+
+static void machine_restart(char *command)
+{
+	local_irq_disable();
+
+	/* reboot magic */
+	ltq_w32(BOOT_PW1, (void *)BOOT_PW1_REG); /* 'LTQ\0' */
+	ltq_w32(BOOT_PW2, (void *)BOOT_PW2_REG); /* '\0QTL' */
+	ltq_w32(0, (void *)BOOT_REG_BASE); /* reset Bootreg RVEC */
+
+	/* watchdog magic */
+	ltq_w32(WDT_PW1, (void *)WDT_REG_BASE);
+	ltq_w32(WDT_PW2 |
+		(0x3 << 26) | /* PWL */
+		(0x2 << 24) | /* CLKDIV */
+		(0x1 << 31) | /* enable */
+		(1), /* reload */
+		(void *)WDT_REG_BASE);
+	unreachable();
+}
+
+static void machine_halt(void)
+{
+	local_irq_disable();
+	unreachable();
+}
+
+static void machine_power_off(void)
+{
+	local_irq_disable();
+	unreachable();
+}
+
+static int __init mips_reboot_setup(void)
+{
+	_machine_restart = machine_restart;
+	_machine_halt = machine_halt;
+	pm_power_off = machine_power_off;
+	return 0;
+}
+
+arch_initcall(mips_reboot_setup);
diff --git a/arch/mips/lantiq/falcon/sysctrl.c b/arch/mips/lantiq/falcon/sysctrl.c
new file mode 100644
index 000000000000..ba0123d13d40
--- /dev/null
+++ b/arch/mips/lantiq/falcon/sysctrl.c
@@ -0,0 +1,260 @@
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Copyright (C) 2011 Thomas Langer <[email protected]>
+ * Copyright (C) 2011 John Crispin <[email protected]>
+ */
+
+#include <linux/ioport.h>
+#include <linux/export.h>
+#include <linux/clkdev.h>
+#include <linux/of_address.h>
+#include <asm/delay.h>
+
+#include <lantiq_soc.h>
+
+#include "../clk.h"
+
+/* infrastructure control register */
+#define SYS1_INFRAC		0x00bc
+/* Configuration fuses for drivers and pll */
+#define STATUS_CONFIG		0x0040
+
+/* GPE frequency selection */
+#define GPPC_OFFSET		24
+#define GPEFREQ_MASK		0x00000C0
+#define GPEFREQ_OFFSET		10
+/* Clock status register */
+#define SYSCTL_CLKS		0x0000
+/* Clock enable register */
+#define SYSCTL_CLKEN		0x0004
+/* Clock clear register */
+#define SYSCTL_CLKCLR		0x0008
+/* Activation Status Register */
+#define SYSCTL_ACTS		0x0020
+/* Activation Register */
+#define SYSCTL_ACT		0x0024
+/* Deactivation Register */
+#define SYSCTL_DEACT		0x0028
+/* reboot Register */
+#define SYSCTL_RBT		0x002c
+/* CPU0 Clock Control Register */
+#define SYS1_CPU0CC		0x0040
+/* HRST_OUT_N Control Register */
+#define SYS1_HRSTOUTC		0x00c0
+/* clock divider bit */
+#define CPU0CC_CPUDIV		0x0001
+
+/* Activation Status Register */
+#define ACTS_ASC1_ACT	0x00000800
+#define ACTS_I2C_ACT	0x00004000
+#define ACTS_P0		0x00010000
+#define ACTS_P1		0x00010000
+#define ACTS_P2		0x00020000
+#define ACTS_P3		0x00020000
+#define ACTS_P4		0x00040000
+#define ACTS_PADCTRL0	0x00100000
+#define ACTS_PADCTRL1	0x00100000
+#define ACTS_PADCTRL2	0x00200000
+#define ACTS_PADCTRL3	0x00200000
+#define ACTS_PADCTRL4	0x00400000
+
+#define sysctl_w32(m, x, y)	ltq_w32((x), sysctl_membase[m] + (y))
+#define sysctl_r32(m, x)	ltq_r32(sysctl_membase[m] + (x))
+#define sysctl_w32_mask(m, clear, set, reg)	\
+		sysctl_w32(m, (sysctl_r32(m, reg) & ~(clear)) | (set), reg)
+
+#define status_w32(x, y)	ltq_w32((x), status_membase + (y))
+#define status_r32(x)		ltq_r32(status_membase + (x))
+
+static void __iomem *sysctl_membase[3], *status_membase;
+void __iomem *ltq_sys1_membase, *ltq_ebu_membase;
+
+void falcon_trigger_hrst(int level)
+{
+	sysctl_w32(SYSCTL_SYS1, level & 1, SYS1_HRSTOUTC);
+}
+
+static inline void sysctl_wait(struct clk *clk,
+		unsigned int test, unsigned int reg)
+{
+	int err = 1000000;
+
+	do {} while (--err && ((sysctl_r32(clk->module, reg)
+					& clk->bits) != test));
+	if (!err)
+		pr_err("module de/activation failed %d %08X %08X %08X\n",
+			clk->module, clk->bits, test,
+			sysctl_r32(clk->module, reg) & clk->bits);
+}
+
+static int sysctl_activate(struct clk *clk)
+{
+	sysctl_w32(clk->module, clk->bits, SYSCTL_CLKEN);
+	sysctl_w32(clk->module, clk->bits, SYSCTL_ACT);
+	sysctl_wait(clk, clk->bits, SYSCTL_ACTS);
+	return 0;
+}
+
+static void sysctl_deactivate(struct clk *clk)
+{
+	sysctl_w32(clk->module, clk->bits, SYSCTL_CLKCLR);
+	sysctl_w32(clk->module, clk->bits, SYSCTL_DEACT);
+	sysctl_wait(clk, 0, SYSCTL_ACTS);
+}
+
+static int sysctl_clken(struct clk *clk)
+{
+	sysctl_w32(clk->module, clk->bits, SYSCTL_CLKEN);
+	sysctl_wait(clk, clk->bits, SYSCTL_CLKS);
+	return 0;
+}
+
+static void sysctl_clkdis(struct clk *clk)
+{
+	sysctl_w32(clk->module, clk->bits, SYSCTL_CLKCLR);
+	sysctl_wait(clk, 0, SYSCTL_CLKS);
+}
+
+static void sysctl_reboot(struct clk *clk)
+{
+	unsigned int act;
+	unsigned int bits;
+
+	act = sysctl_r32(clk->module, SYSCTL_ACT);
+	bits = ~act & clk->bits;
+	if (bits != 0) {
+		sysctl_w32(clk->module, bits, SYSCTL_CLKEN);
+		sysctl_w32(clk->module, bits, SYSCTL_ACT);
+		sysctl_wait(clk, bits, SYSCTL_ACTS);
+	}
+	sysctl_w32(clk->module, act & clk->bits, SYSCTL_RBT);
+	sysctl_wait(clk, clk->bits, SYSCTL_ACTS);
+}
+
+/* enable the ONU core */
+static void falcon_gpe_enable(void)
+{
+	unsigned int freq;
+	unsigned int status;
+
+	/* if if the clock is already enabled */
+	status = sysctl_r32(SYSCTL_SYS1, SYS1_INFRAC);
+	if (status & (1 << (GPPC_OFFSET + 1)))
+		return;
+
+	if (status_r32(STATUS_CONFIG) == 0)
+		freq = 1; /* use 625MHz on unfused chip */
+	else
+		freq = (status_r32(STATUS_CONFIG) &
+			GPEFREQ_MASK) >>
+			GPEFREQ_OFFSET;
+
+	/* apply new frequency */
+	sysctl_w32_mask(SYSCTL_SYS1, 7 << (GPPC_OFFSET + 1),
+		freq << (GPPC_OFFSET + 2) , SYS1_INFRAC);
+	udelay(1);
+
+	/* enable new frequency */
+	sysctl_w32_mask(SYSCTL_SYS1, 0, 1 << (GPPC_OFFSET + 1), SYS1_INFRAC);
+	udelay(1);
+}
+
+static inline void clkdev_add_sys(const char *dev, unsigned int module,
+					unsigned int bits)
+{
+	struct clk *clk = kzalloc(sizeof(struct clk), GFP_KERNEL);
+
+	clk->cl.dev_id = dev;
+	clk->cl.con_id = NULL;
+	clk->cl.clk = clk;
+	clk->module = module;
+	clk->activate = sysctl_activate;
+	clk->deactivate = sysctl_deactivate;
+	clk->enable = sysctl_clken;
+	clk->disable = sysctl_clkdis;
+	clk->reboot = sysctl_reboot;
+	clkdev_add(&clk->cl);
+}
+
+void __init ltq_soc_init(void)
+{
+	struct device_node *np_status =
+		of_find_compatible_node(NULL, NULL, "lantiq,status-falcon");
+	struct device_node *np_ebu =
+		of_find_compatible_node(NULL, NULL, "lantiq,ebu-falcon");
+	struct device_node *np_sys1 =
+		of_find_compatible_node(NULL, NULL, "lantiq,sys1-falcon");
+	struct device_node *np_syseth =
+		of_find_compatible_node(NULL, NULL, "lantiq,syseth-falcon");
+	struct device_node *np_sysgpe =
+		of_find_compatible_node(NULL, NULL, "lantiq,sysgpe-falcon");
+	struct resource res_status, res_ebu, res_sys[3];
+	int i;
+
+	/* check if all the core register ranges are available */
+	if (!np_status || !np_ebu || !np_sys1 || !np_syseth || !np_sysgpe)
+		panic("Failed to load core nodes from devicetree");
+
+	if (of_address_to_resource(np_status, 0, &res_status) ||
+			of_address_to_resource(np_ebu, 0, &res_ebu) ||
+			of_address_to_resource(np_sys1, 0, &res_sys[0]) ||
+			of_address_to_resource(np_syseth, 0, &res_sys[1]) ||
+			of_address_to_resource(np_sysgpe, 0, &res_sys[2]))
+		panic("Failed to get core resources");
+
+	if ((request_mem_region(res_status.start, resource_size(&res_status),
+				res_status.name) < 0) ||
+		(request_mem_region(res_ebu.start, resource_size(&res_ebu),
+				res_ebu.name) < 0) ||
+		(request_mem_region(res_sys[0].start,
+				resource_size(&res_sys[0]),
+				res_sys[0].name) < 0) ||
+		(request_mem_region(res_sys[1].start,
+				resource_size(&res_sys[1]),
+				res_sys[1].name) < 0) ||
+		(request_mem_region(res_sys[2].start,
+				resource_size(&res_sys[2]),
+				res_sys[2].name) < 0))
+		pr_err("Failed to request core reources");
+
+	status_membase = ioremap_nocache(res_status.start,
+					resource_size(&res_status));
+	ltq_ebu_membase = ioremap_nocache(res_ebu.start,
+					resource_size(&res_ebu));
+
+	if (!status_membase || !ltq_ebu_membase)
+		panic("Failed to remap core resources");
+
+	for (i = 0; i < 3; i++) {
+		sysctl_membase[i] = ioremap_nocache(res_sys[i].start,
+						resource_size(&res_sys[i]));
+		if (!sysctl_membase[i])
+			panic("Failed to remap sysctrl resources");
+	}
+	ltq_sys1_membase = sysctl_membase[0];
+
+	falcon_gpe_enable();
+
+	/* get our 3 static rates for cpu, fpi and io clocks */
+	if (ltq_sys1_r32(SYS1_CPU0CC) & CPU0CC_CPUDIV)
+		clkdev_add_static(CLOCK_200M, CLOCK_100M, CLOCK_200M);
+	else
+		clkdev_add_static(CLOCK_400M, CLOCK_100M, CLOCK_200M);
+
+	/* add our clock domains */
+	clkdev_add_sys("1d810000.gpio", SYSCTL_SYSETH, ACTS_P0);
+	clkdev_add_sys("1d810100.gpio", SYSCTL_SYSETH, ACTS_P2);
+	clkdev_add_sys("1e800100.gpio", SYSCTL_SYS1, ACTS_P1);
+	clkdev_add_sys("1e800200.gpio", SYSCTL_SYS1, ACTS_P3);
+	clkdev_add_sys("1e800300.gpio", SYSCTL_SYS1, ACTS_P4);
+	clkdev_add_sys("1db01000.pad", SYSCTL_SYSETH, ACTS_PADCTRL0);
+	clkdev_add_sys("1db02000.pad", SYSCTL_SYSETH, ACTS_PADCTRL2);
+	clkdev_add_sys("1e800400.pad", SYSCTL_SYS1, ACTS_PADCTRL1);
+	clkdev_add_sys("1e800500.pad", SYSCTL_SYS1, ACTS_PADCTRL3);
+	clkdev_add_sys("1e800600.pad", SYSCTL_SYS1, ACTS_PADCTRL4);
+	clkdev_add_sys("1e100C00.serial", SYSCTL_SYS1, ACTS_ASC1_ACT);
+	clkdev_add_sys("1e200000.i2c", SYSCTL_SYS1, ACTS_I2C_ACT);
+}
diff --git a/arch/mips/lantiq/irq.c b/arch/mips/lantiq/irq.c
index d673731c538a..57c1a4e51408 100644
--- a/arch/mips/lantiq/irq.c
+++ b/arch/mips/lantiq/irq.c
@@ -9,6 +9,11 @@
 
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/irqdomain.h>
+#include <linux/of_platform.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
 
 #include <asm/bootinfo.h>
 #include <asm/irq_cpu.h>
@@ -16,7 +21,7 @@
 #include <lantiq_soc.h>
 #include <irq.h>
 
-/* register definitions */
+/* register definitions - internal irqs */
 #define LTQ_ICU_IM0_ISR		0x0000
 #define LTQ_ICU_IM0_IER		0x0008
 #define LTQ_ICU_IM0_IOSR	0x0010
@@ -25,6 +30,7 @@
 #define LTQ_ICU_IM1_ISR		0x0028
 #define LTQ_ICU_OFFSET		(LTQ_ICU_IM1_ISR - LTQ_ICU_IM0_ISR)
 
+/* register definitions - external irqs */
 #define LTQ_EIU_EXIN_C		0x0000
 #define LTQ_EIU_EXIN_INIC	0x0004
 #define LTQ_EIU_EXIN_INEN	0x000C
@@ -37,10 +43,14 @@
 #define LTQ_EIU_IR4		(INT_NUM_IM1_IRL0 + 1)
 #define LTQ_EIU_IR5		(INT_NUM_IM1_IRL0 + 2)
 #define LTQ_EIU_IR6		(INT_NUM_IM2_IRL0 + 30)
-
+#define XWAY_EXIN_COUNT		3
 #define MAX_EIU			6
 
-/* irqs generated by device attached to the EBU need to be acked in
+/* the performance counter */
+#define LTQ_PERF_IRQ		(INT_NUM_IM4_IRL0 + 31)
+
+/*
+ * irqs generated by devices attached to the EBU need to be acked in
  * a special manner
  */
 #define LTQ_ICU_EBU_IRQ		22
@@ -51,6 +61,17 @@
 #define ltq_eiu_w32(x, y)	ltq_w32((x), ltq_eiu_membase + (y))
 #define ltq_eiu_r32(x)		ltq_r32(ltq_eiu_membase + (x))
 
+/* our 2 ipi interrupts for VSMP */
+#define MIPS_CPU_IPI_RESCHED_IRQ	0
+#define MIPS_CPU_IPI_CALL_IRQ		1
+
+/* we have a cascade of 8 irqs */
+#define MIPS_CPU_IRQ_CASCADE		8
+
+#if defined(CONFIG_MIPS_MT_SMP) || defined(CONFIG_MIPS_MT_SMTC)
+int gic_present;
+#endif
+
 static unsigned short ltq_eiu_irq[MAX_EIU] = {
 	LTQ_EIU_IR0,
 	LTQ_EIU_IR1,
@@ -60,64 +81,51 @@ static unsigned short ltq_eiu_irq[MAX_EIU] = {
 	LTQ_EIU_IR5,
 };
 
-static struct resource ltq_icu_resource = {
-	.name	= "icu",
-	.start	= LTQ_ICU_BASE_ADDR,
-	.end	= LTQ_ICU_BASE_ADDR + LTQ_ICU_SIZE - 1,
-	.flags	= IORESOURCE_MEM,
-};
-
-static struct resource ltq_eiu_resource = {
-	.name	= "eiu",
-	.start	= LTQ_EIU_BASE_ADDR,
-	.end	= LTQ_EIU_BASE_ADDR + LTQ_ICU_SIZE - 1,
-	.flags	= IORESOURCE_MEM,
-};
-
+static int exin_avail;
 static void __iomem *ltq_icu_membase;
 static void __iomem *ltq_eiu_membase;
 
 void ltq_disable_irq(struct irq_data *d)
 {
 	u32 ier = LTQ_ICU_IM0_IER;
-	int irq_nr = d->irq - INT_NUM_IRQ0;
+	int offset = d->hwirq - MIPS_CPU_IRQ_CASCADE;
 
-	ier += LTQ_ICU_OFFSET * (irq_nr / INT_NUM_IM_OFFSET);
-	irq_nr %= INT_NUM_IM_OFFSET;
-	ltq_icu_w32(ltq_icu_r32(ier) & ~(1 << irq_nr), ier);
+	ier += LTQ_ICU_OFFSET * (offset / INT_NUM_IM_OFFSET);
+	offset %= INT_NUM_IM_OFFSET;
+	ltq_icu_w32(ltq_icu_r32(ier) & ~BIT(offset), ier);
 }
 
 void ltq_mask_and_ack_irq(struct irq_data *d)
 {
 	u32 ier = LTQ_ICU_IM0_IER;
 	u32 isr = LTQ_ICU_IM0_ISR;
-	int irq_nr = d->irq - INT_NUM_IRQ0;
+	int offset = d->hwirq - MIPS_CPU_IRQ_CASCADE;
 
-	ier += LTQ_ICU_OFFSET * (irq_nr / INT_NUM_IM_OFFSET);
-	isr += LTQ_ICU_OFFSET * (irq_nr / INT_NUM_IM_OFFSET);
-	irq_nr %= INT_NUM_IM_OFFSET;
-	ltq_icu_w32(ltq_icu_r32(ier) & ~(1 << irq_nr), ier);
-	ltq_icu_w32((1 << irq_nr), isr);
+	ier += LTQ_ICU_OFFSET * (offset / INT_NUM_IM_OFFSET);
+	isr += LTQ_ICU_OFFSET * (offset / INT_NUM_IM_OFFSET);
+	offset %= INT_NUM_IM_OFFSET;
+	ltq_icu_w32(ltq_icu_r32(ier) & ~BIT(offset), ier);
+	ltq_icu_w32(BIT(offset), isr);
 }
 
 static void ltq_ack_irq(struct irq_data *d)
 {
 	u32 isr = LTQ_ICU_IM0_ISR;
-	int irq_nr = d->irq - INT_NUM_IRQ0;
+	int offset = d->hwirq - MIPS_CPU_IRQ_CASCADE;
 
-	isr += LTQ_ICU_OFFSET * (irq_nr / INT_NUM_IM_OFFSET);
-	irq_nr %= INT_NUM_IM_OFFSET;
-	ltq_icu_w32((1 << irq_nr), isr);
+	isr += LTQ_ICU_OFFSET * (offset / INT_NUM_IM_OFFSET);
+	offset %= INT_NUM_IM_OFFSET;
+	ltq_icu_w32(BIT(offset), isr);
 }
 
 void ltq_enable_irq(struct irq_data *d)
 {
 	u32 ier = LTQ_ICU_IM0_IER;
-	int irq_nr = d->irq - INT_NUM_IRQ0;
+	int offset = d->hwirq - MIPS_CPU_IRQ_CASCADE;
 
-	ier += LTQ_ICU_OFFSET  * (irq_nr / INT_NUM_IM_OFFSET);
-	irq_nr %= INT_NUM_IM_OFFSET;
-	ltq_icu_w32(ltq_icu_r32(ier) | (1 << irq_nr), ier);
+	ier += LTQ_ICU_OFFSET  * (offset / INT_NUM_IM_OFFSET);
+	offset %= INT_NUM_IM_OFFSET;
+	ltq_icu_w32(ltq_icu_r32(ier) | BIT(offset), ier);
 }
 
 static unsigned int ltq_startup_eiu_irq(struct irq_data *d)
@@ -126,15 +134,15 @@ static unsigned int ltq_startup_eiu_irq(struct irq_data *d)
 
 	ltq_enable_irq(d);
 	for (i = 0; i < MAX_EIU; i++) {
-		if (d->irq == ltq_eiu_irq[i]) {
+		if (d->hwirq == ltq_eiu_irq[i]) {
 			/* low level - we should really handle set_type */
 			ltq_eiu_w32(ltq_eiu_r32(LTQ_EIU_EXIN_C) |
 				(0x6 << (i * 4)), LTQ_EIU_EXIN_C);
 			/* clear all pending */
-			ltq_eiu_w32(ltq_eiu_r32(LTQ_EIU_EXIN_INIC) & ~(1 << i),
+			ltq_eiu_w32(ltq_eiu_r32(LTQ_EIU_EXIN_INIC) & ~BIT(i),
 				LTQ_EIU_EXIN_INIC);
 			/* enable */
-			ltq_eiu_w32(ltq_eiu_r32(LTQ_EIU_EXIN_INEN) | (1 << i),
+			ltq_eiu_w32(ltq_eiu_r32(LTQ_EIU_EXIN_INEN) | BIT(i),
 				LTQ_EIU_EXIN_INEN);
 			break;
 		}
@@ -149,9 +157,9 @@ static void ltq_shutdown_eiu_irq(struct irq_data *d)
 
 	ltq_disable_irq(d);
 	for (i = 0; i < MAX_EIU; i++) {
-		if (d->irq == ltq_eiu_irq[i]) {
+		if (d->hwirq == ltq_eiu_irq[i]) {
 			/* disable */
-			ltq_eiu_w32(ltq_eiu_r32(LTQ_EIU_EXIN_INEN) & ~(1 << i),
+			ltq_eiu_w32(ltq_eiu_r32(LTQ_EIU_EXIN_INEN) & ~BIT(i),
 				LTQ_EIU_EXIN_INEN);
 			break;
 		}
@@ -188,14 +196,15 @@ static void ltq_hw_irqdispatch(int module)
 	if (irq == 0)
 		return;
 
-	/* silicon bug causes only the msb set to 1 to be valid. all
+	/*
+	 * silicon bug causes only the msb set to 1 to be valid. all
 	 * other bits might be bogus
 	 */
 	irq = __fls(irq);
-	do_IRQ((int)irq + INT_NUM_IM0_IRL0 + (INT_NUM_IM_OFFSET * module));
+	do_IRQ((int)irq + MIPS_CPU_IRQ_CASCADE + (INT_NUM_IM_OFFSET * module));
 
 	/* if this is a EBU irq, we need to ack it or get a deadlock */
-	if ((irq == LTQ_ICU_EBU_IRQ) && (module == 0))
+	if ((irq == LTQ_ICU_EBU_IRQ) && (module == 0) && LTQ_EBU_PCC_ISTAT)
 		ltq_ebu_w32(ltq_ebu_r32(LTQ_EBU_PCC_ISTAT) | 0x10,
 			LTQ_EBU_PCC_ISTAT);
 }
@@ -216,6 +225,47 @@ static void ltq_hw5_irqdispatch(void)
 	do_IRQ(MIPS_CPU_TIMER_IRQ);
 }
 
+#ifdef CONFIG_MIPS_MT_SMP
+void __init arch_init_ipiirq(int irq, struct irqaction *action)
+{
+	setup_irq(irq, action);
+	irq_set_handler(irq, handle_percpu_irq);
+}
+
+static void ltq_sw0_irqdispatch(void)
+{
+	do_IRQ(MIPS_CPU_IRQ_BASE + MIPS_CPU_IPI_RESCHED_IRQ);
+}
+
+static void ltq_sw1_irqdispatch(void)
+{
+	do_IRQ(MIPS_CPU_IRQ_BASE + MIPS_CPU_IPI_CALL_IRQ);
+}
+static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id)
+{
+	scheduler_ipi();
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t ipi_call_interrupt(int irq, void *dev_id)
+{
+	smp_call_function_interrupt();
+	return IRQ_HANDLED;
+}
+
+static struct irqaction irq_resched = {
+	.handler	= ipi_resched_interrupt,
+	.flags		= IRQF_PERCPU,
+	.name		= "IPI_resched"
+};
+
+static struct irqaction irq_call = {
+	.handler	= ipi_call_interrupt,
+	.flags		= IRQF_PERCPU,
+	.name		= "IPI_call"
+};
+#endif
+
 asmlinkage void plat_irq_dispatch(void)
 {
 	unsigned int pending = read_c0_status() & read_c0_cause() & ST0_IM;
@@ -238,45 +288,75 @@ out:
 	return;
 }
 
+static int icu_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw)
+{
+	struct irq_chip *chip = &ltq_irq_type;
+	int i;
+
+	for (i = 0; i < exin_avail; i++)
+		if (hw == ltq_eiu_irq[i])
+			chip = &ltq_eiu_type;
+
+	irq_set_chip_and_handler(hw, chip, handle_level_irq);
+
+	return 0;
+}
+
+static const struct irq_domain_ops irq_domain_ops = {
+	.xlate = irq_domain_xlate_onetwocell,
+	.map = icu_map,
+};
+
 static struct irqaction cascade = {
 	.handler = no_action,
 	.name = "cascade",
 };
 
-void __init arch_init_irq(void)
+int __init icu_of_init(struct device_node *node, struct device_node *parent)
 {
+	struct device_node *eiu_node;
+	struct resource res;
 	int i;
 
-	if (insert_resource(&iomem_resource, &ltq_icu_resource) < 0)
-		panic("Failed to insert icu memory");
+	if (of_address_to_resource(node, 0, &res))
+		panic("Failed to get icu memory range");
 
-	if (request_mem_region(ltq_icu_resource.start,
-			resource_size(&ltq_icu_resource), "icu") < 0)
-		panic("Failed to request icu memory");
+	if (request_mem_region(res.start, resource_size(&res), res.name) < 0)
+		pr_err("Failed to request icu memory");
 
-	ltq_icu_membase = ioremap_nocache(ltq_icu_resource.start,
-				resource_size(&ltq_icu_resource));
+	ltq_icu_membase = ioremap_nocache(res.start, resource_size(&res));
 	if (!ltq_icu_membase)
 		panic("Failed to remap icu memory");
 
-	if (insert_resource(&iomem_resource, &ltq_eiu_resource) < 0)
-		panic("Failed to insert eiu memory");
-
-	if (request_mem_region(ltq_eiu_resource.start,
-			resource_size(&ltq_eiu_resource), "eiu") < 0)
-		panic("Failed to request eiu memory");
-
-	ltq_eiu_membase = ioremap_nocache(ltq_eiu_resource.start,
-				resource_size(&ltq_eiu_resource));
-	if (!ltq_eiu_membase)
-		panic("Failed to remap eiu memory");
+	/* the external interrupts are optional and xway only */
+	eiu_node = of_find_compatible_node(NULL, NULL, "lantiq,eiu");
+	if (eiu_node && of_address_to_resource(eiu_node, 0, &res)) {
+		/* find out how many external irq sources we have */
+		const __be32 *count = of_get_property(node,
+							"lantiq,count",	NULL);
+
+		if (count)
+			exin_avail = *count;
+		if (exin_avail > MAX_EIU)
+			exin_avail = MAX_EIU;
+
+		if (request_mem_region(res.start, resource_size(&res),
+							res.name) < 0)
+			pr_err("Failed to request eiu memory");
+
+		ltq_eiu_membase = ioremap_nocache(res.start,
+							resource_size(&res));
+		if (!ltq_eiu_membase)
+			panic("Failed to remap eiu memory");
+	}
 
-	/* make sure all irqs are turned off by default */
-	for (i = 0; i < 5; i++)
+	/* turn off all irqs by default */
+	for (i = 0; i < 5; i++) {
+		/* make sure all irqs are turned off by default */
 		ltq_icu_w32(0, LTQ_ICU_IM0_IER + (i * LTQ_ICU_OFFSET));
-
-	/* clear all possibly pending interrupts */
-	ltq_icu_w32(~0, LTQ_ICU_IM0_ISR + (i * LTQ_ICU_OFFSET));
+		/* clear all possibly pending interrupts */
+		ltq_icu_w32(~0, LTQ_ICU_IM0_ISR + (i * LTQ_ICU_OFFSET));
+	}
 
 	mips_cpu_irq_init();
 
@@ -293,20 +373,19 @@ void __init arch_init_irq(void)
 		set_vi_handler(7, ltq_hw5_irqdispatch);
 	}
 
-	for (i = INT_NUM_IRQ0;
-		i <= (INT_NUM_IRQ0 + (5 * INT_NUM_IM_OFFSET)); i++)
-		if ((i == LTQ_EIU_IR0) || (i == LTQ_EIU_IR1) ||
-			(i == LTQ_EIU_IR2))
-			irq_set_chip_and_handler(i, &ltq_eiu_type,
-				handle_level_irq);
-		/* EIU3-5 only exist on ar9 and vr9 */
-		else if (((i == LTQ_EIU_IR3) || (i == LTQ_EIU_IR4) ||
-			(i == LTQ_EIU_IR5)) && (ltq_is_ar9() || ltq_is_vr9()))
-			irq_set_chip_and_handler(i, &ltq_eiu_type,
-				handle_level_irq);
-		else
-			irq_set_chip_and_handler(i, &ltq_irq_type,
-				handle_level_irq);
+	irq_domain_add_linear(node, 6 * INT_NUM_IM_OFFSET,
+		&irq_domain_ops, 0);
+
+#if defined(CONFIG_MIPS_MT_SMP)
+	if (cpu_has_vint) {
+		pr_info("Setting up IPI vectored interrupts\n");
+		set_vi_handler(MIPS_CPU_IPI_RESCHED_IRQ, ltq_sw0_irqdispatch);
+		set_vi_handler(MIPS_CPU_IPI_CALL_IRQ, ltq_sw1_irqdispatch);
+	}
+	arch_init_ipiirq(MIPS_CPU_IRQ_BASE + MIPS_CPU_IPI_RESCHED_IRQ,
+		&irq_resched);
+	arch_init_ipiirq(MIPS_CPU_IRQ_BASE + MIPS_CPU_IPI_CALL_IRQ, &irq_call);
+#endif
 
 #if !defined(CONFIG_MIPS_MT_SMP) && !defined(CONFIG_MIPS_MT_SMTC)
 	set_c0_status(IE_IRQ0 | IE_IRQ1 | IE_IRQ2 |
@@ -315,9 +394,23 @@ void __init arch_init_irq(void)
 	set_c0_status(IE_SW0 | IE_SW1 | IE_IRQ0 | IE_IRQ1 |
 		IE_IRQ2 | IE_IRQ3 | IE_IRQ4 | IE_IRQ5);
 #endif
+
+	/* tell oprofile which irq to use */
+	cp0_perfcount_irq = LTQ_PERF_IRQ;
+	return 0;
 }
 
 unsigned int __cpuinit get_c0_compare_int(void)
 {
 	return CP0_LEGACY_COMPARE_IRQ;
 }
+
+static struct of_device_id __initdata of_irq_ids[] = {
+	{ .compatible = "lantiq,icu", .data = icu_of_init },
+	{},
+};
+
+void __init arch_init_irq(void)
+{
+	of_irq_init(of_irq_ids);
+}
diff --git a/arch/mips/lantiq/machtypes.h b/arch/mips/lantiq/machtypes.h
deleted file mode 100644
index 7e01b8c484eb..000000000000
--- a/arch/mips/lantiq/machtypes.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#ifndef _LANTIQ_MACH_H__
-#define _LANTIQ_MACH_H__
-
-#include <asm/mips_machine.h>
-
-enum lantiq_mach_type {
-	LTQ_MACH_GENERIC = 0,
-	LTQ_MACH_EASY50712,	/* Danube evaluation board */
-	LTQ_MACH_EASY50601,	/* Amazon SE evaluation board */
-};
-
-#endif
diff --git a/arch/mips/lantiq/prom.c b/arch/mips/lantiq/prom.c
index e34fcfd0d5ca..d185e8477fdf 100644
--- a/arch/mips/lantiq/prom.c
+++ b/arch/mips/lantiq/prom.c
@@ -8,6 +8,7 @@
 
 #include <linux/export.h>
 #include <linux/clk.h>
+#include <linux/of_platform.h>
 #include <asm/bootinfo.h>
 #include <asm/time.h>
 
@@ -16,19 +17,15 @@
 #include "prom.h"
 #include "clk.h"
 
-static struct ltq_soc_info soc_info;
-
-unsigned int ltq_get_cpu_ver(void)
-{
-	return soc_info.rev;
-}
-EXPORT_SYMBOL(ltq_get_cpu_ver);
+/* access to the ebu needs to be locked between different drivers */
+DEFINE_SPINLOCK(ebu_lock);
+EXPORT_SYMBOL_GPL(ebu_lock);
 
-unsigned int ltq_get_soc_type(void)
-{
-	return soc_info.type;
-}
-EXPORT_SYMBOL(ltq_get_soc_type);
+/*
+ * this struct is filled by the soc specific detection code and holds
+ * information about the specific soc type, revision and name
+ */
+static struct ltq_soc_info soc_info;
 
 const char *get_system_type(void)
 {
@@ -45,27 +42,62 @@ static void __init prom_init_cmdline(void)
 	char **argv = (char **) KSEG1ADDR(fw_arg1);
 	int i;
 
+	arcs_cmdline[0] = '\0';
+
 	for (i = 0; i < argc; i++) {
-		char *p = (char *)  KSEG1ADDR(argv[i]);
+		char *p = (char *) KSEG1ADDR(argv[i]);
 
-		if (p && *p) {
+		if (CPHYSADDR(p) && *p) {
 			strlcat(arcs_cmdline, p, sizeof(arcs_cmdline));
 			strlcat(arcs_cmdline, " ", sizeof(arcs_cmdline));
 		}
 	}
 }
 
-void __init prom_init(void)
+void __init plat_mem_setup(void)
 {
-	struct clk *clk;
+	ioport_resource.start = IOPORT_RESOURCE_START;
+	ioport_resource.end = IOPORT_RESOURCE_END;
+	iomem_resource.start = IOMEM_RESOURCE_START;
+	iomem_resource.end = IOMEM_RESOURCE_END;
+
+	set_io_port_base((unsigned long) KSEG1);
 
+	/*
+	 * Load the builtin devicetree. This causes the chosen node to be
+	 * parsed resulting in our memory appearing
+	 */
+	__dt_setup_arch(&__dtb_start);
+}
+
+void __init prom_init(void)
+{
+	/* call the soc specific detetcion code and get it to fill soc_info */
 	ltq_soc_detect(&soc_info);
-	clk_init();
-	clk = clk_get(0, "cpu");
-	snprintf(soc_info.sys_type, LTQ_SYS_TYPE_LEN - 1, "%s rev1.%d",
-		soc_info.name, soc_info.rev);
-	clk_put(clk);
+	snprintf(soc_info.sys_type, LTQ_SYS_TYPE_LEN - 1, "%s rev %s",
+		soc_info.name, soc_info.rev_type);
 	soc_info.sys_type[LTQ_SYS_TYPE_LEN - 1] = '\0';
 	pr_info("SoC: %s\n", soc_info.sys_type);
 	prom_init_cmdline();
+
+#if defined(CONFIG_MIPS_MT_SMP)
+	if (register_vsmp_smp_ops())
+		panic("failed to register_vsmp_smp_ops()");
+#endif
 }
+
+int __init plat_of_setup(void)
+{
+	static struct of_device_id of_ids[3];
+
+	if (!of_have_populated_dt())
+		panic("device tree not present");
+
+	strncpy(of_ids[0].compatible, soc_info.compatible,
+		sizeof(of_ids[0].compatible));
+	strncpy(of_ids[1].compatible, "simple-bus",
+		sizeof(of_ids[1].compatible));
+	return of_platform_bus_probe(NULL, of_ids, NULL);
+}
+
+arch_initcall(plat_of_setup);
diff --git a/arch/mips/lantiq/prom.h b/arch/mips/lantiq/prom.h
index b4229d94280f..a3fa1a2bfaae 100644
--- a/arch/mips/lantiq/prom.h
+++ b/arch/mips/lantiq/prom.h
@@ -10,16 +10,22 @@
 #define _LTQ_PROM_H__
 
 #define LTQ_SYS_TYPE_LEN	0x100
+#define LTQ_SYS_REV_LEN         0x10
 
 struct ltq_soc_info {
 	unsigned char *name;
 	unsigned int rev;
+	unsigned char rev_type[LTQ_SYS_REV_LEN];
+	unsigned int srev;
 	unsigned int partnum;
 	unsigned int type;
 	unsigned char sys_type[LTQ_SYS_TYPE_LEN];
+	unsigned char *compatible;
 };
 
 extern void ltq_soc_detect(struct ltq_soc_info *i);
-extern void ltq_soc_setup(void);
+extern void ltq_soc_init(void);
+
+extern struct boot_param_header __dtb_start;
 
 #endif
diff --git a/arch/mips/lantiq/setup.c b/arch/mips/lantiq/setup.c
deleted file mode 100644
index 1ff6c9d6cb93..000000000000
--- a/arch/mips/lantiq/setup.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- * Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/io.h>
-#include <linux/ioport.h>
-#include <asm/bootinfo.h>
-
-#include <lantiq_soc.h>
-
-#include "machtypes.h"
-#include "devices.h"
-#include "prom.h"
-
-void __init plat_mem_setup(void)
-{
-	/* assume 16M as default incase uboot fails to pass proper ramsize */
-	unsigned long memsize = 16;
-	char **envp = (char **) KSEG1ADDR(fw_arg2);
-
-	ioport_resource.start = IOPORT_RESOURCE_START;
-	ioport_resource.end = IOPORT_RESOURCE_END;
-	iomem_resource.start = IOMEM_RESOURCE_START;
-	iomem_resource.end = IOMEM_RESOURCE_END;
-
-	set_io_port_base((unsigned long) KSEG1);
-
-	while (*envp) {
-		char *e = (char *)KSEG1ADDR(*envp);
-		if (!strncmp(e, "memsize=", 8)) {
-			e += 8;
-			if (strict_strtoul(e, 0, &memsize))
-				pr_warn("bad memsize specified\n");
-		}
-		envp++;
-	}
-	memsize *= 1024 * 1024;
-	add_memory_region(0x00000000, memsize, BOOT_MEM_RAM);
-}
-
-static int __init
-lantiq_setup(void)
-{
-	ltq_soc_setup();
-	mips_machine_setup();
-	return 0;
-}
-
-arch_initcall(lantiq_setup);
-
-static void __init
-lantiq_generic_init(void)
-{
-	/* Nothing to do */
-}
-
-MIPS_MACHINE(LTQ_MACH_GENERIC,
-	     "Generic",
-	     "Generic Lantiq based board",
-	     lantiq_generic_init);
diff --git a/arch/mips/lantiq/xway/Kconfig b/arch/mips/lantiq/xway/Kconfig
deleted file mode 100644
index 2b857de36620..000000000000
--- a/arch/mips/lantiq/xway/Kconfig
+++ /dev/null
@@ -1,23 +0,0 @@
-if SOC_XWAY
-
-menu "MIPS Machine"
-
-config LANTIQ_MACH_EASY50712
-	bool "Easy50712 - Danube"
-	default y
-
-endmenu
-
-endif
-
-if SOC_AMAZON_SE
-
-menu "MIPS Machine"
-
-config LANTIQ_MACH_EASY50601
-	bool "Easy50601 - Amazon SE"
-	default y
-
-endmenu
-
-endif
diff --git a/arch/mips/lantiq/xway/Makefile b/arch/mips/lantiq/xway/Makefile
index c517f2e77563..dc3194f6ee42 100644
--- a/arch/mips/lantiq/xway/Makefile
+++ b/arch/mips/lantiq/xway/Makefile
@@ -1,7 +1 @@
-obj-y := pmu.o ebu.o reset.o gpio.o gpio_stp.o gpio_ebu.o devices.o dma.o
-
-obj-$(CONFIG_SOC_XWAY) += clk-xway.o prom-xway.o setup-xway.o
-obj-$(CONFIG_SOC_AMAZON_SE) += clk-ase.o prom-ase.o setup-ase.o
-
-obj-$(CONFIG_LANTIQ_MACH_EASY50712) += mach-easy50712.o
-obj-$(CONFIG_LANTIQ_MACH_EASY50601) += mach-easy50601.o
+obj-y := prom.o sysctrl.o clk.o reset.o gpio.o dma.o
diff --git a/arch/mips/lantiq/xway/clk-ase.c b/arch/mips/lantiq/xway/clk-ase.c
deleted file mode 100644
index 652258309c9c..000000000000
--- a/arch/mips/lantiq/xway/clk-ase.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2011 John Crispin <[email protected]>
- */
-
-#include <linux/io.h>
-#include <linux/export.h>
-#include <linux/init.h>
-#include <linux/clk.h>
-
-#include <asm/time.h>
-#include <asm/irq.h>
-#include <asm/div64.h>
-
-#include <lantiq_soc.h>
-
-/* cgu registers */
-#define LTQ_CGU_SYS	0x0010
-
-unsigned int ltq_get_io_region_clock(void)
-{
-	return CLOCK_133M;
-}
-EXPORT_SYMBOL(ltq_get_io_region_clock);
-
-unsigned int ltq_get_fpi_bus_clock(int fpi)
-{
-	return CLOCK_133M;
-}
-EXPORT_SYMBOL(ltq_get_fpi_bus_clock);
-
-unsigned int ltq_get_cpu_hz(void)
-{
-	if (ltq_cgu_r32(LTQ_CGU_SYS) & (1 << 5))
-		return CLOCK_266M;
-	else
-		return CLOCK_133M;
-}
-EXPORT_SYMBOL(ltq_get_cpu_hz);
-
-unsigned int ltq_get_fpi_hz(void)
-{
-	return CLOCK_133M;
-}
-EXPORT_SYMBOL(ltq_get_fpi_hz);
diff --git a/arch/mips/lantiq/xway/clk-xway.c b/arch/mips/lantiq/xway/clk-xway.c
deleted file mode 100644
index 696b1a3e0642..000000000000
--- a/arch/mips/lantiq/xway/clk-xway.c
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#include <linux/io.h>
-#include <linux/export.h>
-#include <linux/init.h>
-#include <linux/clk.h>
-
-#include <asm/time.h>
-#include <asm/irq.h>
-#include <asm/div64.h>
-
-#include <lantiq_soc.h>
-
-static unsigned int ltq_ram_clocks[] = {
-	CLOCK_167M, CLOCK_133M, CLOCK_111M, CLOCK_83M };
-#define DDR_HZ ltq_ram_clocks[ltq_cgu_r32(LTQ_CGU_SYS) & 0x3]
-
-#define BASIC_FREQUENCY_1	35328000
-#define BASIC_FREQUENCY_2	36000000
-#define BASIS_REQUENCY_USB	12000000
-
-#define GET_BITS(x, msb, lsb) \
-	(((x) & ((1 << ((msb) + 1)) - 1)) >> (lsb))
-
-#define LTQ_CGU_PLL0_CFG	0x0004
-#define LTQ_CGU_PLL1_CFG	0x0008
-#define LTQ_CGU_PLL2_CFG	0x000C
-#define LTQ_CGU_SYS		0x0010
-#define LTQ_CGU_UPDATE		0x0014
-#define LTQ_CGU_IF_CLK		0x0018
-#define LTQ_CGU_OSC_CON		0x001C
-#define LTQ_CGU_SMD		0x0020
-#define LTQ_CGU_CT1SR		0x0028
-#define LTQ_CGU_CT2SR		0x002C
-#define LTQ_CGU_PCMCR		0x0030
-#define LTQ_CGU_PCI_CR		0x0034
-#define LTQ_CGU_PD_PC		0x0038
-#define LTQ_CGU_FMR		0x003C
-
-#define CGU_PLL0_PHASE_DIVIDER_ENABLE	\
-	(ltq_cgu_r32(LTQ_CGU_PLL0_CFG) & (1 << 31))
-#define CGU_PLL0_BYPASS			\
-	(ltq_cgu_r32(LTQ_CGU_PLL0_CFG) & (1 << 30))
-#define CGU_PLL0_CFG_DSMSEL		\
-	(ltq_cgu_r32(LTQ_CGU_PLL0_CFG) & (1 << 28))
-#define CGU_PLL0_CFG_FRAC_EN		\
-	(ltq_cgu_r32(LTQ_CGU_PLL0_CFG) & (1 << 27))
-#define CGU_PLL1_SRC			\
-	(ltq_cgu_r32(LTQ_CGU_PLL1_CFG) & (1 << 31))
-#define CGU_PLL2_PHASE_DIVIDER_ENABLE	\
-	(ltq_cgu_r32(LTQ_CGU_PLL2_CFG) & (1 << 20))
-#define CGU_SYS_FPI_SEL			(1 << 6)
-#define CGU_SYS_DDR_SEL			0x3
-#define CGU_PLL0_SRC			(1 << 29)
-
-#define CGU_PLL0_CFG_PLLK	GET_BITS(ltq_cgu_r32(LTQ_CGU_PLL0_CFG), 26, 17)
-#define CGU_PLL0_CFG_PLLN	GET_BITS(ltq_cgu_r32(LTQ_CGU_PLL0_CFG), 12, 6)
-#define CGU_PLL0_CFG_PLLM	GET_BITS(ltq_cgu_r32(LTQ_CGU_PLL0_CFG), 5, 2)
-#define CGU_PLL2_SRC		GET_BITS(ltq_cgu_r32(LTQ_CGU_PLL2_CFG), 18, 17)
-#define CGU_PLL2_CFG_INPUT_DIV	GET_BITS(ltq_cgu_r32(LTQ_CGU_PLL2_CFG), 16, 13)
-
-static unsigned int ltq_get_pll0_fdiv(void);
-
-static inline unsigned int get_input_clock(int pll)
-{
-	switch (pll) {
-	case 0:
-		if (ltq_cgu_r32(LTQ_CGU_PLL0_CFG) & CGU_PLL0_SRC)
-			return BASIS_REQUENCY_USB;
-		else if (CGU_PLL0_PHASE_DIVIDER_ENABLE)
-			return BASIC_FREQUENCY_1;
-		else
-			return BASIC_FREQUENCY_2;
-	case 1:
-		if (CGU_PLL1_SRC)
-			return BASIS_REQUENCY_USB;
-		else if (CGU_PLL0_PHASE_DIVIDER_ENABLE)
-			return BASIC_FREQUENCY_1;
-		else
-			return BASIC_FREQUENCY_2;
-	case 2:
-		switch (CGU_PLL2_SRC) {
-		case 0:
-			return ltq_get_pll0_fdiv();
-		case 1:
-			return CGU_PLL2_PHASE_DIVIDER_ENABLE ?
-				BASIC_FREQUENCY_1 :
-				BASIC_FREQUENCY_2;
-		case 2:
-			return BASIS_REQUENCY_USB;
-		}
-	default:
-		return 0;
-	}
-}
-
-static inline unsigned int cal_dsm(int pll, unsigned int num, unsigned int den)
-{
-	u64 res, clock = get_input_clock(pll);
-
-	res = num * clock;
-	do_div(res, den);
-	return res;
-}
-
-static inline unsigned int mash_dsm(int pll, unsigned int M, unsigned int N,
-	unsigned int K)
-{
-	unsigned int num = ((N + 1) << 10) + K;
-	unsigned int den = (M + 1) << 10;
-
-	return cal_dsm(pll, num, den);
-}
-
-static inline unsigned int ssff_dsm_1(int pll, unsigned int M, unsigned int N,
-	unsigned int K)
-{
-	unsigned int num = ((N + 1) << 11) + K + 512;
-	unsigned int den = (M + 1) << 11;
-
-	return cal_dsm(pll, num, den);
-}
-
-static inline unsigned int ssff_dsm_2(int pll, unsigned int M, unsigned int N,
-	unsigned int K)
-{
-	unsigned int num = K >= 512 ?
-		((N + 1) << 12) + K - 512 : ((N + 1) << 12) + K + 3584;
-	unsigned int den = (M + 1) << 12;
-
-	return cal_dsm(pll, num, den);
-}
-
-static inline unsigned int dsm(int pll, unsigned int M, unsigned int N,
-	unsigned int K, unsigned int dsmsel, unsigned int phase_div_en)
-{
-	if (!dsmsel)
-		return mash_dsm(pll, M, N, K);
-	else if (!phase_div_en)
-		return mash_dsm(pll, M, N, K);
-	else
-		return ssff_dsm_2(pll, M, N, K);
-}
-
-static inline unsigned int ltq_get_pll0_fosc(void)
-{
-	if (CGU_PLL0_BYPASS)
-		return get_input_clock(0);
-	else
-		return !CGU_PLL0_CFG_FRAC_EN
-			? dsm(0, CGU_PLL0_CFG_PLLM, CGU_PLL0_CFG_PLLN, 0,
-				CGU_PLL0_CFG_DSMSEL,
-				CGU_PLL0_PHASE_DIVIDER_ENABLE)
-			: dsm(0, CGU_PLL0_CFG_PLLM, CGU_PLL0_CFG_PLLN,
-				CGU_PLL0_CFG_PLLK, CGU_PLL0_CFG_DSMSEL,
-				CGU_PLL0_PHASE_DIVIDER_ENABLE);
-}
-
-static unsigned int ltq_get_pll0_fdiv(void)
-{
-	unsigned int div = CGU_PLL2_CFG_INPUT_DIV + 1;
-
-	return (ltq_get_pll0_fosc() + (div >> 1)) / div;
-}
-
-unsigned int ltq_get_io_region_clock(void)
-{
-	unsigned int ret = ltq_get_pll0_fosc();
-
-	switch (ltq_cgu_r32(LTQ_CGU_PLL2_CFG) & CGU_SYS_DDR_SEL) {
-	default:
-	case 0:
-		return (ret + 1) / 2;
-	case 1:
-		return (ret * 2 + 2) / 5;
-	case 2:
-		return (ret + 1) / 3;
-	case 3:
-		return (ret + 2) / 4;
-	}
-}
-EXPORT_SYMBOL(ltq_get_io_region_clock);
-
-unsigned int ltq_get_fpi_bus_clock(int fpi)
-{
-	unsigned int ret = ltq_get_io_region_clock();
-
-	if ((fpi == 2) && (ltq_cgu_r32(LTQ_CGU_SYS) & CGU_SYS_FPI_SEL))
-		ret >>= 1;
-	return ret;
-}
-EXPORT_SYMBOL(ltq_get_fpi_bus_clock);
-
-unsigned int ltq_get_cpu_hz(void)
-{
-	switch (ltq_cgu_r32(LTQ_CGU_SYS) & 0xc) {
-	case 0:
-		return CLOCK_333M;
-	case 4:
-		return DDR_HZ;
-	case 8:
-		return DDR_HZ << 1;
-	default:
-		return DDR_HZ >> 1;
-	}
-}
-EXPORT_SYMBOL(ltq_get_cpu_hz);
-
-unsigned int ltq_get_fpi_hz(void)
-{
-	unsigned int ddr_clock = DDR_HZ;
-
-	if (ltq_cgu_r32(LTQ_CGU_SYS) & 0x40)
-		return ddr_clock >> 1;
-	return ddr_clock;
-}
-EXPORT_SYMBOL(ltq_get_fpi_hz);
diff --git a/arch/mips/lantiq/xway/clk.c b/arch/mips/lantiq/xway/clk.c
new file mode 100644
index 000000000000..9aa17f79a742
--- /dev/null
+++ b/arch/mips/lantiq/xway/clk.c
@@ -0,0 +1,151 @@
+/*
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ *
+ *  Copyright (C) 2010 John Crispin <[email protected]>
+ */
+
+#include <linux/io.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/clk.h>
+
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/div64.h>
+
+#include <lantiq_soc.h>
+
+#include "../clk.h"
+
+static unsigned int ram_clocks[] = {
+	CLOCK_167M, CLOCK_133M, CLOCK_111M, CLOCK_83M };
+#define DDR_HZ ram_clocks[ltq_cgu_r32(CGU_SYS) & 0x3]
+
+/* legacy xway clock */
+#define CGU_SYS			0x10
+
+/* vr9 clock */
+#define CGU_SYS_VR9		0x0c
+#define CGU_IF_CLK_VR9		0x24
+
+unsigned long ltq_danube_fpi_hz(void)
+{
+	unsigned long ddr_clock = DDR_HZ;
+
+	if (ltq_cgu_r32(CGU_SYS) & 0x40)
+		return ddr_clock >> 1;
+	return ddr_clock;
+}
+
+unsigned long ltq_danube_cpu_hz(void)
+{
+	switch (ltq_cgu_r32(CGU_SYS) & 0xc) {
+	case 0:
+		return CLOCK_333M;
+	case 4:
+		return DDR_HZ;
+	case 8:
+		return DDR_HZ << 1;
+	default:
+		return DDR_HZ >> 1;
+	}
+}
+
+unsigned long ltq_ar9_sys_hz(void)
+{
+	if (((ltq_cgu_r32(CGU_SYS) >> 3) & 0x3) == 0x2)
+		return CLOCK_393M;
+	return CLOCK_333M;
+}
+
+unsigned long ltq_ar9_fpi_hz(void)
+{
+	unsigned long sys = ltq_ar9_sys_hz();
+
+	if (ltq_cgu_r32(CGU_SYS) & BIT(0))
+		return sys;
+	return sys >> 1;
+}
+
+unsigned long ltq_ar9_cpu_hz(void)
+{
+	if (ltq_cgu_r32(CGU_SYS) & BIT(2))
+		return ltq_ar9_fpi_hz();
+	else
+		return ltq_ar9_sys_hz();
+}
+
+unsigned long ltq_vr9_cpu_hz(void)
+{
+	unsigned int cpu_sel;
+	unsigned long clk;
+
+	cpu_sel = (ltq_cgu_r32(CGU_SYS_VR9) >> 4) & 0xf;
+
+	switch (cpu_sel) {
+	case 0:
+		clk = CLOCK_600M;
+		break;
+	case 1:
+		clk = CLOCK_500M;
+		break;
+	case 2:
+		clk = CLOCK_393M;
+		break;
+	case 3:
+		clk = CLOCK_333M;
+		break;
+	case 5:
+	case 6:
+		clk = CLOCK_196_608M;
+		break;
+	case 7:
+		clk = CLOCK_167M;
+		break;
+	case 4:
+	case 8:
+	case 9:
+		clk = CLOCK_125M;
+		break;
+	default:
+		clk = 0;
+		break;
+	}
+
+	return clk;
+}
+
+unsigned long ltq_vr9_fpi_hz(void)
+{
+	unsigned int ocp_sel, cpu_clk;
+	unsigned long clk;
+
+	cpu_clk = ltq_vr9_cpu_hz();
+	ocp_sel = ltq_cgu_r32(CGU_SYS_VR9) & 0x3;
+
+	switch (ocp_sel) {
+	case 0:
+		/* OCP ratio 1 */
+		clk = cpu_clk;
+		break;
+	case 2:
+		/* OCP ratio 2 */
+		clk = cpu_clk / 2;
+		break;
+	case 3:
+		/* OCP ratio 2.5 */
+		clk = (cpu_clk * 2) / 5;
+		break;
+	case 4:
+		/* OCP ratio 3 */
+		clk = cpu_clk / 3;
+		break;
+	default:
+		clk = 0;
+		break;
+	}
+
+	return clk;
+}
diff --git a/arch/mips/lantiq/xway/devices.c b/arch/mips/lantiq/xway/devices.c
deleted file mode 100644
index d614aa7ff07f..000000000000
--- a/arch/mips/lantiq/xway/devices.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/mtd/physmap.h>
-#include <linux/kernel.h>
-#include <linux/reboot.h>
-#include <linux/platform_device.h>
-#include <linux/leds.h>
-#include <linux/etherdevice.h>
-#include <linux/time.h>
-#include <linux/io.h>
-#include <linux/gpio.h>
-
-#include <asm/bootinfo.h>
-#include <asm/irq.h>
-
-#include <lantiq_soc.h>
-#include <lantiq_irq.h>
-#include <lantiq_platform.h>
-
-#include "devices.h"
-
-/* gpio */
-static struct resource ltq_gpio_resource[] = {
-	{
-		.name	= "gpio0",
-		.start  = LTQ_GPIO0_BASE_ADDR,
-		.end    = LTQ_GPIO0_BASE_ADDR + LTQ_GPIO_SIZE - 1,
-		.flags  = IORESOURCE_MEM,
-	}, {
-		.name	= "gpio1",
-		.start  = LTQ_GPIO1_BASE_ADDR,
-		.end    = LTQ_GPIO1_BASE_ADDR + LTQ_GPIO_SIZE - 1,
-		.flags  = IORESOURCE_MEM,
-	}, {
-		.name	= "gpio2",
-		.start  = LTQ_GPIO2_BASE_ADDR,
-		.end    = LTQ_GPIO2_BASE_ADDR + LTQ_GPIO_SIZE - 1,
-		.flags  = IORESOURCE_MEM,
-	}
-};
-
-void __init ltq_register_gpio(void)
-{
-	platform_device_register_simple("ltq_gpio", 0,
-		&ltq_gpio_resource[0], 1);
-	platform_device_register_simple("ltq_gpio", 1,
-		&ltq_gpio_resource[1], 1);
-
-	/* AR9 and VR9 have an extra gpio block */
-	if (ltq_is_ar9() || ltq_is_vr9()) {
-		platform_device_register_simple("ltq_gpio", 2,
-			&ltq_gpio_resource[2], 1);
-	}
-}
-
-/* serial to parallel conversion */
-static struct resource ltq_stp_resource = {
-	.name   = "stp",
-	.start  = LTQ_STP_BASE_ADDR,
-	.end    = LTQ_STP_BASE_ADDR + LTQ_STP_SIZE - 1,
-	.flags  = IORESOURCE_MEM,
-};
-
-void __init ltq_register_gpio_stp(void)
-{
-	platform_device_register_simple("ltq_stp", 0, &ltq_stp_resource, 1);
-}
-
-/* asc ports - amazon se has its own serial mapping */
-static struct resource ltq_ase_asc_resources[] = {
-	{
-		.name	= "asc0",
-		.start  = LTQ_ASC1_BASE_ADDR,
-		.end    = LTQ_ASC1_BASE_ADDR + LTQ_ASC_SIZE - 1,
-		.flags  = IORESOURCE_MEM,
-	},
-	IRQ_RES(tx, LTQ_ASC_ASE_TIR),
-	IRQ_RES(rx, LTQ_ASC_ASE_RIR),
-	IRQ_RES(err, LTQ_ASC_ASE_EIR),
-};
-
-void __init ltq_register_ase_asc(void)
-{
-	platform_device_register_simple("ltq_asc", 0,
-		ltq_ase_asc_resources, ARRAY_SIZE(ltq_ase_asc_resources));
-}
-
-/* ethernet */
-static struct resource ltq_etop_resources = {
-	.name	= "etop",
-	.start	= LTQ_ETOP_BASE_ADDR,
-	.end	= LTQ_ETOP_BASE_ADDR + LTQ_ETOP_SIZE - 1,
-	.flags	= IORESOURCE_MEM,
-};
-
-static struct platform_device ltq_etop = {
-	.name		= "ltq_etop",
-	.resource	= &ltq_etop_resources,
-	.num_resources	= 1,
-};
-
-void __init
-ltq_register_etop(struct ltq_eth_data *eth)
-{
-	if (eth) {
-		ltq_etop.dev.platform_data = eth;
-		platform_device_register(&ltq_etop);
-	}
-}
diff --git a/arch/mips/lantiq/xway/devices.h b/arch/mips/lantiq/xway/devices.h
deleted file mode 100644
index e90493471bc1..000000000000
--- a/arch/mips/lantiq/xway/devices.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#ifndef _LTQ_DEVICES_XWAY_H__
-#define _LTQ_DEVICES_XWAY_H__
-
-#include "../devices.h"
-#include <linux/phy.h>
-
-extern void ltq_register_gpio(void);
-extern void ltq_register_gpio_stp(void);
-extern void ltq_register_ase_asc(void);
-extern void ltq_register_etop(struct ltq_eth_data *eth);
-
-#endif
diff --git a/arch/mips/lantiq/xway/dma.c b/arch/mips/lantiq/xway/dma.c
index b210e936c7c3..55d2c4fa4714 100644
--- a/arch/mips/lantiq/xway/dma.c
+++ b/arch/mips/lantiq/xway/dma.c
@@ -19,7 +19,8 @@
 #include <linux/platform_device.h>
 #include <linux/io.h>
 #include <linux/dma-mapping.h>
-#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/clk.h>
 
 #include <lantiq_soc.h>
 #include <xway_dma.h>
@@ -55,13 +56,6 @@
 #define ltq_dma_w32_mask(x, y, z)	ltq_w32_mask(x, y, \
 						ltq_dma_membase + (z))
 
-static struct resource ltq_dma_resource = {
-	.name	= "dma",
-	.start	= LTQ_DMA_BASE_ADDR,
-	.end	= LTQ_DMA_BASE_ADDR + LTQ_DMA_SIZE - 1,
-	.flags  = IORESOURCE_MEM,
-};
-
 static void __iomem *ltq_dma_membase;
 
 void
@@ -215,27 +209,28 @@ ltq_dma_init_port(int p)
 }
 EXPORT_SYMBOL_GPL(ltq_dma_init_port);
 
-int __init
-ltq_dma_init(void)
+static int __devinit
+ltq_dma_init(struct platform_device *pdev)
 {
+	struct clk *clk;
+	struct resource *res;
 	int i;
 
-	/* insert and request the memory region */
-	if (insert_resource(&iomem_resource, &ltq_dma_resource) < 0)
-		panic("Failed to insert dma memory");
-
-	if (request_mem_region(ltq_dma_resource.start,
-			resource_size(&ltq_dma_resource), "dma") < 0)
-		panic("Failed to request dma memory");
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		panic("Failed to get dma resource");
 
 	/* remap dma register range */
-	ltq_dma_membase = ioremap_nocache(ltq_dma_resource.start,
-				resource_size(&ltq_dma_resource));
+	ltq_dma_membase = devm_request_and_ioremap(&pdev->dev, res);
 	if (!ltq_dma_membase)
-		panic("Failed to remap dma memory");
+		panic("Failed to remap dma resource");
 
 	/* power up and reset the dma engine */
-	ltq_pmu_enable(PMU_DMA);
+	clk = clk_get(&pdev->dev, NULL);
+	if (IS_ERR(clk))
+		panic("Failed to get dma clock");
+
+	clk_enable(clk);
 	ltq_dma_w32_mask(0, DMA_RESET, LTQ_DMA_CTRL);
 
 	/* disable all interrupts */
@@ -248,7 +243,29 @@ ltq_dma_init(void)
 		ltq_dma_w32(DMA_POLL | DMA_CLK_DIV4, LTQ_DMA_CPOLL);
 		ltq_dma_w32_mask(DMA_CHAN_ON, 0, LTQ_DMA_CCTRL);
 	}
+	dev_info(&pdev->dev, "init done\n");
 	return 0;
 }
 
-postcore_initcall(ltq_dma_init);
+static const struct of_device_id dma_match[] = {
+	{ .compatible = "lantiq,dma-xway" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, dma_match);
+
+static struct platform_driver dma_driver = {
+	.probe = ltq_dma_init,
+	.driver = {
+		.name = "dma-xway",
+		.owner = THIS_MODULE,
+		.of_match_table = dma_match,
+	},
+};
+
+int __init
+dma_init(void)
+{
+	return platform_driver_register(&dma_driver);
+}
+
+postcore_initcall(dma_init);
diff --git a/arch/mips/lantiq/xway/ebu.c b/arch/mips/lantiq/xway/ebu.c
deleted file mode 100644
index 862e3e830680..000000000000
--- a/arch/mips/lantiq/xway/ebu.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  EBU - the external bus unit attaches PCI, NOR and NAND
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/ioport.h>
-
-#include <lantiq_soc.h>
-
-/* all access to the ebu must be locked */
-DEFINE_SPINLOCK(ebu_lock);
-EXPORT_SYMBOL_GPL(ebu_lock);
-
-static struct resource ltq_ebu_resource = {
-	.name	= "ebu",
-	.start	= LTQ_EBU_BASE_ADDR,
-	.end	= LTQ_EBU_BASE_ADDR + LTQ_EBU_SIZE - 1,
-	.flags	= IORESOURCE_MEM,
-};
-
-/* remapped base addr of the clock unit and external bus unit */
-void __iomem *ltq_ebu_membase;
-
-static int __init lantiq_ebu_init(void)
-{
-	/* insert and request the memory region */
-	if (insert_resource(&iomem_resource, &ltq_ebu_resource) < 0)
-		panic("Failed to insert ebu memory");
-
-	if (request_mem_region(ltq_ebu_resource.start,
-			resource_size(&ltq_ebu_resource), "ebu") < 0)
-		panic("Failed to request ebu memory");
-
-	/* remap ebu register range */
-	ltq_ebu_membase = ioremap_nocache(ltq_ebu_resource.start,
-				resource_size(&ltq_ebu_resource));
-	if (!ltq_ebu_membase)
-		panic("Failed to remap ebu memory");
-
-	/* make sure to unprotect the memory region where flash is located */
-	ltq_ebu_w32(ltq_ebu_r32(LTQ_EBU_BUSCON0) & ~EBU_WRDIS, LTQ_EBU_BUSCON0);
-	return 0;
-}
-
-postcore_initcall(lantiq_ebu_init);
diff --git a/arch/mips/lantiq/xway/gpio.c b/arch/mips/lantiq/xway/gpio.c
index c429a5bc080f..2ab39e93d9be 100644
--- a/arch/mips/lantiq/xway/gpio.c
+++ b/arch/mips/lantiq/xway/gpio.c
@@ -36,18 +36,6 @@ struct ltq_gpio {
 
 static struct ltq_gpio ltq_gpio_port[MAX_PORTS];
 
-int gpio_to_irq(unsigned int gpio)
-{
-	return -EINVAL;
-}
-EXPORT_SYMBOL(gpio_to_irq);
-
-int irq_to_gpio(unsigned int gpio)
-{
-	return -EINVAL;
-}
-EXPORT_SYMBOL(irq_to_gpio);
-
 int ltq_gpio_request(unsigned int pin, unsigned int alt0,
 	unsigned int alt1, unsigned int dir, const char *name)
 {
diff --git a/arch/mips/lantiq/xway/gpio_ebu.c b/arch/mips/lantiq/xway/gpio_ebu.c
deleted file mode 100644
index aae17170472f..000000000000
--- a/arch/mips/lantiq/xway/gpio_ebu.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/platform_device.h>
-#include <linux/mutex.h>
-#include <linux/gpio.h>
-#include <linux/io.h>
-
-#include <lantiq_soc.h>
-
-/*
- * By attaching hardware latches to the EBU it is possible to create output
- * only gpios. This driver configures a special memory address, which when
- * written to outputs 16 bit to the latches.
- */
-
-#define LTQ_EBU_BUSCON	0x1e7ff		/* 16 bit access, slowest timing */
-#define LTQ_EBU_WP	0x80000000	/* write protect bit */
-
-/* we keep a shadow value of the last value written to the ebu */
-static int ltq_ebu_gpio_shadow = 0x0;
-static void __iomem *ltq_ebu_gpio_membase;
-
-static void ltq_ebu_apply(void)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ebu_lock, flags);
-	ltq_ebu_w32(LTQ_EBU_BUSCON, LTQ_EBU_BUSCON1);
-	*((__u16 *)ltq_ebu_gpio_membase) = ltq_ebu_gpio_shadow;
-	ltq_ebu_w32(LTQ_EBU_BUSCON | LTQ_EBU_WP, LTQ_EBU_BUSCON1);
-	spin_unlock_irqrestore(&ebu_lock, flags);
-}
-
-static void ltq_ebu_set(struct gpio_chip *chip, unsigned offset, int value)
-{
-	if (value)
-		ltq_ebu_gpio_shadow |= (1 << offset);
-	else
-		ltq_ebu_gpio_shadow &= ~(1 << offset);
-	ltq_ebu_apply();
-}
-
-static int ltq_ebu_direction_output(struct gpio_chip *chip, unsigned offset,
-	int value)
-{
-	ltq_ebu_set(chip, offset, value);
-
-	return 0;
-}
-
-static struct gpio_chip ltq_ebu_chip = {
-	.label = "ltq_ebu",
-	.direction_output = ltq_ebu_direction_output,
-	.set = ltq_ebu_set,
-	.base = 72,
-	.ngpio = 16,
-	.can_sleep = 1,
-	.owner = THIS_MODULE,
-};
-
-static int ltq_ebu_probe(struct platform_device *pdev)
-{
-	int ret = 0;
-	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-
-	if (!res) {
-		dev_err(&pdev->dev, "failed to get memory resource\n");
-		return -ENOENT;
-	}
-
-	res = devm_request_mem_region(&pdev->dev, res->start,
-		resource_size(res), dev_name(&pdev->dev));
-	if (!res) {
-		dev_err(&pdev->dev, "failed to request memory resource\n");
-		return -EBUSY;
-	}
-
-	ltq_ebu_gpio_membase = devm_ioremap_nocache(&pdev->dev, res->start,
-		resource_size(res));
-	if (!ltq_ebu_gpio_membase) {
-		dev_err(&pdev->dev, "Failed to ioremap mem region\n");
-		return -ENOMEM;
-	}
-
-	/* grab the default shadow value passed form the platform code */
-	ltq_ebu_gpio_shadow = (unsigned int) pdev->dev.platform_data;
-
-	/* tell the ebu controller which memory address we will be using */
-	ltq_ebu_w32(pdev->resource->start | 0x1, LTQ_EBU_ADDRSEL1);
-
-	/* write protect the region */
-	ltq_ebu_w32(LTQ_EBU_BUSCON | LTQ_EBU_WP, LTQ_EBU_BUSCON1);
-
-	ret = gpiochip_add(&ltq_ebu_chip);
-	if (!ret)
-		ltq_ebu_apply();
-	return ret;
-}
-
-static struct platform_driver ltq_ebu_driver = {
-	.probe = ltq_ebu_probe,
-	.driver = {
-		.name = "ltq_ebu",
-		.owner = THIS_MODULE,
-	},
-};
-
-static int __init ltq_ebu_init(void)
-{
-	int ret = platform_driver_register(&ltq_ebu_driver);
-
-	if (ret)
-		pr_info("ltq_ebu : Error registering platform driver!");
-	return ret;
-}
-
-postcore_initcall(ltq_ebu_init);
diff --git a/arch/mips/lantiq/xway/gpio_stp.c b/arch/mips/lantiq/xway/gpio_stp.c
deleted file mode 100644
index fd07d87adaa9..000000000000
--- a/arch/mips/lantiq/xway/gpio_stp.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2007 John Crispin <[email protected]>
- *
- */
-
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/platform_device.h>
-#include <linux/mutex.h>
-#include <linux/io.h>
-#include <linux/gpio.h>
-
-#include <lantiq_soc.h>
-
-#define LTQ_STP_CON0		0x00
-#define LTQ_STP_CON1		0x04
-#define LTQ_STP_CPU0		0x08
-#define LTQ_STP_CPU1		0x0C
-#define LTQ_STP_AR		0x10
-
-#define LTQ_STP_CON_SWU		(1 << 31)
-#define LTQ_STP_2HZ		0
-#define LTQ_STP_4HZ		(1 << 23)
-#define LTQ_STP_8HZ		(2 << 23)
-#define LTQ_STP_10HZ		(3 << 23)
-#define LTQ_STP_SPEED_MASK	(0xf << 23)
-#define LTQ_STP_UPD_FPI		(1 << 31)
-#define LTQ_STP_UPD_MASK	(3 << 30)
-#define LTQ_STP_ADSL_SRC	(3 << 24)
-
-#define LTQ_STP_GROUP0		(1 << 0)
-
-#define LTQ_STP_RISING		0
-#define LTQ_STP_FALLING		(1 << 26)
-#define LTQ_STP_EDGE_MASK	(1 << 26)
-
-#define ltq_stp_r32(reg)	__raw_readl(ltq_stp_membase + reg)
-#define ltq_stp_w32(val, reg)	__raw_writel(val, ltq_stp_membase + reg)
-#define ltq_stp_w32_mask(clear, set, reg) \
-		ltq_w32((ltq_r32(ltq_stp_membase + reg) & ~(clear)) | (set), \
-		ltq_stp_membase + (reg))
-
-static int ltq_stp_shadow = 0xffff;
-static void __iomem *ltq_stp_membase;
-
-static void ltq_stp_set(struct gpio_chip *chip, unsigned offset, int value)
-{
-	if (value)
-		ltq_stp_shadow |= (1 << offset);
-	else
-		ltq_stp_shadow &= ~(1 << offset);
-	ltq_stp_w32(ltq_stp_shadow, LTQ_STP_CPU0);
-}
-
-static int ltq_stp_direction_output(struct gpio_chip *chip, unsigned offset,
-	int value)
-{
-	ltq_stp_set(chip, offset, value);
-
-	return 0;
-}
-
-static struct gpio_chip ltq_stp_chip = {
-	.label = "ltq_stp",
-	.direction_output = ltq_stp_direction_output,
-	.set = ltq_stp_set,
-	.base = 48,
-	.ngpio = 24,
-	.can_sleep = 1,
-	.owner = THIS_MODULE,
-};
-
-static int ltq_stp_hw_init(void)
-{
-	/* the 3 pins used to control the external stp */
-	ltq_gpio_request(4, 1, 0, 1, "stp-st");
-	ltq_gpio_request(5, 1, 0, 1, "stp-d");
-	ltq_gpio_request(6, 1, 0, 1, "stp-sh");
-
-	/* sane defaults */
-	ltq_stp_w32(0, LTQ_STP_AR);
-	ltq_stp_w32(0, LTQ_STP_CPU0);
-	ltq_stp_w32(0, LTQ_STP_CPU1);
-	ltq_stp_w32(LTQ_STP_CON_SWU, LTQ_STP_CON0);
-	ltq_stp_w32(0, LTQ_STP_CON1);
-
-	/* rising or falling edge */
-	ltq_stp_w32_mask(LTQ_STP_EDGE_MASK, LTQ_STP_FALLING, LTQ_STP_CON0);
-
-	/* per default stp 15-0 are set */
-	ltq_stp_w32_mask(0, LTQ_STP_GROUP0, LTQ_STP_CON1);
-
-	/* stp are update periodically by the FPI bus */
-	ltq_stp_w32_mask(LTQ_STP_UPD_MASK, LTQ_STP_UPD_FPI, LTQ_STP_CON1);
-
-	/* set stp update speed */
-	ltq_stp_w32_mask(LTQ_STP_SPEED_MASK, LTQ_STP_8HZ, LTQ_STP_CON1);
-
-	/* tell the hardware that pin (led) 0 and 1 are controlled
-	 *  by the dsl arc
-	 */
-	ltq_stp_w32_mask(0, LTQ_STP_ADSL_SRC, LTQ_STP_CON0);
-
-	ltq_pmu_enable(PMU_LED);
-	return 0;
-}
-
-static int __devinit ltq_stp_probe(struct platform_device *pdev)
-{
-	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	int ret = 0;
-
-	if (!res)
-		return -ENOENT;
-	res = devm_request_mem_region(&pdev->dev, res->start,
-		resource_size(res), dev_name(&pdev->dev));
-	if (!res) {
-		dev_err(&pdev->dev, "failed to request STP memory\n");
-		return -EBUSY;
-	}
-	ltq_stp_membase = devm_ioremap_nocache(&pdev->dev, res->start,
-		resource_size(res));
-	if (!ltq_stp_membase) {
-		dev_err(&pdev->dev, "failed to remap STP memory\n");
-		return -ENOMEM;
-	}
-	ret = gpiochip_add(&ltq_stp_chip);
-	if (!ret)
-		ret = ltq_stp_hw_init();
-
-	return ret;
-}
-
-static struct platform_driver ltq_stp_driver = {
-	.probe = ltq_stp_probe,
-	.driver = {
-		.name = "ltq_stp",
-		.owner = THIS_MODULE,
-	},
-};
-
-int __init ltq_stp_init(void)
-{
-	int ret = platform_driver_register(&ltq_stp_driver);
-
-	if (ret)
-		pr_info("ltq_stp: error registering platform driver");
-	return ret;
-}
-
-postcore_initcall(ltq_stp_init);
diff --git a/arch/mips/lantiq/xway/mach-easy50601.c b/arch/mips/lantiq/xway/mach-easy50601.c
deleted file mode 100644
index d5aaf637ab19..000000000000
--- a/arch/mips/lantiq/xway/mach-easy50601.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/partitions.h>
-#include <linux/mtd/physmap.h>
-#include <linux/input.h>
-
-#include <lantiq.h>
-
-#include "../machtypes.h"
-#include "devices.h"
-
-static struct mtd_partition easy50601_partitions[] = {
-	{
-		.name	= "uboot",
-		.offset	= 0x0,
-		.size	= 0x10000,
-	},
-	{
-		.name	= "uboot_env",
-		.offset	= 0x10000,
-		.size	= 0x10000,
-	},
-	{
-		.name	= "linux",
-		.offset	= 0x20000,
-		.size	= 0xE0000,
-	},
-	{
-		.name	= "rootfs",
-		.offset	= 0x100000,
-		.size	= 0x300000,
-	},
-};
-
-static struct physmap_flash_data easy50601_flash_data = {
-	.nr_parts	= ARRAY_SIZE(easy50601_partitions),
-	.parts		= easy50601_partitions,
-};
-
-static void __init easy50601_init(void)
-{
-	ltq_register_nor(&easy50601_flash_data);
-}
-
-MIPS_MACHINE(LTQ_MACH_EASY50601,
-			"EASY50601",
-			"EASY50601 Eval Board",
-			easy50601_init);
diff --git a/arch/mips/lantiq/xway/mach-easy50712.c b/arch/mips/lantiq/xway/mach-easy50712.c
deleted file mode 100644
index ea5027b3239d..000000000000
--- a/arch/mips/lantiq/xway/mach-easy50712.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/partitions.h>
-#include <linux/mtd/physmap.h>
-#include <linux/input.h>
-#include <linux/phy.h>
-
-#include <lantiq_soc.h>
-#include <irq.h>
-
-#include "../machtypes.h"
-#include "devices.h"
-
-static struct mtd_partition easy50712_partitions[] = {
-	{
-		.name	= "uboot",
-		.offset	= 0x0,
-		.size	= 0x10000,
-	},
-	{
-		.name	= "uboot_env",
-		.offset	= 0x10000,
-		.size	= 0x10000,
-	},
-	{
-		.name	= "linux",
-		.offset	= 0x20000,
-		.size	= 0xe0000,
-	},
-	{
-		.name	= "rootfs",
-		.offset	= 0x100000,
-		.size	= 0x300000,
-	},
-};
-
-static struct physmap_flash_data easy50712_flash_data = {
-	.nr_parts	= ARRAY_SIZE(easy50712_partitions),
-	.parts		= easy50712_partitions,
-};
-
-static struct ltq_pci_data ltq_pci_data = {
-	.clock	= PCI_CLOCK_INT,
-	.gpio	= PCI_GNT1 | PCI_REQ1,
-	.irq	= {
-		[14] = INT_NUM_IM0_IRL0 + 22,
-	},
-};
-
-static struct ltq_eth_data ltq_eth_data = {
-	.mii_mode = PHY_INTERFACE_MODE_MII,
-};
-
-static void __init easy50712_init(void)
-{
-	ltq_register_gpio_stp();
-	ltq_register_nor(&easy50712_flash_data);
-	ltq_register_pci(&ltq_pci_data);
-	ltq_register_etop(&ltq_eth_data);
-}
-
-MIPS_MACHINE(LTQ_MACH_EASY50712,
-	     "EASY50712",
-	     "EASY50712 Eval Board",
-	      easy50712_init);
diff --git a/arch/mips/lantiq/xway/pmu.c b/arch/mips/lantiq/xway/pmu.c
deleted file mode 100644
index fe85361e032e..000000000000
--- a/arch/mips/lantiq/xway/pmu.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/ioport.h>
-
-#include <lantiq_soc.h>
-
-/* PMU - the power management unit allows us to turn part of the core
- * on and off
- */
-
-/* the enable / disable registers */
-#define LTQ_PMU_PWDCR	0x1C
-#define LTQ_PMU_PWDSR	0x20
-
-#define ltq_pmu_w32(x, y)	ltq_w32((x), ltq_pmu_membase + (y))
-#define ltq_pmu_r32(x)		ltq_r32(ltq_pmu_membase + (x))
-
-static struct resource ltq_pmu_resource = {
-	.name	= "pmu",
-	.start	= LTQ_PMU_BASE_ADDR,
-	.end	= LTQ_PMU_BASE_ADDR + LTQ_PMU_SIZE - 1,
-	.flags	= IORESOURCE_MEM,
-};
-
-static void __iomem *ltq_pmu_membase;
-
-void ltq_pmu_enable(unsigned int module)
-{
-	int err = 1000000;
-
-	ltq_pmu_w32(ltq_pmu_r32(LTQ_PMU_PWDCR) & ~module, LTQ_PMU_PWDCR);
-	do {} while (--err && (ltq_pmu_r32(LTQ_PMU_PWDSR) & module));
-
-	if (!err)
-		panic("activating PMU module failed!");
-}
-EXPORT_SYMBOL(ltq_pmu_enable);
-
-void ltq_pmu_disable(unsigned int module)
-{
-	ltq_pmu_w32(ltq_pmu_r32(LTQ_PMU_PWDCR) | module, LTQ_PMU_PWDCR);
-}
-EXPORT_SYMBOL(ltq_pmu_disable);
-
-int __init ltq_pmu_init(void)
-{
-	if (insert_resource(&iomem_resource, &ltq_pmu_resource) < 0)
-		panic("Failed to insert pmu memory");
-
-	if (request_mem_region(ltq_pmu_resource.start,
-			resource_size(&ltq_pmu_resource), "pmu") < 0)
-		panic("Failed to request pmu memory");
-
-	ltq_pmu_membase = ioremap_nocache(ltq_pmu_resource.start,
-				resource_size(&ltq_pmu_resource));
-	if (!ltq_pmu_membase)
-		panic("Failed to remap pmu memory");
-	return 0;
-}
-
-core_initcall(ltq_pmu_init);
diff --git a/arch/mips/lantiq/xway/prom-ase.c b/arch/mips/lantiq/xway/prom-ase.c
deleted file mode 100644
index ae4959ae865c..000000000000
--- a/arch/mips/lantiq/xway/prom-ase.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#include <linux/export.h>
-#include <linux/clk.h>
-#include <asm/bootinfo.h>
-#include <asm/time.h>
-
-#include <lantiq_soc.h>
-
-#include "../prom.h"
-
-#define SOC_AMAZON_SE	"Amazon_SE"
-
-#define PART_SHIFT	12
-#define PART_MASK	0x0FFFFFFF
-#define REV_SHIFT	28
-#define REV_MASK	0xF0000000
-
-void __init ltq_soc_detect(struct ltq_soc_info *i)
-{
-	i->partnum = (ltq_r32(LTQ_MPS_CHIPID) & PART_MASK) >> PART_SHIFT;
-	i->rev = (ltq_r32(LTQ_MPS_CHIPID) & REV_MASK) >> REV_SHIFT;
-	switch (i->partnum) {
-	case SOC_ID_AMAZON_SE:
-		i->name = SOC_AMAZON_SE;
-		i->type = SOC_TYPE_AMAZON_SE;
-		break;
-
-	default:
-		unreachable();
-		break;
-	}
-}
diff --git a/arch/mips/lantiq/xway/prom-xway.c b/arch/mips/lantiq/xway/prom-xway.c
deleted file mode 100644
index 2228133ca356..000000000000
--- a/arch/mips/lantiq/xway/prom-xway.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2010 John Crispin <[email protected]>
- */
-
-#include <linux/export.h>
-#include <linux/clk.h>
-#include <asm/bootinfo.h>
-#include <asm/time.h>
-
-#include <lantiq_soc.h>
-
-#include "../prom.h"
-
-#define SOC_DANUBE	"Danube"
-#define SOC_TWINPASS	"Twinpass"
-#define SOC_AR9		"AR9"
-
-#define PART_SHIFT	12
-#define PART_MASK	0x0FFFFFFF
-#define REV_SHIFT	28
-#define REV_MASK	0xF0000000
-
-void __init ltq_soc_detect(struct ltq_soc_info *i)
-{
-	i->partnum = (ltq_r32(LTQ_MPS_CHIPID) & PART_MASK) >> PART_SHIFT;
-	i->rev = (ltq_r32(LTQ_MPS_CHIPID) & REV_MASK) >> REV_SHIFT;
-	switch (i->partnum) {
-	case SOC_ID_DANUBE1:
-	case SOC_ID_DANUBE2:
-		i->name = SOC_DANUBE;
-		i->type = SOC_TYPE_DANUBE;
-		break;
-
-	case SOC_ID_TWINPASS:
-		i->name = SOC_TWINPASS;
-		i->type = SOC_TYPE_DANUBE;
-		break;
-
-	case SOC_ID_ARX188:
-	case SOC_ID_ARX168:
-	case SOC_ID_ARX182:
-		i->name = SOC_AR9;
-		i->type = SOC_TYPE_AR9;
-		break;
-
-	default:
-		unreachable();
-		break;
-	}
-}
diff --git a/arch/mips/lantiq/xway/prom.c b/arch/mips/lantiq/xway/prom.c
new file mode 100644
index 000000000000..248429ab2622
--- /dev/null
+++ b/arch/mips/lantiq/xway/prom.c
@@ -0,0 +1,115 @@
+/*
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ *
+ *  Copyright (C) 2010 John Crispin <[email protected]>
+ */
+
+#include <linux/export.h>
+#include <linux/clk.h>
+#include <asm/bootinfo.h>
+#include <asm/time.h>
+
+#include <lantiq_soc.h>
+
+#include "../prom.h"
+
+#define SOC_DANUBE	"Danube"
+#define SOC_TWINPASS	"Twinpass"
+#define SOC_AMAZON_SE	"Amazon_SE"
+#define SOC_AR9		"AR9"
+#define SOC_GR9		"GR9"
+#define SOC_VR9		"VR9"
+
+#define COMP_DANUBE	"lantiq,danube"
+#define COMP_TWINPASS	"lantiq,twinpass"
+#define COMP_AMAZON_SE	"lantiq,ase"
+#define COMP_AR9	"lantiq,ar9"
+#define COMP_GR9	"lantiq,gr9"
+#define COMP_VR9	"lantiq,vr9"
+
+#define PART_SHIFT	12
+#define PART_MASK	0x0FFFFFFF
+#define REV_SHIFT	28
+#define REV_MASK	0xF0000000
+
+void __init ltq_soc_detect(struct ltq_soc_info *i)
+{
+	i->partnum = (ltq_r32(LTQ_MPS_CHIPID) & PART_MASK) >> PART_SHIFT;
+	i->rev = (ltq_r32(LTQ_MPS_CHIPID) & REV_MASK) >> REV_SHIFT;
+	sprintf(i->rev_type, "1.%d", i->rev);
+	switch (i->partnum) {
+	case SOC_ID_DANUBE1:
+	case SOC_ID_DANUBE2:
+		i->name = SOC_DANUBE;
+		i->type = SOC_TYPE_DANUBE;
+		i->compatible = COMP_DANUBE;
+		break;
+
+	case SOC_ID_TWINPASS:
+		i->name = SOC_TWINPASS;
+		i->type = SOC_TYPE_DANUBE;
+		i->compatible = COMP_TWINPASS;
+		break;
+
+	case SOC_ID_ARX188:
+	case SOC_ID_ARX168_1:
+	case SOC_ID_ARX168_2:
+	case SOC_ID_ARX182:
+		i->name = SOC_AR9;
+		i->type = SOC_TYPE_AR9;
+		i->compatible = COMP_AR9;
+		break;
+
+	case SOC_ID_GRX188:
+	case SOC_ID_GRX168:
+		i->name = SOC_GR9;
+		i->type = SOC_TYPE_AR9;
+		i->compatible = COMP_GR9;
+		break;
+
+	case SOC_ID_AMAZON_SE_1:
+	case SOC_ID_AMAZON_SE_2:
+#ifdef CONFIG_PCI
+		panic("ase is only supported for non pci kernels");
+#endif
+		i->name = SOC_AMAZON_SE;
+		i->type = SOC_TYPE_AMAZON_SE;
+		i->compatible = COMP_AMAZON_SE;
+		break;
+
+	case SOC_ID_VRX282:
+	case SOC_ID_VRX268:
+	case SOC_ID_VRX288:
+		i->name = SOC_VR9;
+		i->type = SOC_TYPE_VR9;
+		i->compatible = COMP_VR9;
+		break;
+
+	case SOC_ID_GRX268:
+	case SOC_ID_GRX288:
+		i->name = SOC_GR9;
+		i->type = SOC_TYPE_VR9;
+		i->compatible = COMP_GR9;
+		break;
+
+	case SOC_ID_VRX268_2:
+	case SOC_ID_VRX288_2:
+		i->name = SOC_VR9;
+		i->type = SOC_TYPE_VR9_2;
+		i->compatible = COMP_VR9;
+		break;
+
+	case SOC_ID_GRX282_2:
+	case SOC_ID_GRX288_2:
+		i->name = SOC_GR9;
+		i->type = SOC_TYPE_VR9_2;
+		i->compatible = COMP_GR9;
+		break;
+
+	default:
+		unreachable();
+		break;
+	}
+}
diff --git a/arch/mips/lantiq/xway/reset.c b/arch/mips/lantiq/xway/reset.c
index 8b66bd87f0c1..22c55f73aa9d 100644
--- a/arch/mips/lantiq/xway/reset.c
+++ b/arch/mips/lantiq/xway/reset.c
@@ -11,26 +11,31 @@
 #include <linux/ioport.h>
 #include <linux/pm.h>
 #include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+
 #include <asm/reboot.h>
 
 #include <lantiq_soc.h>
 
+#include "../prom.h"
+
 #define ltq_rcu_w32(x, y)	ltq_w32((x), ltq_rcu_membase + (y))
 #define ltq_rcu_r32(x)		ltq_r32(ltq_rcu_membase + (x))
 
-/* register definitions */
-#define LTQ_RCU_RST		0x0010
-#define LTQ_RCU_RST_ALL		0x40000000
-
-#define LTQ_RCU_RST_STAT	0x0014
-#define LTQ_RCU_STAT_SHIFT	26
+/* reset request register */
+#define RCU_RST_REQ		0x0010
+/* reset status register */
+#define RCU_RST_STAT		0x0014
 
-static struct resource ltq_rcu_resource = {
-	.name   = "rcu",
-	.start  = LTQ_RCU_BASE_ADDR,
-	.end    = LTQ_RCU_BASE_ADDR + LTQ_RCU_SIZE - 1,
-	.flags  = IORESOURCE_MEM,
-};
+/* reboot bit */
+#define RCU_RD_SRST		BIT(30)
+/* reset cause */
+#define RCU_STAT_SHIFT		26
+/* boot selection */
+#define RCU_BOOT_SEL_SHIFT	26
+#define RCU_BOOT_SEL_MASK	0x7
 
 /* remapped base addr of the reset control unit */
 static void __iomem *ltq_rcu_membase;
@@ -38,48 +43,64 @@ static void __iomem *ltq_rcu_membase;
 /* This function is used by the watchdog driver */
 int ltq_reset_cause(void)
 {
-	u32 val = ltq_rcu_r32(LTQ_RCU_RST_STAT);
-	return val >> LTQ_RCU_STAT_SHIFT;
+	u32 val = ltq_rcu_r32(RCU_RST_STAT);
+	return val >> RCU_STAT_SHIFT;
 }
 EXPORT_SYMBOL_GPL(ltq_reset_cause);
 
+/* allow platform code to find out what source we booted from */
+unsigned char ltq_boot_select(void)
+{
+	u32 val = ltq_rcu_r32(RCU_RST_STAT);
+	return (val >> RCU_BOOT_SEL_SHIFT) & RCU_BOOT_SEL_MASK;
+}
+
+/* reset a io domain for u micro seconds */
+void ltq_reset_once(unsigned int module, ulong u)
+{
+	ltq_rcu_w32(ltq_rcu_r32(RCU_RST_REQ) | module, RCU_RST_REQ);
+	udelay(u);
+	ltq_rcu_w32(ltq_rcu_r32(RCU_RST_REQ) & ~module, RCU_RST_REQ);
+}
+
 static void ltq_machine_restart(char *command)
 {
-	pr_notice("System restart\n");
 	local_irq_disable();
-	ltq_rcu_w32(ltq_rcu_r32(LTQ_RCU_RST) | LTQ_RCU_RST_ALL, LTQ_RCU_RST);
+	ltq_rcu_w32(ltq_rcu_r32(RCU_RST_REQ) | RCU_RD_SRST, RCU_RST_REQ);
 	unreachable();
 }
 
 static void ltq_machine_halt(void)
 {
-	pr_notice("System halted.\n");
 	local_irq_disable();
 	unreachable();
 }
 
 static void ltq_machine_power_off(void)
 {
-	pr_notice("Please turn off the power now.\n");
 	local_irq_disable();
 	unreachable();
 }
 
 static int __init mips_reboot_setup(void)
 {
-	/* insert and request the memory region */
-	if (insert_resource(&iomem_resource, &ltq_rcu_resource) < 0)
-		panic("Failed to insert rcu memory");
+	struct resource res;
+	struct device_node *np =
+		of_find_compatible_node(NULL, NULL, "lantiq,rcu-xway");
+
+	/* check if all the reset register range is available */
+	if (!np)
+		panic("Failed to load reset resources from devicetree");
+
+	if (of_address_to_resource(np, 0, &res))
+		panic("Failed to get rcu memory range");
 
-	if (request_mem_region(ltq_rcu_resource.start,
-			resource_size(&ltq_rcu_resource), "rcu") < 0)
-		panic("Failed to request rcu memory");
+	if (request_mem_region(res.start, resource_size(&res), res.name) < 0)
+		pr_err("Failed to request rcu memory");
 
-	/* remap rcu register range */
-	ltq_rcu_membase = ioremap_nocache(ltq_rcu_resource.start,
-				resource_size(&ltq_rcu_resource));
+	ltq_rcu_membase = ioremap_nocache(res.start, resource_size(&res));
 	if (!ltq_rcu_membase)
-		panic("Failed to remap rcu memory");
+		panic("Failed to remap core memory");
 
 	_machine_restart = ltq_machine_restart;
 	_machine_halt = ltq_machine_halt;
diff --git a/arch/mips/lantiq/xway/setup-ase.c b/arch/mips/lantiq/xway/setup-ase.c
deleted file mode 100644
index f6f326798a39..000000000000
--- a/arch/mips/lantiq/xway/setup-ase.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2011 John Crispin <[email protected]>
- */
-
-#include <lantiq_soc.h>
-
-#include "../prom.h"
-#include "devices.h"
-
-void __init ltq_soc_setup(void)
-{
-	ltq_register_ase_asc();
-	ltq_register_gpio();
-	ltq_register_wdt();
-}
diff --git a/arch/mips/lantiq/xway/setup-xway.c b/arch/mips/lantiq/xway/setup-xway.c
deleted file mode 100644
index c292f643a858..000000000000
--- a/arch/mips/lantiq/xway/setup-xway.c
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- *
- *  Copyright (C) 2011 John Crispin <[email protected]>
- */
-
-#include <lantiq_soc.h>
-
-#include "../prom.h"
-#include "devices.h"
-
-void __init ltq_soc_setup(void)
-{
-	ltq_register_asc(0);
-	ltq_register_asc(1);
-	ltq_register_gpio();
-	ltq_register_wdt();
-}
diff --git a/arch/mips/lantiq/xway/sysctrl.c b/arch/mips/lantiq/xway/sysctrl.c
new file mode 100644
index 000000000000..83780f7c842b
--- /dev/null
+++ b/arch/mips/lantiq/xway/sysctrl.c
@@ -0,0 +1,371 @@
+/*
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ *
+ *  Copyright (C) 2011-2012 John Crispin <[email protected]>
+ */
+
+#include <linux/ioport.h>
+#include <linux/export.h>
+#include <linux/clkdev.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/of_address.h>
+
+#include <lantiq_soc.h>
+
+#include "../clk.h"
+#include "../prom.h"
+
+/* clock control register */
+#define CGU_IFCCR	0x0018
+/* system clock register */
+#define CGU_SYS		0x0010
+/* pci control register */
+#define CGU_PCICR	0x0034
+/* ephy configuration register */
+#define CGU_EPHY	0x10
+/* power control register */
+#define PMU_PWDCR	0x1C
+/* power status register */
+#define PMU_PWDSR	0x20
+/* power control register */
+#define PMU_PWDCR1	0x24
+/* power status register */
+#define PMU_PWDSR1	0x28
+/* power control register */
+#define PWDCR(x) ((x) ? (PMU_PWDCR1) : (PMU_PWDCR))
+/* power status register */
+#define PWDSR(x) ((x) ? (PMU_PWDSR1) : (PMU_PWDSR))
+
+/* clock gates that we can en/disable */
+#define PMU_USB0_P	BIT(0)
+#define PMU_PCI		BIT(4)
+#define PMU_DMA		BIT(5)
+#define PMU_USB0	BIT(6)
+#define PMU_ASC0	BIT(7)
+#define PMU_EPHY	BIT(7)	/* ase */
+#define PMU_SPI		BIT(8)
+#define PMU_DFE		BIT(9)
+#define PMU_EBU		BIT(10)
+#define PMU_STP		BIT(11)
+#define PMU_GPT		BIT(12)
+#define PMU_AHBS	BIT(13) /* vr9 */
+#define PMU_FPI		BIT(14)
+#define PMU_AHBM	BIT(15)
+#define PMU_ASC1	BIT(17)
+#define PMU_PPE_QSB	BIT(18)
+#define PMU_PPE_SLL01	BIT(19)
+#define PMU_PPE_TC	BIT(21)
+#define PMU_PPE_EMA	BIT(22)
+#define PMU_PPE_DPLUM	BIT(23)
+#define PMU_PPE_DPLUS	BIT(24)
+#define PMU_USB1_P	BIT(26)
+#define PMU_USB1	BIT(27)
+#define PMU_SWITCH	BIT(28)
+#define PMU_PPE_TOP	BIT(29)
+#define PMU_GPHY	BIT(30)
+#define PMU_PCIE_CLK	BIT(31)
+
+#define PMU1_PCIE_PHY	BIT(0)
+#define PMU1_PCIE_CTL	BIT(1)
+#define PMU1_PCIE_PDI	BIT(4)
+#define PMU1_PCIE_MSI	BIT(5)
+
+#define pmu_w32(x, y)	ltq_w32((x), pmu_membase + (y))
+#define pmu_r32(x)	ltq_r32(pmu_membase + (x))
+
+static void __iomem *pmu_membase;
+void __iomem *ltq_cgu_membase;
+void __iomem *ltq_ebu_membase;
+
+/* legacy function kept alive to ease clkdev transition */
+void ltq_pmu_enable(unsigned int module)
+{
+	int err = 1000000;
+
+	pmu_w32(pmu_r32(PMU_PWDCR) & ~module, PMU_PWDCR);
+	do {} while (--err && (pmu_r32(PMU_PWDSR) & module));
+
+	if (!err)
+		panic("activating PMU module failed!");
+}
+EXPORT_SYMBOL(ltq_pmu_enable);
+
+/* legacy function kept alive to ease clkdev transition */
+void ltq_pmu_disable(unsigned int module)
+{
+	pmu_w32(pmu_r32(PMU_PWDCR) | module, PMU_PWDCR);
+}
+EXPORT_SYMBOL(ltq_pmu_disable);
+
+/* enable a hw clock */
+static int cgu_enable(struct clk *clk)
+{
+	ltq_cgu_w32(ltq_cgu_r32(CGU_IFCCR) | clk->bits, CGU_IFCCR);
+	return 0;
+}
+
+/* disable a hw clock */
+static void cgu_disable(struct clk *clk)
+{
+	ltq_cgu_w32(ltq_cgu_r32(CGU_IFCCR) & ~clk->bits, CGU_IFCCR);
+}
+
+/* enable a clock gate */
+static int pmu_enable(struct clk *clk)
+{
+	int retry = 1000000;
+
+	pmu_w32(pmu_r32(PWDCR(clk->module)) & ~clk->bits,
+		PWDCR(clk->module));
+	do {} while (--retry && (pmu_r32(PWDSR(clk->module)) & clk->bits));
+
+	if (!retry)
+		panic("activating PMU module failed!\n");
+
+	return 0;
+}
+
+/* disable a clock gate */
+static void pmu_disable(struct clk *clk)
+{
+	pmu_w32(pmu_r32(PWDCR(clk->module)) | clk->bits,
+		PWDCR(clk->module));
+}
+
+/* the pci enable helper */
+static int pci_enable(struct clk *clk)
+{
+	unsigned int ifccr = ltq_cgu_r32(CGU_IFCCR);
+	/* set bus clock speed */
+	if (of_machine_is_compatible("lantiq,ar9")) {
+		ifccr &= ~0x1f00000;
+		if (clk->rate == CLOCK_33M)
+			ifccr |= 0xe00000;
+		else
+			ifccr |= 0x700000; /* 62.5M */
+	} else {
+		ifccr &= ~0xf00000;
+		if (clk->rate == CLOCK_33M)
+			ifccr |= 0x800000;
+		else
+			ifccr |= 0x400000; /* 62.5M */
+	}
+	ltq_cgu_w32(ifccr, CGU_IFCCR);
+	pmu_enable(clk);
+	return 0;
+}
+
+/* enable the external clock as a source */
+static int pci_ext_enable(struct clk *clk)
+{
+	ltq_cgu_w32(ltq_cgu_r32(CGU_IFCCR) & ~(1 << 16),
+		CGU_IFCCR);
+	ltq_cgu_w32((1 << 30), CGU_PCICR);
+	return 0;
+}
+
+/* disable the external clock as a source */
+static void pci_ext_disable(struct clk *clk)
+{
+	ltq_cgu_w32(ltq_cgu_r32(CGU_IFCCR) | (1 << 16),
+		CGU_IFCCR);
+	ltq_cgu_w32((1 << 31) | (1 << 30), CGU_PCICR);
+}
+
+/* enable a clockout source */
+static int clkout_enable(struct clk *clk)
+{
+	int i;
+
+	/* get the correct rate */
+	for (i = 0; i < 4; i++) {
+		if (clk->rates[i] == clk->rate) {
+			int shift = 14 - (2 * clk->module);
+			unsigned int ifccr = ltq_cgu_r32(CGU_IFCCR);
+
+			ifccr &= ~(3 << shift);
+			ifccr |= i << shift;
+			ltq_cgu_w32(ifccr, CGU_IFCCR);
+			return 0;
+		}
+	}
+	return -1;
+}
+
+/* manage the clock gates via PMU */
+static void clkdev_add_pmu(const char *dev, const char *con,
+					unsigned int module, unsigned int bits)
+{
+	struct clk *clk = kzalloc(sizeof(struct clk), GFP_KERNEL);
+
+	clk->cl.dev_id = dev;
+	clk->cl.con_id = con;
+	clk->cl.clk = clk;
+	clk->enable = pmu_enable;
+	clk->disable = pmu_disable;
+	clk->module = module;
+	clk->bits = bits;
+	clkdev_add(&clk->cl);
+}
+
+/* manage the clock generator */
+static void clkdev_add_cgu(const char *dev, const char *con,
+					unsigned int bits)
+{
+	struct clk *clk = kzalloc(sizeof(struct clk), GFP_KERNEL);
+
+	clk->cl.dev_id = dev;
+	clk->cl.con_id = con;
+	clk->cl.clk = clk;
+	clk->enable = cgu_enable;
+	clk->disable = cgu_disable;
+	clk->bits = bits;
+	clkdev_add(&clk->cl);
+}
+
+/* pci needs its own enable function as the setup is a bit more complex */
+static unsigned long valid_pci_rates[] = {CLOCK_33M, CLOCK_62_5M, 0};
+
+static void clkdev_add_pci(void)
+{
+	struct clk *clk = kzalloc(sizeof(struct clk), GFP_KERNEL);
+	struct clk *clk_ext = kzalloc(sizeof(struct clk), GFP_KERNEL);
+
+	/* main pci clock */
+	clk->cl.dev_id = "17000000.pci";
+	clk->cl.con_id = NULL;
+	clk->cl.clk = clk;
+	clk->rate = CLOCK_33M;
+	clk->rates = valid_pci_rates;
+	clk->enable = pci_enable;
+	clk->disable = pmu_disable;
+	clk->module = 0;
+	clk->bits = PMU_PCI;
+	clkdev_add(&clk->cl);
+
+	/* use internal/external bus clock */
+	clk_ext->cl.dev_id = "17000000.pci";
+	clk_ext->cl.con_id = "external";
+	clk_ext->cl.clk = clk_ext;
+	clk_ext->enable = pci_ext_enable;
+	clk_ext->disable = pci_ext_disable;
+	clkdev_add(&clk_ext->cl);
+}
+
+/* xway socs can generate clocks on gpio pins */
+static unsigned long valid_clkout_rates[4][5] = {
+	{CLOCK_32_768K, CLOCK_1_536M, CLOCK_2_5M, CLOCK_12M, 0},
+	{CLOCK_40M, CLOCK_12M, CLOCK_24M, CLOCK_48M, 0},
+	{CLOCK_25M, CLOCK_40M, CLOCK_30M, CLOCK_60M, 0},
+	{CLOCK_12M, CLOCK_50M, CLOCK_32_768K, CLOCK_25M, 0},
+};
+
+static void clkdev_add_clkout(void)
+{
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		struct clk *clk;
+		char *name;
+
+		name = kzalloc(sizeof("clkout0"), GFP_KERNEL);
+		sprintf(name, "clkout%d", i);
+
+		clk = kzalloc(sizeof(struct clk), GFP_KERNEL);
+		clk->cl.dev_id = "1f103000.cgu";
+		clk->cl.con_id = name;
+		clk->cl.clk = clk;
+		clk->rate = 0;
+		clk->rates = valid_clkout_rates[i];
+		clk->enable = clkout_enable;
+		clk->module = i;
+		clkdev_add(&clk->cl);
+	}
+}
+
+/* bring up all register ranges that we need for basic system control */
+void __init ltq_soc_init(void)
+{
+	struct resource res_pmu, res_cgu, res_ebu;
+	struct device_node *np_pmu =
+			of_find_compatible_node(NULL, NULL, "lantiq,pmu-xway");
+	struct device_node *np_cgu =
+			of_find_compatible_node(NULL, NULL, "lantiq,cgu-xway");
+	struct device_node *np_ebu =
+			of_find_compatible_node(NULL, NULL, "lantiq,ebu-xway");
+
+	/* check if all the core register ranges are available */
+	if (!np_pmu || !np_cgu || !np_ebu)
+		panic("Failed to load core nodess from devicetree");
+
+	if (of_address_to_resource(np_pmu, 0, &res_pmu) ||
+			of_address_to_resource(np_cgu, 0, &res_cgu) ||
+			of_address_to_resource(np_ebu, 0, &res_ebu))
+		panic("Failed to get core resources");
+
+	if ((request_mem_region(res_pmu.start, resource_size(&res_pmu),
+				res_pmu.name) < 0) ||
+		(request_mem_region(res_cgu.start, resource_size(&res_cgu),
+				res_cgu.name) < 0) ||
+		(request_mem_region(res_ebu.start, resource_size(&res_ebu),
+				res_ebu.name) < 0))
+		pr_err("Failed to request core reources");
+
+	pmu_membase = ioremap_nocache(res_pmu.start, resource_size(&res_pmu));
+	ltq_cgu_membase = ioremap_nocache(res_cgu.start,
+						resource_size(&res_cgu));
+	ltq_ebu_membase = ioremap_nocache(res_ebu.start,
+						resource_size(&res_ebu));
+	if (!pmu_membase || !ltq_cgu_membase || !ltq_ebu_membase)
+		panic("Failed to remap core resources");
+
+	/* make sure to unprotect the memory region where flash is located */
+	ltq_ebu_w32(ltq_ebu_r32(LTQ_EBU_BUSCON0) & ~EBU_WRDIS, LTQ_EBU_BUSCON0);
+
+	/* add our generic xway clocks */
+	clkdev_add_pmu("10000000.fpi", NULL, 0, PMU_FPI);
+	clkdev_add_pmu("1e100400.serial", NULL, 0, PMU_ASC0);
+	clkdev_add_pmu("1e100a00.gptu", NULL, 0, PMU_GPT);
+	clkdev_add_pmu("1e100bb0.stp", NULL, 0, PMU_STP);
+	clkdev_add_pmu("1e104100.dma", NULL, 0, PMU_DMA);
+	clkdev_add_pmu("1e100800.spi", NULL, 0, PMU_SPI);
+	clkdev_add_pmu("1e105300.ebu", NULL, 0, PMU_EBU);
+	clkdev_add_clkout();
+
+	/* add the soc dependent clocks */
+	if (!of_machine_is_compatible("lantiq,vr9"))
+		clkdev_add_pmu("1e180000.etop", NULL, 0, PMU_PPE);
+
+	if (!of_machine_is_compatible("lantiq,ase")) {
+		clkdev_add_pmu("1e100c00.serial", NULL, 0, PMU_ASC1);
+		clkdev_add_pci();
+	}
+
+	if (of_machine_is_compatible("lantiq,ase")) {
+		if (ltq_cgu_r32(CGU_SYS) & (1 << 5))
+			clkdev_add_static(CLOCK_266M, CLOCK_133M, CLOCK_133M);
+		else
+			clkdev_add_static(CLOCK_133M, CLOCK_133M, CLOCK_133M);
+		clkdev_add_cgu("1e180000.etop", "ephycgu", CGU_EPHY),
+		clkdev_add_pmu("1e180000.etop", "ephy", 0, PMU_EPHY);
+	} else if (of_machine_is_compatible("lantiq,vr9")) {
+		clkdev_add_static(ltq_vr9_cpu_hz(), ltq_vr9_fpi_hz(),
+				ltq_vr9_fpi_hz());
+		clkdev_add_pmu("1d900000.pcie", "phy", 1, PMU1_PCIE_PHY);
+		clkdev_add_pmu("1d900000.pcie", "bus", 0, PMU_PCIE_CLK);
+		clkdev_add_pmu("1d900000.pcie", "msi", 1, PMU1_PCIE_MSI);
+		clkdev_add_pmu("1d900000.pcie", "pdi", 1, PMU1_PCIE_PDI);
+		clkdev_add_pmu("1d900000.pcie", "ctl", 1, PMU1_PCIE_CTL);
+		clkdev_add_pmu("1d900000.pcie", "ahb", 0, PMU_AHBM | PMU_AHBS);
+	} else if (of_machine_is_compatible("lantiq,ar9")) {
+		clkdev_add_static(ltq_ar9_cpu_hz(), ltq_ar9_fpi_hz(),
+				ltq_ar9_fpi_hz());
+		clkdev_add_pmu("1e180000.etop", "switch", 0, PMU_SWITCH);
+	} else {
+		clkdev_add_static(ltq_danube_cpu_hz(), ltq_danube_fpi_hz(),
+				ltq_danube_fpi_hz());
+	}
+}
diff --git a/arch/mips/mm/c-octeon.c b/arch/mips/mm/c-octeon.c
index 47037ec5589b..44e69e7a4519 100644
--- a/arch/mips/mm/c-octeon.c
+++ b/arch/mips/mm/c-octeon.c
@@ -21,6 +21,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/r4kcache.h>
+#include <asm/traps.h>
 #include <asm/mmu_context.h>
 #include <asm/war.h>
 
@@ -248,6 +249,11 @@ static void __cpuinit probe_octeon(void)
 	}
 }
 
+static void  __cpuinit octeon_cache_error_setup(void)
+{
+	extern char except_vec2_octeon;
+	set_handler(0x100, &except_vec2_octeon, 0x80);
+}
 
 /**
  * Setup the Octeon cache flush routines
@@ -255,12 +261,6 @@ static void __cpuinit probe_octeon(void)
  */
 void __cpuinit octeon_cache_init(void)
 {
-	extern unsigned long ebase;
-	extern char except_vec2_octeon;
-
-	memcpy((void *)(ebase + 0x100), &except_vec2_octeon, 0x80);
-	octeon_flush_cache_sigtramp(ebase + 0x100);
-
 	probe_octeon();
 
 	shm_align_mask = PAGE_SIZE - 1;
@@ -280,6 +280,8 @@ void __cpuinit octeon_cache_init(void)
 
 	build_clear_page();
 	build_copy_page();
+
+	board_cache_error_setup = octeon_cache_error_setup;
 }
 
 /**
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index bda8eb26ece7..5109be96d98d 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -32,7 +32,7 @@
 #include <asm/mmu_context.h>
 #include <asm/war.h>
 #include <asm/cacheflush.h> /* for run_uncached() */
-
+#include <asm/traps.h>
 
 /*
  * Special Variant of smp_call_function for use by cache functions:
@@ -1385,10 +1385,8 @@ static int __init setcoherentio(char *str)
 __setup("coherentio", setcoherentio);
 #endif
 
-void __cpuinit r4k_cache_init(void)
+static void __cpuinit r4k_cache_error_setup(void)
 {
-	extern void build_clear_page(void);
-	extern void build_copy_page(void);
 	extern char __weak except_vec2_generic;
 	extern char __weak except_vec2_sb1;
 	struct cpuinfo_mips *c = &current_cpu_data;
@@ -1403,6 +1401,13 @@ void __cpuinit r4k_cache_init(void)
 		set_uncached_handler(0x100, &except_vec2_generic, 0x80);
 		break;
 	}
+}
+
+void __cpuinit r4k_cache_init(void)
+{
+	extern void build_clear_page(void);
+	extern void build_copy_page(void);
+	struct cpuinfo_mips *c = &current_cpu_data;
 
 	probe_pcache();
 	setup_scache();
@@ -1465,4 +1470,5 @@ void __cpuinit r4k_cache_init(void)
 	local_r4k___flush_cache_all(NULL);
 #endif
 	coherency_setup();
+	board_cache_error_setup = r4k_cache_error_setup;
 }
diff --git a/arch/mips/oprofile/Makefile b/arch/mips/oprofile/Makefile
index 29f2f13eb31c..1208c280f77d 100644
--- a/arch/mips/oprofile/Makefile
+++ b/arch/mips/oprofile/Makefile
@@ -1,5 +1,3 @@
-ccflags-y := -Werror
-
 obj-$(CONFIG_OPROFILE) += oprofile.o
 
 DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
diff --git a/arch/mips/oprofile/op_model_mipsxx.c b/arch/mips/oprofile/op_model_mipsxx.c
index 54759f1669d3..baba3bcaa3c2 100644
--- a/arch/mips/oprofile/op_model_mipsxx.c
+++ b/arch/mips/oprofile/op_model_mipsxx.c
@@ -298,6 +298,11 @@ static void reset_counters(void *arg)
 	}
 }
 
+static irqreturn_t mipsxx_perfcount_int(int irq, void *dev_id)
+{
+	return mipsxx_perfcount_handler();
+}
+
 static int __init mipsxx_init(void)
 {
 	int counters;
@@ -374,6 +379,10 @@ static int __init mipsxx_init(void)
 	save_perf_irq = perf_irq;
 	perf_irq = mipsxx_perfcount_handler;
 
+	if ((cp0_perfcount_irq >= 0) && (cp0_compare_irq != cp0_perfcount_irq))
+		return request_irq(cp0_perfcount_irq, mipsxx_perfcount_int,
+			0, "Perfcounter", save_perf_irq);
+
 	return 0;
 }
 
@@ -381,6 +390,9 @@ static void mipsxx_exit(void)
 {
 	int counters = op_model_mipsxx_ops.num_counters;
 
+	if ((cp0_perfcount_irq >= 0) && (cp0_compare_irq != cp0_perfcount_irq))
+		free_irq(cp0_perfcount_irq, save_perf_irq);
+
 	counters = counters_per_cpu_to_total(counters);
 	on_each_cpu(reset_counters, (void *)(long)counters, 1);
 
diff --git a/arch/mips/pci/Makefile b/arch/mips/pci/Makefile
index c3ac4b086eb2..c703f43a9914 100644
--- a/arch/mips/pci/Makefile
+++ b/arch/mips/pci/Makefile
@@ -19,7 +19,8 @@ obj-$(CONFIG_BCM47XX)		+= pci-bcm47xx.o
 obj-$(CONFIG_BCM63XX)		+= pci-bcm63xx.o fixup-bcm63xx.o \
 					ops-bcm63xx.o
 obj-$(CONFIG_MIPS_ALCHEMY)	+= pci-alchemy.o
-obj-$(CONFIG_SOC_AR724X)	+= pci-ath724x.o
+obj-$(CONFIG_SOC_AR71XX)	+= pci-ar71xx.o
+obj-$(CONFIG_PCI_AR724X)	+= pci-ar724x.o
 
 #
 # These are still pretty much in the old state, watch, go blind.
@@ -41,7 +42,8 @@ obj-$(CONFIG_SIBYTE_SB1250)	+= fixup-sb1250.o pci-sb1250.o
 obj-$(CONFIG_SIBYTE_BCM112X)	+= fixup-sb1250.o pci-sb1250.o
 obj-$(CONFIG_SIBYTE_BCM1x80)	+= pci-bcm1480.o pci-bcm1480ht.o
 obj-$(CONFIG_SNI_RM)		+= fixup-sni.o ops-sni.o
-obj-$(CONFIG_SOC_XWAY)		+= pci-lantiq.o ops-lantiq.o
+obj-$(CONFIG_LANTIQ)		+= fixup-lantiq.o
+obj-$(CONFIG_PCI_LANTIQ)	+= pci-lantiq.o ops-lantiq.o
 obj-$(CONFIG_TANBAC_TB0219)	+= fixup-tb0219.o
 obj-$(CONFIG_TANBAC_TB0226)	+= fixup-tb0226.o
 obj-$(CONFIG_TANBAC_TB0287)	+= fixup-tb0287.o
diff --git a/arch/mips/pci/fixup-lantiq.c b/arch/mips/pci/fixup-lantiq.c
new file mode 100644
index 000000000000..6c829df28dc7
--- /dev/null
+++ b/arch/mips/pci/fixup-lantiq.c
@@ -0,0 +1,40 @@
+/*
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ *
+ *  Copyright (C) 2012 John Crispin <[email protected]>
+ */
+
+#include <linux/of_irq.h>
+#include <linux/of_pci.h>
+
+int (*ltq_pci_plat_arch_init)(struct pci_dev *dev) = NULL;
+int (*ltq_pci_plat_dev_init)(struct pci_dev *dev) = NULL;
+
+int pcibios_plat_dev_init(struct pci_dev *dev)
+{
+	if (ltq_pci_plat_arch_init)
+		return ltq_pci_plat_arch_init(dev);
+
+	if (ltq_pci_plat_dev_init)
+		return ltq_pci_plat_dev_init(dev);
+
+	return 0;
+}
+
+int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
+{
+	struct of_irq dev_irq;
+	int irq;
+
+	if (of_irq_map_pci(dev, &dev_irq)) {
+		dev_err(&dev->dev, "trying to map irq for unknown slot:%d pin:%d\n",
+			slot, pin);
+		return 0;
+	}
+	irq = irq_create_of_mapping(dev_irq.controller, dev_irq.specifier,
+					dev_irq.size);
+	dev_info(&dev->dev, "SLOT:%d PIN:%d IRQ:%d\n", slot, pin, irq);
+	return irq;
+}
diff --git a/arch/mips/pci/ops-loongson2.c b/arch/mips/pci/ops-loongson2.c
index d657ee0bc131..afd221122d22 100644
--- a/arch/mips/pci/ops-loongson2.c
+++ b/arch/mips/pci/ops-loongson2.c
@@ -15,6 +15,7 @@
 #include <linux/pci.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/export.h>
 
 #include <loongson.h>
 
diff --git a/arch/mips/pci/pci-ar71xx.c b/arch/mips/pci/pci-ar71xx.c
new file mode 100644
index 000000000000..1552522b8718
--- /dev/null
+++ b/arch/mips/pci/pci-ar71xx.c
@@ -0,0 +1,375 @@
+/*
+ *  Atheros AR71xx PCI host controller driver
+ *
+ *  Copyright (C) 2008-2011 Gabor Juhos <[email protected]>
+ *  Copyright (C) 2008 Imre Kaloz <[email protected]>
+ *
+ *  Parts of this file are based on Atheros' 2.6.15 BSP
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ */
+
+#include <linux/resource.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/pci.h>
+#include <linux/pci_regs.h>
+#include <linux/interrupt.h>
+
+#include <asm/mach-ath79/ar71xx_regs.h>
+#include <asm/mach-ath79/ath79.h>
+#include <asm/mach-ath79/pci.h>
+
+#define AR71XX_PCI_MEM_BASE	0x10000000
+#define AR71XX_PCI_MEM_SIZE	0x08000000
+
+#define AR71XX_PCI_WIN0_OFFS		0x10000000
+#define AR71XX_PCI_WIN1_OFFS		0x11000000
+#define AR71XX_PCI_WIN2_OFFS		0x12000000
+#define AR71XX_PCI_WIN3_OFFS		0x13000000
+#define AR71XX_PCI_WIN4_OFFS		0x14000000
+#define AR71XX_PCI_WIN5_OFFS		0x15000000
+#define AR71XX_PCI_WIN6_OFFS		0x16000000
+#define AR71XX_PCI_WIN7_OFFS		0x07000000
+
+#define AR71XX_PCI_CFG_BASE		\
+	(AR71XX_PCI_MEM_BASE + AR71XX_PCI_WIN7_OFFS + 0x10000)
+#define AR71XX_PCI_CFG_SIZE		0x100
+
+#define AR71XX_PCI_REG_CRP_AD_CBE	0x00
+#define AR71XX_PCI_REG_CRP_WRDATA	0x04
+#define AR71XX_PCI_REG_CRP_RDDATA	0x08
+#define AR71XX_PCI_REG_CFG_AD		0x0c
+#define AR71XX_PCI_REG_CFG_CBE		0x10
+#define AR71XX_PCI_REG_CFG_WRDATA	0x14
+#define AR71XX_PCI_REG_CFG_RDDATA	0x18
+#define AR71XX_PCI_REG_PCI_ERR		0x1c
+#define AR71XX_PCI_REG_PCI_ERR_ADDR	0x20
+#define AR71XX_PCI_REG_AHB_ERR		0x24
+#define AR71XX_PCI_REG_AHB_ERR_ADDR	0x28
+
+#define AR71XX_PCI_CRP_CMD_WRITE	0x00010000
+#define AR71XX_PCI_CRP_CMD_READ		0x00000000
+#define AR71XX_PCI_CFG_CMD_READ		0x0000000a
+#define AR71XX_PCI_CFG_CMD_WRITE	0x0000000b
+
+#define AR71XX_PCI_INT_CORE		BIT(4)
+#define AR71XX_PCI_INT_DEV2		BIT(2)
+#define AR71XX_PCI_INT_DEV1		BIT(1)
+#define AR71XX_PCI_INT_DEV0		BIT(0)
+
+#define AR71XX_PCI_IRQ_COUNT		5
+
+static DEFINE_SPINLOCK(ar71xx_pci_lock);
+static void __iomem *ar71xx_pcicfg_base;
+
+/* Byte lane enable bits */
+static const u8 ar71xx_pci_ble_table[4][4] = {
+	{0x0, 0xf, 0xf, 0xf},
+	{0xe, 0xd, 0xb, 0x7},
+	{0xc, 0xf, 0x3, 0xf},
+	{0xf, 0xf, 0xf, 0xf},
+};
+
+static const u32 ar71xx_pci_read_mask[8] = {
+	0, 0xff, 0xffff, 0, 0xffffffff, 0, 0, 0
+};
+
+static inline u32 ar71xx_pci_get_ble(int where, int size, int local)
+{
+	u32 t;
+
+	t = ar71xx_pci_ble_table[size & 3][where & 3];
+	BUG_ON(t == 0xf);
+	t <<= (local) ? 20 : 4;
+
+	return t;
+}
+
+static inline u32 ar71xx_pci_bus_addr(struct pci_bus *bus, unsigned int devfn,
+				      int where)
+{
+	u32 ret;
+
+	if (!bus->number) {
+		/* type 0 */
+		ret = (1 << PCI_SLOT(devfn)) | (PCI_FUNC(devfn) << 8) |
+		      (where & ~3);
+	} else {
+		/* type 1 */
+		ret = (bus->number << 16) | (PCI_SLOT(devfn) << 11) |
+		      (PCI_FUNC(devfn) << 8) | (where & ~3) | 1;
+	}
+
+	return ret;
+}
+
+static int ar71xx_pci_check_error(int quiet)
+{
+	void __iomem *base = ar71xx_pcicfg_base;
+	u32 pci_err;
+	u32 ahb_err;
+
+	pci_err = __raw_readl(base + AR71XX_PCI_REG_PCI_ERR) & 3;
+	if (pci_err) {
+		if (!quiet) {
+			u32 addr;
+
+			addr = __raw_readl(base + AR71XX_PCI_REG_PCI_ERR_ADDR);
+			pr_crit("ar71xx: %s bus error %d at addr 0x%x\n",
+				"PCI", pci_err, addr);
+		}
+
+		/* clear PCI error status */
+		__raw_writel(pci_err, base + AR71XX_PCI_REG_PCI_ERR);
+	}
+
+	ahb_err = __raw_readl(base + AR71XX_PCI_REG_AHB_ERR) & 1;
+	if (ahb_err) {
+		if (!quiet) {
+			u32 addr;
+
+			addr = __raw_readl(base + AR71XX_PCI_REG_AHB_ERR_ADDR);
+			pr_crit("ar71xx: %s bus error %d at addr 0x%x\n",
+				"AHB", ahb_err, addr);
+		}
+
+		/* clear AHB error status */
+		__raw_writel(ahb_err, base + AR71XX_PCI_REG_AHB_ERR);
+	}
+
+	return !!(ahb_err | pci_err);
+}
+
+static inline void ar71xx_pci_local_write(int where, int size, u32 value)
+{
+	void __iomem *base = ar71xx_pcicfg_base;
+	u32 ad_cbe;
+
+	value = value << (8 * (where & 3));
+
+	ad_cbe = AR71XX_PCI_CRP_CMD_WRITE | (where & ~3);
+	ad_cbe |= ar71xx_pci_get_ble(where, size, 1);
+
+	__raw_writel(ad_cbe, base + AR71XX_PCI_REG_CRP_AD_CBE);
+	__raw_writel(value, base + AR71XX_PCI_REG_CRP_WRDATA);
+}
+
+static inline int ar71xx_pci_set_cfgaddr(struct pci_bus *bus,
+					 unsigned int devfn,
+					 int where, int size, u32 cmd)
+{
+	void __iomem *base = ar71xx_pcicfg_base;
+	u32 addr;
+
+	addr = ar71xx_pci_bus_addr(bus, devfn, where);
+
+	__raw_writel(addr, base + AR71XX_PCI_REG_CFG_AD);
+	__raw_writel(cmd | ar71xx_pci_get_ble(where, size, 0),
+		     base + AR71XX_PCI_REG_CFG_CBE);
+
+	return ar71xx_pci_check_error(1);
+}
+
+static int ar71xx_pci_read_config(struct pci_bus *bus, unsigned int devfn,
+				  int where, int size, u32 *value)
+{
+	void __iomem *base = ar71xx_pcicfg_base;
+	unsigned long flags;
+	u32 data;
+	int err;
+	int ret;
+
+	ret = PCIBIOS_SUCCESSFUL;
+	data = ~0;
+
+	spin_lock_irqsave(&ar71xx_pci_lock, flags);
+
+	err = ar71xx_pci_set_cfgaddr(bus, devfn, where, size,
+				     AR71XX_PCI_CFG_CMD_READ);
+	if (err)
+		ret = PCIBIOS_DEVICE_NOT_FOUND;
+	else
+		data = __raw_readl(base + AR71XX_PCI_REG_CFG_RDDATA);
+
+	spin_unlock_irqrestore(&ar71xx_pci_lock, flags);
+
+	*value = (data >> (8 * (where & 3))) & ar71xx_pci_read_mask[size & 7];
+
+	return ret;
+}
+
+static int ar71xx_pci_write_config(struct pci_bus *bus, unsigned int devfn,
+				   int where, int size, u32 value)
+{
+	void __iomem *base = ar71xx_pcicfg_base;
+	unsigned long flags;
+	int err;
+	int ret;
+
+	value = value << (8 * (where & 3));
+	ret = PCIBIOS_SUCCESSFUL;
+
+	spin_lock_irqsave(&ar71xx_pci_lock, flags);
+
+	err = ar71xx_pci_set_cfgaddr(bus, devfn, where, size,
+				     AR71XX_PCI_CFG_CMD_WRITE);
+	if (err)
+		ret = PCIBIOS_DEVICE_NOT_FOUND;
+	else
+		__raw_writel(value, base + AR71XX_PCI_REG_CFG_WRDATA);
+
+	spin_unlock_irqrestore(&ar71xx_pci_lock, flags);
+
+	return ret;
+}
+
+static struct pci_ops ar71xx_pci_ops = {
+	.read	= ar71xx_pci_read_config,
+	.write	= ar71xx_pci_write_config,
+};
+
+static struct resource ar71xx_pci_io_resource = {
+	.name		= "PCI IO space",
+	.start		= 0,
+	.end		= 0,
+	.flags		= IORESOURCE_IO,
+};
+
+static struct resource ar71xx_pci_mem_resource = {
+	.name		= "PCI memory space",
+	.start		= AR71XX_PCI_MEM_BASE,
+	.end		= AR71XX_PCI_MEM_BASE + AR71XX_PCI_MEM_SIZE - 1,
+	.flags		= IORESOURCE_MEM
+};
+
+static struct pci_controller ar71xx_pci_controller = {
+	.pci_ops	= &ar71xx_pci_ops,
+	.mem_resource	= &ar71xx_pci_mem_resource,
+	.io_resource	= &ar71xx_pci_io_resource,
+};
+
+static void ar71xx_pci_irq_handler(unsigned int irq, struct irq_desc *desc)
+{
+	void __iomem *base = ath79_reset_base;
+	u32 pending;
+
+	pending = __raw_readl(base + AR71XX_RESET_REG_PCI_INT_STATUS) &
+		  __raw_readl(base + AR71XX_RESET_REG_PCI_INT_ENABLE);
+
+	if (pending & AR71XX_PCI_INT_DEV0)
+		generic_handle_irq(ATH79_PCI_IRQ(0));
+
+	else if (pending & AR71XX_PCI_INT_DEV1)
+		generic_handle_irq(ATH79_PCI_IRQ(1));
+
+	else if (pending & AR71XX_PCI_INT_DEV2)
+		generic_handle_irq(ATH79_PCI_IRQ(2));
+
+	else if (pending & AR71XX_PCI_INT_CORE)
+		generic_handle_irq(ATH79_PCI_IRQ(4));
+
+	else
+		spurious_interrupt();
+}
+
+static void ar71xx_pci_irq_unmask(struct irq_data *d)
+{
+	unsigned int irq = d->irq - ATH79_PCI_IRQ_BASE;
+	void __iomem *base = ath79_reset_base;
+	u32 t;
+
+	t = __raw_readl(base + AR71XX_RESET_REG_PCI_INT_ENABLE);
+	__raw_writel(t | (1 << irq), base + AR71XX_RESET_REG_PCI_INT_ENABLE);
+
+	/* flush write */
+	__raw_readl(base + AR71XX_RESET_REG_PCI_INT_ENABLE);
+}
+
+static void ar71xx_pci_irq_mask(struct irq_data *d)
+{
+	unsigned int irq = d->irq - ATH79_PCI_IRQ_BASE;
+	void __iomem *base = ath79_reset_base;
+	u32 t;
+
+	t = __raw_readl(base + AR71XX_RESET_REG_PCI_INT_ENABLE);
+	__raw_writel(t & ~(1 << irq), base + AR71XX_RESET_REG_PCI_INT_ENABLE);
+
+	/* flush write */
+	__raw_readl(base + AR71XX_RESET_REG_PCI_INT_ENABLE);
+}
+
+static struct irq_chip ar71xx_pci_irq_chip = {
+	.name		= "AR71XX PCI",
+	.irq_mask	= ar71xx_pci_irq_mask,
+	.irq_unmask	= ar71xx_pci_irq_unmask,
+	.irq_mask_ack	= ar71xx_pci_irq_mask,
+};
+
+static __init void ar71xx_pci_irq_init(void)
+{
+	void __iomem *base = ath79_reset_base;
+	int i;
+
+	__raw_writel(0, base + AR71XX_RESET_REG_PCI_INT_ENABLE);
+	__raw_writel(0, base + AR71XX_RESET_REG_PCI_INT_STATUS);
+
+	BUILD_BUG_ON(ATH79_PCI_IRQ_COUNT < AR71XX_PCI_IRQ_COUNT);
+
+	for (i = ATH79_PCI_IRQ_BASE;
+	     i < ATH79_PCI_IRQ_BASE + AR71XX_PCI_IRQ_COUNT; i++)
+		irq_set_chip_and_handler(i, &ar71xx_pci_irq_chip,
+					 handle_level_irq);
+
+	irq_set_chained_handler(ATH79_CPU_IRQ_IP2, ar71xx_pci_irq_handler);
+}
+
+static __init void ar71xx_pci_reset(void)
+{
+	void __iomem *ddr_base = ath79_ddr_base;
+
+	ath79_device_reset_set(AR71XX_RESET_PCI_BUS | AR71XX_RESET_PCI_CORE);
+	mdelay(100);
+
+	ath79_device_reset_clear(AR71XX_RESET_PCI_BUS | AR71XX_RESET_PCI_CORE);
+	mdelay(100);
+
+	__raw_writel(AR71XX_PCI_WIN0_OFFS, ddr_base + AR71XX_DDR_REG_PCI_WIN0);
+	__raw_writel(AR71XX_PCI_WIN1_OFFS, ddr_base + AR71XX_DDR_REG_PCI_WIN1);
+	__raw_writel(AR71XX_PCI_WIN2_OFFS, ddr_base + AR71XX_DDR_REG_PCI_WIN2);
+	__raw_writel(AR71XX_PCI_WIN3_OFFS, ddr_base + AR71XX_DDR_REG_PCI_WIN3);
+	__raw_writel(AR71XX_PCI_WIN4_OFFS, ddr_base + AR71XX_DDR_REG_PCI_WIN4);
+	__raw_writel(AR71XX_PCI_WIN5_OFFS, ddr_base + AR71XX_DDR_REG_PCI_WIN5);
+	__raw_writel(AR71XX_PCI_WIN6_OFFS, ddr_base + AR71XX_DDR_REG_PCI_WIN6);
+	__raw_writel(AR71XX_PCI_WIN7_OFFS, ddr_base + AR71XX_DDR_REG_PCI_WIN7);
+
+	mdelay(100);
+}
+
+__init int ar71xx_pcibios_init(void)
+{
+	u32 t;
+
+	ar71xx_pcicfg_base = ioremap(AR71XX_PCI_CFG_BASE, AR71XX_PCI_CFG_SIZE);
+	if (ar71xx_pcicfg_base == NULL)
+		return -ENOMEM;
+
+	ar71xx_pci_reset();
+
+	/* setup COMMAND register */
+	t = PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | PCI_COMMAND_INVALIDATE
+	  | PCI_COMMAND_PARITY | PCI_COMMAND_SERR | PCI_COMMAND_FAST_BACK;
+	ar71xx_pci_local_write(PCI_COMMAND, 4, t);
+
+	/* clear bus errors */
+	ar71xx_pci_check_error(1);
+
+	ar71xx_pci_irq_init();
+
+	register_pci_controller(&ar71xx_pci_controller);
+
+	return 0;
+}
diff --git a/arch/mips/pci/pci-ar724x.c b/arch/mips/pci/pci-ar724x.c
new file mode 100644
index 000000000000..414a7459858d
--- /dev/null
+++ b/arch/mips/pci/pci-ar724x.c
@@ -0,0 +1,292 @@
+/*
+ *  Atheros AR724X PCI host controller driver
+ *
+ *  Copyright (C) 2011 René Bolldorf <[email protected]>
+ *  Copyright (C) 2009-2011 Gabor Juhos <[email protected]>
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ */
+
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <asm/mach-ath79/ath79.h>
+#include <asm/mach-ath79/ar71xx_regs.h>
+#include <asm/mach-ath79/pci.h>
+
+#define AR724X_PCI_CFG_BASE	0x14000000
+#define AR724X_PCI_CFG_SIZE	0x1000
+#define AR724X_PCI_CTRL_BASE	(AR71XX_APB_BASE + 0x000f0000)
+#define AR724X_PCI_CTRL_SIZE	0x100
+
+#define AR724X_PCI_MEM_BASE	0x10000000
+#define AR724X_PCI_MEM_SIZE	0x08000000
+
+#define AR724X_PCI_REG_INT_STATUS	0x4c
+#define AR724X_PCI_REG_INT_MASK		0x50
+
+#define AR724X_PCI_INT_DEV0		BIT(14)
+
+#define AR724X_PCI_IRQ_COUNT		1
+
+#define AR7240_BAR0_WAR_VALUE	0xffff
+
+static DEFINE_SPINLOCK(ar724x_pci_lock);
+static void __iomem *ar724x_pci_devcfg_base;
+static void __iomem *ar724x_pci_ctrl_base;
+
+static u32 ar724x_pci_bar0_value;
+static bool ar724x_pci_bar0_is_cached;
+
+static int ar724x_pci_read(struct pci_bus *bus, unsigned int devfn, int where,
+			    int size, uint32_t *value)
+{
+	unsigned long flags;
+	void __iomem *base;
+	u32 data;
+
+	if (devfn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	base = ar724x_pci_devcfg_base;
+
+	spin_lock_irqsave(&ar724x_pci_lock, flags);
+	data = __raw_readl(base + (where & ~3));
+
+	switch (size) {
+	case 1:
+		if (where & 1)
+			data >>= 8;
+		if (where & 2)
+			data >>= 16;
+		data &= 0xff;
+		break;
+	case 2:
+		if (where & 2)
+			data >>= 16;
+		data &= 0xffff;
+		break;
+	case 4:
+		break;
+	default:
+		spin_unlock_irqrestore(&ar724x_pci_lock, flags);
+
+		return PCIBIOS_BAD_REGISTER_NUMBER;
+	}
+
+	spin_unlock_irqrestore(&ar724x_pci_lock, flags);
+
+	if (where == PCI_BASE_ADDRESS_0 && size == 4 &&
+	    ar724x_pci_bar0_is_cached) {
+		/* use the cached value */
+		*value = ar724x_pci_bar0_value;
+	} else {
+		*value = data;
+	}
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int ar724x_pci_write(struct pci_bus *bus, unsigned int devfn, int where,
+			     int size, uint32_t value)
+{
+	unsigned long flags;
+	void __iomem *base;
+	u32 data;
+	int s;
+
+	if (devfn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	if (soc_is_ar7240() && where == PCI_BASE_ADDRESS_0 && size == 4) {
+		if (value != 0xffffffff) {
+			/*
+			 * WAR for a hw issue. If the BAR0 register of the
+			 * device is set to the proper base address, the
+			 * memory space of the device is not accessible.
+			 *
+			 * Cache the intended value so it can be read back,
+			 * and write a SoC specific constant value to the
+			 * BAR0 register in order to make the device memory
+			 * accessible.
+			 */
+			ar724x_pci_bar0_is_cached = true;
+			ar724x_pci_bar0_value = value;
+
+			value = AR7240_BAR0_WAR_VALUE;
+		} else {
+			ar724x_pci_bar0_is_cached = false;
+		}
+	}
+
+	base = ar724x_pci_devcfg_base;
+
+	spin_lock_irqsave(&ar724x_pci_lock, flags);
+	data = __raw_readl(base + (where & ~3));
+
+	switch (size) {
+	case 1:
+		s = ((where & 3) * 8);
+		data &= ~(0xff << s);
+		data |= ((value & 0xff) << s);
+		break;
+	case 2:
+		s = ((where & 2) * 8);
+		data &= ~(0xffff << s);
+		data |= ((value & 0xffff) << s);
+		break;
+	case 4:
+		data = value;
+		break;
+	default:
+		spin_unlock_irqrestore(&ar724x_pci_lock, flags);
+
+		return PCIBIOS_BAD_REGISTER_NUMBER;
+	}
+
+	__raw_writel(data, base + (where & ~3));
+	/* flush write */
+	__raw_readl(base + (where & ~3));
+	spin_unlock_irqrestore(&ar724x_pci_lock, flags);
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static struct pci_ops ar724x_pci_ops = {
+	.read	= ar724x_pci_read,
+	.write	= ar724x_pci_write,
+};
+
+static struct resource ar724x_io_resource = {
+	.name   = "PCI IO space",
+	.start  = 0,
+	.end    = 0,
+	.flags  = IORESOURCE_IO,
+};
+
+static struct resource ar724x_mem_resource = {
+	.name   = "PCI memory space",
+	.start  = AR724X_PCI_MEM_BASE,
+	.end    = AR724X_PCI_MEM_BASE + AR724X_PCI_MEM_SIZE - 1,
+	.flags  = IORESOURCE_MEM,
+};
+
+static struct pci_controller ar724x_pci_controller = {
+	.pci_ops        = &ar724x_pci_ops,
+	.io_resource    = &ar724x_io_resource,
+	.mem_resource	= &ar724x_mem_resource,
+};
+
+static void ar724x_pci_irq_handler(unsigned int irq, struct irq_desc *desc)
+{
+	void __iomem *base;
+	u32 pending;
+
+	base = ar724x_pci_ctrl_base;
+
+	pending = __raw_readl(base + AR724X_PCI_REG_INT_STATUS) &
+		  __raw_readl(base + AR724X_PCI_REG_INT_MASK);
+
+	if (pending & AR724X_PCI_INT_DEV0)
+		generic_handle_irq(ATH79_PCI_IRQ(0));
+
+	else
+		spurious_interrupt();
+}
+
+static void ar724x_pci_irq_unmask(struct irq_data *d)
+{
+	void __iomem *base;
+	u32 t;
+
+	base = ar724x_pci_ctrl_base;
+
+	switch (d->irq) {
+	case ATH79_PCI_IRQ(0):
+		t = __raw_readl(base + AR724X_PCI_REG_INT_MASK);
+		__raw_writel(t | AR724X_PCI_INT_DEV0,
+			     base + AR724X_PCI_REG_INT_MASK);
+		/* flush write */
+		__raw_readl(base + AR724X_PCI_REG_INT_MASK);
+	}
+}
+
+static void ar724x_pci_irq_mask(struct irq_data *d)
+{
+	void __iomem *base;
+	u32 t;
+
+	base = ar724x_pci_ctrl_base;
+
+	switch (d->irq) {
+	case ATH79_PCI_IRQ(0):
+		t = __raw_readl(base + AR724X_PCI_REG_INT_MASK);
+		__raw_writel(t & ~AR724X_PCI_INT_DEV0,
+			     base + AR724X_PCI_REG_INT_MASK);
+
+		/* flush write */
+		__raw_readl(base + AR724X_PCI_REG_INT_MASK);
+
+		t = __raw_readl(base + AR724X_PCI_REG_INT_STATUS);
+		__raw_writel(t | AR724X_PCI_INT_DEV0,
+			     base + AR724X_PCI_REG_INT_STATUS);
+
+		/* flush write */
+		__raw_readl(base + AR724X_PCI_REG_INT_STATUS);
+	}
+}
+
+static struct irq_chip ar724x_pci_irq_chip = {
+	.name		= "AR724X PCI ",
+	.irq_mask	= ar724x_pci_irq_mask,
+	.irq_unmask	= ar724x_pci_irq_unmask,
+	.irq_mask_ack	= ar724x_pci_irq_mask,
+};
+
+static void __init ar724x_pci_irq_init(int irq)
+{
+	void __iomem *base;
+	int i;
+
+	base = ar724x_pci_ctrl_base;
+
+	__raw_writel(0, base + AR724X_PCI_REG_INT_MASK);
+	__raw_writel(0, base + AR724X_PCI_REG_INT_STATUS);
+
+	BUILD_BUG_ON(ATH79_PCI_IRQ_COUNT < AR724X_PCI_IRQ_COUNT);
+
+	for (i = ATH79_PCI_IRQ_BASE;
+	     i < ATH79_PCI_IRQ_BASE + AR724X_PCI_IRQ_COUNT; i++)
+		irq_set_chip_and_handler(i, &ar724x_pci_irq_chip,
+					 handle_level_irq);
+
+	irq_set_chained_handler(irq, ar724x_pci_irq_handler);
+}
+
+int __init ar724x_pcibios_init(int irq)
+{
+	int ret;
+
+	ret = -ENOMEM;
+
+	ar724x_pci_devcfg_base = ioremap(AR724X_PCI_CFG_BASE,
+					 AR724X_PCI_CFG_SIZE);
+	if (ar724x_pci_devcfg_base == NULL)
+		goto err;
+
+	ar724x_pci_ctrl_base = ioremap(AR724X_PCI_CTRL_BASE,
+				       AR724X_PCI_CTRL_SIZE);
+	if (ar724x_pci_ctrl_base == NULL)
+		goto err_unmap_devcfg;
+
+	ar724x_pci_irq_init(irq);
+	register_pci_controller(&ar724x_pci_controller);
+
+	return PCIBIOS_SUCCESSFUL;
+
+err_unmap_devcfg:
+	iounmap(ar724x_pci_devcfg_base);
+err:
+	return ret;
+}
diff --git a/arch/mips/pci/pci-ath724x.c b/arch/mips/pci/pci-ath724x.c
deleted file mode 100644
index a4dd24a4130b..000000000000
--- a/arch/mips/pci/pci-ath724x.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- *  Atheros 724x PCI support
- *
- *  Copyright (C) 2011 René Bolldorf <[email protected]>
- *
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License version 2 as published
- *  by the Free Software Foundation.
- */
-
-#include <linux/pci.h>
-#include <asm/mach-ath79/pci-ath724x.h>
-
-#define reg_read(_phys)		(*(unsigned int *) KSEG1ADDR(_phys))
-#define reg_write(_phys, _val)	((*(unsigned int *) KSEG1ADDR(_phys)) = (_val))
-
-#define ATH724X_PCI_DEV_BASE	0x14000000
-#define ATH724X_PCI_MEM_BASE	0x10000000
-#define ATH724X_PCI_MEM_SIZE	0x08000000
-
-static DEFINE_SPINLOCK(ath724x_pci_lock);
-static struct ath724x_pci_data *pci_data;
-static int pci_data_size;
-
-static int ath724x_pci_read(struct pci_bus *bus, unsigned int devfn, int where,
-			    int size, uint32_t *value)
-{
-	unsigned long flags, addr, tval, mask;
-
-	if (devfn)
-		return PCIBIOS_DEVICE_NOT_FOUND;
-
-	if (where & (size - 1))
-		return PCIBIOS_BAD_REGISTER_NUMBER;
-
-	spin_lock_irqsave(&ath724x_pci_lock, flags);
-
-	switch (size) {
-	case 1:
-		addr = where & ~3;
-		mask = 0xff000000 >> ((where % 4) * 8);
-		tval = reg_read(ATH724X_PCI_DEV_BASE + addr);
-		tval = tval & ~mask;
-		*value = (tval >> ((4 - (where % 4))*8));
-		break;
-	case 2:
-		addr = where & ~3;
-		mask = 0xffff0000 >> ((where % 4)*8);
-		tval = reg_read(ATH724X_PCI_DEV_BASE + addr);
-		tval = tval & ~mask;
-		*value = (tval >> ((4 - (where % 4))*8));
-		break;
-	case 4:
-		*value = reg_read(ATH724X_PCI_DEV_BASE + where);
-		break;
-	default:
-		spin_unlock_irqrestore(&ath724x_pci_lock, flags);
-
-		return PCIBIOS_BAD_REGISTER_NUMBER;
-	}
-
-	spin_unlock_irqrestore(&ath724x_pci_lock, flags);
-
-	return PCIBIOS_SUCCESSFUL;
-}
-
-static int ath724x_pci_write(struct pci_bus *bus, unsigned int devfn, int where,
-			     int size, uint32_t value)
-{
-	unsigned long flags, tval, addr, mask;
-
-	if (devfn)
-		return PCIBIOS_DEVICE_NOT_FOUND;
-
-	if (where & (size - 1))
-		return PCIBIOS_BAD_REGISTER_NUMBER;
-
-	spin_lock_irqsave(&ath724x_pci_lock, flags);
-
-	switch (size) {
-	case 1:
-		addr = (ATH724X_PCI_DEV_BASE + where) & ~3;
-		mask = 0xff000000 >> ((where % 4)*8);
-		tval = reg_read(addr);
-		tval = tval & ~mask;
-		tval |= (value << ((4 - (where % 4))*8)) & mask;
-		reg_write(addr, tval);
-		break;
-	case 2:
-		addr = (ATH724X_PCI_DEV_BASE + where) & ~3;
-		mask = 0xffff0000 >> ((where % 4)*8);
-		tval = reg_read(addr);
-		tval = tval & ~mask;
-		tval |= (value << ((4 - (where % 4))*8)) & mask;
-		reg_write(addr, tval);
-		break;
-	case 4:
-		reg_write((ATH724X_PCI_DEV_BASE + where), value);
-		break;
-	default:
-		spin_unlock_irqrestore(&ath724x_pci_lock, flags);
-
-		return PCIBIOS_BAD_REGISTER_NUMBER;
-	}
-
-	spin_unlock_irqrestore(&ath724x_pci_lock, flags);
-
-	return PCIBIOS_SUCCESSFUL;
-}
-
-static struct pci_ops ath724x_pci_ops = {
-	.read	= ath724x_pci_read,
-	.write	= ath724x_pci_write,
-};
-
-static struct resource ath724x_io_resource = {
-	.name   = "PCI IO space",
-	.start  = 0,
-	.end    = 0,
-	.flags  = IORESOURCE_IO,
-};
-
-static struct resource ath724x_mem_resource = {
-	.name   = "PCI memory space",
-	.start  = ATH724X_PCI_MEM_BASE,
-	.end    = ATH724X_PCI_MEM_BASE + ATH724X_PCI_MEM_SIZE - 1,
-	.flags  = IORESOURCE_MEM,
-};
-
-static struct pci_controller ath724x_pci_controller = {
-	.pci_ops        = &ath724x_pci_ops,
-	.io_resource    = &ath724x_io_resource,
-	.mem_resource	= &ath724x_mem_resource,
-};
-
-void ath724x_pci_add_data(struct ath724x_pci_data *data, int size)
-{
-	pci_data	= data;
-	pci_data_size	= size;
-}
-
-int __init pcibios_map_irq(const struct pci_dev *dev, uint8_t slot, uint8_t pin)
-{
-	unsigned int devfn = dev->devfn;
-	int irq = -1;
-
-	if (devfn > pci_data_size - 1)
-		return irq;
-
-	irq = pci_data[devfn].irq;
-
-	return irq;
-}
-
-int pcibios_plat_dev_init(struct pci_dev *dev)
-{
-	unsigned int devfn = dev->devfn;
-
-	if (devfn > pci_data_size - 1)
-		return PCIBIOS_DEVICE_NOT_FOUND;
-
-	dev->dev.platform_data = pci_data[devfn].pdata;
-
-	return PCIBIOS_SUCCESSFUL;
-}
-
-static int __init ath724x_pcibios_init(void)
-{
-	register_pci_controller(&ath724x_pci_controller);
-
-	return PCIBIOS_SUCCESSFUL;
-}
-
-arch_initcall(ath724x_pcibios_init);
diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c
index 030c77e7926e..ea453532a33c 100644
--- a/arch/mips/pci/pci-lantiq.c
+++ b/arch/mips/pci/pci-lantiq.c
@@ -13,8 +13,12 @@
 #include <linux/delay.h>
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
-#include <linux/export.h>
-#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/clk.h>
+#include <linux/of_platform.h>
+#include <linux/of_gpio.h>
+#include <linux/of_irq.h>
+#include <linux/of_pci.h>
 
 #include <asm/pci.h>
 #include <asm/gpio.h>
@@ -22,17 +26,9 @@
 
 #include <lantiq_soc.h>
 #include <lantiq_irq.h>
-#include <lantiq_platform.h>
 
 #include "pci-lantiq.h"
 
-#define LTQ_PCI_CFG_BASE		0x17000000
-#define LTQ_PCI_CFG_SIZE		0x00008000
-#define LTQ_PCI_MEM_BASE		0x18000000
-#define LTQ_PCI_MEM_SIZE		0x02000000
-#define LTQ_PCI_IO_BASE			0x1AE00000
-#define LTQ_PCI_IO_SIZE			0x00200000
-
 #define PCI_CR_FCI_ADDR_MAP0		0x00C0
 #define PCI_CR_FCI_ADDR_MAP1		0x00C4
 #define PCI_CR_FCI_ADDR_MAP2		0x00C8
@@ -68,79 +64,27 @@
 #define ltq_pci_cfg_w32(x, y)	ltq_w32((x), ltq_pci_mapped_cfg + (y))
 #define ltq_pci_cfg_r32(x)	ltq_r32(ltq_pci_mapped_cfg + (x))
 
-struct ltq_pci_gpio_map {
-	int pin;
-	int alt0;
-	int alt1;
-	int dir;
-	char *name;
-};
-
-/* the pci core can make use of the following gpios */
-static struct ltq_pci_gpio_map ltq_pci_gpio_map[] = {
-	{ 0, 1, 0, 0, "pci-exin0" },
-	{ 1, 1, 0, 0, "pci-exin1" },
-	{ 2, 1, 0, 0, "pci-exin2" },
-	{ 39, 1, 0, 0, "pci-exin3" },
-	{ 10, 1, 0, 0, "pci-exin4" },
-	{ 9, 1, 0, 0, "pci-exin5" },
-	{ 30, 1, 0, 1, "pci-gnt1" },
-	{ 23, 1, 0, 1, "pci-gnt2" },
-	{ 19, 1, 0, 1, "pci-gnt3" },
-	{ 38, 1, 0, 1, "pci-gnt4" },
-	{ 29, 1, 0, 0, "pci-req1" },
-	{ 31, 1, 0, 0, "pci-req2" },
-	{ 3, 1, 0, 0, "pci-req3" },
-	{ 37, 1, 0, 0, "pci-req4" },
-};
-
 __iomem void *ltq_pci_mapped_cfg;
 static __iomem void *ltq_pci_membase;
 
-int (*ltqpci_plat_dev_init)(struct pci_dev *dev) = NULL;
-
-/* Since the PCI REQ pins can be reused for other functionality, make it
-   possible to exclude those from interpretation by the PCI controller */
-static int ltq_pci_req_mask = 0xf;
-
-static int *ltq_pci_irq_map;
-
-struct pci_ops ltq_pci_ops = {
+static int reset_gpio;
+static struct clk *clk_pci, *clk_external;
+static struct resource pci_io_resource;
+static struct resource pci_mem_resource;
+static struct pci_ops pci_ops = {
 	.read	= ltq_pci_read_config_dword,
 	.write	= ltq_pci_write_config_dword
 };
 
-static struct resource pci_io_resource = {
-	.name	= "pci io space",
-	.start	= LTQ_PCI_IO_BASE,
-	.end	= LTQ_PCI_IO_BASE + LTQ_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource pci_mem_resource = {
-	.name	= "pci memory space",
-	.start	= LTQ_PCI_MEM_BASE,
-	.end	= LTQ_PCI_MEM_BASE + LTQ_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-static struct pci_controller ltq_pci_controller = {
-	.pci_ops	= &ltq_pci_ops,
+static struct pci_controller pci_controller = {
+	.pci_ops	= &pci_ops,
 	.mem_resource	= &pci_mem_resource,
 	.mem_offset	= 0x00000000UL,
 	.io_resource	= &pci_io_resource,
 	.io_offset	= 0x00000000UL,
 };
 
-int pcibios_plat_dev_init(struct pci_dev *dev)
-{
-	if (ltqpci_plat_dev_init)
-		return ltqpci_plat_dev_init(dev);
-
-	return 0;
-}
-
-static u32 ltq_calc_bar11mask(void)
+static inline u32 ltq_calc_bar11mask(void)
 {
 	u32 mem, bar11mask;
 
@@ -151,48 +95,42 @@ static u32 ltq_calc_bar11mask(void)
 	return bar11mask;
 }
 
-static void ltq_pci_setup_gpio(int gpio)
-{
-	int i;
-	for (i = 0; i < ARRAY_SIZE(ltq_pci_gpio_map); i++) {
-		if (gpio & (1 << i)) {
-			ltq_gpio_request(ltq_pci_gpio_map[i].pin,
-				ltq_pci_gpio_map[i].alt0,
-				ltq_pci_gpio_map[i].alt1,
-				ltq_pci_gpio_map[i].dir,
-				ltq_pci_gpio_map[i].name);
-		}
-	}
-	ltq_gpio_request(21, 0, 0, 1, "pci-reset");
-	ltq_pci_req_mask = (gpio >> PCI_REQ_SHIFT) & PCI_REQ_MASK;
-}
-
-static int __devinit ltq_pci_startup(struct ltq_pci_data *conf)
+static int __devinit ltq_pci_startup(struct platform_device *pdev)
 {
+	struct device_node *node = pdev->dev.of_node;
+	const __be32 *req_mask, *bus_clk;
 	u32 temp_buffer;
 
-	/* set clock to 33Mhz */
-	if (ltq_is_ar9()) {
-		ltq_cgu_w32(ltq_cgu_r32(LTQ_CGU_IFCCR) & ~0x1f00000, LTQ_CGU_IFCCR);
-		ltq_cgu_w32(ltq_cgu_r32(LTQ_CGU_IFCCR) | 0xe00000, LTQ_CGU_IFCCR);
-	} else {
-		ltq_cgu_w32(ltq_cgu_r32(LTQ_CGU_IFCCR) & ~0xf00000, LTQ_CGU_IFCCR);
-		ltq_cgu_w32(ltq_cgu_r32(LTQ_CGU_IFCCR) | 0x800000, LTQ_CGU_IFCCR);
+	/* get our clocks */
+	clk_pci = clk_get(&pdev->dev, NULL);
+	if (IS_ERR(clk_pci)) {
+		dev_err(&pdev->dev, "failed to get pci clock\n");
+		return PTR_ERR(clk_pci);
 	}
 
-	/* external or internal clock ? */
-	if (conf->clock) {
-		ltq_cgu_w32(ltq_cgu_r32(LTQ_CGU_IFCCR) & ~(1 << 16),
-			LTQ_CGU_IFCCR);
-		ltq_cgu_w32((1 << 30), LTQ_CGU_PCICR);
-	} else {
-		ltq_cgu_w32(ltq_cgu_r32(LTQ_CGU_IFCCR) | (1 << 16),
-			LTQ_CGU_IFCCR);
-		ltq_cgu_w32((1 << 31) | (1 << 30), LTQ_CGU_PCICR);
+	clk_external = clk_get(&pdev->dev, "external");
+	if (IS_ERR(clk_external)) {
+		clk_put(clk_pci);
+		dev_err(&pdev->dev, "failed to get external pci clock\n");
+		return PTR_ERR(clk_external);
 	}
 
-	/* setup pci clock and gpis used by pci */
-	ltq_pci_setup_gpio(conf->gpio);
+	/* read the bus speed that we want */
+	bus_clk = of_get_property(node, "lantiq,bus-clock", NULL);
+	if (bus_clk)
+		clk_set_rate(clk_pci, *bus_clk);
+
+	/* and enable the clocks */
+	clk_enable(clk_pci);
+	if (of_find_property(node, "lantiq,external-clock", NULL))
+		clk_enable(clk_external);
+	else
+		clk_disable(clk_external);
+
+	/* setup reset gpio used by pci */
+	reset_gpio = of_get_named_gpio(node, "gpio-reset", 0);
+	if (reset_gpio > 0)
+		devm_gpio_request(&pdev->dev, reset_gpio, "pci-reset");
 
 	/* enable auto-switching between PCI and EBU */
 	ltq_pci_w32(0xa, PCI_CR_CLK_CTRL);
@@ -205,7 +143,12 @@ static int __devinit ltq_pci_startup(struct ltq_pci_data *conf)
 
 	/* enable external 2 PCI masters */
 	temp_buffer = ltq_pci_r32(PCI_CR_PC_ARB);
-	temp_buffer &= (~(ltq_pci_req_mask << 16));
+	/* setup the request mask */
+	req_mask = of_get_property(node, "req-mask", NULL);
+	if (req_mask)
+		temp_buffer &= ~((*req_mask & 0xf) << 16);
+	else
+		temp_buffer &= ~0xf0000;
 	/* enable internal arbiter */
 	temp_buffer |= (1 << INTERNAL_ARB_ENABLE_BIT);
 	/* enable internal PCI master reqest */
@@ -249,47 +192,55 @@ static int __devinit ltq_pci_startup(struct ltq_pci_data *conf)
 	ltq_ebu_w32(ltq_ebu_r32(LTQ_EBU_PCC_IEN) | 0x10, LTQ_EBU_PCC_IEN);
 
 	/* toggle reset pin */
-	__gpio_set_value(21, 0);
-	wmb();
-	mdelay(1);
-	__gpio_set_value(21, 1);
-	return 0;
-}
-
-int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
-{
-	if (ltq_pci_irq_map[slot])
-		return ltq_pci_irq_map[slot];
-	printk(KERN_ERR "lq_pci: trying to map irq for unknown slot %d\n",
-		slot);
-
+	if (reset_gpio > 0) {
+		__gpio_set_value(reset_gpio, 0);
+		wmb();
+		mdelay(1);
+		__gpio_set_value(reset_gpio, 1);
+	}
 	return 0;
 }
 
 static int __devinit ltq_pci_probe(struct platform_device *pdev)
 {
-	struct ltq_pci_data *ltq_pci_data =
-		(struct ltq_pci_data *) pdev->dev.platform_data;
+	struct resource *res_cfg, *res_bridge;
 
 	pci_clear_flags(PCI_PROBE_ONLY);
-	ltq_pci_irq_map = ltq_pci_data->irq;
-	ltq_pci_membase = ioremap_nocache(PCI_CR_BASE_ADDR, PCI_CR_SIZE);
-	ltq_pci_mapped_cfg =
-		ioremap_nocache(LTQ_PCI_CFG_BASE, LTQ_PCI_CFG_BASE);
-	ltq_pci_controller.io_map_base =
-		(unsigned long)ioremap(LTQ_PCI_IO_BASE, LTQ_PCI_IO_SIZE - 1);
-	ltq_pci_startup(ltq_pci_data);
-	register_pci_controller(&ltq_pci_controller);
 
+	res_cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	res_bridge = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	if (!res_cfg || !res_bridge) {
+		dev_err(&pdev->dev, "missing memory reources\n");
+		return -EINVAL;
+	}
+
+	ltq_pci_membase = devm_request_and_ioremap(&pdev->dev, res_bridge);
+	ltq_pci_mapped_cfg = devm_request_and_ioremap(&pdev->dev, res_cfg);
+
+	if (!ltq_pci_membase || !ltq_pci_mapped_cfg) {
+		dev_err(&pdev->dev, "failed to remap resources\n");
+		return -ENOMEM;
+	}
+
+	ltq_pci_startup(pdev);
+
+	pci_load_of_ranges(&pci_controller, pdev->dev.of_node);
+	register_pci_controller(&pci_controller);
 	return 0;
 }
 
-static struct platform_driver
-ltq_pci_driver = {
+static const struct of_device_id ltq_pci_match[] = {
+	{ .compatible = "lantiq,pci-xway" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, ltq_pci_match);
+
+static struct platform_driver ltq_pci_driver = {
 	.probe = ltq_pci_probe,
 	.driver = {
-		.name = "ltq_pci",
+		.name = "pci-xway",
 		.owner = THIS_MODULE,
+		.of_match_table = ltq_pci_match,
 	},
 };
 
@@ -297,7 +248,7 @@ int __init pcibios_init(void)
 {
 	int ret = platform_driver_register(&ltq_pci_driver);
 	if (ret)
-		printk(KERN_INFO "ltq_pci: Error registering platfom driver!");
+		pr_info("pci-xway: Error registering platform driver!");
 	return ret;
 }
 
diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c
index 0514866fa925..271e8c4a54c7 100644
--- a/arch/mips/pci/pci.c
+++ b/arch/mips/pci/pci.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/pci.h>
+#include <linux/of_address.h>
 
 #include <asm/cpu-info.h>
 
@@ -114,9 +115,63 @@ static void __devinit pcibios_scanbus(struct pci_controller *hose)
 			pci_bus_assign_resources(bus);
 			pci_enable_bridges(bus);
 		}
+		bus->dev.of_node = hose->of_node;
 	}
 }
 
+#ifdef CONFIG_OF
+void __devinit pci_load_of_ranges(struct pci_controller *hose,
+				struct device_node *node)
+{
+	const __be32 *ranges;
+	int rlen;
+	int pna = of_n_addr_cells(node);
+	int np = pna + 5;
+
+	pr_info("PCI host bridge %s ranges:\n", node->full_name);
+	ranges = of_get_property(node, "ranges", &rlen);
+	if (ranges == NULL)
+		return;
+	hose->of_node = node;
+
+	while ((rlen -= np * 4) >= 0) {
+		u32 pci_space;
+		struct resource *res = NULL;
+		u64 addr, size;
+
+		pci_space = be32_to_cpup(&ranges[0]);
+		addr = of_translate_address(node, ranges + 3);
+		size = of_read_number(ranges + pna + 3, 2);
+		ranges += np;
+		switch ((pci_space >> 24) & 0x3) {
+		case 1:		/* PCI IO space */
+			pr_info("  IO 0x%016llx..0x%016llx\n",
+					addr, addr + size - 1);
+			hose->io_map_base =
+				(unsigned long)ioremap(addr, size);
+			res = hose->io_resource;
+			res->flags = IORESOURCE_IO;
+			break;
+		case 2:		/* PCI Memory space */
+		case 3:		/* PCI 64 bits Memory space */
+			pr_info(" MEM 0x%016llx..0x%016llx\n",
+					addr, addr + size - 1);
+			res = hose->mem_resource;
+			res->flags = IORESOURCE_MEM;
+			break;
+		}
+		if (res != NULL) {
+			res->start = addr;
+			res->name = node->full_name;
+			res->end = res->start + size - 1;
+			res->parent = NULL;
+			res->sibling = NULL;
+			res->child = NULL;
+		}
+	}
+}
+#endif
+
 static DEFINE_MUTEX(pci_scan_mutex);
 
 void __devinit register_pci_controller(struct pci_controller *hose)
diff --git a/arch/mips/pmc-sierra/yosemite/Makefile b/arch/mips/pmc-sierra/yosemite/Makefile
index 02f5fb94ea28..5af95ec3319d 100644
--- a/arch/mips/pmc-sierra/yosemite/Makefile
+++ b/arch/mips/pmc-sierra/yosemite/Makefile
@@ -5,5 +5,3 @@
 obj-y    += irq.o prom.o py-console.o setup.o
 
 obj-$(CONFIG_SMP)		+= smp.o
-
-ccflags-y := -Werror
diff --git a/arch/mips/pmc-sierra/yosemite/setup.c b/arch/mips/pmc-sierra/yosemite/setup.c
index 3498ac9c35af..b6472fc88a99 100644
--- a/arch/mips/pmc-sierra/yosemite/setup.c
+++ b/arch/mips/pmc-sierra/yosemite/setup.c
@@ -27,6 +27,7 @@
 #include <linux/bcd.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/export.h>
 #include <linux/types.h>
 #include <linux/mm.h>
 #include <linux/bootmem.h>
diff --git a/arch/mips/powertv/Makefile b/arch/mips/powertv/Makefile
index 348d2e850ef5..39ca9f8d63ae 100644
--- a/arch/mips/powertv/Makefile
+++ b/arch/mips/powertv/Makefile
@@ -27,5 +27,3 @@ obj-y += init.o ioremap.o memory.o powertv_setup.o reset.o time.o \
 	asic/ pci/
 
 obj-$(CONFIG_USB) += powertv-usb.o
-
-ccflags-y := -Wall
diff --git a/arch/mips/powertv/asic/Makefile b/arch/mips/powertv/asic/Makefile
index d810a33182a4..35dcc53eb25f 100644
--- a/arch/mips/powertv/asic/Makefile
+++ b/arch/mips/powertv/asic/Makefile
@@ -19,5 +19,3 @@
 obj-y += asic-calliope.o asic-cronus.o asic-gaia.o asic-zeus.o \
 	asic_devices.o asic_int.o irq_asic.o prealloc-calliope.o \
 	prealloc-cronus.o prealloc-cronuslite.o prealloc-gaia.o prealloc-zeus.o
-
-ccflags-y := -Wall -Werror
diff --git a/arch/mips/powertv/pci/Makefile b/arch/mips/powertv/pci/Makefile
index 5783201cd2c8..2610a6af5b2c 100644
--- a/arch/mips/powertv/pci/Makefile
+++ b/arch/mips/powertv/pci/Makefile
@@ -17,5 +17,3 @@
 #
 
 obj-$(CONFIG_PCI)	+= fixup-powertv.o
-
-ccflags-y := -Wall -Werror
diff --git a/arch/mips/rb532/devices.c b/arch/mips/rb532/devices.c
index a969eb826634..ea774285e6c5 100644
--- a/arch/mips/rb532/devices.c
+++ b/arch/mips/rb532/devices.c
@@ -15,6 +15,7 @@
  *  GNU General Public License for more details.
  */
 #include <linux/kernel.h>
+#include <linux/export.h>
 #include <linux/init.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
diff --git a/arch/mips/sni/setup.c b/arch/mips/sni/setup.c
index d16b462154c3..413f17f8e892 100644
--- a/arch/mips/sni/setup.c
+++ b/arch/mips/sni/setup.c
@@ -10,6 +10,7 @@
  */
 #include <linux/eisa.h>
 #include <linux/init.h>
+#include <linux/export.h>
 #include <linux/console.h>
 #include <linux/fb.h>
 #include <linux/screen_info.h>
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 0e9dec6cadd1..e5287d8517aa 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -1,4 +1,3 @@
-
 obj-$(CONFIG_KVM) += kvm/
 
 # Xen paravirtualization support
@@ -7,6 +6,7 @@ obj-$(CONFIG_XEN) += xen/
 # lguest paravirtualization support
 obj-$(CONFIG_LGUEST_GUEST) += lguest/
 
+obj-y += realmode/
 obj-y += kernel/
 obj-y += mm/
 
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 610001d385dd..724aa441de7d 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -29,7 +29,6 @@
 #include <asm/processor.h>
 #include <asm/mmu.h>
 #include <asm/mpspec.h>
-#include <asm/trampoline.h>
 
 #define COMPILER_DEPENDENT_INT64   long long
 #define COMPILER_DEPENDENT_UINT64  unsigned long long
@@ -118,7 +117,6 @@ static inline void acpi_disable_pci(void)
 extern int acpi_suspend_lowlevel(void);
 
 extern const unsigned char acpi_wakeup_code[];
-#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code)))
 
 /* early initialization routine */
 extern void acpi_reserve_wakeup_memory(void);
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index effff47a3c82..43876f16caf1 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -31,6 +31,56 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
 	ptep->pte_low = pte.pte_low;
 }
 
+#define pmd_read_atomic pmd_read_atomic
+/*
+ * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
+ * a "*pmdp" dereference done by gcc. Problem is, in certain places
+ * where pte_offset_map_lock is called, concurrent page faults are
+ * allowed, if the mmap_sem is hold for reading. An example is mincore
+ * vs page faults vs MADV_DONTNEED. On the page fault side
+ * pmd_populate rightfully does a set_64bit, but if we're reading the
+ * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
+ * because gcc will not read the 64bit of the pmd atomically. To fix
+ * this all places running pmd_offset_map_lock() while holding the
+ * mmap_sem in read mode, shall read the pmdp pointer using this
+ * function to know if the pmd is null nor not, and in turn to know if
+ * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
+ * operations.
+ *
+ * Without THP if the mmap_sem is hold for reading, the
+ * pmd can only transition from null to not null while pmd_read_atomic runs.
+ * So there's no need of literally reading it atomically.
+ *
+ * With THP if the mmap_sem is hold for reading, the pmd can become
+ * THP or null or point to a pte (and in turn become "stable") at any
+ * time under pmd_read_atomic, so it's mandatory to read it atomically
+ * with cmpxchg8b.
+ */
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+	pmdval_t ret;
+	u32 *tmp = (u32 *)pmdp;
+
+	ret = (pmdval_t) (*tmp);
+	if (ret) {
+		/*
+		 * If the low part is null, we must not read the high part
+		 * or we can end up with a partial pmd.
+		 */
+		smp_rmb();
+		ret |= ((pmdval_t)*(tmp + 1)) << 32;
+	}
+
+	return (pmd_t) { ret };
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+	return (pmd_t) { atomic64_read((atomic64_t *)pmdp) };
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
 	set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7745b257f035..39bc5777211a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -544,13 +544,16 @@ static inline void load_sp0(struct tss_struct *tss,
  * enable), so that any CPU's that boot up
  * after us can get the correct flags.
  */
-extern unsigned long		mmu_cr4_features;
+extern unsigned long mmu_cr4_features;
+extern u32 *trampoline_cr4_features;
 
 static inline void set_in_cr4(unsigned long mask)
 {
 	unsigned long cr4;
 
 	mmu_cr4_features |= mask;
+	if (trampoline_cr4_features)
+		*trampoline_cr4_features = mmu_cr4_features;
 	cr4 = read_cr4();
 	cr4 |= mask;
 	write_cr4(cr4);
@@ -561,6 +564,8 @@ static inline void clear_in_cr4(unsigned long mask)
 	unsigned long cr4;
 
 	mmu_cr4_features &= ~mask;
+	if (trampoline_cr4_features)
+		*trampoline_cr4_features = mmu_cr4_features;
 	cr4 = read_cr4();
 	cr4 &= ~mask;
 	write_cr4(cr4);
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
new file mode 100644
index 000000000000..fce3f4ae5bd6
--- /dev/null
+++ b/arch/x86/include/asm/realmode.h
@@ -0,0 +1,62 @@
+#ifndef _ARCH_X86_REALMODE_H
+#define _ARCH_X86_REALMODE_H
+
+#include <linux/types.h>
+#include <asm/io.h>
+
+/* This must match data at realmode.S */
+struct real_mode_header {
+	u32	text_start;
+	u32	ro_end;
+	/* SMP trampoline */
+	u32	trampoline_start;
+	u32	trampoline_status;
+	u32	trampoline_header;
+#ifdef CONFIG_X86_64
+	u32	trampoline_pgd;
+#endif
+	/* ACPI S3 wakeup */
+#ifdef CONFIG_ACPI_SLEEP
+	u32	wakeup_start;
+	u32	wakeup_header;
+#endif
+	/* APM/BIOS reboot */
+#ifdef CONFIG_X86_32
+	u32	machine_real_restart_asm;
+#endif
+};
+
+/* This must match data at trampoline_32/64.S */
+struct trampoline_header {
+#ifdef CONFIG_X86_32
+	u32 start;
+	u16 gdt_pad;
+	u16 gdt_limit;
+	u32 gdt_base;
+#else
+	u64 start;
+	u64 efer;
+	u32 cr4;
+#endif
+};
+
+extern struct real_mode_header *real_mode_header;
+extern unsigned char real_mode_blob_end[];
+
+extern unsigned long init_rsp;
+extern unsigned long initial_code;
+extern unsigned long initial_gs;
+
+extern unsigned char real_mode_blob[];
+extern unsigned char real_mode_relocs[];
+
+#ifdef CONFIG_X86_32
+extern unsigned char startup_32_smp[];
+extern unsigned char boot_gdt[];
+#else
+extern unsigned char secondary_startup_64[];
+#endif
+
+extern void __init setup_real_mode(void);
+
+#endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h
new file mode 100644
index 000000000000..e9d32df89ccc
--- /dev/null
+++ b/arch/x86/include/asm/sta2x11.h
@@ -0,0 +1,12 @@
+/*
+ * Header file for STMicroelectronics ConneXt (STA2X11) IOHub
+ */
+#ifndef __ASM_STA2X11_H
+#define __ASM_STA2X11_H
+
+#include <linux/pci.h>
+
+/* This needs to be called from the MFD to configure its sub-devices */
+struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev);
+
+#endif /* __ASM_STA2X11_H */
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
deleted file mode 100644
index feca3118a73b..000000000000
--- a/arch/x86/include/asm/trampoline.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef _ASM_X86_TRAMPOLINE_H
-#define _ASM_X86_TRAMPOLINE_H
-
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-#include <asm/io.h>
-
-/*
- * Trampoline 80x86 program as an array.  These are in the init rodata
- * segment, but that's okay, because we only care about the relative
- * addresses of the symbols.
- */
-extern const unsigned char x86_trampoline_start [];
-extern const unsigned char x86_trampoline_end   [];
-extern unsigned char *x86_trampoline_base;
-
-extern unsigned long init_rsp;
-extern unsigned long initial_code;
-extern unsigned long initial_gs;
-
-extern void __init setup_trampolines(void);
-
-extern const unsigned char trampoline_data[];
-extern const unsigned char trampoline_status[];
-
-#define TRAMPOLINE_SYM(x)						\
-	((void *)(x86_trampoline_base +					\
-		  ((const unsigned char *)(x) - x86_trampoline_start)))
-
-/* Address of the SMP trampoline */
-static inline unsigned long trampoline_address(void)
-{
-	return virt_to_phys(TRAMPOLINE_SYM(trampoline_data));
-}
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_TRAMPOLINE_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9bba5b79902b..8215e5652d97 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -35,7 +35,6 @@ obj-y			+= tsc.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 
-obj-y				+= trampoline.o trampoline_$(BITS).o
 obj-y				+= process.o
 obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
@@ -48,7 +47,6 @@ obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-y				+= cpu/
 obj-y				+= acpi/
 obj-y				+= reboot.o
-obj-$(CONFIG_X86_32)		+= reboot_32.o
 obj-$(CONFIG_X86_MSR)		+= msr.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
 obj-$(CONFIG_PCI)		+= early-quirks.o
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 6f35260bb3ef..163b22581472 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,14 +1,7 @@
-subdir-				:= realmode
-
 obj-$(CONFIG_ACPI)		+= boot.o
-obj-$(CONFIG_ACPI_SLEEP)	+= sleep.o wakeup_rm.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_SLEEP)	+= sleep.o wakeup_$(BITS).o
 
 ifneq ($(CONFIG_ACPI_PROCESSOR),)
 obj-y				+= cstate.o
 endif
 
-$(obj)/wakeup_rm.o:    $(obj)/realmode/wakeup.bin
-
-$(obj)/realmode/wakeup.bin: FORCE
-	$(Q)$(MAKE) $(build)=$(obj)/realmode
-
diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore
deleted file mode 100644
index 58f1f48a58f8..000000000000
--- a/arch/x86/kernel/acpi/realmode/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-wakeup.bin
-wakeup.elf
-wakeup.lds
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
deleted file mode 100644
index 6a564ac67ef5..000000000000
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
-#
-# arch/x86/kernel/acpi/realmode/Makefile
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-
-always		:= wakeup.bin
-targets		:= wakeup.elf wakeup.lds
-
-wakeup-y	+= wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
-
-# The link order of the video-*.o modules can matter.  In particular,
-# video-vga.o *must* be listed first, followed by video-vesa.o.
-# Hardware-specific drivers should follow in the order they should be
-# probed, and video-bios.o should typically be last.
-wakeup-y	+= video-vga.o
-wakeup-y	+= video-vesa.o
-wakeup-y	+= video-bios.o
-
-targets		+= $(wakeup-y)
-
-bootsrc		:= $(src)/../../../boot
-
-# ---------------------------------------------------------------------------
-
-# How to compile the 16-bit code.  Note we always compile for -march=i386,
-# that way we can complain to the user if the CPU is insufficient.
-# Compile with _SETUP since this is similar to the boot-time setup code.
-KBUILD_CFLAGS	:= $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
-		   -I$(srctree)/$(bootsrc) \
-		   $(cflags-y) \
-		   -Wall -Wstrict-prototypes \
-		   -march=i386 -mregparm=3 \
-		   -include $(srctree)/$(bootsrc)/code16gcc.h \
-		   -fno-strict-aliasing -fomit-frame-pointer \
-		   $(call cc-option, -ffreestanding) \
-		   $(call cc-option, -fno-toplevel-reorder,\
-			$(call cc-option, -fno-unit-at-a-time)) \
-		   $(call cc-option, -fno-stack-protector) \
-		   $(call cc-option, -mpreferred-stack-boundary=2)
-KBUILD_CFLAGS	+= $(call cc-option, -m32)
-KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__
-GCOV_PROFILE := n
-
-WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
-
-LDFLAGS_wakeup.elf	:= -T
-
-CPPFLAGS_wakeup.lds += -P -C
-
-$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE
-	$(call if_changed,ld)
-
-OBJCOPYFLAGS_wakeup.bin	:= -O binary
-
-$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE
-	$(call if_changed,objcopy)
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
deleted file mode 100644
index f51eb0bb56ce..000000000000
--- a/arch/x86/kernel/acpi/realmode/bioscall.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S
deleted file mode 100644
index dc59ebee69d8..000000000000
--- a/arch/x86/kernel/acpi/realmode/copy.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/copy.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
deleted file mode 100644
index 6206033ba202..000000000000
--- a/arch/x86/kernel/acpi/realmode/regs.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c
deleted file mode 100644
index 7deabc144a27..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-bios.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-bios.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c
deleted file mode 100644
index 328ad209f113..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-mode.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-mode.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c
deleted file mode 100644
index 9dbb9672226a..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-vesa.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vesa.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c
deleted file mode 100644
index bcc81255f374..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-vga.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vga.c"
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
deleted file mode 100644
index d4f8010a5b1b..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * wakeup.ld
- *
- * Linker script for the real-mode wakeup code
- */
-#undef i386
-#include "wakeup.h"
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(_start)
-
-SECTIONS
-{
-	. = 0;
-	.jump	: {
-		*(.jump)
-	} = 0x90909090
-
-	. = WAKEUP_HEADER_OFFSET;
-	.header : {
-		*(.header)
-	}
-
-	. = ALIGN(16);
-	.text : {
-		 *(.text*)
-	} = 0x90909090
-
-	. = ALIGN(16);
-	.rodata : {
-		*(.rodata*)
-	}
-
-	.videocards : {
-		video_cards = .;
-		*(.videocards)
-		video_cards_end = .;
-	}
-
-	. = ALIGN(16);
-	.data : {
-		 *(.data*)
-	}
-
-	. = ALIGN(16);
-	.bss :	{
-		__bss_start = .;
-		*(.bss)
-		__bss_end = .;
-	}
-
-	.signature : {
-		*(.signature)
-	}
-
-	_end = .;
-
-	/DISCARD/ : {
-		*(.note*)
-	}
-}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 146a49c763a4..95bf99de9058 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -14,8 +14,9 @@
 #include <asm/desc.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
+#include <asm/realmode.h>
 
-#include "realmode/wakeup.h"
+#include "../../realmode/rm/wakeup.h"
 #include "sleep.h"
 
 unsigned long acpi_realmode_flags;
@@ -36,13 +37,9 @@ asmlinkage void acpi_enter_s3(void)
  */
 int acpi_suspend_lowlevel(void)
 {
-	struct wakeup_header *header;
-	/* address in low memory of the wakeup routine. */
-	char *acpi_realmode;
+	struct wakeup_header *header =
+		(struct wakeup_header *) __va(real_mode_header->wakeup_header);
 
-	acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
-
-	header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
 	if (header->signature != WAKEUP_HEADER_SIGNATURE) {
 		printk(KERN_ERR "wakeup header does not match\n");
 		return -EINVAL;
@@ -50,27 +47,6 @@ int acpi_suspend_lowlevel(void)
 
 	header->video_mode = saved_video_mode;
 
-	header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
-
-	/*
-	 * Set up the wakeup GDT.  We set these up as Big Real Mode,
-	 * that is, with limits set to 4 GB.  At least the Lenovo
-	 * Thinkpad X61 is known to need this for the video BIOS
-	 * initialization quirk to work; this is likely to also
-	 * be the case for other laptops or integrated video devices.
-	 */
-
-	/* GDT[0]: GDT self-pointer */
-	header->wakeup_gdt[0] =
-		(u64)(sizeof(header->wakeup_gdt) - 1) +
-		((u64)__pa(&header->wakeup_gdt) << 16);
-	/* GDT[1]: big real mode-like code segment */
-	header->wakeup_gdt[1] =
-		GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
-	/* GDT[2]: big real mode-like data segment */
-	header->wakeup_gdt[2] =
-		GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
-
 #ifndef CONFIG_64BIT
 	store_gdt((struct desc_ptr *)&header->pmode_gdt);
 
@@ -95,7 +71,6 @@ int acpi_suspend_lowlevel(void)
 	header->pmode_cr3 = (u32)__pa(&initial_page_table);
 	saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
-	header->trampoline_segment = trampoline_address() >> 4;
 #ifdef CONFIG_SMP
 	stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
 	early_gdt_descr.address =
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index d68677a2a010..5653a5791ec9 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -2,8 +2,8 @@
  *	Variables and functions used by the code in sleep.c
  */
 
-#include <asm/trampoline.h>
 #include <linux/linkage.h>
+#include <asm/realmode.h>
 
 extern unsigned long saved_video_mode;
 extern long saved_magic;
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
deleted file mode 100644
index 63b8ab524f2c..000000000000
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Wrapper script for the realmode binary as a transport object
- * before copying to low memory.
- */
-#include <asm/page_types.h>
-
-	.section ".x86_trampoline","a"
-	.balign PAGE_SIZE
-	.globl	acpi_wakeup_code
-acpi_wakeup_code:
-	.incbin	"arch/x86/kernel/acpi/realmode/wakeup.bin"
-	.size	acpi_wakeup_code, .-acpi_wakeup_code
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 62d61e9976eb..41857970517f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -113,7 +113,9 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
 	int x = e820x->nr_map;
 
 	if (x >= ARRAY_SIZE(e820x->map)) {
-		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+		printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
+		       (unsigned long long) start,
+		       (unsigned long long) (start + size - 1));
 		return;
 	}
 
@@ -133,19 +135,19 @@ static void __init e820_print_type(u32 type)
 	switch (type) {
 	case E820_RAM:
 	case E820_RESERVED_KERN:
-		printk(KERN_CONT "(usable)");
+		printk(KERN_CONT "usable");
 		break;
 	case E820_RESERVED:
-		printk(KERN_CONT "(reserved)");
+		printk(KERN_CONT "reserved");
 		break;
 	case E820_ACPI:
-		printk(KERN_CONT "(ACPI data)");
+		printk(KERN_CONT "ACPI data");
 		break;
 	case E820_NVS:
-		printk(KERN_CONT "(ACPI NVS)");
+		printk(KERN_CONT "ACPI NVS");
 		break;
 	case E820_UNUSABLE:
-		printk(KERN_CONT "(unusable)");
+		printk(KERN_CONT "unusable");
 		break;
 	default:
 		printk(KERN_CONT "type %u", type);
@@ -158,10 +160,10 @@ void __init e820_print_map(char *who)
 	int i;
 
 	for (i = 0; i < e820.nr_map; i++) {
-		printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+		printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
 		       (unsigned long long) e820.map[i].addr,
 		       (unsigned long long)
-		       (e820.map[i].addr + e820.map[i].size));
+		       (e820.map[i].addr + e820.map[i].size - 1));
 		e820_print_type(e820.map[i].type);
 		printk(KERN_CONT "\n");
 	}
@@ -428,9 +430,8 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
 		size = ULLONG_MAX - start;
 
 	end = start + size;
-	printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
-		       (unsigned long long) start,
-		       (unsigned long long) end);
+	printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ",
+	       (unsigned long long) start, (unsigned long long) (end - 1));
 	e820_print_type(old_type);
 	printk(KERN_CONT " ==> ");
 	e820_print_type(new_type);
@@ -509,9 +510,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
 		size = ULLONG_MAX - start;
 
 	end = start + size;
-	printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
-		       (unsigned long long) start,
-		       (unsigned long long) end);
+	printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ",
+	       (unsigned long long) start, (unsigned long long) (end - 1));
 	if (checktype)
 		e820_print_type(old_type);
 	printk(KERN_CONT "\n");
@@ -567,7 +567,7 @@ void __init update_e820(void)
 	if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
 		return;
 	e820.nr_map = nr_map;
-	printk(KERN_INFO "modified physical RAM map:\n");
+	printk(KERN_INFO "e820: modified physical RAM map:\n");
 	e820_print_map("modified");
 }
 static void __init update_e820_saved(void)
@@ -637,8 +637,8 @@ __init void e820_setup_gap(void)
 	if (!found) {
 		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
 		printk(KERN_ERR
-	"PCI: Warning: Cannot find a gap in the 32bit address range\n"
-	"PCI: Unassigned devices with 32bit resource registers may break!\n");
+	"e820: cannot find a gap in the 32bit address range\n"
+	"e820: PCI devices with unassigned 32bit BARs may break!\n");
 	}
 #endif
 
@@ -648,8 +648,8 @@ __init void e820_setup_gap(void)
 	pci_mem_start = gapstart;
 
 	printk(KERN_INFO
-	       "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
-	       pci_mem_start, gapstart, gapsize);
+	       "e820: [mem %#010lx-%#010lx] available for PCI devices\n",
+	       gapstart, gapstart + gapsize - 1);
 }
 
 /**
@@ -667,7 +667,7 @@ void __init parse_e820_ext(struct setup_data *sdata)
 	extmap = (struct e820entry *)(sdata->data);
 	__append_e820_map(extmap, entries);
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-	printk(KERN_INFO "extended physical RAM map:\n");
+	printk(KERN_INFO "e820: extended physical RAM map:\n");
 	e820_print_map("extended");
 }
 
@@ -734,7 +734,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
 	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 	if (addr) {
 		e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
-		printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
+		printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n");
 		update_e820_saved();
 	}
 
@@ -784,7 +784,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 	if (last_pfn > max_arch_pfn)
 		last_pfn = max_arch_pfn;
 
-	printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
+	printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
 			 last_pfn, max_arch_pfn);
 	return last_pfn;
 }
@@ -888,7 +888,7 @@ void __init finish_e820_parsing(void)
 			early_panic("Invalid user supplied memory map");
 		e820.nr_map = nr;
 
-		printk(KERN_INFO "user-defined physical RAM map:\n");
+		printk(KERN_INFO "e820: user-defined physical RAM map:\n");
 		e820_print_map("user");
 	}
 }
@@ -996,8 +996,9 @@ void __init e820_reserve_resources_late(void)
 			end = MAX_RESOURCE_SIZE;
 		if (start >= end)
 			continue;
-		printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
-			       start, end);
+		printk(KERN_DEBUG
+		       "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n",
+		       start, end);
 		reserve_region_with_split(&iomem_resource, start, end,
 					  "RAM buffer");
 	}
@@ -1047,7 +1048,7 @@ void __init setup_memory_map(void)
 
 	who = x86_init.resources.memory_setup();
 	memcpy(&e820_saved, &e820, sizeof(struct e820map));
-	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+	printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
 	e820_print_map(who);
 }
 
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 51ff18616d50..c18f59d10101 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -14,7 +14,6 @@
 #include <asm/sections.h>
 #include <asm/e820.h>
 #include <asm/page.h>
-#include <asm/trampoline.h>
 #include <asm/apic.h>
 #include <asm/io_apic.h>
 #include <asm/bios_ebda.h>
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 3a3b779f41d3..037df57a99ac 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,7 +24,6 @@
 #include <asm/sections.h>
 #include <asm/kdebug.h>
 #include <asm/e820.h>
-#include <asm/trampoline.h>
 #include <asm/bios_ebda.h>
 
 static void __init zap_identity_mappings(void)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 463c9797ca6a..d42ab17b7397 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -274,10 +274,7 @@ num_subarch_entries = (. - subarch_entries) / 4
  * If cpu hotplug is not supported then this code can go in init section
  * which will be freed later
  */
-
 __CPUINIT
-
-#ifdef CONFIG_SMP
 ENTRY(startup_32_smp)
 	cld
 	movl $(__BOOT_DS),%eax
@@ -288,7 +285,7 @@ ENTRY(startup_32_smp)
 	movl pa(stack_start),%ecx
 	movl %eax,%ss
 	leal -__PAGE_OFFSET(%ecx),%esp
-#endif /* CONFIG_SMP */
+
 default_entry:
 
 /*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 7a40f2447321..94bf9cc2c7ee 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -139,10 +139,6 @@ ident_complete:
 	/* Fixup phys_base */
 	addq	%rbp, phys_base(%rip)
 
-	/* Fixup trampoline */
-	addq	%rbp, trampoline_level4_pgt + 0(%rip)
-	addq	%rbp, trampoline_level4_pgt + (511*8)(%rip)
-
 	/* Due to ENTRY(), sometimes the empty space gets filled with
 	 * zeros. Better take a jmp than relying on empty space being
 	 * filled with 0x90 (nop)
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b02d4dd6b8a3..d2b56489d70f 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,7 +27,6 @@
 #include <asm/proto.h>
 #include <asm/bios_ebda.h>
 #include <asm/e820.h>
-#include <asm/trampoline.h>
 #include <asm/setup.h>
 #include <asm/smp.h>
 
@@ -568,8 +567,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
 	struct mpf_intel *mpf;
 	unsigned long mem;
 
-	apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
-			bp, length);
+	apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
+		    base, base + length - 1);
 	BUILD_BUG_ON(sizeof(*mpf) != 16);
 
 	while (length > 0) {
@@ -584,8 +583,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
 #endif
 			mpf_found = mpf;
 
-			printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
-			       mpf, (u64)virt_to_phys(mpf));
+			printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
+			       (unsigned long long) virt_to_phys(mpf),
+			       (unsigned long long) virt_to_phys(mpf) +
+			       sizeof(*mpf) - 1, mpf);
 
 			mem = virt_to_phys(mpf);
 			memblock_reserve(mem, sizeof(*mpf));
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 77215c23fba1..79c45af81604 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -24,6 +24,7 @@
 #ifdef CONFIG_X86_32
 # include <linux/ctype.h>
 # include <linux/mc146818rtc.h>
+# include <asm/realmode.h>
 #else
 # include <asm/x86_init.h>
 #endif
@@ -156,15 +157,10 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
 	return 0;
 }
 
-extern const unsigned char machine_real_restart_asm[];
-extern const u64 machine_real_restart_gdt[3];
-
 void machine_real_restart(unsigned int type)
 {
-	void *restart_va;
-	unsigned long restart_pa;
-	void (*restart_lowmem)(unsigned int);
-	u64 *lowmem_gdt;
+	void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int))
+		real_mode_header->machine_real_restart_asm;
 
 	local_irq_disable();
 
@@ -195,21 +191,6 @@ void machine_real_restart(unsigned int type)
 	 * too. */
 	*((unsigned short *)0x472) = reboot_mode;
 
-	/* Patch the GDT in the low memory trampoline */
-	lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
-
-	restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
-	restart_pa = virt_to_phys(restart_va);
-	restart_lowmem = (void (*)(unsigned int))restart_pa;
-
-	/* GDT[0]: GDT self-pointer */
-	lowmem_gdt[0] =
-		(u64)(sizeof(machine_real_restart_gdt) - 1) +
-		((u64)virt_to_phys(lowmem_gdt) << 16);
-	/* GDT[1]: 64K real mode code segment */
-	lowmem_gdt[1] =
-		GDT_ENTRY(0x009b, restart_pa, 0xffff);
-
 	/* Jump to the identity-mapped low memory code */
 	restart_lowmem(type);
 }
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f2afee6a19c1..16be6dc14db1 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -73,7 +73,7 @@
 
 #include <asm/mtrr.h>
 #include <asm/apic.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
 #include <asm/e820.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
@@ -334,8 +334,8 @@ static void __init relocate_initrd(void)
 	memblock_reserve(ramdisk_here, area_size);
 	initrd_start = ramdisk_here + PAGE_OFFSET;
 	initrd_end   = initrd_start + ramdisk_size;
-	printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
-			 ramdisk_here, ramdisk_here + ramdisk_size);
+	printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
+			 ramdisk_here, ramdisk_here + ramdisk_size - 1);
 
 	q = (char *)initrd_start;
 
@@ -366,8 +366,8 @@ static void __init relocate_initrd(void)
 	/* high pages is not converted by early_res_to_bootmem */
 	ramdisk_image = boot_params.hdr.ramdisk_image;
 	ramdisk_size  = boot_params.hdr.ramdisk_size;
-	printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
-		" %08llx - %08llx\n",
+	printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
+		" [mem %#010llx-%#010llx]\n",
 		ramdisk_image, ramdisk_image + ramdisk_size - 1,
 		ramdisk_here, ramdisk_here + ramdisk_size - 1);
 }
@@ -392,8 +392,8 @@ static void __init reserve_initrd(void)
 		       ramdisk_size, end_of_lowmem>>1);
 	}
 
-	printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
-			ramdisk_end);
+	printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
+			ramdisk_end - 1);
 
 
 	if (ramdisk_end <= end_of_lowmem) {
@@ -906,10 +906,10 @@ void __init setup_arch(char **cmdline_p)
 	setup_bios_corruption_check();
 #endif
 
-	printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
-			max_pfn_mapped<<PAGE_SHIFT);
+	printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
+			(max_pfn_mapped<<PAGE_SHIFT) - 1);
 
-	setup_trampolines();
+	setup_real_mode();
 
 	init_gbpages();
 
@@ -968,6 +968,8 @@ void __init setup_arch(char **cmdline_p)
 	if (boot_cpu_data.cpuid_level >= 0) {
 		/* A CPU has %cr4 if and only if it has CPUID */
 		mmu_cr4_features = read_cr4();
+		if (trampoline_cr4_features)
+			*trampoline_cr4_features = mmu_cr4_features;
 	}
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 433529e29be4..f56f96da77f5 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -57,7 +57,7 @@
 #include <asm/nmi.h>
 #include <asm/irq.h>
 #include <asm/idle.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
 #include <asm/cpu.h>
 #include <asm/numa.h>
 #include <asm/pgtable.h>
@@ -73,6 +73,8 @@
 #include <asm/smpboot_hooks.h>
 #include <asm/i8259.h>
 
+#include <asm/realmode.h>
+
 /* State of each CPU */
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
@@ -660,8 +662,12 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
  */
 static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 {
+	volatile u32 *trampoline_status =
+		(volatile u32 *) __va(real_mode_header->trampoline_status);
+	/* start_ip had better be page-aligned! */
+	unsigned long start_ip = real_mode_header->trampoline_start;
+
 	unsigned long boot_error = 0;
-	unsigned long start_ip;
 	int timeout;
 
 	alternatives_smp_switch(1);
@@ -684,9 +690,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 	initial_code = (unsigned long)start_secondary;
 	stack_start  = idle->thread.sp;
 
-	/* start_ip had better be page-aligned! */
-	start_ip = trampoline_address();
-
 	/* So we see what's up */
 	announce_cpu(cpu, apicid);
 
@@ -749,8 +752,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 			pr_debug("CPU%d: has booted.\n", cpu);
 		} else {
 			boot_error = 1;
-			if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
-			    == 0xA5A5A5A5)
+			if (*trampoline_status == 0xA5A5A5A5)
 				/* trampoline started but...? */
 				pr_err("CPU%d: Stuck ??\n", cpu);
 			else
@@ -776,7 +778,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 	}
 
 	/* mark "stuck" area as not stuck */
-	*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
+	*trampoline_status = 0;
 
 	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
 		/*
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 6410744ac5cb..f84fe00fad48 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -32,7 +32,7 @@
 #include <linux/mm.h>
 #include <linux/tboot.h>
 
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
 #include <asm/processor.h>
 #include <asm/bootparam.h>
 #include <asm/pgtable.h>
@@ -44,7 +44,7 @@
 #include <asm/e820.h>
 #include <asm/io.h>
 
-#include "acpi/realmode/wakeup.h"
+#include "../realmode/rm/wakeup.h"
 
 /* Global pointer to shared data; NULL means no measured launch. */
 struct tboot *tboot __read_mostly;
@@ -201,7 +201,8 @@ static int tboot_setup_sleep(void)
 		add_mac_region(e820.map[i].addr, e820.map[i].size);
 	}
 
-	tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
+	tboot->acpi_sinfo.kernel_s3_resume_vector =
+		real_mode_header->wakeup_start;
 
 	return 0;
 }
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
deleted file mode 100644
index a73b61055ad6..000000000000
--- a/arch/x86/kernel/trampoline.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <linux/io.h>
-#include <linux/memblock.h>
-
-#include <asm/trampoline.h>
-#include <asm/cacheflush.h>
-#include <asm/pgtable.h>
-
-unsigned char *x86_trampoline_base;
-
-void __init setup_trampolines(void)
-{
-	phys_addr_t mem;
-	size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
-
-	/* Has to be in very low memory so we can execute real-mode AP code. */
-	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
-	if (!mem)
-		panic("Cannot allocate trampoline\n");
-
-	x86_trampoline_base = __va(mem);
-	memblock_reserve(mem, size);
-
-	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
-	       x86_trampoline_base, (unsigned long long)mem, size);
-
-	memcpy(x86_trampoline_base, x86_trampoline_start, size);
-}
-
-/*
- * setup_trampolines() gets called very early, to guarantee the
- * availability of low memory.  This is before the proper kernel page
- * tables are set up, so we cannot set page permissions in that
- * function.  Thus, we use an arch_initcall instead.
- */
-static int __init configure_trampolines(void)
-{
-	size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
-
-	set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
-	return 0;
-}
-arch_initcall(configure_trampolines);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
deleted file mode 100644
index 451c0a7ef7fd..000000000000
--- a/arch/x86/kernel/trampoline_32.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- *
- *	Trampoline.S	Derived from Setup.S by Linus Torvalds
- *
- *	4 Jan 1997 Michael Chastain: changed to gnu as.
- *
- *	This is only used for booting secondary CPUs in SMP machine
- *
- *	Entry: CS:IP point to the start of our code, we are 
- *	in real mode with no stack, but the rest of the 
- *	trampoline page to make our stack and everything else
- *	is a mystery.
- *
- *	We jump into arch/x86/kernel/head_32.S.
- *
- *	On entry to trampoline_data, the processor is in real mode
- *	with 16-bit addressing and 16-bit data.  CS has some value
- *	and IP is zero.  Thus, data addresses need to be absolute
- *	(no relocation) and are taken with regard to r_base.
- *
- *	If you work on this file, check the object module with
- *	objdump --reloc to make sure there are no relocation
- *	entries except for:
- *
- *	TYPE              VALUE
- *	R_386_32          startup_32_smp
- *	R_386_32          boot_gdt
- */
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/segment.h>
-#include <asm/page_types.h>
-
-#ifdef CONFIG_SMP
-
-	.section ".x86_trampoline","a"
-	.balign PAGE_SIZE
-	.code16
-
-ENTRY(trampoline_data)
-r_base = .
-	wbinvd			# Needed for NUMA-Q should be harmless for others
-	mov	%cs, %ax	# Code and data in the same place
-	mov	%ax, %ds
-
-	cli			# We should be safe anyway
-
-	movl	$0xA5A5A5A5, trampoline_status - r_base
-				# write marker for master knows we're running
-
-	/* GDT tables in non default location kernel can be beyond 16MB and
-	 * lgdt will not be able to load the address as in real mode default
-	 * operand size is 16bit. Use lgdtl instead to force operand size
-	 * to 32 bit.
-	 */
-
-	lidtl	boot_idt_descr - r_base	# load idt with 0, 0
-	lgdtl	boot_gdt_descr - r_base	# load gdt with whatever is appropriate
-
-	xor	%ax, %ax
-	inc	%ax		# protected mode (PE) bit
-	lmsw	%ax		# into protected mode
-	# flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
-	ljmpl	$__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
-
-	# These need to be in the same 64K segment as the above;
-	# hence we don't use the boot_gdt_descr defined in head.S
-boot_gdt_descr:
-	.word	__BOOT_DS + 7			# gdt limit
-	.long	boot_gdt - __PAGE_OFFSET	# gdt base
-
-boot_idt_descr:
-	.word	0				# idt limit = 0
-	.long	0				# idt base = 0L
-
-ENTRY(trampoline_status)
-	.long	0
-
-.globl trampoline_end
-trampoline_end:
-
-#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0f703f10901a..22a1530146a8 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -197,18 +197,6 @@ SECTIONS
 
 	INIT_DATA_SECTION(16)
 
-	/*
-	 * Code and data for a variety of lowlevel trampolines, to be
-	 * copied into base memory (< 1 MiB) during initialization.
-	 * Since it is copied early, the main copy can be discarded
-	 * afterwards.
-	 */
-	 .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
-		x86_trampoline_start = .;
-		*(.x86_trampoline)
-		x86_trampoline_end = .;
-	}
-
 	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
 		__x86_cpu_dev_start = .;
 		*(.x86_cpu_dev.init)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 319b6f2fb8b9..97141c26a13a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -84,8 +84,9 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en
 	pgt_buf_end = pgt_buf_start;
 	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
 
-	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
+	printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n",
+		end - 1, pgt_buf_start << PAGE_SHIFT,
+		(pgt_buf_top << PAGE_SHIFT) - 1);
 }
 
 void __init native_pagetable_reserve(u64 start, u64 end)
@@ -132,7 +133,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	int nr_range, i;
 	int use_pse, use_gbpages;
 
-	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+	printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
+	       start, end - 1);
 
 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
 	/*
@@ -251,8 +253,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	}
 
 	for (i = 0; i < nr_range; i++)
-		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
-				mr[i].start, mr[i].end,
+		printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
+				mr[i].start, mr[i].end - 1,
 			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
 			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
 
@@ -350,8 +352,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	 * create a kernel page fault:
 	 */
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
-		begin, end);
+	printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
+		begin, end - 1);
 	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
 #else
 	/*
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 19d3fa08b119..2d125be1bae9 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -141,8 +141,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
 
 	/* whine about and ignore invalid blks */
 	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
-		pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
-			   nid, start, end);
+		pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+			   nid, start, end - 1);
 		return 0;
 	}
 
@@ -210,8 +210,8 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 
 	start = roundup(start, ZONE_ALIGN);
 
-	printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
-	       nid, start, end);
+	printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
+	       nid, start, end - 1);
 
 	/*
 	 * Allocate node data.  Try remap allocator first, node-local
@@ -232,7 +232,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 	}
 
 	/* report and initialize */
-	printk(KERN_INFO "  NODE_DATA [%016Lx - %016Lx]%s\n",
+	printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]%s\n",
 	       nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
 	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
 	if (!remapped && tnid != nid)
@@ -291,14 +291,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 			 */
 			if (bi->end > bj->start && bi->start < bj->end) {
 				if (bi->nid != bj->nid) {
-					pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
-					       bi->nid, bi->start, bi->end,
-					       bj->nid, bj->start, bj->end);
+					pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+					       bi->nid, bi->start, bi->end - 1,
+					       bj->nid, bj->start, bj->end - 1);
 					return -EINVAL;
 				}
-				pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
-					   bi->nid, bi->start, bi->end,
-					   bj->start, bj->end);
+				pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+					   bi->nid, bi->start, bi->end - 1,
+					   bj->start, bj->end - 1);
 			}
 
 			/*
@@ -320,9 +320,9 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 			}
 			if (k < mi->nr_blks)
 				continue;
-			printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
-			       bi->nid, bi->start, bi->end, bj->start, bj->end,
-			       start, end);
+			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
+			       bi->nid, bi->start, bi->end - 1, bj->start,
+			       bj->end - 1, start, end - 1);
 			bi->start = start;
 			bi->end = end;
 			numa_remove_memblk_from(j--, mi);
@@ -616,8 +616,8 @@ static int __init dummy_numa_init(void)
 {
 	printk(KERN_INFO "%s\n",
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
-	printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
-	       0LLU, PFN_PHYS(max_pfn));
+	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
+	       0LLU, PFN_PHYS(max_pfn) - 1);
 
 	node_set(0, numa_nodes_parsed);
 	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 871dd8868170..dbbbb47260cc 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -68,8 +68,8 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
 		numa_remove_memblk_from(phys_blk, pi);
 	}
 
-	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
-	       eb->start, eb->end, (eb->end - eb->start) >> 20);
+	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
+	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
 	return 0;
 }
 
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index f6ff57b7efa5..f11729fd019c 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -209,9 +209,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
 		page = pfn_to_page(pfn);
 		type = get_page_memtype(page);
 		if (type != -1) {
-			printk(KERN_INFO "reserve_ram_pages_type failed "
-				"0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
-				start, end, type, req_type);
+			printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n",
+				start, end - 1, type, req_type);
 			if (new_type)
 				*new_type = type;
 
@@ -314,9 +313,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 	err = rbt_memtype_check_insert(new, new_type);
 	if (err) {
-		printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
-		       "track %s, req %s\n",
-		       start, end, cattr_name(new->type), cattr_name(req_type));
+		printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+		       start, end - 1,
+		       cattr_name(new->type), cattr_name(req_type));
 		kfree(new);
 		spin_unlock(&memtype_lock);
 
@@ -325,8 +324,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 	spin_unlock(&memtype_lock);
 
-	dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
-		start, end, cattr_name(new->type), cattr_name(req_type),
+	dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
+		start, end - 1, cattr_name(new->type), cattr_name(req_type),
 		new_type ? cattr_name(*new_type) : "-");
 
 	return err;
@@ -360,14 +359,14 @@ int free_memtype(u64 start, u64 end)
 	spin_unlock(&memtype_lock);
 
 	if (!entry) {
-		printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
-			current->comm, current->pid, start, end);
+		printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+		       current->comm, current->pid, start, end - 1);
 		return -EINVAL;
 	}
 
 	kfree(entry);
 
-	dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+	dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
 
 	return 0;
 }
@@ -491,9 +490,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 
 	while (cursor < to) {
 		if (!devmem_is_allowed(pfn)) {
-			printk(KERN_INFO
-		"Program %s tried to access /dev/mem between %Lx->%Lx.\n",
-				current->comm, from, to);
+			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n",
+				current->comm, from, to - 1);
 			return 0;
 		}
 		cursor += PAGE_SIZE;
@@ -554,12 +552,11 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
 				size;
 
 	if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
-		printk(KERN_INFO
-			"%s:%d ioremap_change_attr failed %s "
-			"for %Lx-%Lx\n",
+		printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
+			"for [mem %#010Lx-%#010Lx]\n",
 			current->comm, current->pid,
 			cattr_name(flags),
-			base, (unsigned long long)(base + size));
+			base, (unsigned long long)(base + size-1));
 		return -EINVAL;
 	}
 	return 0;
@@ -591,12 +588,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 
 		flags = lookup_memtype(paddr);
 		if (want_flags != flags) {
-			printk(KERN_WARNING
-			"%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
+			printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
 				current->comm, current->pid,
 				cattr_name(want_flags),
 				(unsigned long long)paddr,
-				(unsigned long long)(paddr + size),
+				(unsigned long long)(paddr + size - 1),
 				cattr_name(flags));
 			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
 					      (~_PAGE_CACHE_MASK)) |
@@ -614,11 +610,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 		    !is_new_memtype_allowed(paddr, size, want_flags, flags)) {
 			free_memtype(paddr, paddr + size);
 			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
-				" for %Lx-%Lx, got %s\n",
+				" for [mem %#010Lx-%#010Lx], got %s\n",
 				current->comm, current->pid,
 				cattr_name(want_flags),
 				(unsigned long long)paddr,
-				(unsigned long long)(paddr + size),
+				(unsigned long long)(paddr + size - 1),
 				cattr_name(flags));
 			return -EINVAL;
 		}
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index efb5b4b93711..732af3a96183 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -176,8 +176,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 		return;
 	}
 
-	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
-	       start, end);
+	printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
+	       node, pxm,
+	       (unsigned long long) start, (unsigned long long) end - 1);
 }
 
 void __init acpi_numa_arch_fixup(void) {}
diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
new file mode 100644
index 000000000000..94f7fbe97b08
--- /dev/null
+++ b/arch/x86/realmode/Makefile
@@ -0,0 +1,18 @@
+#
+# arch/x86/realmode/Makefile
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+#
+
+subdir- := rm
+
+obj-y += init.o
+obj-y += rmpiggy.o
+
+$(obj)/rmpiggy.o: $(obj)/rm/realmode.bin
+
+$(obj)/rm/realmode.bin: FORCE
+	$(Q)$(MAKE) $(build)=$(obj)/rm $@
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
new file mode 100644
index 000000000000..cbca565af5bd
--- /dev/null
+++ b/arch/x86/realmode/init.c
@@ -0,0 +1,115 @@
+#include <linux/io.h>
+#include <linux/memblock.h>
+
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/realmode.h>
+
+struct real_mode_header *real_mode_header;
+u32 *trampoline_cr4_features;
+
+void __init setup_real_mode(void)
+{
+	phys_addr_t mem;
+	u16 real_mode_seg;
+	u32 *rel;
+	u32 count;
+	u32 *ptr;
+	u16 *seg;
+	int i;
+	unsigned char *base;
+	struct trampoline_header *trampoline_header;
+	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+#ifdef CONFIG_X86_64
+	u64 *trampoline_pgd;
+	u64 efer;
+#endif
+
+	/* Has to be in very low memory so we can execute real-mode AP code. */
+	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+	if (!mem)
+		panic("Cannot allocate trampoline\n");
+
+	base = __va(mem);
+	memblock_reserve(mem, size);
+	real_mode_header = (struct real_mode_header *) base;
+	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
+	       base, (unsigned long long)mem, size);
+
+	memcpy(base, real_mode_blob, size);
+
+	real_mode_seg = __pa(base) >> 4;
+	rel = (u32 *) real_mode_relocs;
+
+	/* 16-bit segment relocations. */
+	count = rel[0];
+	rel = &rel[1];
+	for (i = 0; i < count; i++) {
+		seg = (u16 *) (base + rel[i]);
+		*seg = real_mode_seg;
+	}
+
+	/* 32-bit linear relocations. */
+	count = rel[i];
+	rel =  &rel[i + 1];
+	for (i = 0; i < count; i++) {
+		ptr = (u32 *) (base + rel[i]);
+		*ptr += __pa(base);
+	}
+
+	/* Must be perfomed *after* relocation. */
+	trampoline_header = (struct trampoline_header *)
+		__va(real_mode_header->trampoline_header);
+
+#ifdef CONFIG_X86_32
+	trampoline_header->start = __pa(startup_32_smp);
+	trampoline_header->gdt_limit = __BOOT_DS + 7;
+	trampoline_header->gdt_base = __pa(boot_gdt);
+#else
+	/*
+	 * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR
+	 * so we need to mask it out.
+	 */
+	rdmsrl(MSR_EFER, efer);
+	trampoline_header->efer = efer & ~EFER_LMA;
+
+	trampoline_header->start = (u64) secondary_startup_64;
+	trampoline_cr4_features = &trampoline_header->cr4;
+	*trampoline_cr4_features = read_cr4();
+
+	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+	trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE;
+	trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE;
+#endif
+}
+
+/*
+ * set_real_mode_permissions() gets called very early, to guarantee the
+ * availability of low memory.  This is before the proper kernel page
+ * tables are set up, so we cannot set page permissions in that
+ * function.  Thus, we use an arch_initcall instead.
+ */
+static int __init set_real_mode_permissions(void)
+{
+	unsigned char *base = (unsigned char *) real_mode_header;
+	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+
+	size_t ro_size =
+		PAGE_ALIGN(real_mode_header->ro_end) -
+		__pa(base);
+
+	size_t text_size =
+		PAGE_ALIGN(real_mode_header->ro_end) -
+		real_mode_header->text_start;
+
+	unsigned long text_start =
+		(unsigned long) __va(real_mode_header->text_start);
+
+	set_memory_nx((unsigned long) base, size >> PAGE_SHIFT);
+	set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT);
+	set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
+
+	return 0;
+}
+
+arch_initcall(set_real_mode_permissions);
diff --git a/arch/x86/realmode/rm/.gitignore b/arch/x86/realmode/rm/.gitignore
new file mode 100644
index 000000000000..b6ed3a2555cb
--- /dev/null
+++ b/arch/x86/realmode/rm/.gitignore
@@ -0,0 +1,3 @@
+pasyms.h
+realmode.lds
+realmode.relocs
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
new file mode 100644
index 000000000000..5b84a2d30888
--- /dev/null
+++ b/arch/x86/realmode/rm/Makefile
@@ -0,0 +1,82 @@
+#
+# arch/x86/realmode/Makefile
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+#
+
+always := realmode.bin realmode.relocs
+
+wakeup-objs	:= wakeup_asm.o wakemain.o video-mode.o
+wakeup-objs	+= copy.o bioscall.o regs.o
+# The link order of the video-*.o modules can matter.  In particular,
+# video-vga.o *must* be listed first, followed by video-vesa.o.
+# Hardware-specific drivers should follow in the order they should be
+# probed, and video-bios.o should typically be last.
+wakeup-objs	+= video-vga.o
+wakeup-objs	+= video-vesa.o
+wakeup-objs	+= video-bios.o
+
+realmode-y			+= header.o
+realmode-y			+= trampoline_$(BITS).o
+realmode-y			+= stack.o
+realmode-$(CONFIG_X86_32)	+= reboot_32.o
+realmode-$(CONFIG_ACPI_SLEEP)	+= $(wakeup-objs)
+
+targets	+= $(realmode-y)
+
+REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y))
+
+sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p'
+
+quiet_cmd_pasyms = PASYMS  $@
+      cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \
+		   sed $(sed-pasyms) | sort | uniq > $@
+
+targets += pasyms.h
+$(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
+	$(call if_changed,pasyms)
+
+targets += realmode.lds
+$(obj)/realmode.lds: $(obj)/pasyms.h
+
+LDFLAGS_realmode.elf := --emit-relocs -T
+CPPFLAGS_realmode.lds += -P -C -I$(obj)
+
+targets += realmode.elf
+$(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE
+	$(call if_changed,ld)
+
+OBJCOPYFLAGS_realmode.bin := -O binary
+
+targets += realmode.bin
+$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs
+	$(call if_changed,objcopy)
+
+quiet_cmd_relocs = RELOCS  $@
+      cmd_relocs = arch/x86/tools/relocs --realmode $< > $@
+
+targets += realmode.relocs
+$(obj)/realmode.relocs: $(obj)/realmode.elf FORCE
+	$(call if_changed,relocs)
+
+# ---------------------------------------------------------------------------
+
+# How to compile the 16-bit code.  Note we always compile for -march=i386,
+# that way we can complain to the user if the CPU is insufficient.
+KBUILD_CFLAGS	:= $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \
+		   -I$(srctree)/arch/x86/boot \
+		   -DDISABLE_BRANCH_PROFILING \
+		   -Wall -Wstrict-prototypes \
+		   -march=i386 -mregparm=3 \
+		   -include $(srctree)/$(src)/../../boot/code16gcc.h \
+		   -fno-strict-aliasing -fomit-frame-pointer \
+		   $(call cc-option, -ffreestanding) \
+		   $(call cc-option, -fno-toplevel-reorder,\
+			$(call cc-option, -fno-unit-at-a-time)) \
+		   $(call cc-option, -fno-stack-protector) \
+		   $(call cc-option, -mpreferred-stack-boundary=2)
+KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
diff --git a/arch/x86/realmode/rm/bioscall.S b/arch/x86/realmode/rm/bioscall.S
new file mode 100644
index 000000000000..16162d197918
--- /dev/null
+++ b/arch/x86/realmode/rm/bioscall.S
@@ -0,0 +1 @@
+#include "../../boot/bioscall.S"
diff --git a/arch/x86/realmode/rm/copy.S b/arch/x86/realmode/rm/copy.S
new file mode 100644
index 000000000000..b785e6f38fdd
--- /dev/null
+++ b/arch/x86/realmode/rm/copy.S
@@ -0,0 +1 @@
+#include "../../boot/copy.S"
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
new file mode 100644
index 000000000000..fadf48378ada
--- /dev/null
+++ b/arch/x86/realmode/rm/header.S
@@ -0,0 +1,41 @@
+/*
+ * Real-mode blob header; this should match realmode.h and be
+ * readonly; for mutable data instead add pointers into the .data
+ * or .bss sections as appropriate.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+#include "realmode.h"
+	
+	.section ".header", "a"
+
+	.balign	16
+GLOBAL(real_mode_header)
+	.long	pa_text_start
+	.long	pa_ro_end
+	/* SMP trampoline */
+	.long	pa_trampoline_start
+	.long	pa_trampoline_status
+	.long	pa_trampoline_header
+#ifdef CONFIG_X86_64
+	.long	pa_trampoline_pgd;
+#endif
+	/* ACPI S3 wakeup */
+#ifdef CONFIG_ACPI_SLEEP
+	.long	pa_wakeup_start
+	.long	pa_wakeup_header
+#endif
+	/* APM/BIOS reboot */
+#ifdef CONFIG_X86_32
+	.long	pa_machine_real_restart_asm
+#endif
+END(real_mode_header)
+
+	/* End signature, used to verify integrity */
+	.section ".signature","a"
+	.balign 4
+GLOBAL(end_signature)
+	.long	REALMODE_END_SIGNATURE
+END(end_signature)
diff --git a/arch/x86/realmode/rm/realmode.h b/arch/x86/realmode/rm/realmode.h
new file mode 100644
index 000000000000..d74cff6350ed
--- /dev/null
+++ b/arch/x86/realmode/rm/realmode.h
@@ -0,0 +1,21 @@
+#ifndef ARCH_X86_REALMODE_RM_REALMODE_H
+#define ARCH_X86_REALMODE_RM_REALMODE_H
+
+#ifdef __ASSEMBLY__
+
+/*
+ * 16-bit ljmpw to the real_mode_seg
+ *
+ * This must be open-coded since gas will choke on using a
+ * relocatable symbol for the segment portion.
+ */
+#define LJMPW_RM(to)	.byte 0xea ; .word (to), real_mode_seg
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * Signature at the end of the realmode region
+ */
+#define REALMODE_END_SIGNATURE	0x65a22c82
+
+#endif /* ARCH_X86_REALMODE_RM_REALMODE_H */
diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S
new file mode 100644
index 000000000000..86b2e8d6b1f1
--- /dev/null
+++ b/arch/x86/realmode/rm/realmode.lds.S
@@ -0,0 +1,76 @@
+/*
+ * realmode.lds.S
+ *
+ * Linker script for the real-mode code
+ */
+
+#include <asm/page_types.h>
+
+#undef i386
+
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+
+SECTIONS
+{
+	real_mode_seg = 0;
+
+	. = 0;
+	.header : {
+		pa_real_mode_base = .;
+		*(.header)
+	}
+
+	. = ALIGN(4);
+	.rodata : {
+		*(.rodata)
+		*(.rodata.*)
+		. = ALIGN(16);
+		video_cards = .;
+		*(.videocards)
+		video_cards_end = .;
+	}
+
+	. = ALIGN(PAGE_SIZE);
+	pa_text_start = .;
+	.text : {
+		*(.text)
+		*(.text.*)
+	}
+
+	.text32 : {
+		*(.text32)
+		*(.text32.*)
+	}
+
+	.text64 : {
+		*(.text64)
+		*(.text64.*)
+	}
+	pa_ro_end = .;
+
+	. = ALIGN(PAGE_SIZE);
+	.data : {
+		*(.data)
+		*(.data.*)
+	}
+
+	. = ALIGN(128);
+	.bss : {
+		*(.bss*)
+	}
+
+	/* End signature for integrity checking */
+	. = ALIGN(4);
+	.signature : {
+		*(.signature)
+	}
+
+	/DISCARD/ : {
+		*(.note*)
+		*(.debug*)
+		*(.eh_frame*)
+	}
+
+#include "pasyms.h"
+}
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S
index 1d5c46df0d78..114044876b3d 100644
--- a/arch/x86/kernel/reboot_32.S
+++ b/arch/x86/realmode/rm/reboot_32.S
@@ -2,6 +2,7 @@
 #include <linux/init.h>
 #include <asm/segment.h>
 #include <asm/page_types.h>
+#include "realmode.h"
 
 /*
  * The following code and data reboots the machine by switching to real
@@ -13,34 +14,20 @@
  *
  * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
  */
-	.section ".x86_trampoline","a"
-	.balign 16
+	.section ".text32", "ax"
 	.code32
-ENTRY(machine_real_restart_asm)
-r_base = .
-	/* Get our own relocated address */
-	call	1f
-1:	popl	%ebx
-	subl	$(1b - r_base), %ebx
-
-	/* Compute the equivalent real-mode segment */
-	movl	%ebx, %ecx
-	shrl	$4, %ecx
-	
-	/* Patch post-real-mode segment jump */
-	movw	(dispatch_table - r_base)(%ebx,%eax,2),%ax
-	movw	%ax, (101f - r_base)(%ebx)
-	movw	%cx, (102f - r_base)(%ebx)
 
+	.balign	16
+ENTRY(machine_real_restart_asm)
 	/* Set up the IDT for real mode. */
-	lidtl	(machine_real_restart_idt - r_base)(%ebx)
+	lidtl	pa_machine_real_restart_idt
 
 	/*
 	 * Set up a GDT from which we can load segment descriptors for real
 	 * mode.  The GDT is not used in real mode; it is just needed here to
 	 * prepare the descriptors.
 	 */
-	lgdtl	(machine_real_restart_gdt - r_base)(%ebx)
+	lgdtl	pa_machine_real_restart_gdt
 
 	/*
 	 * Load the data segment registers with 16-bit compatible values
@@ -51,7 +38,7 @@ r_base = .
 	movl	%ecx, %fs
 	movl	%ecx, %gs
 	movl	%ecx, %ss
-	ljmpl	$8, $1f - r_base
+	ljmpw	$8, $1f
 
 /*
  * This is 16-bit protected mode code to disable paging and the cache,
@@ -76,27 +63,29 @@ r_base = .
  *
  * Most of this work is probably excessive, but it is what is tested.
  */
+	.text
 	.code16
+
+	.balign	16
+machine_real_restart_asm16:
 1:
 	xorl	%ecx, %ecx
-	movl	%cr0, %eax
-	andl	$0x00000011, %eax
-	orl	$0x60000000, %eax
-	movl	%eax, %cr0
+	movl	%cr0, %edx
+	andl	$0x00000011, %edx
+	orl	$0x60000000, %edx
+	movl	%edx, %cr0
 	movl	%ecx, %cr3
 	movl	%cr0, %edx
-	andl	$0x60000000, %edx	/* If no cache bits -> no wbinvd */
+	testl	$0x60000000, %edx	/* If no cache bits -> no wbinvd */
 	jz	2f
 	wbinvd
 2:
-	andb	$0x10, %al
-	movl	%eax, %cr0
-	.byte	0xea			/* ljmpw */
-101:	.word	0			/* Offset */
-102:	.word	0			/* Segment */
-
-bios:
-	ljmpw	$0xf000, $0xfff0
+	andb	$0x10, %dl
+	movl	%edx, %cr0
+	LJMPW_RM(3f)
+3:
+	andw	%ax, %ax
+	jz	bios
 
 apm:
 	movw	$0x1000, %ax
@@ -106,26 +95,34 @@ apm:
 	movw	$0x0001, %bx
 	movw	$0x0003, %cx
 	int	$0x15
+	/* This should never return... */
 
-END(machine_real_restart_asm)
+bios:
+	ljmpw	$0xf000, $0xfff0
 
-	.balign 16
-	/* These must match <asm/reboot.h */
-dispatch_table:
-	.word	bios - r_base
-	.word	apm - r_base
-END(dispatch_table)
+	.section ".rodata", "a"
 
-	.balign 16
-machine_real_restart_idt:
+	.balign	16
+GLOBAL(machine_real_restart_idt)
 	.word	0xffff		/* Length - real mode default value */
 	.long	0		/* Base - real mode default value */
 END(machine_real_restart_idt)
 
-	.balign 16
-ENTRY(machine_real_restart_gdt)
-	.quad	0		/* Self-pointer, filled in by PM code */
-	.quad	0		/* 16-bit code segment, filled in by PM code */
+	.balign	16
+GLOBAL(machine_real_restart_gdt)
+	/* Self-pointer */
+	.word	0xffff		/* Length - real mode default value */
+	.long	pa_machine_real_restart_gdt
+	.word	0
+
+	/*
+	 * 16-bit code segment pointing to real_mode_seg
+	 * Selector value 8
+	 */
+	.word	0xffff		/* Limit */
+	.long	0x9b000000 + pa_real_mode_base
+	.word	0
+
 	/*
 	 * 16-bit data segment with the selector value 16 = 0x10 and
 	 * base value 0x100; since this is consistent with real mode
diff --git a/arch/x86/realmode/rm/regs.c b/arch/x86/realmode/rm/regs.c
new file mode 100644
index 000000000000..fbb15b9f9ca9
--- /dev/null
+++ b/arch/x86/realmode/rm/regs.c
@@ -0,0 +1 @@
+#include "../../boot/regs.c"
diff --git a/arch/x86/realmode/rm/stack.S b/arch/x86/realmode/rm/stack.S
new file mode 100644
index 000000000000..867ae87adfae
--- /dev/null
+++ b/arch/x86/realmode/rm/stack.S
@@ -0,0 +1,19 @@
+/*
+ * Common heap and stack allocations
+ */
+
+#include <linux/linkage.h>
+
+	.data
+GLOBAL(HEAP)
+	.long	rm_heap
+GLOBAL(heap_end)
+	.long	rm_stack
+
+	.bss
+	.balign	16
+GLOBAL(rm_heap)
+	.space	2048
+GLOBAL(rm_stack)
+	.space	2048
+GLOBAL(rm_stack_end)
diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
new file mode 100644
index 000000000000..c1b2791183e7
--- /dev/null
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -0,0 +1,74 @@
+/*
+ *
+ *	Trampoline.S	Derived from Setup.S by Linus Torvalds
+ *
+ *	4 Jan 1997 Michael Chastain: changed to gnu as.
+ *
+ *	This is only used for booting secondary CPUs in SMP machine
+ *
+ *	Entry: CS:IP point to the start of our code, we are
+ *	in real mode with no stack, but the rest of the
+ *	trampoline page to make our stack and everything else
+ *	is a mystery.
+ *
+ *	We jump into arch/x86/kernel/head_32.S.
+ *
+ *	On entry to trampoline_start, the processor is in real mode
+ *	with 16-bit addressing and 16-bit data.  CS has some value
+ *	and IP is zero.  Thus, we load CS to the physical segment
+ *	of the real mode code before doing anything further.
+ */
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+#include "realmode.h"
+
+	.text
+	.code16
+
+	.balign	PAGE_SIZE
+ENTRY(trampoline_start)
+	wbinvd			# Needed for NUMA-Q should be harmless for others
+
+	LJMPW_RM(1f)
+1:
+	mov	%cs, %ax	# Code and data in the same place
+	mov	%ax, %ds
+
+	cli			# We should be safe anyway
+
+	movl	tr_start, %eax	# where we need to go
+
+	movl	$0xA5A5A5A5, trampoline_status
+				# write marker for master knows we're running
+
+	/*
+	 * GDT tables in non default location kernel can be beyond 16MB and
+	 * lgdt will not be able to load the address as in real mode default
+	 * operand size is 16bit. Use lgdtl instead to force operand size
+	 * to 32 bit.
+	 */
+	lidtl	tr_idt			# load idt with 0, 0
+	lgdtl	tr_gdt			# load gdt with whatever is appropriate
+
+	movw	$1, %dx			# protected mode (PE) bit
+	lmsw	%dx			# into protected mode
+
+	ljmpl	$__BOOT_CS, $pa_startup_32
+
+	.section ".text32","ax"
+	.code32
+ENTRY(startup_32)			# note: also used from wakeup_asm.S
+	jmp	*%eax
+
+	.bss
+	.balign 8
+GLOBAL(trampoline_header)
+	tr_start:		.space	4
+	tr_gdt_pad:		.space	2
+	tr_gdt:			.space	6
+END(trampoline_header)
+	
+#include "trampoline_common.S"
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 09ff51799e96..bb360dc39d21 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -5,12 +5,12 @@
  *	4 Jan 1997 Michael Chastain: changed to gnu as.
  *	15 Sept 2005 Eric Biederman: 64bit PIC support
  *
- *	Entry: CS:IP point to the start of our code, we are 
- *	in real mode with no stack, but the rest of the 
+ *	Entry: CS:IP point to the start of our code, we are
+ *	in real mode with no stack, but the rest of the
  *	trampoline page to make our stack and everything else
  *	is a mystery.
  *
- *	On entry to trampoline_data, the processor is in real mode
+ *	On entry to trampoline_start, the processor is in real mode
  *	with 16-bit addressing and 16-bit data.  CS has some value
  *	and IP is zero.  Thus, data addresses need to be absolute
  *	(no relocation) and are taken with regard to r_base.
@@ -31,43 +31,33 @@
 #include <asm/msr.h>
 #include <asm/segment.h>
 #include <asm/processor-flags.h>
+#include "realmode.h"
 
-	.section ".x86_trampoline","a"
-	.balign PAGE_SIZE
+	.text
 	.code16
 
-ENTRY(trampoline_data)
-r_base = .
+	.balign	PAGE_SIZE
+ENTRY(trampoline_start)
 	cli			# We should be safe anyway
 	wbinvd
+
+	LJMPW_RM(1f)
+1:
 	mov	%cs, %ax	# Code and data in the same place
 	mov	%ax, %ds
 	mov	%ax, %es
 	mov	%ax, %ss
 
+	movl	$0xA5A5A5A5, trampoline_status
+	# write marker for master knows we're running
 
-	movl	$0xA5A5A5A5, trampoline_status - r_base
-				# write marker for master knows we're running
-
-					# Setup stack
-	movw	$(trampoline_stack_end - r_base), %sp
+	# Setup stack
+	movl	$rm_stack_end, %esp
 
 	call	verify_cpu		# Verify the cpu supports long mode
 	testl   %eax, %eax		# Check for return code
 	jnz	no_longmode
 
-	mov	%cs, %ax
-	movzx	%ax, %esi		# Find the 32bit trampoline location
-	shll	$4, %esi
-
-					# Fixup the absolute vectors
-	leal	(startup_32 - r_base)(%esi), %eax
-	movl	%eax, startup_32_vector - r_base
-	leal	(startup_64 - r_base)(%esi), %eax
-	movl	%eax, startup_64_vector - r_base
-	leal	(tgdt - r_base)(%esi), %eax
-	movl	%eax, (tgdt + 2 - r_base)
-
 	/*
 	 * GDT tables in non default location kernel can be beyond 16MB and
 	 * lgdt will not be able to load the address as in real mode default
@@ -75,36 +65,49 @@ r_base = .
 	 * to 32 bit.
 	 */
 
-	lidtl	tidt - r_base	# load idt with 0, 0
-	lgdtl	tgdt - r_base	# load gdt with whatever is appropriate
+	lidtl	tr_idt	# load idt with 0, 0
+	lgdtl	tr_gdt	# load gdt with whatever is appropriate
+
+	movw	$__KERNEL_DS, %dx	# Data segment descriptor
 
-	mov	$X86_CR0_PE, %ax	# protected mode (PE) bit
-	lmsw	%ax			# into protected mode
+	# Enable protected mode
+	movl	$X86_CR0_PE, %eax	# protected mode (PE) bit
+	movl	%eax, %cr0		# into protected mode
 
 	# flush prefetch and jump to startup_32
-	ljmpl	*(startup_32_vector - r_base)
+	ljmpl	$__KERNEL32_CS, $pa_startup_32
 
+no_longmode:
+	hlt
+	jmp no_longmode
+#include "../kernel/verify_cpu.S"
+
+	.section ".text32","ax"
 	.code32
 	.balign 4
-startup_32:
-	movl	$__KERNEL_DS, %eax	# Initialize the %ds segment register
-	movl	%eax, %ds
-
-	movl	$X86_CR4_PAE, %eax
+ENTRY(startup_32)
+	movl	%edx, %ss
+	addl	$pa_real_mode_base, %esp
+	movl	%edx, %ds
+	movl	%edx, %es
+	movl	%edx, %fs
+	movl	%edx, %gs
+
+	movl	pa_tr_cr4, %eax
 	movl	%eax, %cr4		# Enable PAE mode
 
-					# Setup trampoline 4 level pagetables
-	leal	(trampoline_level4_pgt - r_base)(%esi), %eax
+	# Setup trampoline 4 level pagetables
+	movl	$pa_trampoline_pgd, %eax
 	movl	%eax, %cr3
 
+	# Set up EFER
+	movl	pa_tr_efer, %eax
+	movl	pa_tr_efer + 4, %edx
 	movl	$MSR_EFER, %ecx
-	movl	$(1 << _EFER_LME), %eax	# Enable Long Mode
-	xorl	%edx, %edx
 	wrmsr
 
 	# Enable paging and in turn activate Long Mode
-	# Enable protected mode
-	movl	$(X86_CR0_PG | X86_CR0_PE), %eax
+	movl	$(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax
 	movl	%eax, %cr0
 
 	/*
@@ -113,59 +116,38 @@ startup_32:
 	 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
 	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
 	 */
-	ljmp	*(startup_64_vector - r_base)(%esi)
+	ljmpl	$__KERNEL_CS, $pa_startup_64
 
+	.section ".text64","ax"
 	.code64
 	.balign 4
-startup_64:
+ENTRY(startup_64)
 	# Now jump into the kernel using virtual addresses
-	movq	$secondary_startup_64, %rax
-	jmp	*%rax
-
-	.code16
-no_longmode:
-	hlt
-	jmp no_longmode
-#include "verify_cpu.S"
-
-	.balign 4
-	# Careful these need to be in the same 64K segment as the above;
-tidt:
-	.word	0			# idt limit = 0
-	.word	0, 0			# idt base = 0L
+	jmpq	*tr_start(%rip)
 
+	.section ".rodata","a"
 	# Duplicate the global descriptor table
 	# so the kernel can live anywhere
-	.balign 4
-tgdt:
-	.short	tgdt_end - tgdt		# gdt limit
-	.long	tgdt - r_base
-	.short 0
+	.balign	16
+	.globl tr_gdt
+tr_gdt:
+	.short	tr_gdt_end - tr_gdt - 1	# gdt limit
+	.long	pa_tr_gdt
+	.short	0
 	.quad	0x00cf9b000000ffff	# __KERNEL32_CS
 	.quad	0x00af9b000000ffff	# __KERNEL_CS
 	.quad	0x00cf93000000ffff	# __KERNEL_DS
-tgdt_end:
+tr_gdt_end:
 
-	.balign 4
-startup_32_vector:
-	.long	startup_32 - r_base
-	.word	__KERNEL32_CS, 0
+	.bss
+	.balign	PAGE_SIZE
+GLOBAL(trampoline_pgd)		.space	PAGE_SIZE
 
-	.balign 4
-startup_64_vector:
-	.long	startup_64 - r_base
-	.word	__KERNEL_CS, 0
+	.balign	8
+GLOBAL(trampoline_header)
+	tr_start:		.space	8
+	GLOBAL(tr_efer)		.space	8
+	GLOBAL(tr_cr4)		.space	4
+END(trampoline_header)
 
-	.balign 4
-ENTRY(trampoline_status)
-	.long	0
-
-trampoline_stack:
-	.org 0x1000
-trampoline_stack_end:
-ENTRY(trampoline_level4_pgt)
-	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.fill	510,8,0
-	.quad	level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-
-ENTRY(trampoline_end)
+#include "trampoline_common.S"
diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S
new file mode 100644
index 000000000000..b1ecdb9692ad
--- /dev/null
+++ b/arch/x86/realmode/rm/trampoline_common.S
@@ -0,0 +1,7 @@
+	.section ".rodata","a"
+	.balign	16
+tr_idt: .fill 1, 6, 0
+
+	.bss
+	.balign	4
+GLOBAL(trampoline_status)	.space	4
diff --git a/arch/x86/realmode/rm/video-bios.c b/arch/x86/realmode/rm/video-bios.c
new file mode 100644
index 000000000000..848b25aaf11b
--- /dev/null
+++ b/arch/x86/realmode/rm/video-bios.c
@@ -0,0 +1 @@
+#include "../../boot/video-bios.c"
diff --git a/arch/x86/realmode/rm/video-mode.c b/arch/x86/realmode/rm/video-mode.c
new file mode 100644
index 000000000000..2a98b7e2368b
--- /dev/null
+++ b/arch/x86/realmode/rm/video-mode.c
@@ -0,0 +1 @@
+#include "../../boot/video-mode.c"
diff --git a/arch/x86/realmode/rm/video-vesa.c b/arch/x86/realmode/rm/video-vesa.c
new file mode 100644
index 000000000000..413edddb51e5
--- /dev/null
+++ b/arch/x86/realmode/rm/video-vesa.c
@@ -0,0 +1 @@
+#include "../../boot/video-vesa.c"
diff --git a/arch/x86/realmode/rm/video-vga.c b/arch/x86/realmode/rm/video-vga.c
new file mode 100644
index 000000000000..3085f5c9d288
--- /dev/null
+++ b/arch/x86/realmode/rm/video-vga.c
@@ -0,0 +1 @@
+#include "../../boot/video-vga.c"
diff --git a/arch/x86/kernel/acpi/realmode/wakemain.c b/arch/x86/realmode/rm/wakemain.c
index 883962d9eef2..91405d515ec6 100644
--- a/arch/x86/kernel/acpi/realmode/wakemain.c
+++ b/arch/x86/realmode/rm/wakemain.c
@@ -65,7 +65,8 @@ void main(void)
 {
 	/* Kill machine if structures are wrong */
 	if (wakeup_header.real_magic != 0x12345678)
-		while (1);
+		while (1)
+			;
 
 	if (wakeup_header.realmode_flags & 4)
 		send_morse("...-");
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/realmode/rm/wakeup.h
index 97a29e1430e3..9317e0042f24 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/realmode/rm/wakeup.h
@@ -12,9 +12,8 @@
 /* This must match data at wakeup.S */
 struct wakeup_header {
 	u16 video_mode;		/* Video mode number */
-	u16 _jmp1;		/* ljmpl opcode, 32-bit only */
 	u32 pmode_entry;	/* Protected mode resume point, 32-bit only */
-	u16 _jmp2;		/* CS value, 32-bit only */
+	u16 pmode_cs;
 	u32 pmode_cr0;		/* Protected mode cr0 */
 	u32 pmode_cr3;		/* Protected mode cr3 */
 	u32 pmode_cr4;		/* Protected mode cr4 */
@@ -26,12 +25,6 @@ struct wakeup_header {
 	u32 pmode_behavior;	/* Wakeup routine behavior flags */
 	u32 realmode_flags;
 	u32 real_magic;
-	u16 trampoline_segment;	/* segment with trampoline code, 64-bit only */
-	u8  _pad1;
-	u8  wakeup_jmp;
-	u16 wakeup_jmp_off;
-	u16 wakeup_jmp_seg;
-	u64 wakeup_gdt[3];
 	u32 signature;		/* To check we have correct structure */
 } __attribute__((__packed__));
 
@@ -40,7 +33,6 @@ extern struct wakeup_header wakeup_header;
 
 #define WAKEUP_HEADER_OFFSET	8
 #define WAKEUP_HEADER_SIGNATURE 0x51ee1111
-#define WAKEUP_END_SIGNATURE	0x65a22c82
 
 /* Wakeup behavior bits */
 #define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE     0
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/realmode/rm/wakeup_asm.S
index b4fd836e4053..8905166b0bbb 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/realmode/rm/wakeup_asm.S
@@ -1,50 +1,47 @@
 /*
  * ACPI wakeup real mode startup stub
  */
+#include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/msr-index.h>
 #include <asm/page_types.h>
 #include <asm/pgtable_types.h>
 #include <asm/processor-flags.h>
+#include "realmode.h"
 #include "wakeup.h"
 
 	.code16
-	.section ".jump", "ax"
-	.globl	_start
-_start:
-	cli
-	jmp	wakeup_code
 
 /* This should match the structure in wakeup.h */
-		.section ".header", "a"
-		.globl	wakeup_header
-wakeup_header:
-video_mode:	.short	0	/* Video mode number */
-pmode_return:	.byte	0x66, 0xea	/* ljmpl */
-		.long	0	/* offset goes here */
-		.short	__KERNEL_CS
-pmode_cr0:	.long	0	/* Saved %cr0 */
-pmode_cr3:	.long	0	/* Saved %cr3 */
-pmode_cr4:	.long	0	/* Saved %cr4 */
-pmode_efer:	.quad	0	/* Saved EFER */
-pmode_gdt:	.quad	0
-pmode_misc_en:	.quad	0	/* Saved MISC_ENABLE MSR */
-pmode_behavior:	.long	0	/* Wakeup behavior flags */
-realmode_flags:	.long	0
-real_magic:	.long	0
-trampoline_segment:	.word 0
-_pad1:		.byte	0
-wakeup_jmp:	.byte	0xea	/* ljmpw */
-wakeup_jmp_off:	.word	3f
-wakeup_jmp_seg:	.word	0
-wakeup_gdt:	.quad	0, 0, 0
-signature:	.long	WAKEUP_HEADER_SIGNATURE
+	.section ".data", "aw"
+
+	.balign	16
+GLOBAL(wakeup_header)
+	video_mode:	.short	0	/* Video mode number */
+	pmode_entry:	.long	0
+	pmode_cs:	.short	__KERNEL_CS
+	pmode_cr0:	.long	0	/* Saved %cr0 */
+	pmode_cr3:	.long	0	/* Saved %cr3 */
+	pmode_cr4:	.long	0	/* Saved %cr4 */
+	pmode_efer:	.quad	0	/* Saved EFER */
+	pmode_gdt:	.quad	0
+	pmode_misc_en:	.quad	0	/* Saved MISC_ENABLE MSR */
+	pmode_behavior:	.long	0	/* Wakeup behavior flags */
+	realmode_flags:	.long	0
+	real_magic:	.long	0
+	signature:	.long	WAKEUP_HEADER_SIGNATURE
+END(wakeup_header)
 
 	.text
 	.code16
-wakeup_code:
+
+	.balign	16
+ENTRY(wakeup_start)
+	cli
 	cld
 
+	LJMPW_RM(3f)
+3:
 	/* Apparently some dimwit BIOS programmers don't know how to
 	   program a PM to RM transition, and we might end up here with
 	   junk in the data segment descriptor registers.  The only way
@@ -54,8 +51,7 @@ wakeup_code:
 	movl	%cr0, %eax
 	orb	$X86_CR0_PE, %al
 	movl	%eax, %cr0
-	jmp	1f
-1:	ljmpw	$8, $2f
+	ljmpw	$8, $2f
 2:
 	movw	%cx, %ds
 	movw	%cx, %es
@@ -65,16 +61,18 @@ wakeup_code:
 
 	andb	$~X86_CR0_PE, %al
 	movl	%eax, %cr0
-	jmp	wakeup_jmp
+	LJMPW_RM(3f)
 3:
 	/* Set up segments */
 	movw	%cs, %ax
+	movw	%ax, %ss
+	movl	$rm_stack_end, %esp
 	movw	%ax, %ds
 	movw	%ax, %es
-	movw	%ax, %ss
-	lidtl	wakeup_idt
+	movw	%ax, %fs
+	movw	%ax, %gs
 
-	movl	$wakeup_stack_end, %esp
+	lidtl	wakeup_idt
 
 	/* Clear the EFLAGS */
 	pushl	$0
@@ -87,7 +85,7 @@ wakeup_code:
 
 	/* Check we really have everything... */
 	movl	end_signature, %eax
-	cmpl	$WAKEUP_END_SIGNATURE, %eax
+	cmpl	$REALMODE_END_SIGNATURE, %eax
 	jne	bogus_real_magic
 
 	/* Call the C code */
@@ -128,14 +126,13 @@ wakeup_code:
 	lgdtl	pmode_gdt
 
 	/* This really couldn't... */
-	movl	pmode_cr0, %eax
-	movl	%eax, %cr0
-	jmp	pmode_return
+	movl	pmode_entry, %eax
+	movl	pmode_cr0, %ecx
+	movl	%ecx, %cr0
+	ljmpl	$__KERNEL_CS, $pa_startup_32
+	/* -> jmp *%eax in trampoline_32.S */
 #else
-	pushw	$0
-	pushw	trampoline_segment
-	pushw	$0
-	lret
+	jmp	trampoline_start
 #endif
 
 bogus_real_magic:
@@ -143,28 +140,38 @@ bogus_real_magic:
 	hlt
 	jmp	1b
 
-	.data
+	.section ".rodata","a"
+
+	/*
+	 * Set up the wakeup GDT.  We set these up as Big Real Mode,
+	 * that is, with limits set to 4 GB.  At least the Lenovo
+	 * Thinkpad X61 is known to need this for the video BIOS
+	 * initialization quirk to work; this is likely to also
+	 * be the case for other laptops or integrated video devices.
+	 */
+
+	.balign	16
+GLOBAL(wakeup_gdt)
+	.word	3*8-1		/* Self-descriptor */
+	.long	pa_wakeup_gdt
+	.word	0
+
+	.word	0xffff		/* 16-bit code segment @ real_mode_base */
+	.long	0x9b000000 + pa_real_mode_base
+	.word	0x008f		/* big real mode */
+
+	.word	0xffff		/* 16-bit data segment @ real_mode_base */
+	.long	0x93000000 + pa_real_mode_base
+	.word	0x008f		/* big real mode */
+END(wakeup_gdt)
+
+	.section ".rodata","a"
 	.balign	8
 
 	/* This is the standard real-mode IDT */
-wakeup_idt:
+	.balign	16
+GLOBAL(wakeup_idt)
 	.word	0xffff		/* limit */
 	.long	0		/* address */
 	.word	0
-
-	.globl	HEAP, heap_end
-HEAP:
-	.long	wakeup_heap
-heap_end:
-	.long	wakeup_stack
-
-	.bss
-wakeup_heap:
-	.space	2048
-wakeup_stack:
-	.space	2048
-wakeup_stack_end:
-
-	.section ".signature","a"
-end_signature:
-	.long	WAKEUP_END_SIGNATURE
+END(wakeup_idt)
diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S
new file mode 100644
index 000000000000..204c6ece0e97
--- /dev/null
+++ b/arch/x86/realmode/rmpiggy.S
@@ -0,0 +1,20 @@
+/*
+ * Wrapper script for the realmode binary as a transport object
+ * before copying to low memory.
+ */
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+	.section ".init.data","aw"
+
+	.balign PAGE_SIZE
+
+GLOBAL(real_mode_blob)
+	.incbin	"arch/x86/realmode/rm/realmode.bin"
+END(real_mode_blob)
+
+GLOBAL(real_mode_blob_end);
+
+GLOBAL(real_mode_relocs)
+	.incbin	"arch/x86/realmode/rm/realmode.relocs"
+END(real_mode_relocs)
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index b685296d4464..5a1847d61930 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -78,6 +78,13 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
 
 static const char * const sym_regex_realmode[S_NSYMTYPES] = {
 /*
+ * These symbols are known to be relative, even if the linker marks them
+ * as absolute (typically defined outside any section in the linker script.)
+ */
+	[S_REL] =
+	"^pa_",
+
+/*
  * These are 16-bit segment symbols when compiling 16-bit code.
  */
 	[S_SEG] =
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 3199b76f795d..421bef9c4c48 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,8 +23,6 @@ config IOSCHED_DEADLINE
 
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
-	# If BLK_CGROUP is a module, CFQ has to be built as module.
-	depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
 	default y
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -34,8 +32,6 @@ config IOSCHED_CFQ
 
 	  This is the default I/O scheduler.
 
-	  Note: If BLK_CGROUP=m, then CFQ can be built only as module.
-
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
 	depends on IOSCHED_CFQ && BLK_CGROUP
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 126c341955de..02cf6335e9bd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -11,1570 +11,612 @@
  * 	              Nauman Rafique <[email protected]>
  */
 #include <linux/ioprio.h>
-#include <linux/seq_file.h>
 #include <linux/kdev_t.h>
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
-#include "blk-cgroup.h"
 #include <linux/genhd.h>
+#include <linux/delay.h>
+#include <linux/atomic.h>
+#include "blk-cgroup.h"
+#include "blk.h"
 
 #define MAX_KEY_LEN 100
 
-static DEFINE_SPINLOCK(blkio_list_lock);
-static LIST_HEAD(blkio_list);
+static DEFINE_MUTEX(blkcg_pol_mutex);
 
-struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
-EXPORT_SYMBOL_GPL(blkio_root_cgroup);
+struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
+EXPORT_SYMBOL_GPL(blkcg_root);
 
-/* for encoding cft->private value on file */
-#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
-/* What policy owns the file, proportional or throttle */
-#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
-#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
+static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
-static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
-					    struct blkio_policy_node *pn)
+struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
 {
-	list_add(&pn->node, &blkcg->policy_list);
+	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
+			    struct blkcg, css);
 }
+EXPORT_SYMBOL_GPL(cgroup_to_blkcg);
 
-static inline bool cftype_blkg_same_policy(struct cftype *cft,
-			struct blkio_group *blkg)
+static struct blkcg *task_blkcg(struct task_struct *tsk)
 {
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-
-	if (blkg->plid == plid)
-		return 1;
-
-	return 0;
+	return container_of(task_subsys_state(tsk, blkio_subsys_id),
+			    struct blkcg, css);
 }
 
-/* Determines if policy node matches cgroup file being accessed */
-static inline bool pn_matches_cftype(struct cftype *cft,
-			struct blkio_policy_node *pn)
+struct blkcg *bio_blkcg(struct bio *bio)
 {
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-	int fileid = BLKIOFILE_ATTR(cft->private);
-
-	return (plid == pn->plid && fileid == pn->fileid);
+	if (bio && bio->bi_css)
+		return container_of(bio->bi_css, struct blkcg, css);
+	return task_blkcg(current);
 }
+EXPORT_SYMBOL_GPL(bio_blkcg);
 
-/* Must be called with blkcg->lock held */
-static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
+static bool blkcg_policy_enabled(struct request_queue *q,
+				 const struct blkcg_policy *pol)
 {
-	list_del(&pn->node);
+	return pol && test_bit(pol->plid, q->blkcg_pols);
 }
 
-/* Must be called with blkcg->lock held */
-static struct blkio_policy_node *
-blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
-		enum blkio_policy_id plid, int fileid)
+/**
+ * blkg_free - free a blkg
+ * @blkg: blkg to free
+ *
+ * Free @blkg which may be partially allocated.
+ */
+static void blkg_free(struct blkcg_gq *blkg)
 {
-	struct blkio_policy_node *pn;
-
-	list_for_each_entry(pn, &blkcg->policy_list, node) {
-		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
-			return pn;
-	}
+	int i;
 
-	return NULL;
-}
+	if (!blkg)
+		return;
 
-struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
-{
-	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
-			    struct blkio_cgroup, css);
-}
-EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+		struct blkg_policy_data *pd = blkg->pd[i];
 
-struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
-{
-	return container_of(task_subsys_state(tsk, blkio_subsys_id),
-			    struct blkio_cgroup, css);
-}
-EXPORT_SYMBOL_GPL(task_blkio_cgroup);
+		if (!pd)
+			continue;
 
-static inline void
-blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
-{
-	struct blkio_policy_type *blkiop;
+		if (pol && pol->pd_exit_fn)
+			pol->pd_exit_fn(blkg);
 
-	list_for_each_entry(blkiop, &blkio_list, list) {
-		/* If this policy does not own the blkg, do not send updates */
-		if (blkiop->plid != blkg->plid)
-			continue;
-		if (blkiop->ops.blkio_update_group_weight_fn)
-			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
-							blkg, weight);
+		kfree(pd);
 	}
+
+	kfree(blkg);
 }
 
-static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
-				int fileid)
+/**
+ * blkg_alloc - allocate a blkg
+ * @blkcg: block cgroup the new blkg is associated with
+ * @q: request_queue the new blkg is associated with
+ *
+ * Allocate a new blkg assocating @blkcg and @q.
+ */
+static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
 {
-	struct blkio_policy_type *blkiop;
-
-	list_for_each_entry(blkiop, &blkio_list, list) {
-
-		/* If this policy does not own the blkg, do not send updates */
-		if (blkiop->plid != blkg->plid)
-			continue;
-
-		if (fileid == BLKIO_THROTL_read_bps_device
-		    && blkiop->ops.blkio_update_group_read_bps_fn)
-			blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
-								blkg, bps);
+	struct blkcg_gq *blkg;
+	int i;
 
-		if (fileid == BLKIO_THROTL_write_bps_device
-		    && blkiop->ops.blkio_update_group_write_bps_fn)
-			blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
-								blkg, bps);
-	}
-}
+	/* alloc and init base part */
+	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
+	if (!blkg)
+		return NULL;
 
-static inline void blkio_update_group_iops(struct blkio_group *blkg,
-			unsigned int iops, int fileid)
-{
-	struct blkio_policy_type *blkiop;
+	blkg->q = q;
+	INIT_LIST_HEAD(&blkg->q_node);
+	blkg->blkcg = blkcg;
+	blkg->refcnt = 1;
 
-	list_for_each_entry(blkiop, &blkio_list, list) {
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+		struct blkg_policy_data *pd;
 
-		/* If this policy does not own the blkg, do not send updates */
-		if (blkiop->plid != blkg->plid)
+		if (!blkcg_policy_enabled(q, pol))
 			continue;
 
-		if (fileid == BLKIO_THROTL_read_iops_device
-		    && blkiop->ops.blkio_update_group_read_iops_fn)
-			blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
-								blkg, iops);
+		/* alloc per-policy data and attach it to blkg */
+		pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node);
+		if (!pd) {
+			blkg_free(blkg);
+			return NULL;
+		}
 
-		if (fileid == BLKIO_THROTL_write_iops_device
-		    && blkiop->ops.blkio_update_group_write_iops_fn)
-			blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
-								blkg,iops);
+		blkg->pd[i] = pd;
+		pd->blkg = blkg;
 	}
-}
 
-/*
- * Add to the appropriate stat variable depending on the request type.
- * This should be called with the blkg->stats_lock held.
- */
-static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
-				bool sync)
-{
-	if (direction)
-		stat[BLKIO_STAT_WRITE] += add;
-	else
-		stat[BLKIO_STAT_READ] += add;
-	if (sync)
-		stat[BLKIO_STAT_SYNC] += add;
-	else
-		stat[BLKIO_STAT_ASYNC] += add;
-}
+	/* invoke per-policy init */
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
 
-/*
- * Decrements the appropriate stat variable if non-zero depending on the
- * request type. Panics on value being zero.
- * This should be called with the blkg->stats_lock held.
- */
-static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
-{
-	if (direction) {
-		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
-		stat[BLKIO_STAT_WRITE]--;
-	} else {
-		BUG_ON(stat[BLKIO_STAT_READ] == 0);
-		stat[BLKIO_STAT_READ]--;
-	}
-	if (sync) {
-		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
-		stat[BLKIO_STAT_SYNC]--;
-	} else {
-		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
-		stat[BLKIO_STAT_ASYNC]--;
+		if (blkcg_policy_enabled(blkg->q, pol))
+			pol->pd_init_fn(blkg);
 	}
-}
 
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-/* This should be called with the blkg->stats_lock held. */
-static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
-						struct blkio_group *curr_blkg)
-{
-	if (blkio_blkg_waiting(&blkg->stats))
-		return;
-	if (blkg == curr_blkg)
-		return;
-	blkg->stats.start_group_wait_time = sched_clock();
-	blkio_mark_blkg_waiting(&blkg->stats);
+	return blkg;
 }
 
-/* This should be called with the blkg->stats_lock held. */
-static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
+static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+				      struct request_queue *q)
 {
-	unsigned long long now;
+	struct blkcg_gq *blkg;
 
-	if (!blkio_blkg_waiting(stats))
-		return;
+	blkg = rcu_dereference(blkcg->blkg_hint);
+	if (blkg && blkg->q == q)
+		return blkg;
 
-	now = sched_clock();
-	if (time_after64(now, stats->start_group_wait_time))
-		stats->group_wait_time += now - stats->start_group_wait_time;
-	blkio_clear_blkg_waiting(stats);
+	/*
+	 * Hint didn't match.  Look up from the radix tree.  Note that we
+	 * may not be holding queue_lock and thus are not sure whether
+	 * @blkg from blkg_tree has already been removed or not, so we
+	 * can't update hint to the lookup result.  Leave it to the caller.
+	 */
+	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
+	if (blkg && blkg->q == q)
+		return blkg;
+
+	return NULL;
 }
 
-/* This should be called with the blkg->stats_lock held. */
-static void blkio_end_empty_time(struct blkio_group_stats *stats)
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair.  This function should be called
+ * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
+ * - see blk_queue_bypass_start() for details.
+ */
+struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
 {
-	unsigned long long now;
-
-	if (!blkio_blkg_empty(stats))
-		return;
+	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	now = sched_clock();
-	if (time_after64(now, stats->start_empty_time))
-		stats->empty_time += now - stats->start_empty_time;
-	blkio_clear_blkg_empty(stats);
+	if (unlikely(blk_queue_bypass(q)))
+		return NULL;
+	return __blkg_lookup(blkcg, q);
 }
+EXPORT_SYMBOL_GPL(blkg_lookup);
 
-void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
+					     struct request_queue *q)
+	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
-	unsigned long flags;
+	struct blkcg_gq *blkg;
+	int ret;
 
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	BUG_ON(blkio_blkg_idling(&blkg->stats));
-	blkg->stats.start_idle_time = sched_clock();
-	blkio_mark_blkg_idling(&blkg->stats);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	lockdep_assert_held(q->queue_lock);
 
-void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
-{
-	unsigned long flags;
-	unsigned long long now;
-	struct blkio_group_stats *stats;
-
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &blkg->stats;
-	if (blkio_blkg_idling(stats)) {
-		now = sched_clock();
-		if (time_after64(now, stats->start_idle_time))
-			stats->idle_time += now - stats->start_idle_time;
-		blkio_clear_blkg_idling(stats);
+	/* lookup and update hint on success, see __blkg_lookup() for details */
+	blkg = __blkg_lookup(blkcg, q);
+	if (blkg) {
+		rcu_assign_pointer(blkcg->blkg_hint, blkg);
+		return blkg;
 	}
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
 
-void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
-{
-	unsigned long flags;
-	struct blkio_group_stats *stats;
-
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &blkg->stats;
-	stats->avg_queue_size_sum +=
-			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
-			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
-	stats->avg_queue_size_samples++;
-	blkio_update_group_wait_time(stats);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
+	/* blkg holds a reference to blkcg */
+	if (!css_tryget(&blkcg->css))
+		return ERR_PTR(-EINVAL);
 
-void blkiocg_set_start_empty_time(struct blkio_group *blkg)
-{
-	unsigned long flags;
-	struct blkio_group_stats *stats;
+	/* allocate */
+	ret = -ENOMEM;
+	blkg = blkg_alloc(blkcg, q);
+	if (unlikely(!blkg))
+		goto err_put;
 
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &blkg->stats;
+	/* insert */
+	ret = radix_tree_preload(GFP_ATOMIC);
+	if (ret)
+		goto err_free;
 
-	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
-			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
-		spin_unlock_irqrestore(&blkg->stats_lock, flags);
-		return;
+	spin_lock(&blkcg->lock);
+	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
+	if (likely(!ret)) {
+		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
+		list_add(&blkg->q_node, &q->blkg_list);
 	}
+	spin_unlock(&blkcg->lock);
 
-	/*
-	 * group is already marked empty. This can happen if cfqq got new
-	 * request in parent group and moved to this group while being added
-	 * to service tree. Just ignore the event and move on.
-	 */
-	if(blkio_blkg_empty(stats)) {
-		spin_unlock_irqrestore(&blkg->stats_lock, flags);
-		return;
-	}
+	radix_tree_preload_end();
 
-	stats->start_empty_time = sched_clock();
-	blkio_mark_blkg_empty(stats);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+	if (!ret)
+		return blkg;
+err_free:
+	blkg_free(blkg);
+err_put:
+	css_put(&blkcg->css);
+	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
 
-void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-			unsigned long dequeue)
+struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+				    struct request_queue *q)
 {
-	blkg->stats.dequeue += dequeue;
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
-#else
-static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
-					struct blkio_group *curr_blkg) {}
-static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
-#endif
-
-void blkiocg_update_io_add_stats(struct blkio_group *blkg,
-			struct blkio_group *curr_blkg, bool direction,
-			bool sync)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
-			sync);
-	blkio_end_empty_time(&blkg->stats);
-	blkio_set_start_group_wait_time(blkg, curr_blkg);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+	/*
+	 * This could be the first entry point of blkcg implementation and
+	 * we shouldn't allow anything to go through for a bypassing queue.
+	 */
+	if (unlikely(blk_queue_bypass(q)))
+		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
+	return __blkg_lookup_create(blkcg, q);
 }
-EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
+EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
-void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-						bool direction, bool sync)
+static void blkg_destroy(struct blkcg_gq *blkg)
 {
-	unsigned long flags;
+	struct request_queue *q = blkg->q;
+	struct blkcg *blkcg = blkg->blkcg;
 
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
-					direction, sync);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
+	lockdep_assert_held(q->queue_lock);
+	lockdep_assert_held(&blkcg->lock);
 
-void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
-				unsigned long unaccounted_time)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	blkg->stats.time += time;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	blkg->stats.unaccounted_time += unaccounted_time;
-#endif
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
+	/* Something wrong if we are trying to remove same group twice */
+	WARN_ON_ONCE(list_empty(&blkg->q_node));
+	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 
-/*
- * should be called under rcu read lock or queue lock to make sure blkg pointer
- * is valid.
- */
-void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-				uint64_t bytes, bool direction, bool sync)
-{
-	struct blkio_group_stats_cpu *stats_cpu;
-	unsigned long flags;
+	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
+	list_del_init(&blkg->q_node);
+	hlist_del_init_rcu(&blkg->blkcg_node);
 
 	/*
-	 * Disabling interrupts to provide mutual exclusion between two
-	 * writes on same cpu. It probably is not needed for 64bit. Not
-	 * optimizing that case yet.
+	 * Both setting lookup hint to and clearing it from @blkg are done
+	 * under queue_lock.  If it's not pointing to @blkg now, it never
+	 * will.  Hint assignment itself can race safely.
 	 */
-	local_irq_save(flags);
-
-	stats_cpu = this_cpu_ptr(blkg->stats_cpu);
-
-	u64_stats_update_begin(&stats_cpu->syncp);
-	stats_cpu->sectors += bytes >> 9;
-	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
-			1, direction, sync);
-	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
-			bytes, direction, sync);
-	u64_stats_update_end(&stats_cpu->syncp);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
-
-void blkiocg_update_completion_stats(struct blkio_group *blkg,
-	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
-{
-	struct blkio_group_stats *stats;
-	unsigned long flags;
-	unsigned long long now = sched_clock();
-
-	spin_lock_irqsave(&blkg->stats_lock, flags);
-	stats = &blkg->stats;
-	if (time_after64(now, io_start_time))
-		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
-				now - io_start_time, direction, sync);
-	if (time_after64(io_start_time, start_time))
-		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
-				io_start_time - start_time, direction, sync);
-	spin_unlock_irqrestore(&blkg->stats_lock, flags);
-}
-EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
-
-/*  Merged stats are per cpu.  */
-void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
-					bool sync)
-{
-	struct blkio_group_stats_cpu *stats_cpu;
-	unsigned long flags;
+	if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
+		rcu_assign_pointer(blkcg->blkg_hint, NULL);
 
 	/*
-	 * Disabling interrupts to provide mutual exclusion between two
-	 * writes on same cpu. It probably is not needed for 64bit. Not
-	 * optimizing that case yet.
+	 * Put the reference taken at the time of creation so that when all
+	 * queues are gone, group can be destroyed.
 	 */
-	local_irq_save(flags);
-
-	stats_cpu = this_cpu_ptr(blkg->stats_cpu);
-
-	u64_stats_update_begin(&stats_cpu->syncp);
-	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
-				direction, sync);
-	u64_stats_update_end(&stats_cpu->syncp);
-	local_irq_restore(flags);
+	blkg_put(blkg);
 }
-EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
-/*
- * This function allocates the per cpu stats for blkio_group. Should be called
- * from sleepable context as alloc_per_cpu() requires that.
+/**
+ * blkg_destroy_all - destroy all blkgs associated with a request_queue
+ * @q: request_queue of interest
+ *
+ * Destroy all blkgs associated with @q.
  */
-int blkio_alloc_blkg_stats(struct blkio_group *blkg)
+static void blkg_destroy_all(struct request_queue *q)
 {
-	/* Allocate memory for per cpu stats */
-	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
-	if (!blkg->stats_cpu)
-		return -ENOMEM;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
+	struct blkcg_gq *blkg, *n;
 
-void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-		struct blkio_group *blkg, void *key, dev_t dev,
-		enum blkio_policy_id plid)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-	spin_lock_init(&blkg->stats_lock);
-	rcu_assign_pointer(blkg->key, key);
-	blkg->blkcg_id = css_id(&blkcg->css);
-	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
-	blkg->plid = plid;
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-	/* Need to take css reference ? */
-	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
-	blkg->dev = dev;
-}
-EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
+	lockdep_assert_held(q->queue_lock);
 
-static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	hlist_del_init_rcu(&blkg->blkcg_node);
-	blkg->blkcg_id = 0;
-}
+	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
+		struct blkcg *blkcg = blkg->blkcg;
 
-/*
- * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
- * indicating that blk_group was unhashed by the time we got to it.
- */
-int blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	struct blkio_cgroup *blkcg;
-	unsigned long flags;
-	struct cgroup_subsys_state *css;
-	int ret = 1;
-
-	rcu_read_lock();
-	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
-	if (css) {
-		blkcg = container_of(css, struct blkio_cgroup, css);
-		spin_lock_irqsave(&blkcg->lock, flags);
-		if (!hlist_unhashed(&blkg->blkcg_node)) {
-			__blkiocg_del_blkio_group(blkg);
-			ret = 0;
-		}
-		spin_unlock_irqrestore(&blkcg->lock, flags);
+		spin_lock(&blkcg->lock);
+		blkg_destroy(blkg);
+		spin_unlock(&blkcg->lock);
 	}
-
-	rcu_read_unlock();
-	return ret;
 }
-EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
 
-/* called under rcu_read_lock(). */
-struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
+static void blkg_rcu_free(struct rcu_head *rcu_head)
 {
-	struct blkio_group *blkg;
-	struct hlist_node *n;
-	void *__key;
-
-	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		__key = blkg->key;
-		if (__key == key)
-			return blkg;
-	}
-
-	return NULL;
+	blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
 }
-EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
 
-static void blkio_reset_stats_cpu(struct blkio_group *blkg)
+void __blkg_release(struct blkcg_gq *blkg)
 {
-	struct blkio_group_stats_cpu *stats_cpu;
-	int i, j, k;
+	/* release the extra blkcg reference this blkg has been holding */
+	css_put(&blkg->blkcg->css);
+
 	/*
-	 * Note: On 64 bit arch this should not be an issue. This has the
-	 * possibility of returning some inconsistent value on 32bit arch
-	 * as 64bit update on 32bit is non atomic. Taking care of this
-	 * corner case makes code very complicated, like sending IPIs to
-	 * cpus, taking care of stats of offline cpus etc.
+	 * A group is freed in rcu manner. But having an rcu lock does not
+	 * mean that one can access all the fields of blkg and assume these
+	 * are valid. For example, don't try to follow throtl_data and
+	 * request queue links.
 	 *
-	 * reset stats is anyway more of a debug feature and this sounds a
-	 * corner case. So I am not complicating the code yet until and
-	 * unless this becomes a real issue.
+	 * Having a reference to blkg under an rcu allows acess to only
+	 * values local to groups like group stats and group rate limits
 	 */
-	for_each_possible_cpu(i) {
-		stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
-		stats_cpu->sectors = 0;
-		for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
-			for (k = 0; k < BLKIO_STAT_TOTAL; k++)
-				stats_cpu->stat_arr_cpu[j][k] = 0;
-	}
+	call_rcu(&blkg->rcu_head, blkg_rcu_free);
 }
+EXPORT_SYMBOL_GPL(__blkg_release);
 
-static int
-blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
+static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
+			     u64 val)
 {
-	struct blkio_cgroup *blkcg;
-	struct blkio_group *blkg;
-	struct blkio_group_stats *stats;
+	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
+	struct blkcg_gq *blkg;
 	struct hlist_node *n;
-	uint64_t queued[BLKIO_STAT_TOTAL];
 	int i;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	bool idling, waiting, empty;
-	unsigned long long now = sched_clock();
-#endif
 
-	blkcg = cgroup_to_blkio_cgroup(cgroup);
+	mutex_lock(&blkcg_pol_mutex);
 	spin_lock_irq(&blkcg->lock);
-	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		spin_lock(&blkg->stats_lock);
-		stats = &blkg->stats;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-		idling = blkio_blkg_idling(stats);
-		waiting = blkio_blkg_waiting(stats);
-		empty = blkio_blkg_empty(stats);
-#endif
-		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
-			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
-		memset(stats, 0, sizeof(struct blkio_group_stats));
-		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
-			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-		if (idling) {
-			blkio_mark_blkg_idling(stats);
-			stats->start_idle_time = now;
-		}
-		if (waiting) {
-			blkio_mark_blkg_waiting(stats);
-			stats->start_group_wait_time = now;
-		}
-		if (empty) {
-			blkio_mark_blkg_empty(stats);
-			stats->start_empty_time = now;
-		}
-#endif
-		spin_unlock(&blkg->stats_lock);
-
-		/* Reset Per cpu stats which don't take blkg->stats_lock */
-		blkio_reset_stats_cpu(blkg);
-	}
-
-	spin_unlock_irq(&blkcg->lock);
-	return 0;
-}
-
-static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
-				int chars_left, bool diskname_only)
-{
-	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
-	chars_left -= strlen(str);
-	if (chars_left <= 0) {
-		printk(KERN_WARNING
-			"Possibly incorrect cgroup stat display format");
-		return;
-	}
-	if (diskname_only)
-		return;
-	switch (type) {
-	case BLKIO_STAT_READ:
-		strlcat(str, " Read", chars_left);
-		break;
-	case BLKIO_STAT_WRITE:
-		strlcat(str, " Write", chars_left);
-		break;
-	case BLKIO_STAT_SYNC:
-		strlcat(str, " Sync", chars_left);
-		break;
-	case BLKIO_STAT_ASYNC:
-		strlcat(str, " Async", chars_left);
-		break;
-	case BLKIO_STAT_TOTAL:
-		strlcat(str, " Total", chars_left);
-		break;
-	default:
-		strlcat(str, " Invalid", chars_left);
-	}
-}
-
-static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
-				struct cgroup_map_cb *cb, dev_t dev)
-{
-	blkio_get_key_name(0, dev, str, chars_left, true);
-	cb->fill(cb, str, val);
-	return val;
-}
-
-
-static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
-			enum stat_type_cpu type, enum stat_sub_type sub_type)
-{
-	int cpu;
-	struct blkio_group_stats_cpu *stats_cpu;
-	u64 val = 0, tval;
-
-	for_each_possible_cpu(cpu) {
-		unsigned int start;
-		stats_cpu  = per_cpu_ptr(blkg->stats_cpu, cpu);
-
-		do {
-			start = u64_stats_fetch_begin(&stats_cpu->syncp);
-			if (type == BLKIO_STAT_CPU_SECTORS)
-				tval = stats_cpu->sectors;
-			else
-				tval = stats_cpu->stat_arr_cpu[type][sub_type];
-		} while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
-
-		val += tval;
-	}
-
-	return val;
-}
-
-static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
-		struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
-{
-	uint64_t disk_total, val;
-	char key_str[MAX_KEY_LEN];
-	enum stat_sub_type sub_type;
 
-	if (type == BLKIO_STAT_CPU_SECTORS) {
-		val = blkio_read_stat_cpu(blkg, type, 0);
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
-	}
-
-	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
-			sub_type++) {
-		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
-		val = blkio_read_stat_cpu(blkg, type, sub_type);
-		cb->fill(cb, key_str, val);
-	}
-
-	disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
-			blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
-
-	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
-	cb->fill(cb, key_str, disk_total);
-	return disk_total;
-}
-
-/* This should be called with blkg->stats_lock held */
-static uint64_t blkio_get_stat(struct blkio_group *blkg,
-		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
-{
-	uint64_t disk_total;
-	char key_str[MAX_KEY_LEN];
-	enum stat_sub_type sub_type;
-
-	if (type == BLKIO_STAT_TIME)
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.time, cb, dev);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.unaccounted_time, cb, dev);
-	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
-		uint64_t sum = blkg->stats.avg_queue_size_sum;
-		uint64_t samples = blkg->stats.avg_queue_size_samples;
-		if (samples)
-			do_div(sum, samples);
-		else
-			sum = 0;
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
-	}
-	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.group_wait_time, cb, dev);
-	if (type == BLKIO_STAT_IDLE_TIME)
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.idle_time, cb, dev);
-	if (type == BLKIO_STAT_EMPTY_TIME)
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.empty_time, cb, dev);
-	if (type == BLKIO_STAT_DEQUEUE)
-		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
-					blkg->stats.dequeue, cb, dev);
-#endif
-
-	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
-			sub_type++) {
-		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
-		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
-	}
-	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
-			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
-	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
-	cb->fill(cb, key_str, disk_total);
-	return disk_total;
-}
-
-static int blkio_policy_parse_and_set(char *buf,
-	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
-{
-	struct gendisk *disk = NULL;
-	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
-	unsigned long major, minor;
-	int i = 0, ret = -EINVAL;
-	int part;
-	dev_t dev;
-	u64 temp;
-
-	memset(s, 0, sizeof(s));
-
-	while ((p = strsep(&buf, " ")) != NULL) {
-		if (!*p)
-			continue;
-
-		s[i++] = p;
-
-		/* Prevent from inputing too many things */
-		if (i == 3)
-			break;
-	}
-
-	if (i != 2)
-		goto out;
-
-	p = strsep(&s[0], ":");
-	if (p != NULL)
-		major_s = p;
-	else
-		goto out;
-
-	minor_s = s[0];
-	if (!minor_s)
-		goto out;
-
-	if (strict_strtoul(major_s, 10, &major))
-		goto out;
-
-	if (strict_strtoul(minor_s, 10, &minor))
-		goto out;
-
-	dev = MKDEV(major, minor);
-
-	if (strict_strtoull(s[1], 10, &temp))
-		goto out;
-
-	/* For rule removal, do not check for device presence. */
-	if (temp) {
-		disk = get_gendisk(dev, &part);
-		if (!disk || part) {
-			ret = -ENODEV;
-			goto out;
-		}
-	}
-
-	newpn->dev = dev;
-
-	switch (plid) {
-	case BLKIO_POLICY_PROP:
-		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
-		     temp > BLKIO_WEIGHT_MAX)
-			goto out;
-
-		newpn->plid = plid;
-		newpn->fileid = fileid;
-		newpn->val.weight = temp;
-		break;
-	case BLKIO_POLICY_THROTL:
-		switch(fileid) {
-		case BLKIO_THROTL_read_bps_device:
-		case BLKIO_THROTL_write_bps_device:
-			newpn->plid = plid;
-			newpn->fileid = fileid;
-			newpn->val.bps = temp;
-			break;
-		case BLKIO_THROTL_read_iops_device:
-		case BLKIO_THROTL_write_iops_device:
-			if (temp > THROTL_IOPS_MAX)
-				goto out;
-
-			newpn->plid = plid;
-			newpn->fileid = fileid;
-			newpn->val.iops = (unsigned int)temp;
-			break;
-		}
-		break;
-	default:
-		BUG();
-	}
-	ret = 0;
-out:
-	put_disk(disk);
-	return ret;
-}
-
-unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
-			      dev_t dev)
-{
-	struct blkio_policy_node *pn;
-	unsigned long flags;
-	unsigned int weight;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-
-	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
-				BLKIO_PROP_weight_device);
-	if (pn)
-		weight = pn->val.weight;
-	else
-		weight = blkcg->weight;
-
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return weight;
-}
-EXPORT_SYMBOL_GPL(blkcg_get_weight);
-
-uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
-{
-	struct blkio_policy_node *pn;
-	unsigned long flags;
-	uint64_t bps = -1;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_read_bps_device);
-	if (pn)
-		bps = pn->val.bps;
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return bps;
-}
-
-uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
-{
-	struct blkio_policy_node *pn;
-	unsigned long flags;
-	uint64_t bps = -1;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_write_bps_device);
-	if (pn)
-		bps = pn->val.bps;
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return bps;
-}
-
-unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
-{
-	struct blkio_policy_node *pn;
-	unsigned long flags;
-	unsigned int iops = -1;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_read_iops_device);
-	if (pn)
-		iops = pn->val.iops;
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return iops;
-}
-
-unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
-{
-	struct blkio_policy_node *pn;
-	unsigned long flags;
-	unsigned int iops = -1;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_write_iops_device);
-	if (pn)
-		iops = pn->val.iops;
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return iops;
-}
+	/*
+	 * Note that stat reset is racy - it doesn't synchronize against
+	 * stat updates.  This is a debug feature which shouldn't exist
+	 * anyway.  If you get hit by a race, retry.
+	 */
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		for (i = 0; i < BLKCG_MAX_POLS; i++) {
+			struct blkcg_policy *pol = blkcg_policy[i];
 
-/* Checks whether user asked for deleting a policy rule */
-static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
-{
-	switch(pn->plid) {
-	case BLKIO_POLICY_PROP:
-		if (pn->val.weight == 0)
-			return 1;
-		break;
-	case BLKIO_POLICY_THROTL:
-		switch(pn->fileid) {
-		case BLKIO_THROTL_read_bps_device:
-		case BLKIO_THROTL_write_bps_device:
-			if (pn->val.bps == 0)
-				return 1;
-			break;
-		case BLKIO_THROTL_read_iops_device:
-		case BLKIO_THROTL_write_iops_device:
-			if (pn->val.iops == 0)
-				return 1;
+			if (blkcg_policy_enabled(blkg->q, pol) &&
+			    pol->pd_reset_stats_fn)
+				pol->pd_reset_stats_fn(blkg);
 		}
-		break;
-	default:
-		BUG();
 	}
 
+	spin_unlock_irq(&blkcg->lock);
+	mutex_unlock(&blkcg_pol_mutex);
 	return 0;
 }
 
-static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
-					struct blkio_policy_node *newpn)
-{
-	switch(oldpn->plid) {
-	case BLKIO_POLICY_PROP:
-		oldpn->val.weight = newpn->val.weight;
-		break;
-	case BLKIO_POLICY_THROTL:
-		switch(newpn->fileid) {
-		case BLKIO_THROTL_read_bps_device:
-		case BLKIO_THROTL_write_bps_device:
-			oldpn->val.bps = newpn->val.bps;
-			break;
-		case BLKIO_THROTL_read_iops_device:
-		case BLKIO_THROTL_write_iops_device:
-			oldpn->val.iops = newpn->val.iops;
-		}
-		break;
-	default:
-		BUG();
-	}
-}
-
-/*
- * Some rules/values in blkg have changed. Propagate those to respective
- * policies.
- */
-static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
-		struct blkio_group *blkg, struct blkio_policy_node *pn)
+static const char *blkg_dev_name(struct blkcg_gq *blkg)
 {
-	unsigned int weight, iops;
-	u64 bps;
-
-	switch(pn->plid) {
-	case BLKIO_POLICY_PROP:
-		weight = pn->val.weight ? pn->val.weight :
-				blkcg->weight;
-		blkio_update_group_weight(blkg, weight);
-		break;
-	case BLKIO_POLICY_THROTL:
-		switch(pn->fileid) {
-		case BLKIO_THROTL_read_bps_device:
-		case BLKIO_THROTL_write_bps_device:
-			bps = pn->val.bps ? pn->val.bps : (-1);
-			blkio_update_group_bps(blkg, bps, pn->fileid);
-			break;
-		case BLKIO_THROTL_read_iops_device:
-		case BLKIO_THROTL_write_iops_device:
-			iops = pn->val.iops ? pn->val.iops : (-1);
-			blkio_update_group_iops(blkg, iops, pn->fileid);
-			break;
-		}
-		break;
-	default:
-		BUG();
-	}
+	/* some drivers (floppy) instantiate a queue w/o disk registered */
+	if (blkg->q->backing_dev_info.dev)
+		return dev_name(blkg->q->backing_dev_info.dev);
+	return NULL;
 }
 
-/*
- * A policy node rule has been updated. Propagate this update to all the
- * block groups which might be affected by this update.
+/**
+ * blkcg_print_blkgs - helper for printing per-blkg data
+ * @sf: seq_file to print to
+ * @blkcg: blkcg of interest
+ * @prfill: fill function to print out a blkg
+ * @pol: policy in question
+ * @data: data to be passed to @prfill
+ * @show_total: to print out sum of prfill return values or not
+ *
+ * This function invokes @prfill on each blkg of @blkcg if pd for the
+ * policy specified by @pol exists.  @prfill is invoked with @sf, the
+ * policy data and @data.  If @show_total is %true, the sum of the return
+ * values from @prfill is printed with "Total" label at the end.
+ *
+ * This is to be used to construct print functions for
+ * cftype->read_seq_string method.
  */
-static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
-				struct blkio_policy_node *pn)
+void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
+		       u64 (*prfill)(struct seq_file *,
+				     struct blkg_policy_data *, int),
+		       const struct blkcg_policy *pol, int data,
+		       bool show_total)
 {
-	struct blkio_group *blkg;
+	struct blkcg_gq *blkg;
 	struct hlist_node *n;
+	u64 total = 0;
 
-	spin_lock(&blkio_list_lock);
 	spin_lock_irq(&blkcg->lock);
-
-	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
-			continue;
-		blkio_update_blkg_policy(blkcg, blkg, pn);
-	}
-
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
+		if (blkcg_policy_enabled(blkg->q, pol))
+			total += prfill(sf, blkg->pd[pol->plid], data);
 	spin_unlock_irq(&blkcg->lock);
-	spin_unlock(&blkio_list_lock);
+
+	if (show_total)
+		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
 }
+EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
 
-static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
- 				       const char *buffer)
+/**
+ * __blkg_prfill_u64 - prfill helper for a single u64 value
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @v: value to print
+ *
+ * Print @v to @sf for the device assocaited with @pd.
+ */
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
 {
-	int ret = 0;
-	char *buf;
-	struct blkio_policy_node *newpn, *pn;
-	struct blkio_cgroup *blkcg;
-	int keep_newpn = 0;
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-	int fileid = BLKIOFILE_ATTR(cft->private);
-
-	buf = kstrdup(buffer, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
-	if (!newpn) {
-		ret = -ENOMEM;
-		goto free_buf;
-	}
-
-	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
-	if (ret)
-		goto free_newpn;
-
-	blkcg = cgroup_to_blkio_cgroup(cgrp);
-
-	spin_lock_irq(&blkcg->lock);
-
-	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
-	if (!pn) {
-		if (!blkio_delete_rule_command(newpn)) {
-			blkio_policy_insert_node(blkcg, newpn);
-			keep_newpn = 1;
-		}
-		spin_unlock_irq(&blkcg->lock);
-		goto update_io_group;
-	}
-
-	if (blkio_delete_rule_command(newpn)) {
-		blkio_policy_delete_node(pn);
-		kfree(pn);
-		spin_unlock_irq(&blkcg->lock);
-		goto update_io_group;
-	}
-	spin_unlock_irq(&blkcg->lock);
+	const char *dname = blkg_dev_name(pd->blkg);
 
-	blkio_update_policy_rule(pn, newpn);
+	if (!dname)
+		return 0;
 
-update_io_group:
-	blkio_update_policy_node_blkg(blkcg, newpn);
-
-free_newpn:
-	if (!keep_newpn)
-		kfree(newpn);
-free_buf:
-	kfree(buf);
-	return ret;
+	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
+	return v;
 }
+EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
 
-static void
-blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
-{
-	switch(pn->plid) {
-		case BLKIO_POLICY_PROP:
-			if (pn->fileid == BLKIO_PROP_weight_device)
-				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
-					MINOR(pn->dev), pn->val.weight);
-			break;
-		case BLKIO_POLICY_THROTL:
-			switch(pn->fileid) {
-			case BLKIO_THROTL_read_bps_device:
-			case BLKIO_THROTL_write_bps_device:
-				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
-					MINOR(pn->dev), pn->val.bps);
-				break;
-			case BLKIO_THROTL_read_iops_device:
-			case BLKIO_THROTL_write_iops_device:
-				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
-					MINOR(pn->dev), pn->val.iops);
-				break;
-			}
-			break;
-		default:
-			BUG();
-	}
-}
+/**
+ * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @rwstat: rwstat to print
+ *
+ * Print @rwstat to @sf for the device assocaited with @pd.
+ */
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+			 const struct blkg_rwstat *rwstat)
+{
+	static const char *rwstr[] = {
+		[BLKG_RWSTAT_READ]	= "Read",
+		[BLKG_RWSTAT_WRITE]	= "Write",
+		[BLKG_RWSTAT_SYNC]	= "Sync",
+		[BLKG_RWSTAT_ASYNC]	= "Async",
+	};
+	const char *dname = blkg_dev_name(pd->blkg);
+	u64 v;
+	int i;
 
-/* cgroup files which read their data from policy nodes end up here */
-static void blkio_read_policy_node_files(struct cftype *cft,
-			struct blkio_cgroup *blkcg, struct seq_file *m)
-{
-	struct blkio_policy_node *pn;
-
-	if (!list_empty(&blkcg->policy_list)) {
-		spin_lock_irq(&blkcg->lock);
-		list_for_each_entry(pn, &blkcg->policy_list, node) {
-			if (!pn_matches_cftype(cft, pn))
-				continue;
-			blkio_print_policy_node(m, pn);
-		}
-		spin_unlock_irq(&blkcg->lock);
-	}
-}
+	if (!dname)
+		return 0;
 
-static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
-				struct seq_file *m)
-{
-	struct blkio_cgroup *blkcg;
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-	int name = BLKIOFILE_ATTR(cft->private);
-
-	blkcg = cgroup_to_blkio_cgroup(cgrp);
-
-	switch(plid) {
-	case BLKIO_POLICY_PROP:
-		switch(name) {
-		case BLKIO_PROP_weight_device:
-			blkio_read_policy_node_files(cft, blkcg, m);
-			return 0;
-		default:
-			BUG();
-		}
-		break;
-	case BLKIO_POLICY_THROTL:
-		switch(name){
-		case BLKIO_THROTL_read_bps_device:
-		case BLKIO_THROTL_write_bps_device:
-		case BLKIO_THROTL_read_iops_device:
-		case BLKIO_THROTL_write_iops_device:
-			blkio_read_policy_node_files(cft, blkcg, m);
-			return 0;
-		default:
-			BUG();
-		}
-		break;
-	default:
-		BUG();
-	}
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
+			   (unsigned long long)rwstat->cnt[i]);
 
-	return 0;
+	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
+	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
+	return v;
 }
 
-static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
-		struct cftype *cft, struct cgroup_map_cb *cb,
-		enum stat_type type, bool show_total, bool pcpu)
+/**
+ * blkg_prfill_stat - prfill callback for blkg_stat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @off: offset to the blkg_stat in @pd
+ *
+ * prfill callback for printing a blkg_stat.
+ */
+u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
 {
-	struct blkio_group *blkg;
-	struct hlist_node *n;
-	uint64_t cgroup_total = 0;
-
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		if (blkg->dev) {
-			if (!cftype_blkg_same_policy(cft, blkg))
-				continue;
-			if (pcpu)
-				cgroup_total += blkio_get_stat_cpu(blkg, cb,
-						blkg->dev, type);
-			else {
-				spin_lock_irq(&blkg->stats_lock);
-				cgroup_total += blkio_get_stat(blkg, cb,
-						blkg->dev, type);
-				spin_unlock_irq(&blkg->stats_lock);
-			}
-		}
-	}
-	if (show_total)
-		cb->fill(cb, "Total", cgroup_total);
-	rcu_read_unlock();
-	return 0;
+	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
 }
+EXPORT_SYMBOL_GPL(blkg_prfill_stat);
 
-/* All map kind of cgroup file get serviced by this function */
-static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
-				struct cgroup_map_cb *cb)
+/**
+ * blkg_prfill_rwstat - prfill callback for blkg_rwstat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @off: offset to the blkg_rwstat in @pd
+ *
+ * prfill callback for printing a blkg_rwstat.
+ */
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+		       int off)
 {
-	struct blkio_cgroup *blkcg;
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-	int name = BLKIOFILE_ATTR(cft->private);
-
-	blkcg = cgroup_to_blkio_cgroup(cgrp);
-
-	switch(plid) {
-	case BLKIO_POLICY_PROP:
-		switch(name) {
-		case BLKIO_PROP_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_TIME, 0, 0);
-		case BLKIO_PROP_sectors:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_CPU_SECTORS, 0, 1);
-		case BLKIO_PROP_io_service_bytes:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
-		case BLKIO_PROP_io_serviced:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_CPU_SERVICED, 1, 1);
-		case BLKIO_PROP_io_service_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_SERVICE_TIME, 1, 0);
-		case BLKIO_PROP_io_wait_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_WAIT_TIME, 1, 0);
-		case BLKIO_PROP_io_merged:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_CPU_MERGED, 1, 1);
-		case BLKIO_PROP_io_queued:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_QUEUED, 1, 0);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-		case BLKIO_PROP_unaccounted_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-					BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
-		case BLKIO_PROP_dequeue:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_DEQUEUE, 0, 0);
-		case BLKIO_PROP_avg_queue_size:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-					BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
-		case BLKIO_PROP_group_wait_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-					BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
-		case BLKIO_PROP_idle_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_IDLE_TIME, 0, 0);
-		case BLKIO_PROP_empty_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_EMPTY_TIME, 0, 0);
-#endif
-		default:
-			BUG();
-		}
-		break;
-	case BLKIO_POLICY_THROTL:
-		switch(name){
-		case BLKIO_THROTL_io_service_bytes:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
-		case BLKIO_THROTL_io_serviced:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_CPU_SERVICED, 1, 1);
-		default:
-			BUG();
-		}
-		break;
-	default:
-		BUG();
-	}
+	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
 
-	return 0;
+	return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
+EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 
-static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
+/**
+ * blkg_conf_prep - parse and prepare for per-blkg config update
+ * @blkcg: target block cgroup
+ * @pol: target policy
+ * @input: input string
+ * @ctx: blkg_conf_ctx to be filled
+ *
+ * Parse per-blkg config update from @input and initialize @ctx with the
+ * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
+ * value.  This function returns with RCU read lock and queue lock held and
+ * must be paired with blkg_conf_finish().
+ */
+int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+		   const char *input, struct blkg_conf_ctx *ctx)
+	__acquires(rcu) __acquires(disk->queue->queue_lock)
 {
-	struct blkio_group *blkg;
-	struct hlist_node *n;
-	struct blkio_policy_node *pn;
+	struct gendisk *disk;
+	struct blkcg_gq *blkg;
+	unsigned int major, minor;
+	unsigned long long v;
+	int part, ret;
 
-	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
+	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
 		return -EINVAL;
 
-	spin_lock(&blkio_list_lock);
-	spin_lock_irq(&blkcg->lock);
-	blkcg->weight = (unsigned int)val;
-
-	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		pn = blkio_policy_search_node(blkcg, blkg->dev,
-				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
-		if (pn)
-			continue;
-
-		blkio_update_group_weight(blkg, blkcg->weight);
-	}
-	spin_unlock_irq(&blkcg->lock);
-	spin_unlock(&blkio_list_lock);
-	return 0;
-}
+	disk = get_gendisk(MKDEV(major, minor), &part);
+	if (!disk || part)
+		return -EINVAL;
 
-static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
-	struct blkio_cgroup *blkcg;
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-	int name = BLKIOFILE_ATTR(cft->private);
+	rcu_read_lock();
+	spin_lock_irq(disk->queue->queue_lock);
 
-	blkcg = cgroup_to_blkio_cgroup(cgrp);
+	if (blkcg_policy_enabled(disk->queue, pol))
+		blkg = blkg_lookup_create(blkcg, disk->queue);
+	else
+		blkg = ERR_PTR(-EINVAL);
 
-	switch(plid) {
-	case BLKIO_POLICY_PROP:
-		switch(name) {
-		case BLKIO_PROP_weight:
-			return (u64)blkcg->weight;
+	if (IS_ERR(blkg)) {
+		ret = PTR_ERR(blkg);
+		rcu_read_unlock();
+		spin_unlock_irq(disk->queue->queue_lock);
+		put_disk(disk);
+		/*
+		 * If queue was bypassing, we should retry.  Do so after a
+		 * short msleep().  It isn't strictly necessary but queue
+		 * can be bypassing for some time and it's always nice to
+		 * avoid busy looping.
+		 */
+		if (ret == -EBUSY) {
+			msleep(10);
+			ret = restart_syscall();
 		}
-		break;
-	default:
-		BUG();
+		return ret;
 	}
+
+	ctx->disk = disk;
+	ctx->blkg = blkg;
+	ctx->v = v;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(blkg_conf_prep);
 
-static int
-blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+/**
+ * blkg_conf_finish - finish up per-blkg config update
+ * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep()
+ *
+ * Finish up after per-blkg config update.  This function must be paired
+ * with blkg_conf_prep().
+ */
+void blkg_conf_finish(struct blkg_conf_ctx *ctx)
+	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
 {
-	struct blkio_cgroup *blkcg;
-	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
-	int name = BLKIOFILE_ATTR(cft->private);
-
-	blkcg = cgroup_to_blkio_cgroup(cgrp);
-
-	switch(plid) {
-	case BLKIO_POLICY_PROP:
-		switch(name) {
-		case BLKIO_PROP_weight:
-			return blkio_weight_write(blkcg, val);
-		}
-		break;
-	default:
-		BUG();
-	}
-
-	return 0;
+	spin_unlock_irq(ctx->disk->queue->queue_lock);
+	rcu_read_unlock();
+	put_disk(ctx->disk);
 }
+EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
-struct cftype blkio_files[] = {
-	{
-		.name = "weight_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_weight_device),
-		.read_seq_string = blkiocg_file_read,
-		.write_string = blkiocg_file_write,
-		.max_write_len = 256,
-	},
-	{
-		.name = "weight",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_weight),
-		.read_u64 = blkiocg_file_read_u64,
-		.write_u64 = blkiocg_file_write_u64,
-	},
-	{
-		.name = "time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_time),
-		.read_map = blkiocg_file_read_map,
-	},
-	{
-		.name = "sectors",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_sectors),
-		.read_map = blkiocg_file_read_map,
-	},
-	{
-		.name = "io_service_bytes",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_io_service_bytes),
-		.read_map = blkiocg_file_read_map,
-	},
-	{
-		.name = "io_serviced",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_io_serviced),
-		.read_map = blkiocg_file_read_map,
-	},
-	{
-		.name = "io_service_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_io_service_time),
-		.read_map = blkiocg_file_read_map,
-	},
-	{
-		.name = "io_wait_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_io_wait_time),
-		.read_map = blkiocg_file_read_map,
-	},
-	{
-		.name = "io_merged",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_io_merged),
-		.read_map = blkiocg_file_read_map,
-	},
-	{
-		.name = "io_queued",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_io_queued),
-		.read_map = blkiocg_file_read_map,
-	},
+struct cftype blkcg_files[] = {
 	{
 		.name = "reset_stats",
-		.write_u64 = blkiocg_reset_stats,
-	},
-#ifdef CONFIG_BLK_DEV_THROTTLING
-	{
-		.name = "throttle.read_bps_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_read_bps_device),
-		.read_seq_string = blkiocg_file_read,
-		.write_string = blkiocg_file_write,
-		.max_write_len = 256,
-	},
-
-	{
-		.name = "throttle.write_bps_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_write_bps_device),
-		.read_seq_string = blkiocg_file_read,
-		.write_string = blkiocg_file_write,
-		.max_write_len = 256,
-	},
-
-	{
-		.name = "throttle.read_iops_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_read_iops_device),
-		.read_seq_string = blkiocg_file_read,
-		.write_string = blkiocg_file_write,
-		.max_write_len = 256,
-	},
-
-	{
-		.name = "throttle.write_iops_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_write_iops_device),
-		.read_seq_string = blkiocg_file_read,
-		.write_string = blkiocg_file_write,
-		.max_write_len = 256,
-	},
-	{
-		.name = "throttle.io_service_bytes",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_io_service_bytes),
-		.read_map = blkiocg_file_read_map,
-	},
-	{
-		.name = "throttle.io_serviced",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_io_serviced),
-		.read_map = blkiocg_file_read_map,
-	},
-#endif /* CONFIG_BLK_DEV_THROTTLING */
-
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	{
-		.name = "avg_queue_size",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_avg_queue_size),
-		.read_map = blkiocg_file_read_map,
+		.write_u64 = blkcg_reset_stats,
 	},
-	{
-		.name = "group_wait_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_group_wait_time),
-		.read_map = blkiocg_file_read_map,
-	},
-	{
-		.name = "idle_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_idle_time),
-		.read_map = blkiocg_file_read_map,
-	},
-	{
-		.name = "empty_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_empty_time),
-		.read_map = blkiocg_file_read_map,
-	},
-	{
-		.name = "dequeue",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_dequeue),
-		.read_map = blkiocg_file_read_map,
-	},
-	{
-		.name = "unaccounted_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_unaccounted_time),
-		.read_map = blkiocg_file_read_map,
-	},
-#endif
 	{ }	/* terminate */
 };
 
-static void blkiocg_destroy(struct cgroup *cgroup)
+/**
+ * blkcg_pre_destroy - cgroup pre_destroy callback
+ * @cgroup: cgroup of interest
+ *
+ * This function is called when @cgroup is about to go away and responsible
+ * for shooting down all blkgs associated with @cgroup.  blkgs should be
+ * removed while holding both q and blkcg locks.  As blkcg lock is nested
+ * inside q lock, this function performs reverse double lock dancing.
+ *
+ * This is the blkcg counterpart of ioc_release_fn().
+ */
+static int blkcg_pre_destroy(struct cgroup *cgroup)
 {
-	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
-	unsigned long flags;
-	struct blkio_group *blkg;
-	void *key;
-	struct blkio_policy_type *blkiop;
-	struct blkio_policy_node *pn, *pntmp;
+	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
 
-	rcu_read_lock();
-	do {
-		spin_lock_irqsave(&blkcg->lock, flags);
+	spin_lock_irq(&blkcg->lock);
 
-		if (hlist_empty(&blkcg->blkg_list)) {
-			spin_unlock_irqrestore(&blkcg->lock, flags);
-			break;
+	while (!hlist_empty(&blkcg->blkg_list)) {
+		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
+						struct blkcg_gq, blkcg_node);
+		struct request_queue *q = blkg->q;
+
+		if (spin_trylock(q->queue_lock)) {
+			blkg_destroy(blkg);
+			spin_unlock(q->queue_lock);
+		} else {
+			spin_unlock_irq(&blkcg->lock);
+			cpu_relax();
+			spin_lock_irq(&blkcg->lock);
 		}
+	}
 
-		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
-					blkcg_node);
-		key = rcu_dereference(blkg->key);
-		__blkiocg_del_blkio_group(blkg);
-
-		spin_unlock_irqrestore(&blkcg->lock, flags);
-
-		/*
-		 * This blkio_group is being unlinked as associated cgroup is
-		 * going away. Let all the IO controlling policies know about
-		 * this event.
-		 */
-		spin_lock(&blkio_list_lock);
-		list_for_each_entry(blkiop, &blkio_list, list) {
-			if (blkiop->plid != blkg->plid)
-				continue;
-			blkiop->ops.blkio_unlink_group_fn(key, blkg);
-		}
-		spin_unlock(&blkio_list_lock);
-	} while (1);
+	spin_unlock_irq(&blkcg->lock);
+	return 0;
+}
 
-	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
-		blkio_policy_delete_node(pn);
-		kfree(pn);
-	}
+static void blkcg_destroy(struct cgroup *cgroup)
+{
+	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
 
-	free_css_id(&blkio_subsys, &blkcg->css);
-	rcu_read_unlock();
-	if (blkcg != &blkio_root_cgroup)
+	if (blkcg != &blkcg_root)
 		kfree(blkcg);
 }
 
-static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
+static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
 {
-	struct blkio_cgroup *blkcg;
+	static atomic64_t id_seq = ATOMIC64_INIT(0);
+	struct blkcg *blkcg;
 	struct cgroup *parent = cgroup->parent;
 
 	if (!parent) {
-		blkcg = &blkio_root_cgroup;
+		blkcg = &blkcg_root;
 		goto done;
 	}
 
@@ -1582,22 +624,68 @@ static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
 	if (!blkcg)
 		return ERR_PTR(-ENOMEM);
 
-	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
+	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
+	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
 done:
 	spin_lock_init(&blkcg->lock);
+	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 
-	INIT_LIST_HEAD(&blkcg->policy_list);
 	return &blkcg->css;
 }
 
+/**
+ * blkcg_init_queue - initialize blkcg part of request queue
+ * @q: request_queue to initialize
+ *
+ * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
+ * part of new request_queue @q.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int blkcg_init_queue(struct request_queue *q)
+{
+	might_sleep();
+
+	return blk_throtl_init(q);
+}
+
+/**
+ * blkcg_drain_queue - drain blkcg part of request_queue
+ * @q: request_queue to drain
+ *
+ * Called from blk_drain_queue().  Responsible for draining blkcg part.
+ */
+void blkcg_drain_queue(struct request_queue *q)
+{
+	lockdep_assert_held(q->queue_lock);
+
+	blk_throtl_drain(q);
+}
+
+/**
+ * blkcg_exit_queue - exit and release blkcg part of request_queue
+ * @q: request_queue being released
+ *
+ * Called from blk_release_queue().  Responsible for exiting blkcg part.
+ */
+void blkcg_exit_queue(struct request_queue *q)
+{
+	spin_lock_irq(q->queue_lock);
+	blkg_destroy_all(q);
+	spin_unlock_irq(q->queue_lock);
+
+	blk_throtl_exit(q);
+}
+
 /*
  * We cannot support shared io contexts, as we have no mean to support
  * two tasks with the same ioc in two different groups without major rework
  * of the main cic data structures.  For now we allow a task to change
  * its cgroup only if it's the only owner of its ioc.
  */
-static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	struct io_context *ioc;
@@ -1616,63 +704,213 @@ static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 	return ret;
 }
 
-static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-{
-	struct task_struct *task;
-	struct io_context *ioc;
-
-	cgroup_taskset_for_each(task, cgrp, tset) {
-		/* we don't lose anything even if ioc allocation fails */
-		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
-		if (ioc) {
-			ioc_cgroup_changed(ioc);
-			put_io_context(ioc);
-		}
-	}
-}
-
 struct cgroup_subsys blkio_subsys = {
 	.name = "blkio",
-	.create = blkiocg_create,
-	.can_attach = blkiocg_can_attach,
-	.attach = blkiocg_attach,
-	.destroy = blkiocg_destroy,
-#ifdef CONFIG_BLK_CGROUP
-	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
+	.create = blkcg_create,
+	.can_attach = blkcg_can_attach,
+	.pre_destroy = blkcg_pre_destroy,
+	.destroy = blkcg_destroy,
 	.subsys_id = blkio_subsys_id,
-#endif
-	.base_cftypes = blkio_files,
-	.use_id = 1,
+	.base_cftypes = blkcg_files,
 	.module = THIS_MODULE,
 };
 EXPORT_SYMBOL_GPL(blkio_subsys);
 
-void blkio_policy_register(struct blkio_policy_type *blkiop)
+/**
+ * blkcg_activate_policy - activate a blkcg policy on a request_queue
+ * @q: request_queue of interest
+ * @pol: blkcg policy to activate
+ *
+ * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
+ * bypass mode to populate its blkgs with policy_data for @pol.
+ *
+ * Activation happens with @q bypassed, so nobody would be accessing blkgs
+ * from IO path.  Update of each blkg is protected by both queue and blkcg
+ * locks so that holding either lock and testing blkcg_policy_enabled() is
+ * always enough for dereferencing policy data.
+ *
+ * The caller is responsible for synchronizing [de]activations and policy
+ * [un]registerations.  Returns 0 on success, -errno on failure.
+ */
+int blkcg_activate_policy(struct request_queue *q,
+			  const struct blkcg_policy *pol)
 {
-	spin_lock(&blkio_list_lock);
-	list_add_tail(&blkiop->list, &blkio_list);
-	spin_unlock(&blkio_list_lock);
+	LIST_HEAD(pds);
+	struct blkcg_gq *blkg;
+	struct blkg_policy_data *pd, *n;
+	int cnt = 0, ret;
+
+	if (blkcg_policy_enabled(q, pol))
+		return 0;
+
+	blk_queue_bypass_start(q);
+
+	/* make sure the root blkg exists and count the existing blkgs */
+	spin_lock_irq(q->queue_lock);
+
+	rcu_read_lock();
+	blkg = __blkg_lookup_create(&blkcg_root, q);
+	rcu_read_unlock();
+
+	if (IS_ERR(blkg)) {
+		ret = PTR_ERR(blkg);
+		goto out_unlock;
+	}
+	q->root_blkg = blkg;
+
+	list_for_each_entry(blkg, &q->blkg_list, q_node)
+		cnt++;
+
+	spin_unlock_irq(q->queue_lock);
+
+	/* allocate policy_data for all existing blkgs */
+	while (cnt--) {
+		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
+		if (!pd) {
+			ret = -ENOMEM;
+			goto out_free;
+		}
+		list_add_tail(&pd->alloc_node, &pds);
+	}
+
+	/*
+	 * Install the allocated pds.  With @q bypassing, no new blkg
+	 * should have been created while the queue lock was dropped.
+	 */
+	spin_lock_irq(q->queue_lock);
+
+	list_for_each_entry(blkg, &q->blkg_list, q_node) {
+		if (WARN_ON(list_empty(&pds))) {
+			/* umm... this shouldn't happen, just abort */
+			ret = -ENOMEM;
+			goto out_unlock;
+		}
+		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
+		list_del_init(&pd->alloc_node);
+
+		/* grab blkcg lock too while installing @pd on @blkg */
+		spin_lock(&blkg->blkcg->lock);
+
+		blkg->pd[pol->plid] = pd;
+		pd->blkg = blkg;
+		pol->pd_init_fn(blkg);
+
+		spin_unlock(&blkg->blkcg->lock);
+	}
+
+	__set_bit(pol->plid, q->blkcg_pols);
+	ret = 0;
+out_unlock:
+	spin_unlock_irq(q->queue_lock);
+out_free:
+	blk_queue_bypass_end(q);
+	list_for_each_entry_safe(pd, n, &pds, alloc_node)
+		kfree(pd);
+	return ret;
 }
-EXPORT_SYMBOL_GPL(blkio_policy_register);
+EXPORT_SYMBOL_GPL(blkcg_activate_policy);
 
-void blkio_policy_unregister(struct blkio_policy_type *blkiop)
+/**
+ * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
+ * @q: request_queue of interest
+ * @pol: blkcg policy to deactivate
+ *
+ * Deactivate @pol on @q.  Follows the same synchronization rules as
+ * blkcg_activate_policy().
+ */
+void blkcg_deactivate_policy(struct request_queue *q,
+			     const struct blkcg_policy *pol)
 {
-	spin_lock(&blkio_list_lock);
-	list_del_init(&blkiop->list);
-	spin_unlock(&blkio_list_lock);
+	struct blkcg_gq *blkg;
+
+	if (!blkcg_policy_enabled(q, pol))
+		return;
+
+	blk_queue_bypass_start(q);
+	spin_lock_irq(q->queue_lock);
+
+	__clear_bit(pol->plid, q->blkcg_pols);
+
+	/* if no policy is left, no need for blkgs - shoot them down */
+	if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
+		blkg_destroy_all(q);
+
+	list_for_each_entry(blkg, &q->blkg_list, q_node) {
+		/* grab blkcg lock too while removing @pd from @blkg */
+		spin_lock(&blkg->blkcg->lock);
+
+		if (pol->pd_exit_fn)
+			pol->pd_exit_fn(blkg);
+
+		kfree(blkg->pd[pol->plid]);
+		blkg->pd[pol->plid] = NULL;
+
+		spin_unlock(&blkg->blkcg->lock);
+	}
+
+	spin_unlock_irq(q->queue_lock);
+	blk_queue_bypass_end(q);
 }
-EXPORT_SYMBOL_GPL(blkio_policy_unregister);
+EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
 
-static int __init init_cgroup_blkio(void)
+/**
+ * blkcg_policy_register - register a blkcg policy
+ * @pol: blkcg policy to register
+ *
+ * Register @pol with blkcg core.  Might sleep and @pol may be modified on
+ * successful registration.  Returns 0 on success and -errno on failure.
+ */
+int blkcg_policy_register(struct blkcg_policy *pol)
 {
-	return cgroup_load_subsys(&blkio_subsys);
+	int i, ret;
+
+	if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
+		return -EINVAL;
+
+	mutex_lock(&blkcg_pol_mutex);
+
+	/* find an empty slot */
+	ret = -ENOSPC;
+	for (i = 0; i < BLKCG_MAX_POLS; i++)
+		if (!blkcg_policy[i])
+			break;
+	if (i >= BLKCG_MAX_POLS)
+		goto out_unlock;
+
+	/* register and update blkgs */
+	pol->plid = i;
+	blkcg_policy[i] = pol;
+
+	/* everything is in place, add intf files for the new policy */
+	if (pol->cftypes)
+		WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
+	ret = 0;
+out_unlock:
+	mutex_unlock(&blkcg_pol_mutex);
+	return ret;
 }
+EXPORT_SYMBOL_GPL(blkcg_policy_register);
 
-static void __exit exit_cgroup_blkio(void)
+/**
+ * blkcg_policy_unregister - unregister a blkcg policy
+ * @pol: blkcg policy to unregister
+ *
+ * Undo blkcg_policy_register(@pol).  Might sleep.
+ */
+void blkcg_policy_unregister(struct blkcg_policy *pol)
 {
-	cgroup_unload_subsys(&blkio_subsys);
-}
+	mutex_lock(&blkcg_pol_mutex);
 
-module_init(init_cgroup_blkio);
-module_exit(exit_cgroup_blkio);
-MODULE_LICENSE("GPL");
+	if (WARN_ON(blkcg_policy[pol->plid] != pol))
+		goto out_unlock;
+
+	/* kill the intf files first */
+	if (pol->cftypes)
+		cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);
+
+	/* unregister and update blkgs */
+	blkcg_policy[pol->plid] = NULL;
+out_unlock:
+	mutex_unlock(&blkcg_pol_mutex);
+}
+EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 6f3ace7e792f..8ac457ce7783 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,350 +15,371 @@
 
 #include <linux/cgroup.h>
 #include <linux/u64_stats_sync.h>
-
-enum blkio_policy_id {
-	BLKIO_POLICY_PROP = 0,		/* Proportional Bandwidth division */
-	BLKIO_POLICY_THROTL,		/* Throttling */
-};
+#include <linux/seq_file.h>
+#include <linux/radix-tree.h>
 
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX		UINT_MAX
 
-#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
-
-#ifndef CONFIG_BLK_CGROUP
-/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */
-extern struct cgroup_subsys blkio_subsys;
-#define blkio_subsys_id blkio_subsys.subsys_id
-#endif
-
-enum stat_type {
-	/* Total time spent (in ns) between request dispatch to the driver and
-	 * request completion for IOs doen by this cgroup. This may not be
-	 * accurate when NCQ is turned on. */
-	BLKIO_STAT_SERVICE_TIME = 0,
-	/* Total time spent waiting in scheduler queue in ns */
-	BLKIO_STAT_WAIT_TIME,
-	/* Number of IOs queued up */
-	BLKIO_STAT_QUEUED,
-	/* All the single valued stats go below this */
-	BLKIO_STAT_TIME,
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	/* Time not charged to this cgroup */
-	BLKIO_STAT_UNACCOUNTED_TIME,
-	BLKIO_STAT_AVG_QUEUE_SIZE,
-	BLKIO_STAT_IDLE_TIME,
-	BLKIO_STAT_EMPTY_TIME,
-	BLKIO_STAT_GROUP_WAIT_TIME,
-	BLKIO_STAT_DEQUEUE
-#endif
-};
+/* CFQ specific, out here for blkcg->cfq_weight */
+#define CFQ_WEIGHT_MIN		10
+#define CFQ_WEIGHT_MAX		1000
+#define CFQ_WEIGHT_DEFAULT	500
 
-/* Per cpu stats */
-enum stat_type_cpu {
-	BLKIO_STAT_CPU_SECTORS,
-	/* Total bytes transferred */
-	BLKIO_STAT_CPU_SERVICE_BYTES,
-	/* Total IOs serviced, post merge */
-	BLKIO_STAT_CPU_SERVICED,
-	/* Number of IOs merged */
-	BLKIO_STAT_CPU_MERGED,
-	BLKIO_STAT_CPU_NR
-};
+#ifdef CONFIG_BLK_CGROUP
 
-enum stat_sub_type {
-	BLKIO_STAT_READ = 0,
-	BLKIO_STAT_WRITE,
-	BLKIO_STAT_SYNC,
-	BLKIO_STAT_ASYNC,
-	BLKIO_STAT_TOTAL
-};
+enum blkg_rwstat_type {
+	BLKG_RWSTAT_READ,
+	BLKG_RWSTAT_WRITE,
+	BLKG_RWSTAT_SYNC,
+	BLKG_RWSTAT_ASYNC,
 
-/* blkg state flags */
-enum blkg_state_flags {
-	BLKG_waiting = 0,
-	BLKG_idling,
-	BLKG_empty,
+	BLKG_RWSTAT_NR,
+	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
 };
 
-/* cgroup files owned by proportional weight policy */
-enum blkcg_file_name_prop {
-	BLKIO_PROP_weight = 1,
-	BLKIO_PROP_weight_device,
-	BLKIO_PROP_io_service_bytes,
-	BLKIO_PROP_io_serviced,
-	BLKIO_PROP_time,
-	BLKIO_PROP_sectors,
-	BLKIO_PROP_unaccounted_time,
-	BLKIO_PROP_io_service_time,
-	BLKIO_PROP_io_wait_time,
-	BLKIO_PROP_io_merged,
-	BLKIO_PROP_io_queued,
-	BLKIO_PROP_avg_queue_size,
-	BLKIO_PROP_group_wait_time,
-	BLKIO_PROP_idle_time,
-	BLKIO_PROP_empty_time,
-	BLKIO_PROP_dequeue,
-};
+struct blkcg_gq;
 
-/* cgroup files owned by throttle policy */
-enum blkcg_file_name_throtl {
-	BLKIO_THROTL_read_bps_device,
-	BLKIO_THROTL_write_bps_device,
-	BLKIO_THROTL_read_iops_device,
-	BLKIO_THROTL_write_iops_device,
-	BLKIO_THROTL_io_service_bytes,
-	BLKIO_THROTL_io_serviced,
-};
+struct blkcg {
+	struct cgroup_subsys_state	css;
+	spinlock_t			lock;
 
-struct blkio_cgroup {
-	struct cgroup_subsys_state css;
-	unsigned int weight;
-	spinlock_t lock;
-	struct hlist_head blkg_list;
-	struct list_head policy_list; /* list of blkio_policy_node */
-};
+	struct radix_tree_root		blkg_tree;
+	struct blkcg_gq			*blkg_hint;
+	struct hlist_head		blkg_list;
+
+	/* for policies to test whether associated blkcg has changed */
+	uint64_t			id;
 
-struct blkio_group_stats {
-	/* total disk time and nr sectors dispatched by this group */
-	uint64_t time;
-	uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	/* Time not charged to this cgroup */
-	uint64_t unaccounted_time;
-
-	/* Sum of number of IOs queued across all samples */
-	uint64_t avg_queue_size_sum;
-	/* Count of samples taken for average */
-	uint64_t avg_queue_size_samples;
-	/* How many times this group has been removed from service tree */
-	unsigned long dequeue;
-
-	/* Total time spent waiting for it to be assigned a timeslice. */
-	uint64_t group_wait_time;
-	uint64_t start_group_wait_time;
-
-	/* Time spent idling for this blkio_group */
-	uint64_t idle_time;
-	uint64_t start_idle_time;
-	/*
-	 * Total time when we have requests queued and do not contain the
-	 * current active queue.
-	 */
-	uint64_t empty_time;
-	uint64_t start_empty_time;
-	uint16_t flags;
-#endif
+	/* TODO: per-policy storage in blkcg */
+	unsigned int			cfq_weight;	/* belongs to cfq */
 };
 
-/* Per cpu blkio group stats */
-struct blkio_group_stats_cpu {
-	uint64_t sectors;
-	uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL];
-	struct u64_stats_sync syncp;
+struct blkg_stat {
+	struct u64_stats_sync		syncp;
+	uint64_t			cnt;
 };
 
-struct blkio_group {
-	/* An rcu protected unique identifier for the group */
-	void *key;
-	struct hlist_node blkcg_node;
-	unsigned short blkcg_id;
-	/* Store cgroup path */
-	char path[128];
-	/* The device MKDEV(major, minor), this group has been created for */
-	dev_t dev;
-	/* policy which owns this blk group */
-	enum blkio_policy_id plid;
-
-	/* Need to serialize the stats in the case of reset/update */
-	spinlock_t stats_lock;
-	struct blkio_group_stats stats;
-	/* Per cpu stats pointer */
-	struct blkio_group_stats_cpu __percpu *stats_cpu;
+struct blkg_rwstat {
+	struct u64_stats_sync		syncp;
+	uint64_t			cnt[BLKG_RWSTAT_NR];
 };
 
-struct blkio_policy_node {
-	struct list_head node;
-	dev_t dev;
-	/* This node belongs to max bw policy or porportional weight policy */
-	enum blkio_policy_id plid;
-	/* cgroup file to which this rule belongs to */
-	int fileid;
-
-	union {
-		unsigned int weight;
-		/*
-		 * Rate read/write in terms of bytes per second
-		 * Whether this rate represents read or write is determined
-		 * by file type "fileid".
-		 */
-		u64 bps;
-		unsigned int iops;
-	} val;
+/*
+ * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
+ * request_queue (q).  This is used by blkcg policies which need to track
+ * information per blkcg - q pair.
+ *
+ * There can be multiple active blkcg policies and each has its private
+ * data on each blkg, the size of which is determined by
+ * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
+ * together with blkg and invokes pd_init/exit_fn() methods.
+ *
+ * Such private data must embed struct blkg_policy_data (pd) at the
+ * beginning and pd_size can't be smaller than pd.
+ */
+struct blkg_policy_data {
+	/* the blkg this per-policy data belongs to */
+	struct blkcg_gq			*blkg;
+
+	/* used during policy activation */
+	struct list_head		alloc_node;
 };
 
-extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
-				     dev_t dev);
-extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
-				     dev_t dev);
-extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
-				     dev_t dev);
-extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg,
-				     dev_t dev);
-extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
-				     dev_t dev);
-
-typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
-
-typedef void (blkio_update_group_weight_fn) (void *key,
-			struct blkio_group *blkg, unsigned int weight);
-typedef void (blkio_update_group_read_bps_fn) (void * key,
-			struct blkio_group *blkg, u64 read_bps);
-typedef void (blkio_update_group_write_bps_fn) (void *key,
-			struct blkio_group *blkg, u64 write_bps);
-typedef void (blkio_update_group_read_iops_fn) (void *key,
-			struct blkio_group *blkg, unsigned int read_iops);
-typedef void (blkio_update_group_write_iops_fn) (void *key,
-			struct blkio_group *blkg, unsigned int write_iops);
-
-struct blkio_policy_ops {
-	blkio_unlink_group_fn *blkio_unlink_group_fn;
-	blkio_update_group_weight_fn *blkio_update_group_weight_fn;
-	blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
-	blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
-	blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
-	blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
+/* association between a blk cgroup and a request queue */
+struct blkcg_gq {
+	/* Pointer to the associated request_queue */
+	struct request_queue		*q;
+	struct list_head		q_node;
+	struct hlist_node		blkcg_node;
+	struct blkcg			*blkcg;
+	/* reference count */
+	int				refcnt;
+
+	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
+
+	struct rcu_head			rcu_head;
 };
 
-struct blkio_policy_type {
-	struct list_head list;
-	struct blkio_policy_ops ops;
-	enum blkio_policy_id plid;
+typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+
+struct blkcg_policy {
+	int				plid;
+	/* policy specific private data size */
+	size_t				pd_size;
+	/* cgroup files for the policy */
+	struct cftype			*cftypes;
+
+	/* operations */
+	blkcg_pol_init_pd_fn		*pd_init_fn;
+	blkcg_pol_exit_pd_fn		*pd_exit_fn;
+	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
 };
 
+extern struct blkcg blkcg_root;
+
+struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup);
+struct blkcg *bio_blkcg(struct bio *bio);
+struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+				    struct request_queue *q);
+int blkcg_init_queue(struct request_queue *q);
+void blkcg_drain_queue(struct request_queue *q);
+void blkcg_exit_queue(struct request_queue *q);
+
 /* Blkio controller policy registration */
-extern void blkio_policy_register(struct blkio_policy_type *);
-extern void blkio_policy_unregister(struct blkio_policy_type *);
+int blkcg_policy_register(struct blkcg_policy *pol);
+void blkcg_policy_unregister(struct blkcg_policy *pol);
+int blkcg_activate_policy(struct request_queue *q,
+			  const struct blkcg_policy *pol);
+void blkcg_deactivate_policy(struct request_queue *q,
+			     const struct blkcg_policy *pol);
+
+void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
+		       u64 (*prfill)(struct seq_file *,
+				     struct blkg_policy_data *, int),
+		       const struct blkcg_policy *pol, int data,
+		       bool show_total);
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+			 const struct blkg_rwstat *rwstat);
+u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+		       int off);
+
+struct blkg_conf_ctx {
+	struct gendisk			*disk;
+	struct blkcg_gq			*blkg;
+	u64				v;
+};
+
+int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+		   const char *input, struct blkg_conf_ctx *ctx);
+void blkg_conf_finish(struct blkg_conf_ctx *ctx);
+
+
+/**
+ * blkg_to_pdata - get policy private data
+ * @blkg: blkg of interest
+ * @pol: policy of interest
+ *
+ * Return pointer to private data associated with the @blkg-@pol pair.
+ */
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+						  struct blkcg_policy *pol)
+{
+	return blkg ? blkg->pd[pol->plid] : NULL;
+}
+
+/**
+ * pdata_to_blkg - get blkg associated with policy private data
+ * @pd: policy private data of interest
+ *
+ * @pd is policy private data.  Determine the blkg it's associated with.
+ */
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
+{
+	return pd ? pd->blkg : NULL;
+}
+
+/**
+ * blkg_path - format cgroup path of blkg
+ * @blkg: blkg of interest
+ * @buf: target buffer
+ * @buflen: target buffer length
+ *
+ * Format the path of the cgroup of @blkg into @buf.
+ */
+static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
+{
+	int ret;
+
+	rcu_read_lock();
+	ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+	rcu_read_unlock();
+	if (ret)
+		strncpy(buf, "<unavailable>", buflen);
+	return ret;
+}
 
-static inline char *blkg_path(struct blkio_group *blkg)
+/**
+ * blkg_get - get a blkg reference
+ * @blkg: blkg to get
+ *
+ * The caller should be holding queue_lock and an existing reference.
+ */
+static inline void blkg_get(struct blkcg_gq *blkg)
 {
-	return blkg->path;
+	lockdep_assert_held(blkg->q->queue_lock);
+	WARN_ON_ONCE(!blkg->refcnt);
+	blkg->refcnt++;
 }
 
-#else
+void __blkg_release(struct blkcg_gq *blkg);
 
-struct blkio_group {
+/**
+ * blkg_put - put a blkg reference
+ * @blkg: blkg to put
+ *
+ * The caller should be holding queue_lock.
+ */
+static inline void blkg_put(struct blkcg_gq *blkg)
+{
+	lockdep_assert_held(blkg->q->queue_lock);
+	WARN_ON_ONCE(blkg->refcnt <= 0);
+	if (!--blkg->refcnt)
+		__blkg_release(blkg);
+}
+
+/**
+ * blkg_stat_add - add a value to a blkg_stat
+ * @stat: target blkg_stat
+ * @val: value to add
+ *
+ * Add @val to @stat.  The caller is responsible for synchronizing calls to
+ * this function.
+ */
+static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
+{
+	u64_stats_update_begin(&stat->syncp);
+	stat->cnt += val;
+	u64_stats_update_end(&stat->syncp);
+}
+
+/**
+ * blkg_stat_read - read the current value of a blkg_stat
+ * @stat: blkg_stat to read
+ *
+ * Read the current value of @stat.  This function can be called without
+ * synchroniztion and takes care of u64 atomicity.
+ */
+static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
+{
+	unsigned int start;
+	uint64_t v;
+
+	do {
+		start = u64_stats_fetch_begin(&stat->syncp);
+		v = stat->cnt;
+	} while (u64_stats_fetch_retry(&stat->syncp, start));
+
+	return v;
+}
+
+/**
+ * blkg_stat_reset - reset a blkg_stat
+ * @stat: blkg_stat to reset
+ */
+static inline void blkg_stat_reset(struct blkg_stat *stat)
+{
+	stat->cnt = 0;
+}
+
+/**
+ * blkg_rwstat_add - add a value to a blkg_rwstat
+ * @rwstat: target blkg_rwstat
+ * @rw: mask of REQ_{WRITE|SYNC}
+ * @val: value to add
+ *
+ * Add @val to @rwstat.  The counters are chosen according to @rw.  The
+ * caller is responsible for synchronizing calls to this function.
+ */
+static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
+				   int rw, uint64_t val)
+{
+	u64_stats_update_begin(&rwstat->syncp);
+
+	if (rw & REQ_WRITE)
+		rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+	else
+		rwstat->cnt[BLKG_RWSTAT_READ] += val;
+	if (rw & REQ_SYNC)
+		rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+	else
+		rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+
+	u64_stats_update_end(&rwstat->syncp);
+}
+
+/**
+ * blkg_rwstat_read - read the current values of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Read the current snapshot of @rwstat and return it as the return value.
+ * This function can be called without synchronization and takes care of
+ * u64 atomicity.
+ */
+static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
+{
+	unsigned int start;
+	struct blkg_rwstat tmp;
+
+	do {
+		start = u64_stats_fetch_begin(&rwstat->syncp);
+		tmp = *rwstat;
+	} while (u64_stats_fetch_retry(&rwstat->syncp, start));
+
+	return tmp;
+}
+
+/**
+ * blkg_rwstat_sum - read the total count of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Return the total count of @rwstat regardless of the IO direction.  This
+ * function can be called without synchronization and takes care of u64
+ * atomicity.
+ */
+static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat)
+{
+	struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
+
+	return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+}
+
+/**
+ * blkg_rwstat_reset - reset a blkg_rwstat
+ * @rwstat: blkg_rwstat to reset
+ */
+static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
+{
+	memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+}
+
+#else	/* CONFIG_BLK_CGROUP */
+
+struct cgroup;
+
+struct blkg_policy_data {
 };
 
-struct blkio_policy_type {
+struct blkcg_gq {
 };
 
-static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
-static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
-
-static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
-
-#endif
-
-#define BLKIO_WEIGHT_MIN	10
-#define BLKIO_WEIGHT_MAX	1000
-#define BLKIO_WEIGHT_DEFAULT	500
-
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg);
-void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-				unsigned long dequeue);
-void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
-void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
-void blkiocg_set_start_empty_time(struct blkio_group *blkg);
-
-#define BLKG_FLAG_FNS(name)						\
-static inline void blkio_mark_blkg_##name(				\
-		struct blkio_group_stats *stats)			\
-{									\
-	stats->flags |= (1 << BLKG_##name);				\
-}									\
-static inline void blkio_clear_blkg_##name(				\
-		struct blkio_group_stats *stats)			\
-{									\
-	stats->flags &= ~(1 << BLKG_##name);				\
-}									\
-static inline int blkio_blkg_##name(struct blkio_group_stats *stats)	\
-{									\
-	return (stats->flags & (1 << BLKG_##name)) != 0;		\
-}									\
-
-BLKG_FLAG_FNS(waiting)
-BLKG_FLAG_FNS(idling)
-BLKG_FLAG_FNS(empty)
-#undef BLKG_FLAG_FNS
-#else
-static inline void blkiocg_update_avg_queue_size_stats(
-						struct blkio_group *blkg) {}
-static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-						unsigned long dequeue) {}
-static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
-{}
-static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
-static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
-#endif
-
-#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
-extern struct blkio_cgroup blkio_root_cgroup;
-extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
-extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
-extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-	struct blkio_group *blkg, void *key, dev_t dev,
-	enum blkio_policy_id plid);
-extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
-extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
-extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
-						void *key);
-void blkiocg_update_timeslice_used(struct blkio_group *blkg,
-					unsigned long time,
-					unsigned long unaccounted_time);
-void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
-						bool direction, bool sync);
-void blkiocg_update_completion_stats(struct blkio_group *blkg,
-	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
-void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
-					bool sync);
-void blkiocg_update_io_add_stats(struct blkio_group *blkg,
-		struct blkio_group *curr_blkg, bool direction, bool sync);
-void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-					bool direction, bool sync);
-#else
-struct cgroup;
-static inline struct blkio_cgroup *
-cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
-static inline struct blkio_cgroup *
-task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
-
-static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-		struct blkio_group *blkg, void *key, dev_t dev,
-		enum blkio_policy_id plid) {}
-
-static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
-
-static inline int
-blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
-
-static inline struct blkio_group *
-blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
-static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
-						unsigned long time,
-						unsigned long unaccounted_time)
-{}
-static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-				uint64_t bytes, bool direction, bool sync) {}
-static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
-		uint64_t start_time, uint64_t io_start_time, bool direction,
-		bool sync) {}
-static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-						bool direction, bool sync) {}
-static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
-		struct blkio_group *curr_blkg, bool direction, bool sync) {}
-static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-						bool direction, bool sync) {}
-#endif
-#endif /* _BLK_CGROUP_H */
+struct blkcg_policy {
+};
+
+static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
+static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
+static inline void blkcg_drain_queue(struct request_queue *q) { }
+static inline void blkcg_exit_queue(struct request_queue *q) { }
+static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
+static inline int blkcg_activate_policy(struct request_queue *q,
+					const struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_deactivate_policy(struct request_queue *q,
+					   const struct blkcg_policy *pol) { }
+
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+						  struct blkcg_policy *pol) { return NULL; }
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
+static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
+static inline void blkg_get(struct blkcg_gq *blkg) { }
+static inline void blkg_put(struct blkcg_gq *blkg) { }
+
+#endif	/* CONFIG_BLK_CGROUP */
+#endif	/* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 1f61b74867e4..3c923a7aeb56 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -29,11 +29,13 @@
 #include <linux/fault-inject.h>
 #include <linux/list_sort.h>
 #include <linux/delay.h>
+#include <linux/ratelimit.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
 
 #include "blk.h"
+#include "blk-cgroup.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -280,7 +282,7 @@ EXPORT_SYMBOL(blk_stop_queue);
  *
  *     This function does not cancel any asynchronous activity arising
  *     out of elevator or throttling code. That would require elevaotor_exit()
- *     and blk_throtl_exit() to be called with queue lock initialized.
+ *     and blkcg_exit_queue() to be called with queue lock initialized.
  *
  */
 void blk_sync_queue(struct request_queue *q)
@@ -365,17 +367,23 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
 
 		spin_lock_irq(q->queue_lock);
 
-		elv_drain_elevator(q);
-		if (drain_all)
-			blk_throtl_drain(q);
+		/*
+		 * The caller might be trying to drain @q before its
+		 * elevator is initialized.
+		 */
+		if (q->elevator)
+			elv_drain_elevator(q);
+
+		blkcg_drain_queue(q);
 
 		/*
 		 * This function might be called on a queue which failed
-		 * driver init after queue creation.  Some drivers
-		 * (e.g. fd) get unhappy in such cases.  Kick queue iff
-		 * dispatch queue has something on it.
+		 * driver init after queue creation or is not yet fully
+		 * active yet.  Some drivers (e.g. fd and loop) get unhappy
+		 * in such cases.  Kick queue iff dispatch queue has
+		 * something on it and @q has request_fn set.
 		 */
-		if (!list_empty(&q->queue_head))
+		if (!list_empty(&q->queue_head) && q->request_fn)
 			__blk_run_queue(q);
 
 		drain |= q->rq.elvpriv;
@@ -403,6 +411,49 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
 }
 
 /**
+ * blk_queue_bypass_start - enter queue bypass mode
+ * @q: queue of interest
+ *
+ * In bypass mode, only the dispatch FIFO queue of @q is used.  This
+ * function makes @q enter bypass mode and drains all requests which were
+ * throttled or issued before.  On return, it's guaranteed that no request
+ * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
+ * inside queue or RCU read lock.
+ */
+void blk_queue_bypass_start(struct request_queue *q)
+{
+	bool drain;
+
+	spin_lock_irq(q->queue_lock);
+	drain = !q->bypass_depth++;
+	queue_flag_set(QUEUE_FLAG_BYPASS, q);
+	spin_unlock_irq(q->queue_lock);
+
+	if (drain) {
+		blk_drain_queue(q, false);
+		/* ensure blk_queue_bypass() is %true inside RCU read lock */
+		synchronize_rcu();
+	}
+}
+EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
+
+/**
+ * blk_queue_bypass_end - leave queue bypass mode
+ * @q: queue of interest
+ *
+ * Leave bypass mode and restore the normal queueing behavior.
+ */
+void blk_queue_bypass_end(struct request_queue *q)
+{
+	spin_lock_irq(q->queue_lock);
+	if (!--q->bypass_depth)
+		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
+	WARN_ON_ONCE(q->bypass_depth < 0);
+	spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
+
+/**
  * blk_cleanup_queue - shutdown a request queue
  * @q: request queue to shutdown
  *
@@ -418,6 +469,19 @@ void blk_cleanup_queue(struct request_queue *q)
 	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
 
 	spin_lock_irq(lock);
+
+	/*
+	 * Dead queue is permanently in bypass mode till released.  Note
+	 * that, unlike blk_queue_bypass_start(), we aren't performing
+	 * synchronize_rcu() after entering bypass mode to avoid the delay
+	 * as some drivers create and destroy a lot of queues while
+	 * probing.  This is still safe because blk_release_queue() will be
+	 * called only after the queue refcnt drops to zero and nothing,
+	 * RCU or not, would be traversing the queue by then.
+	 */
+	q->bypass_depth++;
+	queue_flag_set(QUEUE_FLAG_BYPASS, q);
+
 	queue_flag_set(QUEUE_FLAG_NOMERGES, q);
 	queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
 	queue_flag_set(QUEUE_FLAG_DEAD, q);
@@ -428,13 +492,8 @@ void blk_cleanup_queue(struct request_queue *q)
 	spin_unlock_irq(lock);
 	mutex_unlock(&q->sysfs_lock);
 
-	/*
-	 * Drain all requests queued before DEAD marking.  The caller might
-	 * be trying to tear down @q before its elevator is initialized, in
-	 * which case we don't want to call into draining.
-	 */
-	if (q->elevator)
-		blk_drain_queue(q, true);
+	/* drain all requests queued before DEAD marking */
+	blk_drain_queue(q, true);
 
 	/* @q won't process any more request, flush async actions */
 	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
@@ -498,14 +557,15 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (err)
 		goto fail_id;
 
-	if (blk_throtl_init(q))
-		goto fail_id;
-
 	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
+	INIT_LIST_HEAD(&q->queue_head);
 	INIT_LIST_HEAD(&q->timeout_list);
 	INIT_LIST_HEAD(&q->icq_list);
+#ifdef CONFIG_BLK_CGROUP
+	INIT_LIST_HEAD(&q->blkg_list);
+#endif
 	INIT_LIST_HEAD(&q->flush_queue[0]);
 	INIT_LIST_HEAD(&q->flush_queue[1]);
 	INIT_LIST_HEAD(&q->flush_data_in_flight);
@@ -522,6 +582,18 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	 */
 	q->queue_lock = &q->__queue_lock;
 
+	/*
+	 * A queue starts its life with bypass turned on to avoid
+	 * unnecessary bypass on/off overhead and nasty surprises during
+	 * init.  The initial bypass will be finished at the end of
+	 * blk_init_allocated_queue().
+	 */
+	q->bypass_depth = 1;
+	__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+
+	if (blkcg_init_queue(q))
+		goto fail_id;
+
 	return q;
 
 fail_id:
@@ -614,15 +686,15 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 
 	q->sg_reserved_size = INT_MAX;
 
-	/*
-	 * all done
-	 */
-	if (!elevator_init(q, NULL)) {
-		blk_queue_congestion_threshold(q);
-		return q;
-	}
+	/* init elevator */
+	if (elevator_init(q, NULL))
+		return NULL;
 
-	return NULL;
+	blk_queue_congestion_threshold(q);
+
+	/* all done, end the initial bypass */
+	blk_queue_bypass_end(q);
+	return q;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
 
@@ -648,33 +720,6 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq)
 	mempool_free(rq, q->rq.rq_pool);
 }
 
-static struct request *
-blk_alloc_request(struct request_queue *q, struct io_cq *icq,
-		  unsigned int flags, gfp_t gfp_mask)
-{
-	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
-
-	if (!rq)
-		return NULL;
-
-	blk_rq_init(q, rq);
-
-	rq->cmd_flags = flags | REQ_ALLOCED;
-
-	if (flags & REQ_ELVPRIV) {
-		rq->elv.icq = icq;
-		if (unlikely(elv_set_request(q, rq, gfp_mask))) {
-			mempool_free(rq, q->rq.rq_pool);
-			return NULL;
-		}
-		/* @rq->elv.icq holds on to io_context until @rq is freed */
-		if (icq)
-			get_io_context(icq->ioc);
-	}
-
-	return rq;
-}
-
 /*
  * ioc_batching returns true if the ioc is a valid batching request and
  * should be given priority access to a request.
@@ -763,6 +808,22 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
 }
 
 /**
+ * rq_ioc - determine io_context for request allocation
+ * @bio: request being allocated is for this bio (can be %NULL)
+ *
+ * Determine io_context to use for request allocation for @bio.  May return
+ * %NULL if %current->io_context doesn't exist.
+ */
+static struct io_context *rq_ioc(struct bio *bio)
+{
+#ifdef CONFIG_BLK_CGROUP
+	if (bio && bio->bi_ioc)
+		return bio->bi_ioc;
+#endif
+	return current->io_context;
+}
+
+/**
  * get_request - get a free request
  * @q: request_queue to allocate request from
  * @rw_flags: RW and SYNC flags
@@ -779,7 +840,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
 static struct request *get_request(struct request_queue *q, int rw_flags,
 				   struct bio *bio, gfp_t gfp_mask)
 {
-	struct request *rq = NULL;
+	struct request *rq;
 	struct request_list *rl = &q->rq;
 	struct elevator_type *et;
 	struct io_context *ioc;
@@ -789,7 +850,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	int may_queue;
 retry:
 	et = q->elevator->type;
-	ioc = current->io_context;
+	ioc = rq_ioc(bio);
 
 	if (unlikely(blk_queue_dead(q)))
 		return NULL;
@@ -808,7 +869,7 @@ retry:
 			 */
 			if (!ioc && !retried) {
 				spin_unlock_irq(q->queue_lock);
-				create_io_context(current, gfp_mask, q->node);
+				create_io_context(gfp_mask, q->node);
 				spin_lock_irq(q->queue_lock);
 				retried = true;
 				goto retry;
@@ -831,7 +892,7 @@ retry:
 					 * process is not a "batcher", and not
 					 * exempted by the IO scheduler
 					 */
-					goto out;
+					return NULL;
 				}
 			}
 		}
@@ -844,7 +905,7 @@ retry:
 	 * allocated with any setting of ->nr_requests
 	 */
 	if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
-		goto out;
+		return NULL;
 
 	rl->count[is_sync]++;
 	rl->starved[is_sync] = 0;
@@ -859,8 +920,7 @@ retry:
 	 * Also, lookup icq while holding queue_lock.  If it doesn't exist,
 	 * it will be created after releasing queue_lock.
 	 */
-	if (blk_rq_should_init_elevator(bio) &&
-	    !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
+	if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
 		rw_flags |= REQ_ELVPRIV;
 		rl->elvpriv++;
 		if (et->icq_cache && ioc)
@@ -871,41 +931,36 @@ retry:
 		rw_flags |= REQ_IO_STAT;
 	spin_unlock_irq(q->queue_lock);
 
-	/* create icq if missing */
-	if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) {
-		icq = ioc_create_icq(q, gfp_mask);
-		if (!icq)
-			goto fail_icq;
-	}
-
-	rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
+	/* allocate and init request */
+	rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
+	if (!rq)
+		goto fail_alloc;
 
-fail_icq:
-	if (unlikely(!rq)) {
-		/*
-		 * Allocation failed presumably due to memory. Undo anything
-		 * we might have messed up.
-		 *
-		 * Allocating task should really be put onto the front of the
-		 * wait queue, but this is pretty rare.
-		 */
-		spin_lock_irq(q->queue_lock);
-		freed_request(q, rw_flags);
+	blk_rq_init(q, rq);
+	rq->cmd_flags = rw_flags | REQ_ALLOCED;
+
+	/* init elvpriv */
+	if (rw_flags & REQ_ELVPRIV) {
+		if (unlikely(et->icq_cache && !icq)) {
+			create_io_context(gfp_mask, q->node);
+			ioc = rq_ioc(bio);
+			if (!ioc)
+				goto fail_elvpriv;
+
+			icq = ioc_create_icq(ioc, q, gfp_mask);
+			if (!icq)
+				goto fail_elvpriv;
+		}
 
-		/*
-		 * in the very unlikely event that allocation failed and no
-		 * requests for this direction was pending, mark us starved
-		 * so that freeing of a request in the other direction will
-		 * notice us. another possible fix would be to split the
-		 * rq mempool into READ and WRITE
-		 */
-rq_starved:
-		if (unlikely(rl->count[is_sync] == 0))
-			rl->starved[is_sync] = 1;
+		rq->elv.icq = icq;
+		if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
+			goto fail_elvpriv;
 
-		goto out;
+		/* @rq->elv.icq holds io_context until @rq is freed */
+		if (icq)
+			get_io_context(icq->ioc);
 	}
-
+out:
 	/*
 	 * ioc may be NULL here, and ioc_batching will be false. That's
 	 * OK, if the queue is under the request limit then requests need
@@ -916,8 +971,48 @@ rq_starved:
 		ioc->nr_batch_requests--;
 
 	trace_block_getrq(q, bio, rw_flags & 1);
-out:
 	return rq;
+
+fail_elvpriv:
+	/*
+	 * elvpriv init failed.  ioc, icq and elvpriv aren't mempool backed
+	 * and may fail indefinitely under memory pressure and thus
+	 * shouldn't stall IO.  Treat this request as !elvpriv.  This will
+	 * disturb iosched and blkcg but weird is bettern than dead.
+	 */
+	printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
+			   dev_name(q->backing_dev_info.dev));
+
+	rq->cmd_flags &= ~REQ_ELVPRIV;
+	rq->elv.icq = NULL;
+
+	spin_lock_irq(q->queue_lock);
+	rl->elvpriv--;
+	spin_unlock_irq(q->queue_lock);
+	goto out;
+
+fail_alloc:
+	/*
+	 * Allocation failed presumably due to memory. Undo anything we
+	 * might have messed up.
+	 *
+	 * Allocating task should really be put onto the front of the wait
+	 * queue, but this is pretty rare.
+	 */
+	spin_lock_irq(q->queue_lock);
+	freed_request(q, rw_flags);
+
+	/*
+	 * in the very unlikely event that allocation failed and no
+	 * requests for this direction was pending, mark us starved so that
+	 * freeing of a request in the other direction will notice
+	 * us. another possible fix would be to split the rq mempool into
+	 * READ and WRITE
+	 */
+rq_starved:
+	if (unlikely(rl->count[is_sync] == 0))
+		rl->starved[is_sync] = 1;
+	return NULL;
 }
 
 /**
@@ -961,7 +1056,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 		 * up to a big batch of them for a small period time.
 		 * See ioc_batching, ioc_set_batching
 		 */
-		create_io_context(current, GFP_NOIO, q->node);
+		create_io_context(GFP_NOIO, q->node);
 		ioc_set_batching(q, current->io_context);
 
 		spin_lock_irq(q->queue_lock);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index fb95dd2f889a..1e2d53b04858 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -155,20 +155,20 @@ void put_io_context(struct io_context *ioc)
 }
 EXPORT_SYMBOL(put_io_context);
 
-/* Called by the exiting task */
-void exit_io_context(struct task_struct *task)
+/**
+ * put_io_context_active - put active reference on ioc
+ * @ioc: ioc of interest
+ *
+ * Undo get_io_context_active().  If active reference reaches zero after
+ * put, @ioc can never issue further IOs and ioscheds are notified.
+ */
+void put_io_context_active(struct io_context *ioc)
 {
-	struct io_context *ioc;
-	struct io_cq *icq;
 	struct hlist_node *n;
 	unsigned long flags;
+	struct io_cq *icq;
 
-	task_lock(task);
-	ioc = task->io_context;
-	task->io_context = NULL;
-	task_unlock(task);
-
-	if (!atomic_dec_and_test(&ioc->nr_tasks)) {
+	if (!atomic_dec_and_test(&ioc->active_ref)) {
 		put_io_context(ioc);
 		return;
 	}
@@ -197,6 +197,20 @@ retry:
 	put_io_context(ioc);
 }
 
+/* Called by the exiting task */
+void exit_io_context(struct task_struct *task)
+{
+	struct io_context *ioc;
+
+	task_lock(task);
+	ioc = task->io_context;
+	task->io_context = NULL;
+	task_unlock(task);
+
+	atomic_dec(&ioc->nr_tasks);
+	put_io_context_active(ioc);
+}
+
 /**
  * ioc_clear_queue - break any ioc association with the specified queue
  * @q: request_queue being cleared
@@ -218,19 +232,18 @@ void ioc_clear_queue(struct request_queue *q)
 	}
 }
 
-void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
-				int node)
+int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
 {
 	struct io_context *ioc;
 
 	ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
 				    node);
 	if (unlikely(!ioc))
-		return;
+		return -ENOMEM;
 
 	/* initialize */
 	atomic_long_set(&ioc->refcount, 1);
-	atomic_set(&ioc->nr_tasks, 1);
+	atomic_set(&ioc->active_ref, 1);
 	spin_lock_init(&ioc->lock);
 	INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
 	INIT_HLIST_HEAD(&ioc->icq_list);
@@ -250,6 +263,8 @@ void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
 	else
 		kmem_cache_free(iocontext_cachep, ioc);
 	task_unlock(task);
+
+	return 0;
 }
 
 /**
@@ -281,7 +296,7 @@ struct io_context *get_task_io_context(struct task_struct *task,
 			return ioc;
 		}
 		task_unlock(task);
-	} while (create_io_context(task, gfp_flags, node));
+	} while (!create_task_io_context(task, gfp_flags, node));
 
 	return NULL;
 }
@@ -325,26 +340,23 @@ EXPORT_SYMBOL(ioc_lookup_icq);
 
 /**
  * ioc_create_icq - create and link io_cq
+ * @ioc: io_context of interest
  * @q: request_queue of interest
  * @gfp_mask: allocation mask
  *
- * Make sure io_cq linking %current->io_context and @q exists.  If either
- * io_context and/or icq don't exist, they will be created using @gfp_mask.
+ * Make sure io_cq linking @ioc and @q exists.  If icq doesn't exist, they
+ * will be created using @gfp_mask.
  *
  * The caller is responsible for ensuring @ioc won't go away and @q is
  * alive and will stay alive until this function returns.
  */
-struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
+struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
+			     gfp_t gfp_mask)
 {
 	struct elevator_type *et = q->elevator->type;
-	struct io_context *ioc;
 	struct io_cq *icq;
 
 	/* allocate stuff */
-	ioc = create_io_context(current, gfp_mask, q->node);
-	if (!ioc)
-		return NULL;
-
 	icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
 				    q->node);
 	if (!icq)
@@ -382,74 +394,6 @@ struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
 	return icq;
 }
 
-void ioc_set_icq_flags(struct io_context *ioc, unsigned int flags)
-{
-	struct io_cq *icq;
-	struct hlist_node *n;
-
-	hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
-		icq->flags |= flags;
-}
-
-/**
- * ioc_ioprio_changed - notify ioprio change
- * @ioc: io_context of interest
- * @ioprio: new ioprio
- *
- * @ioc's ioprio has changed to @ioprio.  Set %ICQ_IOPRIO_CHANGED for all
- * icq's.  iosched is responsible for checking the bit and applying it on
- * request issue path.
- */
-void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ioc->lock, flags);
-	ioc->ioprio = ioprio;
-	ioc_set_icq_flags(ioc, ICQ_IOPRIO_CHANGED);
-	spin_unlock_irqrestore(&ioc->lock, flags);
-}
-
-/**
- * ioc_cgroup_changed - notify cgroup change
- * @ioc: io_context of interest
- *
- * @ioc's cgroup has changed.  Set %ICQ_CGROUP_CHANGED for all icq's.
- * iosched is responsible for checking the bit and applying it on request
- * issue path.
- */
-void ioc_cgroup_changed(struct io_context *ioc)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ioc->lock, flags);
-	ioc_set_icq_flags(ioc, ICQ_CGROUP_CHANGED);
-	spin_unlock_irqrestore(&ioc->lock, flags);
-}
-EXPORT_SYMBOL(ioc_cgroup_changed);
-
-/**
- * icq_get_changed - fetch and clear icq changed mask
- * @icq: icq of interest
- *
- * Fetch and clear ICQ_*_CHANGED bits from @icq.  Grabs and releases
- * @icq->ioc->lock.
- */
-unsigned icq_get_changed(struct io_cq *icq)
-{
-	unsigned int changed = 0;
-	unsigned long flags;
-
-	if (unlikely(icq->flags & ICQ_CHANGED_MASK)) {
-		spin_lock_irqsave(&icq->ioc->lock, flags);
-		changed = icq->flags & ICQ_CHANGED_MASK;
-		icq->flags &= ~ICQ_CHANGED_MASK;
-		spin_unlock_irqrestore(&icq->ioc->lock, flags);
-	}
-	return changed;
-}
-EXPORT_SYMBOL(icq_get_changed);
-
 static int __init blk_ioc_init(void)
 {
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index cf150011d808..aa41b47c22d2 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -9,6 +9,7 @@
 #include <linux/blktrace_api.h>
 
 #include "blk.h"
+#include "blk-cgroup.h"
 
 struct queue_sysfs_entry {
 	struct attribute attr;
@@ -479,6 +480,8 @@ static void blk_release_queue(struct kobject *kobj)
 
 	blk_sync_queue(q);
 
+	blkcg_exit_queue(q);
+
 	if (q->elevator) {
 		spin_lock_irq(q->queue_lock);
 		ioc_clear_queue(q);
@@ -486,15 +489,12 @@ static void blk_release_queue(struct kobject *kobj)
 		elevator_exit(q->elevator);
 	}
 
-	blk_throtl_exit(q);
-
 	if (rl->rq_pool)
 		mempool_destroy(rl->rq_pool);
 
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
-	blk_throtl_release(q);
 	blk_trace_shutdown(q);
 
 	bdi_destroy(&q->backing_dev_info);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f2ddb94626bd..5b0659512047 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -21,6 +21,8 @@ static int throtl_quantum = 32;
 /* Throttling is performed over 100ms slice and after that slice is renewed */
 static unsigned long throtl_slice = HZ/10;	/* 100 ms */
 
+static struct blkcg_policy blkcg_policy_throtl;
+
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 static void throtl_schedule_delayed_work(struct throtl_data *td,
@@ -38,9 +40,17 @@ struct throtl_rb_root {
 
 #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
 
+/* Per-cpu group stats */
+struct tg_stats_cpu {
+	/* total bytes transferred */
+	struct blkg_rwstat		service_bytes;
+	/* total IOs serviced, post merge */
+	struct blkg_rwstat		serviced;
+};
+
 struct throtl_grp {
-	/* List of throtl groups on the request queue*/
-	struct hlist_node tg_node;
+	/* must be the first member */
+	struct blkg_policy_data pd;
 
 	/* active throtl group service_tree member */
 	struct rb_node rb_node;
@@ -52,8 +62,6 @@ struct throtl_grp {
 	 */
 	unsigned long disptime;
 
-	struct blkio_group blkg;
-	atomic_t ref;
 	unsigned int flags;
 
 	/* Two lists for READ and WRITE */
@@ -80,18 +88,18 @@ struct throtl_grp {
 	/* Some throttle limits got updated for the group */
 	int limits_changed;
 
-	struct rcu_head rcu_head;
+	/* Per cpu stats pointer */
+	struct tg_stats_cpu __percpu *stats_cpu;
+
+	/* List of tgs waiting for per cpu stats memory to be allocated */
+	struct list_head stats_alloc_node;
 };
 
 struct throtl_data
 {
-	/* List of throtl groups */
-	struct hlist_head tg_list;
-
 	/* service tree for active throtl groups */
 	struct throtl_rb_root tg_service_tree;
 
-	struct throtl_grp *root_tg;
 	struct request_queue *queue;
 
 	/* Total Number of queued bios on READ and WRITE lists */
@@ -108,6 +116,33 @@ struct throtl_data
 	int limits_changed;
 };
 
+/* list and work item to allocate percpu group stats */
+static DEFINE_SPINLOCK(tg_stats_alloc_lock);
+static LIST_HEAD(tg_stats_alloc_list);
+
+static void tg_stats_alloc_fn(struct work_struct *);
+static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
+
+static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
+{
+	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
+}
+
+static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
+{
+	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
+}
+
+static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
+{
+	return pd_to_blkg(&tg->pd);
+}
+
+static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
+{
+	return blkg_to_tg(td->queue->root_blkg);
+}
+
 enum tg_state_flags {
 	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
 };
@@ -128,244 +163,150 @@ static inline int throtl_tg_##name(const struct throtl_grp *tg)		\
 
 THROTL_TG_FNS(on_rr);
 
-#define throtl_log_tg(td, tg, fmt, args...)				\
-	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
-				blkg_path(&(tg)->blkg), ##args);      	\
+#define throtl_log_tg(td, tg, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf));		\
+	blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
+} while (0)
 
 #define throtl_log(td, fmt, args...)	\
 	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
 
-static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
-{
-	if (blkg)
-		return container_of(blkg, struct throtl_grp, blkg);
-
-	return NULL;
-}
-
 static inline unsigned int total_nr_queued(struct throtl_data *td)
 {
 	return td->nr_queued[0] + td->nr_queued[1];
 }
 
-static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
-{
-	atomic_inc(&tg->ref);
-	return tg;
-}
-
-static void throtl_free_tg(struct rcu_head *head)
+/*
+ * Worker for allocating per cpu stat for tgs. This is scheduled on the
+ * system_nrt_wq once there are some groups on the alloc_list waiting for
+ * allocation.
+ */
+static void tg_stats_alloc_fn(struct work_struct *work)
 {
-	struct throtl_grp *tg;
+	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
+	struct delayed_work *dwork = to_delayed_work(work);
+	bool empty = false;
+
+alloc_stats:
+	if (!stats_cpu) {
+		stats_cpu = alloc_percpu(struct tg_stats_cpu);
+		if (!stats_cpu) {
+			/* allocation failed, try again after some time */
+			queue_delayed_work(system_nrt_wq, dwork,
+					   msecs_to_jiffies(10));
+			return;
+		}
+	}
 
-	tg = container_of(head, struct throtl_grp, rcu_head);
-	free_percpu(tg->blkg.stats_cpu);
-	kfree(tg);
-}
+	spin_lock_irq(&tg_stats_alloc_lock);
 
-static void throtl_put_tg(struct throtl_grp *tg)
-{
-	BUG_ON(atomic_read(&tg->ref) <= 0);
-	if (!atomic_dec_and_test(&tg->ref))
-		return;
+	if (!list_empty(&tg_stats_alloc_list)) {
+		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
+							 struct throtl_grp,
+							 stats_alloc_node);
+		swap(tg->stats_cpu, stats_cpu);
+		list_del_init(&tg->stats_alloc_node);
+	}
 
-	/*
-	 * A group is freed in rcu manner. But having an rcu lock does not
-	 * mean that one can access all the fields of blkg and assume these
-	 * are valid. For example, don't try to follow throtl_data and
-	 * request queue links.
-	 *
-	 * Having a reference to blkg under an rcu allows acess to only
-	 * values local to groups like group stats and group rate limits
-	 */
-	call_rcu(&tg->rcu_head, throtl_free_tg);
+	empty = list_empty(&tg_stats_alloc_list);
+	spin_unlock_irq(&tg_stats_alloc_lock);
+	if (!empty)
+		goto alloc_stats;
 }
 
-static void throtl_init_group(struct throtl_grp *tg)
+static void throtl_pd_init(struct blkcg_gq *blkg)
 {
-	INIT_HLIST_NODE(&tg->tg_node);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
+	unsigned long flags;
+
 	RB_CLEAR_NODE(&tg->rb_node);
 	bio_list_init(&tg->bio_lists[0]);
 	bio_list_init(&tg->bio_lists[1]);
 	tg->limits_changed = false;
 
-	/* Practically unlimited BW */
-	tg->bps[0] = tg->bps[1] = -1;
-	tg->iops[0] = tg->iops[1] = -1;
+	tg->bps[READ] = -1;
+	tg->bps[WRITE] = -1;
+	tg->iops[READ] = -1;
+	tg->iops[WRITE] = -1;
 
 	/*
-	 * Take the initial reference that will be released on destroy
-	 * This can be thought of a joint reference by cgroup and
-	 * request queue which will be dropped by either request queue
-	 * exit or cgroup deletion path depending on who is exiting first.
+	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
+	 * but percpu allocator can't be called from IO path.  Queue tg on
+	 * tg_stats_alloc_list and allocate from work item.
 	 */
-	atomic_set(&tg->ref, 1);
+	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
+	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
+	queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0);
+	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
 }
 
-/* Should be called with rcu read lock held (needed for blkcg) */
-static void
-throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
+static void throtl_pd_exit(struct blkcg_gq *blkg)
 {
-	hlist_add_head(&tg->tg_node, &td->tg_list);
-	td->nr_undestroyed_grps++;
-}
-
-static void
-__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
-{
-	struct backing_dev_info *bdi = &td->queue->backing_dev_info;
-	unsigned int major, minor;
-
-	if (!tg || tg->blkg.dev)
-		return;
-
-	/*
-	 * Fill in device details for a group which might not have been
-	 * filled at group creation time as queue was being instantiated
-	 * and driver had not attached a device yet
-	 */
-	if (bdi->dev && dev_name(bdi->dev)) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		tg->blkg.dev = MKDEV(major, minor);
-	}
-}
-
-/*
- * Should be called with without queue lock held. Here queue lock will be
- * taken rarely. It will be taken only once during life time of a group
- * if need be
- */
-static void
-throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
-{
-	if (!tg || tg->blkg.dev)
-		return;
-
-	spin_lock_irq(td->queue->queue_lock);
-	__throtl_tg_fill_dev_details(td, tg);
-	spin_unlock_irq(td->queue->queue_lock);
-}
-
-static void throtl_init_add_tg_lists(struct throtl_data *td,
-			struct throtl_grp *tg, struct blkio_cgroup *blkcg)
-{
-	__throtl_tg_fill_dev_details(td, tg);
-
-	/* Add group onto cgroup list */
-	blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
-				tg->blkg.dev, BLKIO_POLICY_THROTL);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
+	unsigned long flags;
 
-	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
-	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
-	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
-	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
+	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
+	list_del_init(&tg->stats_alloc_node);
+	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
 
-	throtl_add_group_to_td_list(td, tg);
+	free_percpu(tg->stats_cpu);
 }
 
-/* Should be called without queue lock and outside of rcu period */
-static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
+static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
 {
-	struct throtl_grp *tg = NULL;
-	int ret;
+	struct throtl_grp *tg = blkg_to_tg(blkg);
+	int cpu;
 
-	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
-	if (!tg)
-		return NULL;
+	if (tg->stats_cpu == NULL)
+		return;
 
-	ret = blkio_alloc_blkg_stats(&tg->blkg);
+	for_each_possible_cpu(cpu) {
+		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
 
-	if (ret) {
-		kfree(tg);
-		return NULL;
+		blkg_rwstat_reset(&sc->service_bytes);
+		blkg_rwstat_reset(&sc->serviced);
 	}
-
-	throtl_init_group(tg);
-	return tg;
 }
 
-static struct
-throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
+static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
+					   struct blkcg *blkcg)
 {
-	struct throtl_grp *tg = NULL;
-	void *key = td;
-
 	/*
-	 * This is the common case when there are no blkio cgroups.
- 	 * Avoid lookup in this case
- 	 */
-	if (blkcg == &blkio_root_cgroup)
-		tg = td->root_tg;
-	else
-		tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
+	 * This is the common case when there are no blkcgs.  Avoid lookup
+	 * in this case
+	 */
+	if (blkcg == &blkcg_root)
+		return td_root_tg(td);
 
-	__throtl_tg_fill_dev_details(td, tg);
-	return tg;
+	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
 }
 
-static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
+						  struct blkcg *blkcg)
 {
-	struct throtl_grp *tg = NULL, *__tg = NULL;
-	struct blkio_cgroup *blkcg;
 	struct request_queue *q = td->queue;
-
-	/* no throttling for dead queue */
-	if (unlikely(blk_queue_dead(q)))
-		return NULL;
-
-	rcu_read_lock();
-	blkcg = task_blkio_cgroup(current);
-	tg = throtl_find_tg(td, blkcg);
-	if (tg) {
-		rcu_read_unlock();
-		return tg;
-	}
-
-	/*
-	 * Need to allocate a group. Allocation of group also needs allocation
-	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
-	 * we need to drop rcu lock and queue_lock before we call alloc.
-	 */
-	rcu_read_unlock();
-	spin_unlock_irq(q->queue_lock);
-
-	tg = throtl_alloc_tg(td);
-
-	/* Group allocated and queue is still alive. take the lock */
-	spin_lock_irq(q->queue_lock);
-
-	/* Make sure @q is still alive */
-	if (unlikely(blk_queue_dead(q))) {
-		kfree(tg);
-		return NULL;
-	}
-
-	/*
-	 * Initialize the new group. After sleeping, read the blkcg again.
-	 */
-	rcu_read_lock();
-	blkcg = task_blkio_cgroup(current);
+	struct throtl_grp *tg = NULL;
 
 	/*
-	 * If some other thread already allocated the group while we were
-	 * not holding queue lock, free up the group
+	 * This is the common case when there are no blkcgs.  Avoid lookup
+	 * in this case
 	 */
-	__tg = throtl_find_tg(td, blkcg);
-
-	if (__tg) {
-		kfree(tg);
-		rcu_read_unlock();
-		return __tg;
-	}
-
-	/* Group allocation failed. Account the IO to root group */
-	if (!tg) {
-		tg = td->root_tg;
-		return tg;
+	if (blkcg == &blkcg_root) {
+		tg = td_root_tg(td);
+	} else {
+		struct blkcg_gq *blkg;
+
+		blkg = blkg_lookup_create(blkcg, q);
+
+		/* if %NULL and @q is alive, fall back to root_tg */
+		if (!IS_ERR(blkg))
+			tg = blkg_to_tg(blkg);
+		else if (!blk_queue_dead(q))
+			tg = td_root_tg(td);
 	}
 
-	throtl_init_add_tg_lists(td, tg, blkcg);
-	rcu_read_unlock();
 	return tg;
 }
 
@@ -734,16 +675,41 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 	return 0;
 }
 
+static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
+					 int rw)
+{
+	struct throtl_grp *tg = blkg_to_tg(blkg);
+	struct tg_stats_cpu *stats_cpu;
+	unsigned long flags;
+
+	/* If per cpu stats are not allocated yet, don't do any accounting. */
+	if (tg->stats_cpu == NULL)
+		return;
+
+	/*
+	 * Disabling interrupts to provide mutual exclusion between two
+	 * writes on same cpu. It probably is not needed for 64bit. Not
+	 * optimizing that case yet.
+	 */
+	local_irq_save(flags);
+
+	stats_cpu = this_cpu_ptr(tg->stats_cpu);
+
+	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
+	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
+
+	local_irq_restore(flags);
+}
+
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
 	bool rw = bio_data_dir(bio);
-	bool sync = rw_is_sync(bio->bi_rw);
 
 	/* Charge the bio to the group */
 	tg->bytes_disp[rw] += bio->bi_size;
 	tg->io_disp[rw]++;
 
-	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
+	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
 }
 
 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -753,7 +719,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
 
 	bio_list_add(&tg->bio_lists[rw], bio);
 	/* Take a bio reference on tg */
-	throtl_ref_get_tg(tg);
+	blkg_get(tg_to_blkg(tg));
 	tg->nr_queued[rw]++;
 	td->nr_queued[rw]++;
 	throtl_enqueue_tg(td, tg);
@@ -786,8 +752,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
 
 	bio = bio_list_pop(&tg->bio_lists[rw]);
 	tg->nr_queued[rw]--;
-	/* Drop bio reference on tg */
-	throtl_put_tg(tg);
+	/* Drop bio reference on blkg */
+	blkg_put(tg_to_blkg(tg));
 
 	BUG_ON(td->nr_queued[rw] <= 0);
 	td->nr_queued[rw]--;
@@ -865,8 +831,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 
 static void throtl_process_limit_change(struct throtl_data *td)
 {
-	struct throtl_grp *tg;
-	struct hlist_node *pos, *n;
+	struct request_queue *q = td->queue;
+	struct blkcg_gq *blkg, *n;
 
 	if (!td->limits_changed)
 		return;
@@ -875,7 +841,9 @@ static void throtl_process_limit_change(struct throtl_data *td)
 
 	throtl_log(td, "limits changed");
 
-	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
+	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+
 		if (!tg->limits_changed)
 			continue;
 
@@ -973,120 +941,159 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 	}
 }
 
-static void
-throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
+static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
+				struct blkg_policy_data *pd, int off)
 {
-	/* Something wrong if we are trying to remove same group twice */
-	BUG_ON(hlist_unhashed(&tg->tg_node));
+	struct throtl_grp *tg = pd_to_tg(pd);
+	struct blkg_rwstat rwstat = { }, tmp;
+	int i, cpu;
 
-	hlist_del_init(&tg->tg_node);
+	for_each_possible_cpu(cpu) {
+		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
 
-	/*
-	 * Put the reference taken at the time of creation so that when all
-	 * queues are gone, group can be destroyed.
-	 */
-	throtl_put_tg(tg);
-	td->nr_undestroyed_grps--;
+		tmp = blkg_rwstat_read((void *)sc + off);
+		for (i = 0; i < BLKG_RWSTAT_NR; i++)
+			rwstat.cnt[i] += tmp.cnt[i];
+	}
+
+	return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
 
-static void throtl_release_tgs(struct throtl_data *td)
+static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
+			       struct seq_file *sf)
 {
-	struct hlist_node *pos, *n;
-	struct throtl_grp *tg;
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 
-	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
-		/*
-		 * If cgroup removal path got to blk_group first and removed
-		 * it from cgroup list, then it will take care of destroying
-		 * cfqg also.
-		 */
-		if (!blkiocg_del_blkio_group(&tg->blkg))
-			throtl_destroy_tg(td, tg);
-	}
+	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
+			  cft->private, true);
+	return 0;
 }
 
-/*
- * Blk cgroup controller notification saying that blkio_group object is being
- * delinked as associated cgroup object is going away. That also means that
- * no new IO will come in this group. So get rid of this group as soon as
- * any pending IO in the group is finished.
- *
- * This function is called under rcu_read_lock(). key is the rcu protected
- * pointer. That means "key" is a valid throtl_data pointer as long as we are
- * rcu read lock.
- *
- * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
- * it should not be NULL as even if queue was going away, cgroup deltion
- * path got to it first.
- */
-void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
+static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
+			      int off)
 {
-	unsigned long flags;
-	struct throtl_data *td = key;
+	struct throtl_grp *tg = pd_to_tg(pd);
+	u64 v = *(u64 *)((void *)tg + off);
 
-	spin_lock_irqsave(td->queue->queue_lock, flags);
-	throtl_destroy_tg(td, tg_of_blkg(blkg));
-	spin_unlock_irqrestore(td->queue->queue_lock, flags);
+	if (v == -1)
+		return 0;
+	return __blkg_prfill_u64(sf, pd, v);
 }
 
-static void throtl_update_blkio_group_common(struct throtl_data *td,
-				struct throtl_grp *tg)
+static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
+			       int off)
 {
-	xchg(&tg->limits_changed, true);
-	xchg(&td->limits_changed, true);
-	/* Schedule a work now to process the limit change */
-	throtl_schedule_delayed_work(td, 0);
+	struct throtl_grp *tg = pd_to_tg(pd);
+	unsigned int v = *(unsigned int *)((void *)tg + off);
+
+	if (v == -1)
+		return 0;
+	return __blkg_prfill_u64(sf, pd, v);
 }
 
-/*
- * For all update functions, key should be a valid pointer because these
- * update functions are called under blkcg_lock, that means, blkg is
- * valid and in turn key is valid. queue exit path can not race because
- * of blkcg_lock
- *
- * Can not take queue lock in update functions as queue lock under blkcg_lock
- * is not allowed. Under other paths we take blkcg_lock under queue_lock.
- */
-static void throtl_update_blkio_group_read_bps(void *key,
-				struct blkio_group *blkg, u64 read_bps)
+static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
+			     struct seq_file *sf)
 {
-	struct throtl_data *td = key;
-	struct throtl_grp *tg = tg_of_blkg(blkg);
-
-	tg->bps[READ] = read_bps;
-	throtl_update_blkio_group_common(td, tg);
+	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
+			  &blkcg_policy_throtl, cft->private, false);
+	return 0;
 }
 
-static void throtl_update_blkio_group_write_bps(void *key,
-				struct blkio_group *blkg, u64 write_bps)
+static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
+			      struct seq_file *sf)
 {
-	struct throtl_data *td = key;
-	struct throtl_grp *tg = tg_of_blkg(blkg);
-
-	tg->bps[WRITE] = write_bps;
-	throtl_update_blkio_group_common(td, tg);
+	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
+			  &blkcg_policy_throtl, cft->private, false);
+	return 0;
 }
 
-static void throtl_update_blkio_group_read_iops(void *key,
-			struct blkio_group *blkg, unsigned int read_iops)
+static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
+		       bool is_u64)
 {
-	struct throtl_data *td = key;
-	struct throtl_grp *tg = tg_of_blkg(blkg);
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkg_conf_ctx ctx;
+	struct throtl_grp *tg;
+	struct throtl_data *td;
+	int ret;
+
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+	if (ret)
+		return ret;
+
+	tg = blkg_to_tg(ctx.blkg);
+	td = ctx.blkg->q->td;
+
+	if (!ctx.v)
+		ctx.v = -1;
+
+	if (is_u64)
+		*(u64 *)((void *)tg + cft->private) = ctx.v;
+	else
+		*(unsigned int *)((void *)tg + cft->private) = ctx.v;
+
+	/* XXX: we don't need the following deferred processing */
+	xchg(&tg->limits_changed, true);
+	xchg(&td->limits_changed, true);
+	throtl_schedule_delayed_work(td, 0);
 
-	tg->iops[READ] = read_iops;
-	throtl_update_blkio_group_common(td, tg);
+	blkg_conf_finish(&ctx);
+	return 0;
 }
 
-static void throtl_update_blkio_group_write_iops(void *key,
-			struct blkio_group *blkg, unsigned int write_iops)
+static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
+			   const char *buf)
 {
-	struct throtl_data *td = key;
-	struct throtl_grp *tg = tg_of_blkg(blkg);
+	return tg_set_conf(cgrp, cft, buf, true);
+}
 
-	tg->iops[WRITE] = write_iops;
-	throtl_update_blkio_group_common(td, tg);
+static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
+			    const char *buf)
+{
+	return tg_set_conf(cgrp, cft, buf, false);
 }
 
+static struct cftype throtl_files[] = {
+	{
+		.name = "throttle.read_bps_device",
+		.private = offsetof(struct throtl_grp, bps[READ]),
+		.read_seq_string = tg_print_conf_u64,
+		.write_string = tg_set_conf_u64,
+		.max_write_len = 256,
+	},
+	{
+		.name = "throttle.write_bps_device",
+		.private = offsetof(struct throtl_grp, bps[WRITE]),
+		.read_seq_string = tg_print_conf_u64,
+		.write_string = tg_set_conf_u64,
+		.max_write_len = 256,
+	},
+	{
+		.name = "throttle.read_iops_device",
+		.private = offsetof(struct throtl_grp, iops[READ]),
+		.read_seq_string = tg_print_conf_uint,
+		.write_string = tg_set_conf_uint,
+		.max_write_len = 256,
+	},
+	{
+		.name = "throttle.write_iops_device",
+		.private = offsetof(struct throtl_grp, iops[WRITE]),
+		.read_seq_string = tg_print_conf_uint,
+		.write_string = tg_set_conf_uint,
+		.max_write_len = 256,
+	},
+	{
+		.name = "throttle.io_service_bytes",
+		.private = offsetof(struct tg_stats_cpu, service_bytes),
+		.read_seq_string = tg_print_cpu_rwstat,
+	},
+	{
+		.name = "throttle.io_serviced",
+		.private = offsetof(struct tg_stats_cpu, serviced),
+		.read_seq_string = tg_print_cpu_rwstat,
+	},
+	{ }	/* terminate */
+};
+
 static void throtl_shutdown_wq(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
@@ -1094,19 +1101,13 @@ static void throtl_shutdown_wq(struct request_queue *q)
 	cancel_delayed_work_sync(&td->throtl_work);
 }
 
-static struct blkio_policy_type blkio_policy_throtl = {
-	.ops = {
-		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
-		.blkio_update_group_read_bps_fn =
-					throtl_update_blkio_group_read_bps,
-		.blkio_update_group_write_bps_fn =
-					throtl_update_blkio_group_write_bps,
-		.blkio_update_group_read_iops_fn =
-					throtl_update_blkio_group_read_iops,
-		.blkio_update_group_write_iops_fn =
-					throtl_update_blkio_group_write_iops,
-	},
-	.plid = BLKIO_POLICY_THROTL,
+static struct blkcg_policy blkcg_policy_throtl = {
+	.pd_size		= sizeof(struct throtl_grp),
+	.cftypes		= throtl_files,
+
+	.pd_init_fn		= throtl_pd_init,
+	.pd_exit_fn		= throtl_pd_exit,
+	.pd_reset_stats_fn	= throtl_pd_reset_stats,
 };
 
 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
@@ -1114,7 +1115,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	struct throtl_data *td = q->td;
 	struct throtl_grp *tg;
 	bool rw = bio_data_dir(bio), update_disptime = true;
-	struct blkio_cgroup *blkcg;
+	struct blkcg *blkcg;
 	bool throttled = false;
 
 	if (bio->bi_rw & REQ_THROTTLED) {
@@ -1122,33 +1123,31 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 		goto out;
 	}
 
+	/* bio_associate_current() needs ioc, try creating */
+	create_io_context(GFP_ATOMIC, q->node);
+
 	/*
 	 * A throtl_grp pointer retrieved under rcu can be used to access
 	 * basic fields like stats and io rates. If a group has no rules,
 	 * just update the dispatch stats in lockless manner and return.
 	 */
-
 	rcu_read_lock();
-	blkcg = task_blkio_cgroup(current);
-	tg = throtl_find_tg(td, blkcg);
+	blkcg = bio_blkcg(bio);
+	tg = throtl_lookup_tg(td, blkcg);
 	if (tg) {
-		throtl_tg_fill_dev_details(td, tg);
-
 		if (tg_no_rule_group(tg, rw)) {
-			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
-					rw, rw_is_sync(bio->bi_rw));
-			rcu_read_unlock();
-			goto out;
+			throtl_update_dispatch_stats(tg_to_blkg(tg),
+						     bio->bi_size, bio->bi_rw);
+			goto out_unlock_rcu;
 		}
 	}
-	rcu_read_unlock();
 
 	/*
 	 * Either group has not been allocated yet or it is not an unlimited
 	 * IO group
 	 */
 	spin_lock_irq(q->queue_lock);
-	tg = throtl_get_tg(td);
+	tg = throtl_lookup_create_tg(td, blkcg);
 	if (unlikely(!tg))
 		goto out_unlock;
 
@@ -1189,6 +1188,7 @@ queue_bio:
 			tg->io_disp[rw], tg->iops[rw],
 			tg->nr_queued[READ], tg->nr_queued[WRITE]);
 
+	bio_associate_current(bio);
 	throtl_add_bio_tg(q->td, tg, bio);
 	throttled = true;
 
@@ -1199,6 +1199,8 @@ queue_bio:
 
 out_unlock:
 	spin_unlock_irq(q->queue_lock);
+out_unlock_rcu:
+	rcu_read_unlock();
 out:
 	return throttled;
 }
@@ -1241,79 +1243,31 @@ void blk_throtl_drain(struct request_queue *q)
 int blk_throtl_init(struct request_queue *q)
 {
 	struct throtl_data *td;
-	struct throtl_grp *tg;
+	int ret;
 
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
 	if (!td)
 		return -ENOMEM;
 
-	INIT_HLIST_HEAD(&td->tg_list);
 	td->tg_service_tree = THROTL_RB_ROOT;
 	td->limits_changed = false;
 	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
 
-	/* alloc and Init root group. */
+	q->td = td;
 	td->queue = q;
-	tg = throtl_alloc_tg(td);
 
-	if (!tg) {
+	/* activate policy */
+	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
+	if (ret)
 		kfree(td);
-		return -ENOMEM;
-	}
-
-	td->root_tg = tg;
-
-	rcu_read_lock();
-	throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
-	rcu_read_unlock();
-
-	/* Attach throtl data to request queue */
-	q->td = td;
-	return 0;
+	return ret;
 }
 
 void blk_throtl_exit(struct request_queue *q)
 {
-	struct throtl_data *td = q->td;
-	bool wait = false;
-
-	BUG_ON(!td);
-
-	throtl_shutdown_wq(q);
-
-	spin_lock_irq(q->queue_lock);
-	throtl_release_tgs(td);
-
-	/* If there are other groups */
-	if (td->nr_undestroyed_grps > 0)
-		wait = true;
-
-	spin_unlock_irq(q->queue_lock);
-
-	/*
-	 * Wait for tg->blkg->key accessors to exit their grace periods.
-	 * Do this wait only if there are other undestroyed groups out
-	 * there (other than root group). This can happen if cgroup deletion
-	 * path claimed the responsibility of cleaning up a group before
-	 * queue cleanup code get to the group.
-	 *
-	 * Do not call synchronize_rcu() unconditionally as there are drivers
-	 * which create/delete request queue hundreds of times during scan/boot
-	 * and synchronize_rcu() can take significant time and slow down boot.
-	 */
-	if (wait)
-		synchronize_rcu();
-
-	/*
-	 * Just being safe to make sure after previous flush if some body did
-	 * update limits through cgroup and another work got queued, cancel
-	 * it.
-	 */
+	BUG_ON(!q->td);
 	throtl_shutdown_wq(q);
-}
-
-void blk_throtl_release(struct request_queue *q)
-{
+	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
 	kfree(q->td);
 }
 
@@ -1323,8 +1277,7 @@ static int __init throtl_init(void)
 	if (!kthrotld_workqueue)
 		panic("Failed to create kthrotld\n");
 
-	blkio_policy_register(&blkio_policy_throtl);
-	return 0;
+	return blkcg_policy_register(&blkcg_policy_throtl);
 }
 
 module_init(throtl_init);
diff --git a/block/blk.h b/block/blk.h
index d45be871329e..85f6ae42f7d3 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -23,7 +23,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
 int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 		      struct bio *bio);
-void blk_drain_queue(struct request_queue *q, bool drain_all);
+void blk_queue_bypass_start(struct request_queue *q);
+void blk_queue_bypass_end(struct request_queue *q);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 bool __blk_end_bidi_request(struct request *rq, int error,
@@ -144,9 +145,6 @@ void blk_queue_congestion_threshold(struct request_queue *q);
 
 int blk_dev_init(void);
 
-void elv_quiesce_start(struct request_queue *q);
-void elv_quiesce_end(struct request_queue *q);
-
 
 /*
  * Return the threshold (number of used requests) at which the queue is
@@ -186,32 +184,30 @@ static inline int blk_do_io_stat(struct request *rq)
  */
 void get_io_context(struct io_context *ioc);
 struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
-struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask);
+struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
+			     gfp_t gfp_mask);
 void ioc_clear_queue(struct request_queue *q);
 
-void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
-				int node);
+int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
 
 /**
  * create_io_context - try to create task->io_context
- * @task: target task
  * @gfp_mask: allocation mask
  * @node: allocation node
  *
- * If @task->io_context is %NULL, allocate a new io_context and install it.
- * Returns the current @task->io_context which may be %NULL if allocation
- * failed.
+ * If %current->io_context is %NULL, allocate a new io_context and install
+ * it.  Returns the current %current->io_context which may be %NULL if
+ * allocation failed.
  *
  * Note that this function can't be called with IRQ disabled because
- * task_lock which protects @task->io_context is IRQ-unsafe.
+ * task_lock which protects %current->io_context is IRQ-unsafe.
  */
-static inline struct io_context *create_io_context(struct task_struct *task,
-						   gfp_t gfp_mask, int node)
+static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
 {
 	WARN_ON_ONCE(irqs_disabled());
-	if (unlikely(!task->io_context))
-		create_io_context_slowpath(task, gfp_mask, node);
-	return task->io_context;
+	if (unlikely(!current->io_context))
+		create_task_io_context(current, gfp_mask, node);
+	return current->io_context;
 }
 
 /*
@@ -222,7 +218,6 @@ extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
 extern void blk_throtl_drain(struct request_queue *q);
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
-extern void blk_throtl_release(struct request_queue *q);
 #else /* CONFIG_BLK_DEV_THROTTLING */
 static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
@@ -231,7 +226,6 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 static inline void blk_throtl_drain(struct request_queue *q) { }
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
-static inline void blk_throtl_release(struct request_queue *q) { }
 #endif /* CONFIG_BLK_DEV_THROTTLING */
 
 #endif /* BLK_INTERNAL_H */
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 3c38536bd52c..673c977cc2bf 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -15,7 +15,9 @@
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
 #include "blk.h"
-#include "cfq.h"
+#include "blk-cgroup.h"
+
+static struct blkcg_policy blkcg_policy_cfq __maybe_unused;
 
 /*
  * tunables
@@ -171,8 +173,53 @@ enum wl_type_t {
 	SYNC_WORKLOAD = 2
 };
 
+struct cfqg_stats {
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+	/* total bytes transferred */
+	struct blkg_rwstat		service_bytes;
+	/* total IOs serviced, post merge */
+	struct blkg_rwstat		serviced;
+	/* number of ios merged */
+	struct blkg_rwstat		merged;
+	/* total time spent on device in ns, may not be accurate w/ queueing */
+	struct blkg_rwstat		service_time;
+	/* total time spent waiting in scheduler queue in ns */
+	struct blkg_rwstat		wait_time;
+	/* number of IOs queued up */
+	struct blkg_rwstat		queued;
+	/* total sectors transferred */
+	struct blkg_stat		sectors;
+	/* total disk time and nr sectors dispatched by this group */
+	struct blkg_stat		time;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	/* time not charged to this cgroup */
+	struct blkg_stat		unaccounted_time;
+	/* sum of number of ios queued across all samples */
+	struct blkg_stat		avg_queue_size_sum;
+	/* count of samples taken for average */
+	struct blkg_stat		avg_queue_size_samples;
+	/* how many times this group has been removed from service tree */
+	struct blkg_stat		dequeue;
+	/* total time spent waiting for it to be assigned a timeslice. */
+	struct blkg_stat		group_wait_time;
+	/* time spent idling for this blkcg_gq */
+	struct blkg_stat		idle_time;
+	/* total time with empty current active q with other requests queued */
+	struct blkg_stat		empty_time;
+	/* fields after this shouldn't be cleared on stat reset */
+	uint64_t			start_group_wait_time;
+	uint64_t			start_idle_time;
+	uint64_t			start_empty_time;
+	uint16_t			flags;
+#endif	/* CONFIG_DEBUG_BLK_CGROUP */
+#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
+};
+
 /* This is per cgroup per device grouping structure */
 struct cfq_group {
+	/* must be the first member */
+	struct blkg_policy_data pd;
+
 	/* group service_tree member */
 	struct rb_node rb_node;
 
@@ -180,7 +227,7 @@ struct cfq_group {
 	u64 vdisktime;
 	unsigned int weight;
 	unsigned int new_weight;
-	bool needs_update;
+	unsigned int dev_weight;
 
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
@@ -206,20 +253,21 @@ struct cfq_group {
 	unsigned long saved_workload_slice;
 	enum wl_type_t saved_workload;
 	enum wl_prio_t saved_serving_prio;
-	struct blkio_group blkg;
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	struct hlist_node cfqd_node;
-	int ref;
-#endif
+
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
 	struct cfq_ttime ttime;
+	struct cfqg_stats stats;
 };
 
 struct cfq_io_cq {
 	struct io_cq		icq;		/* must be the first member */
 	struct cfq_queue	*cfqq[2];
 	struct cfq_ttime	ttime;
+	int			ioprio;		/* the current ioprio */
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+	uint64_t		blkcg_id;	/* the current blkcg ID */
+#endif
 };
 
 /*
@@ -229,7 +277,7 @@ struct cfq_data {
 	struct request_queue *queue;
 	/* Root service tree for cfq_groups */
 	struct cfq_rb_root grp_service_tree;
-	struct cfq_group root_group;
+	struct cfq_group *root_group;
 
 	/*
 	 * The priority currently being served
@@ -303,12 +351,6 @@ struct cfq_data {
 	struct cfq_queue oom_cfqq;
 
 	unsigned long last_delayed_sync;
-
-	/* List of cfq groups being managed on this device*/
-	struct hlist_head cfqg_list;
-
-	/* Number of groups which are on blkcg->blkg_list */
-	unsigned int nr_blkcg_linked_grps;
 };
 
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
@@ -371,21 +413,284 @@ CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
 
+static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
+{
+	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
+}
+
+static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
+{
+	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
+}
+
+static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
+{
+	return pd_to_blkg(&cfqg->pd);
+}
+
+#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+
+/* cfqg stats flags */
+enum cfqg_stats_flags {
+	CFQG_stats_waiting = 0,
+	CFQG_stats_idling,
+	CFQG_stats_empty,
+};
+
+#define CFQG_FLAG_FNS(name)						\
+static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)	\
+{									\
+	stats->flags |= (1 << CFQG_stats_##name);			\
+}									\
+static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)	\
+{									\
+	stats->flags &= ~(1 << CFQG_stats_##name);			\
+}									\
+static inline int cfqg_stats_##name(struct cfqg_stats *stats)		\
+{									\
+	return (stats->flags & (1 << CFQG_stats_##name)) != 0;		\
+}									\
+
+CFQG_FLAG_FNS(waiting)
+CFQG_FLAG_FNS(idling)
+CFQG_FLAG_FNS(empty)
+#undef CFQG_FLAG_FNS
+
+/* This should be called with the queue_lock held. */
+static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
+{
+	unsigned long long now;
+
+	if (!cfqg_stats_waiting(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_group_wait_time))
+		blkg_stat_add(&stats->group_wait_time,
+			      now - stats->start_group_wait_time);
+	cfqg_stats_clear_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
+						 struct cfq_group *curr_cfqg)
+{
+	struct cfqg_stats *stats = &cfqg->stats;
+
+	if (cfqg_stats_waiting(stats))
+		return;
+	if (cfqg == curr_cfqg)
+		return;
+	stats->start_group_wait_time = sched_clock();
+	cfqg_stats_mark_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
+{
+	unsigned long long now;
+
+	if (!cfqg_stats_empty(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_empty_time))
+		blkg_stat_add(&stats->empty_time,
+			      now - stats->start_empty_time);
+	cfqg_stats_clear_empty(stats);
+}
+
+static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
+{
+	blkg_stat_add(&cfqg->stats.dequeue, 1);
+}
+
+static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
+{
+	struct cfqg_stats *stats = &cfqg->stats;
+
+	if (blkg_rwstat_sum(&stats->queued))
+		return;
+
+	/*
+	 * group is already marked empty. This can happen if cfqq got new
+	 * request in parent group and moved to this group while being added
+	 * to service tree. Just ignore the event and move on.
+	 */
+	if (cfqg_stats_empty(stats))
+		return;
+
+	stats->start_empty_time = sched_clock();
+	cfqg_stats_mark_empty(stats);
+}
+
+static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
+{
+	struct cfqg_stats *stats = &cfqg->stats;
+
+	if (cfqg_stats_idling(stats)) {
+		unsigned long long now = sched_clock();
+
+		if (time_after64(now, stats->start_idle_time))
+			blkg_stat_add(&stats->idle_time,
+				      now - stats->start_idle_time);
+		cfqg_stats_clear_idling(stats);
+	}
+}
+
+static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
+{
+	struct cfqg_stats *stats = &cfqg->stats;
+
+	BUG_ON(cfqg_stats_idling(stats));
+
+	stats->start_idle_time = sched_clock();
+	cfqg_stats_mark_idling(stats);
+}
+
+static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
+{
+	struct cfqg_stats *stats = &cfqg->stats;
+
+	blkg_stat_add(&stats->avg_queue_size_sum,
+		      blkg_rwstat_sum(&stats->queued));
+	blkg_stat_add(&stats->avg_queue_size_samples, 1);
+	cfqg_stats_update_group_wait_time(stats);
+}
+
+#else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+
+static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
+static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
+static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
+static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
+
+#endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
+
+static inline void cfqg_get(struct cfq_group *cfqg)
+{
+	return blkg_get(cfqg_to_blkg(cfqg));
+}
+
+static inline void cfqg_put(struct cfq_group *cfqg)
+{
+	return blkg_put(cfqg_to_blkg(cfqg));
+}
+
+#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf));	\
 	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
-			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
-			blkg_path(&(cfqq)->cfqg->blkg), ##args)
+			  cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
+			  __pbuf, ##args);				\
+} while (0)
 
-#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)				\
-	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
-				blkg_path(&(cfqg)->blkg), ##args)       \
+#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf));		\
+	blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args);	\
+} while (0)
+
+static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
+					    struct cfq_group *curr_cfqg, int rw)
+{
+	blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
+	cfqg_stats_end_empty_time(&cfqg->stats);
+	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
+}
+
+static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
+			unsigned long time, unsigned long unaccounted_time)
+{
+	blkg_stat_add(&cfqg->stats.time, time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
+#endif
+}
+
+static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
+{
+	blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
+}
+
+static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
+{
+	blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
+}
+
+static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
+					      uint64_t bytes, int rw)
+{
+	blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
+	blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
+	blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
+}
+
+static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
+			uint64_t start_time, uint64_t io_start_time, int rw)
+{
+	struct cfqg_stats *stats = &cfqg->stats;
+	unsigned long long now = sched_clock();
+
+	if (time_after64(now, io_start_time))
+		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
+	if (time_after64(io_start_time, start_time))
+		blkg_rwstat_add(&stats->wait_time, rw,
+				io_start_time - start_time);
+}
+
+static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+{
+	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+	struct cfqg_stats *stats = &cfqg->stats;
+
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_reset(&stats->service_bytes);
+	blkg_rwstat_reset(&stats->serviced);
+	blkg_rwstat_reset(&stats->merged);
+	blkg_rwstat_reset(&stats->service_time);
+	blkg_rwstat_reset(&stats->wait_time);
+	blkg_stat_reset(&stats->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	blkg_stat_reset(&stats->unaccounted_time);
+	blkg_stat_reset(&stats->avg_queue_size_sum);
+	blkg_stat_reset(&stats->avg_queue_size_samples);
+	blkg_stat_reset(&stats->dequeue);
+	blkg_stat_reset(&stats->group_wait_time);
+	blkg_stat_reset(&stats->idle_time);
+	blkg_stat_reset(&stats->empty_time);
+#endif
+}
+
+#else	/* CONFIG_CFQ_GROUP_IOSCHED */
+
+static inline void cfqg_get(struct cfq_group *cfqg) { }
+static inline void cfqg_put(struct cfq_group *cfqg) { }
 
-#else
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
-#endif
+
+static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
+			struct cfq_group *curr_cfqg, int rw) { }
+static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
+			unsigned long time, unsigned long unaccounted_time) { }
+static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
+static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
+static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
+					      uint64_t bytes, int rw) { }
+static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
+			uint64_t start_time, uint64_t io_start_time, int rw) { }
+
+#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
+
 #define cfq_log(cfqd, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
 
@@ -466,8 +771,9 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 }
 
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
-static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
-				       struct io_context *, gfp_t);
+static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
+				       struct cfq_io_cq *cic, struct bio *bio,
+				       gfp_t gfp_mask);
 
 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 {
@@ -545,7 +851,7 @@ static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
 {
 	u64 d = delta << CFQ_SERVICE_SHIFT;
 
-	d = d * BLKIO_WEIGHT_DEFAULT;
+	d = d * CFQ_WEIGHT_DEFAULT;
 	do_div(d, cfqg->weight);
 	return d;
 }
@@ -872,9 +1178,9 @@ static void
 cfq_update_group_weight(struct cfq_group *cfqg)
 {
 	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
-	if (cfqg->needs_update) {
+	if (cfqg->new_weight) {
 		cfqg->weight = cfqg->new_weight;
-		cfqg->needs_update = false;
+		cfqg->new_weight = 0;
 	}
 }
 
@@ -936,7 +1242,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
 	cfq_group_service_tree_del(st, cfqg);
 	cfqg->saved_workload_slice = 0;
-	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
+	cfqg_stats_update_dequeue(cfqg);
 }
 
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
@@ -1008,178 +1314,59 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		     "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
 		     used_sl, cfqq->slice_dispatch, charge,
 		     iops_mode(cfqd), cfqq->nr_sectors);
-	cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
-					  unaccounted_sl);
-	cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
+	cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
+	cfqg_stats_set_start_empty_time(cfqg);
 }
 
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
-{
-	if (blkg)
-		return container_of(blkg, struct cfq_group, blkg);
-	return NULL;
-}
-
-static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
-					  unsigned int weight)
-{
-	struct cfq_group *cfqg = cfqg_of_blkg(blkg);
-	cfqg->new_weight = weight;
-	cfqg->needs_update = true;
-}
-
-static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
-			struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
-{
-	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
-	unsigned int major, minor;
-
-	/*
-	 * Add group onto cgroup list. It might happen that bdi->dev is
-	 * not initialized yet. Initialize this new group without major
-	 * and minor info and this info will be filled in once a new thread
-	 * comes for IO.
-	 */
-	if (bdi->dev) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
-					(void *)cfqd, MKDEV(major, minor));
-	} else
-		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
-					(void *)cfqd, 0);
-
-	cfqd->nr_blkcg_linked_grps++;
-	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
-
-	/* Add group on cfqd list */
-	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
-}
-
-/*
- * Should be called from sleepable context. No request queue lock as per
- * cpu stats are allocated dynamically and alloc_percpu needs to be called
- * from sleepable context.
+/**
+ * cfq_init_cfqg_base - initialize base part of a cfq_group
+ * @cfqg: cfq_group to initialize
+ *
+ * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
+ * is enabled or not.
  */
-static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
+static void cfq_init_cfqg_base(struct cfq_group *cfqg)
 {
-	struct cfq_group *cfqg = NULL;
-	int i, j, ret;
 	struct cfq_rb_root *st;
-
-	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
-	if (!cfqg)
-		return NULL;
+	int i, j;
 
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
 	RB_CLEAR_NODE(&cfqg->rb_node);
 
 	cfqg->ttime.last_end_request = jiffies;
-
-	/*
-	 * Take the initial reference that will be released on destroy
-	 * This can be thought of a joint reference by cgroup and
-	 * elevator which will be dropped by either elevator exit
-	 * or cgroup deletion path depending on who is exiting first.
-	 */
-	cfqg->ref = 1;
-
-	ret = blkio_alloc_blkg_stats(&cfqg->blkg);
-	if (ret) {
-		kfree(cfqg);
-		return NULL;
-	}
-
-	return cfqg;
 }
 
-static struct cfq_group *
-cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+static void cfq_pd_init(struct blkcg_gq *blkg)
 {
-	struct cfq_group *cfqg = NULL;
-	void *key = cfqd;
-	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
-	unsigned int major, minor;
-
-	/*
-	 * This is the common case when there are no blkio cgroups.
-	 * Avoid lookup in this case
-	 */
-	if (blkcg == &blkio_root_cgroup)
-		cfqg = &cfqd->root_group;
-	else
-		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
-
-	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-		cfqg->blkg.dev = MKDEV(major, minor);
-	}
+	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 
-	return cfqg;
+	cfq_init_cfqg_base(cfqg);
+	cfqg->weight = blkg->blkcg->cfq_weight;
 }
 
 /*
  * Search for the cfq group current task belongs to. request_queue lock must
  * be held.
  */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
+static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
+						struct blkcg *blkcg)
 {
-	struct blkio_cgroup *blkcg;
-	struct cfq_group *cfqg = NULL, *__cfqg = NULL;
 	struct request_queue *q = cfqd->queue;
+	struct cfq_group *cfqg = NULL;
 
-	rcu_read_lock();
-	blkcg = task_blkio_cgroup(current);
-	cfqg = cfq_find_cfqg(cfqd, blkcg);
-	if (cfqg) {
-		rcu_read_unlock();
-		return cfqg;
-	}
-
-	/*
-	 * Need to allocate a group. Allocation of group also needs allocation
-	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
-	 * we need to drop rcu lock and queue_lock before we call alloc.
-	 *
-	 * Not taking any queue reference here and assuming that queue is
-	 * around by the time we return. CFQ queue allocation code does
-	 * the same. It might be racy though.
-	 */
-
-	rcu_read_unlock();
-	spin_unlock_irq(q->queue_lock);
-
-	cfqg = cfq_alloc_cfqg(cfqd);
-
-	spin_lock_irq(q->queue_lock);
-
-	rcu_read_lock();
-	blkcg = task_blkio_cgroup(current);
-
-	/*
-	 * If some other thread already allocated the group while we were
-	 * not holding queue lock, free up the group
-	 */
-	__cfqg = cfq_find_cfqg(cfqd, blkcg);
+	/* avoid lookup for the common case where there's no blkcg */
+	if (blkcg == &blkcg_root) {
+		cfqg = cfqd->root_group;
+	} else {
+		struct blkcg_gq *blkg;
 
-	if (__cfqg) {
-		kfree(cfqg);
-		rcu_read_unlock();
-		return __cfqg;
+		blkg = blkg_lookup_create(blkcg, q);
+		if (!IS_ERR(blkg))
+			cfqg = blkg_to_cfqg(blkg);
 	}
 
-	if (!cfqg)
-		cfqg = &cfqd->root_group;
-
-	cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
-	rcu_read_unlock();
-	return cfqg;
-}
-
-static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
-{
-	cfqg->ref++;
 	return cfqg;
 }
 
@@ -1187,94 +1374,224 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 {
 	/* Currently, all async queues are mapped to root group */
 	if (!cfq_cfqq_sync(cfqq))
-		cfqg = &cfqq->cfqd->root_group;
+		cfqg = cfqq->cfqd->root_group;
 
 	cfqq->cfqg = cfqg;
 	/* cfqq reference on cfqg */
-	cfqq->cfqg->ref++;
+	cfqg_get(cfqg);
 }
 
-static void cfq_put_cfqg(struct cfq_group *cfqg)
+static u64 cfqg_prfill_weight_device(struct seq_file *sf,
+				     struct blkg_policy_data *pd, int off)
 {
-	struct cfq_rb_root *st;
-	int i, j;
+	struct cfq_group *cfqg = pd_to_cfqg(pd);
 
-	BUG_ON(cfqg->ref <= 0);
-	cfqg->ref--;
-	if (cfqg->ref)
-		return;
-	for_each_cfqg_st(cfqg, i, j, st)
-		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
-	free_percpu(cfqg->blkg.stats_cpu);
-	kfree(cfqg);
+	if (!cfqg->dev_weight)
+		return 0;
+	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
 }
 
-static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				    struct seq_file *sf)
 {
-	/* Something wrong if we are trying to remove same group twice */
-	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
+	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
+			  cfqg_prfill_weight_device, &blkcg_policy_cfq, 0,
+			  false);
+	return 0;
+}
 
-	hlist_del_init(&cfqg->cfqd_node);
+static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
+			    struct seq_file *sf)
+{
+	seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight);
+	return 0;
+}
 
-	BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
-	cfqd->nr_blkcg_linked_grps--;
+static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				  const char *buf)
+{
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkg_conf_ctx ctx;
+	struct cfq_group *cfqg;
+	int ret;
 
-	/*
-	 * Put the reference taken at the time of creation so that when all
-	 * queues are gone, group can be destroyed.
-	 */
-	cfq_put_cfqg(cfqg);
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+	cfqg = blkg_to_cfqg(ctx.blkg);
+	if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
+		cfqg->dev_weight = ctx.v;
+		cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
+		ret = 0;
+	}
+
+	blkg_conf_finish(&ctx);
+	return ret;
 }
 
-static void cfq_release_cfq_groups(struct cfq_data *cfqd)
+static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
-	struct hlist_node *pos, *n;
-	struct cfq_group *cfqg;
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg_gq *blkg;
+	struct hlist_node *n;
 
-	hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
-		/*
-		 * If cgroup removal path got to blk_group first and removed
-		 * it from cgroup list, then it will take care of destroying
-		 * cfqg also.
-		 */
-		if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
-			cfq_destroy_cfqg(cfqd, cfqg);
+	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
+		return -EINVAL;
+
+	spin_lock_irq(&blkcg->lock);
+	blkcg->cfq_weight = (unsigned int)val;
+
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+
+		if (cfqg && !cfqg->dev_weight)
+			cfqg->new_weight = blkcg->cfq_weight;
 	}
+
+	spin_unlock_irq(&blkcg->lock);
+	return 0;
 }
 
-/*
- * Blk cgroup controller notification saying that blkio_group object is being
- * delinked as associated cgroup object is going away. That also means that
- * no new IO will come in this group. So get rid of this group as soon as
- * any pending IO in the group is finished.
- *
- * This function is called under rcu_read_lock(). key is the rcu protected
- * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu
- * read lock.
- *
- * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
- * it should not be NULL as even if elevator was exiting, cgroup deltion
- * path got to it first.
- */
-static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
+static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
+			   struct seq_file *sf)
 {
-	unsigned long  flags;
-	struct cfq_data *cfqd = key;
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
 
-	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
-	cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
-	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
+			  cft->private, false);
+	return 0;
 }
 
-#else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
+static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
+			     struct seq_file *sf)
 {
-	return &cfqd->root_group;
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
+			  cft->private, true);
+	return 0;
 }
 
-static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
 {
-	return cfqg;
+	struct cfq_group *cfqg = pd_to_cfqg(pd);
+	u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
+	u64 v = 0;
+
+	if (samples) {
+		v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
+		do_div(v, samples);
+	}
+	__blkg_prfill_u64(sf, pd, v);
+	return 0;
+}
+
+/* print avg_queue_size */
+static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
+				     struct seq_file *sf)
+{
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
+			  &blkcg_policy_cfq, 0, false);
+	return 0;
+}
+#endif	/* CONFIG_DEBUG_BLK_CGROUP */
+
+static struct cftype cfq_blkcg_files[] = {
+	{
+		.name = "weight_device",
+		.read_seq_string = cfqg_print_weight_device,
+		.write_string = cfqg_set_weight_device,
+		.max_write_len = 256,
+	},
+	{
+		.name = "weight",
+		.read_seq_string = cfq_print_weight,
+		.write_u64 = cfq_set_weight,
+	},
+	{
+		.name = "time",
+		.private = offsetof(struct cfq_group, stats.time),
+		.read_seq_string = cfqg_print_stat,
+	},
+	{
+		.name = "sectors",
+		.private = offsetof(struct cfq_group, stats.sectors),
+		.read_seq_string = cfqg_print_stat,
+	},
+	{
+		.name = "io_service_bytes",
+		.private = offsetof(struct cfq_group, stats.service_bytes),
+		.read_seq_string = cfqg_print_rwstat,
+	},
+	{
+		.name = "io_serviced",
+		.private = offsetof(struct cfq_group, stats.serviced),
+		.read_seq_string = cfqg_print_rwstat,
+	},
+	{
+		.name = "io_service_time",
+		.private = offsetof(struct cfq_group, stats.service_time),
+		.read_seq_string = cfqg_print_rwstat,
+	},
+	{
+		.name = "io_wait_time",
+		.private = offsetof(struct cfq_group, stats.wait_time),
+		.read_seq_string = cfqg_print_rwstat,
+	},
+	{
+		.name = "io_merged",
+		.private = offsetof(struct cfq_group, stats.merged),
+		.read_seq_string = cfqg_print_rwstat,
+	},
+	{
+		.name = "io_queued",
+		.private = offsetof(struct cfq_group, stats.queued),
+		.read_seq_string = cfqg_print_rwstat,
+	},
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	{
+		.name = "avg_queue_size",
+		.read_seq_string = cfqg_print_avg_queue_size,
+	},
+	{
+		.name = "group_wait_time",
+		.private = offsetof(struct cfq_group, stats.group_wait_time),
+		.read_seq_string = cfqg_print_stat,
+	},
+	{
+		.name = "idle_time",
+		.private = offsetof(struct cfq_group, stats.idle_time),
+		.read_seq_string = cfqg_print_stat,
+	},
+	{
+		.name = "empty_time",
+		.private = offsetof(struct cfq_group, stats.empty_time),
+		.read_seq_string = cfqg_print_stat,
+	},
+	{
+		.name = "dequeue",
+		.private = offsetof(struct cfq_group, stats.dequeue),
+		.read_seq_string = cfqg_print_stat,
+	},
+	{
+		.name = "unaccounted_time",
+		.private = offsetof(struct cfq_group, stats.unaccounted_time),
+		.read_seq_string = cfqg_print_stat,
+	},
+#endif	/* CONFIG_DEBUG_BLK_CGROUP */
+	{ }	/* terminate */
+};
+#else /* GROUP_IOSCHED */
+static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
+						struct blkcg *blkcg)
+{
+	return cfqd->root_group;
 }
 
 static inline void
@@ -1282,9 +1599,6 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
 	cfqq->cfqg = cfqg;
 }
 
-static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
-static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
-
 #endif /* GROUP_IOSCHED */
 
 /*
@@ -1551,12 +1865,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
 {
 	elv_rb_del(&cfqq->sort_list, rq);
 	cfqq->queued[rq_is_sync(rq)]--;
-	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
-					rq_data_dir(rq), rq_is_sync(rq));
+	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
 	cfq_add_rq_rb(rq);
-	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
-			&cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
-			rq_is_sync(rq));
+	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
+				 rq->cmd_flags);
 }
 
 static struct request *
@@ -1612,8 +1924,7 @@ static void cfq_remove_request(struct request *rq)
 	cfq_del_rq_rb(rq);
 
 	cfqq->cfqd->rq_queued--;
-	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
-					rq_data_dir(rq), rq_is_sync(rq));
+	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
 	if (rq->cmd_flags & REQ_PRIO) {
 		WARN_ON(!cfqq->prio_pending);
 		cfqq->prio_pending--;
@@ -1648,8 +1959,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
 static void cfq_bio_merged(struct request_queue *q, struct request *req,
 				struct bio *bio)
 {
-	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
-					bio_data_dir(bio), cfq_bio_sync(bio));
+	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
 }
 
 static void
@@ -1671,8 +1981,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 	if (cfqq->next_rq == next)
 		cfqq->next_rq = rq;
 	cfq_remove_request(next);
-	cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
-					rq_data_dir(next), rq_is_sync(next));
+	cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
 
 	cfqq = RQ_CFQQ(next);
 	/*
@@ -1713,7 +2022,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	del_timer(&cfqd->idle_slice_timer);
-	cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
+	cfqg_stats_update_idle_time(cfqq->cfqg);
 }
 
 static void __cfq_set_active_queue(struct cfq_data *cfqd,
@@ -1722,7 +2031,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
 	if (cfqq) {
 		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
 				cfqd->serving_prio, cfqd->serving_type);
-		cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
+		cfqg_stats_update_avg_queue_size(cfqq->cfqg);
 		cfqq->slice_start = 0;
 		cfqq->dispatch_start = jiffies;
 		cfqq->allocated_slice = 0;
@@ -2043,7 +2352,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	 * task has exited, don't wait
 	 */
 	cic = cfqd->active_cic;
-	if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks))
+	if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
 		return;
 
 	/*
@@ -2070,7 +2379,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 		sl = cfqd->cfq_slice_idle;
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
-	cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
+	cfqg_stats_set_start_idle_time(cfqq->cfqg);
 	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
 			group_idle ? 1 : 0);
 }
@@ -2093,8 +2402,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
 	cfqq->nr_sectors += blk_rq_sectors(rq);
-	cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
-					rq_data_dir(rq), rq_is_sync(rq));
+	cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
 }
 
 /*
@@ -2677,7 +2985,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 	kmem_cache_free(cfq_pool, cfqq);
-	cfq_put_cfqg(cfqg);
+	cfqg_put(cfqg);
 }
 
 static void cfq_put_cooperator(struct cfq_queue *cfqq)
@@ -2736,7 +3044,7 @@ static void cfq_exit_icq(struct io_cq *icq)
 	}
 }
 
-static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
+static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
 {
 	struct task_struct *tsk = current;
 	int ioprio_class;
@@ -2744,7 +3052,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 	if (!cfq_cfqq_prio_changed(cfqq))
 		return;
 
-	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
+	ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
 	switch (ioprio_class) {
 	default:
 		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
@@ -2756,11 +3064,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 		cfqq->ioprio_class = task_nice_ioclass(tsk);
 		break;
 	case IOPRIO_CLASS_RT:
-		cfqq->ioprio = task_ioprio(ioc);
+		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
 		cfqq->ioprio_class = IOPRIO_CLASS_RT;
 		break;
 	case IOPRIO_CLASS_BE:
-		cfqq->ioprio = task_ioprio(ioc);
+		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
 		cfqq->ioprio_class = IOPRIO_CLASS_BE;
 		break;
 	case IOPRIO_CLASS_IDLE:
@@ -2778,19 +3086,24 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
-static void changed_ioprio(struct cfq_io_cq *cic)
+static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
 {
+	int ioprio = cic->icq.ioc->ioprio;
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *cfqq;
 
-	if (unlikely(!cfqd))
+	/*
+	 * Check whether ioprio has changed.  The condition may trigger
+	 * spuriously on a newly created cic but there's no harm.
+	 */
+	if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
 		return;
 
 	cfqq = cic->cfqq[BLK_RW_ASYNC];
 	if (cfqq) {
 		struct cfq_queue *new_cfqq;
-		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc,
-						GFP_ATOMIC);
+		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
+					 GFP_ATOMIC);
 		if (new_cfqq) {
 			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
 			cfq_put_queue(cfqq);
@@ -2800,6 +3113,8 @@ static void changed_ioprio(struct cfq_io_cq *cic)
 	cfqq = cic->cfqq[BLK_RW_SYNC];
 	if (cfqq)
 		cfq_mark_cfqq_prio_changed(cfqq);
+
+	cic->ioprio = ioprio;
 }
 
 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -2823,17 +3138,24 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void changed_cgroup(struct cfq_io_cq *cic)
+static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 {
-	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
-	struct request_queue *q;
+	struct cfq_queue *sync_cfqq;
+	uint64_t id;
 
-	if (unlikely(!cfqd))
-		return;
+	rcu_read_lock();
+	id = bio_blkcg(bio)->id;
+	rcu_read_unlock();
 
-	q = cfqd->queue;
+	/*
+	 * Check whether blkcg has changed.  The condition may trigger
+	 * spuriously on a newly created cic but there's no harm.
+	 */
+	if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
+		return;
 
+	sync_cfqq = cic_to_cfqq(cic, 1);
 	if (sync_cfqq) {
 		/*
 		 * Drop reference to sync queue. A new sync queue will be
@@ -2843,21 +3165,26 @@ static void changed_cgroup(struct cfq_io_cq *cic)
 		cic_set_cfqq(cic, NULL, 1);
 		cfq_put_queue(sync_cfqq);
 	}
+
+	cic->blkcg_id = id;
 }
+#else
+static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 
 static struct cfq_queue *
-cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
-		     struct io_context *ioc, gfp_t gfp_mask)
+cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
+		     struct bio *bio, gfp_t gfp_mask)
 {
+	struct blkcg *blkcg;
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
-	struct cfq_io_cq *cic;
 	struct cfq_group *cfqg;
 
 retry:
-	cfqg = cfq_get_cfqg(cfqd);
-	cic = cfq_cic_lookup(cfqd, ioc);
-	/* cic always exists here */
+	rcu_read_lock();
+
+	blkcg = bio_blkcg(bio);
+	cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
 	cfqq = cic_to_cfqq(cic, is_sync);
 
 	/*
@@ -2870,6 +3197,7 @@ retry:
 			cfqq = new_cfqq;
 			new_cfqq = NULL;
 		} else if (gfp_mask & __GFP_WAIT) {
+			rcu_read_unlock();
 			spin_unlock_irq(cfqd->queue->queue_lock);
 			new_cfqq = kmem_cache_alloc_node(cfq_pool,
 					gfp_mask | __GFP_ZERO,
@@ -2885,7 +3213,7 @@ retry:
 
 		if (cfqq) {
 			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
-			cfq_init_prio_data(cfqq, ioc);
+			cfq_init_prio_data(cfqq, cic);
 			cfq_link_cfqq_cfqg(cfqq, cfqg);
 			cfq_log_cfqq(cfqd, cfqq, "alloced");
 		} else
@@ -2895,6 +3223,7 @@ retry:
 	if (new_cfqq)
 		kmem_cache_free(cfq_pool, new_cfqq);
 
+	rcu_read_unlock();
 	return cfqq;
 }
 
@@ -2904,6 +3233,9 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
 	switch (ioprio_class) {
 	case IOPRIO_CLASS_RT:
 		return &cfqd->async_cfqq[0][ioprio];
+	case IOPRIO_CLASS_NONE:
+		ioprio = IOPRIO_NORM;
+		/* fall through */
 	case IOPRIO_CLASS_BE:
 		return &cfqd->async_cfqq[1][ioprio];
 	case IOPRIO_CLASS_IDLE:
@@ -2914,11 +3246,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
 }
 
 static struct cfq_queue *
-cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
-	      gfp_t gfp_mask)
+cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
+	      struct bio *bio, gfp_t gfp_mask)
 {
-	const int ioprio = task_ioprio(ioc);
-	const int ioprio_class = task_ioprio_class(ioc);
+	const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
+	const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
 	struct cfq_queue **async_cfqq = NULL;
 	struct cfq_queue *cfqq = NULL;
 
@@ -2928,7 +3260,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
 	}
 
 	if (!cfqq)
-		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
+		cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
 
 	/*
 	 * pin the queue now that it's allocated, scheduler exit will prune it
@@ -3010,7 +3342,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
 		enable_idle = 0;
-	else if (!atomic_read(&cic->icq.ioc->nr_tasks) ||
+	else if (!atomic_read(&cic->icq.ioc->active_ref) ||
 		 !cfqd->cfq_slice_idle ||
 		 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
 		enable_idle = 0;
@@ -3174,8 +3506,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 				cfq_clear_cfqq_wait_request(cfqq);
 				__blk_run_queue(cfqd->queue);
 			} else {
-				cfq_blkiocg_update_idle_time_stats(
-						&cfqq->cfqg->blkg);
+				cfqg_stats_update_idle_time(cfqq->cfqg);
 				cfq_mark_cfqq_must_dispatch(cfqq);
 			}
 		}
@@ -3197,14 +3528,13 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
 	cfq_log_cfqq(cfqd, cfqq, "insert_request");
-	cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc);
+	cfq_init_prio_data(cfqq, RQ_CIC(rq));
 
 	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
 	list_add_tail(&rq->queuelist, &cfqq->fifo);
 	cfq_add_rq_rb(rq);
-	cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
-			&cfqd->serving_group->blkg, rq_data_dir(rq),
-			rq_is_sync(rq));
+	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
+				 rq->cmd_flags);
 	cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 
@@ -3300,9 +3630,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
 	(RQ_CFQG(rq))->dispatched--;
-	cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
-			rq_start_time_ns(rq), rq_io_start_time_ns(rq),
-			rq_data_dir(rq), rq_is_sync(rq));
+	cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
+				     rq_io_start_time_ns(rq), rq->cmd_flags);
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
@@ -3399,7 +3728,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
 
 	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
 	if (cfqq) {
-		cfq_init_prio_data(cfqq, cic->icq.ioc);
+		cfq_init_prio_data(cfqq, cic);
 
 		return __cfq_may_queue(cfqq);
 	}
@@ -3421,7 +3750,7 @@ static void cfq_put_request(struct request *rq)
 		cfqq->allocated[rw]--;
 
 		/* Put down rq reference on cfqg */
-		cfq_put_cfqg(RQ_CFQG(rq));
+		cfqg_put(RQ_CFQG(rq));
 		rq->elv.priv[0] = NULL;
 		rq->elv.priv[1] = NULL;
 
@@ -3465,32 +3794,25 @@ split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
  * Allocate cfq data structures associated with this request.
  */
 static int
-cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
+cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
+		gfp_t gfp_mask)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
 	const int rw = rq_data_dir(rq);
 	const bool is_sync = rq_is_sync(rq);
 	struct cfq_queue *cfqq;
-	unsigned int changed;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
 	spin_lock_irq(q->queue_lock);
 
-	/* handle changed notifications */
-	changed = icq_get_changed(&cic->icq);
-	if (unlikely(changed & ICQ_IOPRIO_CHANGED))
-		changed_ioprio(cic);
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	if (unlikely(changed & ICQ_CGROUP_CHANGED))
-		changed_cgroup(cic);
-#endif
-
+	check_ioprio_changed(cic, bio);
+	check_blkcg_changed(cic, bio);
 new_queue:
 	cfqq = cic_to_cfqq(cic, is_sync);
 	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-		cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask);
+		cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
 		cic_set_cfqq(cic, cfqq, is_sync);
 	} else {
 		/*
@@ -3516,8 +3838,9 @@ new_queue:
 	cfqq->allocated[rw]++;
 
 	cfqq->ref++;
+	cfqg_get(cfqq->cfqg);
 	rq->elv.priv[0] = cfqq;
-	rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
+	rq->elv.priv[1] = cfqq->cfqg;
 	spin_unlock_irq(q->queue_lock);
 	return 0;
 }
@@ -3614,7 +3937,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
 {
 	struct cfq_data *cfqd = e->elevator_data;
 	struct request_queue *q = cfqd->queue;
-	bool wait = false;
 
 	cfq_shutdown_timer_wq(cfqd);
 
@@ -3624,89 +3946,52 @@ static void cfq_exit_queue(struct elevator_queue *e)
 		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
 
 	cfq_put_async_queues(cfqd);
-	cfq_release_cfq_groups(cfqd);
-
-	/*
-	 * If there are groups which we could not unlink from blkcg list,
-	 * wait for a rcu period for them to be freed.
-	 */
-	if (cfqd->nr_blkcg_linked_grps)
-		wait = true;
 
 	spin_unlock_irq(q->queue_lock);
 
 	cfq_shutdown_timer_wq(cfqd);
 
-	/*
-	 * Wait for cfqg->blkg->key accessors to exit their grace periods.
-	 * Do this wait only if there are other unlinked groups out
-	 * there. This can happen if cgroup deletion path claimed the
-	 * responsibility of cleaning up a group before queue cleanup code
-	 * get to the group.
-	 *
-	 * Do not call synchronize_rcu() unconditionally as there are drivers
-	 * which create/delete request queue hundreds of times during scan/boot
-	 * and synchronize_rcu() can take significant time and slow down boot.
-	 */
-	if (wait)
-		synchronize_rcu();
-
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	/* Free up per cpu stats for root group */
-	free_percpu(cfqd->root_group.blkg.stats_cpu);
+#ifndef CONFIG_CFQ_GROUP_IOSCHED
+	kfree(cfqd->root_group);
 #endif
+	blkcg_deactivate_policy(q, &blkcg_policy_cfq);
 	kfree(cfqd);
 }
 
-static void *cfq_init_queue(struct request_queue *q)
+static int cfq_init_queue(struct request_queue *q)
 {
 	struct cfq_data *cfqd;
-	int i, j;
-	struct cfq_group *cfqg;
-	struct cfq_rb_root *st;
+	struct blkcg_gq *blkg __maybe_unused;
+	int i, ret;
 
 	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
 	if (!cfqd)
-		return NULL;
+		return -ENOMEM;
+
+	cfqd->queue = q;
+	q->elevator->elevator_data = cfqd;
 
 	/* Init root service tree */
 	cfqd->grp_service_tree = CFQ_RB_ROOT;
 
-	/* Init root group */
-	cfqg = &cfqd->root_group;
-	for_each_cfqg_st(cfqg, i, j, st)
-		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->rb_node);
-
-	/* Give preference to root group over other groups */
-	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
-
+	/* Init root group and prefer root group over other groups by default */
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-	/*
-	 * Set root group reference to 2. One reference will be dropped when
-	 * all groups on cfqd->cfqg_list are being deleted during queue exit.
-	 * Other reference will remain there as we don't want to delete this
-	 * group as it is statically allocated and gets destroyed when
-	 * throtl_data goes away.
-	 */
-	cfqg->ref = 2;
-
-	if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
-		kfree(cfqg);
-		kfree(cfqd);
-		return NULL;
-	}
-
-	rcu_read_lock();
+	ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
+	if (ret)
+		goto out_free;
 
-	cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
-					(void *)cfqd, 0);
-	rcu_read_unlock();
-	cfqd->nr_blkcg_linked_grps++;
+	cfqd->root_group = blkg_to_cfqg(q->root_blkg);
+#else
+	ret = -ENOMEM;
+	cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
+					GFP_KERNEL, cfqd->queue->node);
+	if (!cfqd->root_group)
+		goto out_free;
 
-	/* Add group on cfqd->cfqg_list */
-	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+	cfq_init_cfqg_base(cfqd->root_group);
 #endif
+	cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
+
 	/*
 	 * Not strictly needed (since RB_ROOT just clears the node and we
 	 * zeroed cfqd on alloc), but better be safe in case someone decides
@@ -3718,13 +4003,17 @@ static void *cfq_init_queue(struct request_queue *q)
 	/*
 	 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
 	 * Grab a permanent reference to it, so that the normal code flow
-	 * will not attempt to free it.
+	 * will not attempt to free it.  oom_cfqq is linked to root_group
+	 * but shouldn't hold a reference as it'll never be unlinked.  Lose
+	 * the reference from linking right away.
 	 */
 	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
 	cfqd->oom_cfqq.ref++;
-	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
 
-	cfqd->queue = q;
+	spin_lock_irq(q->queue_lock);
+	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
+	cfqg_put(cfqd->root_group);
+	spin_unlock_irq(q->queue_lock);
 
 	init_timer(&cfqd->idle_slice_timer);
 	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
@@ -3750,7 +4039,11 @@ static void *cfq_init_queue(struct request_queue *q)
 	 * second, in order to have larger depth for async operations.
 	 */
 	cfqd->last_delayed_sync = jiffies - HZ;
-	return cfqd;
+	return 0;
+
+out_free:
+	kfree(cfqd);
+	return ret;
 }
 
 /*
@@ -3877,15 +4170,13 @@ static struct elevator_type iosched_cfq = {
 };
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static struct blkio_policy_type blkio_policy_cfq = {
-	.ops = {
-		.blkio_unlink_group_fn =	cfq_unlink_blkio_group,
-		.blkio_update_group_weight_fn =	cfq_update_blkio_group_weight,
-	},
-	.plid = BLKIO_POLICY_PROP,
+static struct blkcg_policy blkcg_policy_cfq = {
+	.pd_size		= sizeof(struct cfq_group),
+	.cftypes		= cfq_blkcg_files,
+
+	.pd_init_fn		= cfq_pd_init,
+	.pd_reset_stats_fn	= cfq_pd_reset_stats,
 };
-#else
-static struct blkio_policy_type blkio_policy_cfq;
 #endif
 
 static int __init cfq_init(void)
@@ -3906,24 +4197,31 @@ static int __init cfq_init(void)
 #else
 		cfq_group_idle = 0;
 #endif
+
+	ret = blkcg_policy_register(&blkcg_policy_cfq);
+	if (ret)
+		return ret;
+
 	cfq_pool = KMEM_CACHE(cfq_queue, 0);
 	if (!cfq_pool)
-		return -ENOMEM;
+		goto err_pol_unreg;
 
 	ret = elv_register(&iosched_cfq);
-	if (ret) {
-		kmem_cache_destroy(cfq_pool);
-		return ret;
-	}
-
-	blkio_policy_register(&blkio_policy_cfq);
+	if (ret)
+		goto err_free_pool;
 
 	return 0;
+
+err_free_pool:
+	kmem_cache_destroy(cfq_pool);
+err_pol_unreg:
+	blkcg_policy_unregister(&blkcg_policy_cfq);
+	return ret;
 }
 
 static void __exit cfq_exit(void)
 {
-	blkio_policy_unregister(&blkio_policy_cfq);
+	blkcg_policy_unregister(&blkcg_policy_cfq);
 	elv_unregister(&iosched_cfq);
 	kmem_cache_destroy(cfq_pool);
 }
diff --git a/block/cfq.h b/block/cfq.h
deleted file mode 100644
index 2a155927e37c..000000000000
--- a/block/cfq.h
+++ /dev/null
@@ -1,115 +0,0 @@
-#ifndef _CFQ_H
-#define _CFQ_H
-#include "blk-cgroup.h"
-
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
-	struct blkio_group *curr_blkg, bool direction, bool sync)
-{
-	blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync);
-}
-
-static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-			unsigned long dequeue)
-{
-	blkiocg_update_dequeue_stats(blkg, dequeue);
-}
-
-static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
-			unsigned long time, unsigned long unaccounted_time)
-{
-	blkiocg_update_timeslice_used(blkg, time, unaccounted_time);
-}
-
-static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg)
-{
-	blkiocg_set_start_empty_time(blkg);
-}
-
-static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-				bool direction, bool sync)
-{
-	blkiocg_update_io_remove_stats(blkg, direction, sync);
-}
-
-static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-		bool direction, bool sync)
-{
-	blkiocg_update_io_merged_stats(blkg, direction, sync);
-}
-
-static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg)
-{
-	blkiocg_update_idle_time_stats(blkg);
-}
-
-static inline void
-cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
-{
-	blkiocg_update_avg_queue_size_stats(blkg);
-}
-
-static inline void
-cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
-{
-	blkiocg_update_set_idle_time_stats(blkg);
-}
-
-static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-				uint64_t bytes, bool direction, bool sync)
-{
-	blkiocg_update_dispatch_stats(blkg, bytes, direction, sync);
-}
-
-static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
-{
-	blkiocg_update_completion_stats(blkg, start_time, io_start_time,
-				direction, sync);
-}
-
-static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-			struct blkio_group *blkg, void *key, dev_t dev) {
-	blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP);
-}
-
-static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	return blkiocg_del_blkio_group(blkg);
-}
-
-#else /* CFQ_GROUP_IOSCHED */
-static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
-	struct blkio_group *curr_blkg, bool direction, bool sync) {}
-
-static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
-			unsigned long dequeue) {}
-
-static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
-			unsigned long time, unsigned long unaccounted_time) {}
-static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
-static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
-				bool direction, bool sync) {}
-static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
-		bool direction, bool sync) {}
-static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg)
-{
-}
-static inline void
-cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {}
-
-static inline void
-cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {}
-
-static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
-				uint64_t bytes, bool direction, bool sync) {}
-static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {}
-
-static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-			struct blkio_group *blkg, void *key, dev_t dev) {}
-static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	return 0;
-}
-
-#endif /* CFQ_GROUP_IOSCHED */
-#endif
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 7bf12d793fcd..599b12e5380f 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -337,13 +337,13 @@ static void deadline_exit_queue(struct elevator_queue *e)
 /*
  * initialize elevator private data (deadline_data).
  */
-static void *deadline_init_queue(struct request_queue *q)
+static int deadline_init_queue(struct request_queue *q)
 {
 	struct deadline_data *dd;
 
 	dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
 	if (!dd)
-		return NULL;
+		return -ENOMEM;
 
 	INIT_LIST_HEAD(&dd->fifo_list[READ]);
 	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
@@ -354,7 +354,9 @@ static void *deadline_init_queue(struct request_queue *q)
 	dd->writes_starved = writes_starved;
 	dd->front_merges = 1;
 	dd->fifo_batch = fifo_batch;
-	return dd;
+
+	q->elevator->elevator_data = dd;
+	return 0;
 }
 
 /*
diff --git a/block/elevator.c b/block/elevator.c
index f016855a46b0..6a55d418896f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -38,6 +38,7 @@
 #include <trace/events/block.h>
 
 #include "blk.h"
+#include "blk-cgroup.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
@@ -121,15 +122,6 @@ static struct elevator_type *elevator_get(const char *name)
 	return e;
 }
 
-static int elevator_init_queue(struct request_queue *q,
-			       struct elevator_queue *eq)
-{
-	eq->elevator_data = eq->type->ops.elevator_init_fn(q);
-	if (eq->elevator_data)
-		return 0;
-	return -ENOMEM;
-}
-
 static char chosen_elevator[ELV_NAME_MAX];
 
 static int __init elevator_setup(char *str)
@@ -188,7 +180,6 @@ static void elevator_release(struct kobject *kobj)
 int elevator_init(struct request_queue *q, char *name)
 {
 	struct elevator_type *e = NULL;
-	struct elevator_queue *eq;
 	int err;
 
 	if (unlikely(q->elevator))
@@ -222,17 +213,16 @@ int elevator_init(struct request_queue *q, char *name)
 		}
 	}
 
-	eq = elevator_alloc(q, e);
-	if (!eq)
+	q->elevator = elevator_alloc(q, e);
+	if (!q->elevator)
 		return -ENOMEM;
 
-	err = elevator_init_queue(q, eq);
+	err = e->ops.elevator_init_fn(q);
 	if (err) {
-		kobject_put(&eq->kobj);
+		kobject_put(&q->elevator->kobj);
 		return err;
 	}
 
-	q->elevator = eq;
 	return 0;
 }
 EXPORT_SYMBOL(elevator_init);
@@ -564,25 +554,6 @@ void elv_drain_elevator(struct request_queue *q)
 	}
 }
 
-void elv_quiesce_start(struct request_queue *q)
-{
-	if (!q->elevator)
-		return;
-
-	spin_lock_irq(q->queue_lock);
-	queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
-	spin_unlock_irq(q->queue_lock);
-
-	blk_drain_queue(q, false);
-}
-
-void elv_quiesce_end(struct request_queue *q)
-{
-	spin_lock_irq(q->queue_lock);
-	queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
-	spin_unlock_irq(q->queue_lock);
-}
-
 void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 {
 	trace_block_rq_insert(q, rq);
@@ -692,12 +663,13 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 	return NULL;
 }
 
-int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
+int elv_set_request(struct request_queue *q, struct request *rq,
+		    struct bio *bio, gfp_t gfp_mask)
 {
 	struct elevator_queue *e = q->elevator;
 
 	if (e->type->ops.elevator_set_req_fn)
-		return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask);
+		return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask);
 	return 0;
 }
 
@@ -801,8 +773,9 @@ static struct kobj_type elv_ktype = {
 	.release	= elevator_release,
 };
 
-int __elv_register_queue(struct request_queue *q, struct elevator_queue *e)
+int elv_register_queue(struct request_queue *q)
 {
+	struct elevator_queue *e = q->elevator;
 	int error;
 
 	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
@@ -820,11 +793,6 @@ int __elv_register_queue(struct request_queue *q, struct elevator_queue *e)
 	}
 	return error;
 }
-
-int elv_register_queue(struct request_queue *q)
-{
-	return __elv_register_queue(q, q->elevator);
-}
 EXPORT_SYMBOL(elv_register_queue);
 
 void elv_unregister_queue(struct request_queue *q)
@@ -907,53 +875,60 @@ EXPORT_SYMBOL_GPL(elv_unregister);
  */
 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
-	struct elevator_queue *old_elevator, *e;
+	struct elevator_queue *old = q->elevator;
+	bool registered = old->registered;
 	int err;
 
-	/* allocate new elevator */
-	e = elevator_alloc(q, new_e);
-	if (!e)
-		return -ENOMEM;
+	/*
+	 * Turn on BYPASS and drain all requests w/ elevator private data.
+	 * Block layer doesn't call into a quiesced elevator - all requests
+	 * are directly put on the dispatch list without elevator data
+	 * using INSERT_BACK.  All requests have SOFTBARRIER set and no
+	 * merge happens either.
+	 */
+	blk_queue_bypass_start(q);
+
+	/* unregister and clear all auxiliary data of the old elevator */
+	if (registered)
+		elv_unregister_queue(q);
+
+	spin_lock_irq(q->queue_lock);
+	ioc_clear_queue(q);
+	spin_unlock_irq(q->queue_lock);
 
-	err = elevator_init_queue(q, e);
+	/* allocate, init and register new elevator */
+	err = -ENOMEM;
+	q->elevator = elevator_alloc(q, new_e);
+	if (!q->elevator)
+		goto fail_init;
+
+	err = new_e->ops.elevator_init_fn(q);
 	if (err) {
-		kobject_put(&e->kobj);
-		return err;
+		kobject_put(&q->elevator->kobj);
+		goto fail_init;
 	}
 
-	/* turn on BYPASS and drain all requests w/ elevator private data */
-	elv_quiesce_start(q);
-
-	/* unregister old queue, register new one and kill old elevator */
-	if (q->elevator->registered) {
-		elv_unregister_queue(q);
-		err = __elv_register_queue(q, e);
+	if (registered) {
+		err = elv_register_queue(q);
 		if (err)
 			goto fail_register;
 	}
 
-	/* done, clear io_cq's, switch elevators and turn off BYPASS */
-	spin_lock_irq(q->queue_lock);
-	ioc_clear_queue(q);
-	old_elevator = q->elevator;
-	q->elevator = e;
-	spin_unlock_irq(q->queue_lock);
-
-	elevator_exit(old_elevator);
-	elv_quiesce_end(q);
+	/* done, kill the old one and finish */
+	elevator_exit(old);
+	blk_queue_bypass_end(q);
 
-	blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name);
+	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
 
 	return 0;
 
 fail_register:
-	/*
-	 * switch failed, exit the new io scheduler and reattach the old
-	 * one again (along with re-adding the sysfs dir)
-	 */
-	elevator_exit(e);
+	elevator_exit(q->elevator);
+fail_init:
+	/* switch failed, restore and re-register old elevator */
+	q->elevator = old;
 	elv_register_queue(q);
-	elv_quiesce_end(q);
+	blk_queue_bypass_end(q);
 
 	return err;
 }
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 413a0b1d788c..5d1bf70e33d5 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -59,15 +59,17 @@ noop_latter_request(struct request_queue *q, struct request *rq)
 	return list_entry(rq->queuelist.next, struct request, queuelist);
 }
 
-static void *noop_init_queue(struct request_queue *q)
+static int noop_init_queue(struct request_queue *q)
 {
 	struct noop_data *nd;
 
 	nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
 	if (!nd)
-		return NULL;
+		return -ENOMEM;
+
 	INIT_LIST_HEAD(&nd->queue);
-	return nd;
+	q->elevator->elevator_data = nd;
+	return 0;
 }
 
 static void noop_exit_queue(struct elevator_queue *e)
diff --git a/drivers/acpi/bgrt.c b/drivers/acpi/bgrt.c
index 8cf6c46e99fb..6680df36b963 100644
--- a/drivers/acpi/bgrt.c
+++ b/drivers/acpi/bgrt.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/sysfs.h>
+#include <linux/io.h>
 #include <acpi/acpi.h>
 #include <acpi/acpi_bus.h>
 
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 06527c526618..ebaa04593236 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -25,6 +25,8 @@
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
 
+#include <asm/realmode.h>
+
 #include "internal.h"
 #include "sleep.h"
 
@@ -91,13 +93,13 @@ static struct notifier_block tts_notifier = {
 static int acpi_sleep_prepare(u32 acpi_state)
 {
 #ifdef CONFIG_ACPI_SLEEP
+	unsigned long wakeup_pa = real_mode_header->wakeup_start;
 	/* do we have a wakeup address for S2 and S3? */
 	if (acpi_state == ACPI_STATE_S3) {
-		if (!acpi_wakeup_address) {
+		if (!wakeup_pa)
 			return -EFAULT;
-		}
 		acpi_set_firmware_waking_vector(
-				(acpi_physical_address)acpi_wakeup_address);
+				(acpi_physical_address)wakeup_pa);
 
 	}
 	ACPI_FLUSH_CPU_CACHE();
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 90aa2a11a933..af1a177216f1 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -592,11 +592,9 @@ static ssize_t print_nodes_state(enum node_states state, char *buf)
 {
 	int n;
 
-	n = nodelist_scnprintf(buf, PAGE_SIZE, node_states[state]);
-	if (n > 0 && PAGE_SIZE > n + 1) {
-		*(buf + n++) = '\n';
-		*(buf + n++) = '\0';
-	}
+	n = nodelist_scnprintf(buf, PAGE_SIZE-2, node_states[state]);
+	buf[n++] = '\n';
+	buf[n] = '\0';
 	return n;
 }
 
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index cf0e63dd97da..e54e31b02b88 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -65,39 +65,80 @@ struct drbd_atodb_wait {
 
 int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
 
+void *drbd_md_get_buffer(struct drbd_conf *mdev)
+{
+	int r;
+
+	wait_event(mdev->misc_wait,
+		   (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 ||
+		   mdev->state.disk <= D_FAILED);
+
+	return r ? NULL : page_address(mdev->md_io_page);
+}
+
+void drbd_md_put_buffer(struct drbd_conf *mdev)
+{
+	if (atomic_dec_and_test(&mdev->md_io_in_use))
+		wake_up(&mdev->misc_wait);
+}
+
+static bool md_io_allowed(struct drbd_conf *mdev)
+{
+	enum drbd_disk_state ds = mdev->state.disk;
+	return ds >= D_NEGOTIATING || ds == D_ATTACHING;
+}
+
+void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+				     unsigned int *done)
+{
+	long dt = bdev->dc.disk_timeout * HZ / 10;
+	if (dt == 0)
+		dt = MAX_SCHEDULE_TIMEOUT;
+
+	dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt);
+	if (dt == 0)
+		dev_err(DEV, "meta-data IO operation timed out\n");
+}
+
 static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
 				 struct drbd_backing_dev *bdev,
 				 struct page *page, sector_t sector,
 				 int rw, int size)
 {
 	struct bio *bio;
-	struct drbd_md_io md_io;
 	int ok;
 
-	md_io.mdev = mdev;
-	init_completion(&md_io.event);
-	md_io.error = 0;
+	mdev->md_io.done = 0;
+	mdev->md_io.error = -ENODEV;
 
 	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
 		rw |= REQ_FUA | REQ_FLUSH;
 	rw |= REQ_SYNC;
 
-	bio = bio_alloc(GFP_NOIO, 1);
+	bio = bio_alloc_drbd(GFP_NOIO);
 	bio->bi_bdev = bdev->md_bdev;
 	bio->bi_sector = sector;
 	ok = (bio_add_page(bio, page, size, 0) == size);
 	if (!ok)
 		goto out;
-	bio->bi_private = &md_io;
+	bio->bi_private = &mdev->md_io;
 	bio->bi_end_io = drbd_md_io_complete;
 	bio->bi_rw = rw;
 
+	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* Corresponding put_ldev in drbd_md_io_complete() */
+		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
+		ok = 0;
+		goto out;
+	}
+
+	bio_get(bio); /* one bio_put() is in the completion handler */
+	atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
 	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
 		bio_endio(bio, -EIO);
 	else
 		submit_bio(rw, bio);
-	wait_for_completion(&md_io.event);
-	ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
+	wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done);
+	ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0;
 
  out:
 	bio_put(bio);
@@ -111,7 +152,7 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
 	int offset = 0;
 	struct page *iop = mdev->md_io_page;
 
-	D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
+	D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);
 
 	BUG_ON(!bdev->md_bdev);
 
@@ -328,8 +369,13 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 		return 1;
 	}
 
-	mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
-	buffer = (struct al_transaction *)page_address(mdev->md_io_page);
+	buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
+	if (!buffer) {
+		dev_err(DEV, "disk failed while waiting for md_io buffer\n");
+		complete(&((struct update_al_work *)w)->event);
+		put_ldev(mdev);
+		return 1;
+	}
 
 	buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
 	buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
@@ -374,7 +420,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 	D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
 	mdev->al_tr_number++;
 
-	mutex_unlock(&mdev->md_io_mutex);
+	drbd_md_put_buffer(mdev);
 
 	complete(&((struct update_al_work *)w)->event);
 	put_ldev(mdev);
@@ -443,8 +489,9 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 	/* lock out all other meta data io for now,
 	 * and make sure the page is mapped.
 	 */
-	mutex_lock(&mdev->md_io_mutex);
-	buffer = page_address(mdev->md_io_page);
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		return 0;
 
 	/* Find the valid transaction in the log */
 	for (i = 0; i <= mx; i++) {
@@ -452,7 +499,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 		if (rv == 0)
 			continue;
 		if (rv == -1) {
-			mutex_unlock(&mdev->md_io_mutex);
+			drbd_md_put_buffer(mdev);
 			return 0;
 		}
 		cnr = be32_to_cpu(buffer->tr_number);
@@ -478,7 +525,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 
 	if (!found_valid) {
 		dev_warn(DEV, "No usable activity log found.\n");
-		mutex_unlock(&mdev->md_io_mutex);
+		drbd_md_put_buffer(mdev);
 		return 1;
 	}
 
@@ -493,7 +540,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 		rv = drbd_al_read_tr(mdev, bdev, buffer, i);
 		ERR_IF(rv == 0) goto cancel;
 		if (rv == -1) {
-			mutex_unlock(&mdev->md_io_mutex);
+			drbd_md_put_buffer(mdev);
 			return 0;
 		}
 
@@ -534,7 +581,7 @@ cancel:
 		mdev->al_tr_pos = 0;
 
 	/* ok, we are done with it */
-	mutex_unlock(&mdev->md_io_mutex);
+	drbd_md_put_buffer(mdev);
 
 	dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
 	     transactions, active_extents);
@@ -671,16 +718,20 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
 			else
 				ext->rs_failed += count;
 			if (ext->rs_left < ext->rs_failed) {
-				dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
-				    "rs_failed=%d count=%d\n",
+				dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
+				    "rs_failed=%d count=%d cstate=%s\n",
 				     (unsigned long long)sector,
 				     ext->lce.lc_number, ext->rs_left,
-				     ext->rs_failed, count);
-				dump_stack();
-
-				lc_put(mdev->resync, &ext->lce);
-				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-				return;
+				     ext->rs_failed, count,
+				     drbd_conn_str(mdev->state.conn));
+
+				/* We don't expect to be able to clear more bits
+				 * than have been set when we originally counted
+				 * the set bits to cache that value in ext->rs_left.
+				 * Whatever the reason (disconnect during resync,
+				 * delayed local completion of an application write),
+				 * try to fix it up by recounting here. */
+				ext->rs_left = drbd_bm_e_weight(mdev, enr);
 			}
 		} else {
 			/* Normally this element should be in the cache,
@@ -1192,6 +1243,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev)
 		put_ldev(mdev);
 	}
 	spin_unlock_irq(&mdev->al_lock);
+	wake_up(&mdev->al_wait);
 
 	return 0;
 }
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 3030201c69d8..b5c5ff53cb57 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -205,7 +205,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
 static void bm_store_page_idx(struct page *page, unsigned long idx)
 {
 	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
-	page_private(page) |= idx;
+	set_page_private(page, idx);
 }
 
 static unsigned long bm_page_to_idx(struct page *page)
@@ -886,12 +886,21 @@ void drbd_bm_clear_all(struct drbd_conf *mdev)
 struct bm_aio_ctx {
 	struct drbd_conf *mdev;
 	atomic_t in_flight;
-	struct completion done;
+	unsigned int done;
 	unsigned flags;
 #define BM_AIO_COPY_PAGES	1
 	int error;
+	struct kref kref;
 };
 
+static void bm_aio_ctx_destroy(struct kref *kref)
+{
+	struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
+
+	put_ldev(ctx->mdev);
+	kfree(ctx);
+}
+
 /* bv_page may be a copy, or may be the original */
 static void bm_async_io_complete(struct bio *bio, int error)
 {
@@ -930,20 +939,21 @@ static void bm_async_io_complete(struct bio *bio, int error)
 
 	bm_page_unlock_io(mdev, idx);
 
-	/* FIXME give back to page pool */
 	if (ctx->flags & BM_AIO_COPY_PAGES)
-		put_page(bio->bi_io_vec[0].bv_page);
+		mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
 
 	bio_put(bio);
 
-	if (atomic_dec_and_test(&ctx->in_flight))
-		complete(&ctx->done);
+	if (atomic_dec_and_test(&ctx->in_flight)) {
+		ctx->done = 1;
+		wake_up(&mdev->misc_wait);
+		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	}
 }
 
 static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
 {
-	/* we are process context. we always get a bio */
-	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+	struct bio *bio = bio_alloc_drbd(GFP_NOIO);
 	struct drbd_conf *mdev = ctx->mdev;
 	struct drbd_bitmap *b = mdev->bitmap;
 	struct page *page;
@@ -966,10 +976,8 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
 	bm_set_page_unchanged(b->bm_pages[page_nr]);
 
 	if (ctx->flags & BM_AIO_COPY_PAGES) {
-		/* FIXME alloc_page is good enough for now, but actually needs
-		 * to use pre-allocated page pool */
 		void *src, *dest;
-		page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
+		page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
 		dest = kmap_atomic(page);
 		src = kmap_atomic(b->bm_pages[page_nr]);
 		memcpy(dest, src, PAGE_SIZE);
@@ -981,6 +989,8 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
 
 	bio->bi_bdev = mdev->ldev->md_bdev;
 	bio->bi_sector = on_disk_sector;
+	/* bio_add_page of a single page to an empty bio will always succeed,
+	 * according to api.  Do we want to assert that? */
 	bio_add_page(bio, page, len, 0);
 	bio->bi_private = ctx;
 	bio->bi_end_io = bm_async_io_complete;
@@ -999,14 +1009,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
 /*
  * bm_rw: read/write the whole bitmap from/to its on disk location.
  */
-static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
+static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
 {
-	struct bm_aio_ctx ctx = {
-		.mdev = mdev,
-		.in_flight = ATOMIC_INIT(1),
-		.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
-		.flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0,
-	};
+	struct bm_aio_ctx *ctx;
 	struct drbd_bitmap *b = mdev->bitmap;
 	int num_pages, i, count = 0;
 	unsigned long now;
@@ -1021,7 +1026,27 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
 	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
 	 * as we submit copies of pages anyways.
 	 */
-	if (!ctx.flags)
+
+	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+	if (!ctx)
+		return -ENOMEM;
+
+	*ctx = (struct bm_aio_ctx) {
+		.mdev = mdev,
+		.in_flight = ATOMIC_INIT(1),
+		.done = 0,
+		.flags = flags,
+		.error = 0,
+		.kref = { ATOMIC_INIT(2) },
+	};
+
+	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
+		kfree(ctx);
+		return -ENODEV;
+	}
+
+	if (!ctx->flags)
 		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
 
 	num_pages = b->bm_number_of_pages;
@@ -1046,29 +1071,38 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
 				continue;
 			}
 		}
-		atomic_inc(&ctx.in_flight);
-		bm_page_io_async(&ctx, i, rw);
+		atomic_inc(&ctx->in_flight);
+		bm_page_io_async(ctx, i, rw);
 		++count;
 		cond_resched();
 	}
 
 	/*
-	 * We initialize ctx.in_flight to one to make sure bm_async_io_complete
-	 * will not complete() early, and decrement / test it here.  If there
+	 * We initialize ctx->in_flight to one to make sure bm_async_io_complete
+	 * will not set ctx->done early, and decrement / test it here.  If there
 	 * are still some bios in flight, we need to wait for them here.
+	 * If all IO is done already (or nothing had been submitted), there is
+	 * no need to wait.  Still, we need to put the kref associated with the
+	 * "in_flight reached zero, all done" event.
 	 */
-	if (!atomic_dec_and_test(&ctx.in_flight))
-		wait_for_completion(&ctx.done);
+	if (!atomic_dec_and_test(&ctx->in_flight))
+		wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
+	else
+		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+
 	dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
 			rw == WRITE ? "WRITE" : "READ",
 			count, jiffies - now);
 
-	if (ctx.error) {
+	if (ctx->error) {
 		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
 		drbd_chk_io_error(mdev, 1, true);
-		err = -EIO; /* ctx.error ? */
+		err = -EIO; /* ctx->error ? */
 	}
 
+	if (atomic_read(&ctx->in_flight))
+		err = -EIO; /* Disk failed during IO... */
+
 	now = jiffies;
 	if (rw == WRITE) {
 		drbd_md_flush(mdev);
@@ -1082,6 +1116,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
 	dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
 	     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
 
+	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
 	return err;
 }
 
@@ -1091,7 +1126,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
  */
 int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, READ, 0);
+	return bm_rw(mdev, READ, 0, 0);
 }
 
 /**
@@ -1102,7 +1137,7 @@ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
  */
 int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, WRITE, 0);
+	return bm_rw(mdev, WRITE, 0, 0);
 }
 
 /**
@@ -1112,7 +1147,23 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
  */
 int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
 {
-	return bm_rw(mdev, WRITE, upper_idx);
+	return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx);
+}
+
+/**
+ * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
+ * @mdev:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ * In contrast to drbd_bm_write(), this will copy the bitmap pages
+ * to temporary writeout pages. It is intended to trigger a full write-out
+ * while still allowing the bitmap to change, for example if a resync or online
+ * verify is aborted due to a failed peer disk, while local IO continues, or
+ * pending resync acks are still being processed.
+ */
+int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
 }
 
 
@@ -1130,28 +1181,45 @@ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(l
  */
 int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
 {
-	struct bm_aio_ctx ctx = {
+	struct bm_aio_ctx *ctx;
+	int err;
+
+	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
+		dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
+		return 0;
+	}
+
+	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+	if (!ctx)
+		return -ENOMEM;
+
+	*ctx = (struct bm_aio_ctx) {
 		.mdev = mdev,
 		.in_flight = ATOMIC_INIT(1),
-		.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
+		.done = 0,
 		.flags = BM_AIO_COPY_PAGES,
+		.error = 0,
+		.kref = { ATOMIC_INIT(2) },
 	};
 
-	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
-		dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
-		return 0;
+	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
+		kfree(ctx);
+		return -ENODEV;
 	}
 
-	bm_page_io_async(&ctx, idx, WRITE_SYNC);
-	wait_for_completion(&ctx.done);
+	bm_page_io_async(ctx, idx, WRITE_SYNC);
+	wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
 
-	if (ctx.error)
+	if (ctx->error)
 		drbd_chk_io_error(mdev, 1, true);
 		/* that should force detach, so the in memory bitmap will be
 		 * gone in a moment as well. */
 
 	mdev->bm_writ_cnt++;
-	return ctx.error;
+	err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
+	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	return err;
 }
 
 /* NOTE
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 8d680562ba73..02f013a073a7 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -712,7 +712,6 @@ struct drbd_request {
 	struct list_head tl_requests; /* ring list in the transfer log */
 	struct bio *master_bio;       /* master bio pointer */
 	unsigned long rq_state; /* see comments above _req_mod() */
-	int seq_num;
 	unsigned long start_time;
 };
 
@@ -851,6 +850,7 @@ enum {
 	NEW_CUR_UUID,		/* Create new current UUID when thawing IO */
 	AL_SUSPENDED,		/* Activity logging is currently suspended. */
 	AHEAD_TO_SYNC_SOURCE,   /* Ahead -> SyncSource queued */
+	STATE_SENT,		/* Do not change state/UUIDs while this is set */
 };
 
 struct drbd_bitmap; /* opaque for drbd_conf */
@@ -862,31 +862,30 @@ enum bm_flag {
 	BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
 
 	/* currently locked for bulk operation */
-	BM_LOCKED_MASK = 0x7,
+	BM_LOCKED_MASK = 0xf,
 
 	/* in detail, that is: */
 	BM_DONT_CLEAR = 0x1,
 	BM_DONT_SET   = 0x2,
 	BM_DONT_TEST  = 0x4,
 
+	/* so we can mark it locked for bulk operation,
+	 * and still allow all non-bulk operations */
+	BM_IS_LOCKED  = 0x8,
+
 	/* (test bit, count bit) allowed (common case) */
-	BM_LOCKED_TEST_ALLOWED = 0x3,
+	BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,
 
 	/* testing bits, as well as setting new bits allowed, but clearing bits
 	 * would be unexpected.  Used during bitmap receive.  Setting new bits
 	 * requires sending of "out-of-sync" information, though. */
-	BM_LOCKED_SET_ALLOWED = 0x1,
+	BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,
 
-	/* clear is not expected while bitmap is locked for bulk operation */
+	/* for drbd_bm_write_copy_pages, everything is allowed,
+	 * only concurrent bulk operations are locked out. */
+	BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
 };
 
-
-/* TODO sort members for performance
- * MAYBE group them further */
-
-/* THINK maybe we actually want to use the default "event/%s" worker threads
- * or similar in linux 2.6, which uses per cpu data and threads.
- */
 struct drbd_work_queue {
 	struct list_head q;
 	struct semaphore s; /* producers up it, worker down()s it */
@@ -938,8 +937,7 @@ struct drbd_backing_dev {
 };
 
 struct drbd_md_io {
-	struct drbd_conf *mdev;
-	struct completion event;
+	unsigned int done;
 	int error;
 };
 
@@ -1022,6 +1020,7 @@ struct drbd_conf {
 	struct drbd_tl_epoch *newest_tle;
 	struct drbd_tl_epoch *oldest_tle;
 	struct list_head out_of_sequence_requests;
+	struct list_head barrier_acked_requests;
 	struct hlist_head *tl_hash;
 	unsigned int tl_hash_s;
 
@@ -1056,6 +1055,8 @@ struct drbd_conf {
 	struct crypto_hash *csums_tfm;
 	struct crypto_hash *verify_tfm;
 
+	unsigned long last_reattach_jif;
+	unsigned long last_reconnect_jif;
 	struct drbd_thread receiver;
 	struct drbd_thread worker;
 	struct drbd_thread asender;
@@ -1094,7 +1095,8 @@ struct drbd_conf {
 	wait_queue_head_t ee_wait;
 	struct page *md_io_page;	/* one page buffer for md_io */
 	struct page *md_io_tmpp;	/* for logical_block_size != 512 */
-	struct mutex md_io_mutex;	/* protects the md_io_buffer */
+	struct drbd_md_io md_io;
+	atomic_t md_io_in_use;		/* protects the md_io, md_io_page and md_io_tmpp */
 	spinlock_t al_lock;
 	wait_queue_head_t al_wait;
 	struct lru_cache *act_log;	/* activity log */
@@ -1228,8 +1230,8 @@ extern int drbd_send_uuids(struct drbd_conf *mdev);
 extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
 extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
 extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
-extern int _drbd_send_state(struct drbd_conf *mdev);
-extern int drbd_send_state(struct drbd_conf *mdev);
+extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s);
+extern int drbd_send_current_state(struct drbd_conf *mdev);
 extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
 			enum drbd_packets cmd, struct p_header80 *h,
 			size_t size, unsigned msg_flags);
@@ -1461,6 +1463,7 @@ extern int  drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
 extern int  drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
 extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
 extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern int  drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
 extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
 		unsigned long al_enr);
 extern size_t	     drbd_bm_words(struct drbd_conf *mdev);
@@ -1493,11 +1496,38 @@ extern struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
 extern mempool_t *drbd_request_mempool;
 extern mempool_t *drbd_ee_mempool;
 
-extern struct page *drbd_pp_pool; /* drbd's page pool */
+/* drbd's page pool, used to buffer data received from the peer,
+ * or data requested by the peer.
+ *
+ * This does not have an emergency reserve.
+ *
+ * When allocating from this pool, it first takes pages from the pool.
+ * Only if the pool is depleted will try to allocate from the system.
+ *
+ * The assumption is that pages taken from this pool will be processed,
+ * and given back, "quickly", and then can be recycled, so we can avoid
+ * frequent calls to alloc_page(), and still will be able to make progress even
+ * under memory pressure.
+ */
+extern struct page *drbd_pp_pool;
 extern spinlock_t   drbd_pp_lock;
 extern int	    drbd_pp_vacant;
 extern wait_queue_head_t drbd_pp_wait;
 
+/* We also need a standard (emergency-reserve backed) page pool
+ * for meta data IO (activity log, bitmap).
+ * We can keep it global, as long as it is used as "N pages at a time".
+ * 128 should be plenty, currently we probably can get away with as few as 1.
+ */
+#define DRBD_MIN_POOL_PAGES	128
+extern mempool_t *drbd_md_io_page_pool;
+
+/* We also need to make sure we get a bio
+ * when we need it for housekeeping purposes */
+extern struct bio_set *drbd_md_io_bio_set;
+/* to allocate from that set */
+extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
+
 extern rwlock_t global_state_lock;
 
 extern struct drbd_conf *drbd_new_device(unsigned int minor);
@@ -1536,8 +1566,12 @@ extern void resume_next_sg(struct drbd_conf *mdev);
 extern void suspend_other_sg(struct drbd_conf *mdev);
 extern int drbd_resync_finished(struct drbd_conf *mdev);
 /* maybe rather drbd_main.c ? */
+extern void *drbd_md_get_buffer(struct drbd_conf *mdev);
+extern void drbd_md_put_buffer(struct drbd_conf *mdev);
 extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
-		struct drbd_backing_dev *bdev, sector_t sector, int rw);
+				struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+					    unsigned int *done);
 extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
 extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
 
@@ -1754,19 +1788,6 @@ static inline struct page *page_chain_next(struct page *page)
 #define page_chain_for_each_safe(page, n) \
 	for (; page && ({ n = page_chain_next(page); 1; }); page = n)
 
-static inline int drbd_bio_has_active_page(struct bio *bio)
-{
-	struct bio_vec *bvec;
-	int i;
-
-	__bio_for_each_segment(bvec, bio, i, 0) {
-		if (page_count(bvec->bv_page) > 1)
-			return 1;
-	}
-
-	return 0;
-}
-
 static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
 {
 	struct page *page = e->pages;
@@ -1777,7 +1798,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
 	return 0;
 }
 
-
 static inline void drbd_state_lock(struct drbd_conf *mdev)
 {
 	wait_event(mdev->misc_wait,
@@ -2230,7 +2250,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
 		 * Note: currently we don't support such large bitmaps on 32bit
 		 * arch anyways, but no harm done to be prepared for it here.
 		 */
-		unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10;
+		unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10;
 		unsigned long left = *bits_left >> shift;
 		unsigned long total = 1UL + (mdev->rs_total >> shift);
 		unsigned long tmp = 1000UL - left * 1000UL/total;
@@ -2306,12 +2326,12 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev)
 	case D_OUTDATED:
 	case D_CONSISTENT:
 	case D_UP_TO_DATE:
+	case D_FAILED:
 		/* disk state is stable as well. */
 		break;
 
 	/* no new io accepted during tansitional states */
 	case D_ATTACHING:
-	case D_FAILED:
 	case D_NEGOTIATING:
 	case D_UNKNOWN:
 	case D_MASK:
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 211fc44f84be..920ede2829d6 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -139,6 +139,8 @@ struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
 struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
 mempool_t *drbd_request_mempool;
 mempool_t *drbd_ee_mempool;
+mempool_t *drbd_md_io_page_pool;
+struct bio_set *drbd_md_io_bio_set;
 
 /* I do not use a standard mempool, because:
    1) I want to hand out the pre-allocated objects first.
@@ -159,7 +161,24 @@ static const struct block_device_operations drbd_ops = {
 	.release = drbd_release,
 };
 
-#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
+static void bio_destructor_drbd(struct bio *bio)
+{
+	bio_free(bio, drbd_md_io_bio_set);
+}
+
+struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+{
+	struct bio *bio;
+
+	if (!drbd_md_io_bio_set)
+		return bio_alloc(gfp_mask, 1);
+
+	bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+	if (!bio)
+		return NULL;
+	bio->bi_destructor = bio_destructor_drbd;
+	return bio;
+}
 
 #ifdef __CHECKER__
 /* When checking with sparse, and this is an inline function, sparse will
@@ -208,6 +227,7 @@ static int tl_init(struct drbd_conf *mdev)
 	mdev->oldest_tle = b;
 	mdev->newest_tle = b;
 	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
+	INIT_LIST_HEAD(&mdev->barrier_acked_requests);
 
 	mdev->tl_hash = NULL;
 	mdev->tl_hash_s = 0;
@@ -246,9 +266,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
 	new->n_writes = 0;
 
 	newest_before = mdev->newest_tle;
-	/* never send a barrier number == 0, because that is special-cased
-	 * when using TCQ for our write ordering code */
-	new->br_number = (newest_before->br_number+1) ?: 1;
+	new->br_number = newest_before->br_number+1;
 	if (mdev->newest_tle != new) {
 		mdev->newest_tle->next = new;
 		mdev->newest_tle = new;
@@ -311,7 +329,7 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
 	   These have been list_move'd to the out_of_sequence_requests list in
 	   _req_mod(, barrier_acked) above.
 	   */
-	list_del_init(&b->requests);
+	list_splice_init(&b->requests, &mdev->barrier_acked_requests);
 
 	nob = b->next;
 	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
@@ -411,6 +429,23 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 		b = tmp;
 		list_splice(&carry_reads, &b->requests);
 	}
+
+	/* Actions operating on the disk state, also want to work on
+	   requests that got barrier acked. */
+	switch (what) {
+	case fail_frozen_disk_io:
+	case restart_frozen_disk_io:
+		list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+			req = list_entry(le, struct drbd_request, tl_requests);
+			_req_mod(req, what);
+		}
+
+	case connection_lost_while_pending:
+	case resend:
+		break;
+	default:
+		dev_err(DEV, "what = %d in _tl_restart()\n", what);
+	}
 }
 
 
@@ -458,6 +493,38 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 }
 
 /**
+ * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
+ * @mdev:	DRBD device.
+ */
+void tl_abort_disk_io(struct drbd_conf *mdev)
+{
+	struct drbd_tl_epoch *b;
+	struct list_head *le, *tle;
+	struct drbd_request *req;
+
+	spin_lock_irq(&mdev->req_lock);
+	b = mdev->oldest_tle;
+	while (b) {
+		list_for_each_safe(le, tle, &b->requests) {
+			req = list_entry(le, struct drbd_request, tl_requests);
+			if (!(req->rq_state & RQ_LOCAL_PENDING))
+				continue;
+			_req_mod(req, abort_disk_io);
+		}
+		b = b->next;
+	}
+
+	list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+		req = list_entry(le, struct drbd_request, tl_requests);
+		if (!(req->rq_state & RQ_LOCAL_PENDING))
+			continue;
+		_req_mod(req, abort_disk_io);
+	}
+
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+/**
  * cl_wide_st_chg() - true if the state change is a cluster wide one
  * @mdev:	DRBD device.
  * @os:		old (current) state.
@@ -470,7 +537,7 @@ static int cl_wide_st_chg(struct drbd_conf *mdev,
 		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
 		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
 		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
-		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
+		  (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
 		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
 		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
 }
@@ -509,8 +576,16 @@ static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
 static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
 						    union drbd_state,
 						    union drbd_state);
+enum sanitize_state_warnings {
+	NO_WARNING,
+	ABORTED_ONLINE_VERIFY,
+	ABORTED_RESYNC,
+	CONNECTION_LOST_NEGOTIATING,
+	IMPLICITLY_UPGRADED_DISK,
+	IMPLICITLY_UPGRADED_PDSK,
+};
 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
-				       union drbd_state ns, const char **warn_sync_abort);
+				       union drbd_state ns, enum sanitize_state_warnings *warn);
 int drbd_send_state_req(struct drbd_conf *,
 			union drbd_state, union drbd_state);
 
@@ -785,6 +860,13 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
 	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
 		rv = SS_IN_TRANSIENT_STATE;
 
+	/* While establishing a connection only allow cstate to change.
+	   Delay/refuse role changes, detach attach etc... */
+	if (test_bit(STATE_SENT, &mdev->flags) &&
+	    !(os.conn == C_WF_REPORT_PARAMS ||
+	      (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
+		rv = SS_IN_TRANSIENT_STATE;
+
 	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
 		rv = SS_NEED_CONNECTION;
 
@@ -803,6 +885,21 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
 	return rv;
 }
 
+static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
+{
+	static const char *msg_table[] = {
+		[NO_WARNING] = "",
+		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
+		[ABORTED_RESYNC] = "Resync aborted.",
+		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
+		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
+		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
+	};
+
+	if (warn != NO_WARNING)
+		dev_warn(DEV, "%s\n", msg_table[warn]);
+}
+
 /**
  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
  * @mdev:	DRBD device.
@@ -814,11 +911,14 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
  * to D_UNKNOWN. This rule and many more along those lines are in this function.
  */
 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
-				       union drbd_state ns, const char **warn_sync_abort)
+				       union drbd_state ns, enum sanitize_state_warnings *warn)
 {
 	enum drbd_fencing_p fp;
 	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
 
+	if (warn)
+		*warn = NO_WARNING;
+
 	fp = FP_DONT_CARE;
 	if (get_ldev(mdev)) {
 		fp = mdev->ldev->dc.fencing;
@@ -833,18 +933,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
 	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
 	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
-	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
+	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
 		ns.conn = os.conn;
 
 	/* we cannot fail (again) if we already detached */
 	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
 		ns.disk = D_DISKLESS;
 
-	/* if we are only D_ATTACHING yet,
-	 * we can (and should) go directly to D_DISKLESS. */
-	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
-		ns.disk = D_DISKLESS;
-
 	/* After C_DISCONNECTING only C_STANDALONE may follow */
 	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
 		ns.conn = os.conn;
@@ -863,10 +958,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 	/* Abort resync if a disk fails/detaches */
 	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
 	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
-		if (warn_sync_abort)
-			*warn_sync_abort =
-				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
-				"Online-verify" : "Resync";
+		if (warn)
+			*warn =	os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
+				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
 		ns.conn = C_CONNECTED;
 	}
 
@@ -877,7 +971,8 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 			ns.disk = mdev->new_state_tmp.disk;
 			ns.pdsk = mdev->new_state_tmp.pdsk;
 		} else {
-			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
+			if (warn)
+				*warn = CONNECTION_LOST_NEGOTIATING;
 			ns.disk = D_DISKLESS;
 			ns.pdsk = D_UNKNOWN;
 		}
@@ -959,16 +1054,16 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 		ns.disk = disk_max;
 
 	if (ns.disk < disk_min) {
-		dev_warn(DEV, "Implicitly set disk from %s to %s\n",
-			 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
+		if (warn)
+			*warn = IMPLICITLY_UPGRADED_DISK;
 		ns.disk = disk_min;
 	}
 	if (ns.pdsk > pdsk_max)
 		ns.pdsk = pdsk_max;
 
 	if (ns.pdsk < pdsk_min) {
-		dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
-			 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
+		if (warn)
+			*warn = IMPLICITLY_UPGRADED_PDSK;
 		ns.pdsk = pdsk_min;
 	}
 
@@ -1045,12 +1140,12 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 {
 	union drbd_state os;
 	enum drbd_state_rv rv = SS_SUCCESS;
-	const char *warn_sync_abort = NULL;
+	enum sanitize_state_warnings ssw;
 	struct after_state_chg_work *ascw;
 
 	os = mdev->state;
 
-	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
+	ns = sanitize_state(mdev, os, ns, &ssw);
 
 	if (ns.i == os.i)
 		return SS_NOTHING_TO_DO;
@@ -1076,8 +1171,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 		return rv;
 	}
 
-	if (warn_sync_abort)
-		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
+	print_sanitize_warnings(mdev, ssw);
 
 	{
 	char *pbp, pb[300];
@@ -1243,7 +1337,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 		drbd_thread_stop_nowait(&mdev->receiver);
 
 	/* Upon network failure, we need to restart the receiver. */
-	if (os.conn > C_TEAR_DOWN &&
+	if (os.conn > C_WF_CONNECTION &&
 	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
 		drbd_thread_restart_nowait(&mdev->receiver);
 
@@ -1251,6 +1345,15 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
 		drbd_resume_al(mdev);
 
+	/* remember last connect and attach times so request_timer_fn() won't
+	 * kill newly established sessions while we are still trying to thaw
+	 * previously frozen IO */
+	if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
+		mdev->last_reconnect_jif = jiffies;
+	if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+	    ns.disk > D_NEGOTIATING)
+		mdev->last_reattach_jif = jiffies;
+
 	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
 	if (ascw) {
 		ascw->os = os;
@@ -1354,12 +1457,16 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* Here we have the actions that are performed after a
 	   state change. This function might sleep */
 
+	if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
+		mod_timer(&mdev->request_timer, jiffies + HZ);
+
 	nsm.i = -1;
 	if (ns.susp_nod) {
 		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
 			what = resend;
 
-		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
+		if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+		    ns.disk > D_NEGOTIATING)
 			what = restart_frozen_disk_io;
 
 		if (what != nothing)
@@ -1408,7 +1515,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* Do not change the order of the if above and the two below... */
 	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
 		drbd_send_uuids(mdev);
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 	}
 	/* No point in queuing send_bitmap if we don't have a connection
 	 * anymore, so check also the _current_ state, not only the new state
@@ -1441,11 +1548,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	}
 
 	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
-		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
+		if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
+		    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
 			drbd_uuid_new_current(mdev);
 			drbd_send_uuids(mdev);
 		}
-
 		/* D_DISKLESS Peer becomes secondary */
 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
 			/* We may still be Primary ourselves.
@@ -1473,14 +1580,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
 		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
 		drbd_send_uuids(mdev);
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 	}
 
 	/* We want to pause/continue resync, tell peer. */
 	if (ns.conn >= C_CONNECTED &&
 	     ((os.aftr_isp != ns.aftr_isp) ||
 	      (os.user_isp != ns.user_isp)))
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 
 	/* In case one of the isp bits got set, suspend other devices. */
 	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
@@ -1490,10 +1597,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* Make sure the peer gets informed about eventual state
 	   changes (ISP bits) while we were in WFReportParams. */
 	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 
 	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 
 	/* We are in the progress to start a full sync... */
 	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
@@ -1513,33 +1620,38 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* first half of local IO error, failure to attach,
 	 * or administrative detach */
 	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
-		enum drbd_io_error_p eh;
-		int was_io_error;
+		enum drbd_io_error_p eh = EP_PASS_ON;
+		int was_io_error = 0;
 		/* corresponding get_ldev was in __drbd_set_state, to serialize
-		 * our cleanup here with the transition to D_DISKLESS,
-		 * so it is safe to dreference ldev here. */
-		eh = mdev->ldev->dc.on_io_error;
-		was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
-
-		/* current state still has to be D_FAILED,
-		 * there is only one way out: to D_DISKLESS,
-		 * and that may only happen after our put_ldev below. */
-		if (mdev->state.disk != D_FAILED)
-			dev_err(DEV,
-				"ASSERT FAILED: disk is %s during detach\n",
-				drbd_disk_str(mdev->state.disk));
-
-		if (drbd_send_state(mdev))
-			dev_warn(DEV, "Notified peer that I am detaching my disk\n");
-		else
-			dev_err(DEV, "Sending state for detaching disk failed\n");
-
-		drbd_rs_cancel_all(mdev);
-
-		/* In case we want to get something to stable storage still,
-		 * this may be the last chance.
-		 * Following put_ldev may transition to D_DISKLESS. */
-		drbd_md_sync(mdev);
+		 * our cleanup here with the transition to D_DISKLESS.
+		 * But is is still not save to dreference ldev here, since
+		 * we might come from an failed Attach before ldev was set. */
+		if (mdev->ldev) {
+			eh = mdev->ldev->dc.on_io_error;
+			was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
+
+			/* Immediately allow completion of all application IO, that waits
+			   for completion from the local disk. */
+			tl_abort_disk_io(mdev);
+
+			/* current state still has to be D_FAILED,
+			 * there is only one way out: to D_DISKLESS,
+			 * and that may only happen after our put_ldev below. */
+			if (mdev->state.disk != D_FAILED)
+				dev_err(DEV,
+					"ASSERT FAILED: disk is %s during detach\n",
+					drbd_disk_str(mdev->state.disk));
+
+			if (ns.conn >= C_CONNECTED)
+				drbd_send_state(mdev, ns);
+
+			drbd_rs_cancel_all(mdev);
+
+			/* In case we want to get something to stable storage still,
+			 * this may be the last chance.
+			 * Following put_ldev may transition to D_DISKLESS. */
+			drbd_md_sync(mdev);
+		}
 		put_ldev(mdev);
 
 		if (was_io_error && eh == EP_CALL_HELPER)
@@ -1561,16 +1673,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
                 mdev->rs_failed = 0;
                 atomic_set(&mdev->rs_pending_cnt, 0);
 
-		if (drbd_send_state(mdev))
-			dev_warn(DEV, "Notified peer that I'm now diskless.\n");
+		if (ns.conn >= C_CONNECTED)
+			drbd_send_state(mdev, ns);
+
 		/* corresponding get_ldev in __drbd_set_state
 		 * this may finally trigger drbd_ldev_destroy. */
 		put_ldev(mdev);
 	}
 
 	/* Notify peer that I had a local IO error, and did not detached.. */
-	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
-		drbd_send_state(mdev);
+	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
+		drbd_send_state(mdev, ns);
 
 	/* Disks got bigger while they were detached */
 	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
@@ -1588,7 +1701,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* sync target done with resync.  Explicitly notify peer, even though
 	 * it should (at least for non-empty resyncs) already know itself. */
 	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
+
+	/* Wake up role changes, that were delayed because of connection establishing */
+	if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
+		clear_bit(STATE_SENT, &mdev->flags);
+		wake_up(&mdev->state_wait);
+	}
 
 	/* This triggers bitmap writeout of potentially still unwritten pages
 	 * if the resync finished cleanly, or aborted because of peer disk
@@ -1598,8 +1717,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	 * No harm done if some bits change during this phase.
 	 */
 	if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
-		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
-			"write from resync_finished", BM_LOCKED_SET_ALLOWED);
+		drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
+			"write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
 		put_ldev(mdev);
 	}
 
@@ -2057,7 +2176,11 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
 
 	D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
 
-	uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
+	uuid = mdev->ldev->md.uuid[UI_BITMAP];
+	if (uuid && uuid != UUID_JUST_CREATED)
+		uuid = uuid + UUID_NEW_BM_OFFSET;
+	else
+		get_random_bytes(&uuid, sizeof(u64));
 	drbd_uuid_set(mdev, UI_BITMAP, uuid);
 	drbd_print_uuids(mdev, "updated sync UUID");
 	drbd_md_sync(mdev);
@@ -2089,6 +2212,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
 		max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
 	}
 
+	/* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
+	if (mdev->agreed_pro_version <= 94)
+		max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+
 	p.d_size = cpu_to_be64(d_size);
 	p.u_size = cpu_to_be64(u_size);
 	p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
@@ -2102,10 +2229,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
 }
 
 /**
- * drbd_send_state() - Sends the drbd state to the peer
+ * drbd_send_current_state() - Sends the drbd state to the peer
  * @mdev:	DRBD device.
  */
-int drbd_send_state(struct drbd_conf *mdev)
+int drbd_send_current_state(struct drbd_conf *mdev)
 {
 	struct socket *sock;
 	struct p_state p;
@@ -2131,6 +2258,37 @@ int drbd_send_state(struct drbd_conf *mdev)
 	return ok;
 }
 
+/**
+ * drbd_send_state() - After a state change, sends the new state to the peer
+ * @mdev:	DRBD device.
+ * @state:	the state to send, not necessarily the current state.
+ *
+ * Each state change queues an "after_state_ch" work, which will eventually
+ * send the resulting new state to the peer. If more state changes happen
+ * between queuing and processing of the after_state_ch work, we still
+ * want to send each intermediary state in the order it occurred.
+ */
+int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
+{
+	struct socket *sock;
+	struct p_state p;
+	int ok = 0;
+
+	mutex_lock(&mdev->data.mutex);
+
+	p.state = cpu_to_be32(state.i);
+	sock = mdev->data.socket;
+
+	if (likely(sock != NULL)) {
+		ok = _drbd_send_cmd(mdev, sock, P_STATE,
+				    (struct p_header80 *)&p, sizeof(p), 0);
+	}
+
+	mutex_unlock(&mdev->data.mutex);
+
+	return ok;
+}
+
 int drbd_send_state_req(struct drbd_conf *mdev,
 	union drbd_state mask, union drbd_state val)
 {
@@ -2615,7 +2773,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
 	struct bio_vec *bvec;
 	int i;
 	/* hint all but last page with MSG_MORE */
-	__bio_for_each_segment(bvec, bio, i, 0) {
+	bio_for_each_segment(bvec, bio, i) {
 		if (!_drbd_no_send_page(mdev, bvec->bv_page,
 				     bvec->bv_offset, bvec->bv_len,
 				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
@@ -2629,7 +2787,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
 	struct bio_vec *bvec;
 	int i;
 	/* hint all but last page with MSG_MORE */
-	__bio_for_each_segment(bvec, bio, i, 0) {
+	bio_for_each_segment(bvec, bio, i) {
 		if (!_drbd_send_page(mdev, bvec->bv_page,
 				     bvec->bv_offset, bvec->bv_len,
 				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
@@ -2695,8 +2853,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
 
 	p.sector   = cpu_to_be64(req->sector);
 	p.block_id = (unsigned long)req;
-	p.seq_num  = cpu_to_be32(req->seq_num =
-				 atomic_add_return(1, &mdev->packet_seq));
+	p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
 
 	dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
 
@@ -2987,8 +3144,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	atomic_set(&mdev->rs_sect_in, 0);
 	atomic_set(&mdev->rs_sect_ev, 0);
 	atomic_set(&mdev->ap_in_flight, 0);
+	atomic_set(&mdev->md_io_in_use, 0);
 
-	mutex_init(&mdev->md_io_mutex);
 	mutex_init(&mdev->data.mutex);
 	mutex_init(&mdev->meta.mutex);
 	sema_init(&mdev->data.work.s, 0);
@@ -3126,6 +3283,10 @@ static void drbd_destroy_mempools(void)
 
 	/* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
 
+	if (drbd_md_io_bio_set)
+		bioset_free(drbd_md_io_bio_set);
+	if (drbd_md_io_page_pool)
+		mempool_destroy(drbd_md_io_page_pool);
 	if (drbd_ee_mempool)
 		mempool_destroy(drbd_ee_mempool);
 	if (drbd_request_mempool)
@@ -3139,6 +3300,8 @@ static void drbd_destroy_mempools(void)
 	if (drbd_al_ext_cache)
 		kmem_cache_destroy(drbd_al_ext_cache);
 
+	drbd_md_io_bio_set   = NULL;
+	drbd_md_io_page_pool = NULL;
 	drbd_ee_mempool      = NULL;
 	drbd_request_mempool = NULL;
 	drbd_ee_cache        = NULL;
@@ -3162,6 +3325,8 @@ static int drbd_create_mempools(void)
 	drbd_bm_ext_cache    = NULL;
 	drbd_al_ext_cache    = NULL;
 	drbd_pp_pool         = NULL;
+	drbd_md_io_page_pool = NULL;
+	drbd_md_io_bio_set   = NULL;
 
 	/* caches */
 	drbd_request_cache = kmem_cache_create(
@@ -3185,6 +3350,16 @@ static int drbd_create_mempools(void)
 		goto Enomem;
 
 	/* mempools */
+#ifdef COMPAT_HAVE_BIOSET_CREATE
+	drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
+	if (drbd_md_io_bio_set == NULL)
+		goto Enomem;
+#endif
+
+	drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
+	if (drbd_md_io_page_pool == NULL)
+		goto Enomem;
+
 	drbd_request_mempool = mempool_create(number,
 		mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
 	if (drbd_request_mempool == NULL)
@@ -3262,6 +3437,8 @@ static void drbd_delete_device(unsigned int minor)
 	if (!mdev)
 		return;
 
+	del_timer_sync(&mdev->request_timer);
+
 	/* paranoia asserts */
 	if (mdev->open_cnt != 0)
 		dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
@@ -3666,8 +3843,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
 	if (!get_ldev_if_state(mdev, D_FAILED))
 		return;
 
-	mutex_lock(&mdev->md_io_mutex);
-	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		goto out;
+
 	memset(buffer, 0, 512);
 
 	buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
@@ -3698,7 +3877,8 @@ void drbd_md_sync(struct drbd_conf *mdev)
 	 * since we updated it on metadata. */
 	mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
 
-	mutex_unlock(&mdev->md_io_mutex);
+	drbd_md_put_buffer(mdev);
+out:
 	put_ldev(mdev);
 }
 
@@ -3718,8 +3898,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 	if (!get_ldev_if_state(mdev, D_ATTACHING))
 		return ERR_IO_MD_DISK;
 
-	mutex_lock(&mdev->md_io_mutex);
-	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		goto out;
 
 	if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
 		/* NOTE: can't do normal error processing here as this is
@@ -3780,7 +3961,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 		mdev->sync_conf.al_extents = 127;
 
  err:
-	mutex_unlock(&mdev->md_io_mutex);
+	drbd_md_put_buffer(mdev);
+ out:
 	put_ldev(mdev);
 
 	return rv;
@@ -4183,12 +4365,11 @@ const char *drbd_buildtag(void)
 	static char buildtag[38] = "\0uilt-in";
 
 	if (buildtag[0] == 0) {
-#ifdef CONFIG_MODULES
-		if (THIS_MODULE != NULL)
-			sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
-		else
+#ifdef MODULE
+		sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
+#else
+		buildtag[0] = 'b';
 #endif
-			buildtag[0] = 'b';
 	}
 
 	return buildtag;
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 946166e13953..6d4de6a72e80 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -289,7 +289,7 @@ static int _try_outdate_peer_async(void *data)
 	*/
 	spin_lock_irq(&mdev->req_lock);
 	ns = mdev->state;
-	if (ns.conn < C_WF_REPORT_PARAMS) {
+	if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) {
 		ns.pdsk = nps;
 		_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
 	}
@@ -432,7 +432,7 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 		/* if this was forced, we should consider sync */
 		if (forced)
 			drbd_send_uuids(mdev);
-		drbd_send_state(mdev);
+		drbd_send_current_state(mdev);
 	}
 
 	drbd_md_sync(mdev);
@@ -845,9 +845,10 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
 	   Because new from 8.3.8 onwards the peer can use multiple
 	   BIOs for a single peer_request */
 	if (mdev->state.conn >= C_CONNECTED) {
-		if (mdev->agreed_pro_version < 94)
-			peer = mdev->peer_max_bio_size;
-		else if (mdev->agreed_pro_version == 94)
+		if (mdev->agreed_pro_version < 94) {
+			peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+			/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
+		} else if (mdev->agreed_pro_version == 94)
 			peer = DRBD_MAX_SIZE_H80_PACKET;
 		else /* drbd 8.3.8 onwards */
 			peer = DRBD_MAX_BIO_SIZE;
@@ -1032,7 +1033,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
 			(unsigned long long) drbd_get_max_capacity(nbc),
 			(unsigned long long) nbc->dc.disk_size);
-		retcode = ERR_DISK_TO_SMALL;
+		retcode = ERR_DISK_TOO_SMALL;
 		goto fail;
 	}
 
@@ -1046,7 +1047,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	}
 
 	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
-		retcode = ERR_MD_DISK_TO_SMALL;
+		retcode = ERR_MD_DISK_TOO_SMALL;
 		dev_warn(DEV, "refusing attach: md-device too small, "
 		     "at least %llu sectors needed for this meta-disk type\n",
 		     (unsigned long long) min_md_device_sectors);
@@ -1057,7 +1058,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	 * (we may currently be R_PRIMARY with no local disk...) */
 	if (drbd_get_max_capacity(nbc) <
 	    drbd_get_capacity(mdev->this_bdev)) {
-		retcode = ERR_DISK_TO_SMALL;
+		retcode = ERR_DISK_TOO_SMALL;
 		goto fail;
 	}
 
@@ -1138,7 +1139,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
 	    drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
 		dev_warn(DEV, "refusing to truncate a consistent device\n");
-		retcode = ERR_DISK_TO_SMALL;
+		retcode = ERR_DISK_TOO_SMALL;
 		goto force_diskless_dec;
 	}
 
@@ -1336,17 +1337,34 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 {
 	enum drbd_ret_code retcode;
 	int ret;
+	struct detach dt = {};
+
+	if (!detach_from_tags(mdev, nlp->tag_list, &dt)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		goto out;
+	}
+
+	if (dt.detach_force) {
+		drbd_force_state(mdev, NS(disk, D_FAILED));
+		reply->ret_code = SS_SUCCESS;
+		goto out;
+	}
+
 	drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
+	drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */
 	retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
+	drbd_md_put_buffer(mdev);
 	/* D_FAILED will transition to DISKLESS. */
 	ret = wait_event_interruptible(mdev->misc_wait,
 			mdev->state.disk != D_FAILED);
 	drbd_resume_io(mdev);
+
 	if ((int)retcode == (int)SS_IS_DISKLESS)
 		retcode = SS_NOTHING_TO_DO;
 	if (ret)
 		retcode = ERR_INTR;
 	reply->ret_code = retcode;
+out:
 	return 0;
 }
 
@@ -1711,7 +1729,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 
 	if (rs.no_resync && mdev->agreed_pro_version < 93) {
 		retcode = ERR_NEED_APV_93;
-		goto fail;
+		goto fail_ldev;
 	}
 
 	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
@@ -1738,6 +1756,10 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
  fail:
 	reply->ret_code = retcode;
 	return 0;
+
+ fail_ldev:
+	put_ldev(mdev);
+	goto fail;
 }
 
 static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
@@ -1941,6 +1963,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
 
 	/* If there is still bitmap IO pending, probably because of a previous
 	 * resync just being finished, wait for it before requesting a new resync. */
+	drbd_suspend_io(mdev);
 	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
 
 	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
@@ -1959,6 +1982,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
 
 		retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
 	}
+	drbd_resume_io(mdev);
 
 	reply->ret_code = retcode;
 	return 0;
@@ -1980,6 +2004,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
 
 	/* If there is still bitmap IO pending, probably because of a previous
 	 * resync just being finished, wait for it before requesting a new resync. */
+	drbd_suspend_io(mdev);
 	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
 
 	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
@@ -1998,6 +2023,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
 		} else
 			retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
 	}
+	drbd_resume_io(mdev);
 
 	reply->ret_code = retcode;
 	return 0;
@@ -2170,11 +2196,13 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 
 	/* If there is still bitmap IO pending, e.g. previous resync or verify
 	 * just being finished, wait for it before requesting a new resync. */
+	drbd_suspend_io(mdev);
 	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
 
 	/* w_make_ov_request expects position to be aligned */
 	mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
 	reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
+	drbd_resume_io(mdev);
 	return 0;
 }
 
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 2959cdfb77f5..869bada2ed06 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -52,7 +52,7 @@ void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
 	if (unlikely(v >= 1000000)) {
 		/* cool: > GiByte/s */
 		seq_printf(seq, "%ld,", v / 1000000);
-		v /= 1000000;
+		v %= 1000000;
 		seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
 	} else if (likely(v >= 1000))
 		seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 436f519bed1c..ea4836e0ae98 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -466,6 +466,7 @@ static int drbd_accept(struct drbd_conf *mdev, const char **what,
 		goto out;
 	}
 	(*newsock)->ops  = sock->ops;
+	__module_get((*newsock)->ops->owner);
 
 out:
 	return err;
@@ -750,6 +751,7 @@ static int drbd_connect(struct drbd_conf *mdev)
 {
 	struct socket *s, *sock, *msock;
 	int try, h, ok;
+	enum drbd_state_rv rv;
 
 	D_ASSERT(!mdev->data.socket);
 
@@ -888,25 +890,32 @@ retry:
 		}
 	}
 
-	if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
-		return 0;
-
 	sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
 	sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
 
 	atomic_set(&mdev->packet_seq, 0);
 	mdev->peer_seq = 0;
 
-	drbd_thread_start(&mdev->asender);
-
 	if (drbd_send_protocol(mdev) == -1)
 		return -1;
+	set_bit(STATE_SENT, &mdev->flags);
 	drbd_send_sync_param(mdev, &mdev->sync_conf);
 	drbd_send_sizes(mdev, 0, 0);
 	drbd_send_uuids(mdev);
-	drbd_send_state(mdev);
+	drbd_send_current_state(mdev);
 	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
 	clear_bit(RESIZE_PENDING, &mdev->flags);
+
+	spin_lock_irq(&mdev->req_lock);
+	rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL);
+	if (mdev->state.conn != C_WF_REPORT_PARAMS)
+		clear_bit(STATE_SENT, &mdev->flags);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (rv < SS_SUCCESS)
+		return 0;
+
+	drbd_thread_start(&mdev->asender);
 	mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
 
 	return 1;
@@ -957,7 +966,7 @@ static void drbd_flush(struct drbd_conf *mdev)
 		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
 					NULL);
 		if (rv) {
-			dev_err(DEV, "local disk flush failed with status %d\n", rv);
+			dev_info(DEV, "local disk flush failed with status %d\n", rv);
 			/* would rather check on EOPNOTSUPP, but that is not reliable.
 			 * don't try again for ANY return value != 0
 			 * if (rv == -EOPNOTSUPP) */
@@ -1001,13 +1010,14 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 
 		if (epoch_size != 0 &&
 		    atomic_read(&epoch->active) == 0 &&
-		    test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
+		    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
 			if (!(ev & EV_CLEANUP)) {
 				spin_unlock(&mdev->epoch_lock);
 				drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
 				spin_lock(&mdev->epoch_lock);
 			}
-			dec_unacked(mdev);
+			if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
+				dec_unacked(mdev);
 
 			if (mdev->current_epoch != epoch) {
 				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
@@ -1096,7 +1106,11 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
 	/* In most cases, we will only need one bio.  But in case the lower
 	 * level restrictions happen to be different at this offset on this
 	 * side than those of the sending peer, we may need to submit the
-	 * request in more than one bio. */
+	 * request in more than one bio.
+	 *
+	 * Plain bio_alloc is good enough here, this is no DRBD internally
+	 * generated bio, but a bio allocated on behalf of the peer.
+	 */
 next_bio:
 	bio = bio_alloc(GFP_NOIO, nr_pages);
 	if (!bio) {
@@ -1583,6 +1597,24 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u
 	return ok;
 }
 
+static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e)
+{
+
+	struct drbd_epoch_entry *rs_e;
+	bool rv = 0;
+
+	spin_lock_irq(&mdev->req_lock);
+	list_for_each_entry(rs_e, &mdev->sync_ee, w.list) {
+		if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) {
+			rv = 1;
+			break;
+		}
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	return rv;
+}
+
 /* Called from receive_Data.
  * Synchronize packets on sock with packets on msock.
  *
@@ -1826,6 +1858,9 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	list_add(&e->w.list, &mdev->active_ee);
 	spin_unlock_irq(&mdev->req_lock);
 
+	if (mdev->state.conn == C_SYNC_TARGET)
+		wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e));
+
 	switch (mdev->net_conf->wire_protocol) {
 	case DRBD_PROT_C:
 		inc_unacked(mdev);
@@ -2420,7 +2455,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
 			mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
 			mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
 
-			dev_info(DEV, "Did not got last syncUUID packet, corrected:\n");
+			dev_info(DEV, "Lost last syncUUID packet, corrected:\n");
 			drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
 
 			return -1;
@@ -2806,10 +2841,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
 
 	if (apv >= 88) {
 		if (apv == 88) {
-			if (data_size > SHARED_SECRET_MAX) {
-				dev_err(DEV, "verify-alg too long, "
-				    "peer wants %u, accepting only %u byte\n",
-						data_size, SHARED_SECRET_MAX);
+			if (data_size > SHARED_SECRET_MAX || data_size == 0) {
+				dev_err(DEV, "verify-alg of wrong size, "
+					"peer wants %u, accepting only up to %u byte\n",
+					data_size, SHARED_SECRET_MAX);
 				return false;
 			}
 
@@ -3168,9 +3203,20 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	os = ns = mdev->state;
 	spin_unlock_irq(&mdev->req_lock);
 
-	/* peer says his disk is uptodate, while we think it is inconsistent,
-	 * and this happens while we think we have a sync going on. */
-	if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
+	/* If some other part of the code (asender thread, timeout)
+	 * already decided to close the connection again,
+	 * we must not "re-establish" it here. */
+	if (os.conn <= C_TEAR_DOWN)
+		return false;
+
+	/* If this is the "end of sync" confirmation, usually the peer disk
+	 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
+	 * set) resync started in PausedSyncT, or if the timing of pause-/
+	 * unpause-sync events has been "just right", the peer disk may
+	 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
+	 */
+	if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
+	    real_peer_disk == D_UP_TO_DATE &&
 	    os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
 		/* If we are (becoming) SyncSource, but peer is still in sync
 		 * preparation, ignore its uptodate-ness to avoid flapping, it
@@ -3288,7 +3334,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 			/* Nowadays only used when forcing a node into primary role and
 			   setting its disk to UpToDate with that */
 			drbd_send_uuids(mdev);
-			drbd_send_state(mdev);
+			drbd_send_current_state(mdev);
 		}
 	}
 
@@ -3776,6 +3822,13 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 	if (mdev->state.conn == C_STANDALONE)
 		return;
 
+	/* We are about to start the cleanup after connection loss.
+	 * Make sure drbd_make_request knows about that.
+	 * Usually we should be in some network failure state already,
+	 * but just in case we are not, we fix it up here.
+	 */
+	drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+
 	/* asender does not clean up anything. it must not interfere, either */
 	drbd_thread_stop(&mdev->asender);
 	drbd_free_sock(mdev);
@@ -3803,8 +3856,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 	atomic_set(&mdev->rs_pending_cnt, 0);
 	wake_up(&mdev->misc_wait);
 
-	del_timer(&mdev->request_timer);
-
 	/* make sure syncer is stopped and w_resume_next_sg queued */
 	del_timer_sync(&mdev->resync_timer);
 	resync_timer_fn((unsigned long)mdev);
@@ -4433,7 +4484,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
 
 	if (mdev->state.conn == C_AHEAD &&
 	    atomic_read(&mdev->ap_in_flight) == 0 &&
-	    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) {
+	    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) {
 		mdev->start_resync_timer.expires = jiffies + HZ;
 		add_timer(&mdev->start_resync_timer);
 	}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 4a0f314086e5..9c5c84946b05 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -37,6 +37,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req
 	const int rw = bio_data_dir(bio);
 	int cpu;
 	cpu = part_stat_lock();
+	part_round_stats(cpu, &mdev->vdisk->part0);
 	part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
 	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
 	part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -214,8 +215,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 {
 	const unsigned long s = req->rq_state;
 	struct drbd_conf *mdev = req->mdev;
-	/* only WRITES may end up here without a master bio (on barrier ack) */
-	int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
+	int rw = req->rq_state & RQ_WRITE ? WRITE : READ;
 
 	/* we must not complete the master bio, while it is
 	 *	still being processed by _drbd_send_zc_bio (drbd_send_dblock)
@@ -230,7 +230,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 		return;
 	if (s & RQ_NET_PENDING)
 		return;
-	if (s & RQ_LOCAL_PENDING)
+	if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED))
 		return;
 
 	if (req->master_bio) {
@@ -277,6 +277,9 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 		req->master_bio = NULL;
 	}
 
+	if (s & RQ_LOCAL_PENDING)
+		return;
+
 	if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
 		/* this is disconnected (local only) operation,
 		 * or protocol C P_WRITE_ACK,
@@ -429,7 +432,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case completed_ok:
-		if (bio_data_dir(req->master_bio) == WRITE)
+		if (req->rq_state & RQ_WRITE)
 			mdev->writ_cnt += req->size>>9;
 		else
 			mdev->read_cnt += req->size>>9;
@@ -438,7 +441,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 
 		_req_may_be_done_not_susp(req, m);
-		put_ldev(mdev);
+		break;
+
+	case abort_disk_io:
+		req->rq_state |= RQ_LOCAL_ABORTED;
+		if (req->rq_state & RQ_WRITE)
+			_req_may_be_done_not_susp(req, m);
+		else
+			goto goto_queue_for_net_read;
 		break;
 
 	case write_completed_with_error:
@@ -447,7 +457,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
 		__drbd_chk_io_error(mdev, false);
 		_req_may_be_done_not_susp(req, m);
-		put_ldev(mdev);
 		break;
 
 	case read_ahead_completed_with_error:
@@ -455,7 +464,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state |= RQ_LOCAL_COMPLETED;
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 		_req_may_be_done_not_susp(req, m);
-		put_ldev(mdev);
 		break;
 
 	case read_completed_with_error:
@@ -467,7 +475,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
 
 		__drbd_chk_io_error(mdev, false);
-		put_ldev(mdev);
+
+	goto_queue_for_net_read:
 
 		/* no point in retrying if there is no good remote data,
 		 * or we have no connection. */
@@ -556,10 +565,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		drbd_queue_work(&mdev->data.work, &req->w);
 		break;
 
-	case oos_handed_to_network:
-		/* actually the same */
+	case read_retry_remote_canceled:
 	case send_canceled:
-		/* treat it the same */
 	case send_failed:
 		/* real cleanup will be done from tl_clear.  just update flags
 		 * so it is no longer marked as on the worker queue */
@@ -589,17 +596,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		}
 		req->rq_state &= ~RQ_NET_QUEUED;
 		req->rq_state |= RQ_NET_SENT;
-		/* because _drbd_send_zc_bio could sleep, and may want to
-		 * dereference the bio even after the "write_acked_by_peer" and
-		 * "completed_ok" events came in, once we return from
-		 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
-		 * whether it is done already, and end it.  */
 		_req_may_be_done_not_susp(req, m);
 		break;
 
-	case read_retry_remote_canceled:
+	case oos_handed_to_network:
+		/* Was not set PENDING, no longer QUEUED, so is now DONE
+		 * as far as this connection is concerned. */
 		req->rq_state &= ~RQ_NET_QUEUED;
-		/* fall through, in case we raced with drbd_disconnect */
+		req->rq_state |= RQ_NET_DONE;
+		_req_may_be_done_not_susp(req, m);
+		break;
+
 	case connection_lost_while_pending:
 		/* transfer log cleanup after connection loss */
 		/* assert something? */
@@ -616,8 +623,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 			_req_may_be_done(req, m); /* Allowed while state.susp */
 		break;
 
-	case write_acked_by_peer_and_sis:
-		req->rq_state |= RQ_NET_SIS;
 	case conflict_discarded_by_peer:
 		/* for discarded conflicting writes of multiple primaries,
 		 * there is no need to keep anything in the tl, potential
@@ -628,18 +633,15 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 			      (unsigned long long)req->sector, req->size);
 		req->rq_state |= RQ_NET_DONE;
 		/* fall through */
+	case write_acked_by_peer_and_sis:
 	case write_acked_by_peer:
+		if (what == write_acked_by_peer_and_sis)
+			req->rq_state |= RQ_NET_SIS;
 		/* protocol C; successfully written on peer.
-		 * Nothing to do here.
+		 * Nothing more to do here.
 		 * We want to keep the tl in place for all protocols, to cater
-		 * for volatile write-back caches on lower level devices.
-		 *
-		 * A barrier request is expected to have forced all prior
-		 * requests onto stable storage, so completion of a barrier
-		 * request could set NET_DONE right here, and not wait for the
-		 * P_BARRIER_ACK, but that is an unnecessary optimization. */
+		 * for volatile write-back caches on lower level devices. */
 
-		/* this makes it effectively the same as for: */
 	case recv_acked_by_peer:
 		/* protocol B; pretends to be successfully written on peer.
 		 * see also notes above in handed_over_to_network about
@@ -773,6 +775,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 	int local, remote, send_oos = 0;
 	int err = -EIO;
 	int ret = 0;
+	union drbd_state s;
 
 	/* allocate outside of all locks; */
 	req = drbd_req_new(mdev, bio);
@@ -834,8 +837,9 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 		drbd_al_begin_io(mdev, sector);
 	}
 
-	remote = remote && drbd_should_do_remote(mdev->state);
-	send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
+	s = mdev->state;
+	remote = remote && drbd_should_do_remote(s);
+	send_oos = rw == WRITE && drbd_should_send_oos(s);
 	D_ASSERT(!(remote && send_oos));
 
 	if (!(local || remote) && !is_susp(mdev->state)) {
@@ -867,7 +871,7 @@ allocate_barrier:
 
 	if (is_susp(mdev->state)) {
 		/* If we got suspended, use the retry mechanism of
-		   generic_make_request() to restart processing of this
+		   drbd_make_request() to restart processing of this
 		   bio. In the next call to drbd_make_request
 		   we sleep in inc_ap_bio() */
 		ret = 1;
@@ -1091,7 +1095,6 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
 	 */
 	D_ASSERT(bio->bi_size > 0);
 	D_ASSERT((bio->bi_size & 0x1ff) == 0);
-	D_ASSERT(bio->bi_idx == 0);
 
 	/* to make some things easier, force alignment of requests within the
 	 * granularity of our hash tables */
@@ -1099,8 +1102,9 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
 	e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
 
 	if (likely(s_enr == e_enr)) {
-		inc_ap_bio(mdev, 1);
-		drbd_make_request_common(mdev, bio, start_time);
+		do {
+			inc_ap_bio(mdev, 1);
+		} while (drbd_make_request_common(mdev, bio, start_time));
 		return;
 	}
 
@@ -1196,36 +1200,66 @@ void request_timer_fn(unsigned long data)
 	struct drbd_conf *mdev = (struct drbd_conf *) data;
 	struct drbd_request *req; /* oldest request */
 	struct list_head *le;
-	unsigned long et = 0; /* effective timeout = ko_count * timeout */
+	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
+	unsigned long now;
 
 	if (get_net_conf(mdev)) {
-		et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count;
+		if (mdev->state.conn >= C_WF_REPORT_PARAMS)
+			ent = mdev->net_conf->timeout*HZ/10
+				* mdev->net_conf->ko_count;
 		put_net_conf(mdev);
 	}
-	if (!et || mdev->state.conn < C_WF_REPORT_PARAMS)
+	if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */
+		dt = mdev->ldev->dc.disk_timeout * HZ / 10;
+		put_ldev(mdev);
+	}
+	et = min_not_zero(dt, ent);
+
+	if (!et)
 		return; /* Recurring timer stopped */
 
+	now = jiffies;
+
 	spin_lock_irq(&mdev->req_lock);
 	le = &mdev->oldest_tle->requests;
 	if (list_empty(le)) {
 		spin_unlock_irq(&mdev->req_lock);
-		mod_timer(&mdev->request_timer, jiffies + et);
+		mod_timer(&mdev->request_timer, now + et);
 		return;
 	}
 
 	le = le->prev;
 	req = list_entry(le, struct drbd_request, tl_requests);
-	if (time_is_before_eq_jiffies(req->start_time + et)) {
-		if (req->rq_state & RQ_NET_PENDING) {
-			dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
-			_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL);
-		} else {
-			dev_warn(DEV, "Local backing block device frozen?\n");
-			mod_timer(&mdev->request_timer, jiffies + et);
-		}
-	} else {
-		mod_timer(&mdev->request_timer, req->start_time + et);
-	}
 
+	/* The request is considered timed out, if
+	 * - we have some effective timeout from the configuration,
+	 *   with above state restrictions applied,
+	 * - the oldest request is waiting for a response from the network
+	 *   resp. the local disk,
+	 * - the oldest request is in fact older than the effective timeout,
+	 * - the connection was established (resp. disk was attached)
+	 *   for longer than the timeout already.
+	 * Note that for 32bit jiffies and very stable connections/disks,
+	 * we may have a wrap around, which is catched by
+	 *   !time_in_range(now, last_..._jif, last_..._jif + timeout).
+	 *
+	 * Side effect: once per 32bit wrap-around interval, which means every
+	 * ~198 days with 250 HZ, we have a window where the timeout would need
+	 * to expire twice (worst case) to become effective. Good enough.
+	 */
+	if (ent && req->rq_state & RQ_NET_PENDING &&
+		 time_after(now, req->start_time + ent) &&
+		!time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) {
+		dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
+		_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
+	}
+	if (dt && req->rq_state & RQ_LOCAL_PENDING &&
+		 time_after(now, req->start_time + dt) &&
+		!time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) {
+		dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n");
+		__drbd_chk_io_error(mdev, 1);
+	}
+	nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et;
 	spin_unlock_irq(&mdev->req_lock);
+	mod_timer(&mdev->request_timer, nt);
 }
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 68a234a5fdc5..3d2111919486 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -105,6 +105,7 @@ enum drbd_req_event {
 	read_completed_with_error,
 	read_ahead_completed_with_error,
 	write_completed_with_error,
+	abort_disk_io,
 	completed_ok,
 	resend,
 	fail_frozen_disk_io,
@@ -118,18 +119,21 @@ enum drbd_req_event {
  * same time, so we should hold the request lock anyways.
  */
 enum drbd_req_state_bits {
-	/* 210
-	 * 000: no local possible
-	 * 001: to be submitted
+	/* 3210
+	 * 0000: no local possible
+	 * 0001: to be submitted
 	 *    UNUSED, we could map: 011: submitted, completion still pending
-	 * 110: completed ok
-	 * 010: completed with error
+	 * 0110: completed ok
+	 * 0010: completed with error
+	 * 1001: Aborted (before completion)
+	 * 1x10: Aborted and completed -> free
 	 */
 	__RQ_LOCAL_PENDING,
 	__RQ_LOCAL_COMPLETED,
 	__RQ_LOCAL_OK,
+	__RQ_LOCAL_ABORTED,
 
-	/* 76543
+	/* 87654
 	 * 00000: no network possible
 	 * 00001: to be send
 	 * 00011: to be send, on worker queue
@@ -199,8 +203,9 @@ enum drbd_req_state_bits {
 #define RQ_LOCAL_PENDING   (1UL << __RQ_LOCAL_PENDING)
 #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
 #define RQ_LOCAL_OK        (1UL << __RQ_LOCAL_OK)
+#define RQ_LOCAL_ABORTED   (1UL << __RQ_LOCAL_ABORTED)
 
-#define RQ_LOCAL_MASK      ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
+#define RQ_LOCAL_MASK      ((RQ_LOCAL_ABORTED << 1)-1)
 
 #define RQ_NET_PENDING     (1UL << __RQ_NET_PENDING)
 #define RQ_NET_QUEUED      (1UL << __RQ_NET_QUEUED)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 4d3e6f6213ba..620c70ff2231 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -70,11 +70,29 @@ rwlock_t global_state_lock;
 void drbd_md_io_complete(struct bio *bio, int error)
 {
 	struct drbd_md_io *md_io;
+	struct drbd_conf *mdev;
 
 	md_io = (struct drbd_md_io *)bio->bi_private;
+	mdev = container_of(md_io, struct drbd_conf, md_io);
+
 	md_io->error = error;
 
-	complete(&md_io->event);
+	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
+	 * to timeout on the lower level device, and eventually detach from it.
+	 * If this io completion runs after that timeout expired, this
+	 * drbd_md_put_buffer() may allow us to finally try and re-attach.
+	 * During normal operation, this only puts that extra reference
+	 * down to 1 again.
+	 * Make sure we first drop the reference, and only then signal
+	 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
+	 * next drbd_md_sync_page_io(), that we trigger the
+	 * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there.
+	 */
+	drbd_md_put_buffer(mdev);
+	md_io->done = 1;
+	wake_up(&mdev->misc_wait);
+	bio_put(bio);
+	put_ldev(mdev);
 }
 
 /* reads on behalf of the partner,
@@ -226,6 +244,7 @@ void drbd_endio_pri(struct bio *bio, int error)
 	spin_lock_irqsave(&mdev->req_lock, flags);
 	__req_mod(req, what, &m);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
+	put_ldev(mdev);
 
 	if (m.bio)
 		complete_master_bio(mdev, &m);
@@ -290,7 +309,7 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *
 	sg_init_table(&sg, 1);
 	crypto_hash_init(&desc);
 
-	__bio_for_each_segment(bvec, bio, i, 0) {
+	bio_for_each_segment(bvec, bio, i) {
 		sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
 		crypto_hash_update(&desc, &sg, sg.length);
 	}
@@ -728,7 +747,7 @@ int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	}
 
 	drbd_start_resync(mdev, C_SYNC_SOURCE);
-	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags);
+	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags);
 	return 1;
 }
 
@@ -1519,14 +1538,14 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 	}
 
 	drbd_state_lock(mdev);
-
+	write_lock_irq(&global_state_lock);
 	if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		write_unlock_irq(&global_state_lock);
 		drbd_state_unlock(mdev);
 		return;
 	}
 
-	write_lock_irq(&global_state_lock);
-	ns = mdev->state;
+	ns.i = mdev->state.i;
 
 	ns.aftr_isp = !_drbd_may_sync_now(mdev);
 
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index b0b00d70c166..cce7df367b79 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -551,7 +551,7 @@ static void floppy_ready(void);
 static void floppy_start(void);
 static void process_fd_request(void);
 static void recalibrate_floppy(void);
-static void floppy_shutdown(unsigned long);
+static void floppy_shutdown(struct work_struct *);
 
 static int floppy_request_regions(int);
 static void floppy_release_regions(int);
@@ -588,6 +588,8 @@ static int buffer_max = -1;
 static struct floppy_fdc_state fdc_state[N_FDC];
 static int fdc;			/* current fdc */
 
+static struct workqueue_struct *floppy_wq;
+
 static struct floppy_struct *_floppy = floppy_type;
 static unsigned char current_drive;
 static long current_count_sectors;
@@ -629,16 +631,15 @@ static inline void set_debugt(void) { }
 static inline void debugt(const char *func, const char *msg) { }
 #endif /* DEBUGT */
 
-typedef void (*timeout_fn)(unsigned long);
-static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0);
 
+static DECLARE_DELAYED_WORK(fd_timeout, floppy_shutdown);
 static const char *timeout_message;
 
 static void is_alive(const char *func, const char *message)
 {
 	/* this routine checks whether the floppy driver is "alive" */
 	if (test_bit(0, &fdc_busy) && command_status < 2 &&
-	    !timer_pending(&fd_timeout)) {
+	    !delayed_work_pending(&fd_timeout)) {
 		DPRINT("%s: timeout handler died.  %s\n", func, message);
 	}
 }
@@ -666,15 +667,18 @@ static int output_log_pos;
 
 static void __reschedule_timeout(int drive, const char *message)
 {
+	unsigned long delay;
+
 	if (drive == current_reqD)
 		drive = current_drive;
-	del_timer(&fd_timeout);
+
 	if (drive < 0 || drive >= N_DRIVE) {
-		fd_timeout.expires = jiffies + 20UL * HZ;
+		delay = 20UL * HZ;
 		drive = 0;
 	} else
-		fd_timeout.expires = jiffies + UDP->timeout;
-	add_timer(&fd_timeout);
+		delay = UDP->timeout;
+
+	queue_delayed_work(floppy_wq, &fd_timeout, delay);
 	if (UDP->flags & FD_DEBUG)
 		DPRINT("reschedule timeout %s\n", message);
 	timeout_message = message;
@@ -872,7 +876,7 @@ static int lock_fdc(int drive, bool interruptible)
 
 	command_status = FD_COMMAND_NONE;
 
-	__reschedule_timeout(drive, "lock fdc");
+	reschedule_timeout(drive, "lock fdc");
 	set_fdc(drive);
 	return 0;
 }
@@ -880,23 +884,15 @@ static int lock_fdc(int drive, bool interruptible)
 /* unlocks the driver */
 static void unlock_fdc(void)
 {
-	unsigned long flags;
-
-	raw_cmd = NULL;
 	if (!test_bit(0, &fdc_busy))
 		DPRINT("FDC access conflict!\n");
 
-	if (do_floppy)
-		DPRINT("device interrupt still active at FDC release: %pf!\n",
-		       do_floppy);
+	raw_cmd = NULL;
 	command_status = FD_COMMAND_NONE;
-	spin_lock_irqsave(&floppy_lock, flags);
-	del_timer(&fd_timeout);
+	__cancel_delayed_work(&fd_timeout);
+	do_floppy = NULL;
 	cont = NULL;
 	clear_bit(0, &fdc_busy);
-	if (current_req || set_next_request())
-		do_fd_request(current_req->q);
-	spin_unlock_irqrestore(&floppy_lock, flags);
 	wake_up(&fdc_wait);
 }
 
@@ -968,26 +964,24 @@ static DECLARE_WORK(floppy_work, NULL);
 
 static void schedule_bh(void (*handler)(void))
 {
+	WARN_ON(work_pending(&floppy_work));
+
 	PREPARE_WORK(&floppy_work, (work_func_t)handler);
-	schedule_work(&floppy_work);
+	queue_work(floppy_wq, &floppy_work);
 }
 
-static DEFINE_TIMER(fd_timer, NULL, 0, 0);
+static DECLARE_DELAYED_WORK(fd_timer, NULL);
 
 static void cancel_activity(void)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&floppy_lock, flags);
 	do_floppy = NULL;
-	PREPARE_WORK(&floppy_work, (work_func_t)empty);
-	del_timer(&fd_timer);
-	spin_unlock_irqrestore(&floppy_lock, flags);
+	cancel_delayed_work_sync(&fd_timer);
+	cancel_work_sync(&floppy_work);
 }
 
 /* this function makes sure that the disk stays in the drive during the
  * transfer */
-static void fd_watchdog(void)
+static void fd_watchdog(struct work_struct *arg)
 {
 	debug_dcl(DP->flags, "calling disk change from watchdog\n");
 
@@ -997,21 +991,20 @@ static void fd_watchdog(void)
 		cont->done(0);
 		reset_fdc();
 	} else {
-		del_timer(&fd_timer);
-		fd_timer.function = (timeout_fn)fd_watchdog;
-		fd_timer.expires = jiffies + HZ / 10;
-		add_timer(&fd_timer);
+		cancel_delayed_work(&fd_timer);
+		PREPARE_DELAYED_WORK(&fd_timer, fd_watchdog);
+		queue_delayed_work(floppy_wq, &fd_timer, HZ / 10);
 	}
 }
 
 static void main_command_interrupt(void)
 {
-	del_timer(&fd_timer);
+	cancel_delayed_work(&fd_timer);
 	cont->interrupt();
 }
 
 /* waits for a delay (spinup or select) to pass */
-static int fd_wait_for_completion(unsigned long delay, timeout_fn function)
+static int fd_wait_for_completion(unsigned long expires, work_func_t function)
 {
 	if (FDCS->reset) {
 		reset_fdc();	/* do the reset during sleep to win time
@@ -1020,11 +1013,10 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function)
 		return 1;
 	}
 
-	if (time_before(jiffies, delay)) {
-		del_timer(&fd_timer);
-		fd_timer.function = function;
-		fd_timer.expires = delay;
-		add_timer(&fd_timer);
+	if (time_before(jiffies, expires)) {
+		cancel_delayed_work(&fd_timer);
+		PREPARE_DELAYED_WORK(&fd_timer, function);
+		queue_delayed_work(floppy_wq, &fd_timer, expires - jiffies);
 		return 1;
 	}
 	return 0;
@@ -1342,7 +1334,7 @@ static int fdc_dtr(void)
 	 */
 	FDCS->dtr = raw_cmd->rate & 3;
 	return fd_wait_for_completion(jiffies + 2UL * HZ / 100,
-				      (timeout_fn)floppy_ready);
+				      (work_func_t)floppy_ready);
 }				/* fdc_dtr */
 
 static void tell_sector(void)
@@ -1447,7 +1439,7 @@ static void setup_rw_floppy(void)
 	int flags;
 	int dflags;
 	unsigned long ready_date;
-	timeout_fn function;
+	work_func_t function;
 
 	flags = raw_cmd->flags;
 	if (flags & (FD_RAW_READ | FD_RAW_WRITE))
@@ -1461,9 +1453,9 @@ static void setup_rw_floppy(void)
 		 */
 		if (time_after(ready_date, jiffies + DP->select_delay)) {
 			ready_date -= DP->select_delay;
-			function = (timeout_fn)floppy_start;
+			function = (work_func_t)floppy_start;
 		} else
-			function = (timeout_fn)setup_rw_floppy;
+			function = (work_func_t)setup_rw_floppy;
 
 		/* wait until the floppy is spinning fast enough */
 		if (fd_wait_for_completion(ready_date, function))
@@ -1493,7 +1485,7 @@ static void setup_rw_floppy(void)
 		inr = result();
 		cont->interrupt();
 	} else if (flags & FD_RAW_NEED_DISK)
-		fd_watchdog();
+		fd_watchdog(NULL);
 }
 
 static int blind_seek;
@@ -1802,20 +1794,22 @@ static void show_floppy(void)
 		pr_info("do_floppy=%pf\n", do_floppy);
 	if (work_pending(&floppy_work))
 		pr_info("floppy_work.func=%pf\n", floppy_work.func);
-	if (timer_pending(&fd_timer))
-		pr_info("fd_timer.function=%pf\n", fd_timer.function);
-	if (timer_pending(&fd_timeout)) {
-		pr_info("timer_function=%pf\n", fd_timeout.function);
-		pr_info("expires=%lu\n", fd_timeout.expires - jiffies);
-		pr_info("now=%lu\n", jiffies);
-	}
+	if (delayed_work_pending(&fd_timer))
+		pr_info("delayed work.function=%p expires=%ld\n",
+		       fd_timer.work.func,
+		       fd_timer.timer.expires - jiffies);
+	if (delayed_work_pending(&fd_timeout))
+		pr_info("timer_function=%p expires=%ld\n",
+		       fd_timeout.work.func,
+		       fd_timeout.timer.expires - jiffies);
+
 	pr_info("cont=%p\n", cont);
 	pr_info("current_req=%p\n", current_req);
 	pr_info("command_status=%d\n", command_status);
 	pr_info("\n");
 }
 
-static void floppy_shutdown(unsigned long data)
+static void floppy_shutdown(struct work_struct *arg)
 {
 	unsigned long flags;
 
@@ -1868,7 +1862,7 @@ static int start_motor(void (*function)(void))
 
 	/* wait_for_completion also schedules reset if needed. */
 	return fd_wait_for_completion(DRS->select_date + DP->select_delay,
-				      (timeout_fn)function);
+				      (work_func_t)function);
 }
 
 static void floppy_ready(void)
@@ -2821,7 +2815,6 @@ do_request:
 		spin_lock_irq(&floppy_lock);
 		pending = set_next_request();
 		spin_unlock_irq(&floppy_lock);
-
 		if (!pending) {
 			do_floppy = NULL;
 			unlock_fdc();
@@ -2898,13 +2891,15 @@ static void do_fd_request(struct request_queue *q)
 		 current_req->cmd_flags))
 		return;
 
-	if (test_bit(0, &fdc_busy)) {
+	if (test_and_set_bit(0, &fdc_busy)) {
 		/* fdc busy, this new request will be treated when the
 		   current one is done */
 		is_alive(__func__, "old request running");
 		return;
 	}
-	lock_fdc(MAXTIMEOUT, false);
+	command_status = FD_COMMAND_NONE;
+	__reschedule_timeout(MAXTIMEOUT, "fd_request");
+	set_fdc(0);
 	process_fd_request();
 	is_alive(__func__, "");
 }
@@ -3612,9 +3607,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
 
 	mutex_lock(&floppy_mutex);
 	mutex_lock(&open_lock);
-	if (UDRS->fd_ref < 0)
-		UDRS->fd_ref = 0;
-	else if (!UDRS->fd_ref--) {
+	if (!UDRS->fd_ref--) {
 		DPRINT("floppy_release with fd_ref == 0");
 		UDRS->fd_ref = 0;
 	}
@@ -3650,13 +3643,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 		set_bit(FD_VERIFY_BIT, &UDRS->flags);
 	}
 
-	if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL)))
-		goto out2;
-
-	if (mode & FMODE_EXCL)
-		UDRS->fd_ref = -1;
-	else
-		UDRS->fd_ref++;
+	UDRS->fd_ref++;
 
 	opened_bdev[drive] = bdev;
 
@@ -3719,10 +3706,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	mutex_unlock(&floppy_mutex);
 	return 0;
 out:
-	if (UDRS->fd_ref < 0)
-		UDRS->fd_ref = 0;
-	else
-		UDRS->fd_ref--;
+	UDRS->fd_ref--;
+
 	if (!UDRS->fd_ref)
 		opened_bdev[drive] = NULL;
 out2:
@@ -4159,10 +4144,16 @@ static int __init floppy_init(void)
 			goto out_put_disk;
 		}
 
+		floppy_wq = alloc_ordered_workqueue("floppy", 0);
+		if (!floppy_wq) {
+			err = -ENOMEM;
+			goto out_put_disk;
+		}
+
 		disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock);
 		if (!disks[dr]->queue) {
 			err = -ENOMEM;
-			goto out_put_disk;
+			goto out_destroy_workq;
 		}
 
 		blk_queue_max_hw_sectors(disks[dr]->queue, 64);
@@ -4213,7 +4204,7 @@ static int __init floppy_init(void)
 	use_virtual_dma = can_use_virtual_dma & 1;
 	fdc_state[0].address = FDC1;
 	if (fdc_state[0].address == -1) {
-		del_timer_sync(&fd_timeout);
+		cancel_delayed_work(&fd_timeout);
 		err = -ENODEV;
 		goto out_unreg_region;
 	}
@@ -4224,7 +4215,7 @@ static int __init floppy_init(void)
 	fdc = 0;		/* reset fdc in case of unexpected interrupt */
 	err = floppy_grab_irq_and_dma();
 	if (err) {
-		del_timer_sync(&fd_timeout);
+		cancel_delayed_work(&fd_timeout);
 		err = -EBUSY;
 		goto out_unreg_region;
 	}
@@ -4281,13 +4272,13 @@ static int __init floppy_init(void)
 		user_reset_fdc(-1, FD_RESET_ALWAYS, false);
 	}
 	fdc = 0;
-	del_timer_sync(&fd_timeout);
+	cancel_delayed_work(&fd_timeout);
 	current_drive = 0;
 	initialized = true;
 	if (have_no_fdc) {
 		DPRINT("no floppy controllers found\n");
 		err = have_no_fdc;
-		goto out_flush_work;
+		goto out_release_dma;
 	}
 
 	for (drive = 0; drive < N_DRIVE; drive++) {
@@ -4302,7 +4293,7 @@ static int __init floppy_init(void)
 
 		err = platform_device_register(&floppy_device[drive]);
 		if (err)
-			goto out_flush_work;
+			goto out_release_dma;
 
 		err = device_create_file(&floppy_device[drive].dev,
 					 &dev_attr_cmos);
@@ -4320,13 +4311,14 @@ static int __init floppy_init(void)
 
 out_unreg_platform_dev:
 	platform_device_unregister(&floppy_device[drive]);
-out_flush_work:
-	flush_work_sync(&floppy_work);
+out_release_dma:
 	if (atomic_read(&usage_count))
 		floppy_release_irq_and_dma();
 out_unreg_region:
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
 	platform_driver_unregister(&floppy_driver);
+out_destroy_workq:
+	destroy_workqueue(floppy_wq);
 out_unreg_blkdev:
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 out_put_disk:
@@ -4397,7 +4389,7 @@ static int floppy_grab_irq_and_dma(void)
 	 * We might have scheduled a free_irq(), wait it to
 	 * drain first:
 	 */
-	flush_work_sync(&floppy_work);
+	flush_workqueue(floppy_wq);
 
 	if (fd_request_irq()) {
 		DPRINT("Unable to grab IRQ%d for the floppy driver\n",
@@ -4488,9 +4480,9 @@ static void floppy_release_irq_and_dma(void)
 			pr_info("motor off timer %d still active\n", drive);
 #endif
 
-	if (timer_pending(&fd_timeout))
+	if (delayed_work_pending(&fd_timeout))
 		pr_info("floppy timer still active:%s\n", timeout_message);
-	if (timer_pending(&fd_timer))
+	if (delayed_work_pending(&fd_timer))
 		pr_info("auxiliary floppy timer still active\n");
 	if (work_pending(&floppy_work))
 		pr_info("work still pending\n");
@@ -4560,8 +4552,9 @@ static void __exit floppy_module_exit(void)
 		put_disk(disks[drive]);
 	}
 
-	del_timer_sync(&fd_timeout);
-	del_timer_sync(&fd_timer);
+	cancel_delayed_work_sync(&fd_timeout);
+	cancel_delayed_work_sync(&fd_timer);
+	destroy_workqueue(floppy_wq);
 
 	if (atomic_read(&usage_count))
 		floppy_release_irq_and_dma();
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 4e86393a09cf..60eed4bdd2e4 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -526,6 +526,14 @@ static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
 	return 0;
 }
 
+static char *encode_disk_name(char *ptr, unsigned int n)
+{
+	if (n >= 26)
+		ptr = encode_disk_name(ptr, n / 26 - 1);
+	*ptr = 'a' + n % 26;
+	return ptr + 1;
+}
+
 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 			       struct blkfront_info *info,
 			       u16 vdisk_info, u16 sector_size)
@@ -536,6 +544,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	unsigned int offset;
 	int minor;
 	int nr_parts;
+	char *ptr;
 
 	BUG_ON(info->gd != NULL);
 	BUG_ON(info->rq != NULL);
@@ -560,7 +569,11 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 					"emulated IDE disks,\n\t choose an xvd device name"
 					"from xvde on\n", info->vdevice);
 	}
-	err = -ENODEV;
+	if (minor >> MINORBITS) {
+		pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
+			info->vdevice, minor);
+		return -ENODEV;
+	}
 
 	if ((minor % nr_parts) == 0)
 		nr_minors = nr_parts;
@@ -574,23 +587,14 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	if (gd == NULL)
 		goto release;
 
-	if (nr_minors > 1) {
-		if (offset < 26)
-			sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
-		else
-			sprintf(gd->disk_name, "%s%c%c", DEV_NAME,
-				'a' + ((offset / 26)-1), 'a' + (offset % 26));
-	} else {
-		if (offset < 26)
-			sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
-				'a' + offset,
-				minor & (nr_parts - 1));
-		else
-			sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME,
-				'a' + ((offset / 26) - 1),
-				'a' + (offset % 26),
-				minor & (nr_parts - 1));
-	}
+	strcpy(gd->disk_name, DEV_NAME);
+	ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
+	BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
+	if (nr_minors > 1)
+		*ptr = 0;
+	else
+		snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
+			 "%d", minor & (nr_parts - 1));
 
 	gd->major = XENVBD_MAJOR;
 	gd->first_minor = minor;
@@ -1496,7 +1500,9 @@ module_init(xlblk_init);
 
 static void __exit xlblk_exit(void)
 {
-	return xenbus_unregister_driver(&blkfront_driver);
+	xenbus_unregister_driver(&blkfront_driver);
+	unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
+	kfree(minors);
 }
 module_exit(xlblk_exit);
 
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 7ef73c919c5d..7be9b7288e90 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -715,25 +715,6 @@ static inline u64 input_addr_to_sys_addr(struct mem_ctl_info *mci,
 				     input_addr_to_dram_addr(mci, input_addr));
 }
 
-/*
- * Find the minimum and maximum InputAddr values that map to the given @csrow.
- * Pass back these values in *input_addr_min and *input_addr_max.
- */
-static void find_csrow_limits(struct mem_ctl_info *mci, int csrow,
-			      u64 *input_addr_min, u64 *input_addr_max)
-{
-	struct amd64_pvt *pvt;
-	u64 base, mask;
-
-	pvt = mci->pvt_info;
-	BUG_ON((csrow < 0) || (csrow >= pvt->csels[0].b_cnt));
-
-	get_cs_base_and_mask(pvt, csrow, 0, &base, &mask);
-
-	*input_addr_min = base & ~mask;
-	*input_addr_max = base | mask;
-}
-
 /* Map the Error address to a PAGE and PAGE OFFSET. */
 static inline void error_address_to_page_and_offset(u64 error_address,
 						    u32 *page, u32 *offset)
@@ -1058,6 +1039,37 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
 	int channel, csrow;
 	u32 page, offset;
 
+	error_address_to_page_and_offset(sys_addr, &page, &offset);
+
+	/*
+	 * Find out which node the error address belongs to. This may be
+	 * different from the node that detected the error.
+	 */
+	src_mci = find_mc_by_sys_addr(mci, sys_addr);
+	if (!src_mci) {
+		amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n",
+			     (unsigned long)sys_addr);
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     page, offset, syndrome,
+				     -1, -1, -1,
+				     EDAC_MOD_STR,
+				     "failed to map error addr to a node",
+				     NULL);
+		return;
+	}
+
+	/* Now map the sys_addr to a CSROW */
+	csrow = sys_addr_to_csrow(src_mci, sys_addr);
+	if (csrow < 0) {
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     page, offset, syndrome,
+				     -1, -1, -1,
+				     EDAC_MOD_STR,
+				     "failed to map error addr to a csrow",
+				     NULL);
+		return;
+	}
+
 	/* CHIPKILL enabled */
 	if (pvt->nbcfg & NBCFG_CHIPKILL) {
 		channel = get_channel_from_ecc_syndrome(mci, syndrome);
@@ -1067,9 +1079,15 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
 			 * 2 DIMMs is in error. So we need to ID 'both' of them
 			 * as suspect.
 			 */
-			amd64_mc_warn(mci, "unknown syndrome 0x%04x - possible "
-					   "error reporting race\n", syndrome);
-			edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+			amd64_mc_warn(src_mci, "unknown syndrome 0x%04x - "
+				      "possible error reporting race\n",
+				      syndrome);
+			edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+					     page, offset, syndrome,
+					     csrow, -1, -1,
+					     EDAC_MOD_STR,
+					     "unknown syndrome - possible error reporting race",
+					     NULL);
 			return;
 		}
 	} else {
@@ -1084,28 +1102,10 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
 		channel = ((sys_addr & BIT(3)) != 0);
 	}
 
-	/*
-	 * Find out which node the error address belongs to. This may be
-	 * different from the node that detected the error.
-	 */
-	src_mci = find_mc_by_sys_addr(mci, sys_addr);
-	if (!src_mci) {
-		amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n",
-			     (unsigned long)sys_addr);
-		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
-		return;
-	}
-
-	/* Now map the sys_addr to a CSROW */
-	csrow = sys_addr_to_csrow(src_mci, sys_addr);
-	if (csrow < 0) {
-		edac_mc_handle_ce_no_info(src_mci, EDAC_MOD_STR);
-	} else {
-		error_address_to_page_and_offset(sys_addr, &page, &offset);
-
-		edac_mc_handle_ce(src_mci, page, offset, syndrome, csrow,
-				  channel, EDAC_MOD_STR);
-	}
+	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, src_mci,
+			     page, offset, syndrome,
+			     csrow, channel, -1,
+			     EDAC_MOD_STR, "", NULL);
 }
 
 static int ddr2_cs_size(unsigned i, bool dct_width)
@@ -1611,15 +1611,20 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
 	u32 page, offset;
 	int nid, csrow, chan = 0;
 
+	error_address_to_page_and_offset(sys_addr, &page, &offset);
+
 	csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &nid, &chan);
 
 	if (csrow < 0) {
-		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     page, offset, syndrome,
+				     -1, -1, -1,
+				     EDAC_MOD_STR,
+				     "failed to map error addr to a csrow",
+				     NULL);
 		return;
 	}
 
-	error_address_to_page_and_offset(sys_addr, &page, &offset);
-
 	/*
 	 * We need the syndromes for channel detection only when we're
 	 * ganged. Otherwise @chan should already contain the channel at
@@ -1628,16 +1633,10 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
 	if (dct_ganging_enabled(pvt))
 		chan = get_channel_from_ecc_syndrome(mci, syndrome);
 
-	if (chan >= 0)
-		edac_mc_handle_ce(mci, page, offset, syndrome, csrow, chan,
-				  EDAC_MOD_STR);
-	else
-		/*
-		 * Channel unknown, report all channels on this CSROW as failed.
-		 */
-		for (chan = 0; chan < mci->csrows[csrow].nr_channels; chan++)
-			edac_mc_handle_ce(mci, page, offset, syndrome,
-					  csrow, chan, EDAC_MOD_STR);
+	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+			     page, offset, syndrome,
+			     csrow, chan, -1,
+			     EDAC_MOD_STR, "", NULL);
 }
 
 /*
@@ -1918,7 +1917,12 @@ static void amd64_handle_ce(struct mem_ctl_info *mci, struct mce *m)
 	/* Ensure that the Error Address is VALID */
 	if (!(m->status & MCI_STATUS_ADDRV)) {
 		amd64_mc_err(mci, "HW has no ERROR_ADDRESS available\n");
-		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     0, 0, 0,
+				     -1, -1, -1,
+				     EDAC_MOD_STR,
+				     "HW has no ERROR_ADDRESS available",
+				     NULL);
 		return;
 	}
 
@@ -1942,11 +1946,17 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m)
 
 	if (!(m->status & MCI_STATUS_ADDRV)) {
 		amd64_mc_err(mci, "HW has no ERROR_ADDRESS available\n");
-		edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     0, 0, 0,
+				     -1, -1, -1,
+				     EDAC_MOD_STR,
+				     "HW has no ERROR_ADDRESS available",
+				     NULL);
 		return;
 	}
 
 	sys_addr = get_error_address(m);
+	error_address_to_page_and_offset(sys_addr, &page, &offset);
 
 	/*
 	 * Find out which node the error address belongs to. This may be
@@ -1956,7 +1966,11 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m)
 	if (!src_mci) {
 		amd64_mc_err(mci, "ERROR ADDRESS (0x%lx) NOT mapped to a MC\n",
 				  (unsigned long)sys_addr);
-		edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     page, offset, 0,
+				     -1, -1, -1,
+				     EDAC_MOD_STR,
+				     "ERROR ADDRESS NOT mapped to a MC", NULL);
 		return;
 	}
 
@@ -1966,10 +1980,17 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m)
 	if (csrow < 0) {
 		amd64_mc_err(mci, "ERROR_ADDRESS (0x%lx) NOT mapped to CS\n",
 				  (unsigned long)sys_addr);
-		edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     page, offset, 0,
+				     -1, -1, -1,
+				     EDAC_MOD_STR,
+				     "ERROR ADDRESS NOT mapped to CS",
+				     NULL);
 	} else {
-		error_address_to_page_and_offset(sys_addr, &page, &offset);
-		edac_mc_handle_ue(log_mci, page, offset, csrow, EDAC_MOD_STR);
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     page, offset, 0,
+				     csrow, -1, -1,
+				     EDAC_MOD_STR, "", NULL);
 	}
 }
 
@@ -2171,7 +2192,7 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr)
 	nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode) << (20 - PAGE_SHIFT);
 
 	debugf0("  (csrow=%d) DBAM map index= %d\n", csrow_nr, cs_mode);
-	debugf0("    nr_pages= %u  channel-count = %d\n",
+	debugf0("    nr_pages/channel= %u  channel-count = %d\n",
 		nr_pages, pvt->channel_count);
 
 	return nr_pages;
@@ -2185,9 +2206,12 @@ static int init_csrows(struct mem_ctl_info *mci)
 {
 	struct csrow_info *csrow;
 	struct amd64_pvt *pvt = mci->pvt_info;
-	u64 input_addr_min, input_addr_max, sys_addr, base, mask;
+	u64 base, mask;
 	u32 val;
-	int i, empty = 1;
+	int i, j, empty = 1;
+	enum mem_type mtype;
+	enum edac_type edac_mode;
+	int nr_pages = 0;
 
 	amd64_read_pci_cfg(pvt->F3, NBCFG, &val);
 
@@ -2211,41 +2235,32 @@ static int init_csrows(struct mem_ctl_info *mci)
 
 		empty = 0;
 		if (csrow_enabled(i, 0, pvt))
-			csrow->nr_pages = amd64_csrow_nr_pages(pvt, 0, i);
+			nr_pages = amd64_csrow_nr_pages(pvt, 0, i);
 		if (csrow_enabled(i, 1, pvt))
-			csrow->nr_pages += amd64_csrow_nr_pages(pvt, 1, i);
-		find_csrow_limits(mci, i, &input_addr_min, &input_addr_max);
-		sys_addr = input_addr_to_sys_addr(mci, input_addr_min);
-		csrow->first_page = (u32) (sys_addr >> PAGE_SHIFT);
-		sys_addr = input_addr_to_sys_addr(mci, input_addr_max);
-		csrow->last_page = (u32) (sys_addr >> PAGE_SHIFT);
+			nr_pages += amd64_csrow_nr_pages(pvt, 1, i);
 
 		get_cs_base_and_mask(pvt, i, 0, &base, &mask);
-		csrow->page_mask = ~mask;
 		/* 8 bytes of resolution */
 
-		csrow->mtype = amd64_determine_memory_type(pvt, i);
+		mtype = amd64_determine_memory_type(pvt, i);
 
 		debugf1("  for MC node %d csrow %d:\n", pvt->mc_node_id, i);
-		debugf1("    input_addr_min: 0x%lx input_addr_max: 0x%lx\n",
-			(unsigned long)input_addr_min,
-			(unsigned long)input_addr_max);
-		debugf1("    sys_addr: 0x%lx  page_mask: 0x%lx\n",
-			(unsigned long)sys_addr, csrow->page_mask);
-		debugf1("    nr_pages: %u  first_page: 0x%lx "
-			"last_page: 0x%lx\n",
-			(unsigned)csrow->nr_pages,
-			csrow->first_page, csrow->last_page);
+		debugf1("    nr_pages: %u\n", nr_pages * pvt->channel_count);
 
 		/*
 		 * determine whether CHIPKILL or JUST ECC or NO ECC is operating
 		 */
 		if (pvt->nbcfg & NBCFG_ECC_ENABLE)
-			csrow->edac_mode =
-			    (pvt->nbcfg & NBCFG_CHIPKILL) ?
-			    EDAC_S4ECD4ED : EDAC_SECDED;
+			edac_mode = (pvt->nbcfg & NBCFG_CHIPKILL) ?
+				    EDAC_S4ECD4ED : EDAC_SECDED;
 		else
-			csrow->edac_mode = EDAC_NONE;
+			edac_mode = EDAC_NONE;
+
+		for (j = 0; j < pvt->channel_count; j++) {
+			csrow->channels[j].dimm->mtype = mtype;
+			csrow->channels[j].dimm->edac_mode = edac_mode;
+			csrow->channels[j].dimm->nr_pages = nr_pages;
+		}
 	}
 
 	return empty;
@@ -2540,6 +2555,7 @@ static int amd64_init_one_instance(struct pci_dev *F2)
 	struct amd64_pvt *pvt = NULL;
 	struct amd64_family_type *fam_type = NULL;
 	struct mem_ctl_info *mci = NULL;
+	struct edac_mc_layer layers[2];
 	int err = 0, ret;
 	u8 nid = get_node_id(F2);
 
@@ -2574,7 +2590,13 @@ static int amd64_init_one_instance(struct pci_dev *F2)
 		goto err_siblings;
 
 	ret = -ENOMEM;
-	mci = edac_mc_alloc(0, pvt->csels[0].b_cnt, pvt->channel_count, nid);
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = pvt->csels[0].b_cnt;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = pvt->channel_count;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(nid, ARRAY_SIZE(layers), layers, 0);
 	if (!mci)
 		goto err_siblings;
 
diff --git a/drivers/edac/amd76x_edac.c b/drivers/edac/amd76x_edac.c
index f8fd3c807bde..9774d443fa57 100644
--- a/drivers/edac/amd76x_edac.c
+++ b/drivers/edac/amd76x_edac.c
@@ -29,7 +29,6 @@
 	edac_mc_chipset_printk(mci, level, "amd76x", fmt, ##arg)
 
 #define AMD76X_NR_CSROWS 8
-#define AMD76X_NR_CHANS  1
 #define AMD76X_NR_DIMMS  4
 
 /* AMD 76x register addresses - device 0 function 0 - PCI bridge */
@@ -146,8 +145,10 @@ static int amd76x_process_error_info(struct mem_ctl_info *mci,
 
 		if (handle_errors) {
 			row = (info->ecc_mode_status >> 4) & 0xf;
-			edac_mc_handle_ue(mci, mci->csrows[row].first_page, 0,
-					row, mci->ctl_name);
+			edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+					     mci->csrows[row].first_page, 0, 0,
+					     row, 0, -1,
+					     mci->ctl_name, "", NULL);
 		}
 	}
 
@@ -159,8 +160,10 @@ static int amd76x_process_error_info(struct mem_ctl_info *mci,
 
 		if (handle_errors) {
 			row = info->ecc_mode_status & 0xf;
-			edac_mc_handle_ce(mci, mci->csrows[row].first_page, 0,
-					0, row, 0, mci->ctl_name);
+			edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+					     mci->csrows[row].first_page, 0, 0,
+					     row, 0, -1,
+					     mci->ctl_name, "", NULL);
 		}
 	}
 
@@ -186,11 +189,13 @@ static void amd76x_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev,
 			enum edac_type edac_mode)
 {
 	struct csrow_info *csrow;
+	struct dimm_info *dimm;
 	u32 mba, mba_base, mba_mask, dms;
 	int index;
 
 	for (index = 0; index < mci->nr_csrows; index++) {
 		csrow = &mci->csrows[index];
+		dimm = csrow->channels[0].dimm;
 
 		/* find the DRAM Chip Select Base address and mask */
 		pci_read_config_dword(pdev,
@@ -203,13 +208,13 @@ static void amd76x_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev,
 		mba_mask = ((mba & 0xff80) << 16) | 0x7fffffUL;
 		pci_read_config_dword(pdev, AMD76X_DRAM_MODE_STATUS, &dms);
 		csrow->first_page = mba_base >> PAGE_SHIFT;
-		csrow->nr_pages = (mba_mask + 1) >> PAGE_SHIFT;
-		csrow->last_page = csrow->first_page + csrow->nr_pages - 1;
+		dimm->nr_pages = (mba_mask + 1) >> PAGE_SHIFT;
+		csrow->last_page = csrow->first_page + dimm->nr_pages - 1;
 		csrow->page_mask = mba_mask >> PAGE_SHIFT;
-		csrow->grain = csrow->nr_pages << PAGE_SHIFT;
-		csrow->mtype = MEM_RDDR;
-		csrow->dtype = ((dms >> index) & 0x1) ? DEV_X4 : DEV_UNKNOWN;
-		csrow->edac_mode = edac_mode;
+		dimm->grain = dimm->nr_pages << PAGE_SHIFT;
+		dimm->mtype = MEM_RDDR;
+		dimm->dtype = ((dms >> index) & 0x1) ? DEV_X4 : DEV_UNKNOWN;
+		dimm->edac_mode = edac_mode;
 	}
 }
 
@@ -230,7 +235,8 @@ static int amd76x_probe1(struct pci_dev *pdev, int dev_idx)
 		EDAC_SECDED,
 		EDAC_SECDED
 	};
-	struct mem_ctl_info *mci = NULL;
+	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[2];
 	u32 ems;
 	u32 ems_mode;
 	struct amd76x_error_info discard;
@@ -238,11 +244,17 @@ static int amd76x_probe1(struct pci_dev *pdev, int dev_idx)
 	debugf0("%s()\n", __func__);
 	pci_read_config_dword(pdev, AMD76X_ECC_MODE_STATUS, &ems);
 	ems_mode = (ems >> 10) & 0x3;
-	mci = edac_mc_alloc(0, AMD76X_NR_CSROWS, AMD76X_NR_CHANS, 0);
 
-	if (mci == NULL) {
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = AMD76X_NR_CSROWS;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = 1;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, 0);
+
+	if (mci == NULL)
 		return -ENOMEM;
-	}
 
 	debugf0("%s(): mci = %p\n", __func__, mci);
 	mci->dev = &pdev->dev;
diff --git a/drivers/edac/cell_edac.c b/drivers/edac/cell_edac.c
index 9a6a274e6925..69ee6aab5c71 100644
--- a/drivers/edac/cell_edac.c
+++ b/drivers/edac/cell_edac.c
@@ -48,8 +48,9 @@ static void cell_edac_count_ce(struct mem_ctl_info *mci, int chan, u64 ar)
 	syndrome = (ar & 0x000000001fe00000ul) >> 21;
 
 	/* TODO: Decoding of the error address */
-	edac_mc_handle_ce(mci, csrow->first_page + pfn, offset,
-			  syndrome, 0, chan, "");
+	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+			     csrow->first_page + pfn, offset, syndrome,
+			     0, chan, -1, "", "", NULL);
 }
 
 static void cell_edac_count_ue(struct mem_ctl_info *mci, int chan, u64 ar)
@@ -69,7 +70,9 @@ static void cell_edac_count_ue(struct mem_ctl_info *mci, int chan, u64 ar)
 	offset = address & ~PAGE_MASK;
 
 	/* TODO: Decoding of the error address */
-	edac_mc_handle_ue(mci, csrow->first_page + pfn, offset, 0, "");
+	edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+			     csrow->first_page + pfn, offset, 0,
+			     0, chan, -1, "", "", NULL);
 }
 
 static void cell_edac_check(struct mem_ctl_info *mci)
@@ -124,8 +127,11 @@ static void cell_edac_check(struct mem_ctl_info *mci)
 static void __devinit cell_edac_init_csrows(struct mem_ctl_info *mci)
 {
 	struct csrow_info		*csrow = &mci->csrows[0];
+	struct dimm_info		*dimm;
 	struct cell_edac_priv		*priv = mci->pvt_info;
 	struct device_node		*np;
+	int				j;
+	u32				nr_pages;
 
 	for (np = NULL;
 	     (np = of_find_node_by_name(np, "memory")) != NULL;) {
@@ -140,15 +146,20 @@ static void __devinit cell_edac_init_csrows(struct mem_ctl_info *mci)
 		if (of_node_to_nid(np) != priv->node)
 			continue;
 		csrow->first_page = r.start >> PAGE_SHIFT;
-		csrow->nr_pages = resource_size(&r) >> PAGE_SHIFT;
-		csrow->last_page = csrow->first_page + csrow->nr_pages - 1;
-		csrow->mtype = MEM_XDR;
-		csrow->edac_mode = EDAC_SECDED;
+		nr_pages = resource_size(&r) >> PAGE_SHIFT;
+		csrow->last_page = csrow->first_page + nr_pages - 1;
+
+		for (j = 0; j < csrow->nr_channels; j++) {
+			dimm = csrow->channels[j].dimm;
+			dimm->mtype = MEM_XDR;
+			dimm->edac_mode = EDAC_SECDED;
+			dimm->nr_pages = nr_pages / csrow->nr_channels;
+		}
 		dev_dbg(mci->dev,
 			"Initialized on node %d, chanmask=0x%x,"
 			" first_page=0x%lx, nr_pages=0x%x\n",
 			priv->node, priv->chanmask,
-			csrow->first_page, csrow->nr_pages);
+			csrow->first_page, nr_pages);
 		break;
 	}
 }
@@ -157,9 +168,10 @@ static int __devinit cell_edac_probe(struct platform_device *pdev)
 {
 	struct cbe_mic_tm_regs __iomem	*regs;
 	struct mem_ctl_info		*mci;
+	struct edac_mc_layer		layers[2];
 	struct cell_edac_priv		*priv;
 	u64				reg;
-	int				rc, chanmask;
+	int				rc, chanmask, num_chans;
 
 	regs = cbe_get_cpu_mic_tm_regs(cbe_node_to_cpu(pdev->id));
 	if (regs == NULL)
@@ -184,8 +196,16 @@ static int __devinit cell_edac_probe(struct platform_device *pdev)
 		in_be64(&regs->mic_fir));
 
 	/* Allocate & init EDAC MC data structure */
-	mci = edac_mc_alloc(sizeof(struct cell_edac_priv), 1,
-			    chanmask == 3 ? 2 : 1, pdev->id);
+	num_chans = chanmask == 3 ? 2 : 1;
+
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = 1;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = num_chans;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(pdev->id, ARRAY_SIZE(layers), layers,
+			    sizeof(struct cell_edac_priv));
 	if (mci == NULL)
 		return -ENOMEM;
 	priv = mci->pvt_info;
diff --git a/drivers/edac/cpc925_edac.c b/drivers/edac/cpc925_edac.c
index a774c0ddaf5b..e22030a9de66 100644
--- a/drivers/edac/cpc925_edac.c
+++ b/drivers/edac/cpc925_edac.c
@@ -329,9 +329,10 @@ static void cpc925_init_csrows(struct mem_ctl_info *mci)
 {
 	struct cpc925_mc_pdata *pdata = mci->pvt_info;
 	struct csrow_info *csrow;
-	int index;
+	struct dimm_info *dimm;
+	int index, j;
 	u32 mbmr, mbbar, bba;
-	unsigned long row_size, last_nr_pages = 0;
+	unsigned long row_size, nr_pages, last_nr_pages = 0;
 
 	get_total_mem(pdata);
 
@@ -350,36 +351,41 @@ static void cpc925_init_csrows(struct mem_ctl_info *mci)
 
 		row_size = bba * (1UL << 28);	/* 256M */
 		csrow->first_page = last_nr_pages;
-		csrow->nr_pages = row_size >> PAGE_SHIFT;
-		csrow->last_page = csrow->first_page + csrow->nr_pages - 1;
+		nr_pages = row_size >> PAGE_SHIFT;
+		csrow->last_page = csrow->first_page + nr_pages - 1;
 		last_nr_pages = csrow->last_page + 1;
 
-		csrow->mtype = MEM_RDDR;
-		csrow->edac_mode = EDAC_SECDED;
-
-		switch (csrow->nr_channels) {
-		case 1: /* Single channel */
-			csrow->grain = 32; /* four-beat burst of 32 bytes */
-			break;
-		case 2: /* Dual channel */
-		default:
-			csrow->grain = 64; /* four-beat burst of 64 bytes */
-			break;
-		}
-
-		switch ((mbmr & MBMR_MODE_MASK) >> MBMR_MODE_SHIFT) {
-		case 6: /* 0110, no way to differentiate X8 VS X16 */
-		case 5:	/* 0101 */
-		case 8: /* 1000 */
-			csrow->dtype = DEV_X16;
-			break;
-		case 7: /* 0111 */
-		case 9: /* 1001 */
-			csrow->dtype = DEV_X8;
-			break;
-		default:
-			csrow->dtype = DEV_UNKNOWN;
-			break;
+		for (j = 0; j < csrow->nr_channels; j++) {
+			dimm = csrow->channels[j].dimm;
+
+			dimm->nr_pages = nr_pages / csrow->nr_channels;
+			dimm->mtype = MEM_RDDR;
+			dimm->edac_mode = EDAC_SECDED;
+
+			switch (csrow->nr_channels) {
+			case 1: /* Single channel */
+				dimm->grain = 32; /* four-beat burst of 32 bytes */
+				break;
+			case 2: /* Dual channel */
+			default:
+				dimm->grain = 64; /* four-beat burst of 64 bytes */
+				break;
+			}
+
+			switch ((mbmr & MBMR_MODE_MASK) >> MBMR_MODE_SHIFT) {
+			case 6: /* 0110, no way to differentiate X8 VS X16 */
+			case 5:	/* 0101 */
+			case 8: /* 1000 */
+				dimm->dtype = DEV_X16;
+				break;
+			case 7: /* 0111 */
+			case 9: /* 1001 */
+				dimm->dtype = DEV_X8;
+				break;
+			default:
+				dimm->dtype = DEV_UNKNOWN;
+				break;
+			}
 		}
 	}
 }
@@ -549,13 +555,18 @@ static void cpc925_mc_check(struct mem_ctl_info *mci)
 	if (apiexcp & CECC_EXCP_DETECTED) {
 		cpc925_mc_printk(mci, KERN_INFO, "DRAM CECC Fault\n");
 		channel = cpc925_mc_find_channel(mci, syndrome);
-		edac_mc_handle_ce(mci, pfn, offset, syndrome,
-				  csrow, channel, mci->ctl_name);
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     pfn, offset, syndrome,
+				     csrow, channel, -1,
+				     mci->ctl_name, "", NULL);
 	}
 
 	if (apiexcp & UECC_EXCP_DETECTED) {
 		cpc925_mc_printk(mci, KERN_INFO, "DRAM UECC Fault\n");
-		edac_mc_handle_ue(mci, pfn, offset, csrow, mci->ctl_name);
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     pfn, offset, 0,
+				     csrow, -1, -1,
+				     mci->ctl_name, "", NULL);
 	}
 
 	cpc925_mc_printk(mci, KERN_INFO, "Dump registers:\n");
@@ -927,6 +938,7 @@ static int __devinit cpc925_probe(struct platform_device *pdev)
 {
 	static int edac_mc_idx;
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[2];
 	void __iomem *vbase;
 	struct cpc925_mc_pdata *pdata;
 	struct resource *r;
@@ -962,9 +974,16 @@ static int __devinit cpc925_probe(struct platform_device *pdev)
 		goto err2;
 	}
 
-	nr_channels = cpc925_mc_get_channels(vbase);
-	mci = edac_mc_alloc(sizeof(struct cpc925_mc_pdata),
-			CPC925_NR_CSROWS, nr_channels + 1, edac_mc_idx);
+	nr_channels = cpc925_mc_get_channels(vbase) + 1;
+
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = CPC925_NR_CSROWS;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = nr_channels;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(edac_mc_idx, ARRAY_SIZE(layers), layers,
+			    sizeof(struct cpc925_mc_pdata));
 	if (!mci) {
 		cpc925_printk(KERN_ERR, "No memory for mem_ctl_info\n");
 		res = -ENOMEM;
diff --git a/drivers/edac/e752x_edac.c b/drivers/edac/e752x_edac.c
index 41223261ede9..3186512c9739 100644
--- a/drivers/edac/e752x_edac.c
+++ b/drivers/edac/e752x_edac.c
@@ -4,7 +4,11 @@
  * This file may be distributed under the terms of the
  * GNU General Public License.
  *
- * See "enum e752x_chips" below for supported chipsets
+ * Implement support for the e7520, E7525, e7320 and i3100 memory controllers.
+ *
+ * Datasheets:
+ *	http://www.intel.in/content/www/in/en/chipsets/e7525-memory-controller-hub-datasheet.html
+ *	ftp://download.intel.com/design/intarch/datashts/31345803.pdf
  *
  * Written by Tom Zimmerman
  *
@@ -13,8 +17,6 @@
  * 	Wang Zhenyu at intel.com
  * 	Dave Jiang at mvista.com
  *
- * $Id: edac_e752x.c,v 1.5.2.11 2005/10/05 00:43:44 dsp_llnl Exp $
- *
  */
 
 #include <linux/module.h>
@@ -187,6 +189,25 @@ enum e752x_chips {
 	I3100 = 3
 };
 
+/*
+ * Those chips Support single-rank and dual-rank memories only.
+ *
+ * On e752x chips, the odd rows are present only on dual-rank memories.
+ * Dividing the rank by two will provide the dimm#
+ *
+ * i3100 MC has a different mapping: it supports only 4 ranks.
+ *
+ * The mapping is (from 1 to n):
+ *	slot	   single-ranked	double-ranked
+ *	dimm #1 -> rank #4		NA
+ *	dimm #2 -> rank #3		NA
+ *	dimm #3 -> rank #2		Ranks 2 and 3
+ *	dimm #4 -> rank $1		Ranks 1 and 4
+ *
+ * FIXME: The current mapping for i3100 considers that it supports up to 8
+ *	  ranks/chanel, but datasheet says that the MC supports only 4 ranks.
+ */
+
 struct e752x_pvt {
 	struct pci_dev *bridge_ck;
 	struct pci_dev *dev_d0f0;
@@ -350,8 +371,10 @@ static void do_process_ce(struct mem_ctl_info *mci, u16 error_one,
 	channel = !(error_one & 1);
 
 	/* e752x mc reads 34:6 of the DRAM linear address */
-	edac_mc_handle_ce(mci, page, offset_in_page(sec1_add << 4),
-			sec1_syndrome, row, channel, "e752x CE");
+	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+			     page, offset_in_page(sec1_add << 4), sec1_syndrome,
+			     row, channel, -1,
+			     "e752x CE", "", NULL);
 }
 
 static inline void process_ce(struct mem_ctl_info *mci, u16 error_one,
@@ -385,9 +408,12 @@ static void do_process_ue(struct mem_ctl_info *mci, u16 error_one,
 			edac_mc_find_csrow_by_page(mci, block_page);
 
 		/* e752x mc reads 34:6 of the DRAM linear address */
-		edac_mc_handle_ue(mci, block_page,
-				offset_in_page(error_2b << 4),
-				row, "e752x UE from Read");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+					block_page,
+					offset_in_page(error_2b << 4), 0,
+					 row, -1, -1,
+					"e752x UE from Read", "", NULL);
+
 	}
 	if (error_one & 0x0404) {
 		error_2b = scrb_add;
@@ -401,9 +427,11 @@ static void do_process_ue(struct mem_ctl_info *mci, u16 error_one,
 			edac_mc_find_csrow_by_page(mci, block_page);
 
 		/* e752x mc reads 34:6 of the DRAM linear address */
-		edac_mc_handle_ue(mci, block_page,
-				offset_in_page(error_2b << 4),
-				row, "e752x UE from Scruber");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+					block_page,
+					offset_in_page(error_2b << 4), 0,
+					row, -1, -1,
+					"e752x UE from Scruber", "", NULL);
 	}
 }
 
@@ -426,7 +454,9 @@ static inline void process_ue_no_info_wr(struct mem_ctl_info *mci,
 		return;
 
 	debugf3("%s()\n", __func__);
-	edac_mc_handle_ue_no_info(mci, "e752x UE log memory write");
+	edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0,
+			     -1, -1, -1,
+			     "e752x UE log memory write", "", NULL);
 }
 
 static void do_process_ded_retry(struct mem_ctl_info *mci, u16 error,
@@ -1044,7 +1074,7 @@ static void e752x_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev,
 	int drc_drbg;		/* DRB granularity 0=64mb, 1=128mb */
 	int drc_ddim;		/* DRAM Data Integrity Mode 0=none, 2=edac */
 	u8 value;
-	u32 dra, drc, cumul_size;
+	u32 dra, drc, cumul_size, i, nr_pages;
 
 	dra = 0;
 	for (index = 0; index < 4; index++) {
@@ -1053,7 +1083,7 @@ static void e752x_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev,
 		dra |= dra_reg << (index * 8);
 	}
 	pci_read_config_dword(pdev, E752X_DRC, &drc);
-	drc_chan = dual_channel_active(ddrcsr);
+	drc_chan = dual_channel_active(ddrcsr) ? 1 : 0;
 	drc_drbg = drc_chan + 1;	/* 128 in dual mode, 64 in single */
 	drc_ddim = (drc >> 20) & 0x3;
 
@@ -1078,26 +1108,33 @@ static void e752x_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev,
 
 		csrow->first_page = last_cumul_size;
 		csrow->last_page = cumul_size - 1;
-		csrow->nr_pages = cumul_size - last_cumul_size;
+		nr_pages = cumul_size - last_cumul_size;
 		last_cumul_size = cumul_size;
-		csrow->grain = 1 << 12;	/* 4KiB - resolution of CELOG */
-		csrow->mtype = MEM_RDDR;	/* only one type supported */
-		csrow->dtype = mem_dev ? DEV_X4 : DEV_X8;
-
-		/*
-		 * if single channel or x8 devices then SECDED
-		 * if dual channel and x4 then S4ECD4ED
-		 */
-		if (drc_ddim) {
-			if (drc_chan && mem_dev) {
-				csrow->edac_mode = EDAC_S4ECD4ED;
-				mci->edac_cap |= EDAC_FLAG_S4ECD4ED;
-			} else {
-				csrow->edac_mode = EDAC_SECDED;
-				mci->edac_cap |= EDAC_FLAG_SECDED;
-			}
-		} else
-			csrow->edac_mode = EDAC_NONE;
+
+		for (i = 0; i < csrow->nr_channels; i++) {
+			struct dimm_info *dimm = csrow->channels[i].dimm;
+
+			debugf3("Initializing rank at (%i,%i)\n", index, i);
+			dimm->nr_pages = nr_pages / csrow->nr_channels;
+			dimm->grain = 1 << 12;	/* 4KiB - resolution of CELOG */
+			dimm->mtype = MEM_RDDR;	/* only one type supported */
+			dimm->dtype = mem_dev ? DEV_X4 : DEV_X8;
+
+			/*
+			* if single channel or x8 devices then SECDED
+			* if dual channel and x4 then S4ECD4ED
+			*/
+			if (drc_ddim) {
+				if (drc_chan && mem_dev) {
+					dimm->edac_mode = EDAC_S4ECD4ED;
+					mci->edac_cap |= EDAC_FLAG_S4ECD4ED;
+				} else {
+					dimm->edac_mode = EDAC_SECDED;
+					mci->edac_cap |= EDAC_FLAG_SECDED;
+				}
+			} else
+				dimm->edac_mode = EDAC_NONE;
+		}
 	}
 }
 
@@ -1226,6 +1263,7 @@ static int e752x_probe1(struct pci_dev *pdev, int dev_idx)
 	u16 pci_data;
 	u8 stat8;
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[2];
 	struct e752x_pvt *pvt;
 	u16 ddrcsr;
 	int drc_chan;		/* Number of channels 0=1chan,1=2chan */
@@ -1252,11 +1290,15 @@ static int e752x_probe1(struct pci_dev *pdev, int dev_idx)
 	/* Dual channel = 1, Single channel = 0 */
 	drc_chan = dual_channel_active(ddrcsr);
 
-	mci = edac_mc_alloc(sizeof(*pvt), E752X_NR_CSROWS, drc_chan + 1, 0);
-
-	if (mci == NULL) {
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = E752X_NR_CSROWS;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = drc_chan + 1;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt));
+	if (mci == NULL)
 		return -ENOMEM;
-	}
 
 	debugf3("%s(): init mci\n", __func__);
 	mci->mtype_cap = MEM_FLAG_RDDR;
diff --git a/drivers/edac/e7xxx_edac.c b/drivers/edac/e7xxx_edac.c
index 68dea87b72e6..9a9c1a546797 100644
--- a/drivers/edac/e7xxx_edac.c
+++ b/drivers/edac/e7xxx_edac.c
@@ -10,6 +10,9 @@
  * Based on work by Dan Hollis <goemon at anime dot net> and others.
  *	http://www.anime.net/~goemon/linux-ecc/
  *
+ * Datasheet:
+ *	http://www.intel.com/content/www/us/en/chipsets/e7501-chipset-memory-controller-hub-datasheet.html
+ *
  * Contributors:
  *	Eric Biederman (Linux Networx)
  *	Tom Zimmerman (Linux Networx)
@@ -71,7 +74,7 @@
 #endif				/* PCI_DEVICE_ID_INTEL_7505_1_ERR */
 
 #define E7XXX_NR_CSROWS		8	/* number of csrows */
-#define E7XXX_NR_DIMMS		8	/* FIXME - is this correct? */
+#define E7XXX_NR_DIMMS		8	/* 2 channels, 4 dimms/channel */
 
 /* E7XXX register addresses - device 0 function 0 */
 #define E7XXX_DRB		0x60	/* DRAM row boundary register (8b) */
@@ -216,13 +219,15 @@ static void process_ce(struct mem_ctl_info *mci, struct e7xxx_error_info *info)
 	row = edac_mc_find_csrow_by_page(mci, page);
 	/* convert syndrome to channel */
 	channel = e7xxx_find_channel(syndrome);
-	edac_mc_handle_ce(mci, page, 0, syndrome, row, channel, "e7xxx CE");
+	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, page, 0, syndrome,
+			     row, channel, -1, "e7xxx CE", "", NULL);
 }
 
 static void process_ce_no_info(struct mem_ctl_info *mci)
 {
 	debugf3("%s()\n", __func__);
-	edac_mc_handle_ce_no_info(mci, "e7xxx CE log register overflow");
+	edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0, -1, -1, -1,
+			     "e7xxx CE log register overflow", "", NULL);
 }
 
 static void process_ue(struct mem_ctl_info *mci, struct e7xxx_error_info *info)
@@ -236,13 +241,17 @@ static void process_ue(struct mem_ctl_info *mci, struct e7xxx_error_info *info)
 	/* FIXME - should use PAGE_SHIFT */
 	block_page = error_2b >> 6;	/* convert to 4k address */
 	row = edac_mc_find_csrow_by_page(mci, block_page);
-	edac_mc_handle_ue(mci, block_page, 0, row, "e7xxx UE");
+
+	edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, block_page, 0, 0,
+			     row, -1, -1, "e7xxx UE", "", NULL);
 }
 
 static void process_ue_no_info(struct mem_ctl_info *mci)
 {
 	debugf3("%s()\n", __func__);
-	edac_mc_handle_ue_no_info(mci, "e7xxx UE log register overflow");
+
+	edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0, -1, -1, -1,
+			     "e7xxx UE log register overflow", "", NULL);
 }
 
 static void e7xxx_get_error_info(struct mem_ctl_info *mci,
@@ -347,11 +356,12 @@ static void e7xxx_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev,
 			int dev_idx, u32 drc)
 {
 	unsigned long last_cumul_size;
-	int index;
+	int index, j;
 	u8 value;
-	u32 dra, cumul_size;
+	u32 dra, cumul_size, nr_pages;
 	int drc_chan, drc_drbg, drc_ddim, mem_dev;
 	struct csrow_info *csrow;
+	struct dimm_info *dimm;
 
 	pci_read_config_dword(pdev, E7XXX_DRA, &dra);
 	drc_chan = dual_channel_active(drc, dev_idx);
@@ -379,26 +389,32 @@ static void e7xxx_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev,
 
 		csrow->first_page = last_cumul_size;
 		csrow->last_page = cumul_size - 1;
-		csrow->nr_pages = cumul_size - last_cumul_size;
+		nr_pages = cumul_size - last_cumul_size;
 		last_cumul_size = cumul_size;
-		csrow->grain = 1 << 12;	/* 4KiB - resolution of CELOG */
-		csrow->mtype = MEM_RDDR;	/* only one type supported */
-		csrow->dtype = mem_dev ? DEV_X4 : DEV_X8;
-
-		/*
-		 * if single channel or x8 devices then SECDED
-		 * if dual channel and x4 then S4ECD4ED
-		 */
-		if (drc_ddim) {
-			if (drc_chan && mem_dev) {
-				csrow->edac_mode = EDAC_S4ECD4ED;
-				mci->edac_cap |= EDAC_FLAG_S4ECD4ED;
-			} else {
-				csrow->edac_mode = EDAC_SECDED;
-				mci->edac_cap |= EDAC_FLAG_SECDED;
-			}
-		} else
-			csrow->edac_mode = EDAC_NONE;
+
+		for (j = 0; j < drc_chan + 1; j++) {
+			dimm = csrow->channels[j].dimm;
+
+			dimm->nr_pages = nr_pages / (drc_chan + 1);
+			dimm->grain = 1 << 12;	/* 4KiB - resolution of CELOG */
+			dimm->mtype = MEM_RDDR;	/* only one type supported */
+			dimm->dtype = mem_dev ? DEV_X4 : DEV_X8;
+
+			/*
+			* if single channel or x8 devices then SECDED
+			* if dual channel and x4 then S4ECD4ED
+			*/
+			if (drc_ddim) {
+				if (drc_chan && mem_dev) {
+					dimm->edac_mode = EDAC_S4ECD4ED;
+					mci->edac_cap |= EDAC_FLAG_S4ECD4ED;
+				} else {
+					dimm->edac_mode = EDAC_SECDED;
+					mci->edac_cap |= EDAC_FLAG_SECDED;
+				}
+			} else
+				dimm->edac_mode = EDAC_NONE;
+		}
 	}
 }
 
@@ -406,6 +422,7 @@ static int e7xxx_probe1(struct pci_dev *pdev, int dev_idx)
 {
 	u16 pci_data;
 	struct mem_ctl_info *mci = NULL;
+	struct edac_mc_layer layers[2];
 	struct e7xxx_pvt *pvt = NULL;
 	u32 drc;
 	int drc_chan;
@@ -416,8 +433,21 @@ static int e7xxx_probe1(struct pci_dev *pdev, int dev_idx)
 	pci_read_config_dword(pdev, E7XXX_DRC, &drc);
 
 	drc_chan = dual_channel_active(drc, dev_idx);
-	mci = edac_mc_alloc(sizeof(*pvt), E7XXX_NR_CSROWS, drc_chan + 1, 0);
-
+	/*
+	 * According with the datasheet, this device has a maximum of
+	 * 4 DIMMS per channel, either single-rank or dual-rank. So, the
+	 * total amount of dimms is 8 (E7XXX_NR_DIMMS).
+	 * That means that the DIMM is mapped as CSROWs, and the channel
+	 * will map the rank. So, an error to either channel should be
+	 * attributed to the same dimm.
+	 */
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = E7XXX_NR_CSROWS;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = drc_chan + 1;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt));
 	if (mci == NULL)
 		return -ENOMEM;
 
diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
index 5b739411d62f..117490d4f835 100644
--- a/drivers/edac/edac_core.h
+++ b/drivers/edac/edac_core.h
@@ -447,8 +447,10 @@ static inline void pci_write_bits32(struct pci_dev *pdev, int offset,
 
 #endif				/* CONFIG_PCI */
 
-extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
-					  unsigned nr_chans, int edac_index);
+struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
+				   unsigned n_layers,
+				   struct edac_mc_layer *layers,
+				   unsigned sz_pvt);
 extern int edac_mc_add_mc(struct mem_ctl_info *mci);
 extern void edac_mc_free(struct mem_ctl_info *mci);
 extern struct mem_ctl_info *edac_mc_find(int idx);
@@ -456,35 +458,17 @@ extern struct mem_ctl_info *find_mci_by_dev(struct device *dev);
 extern struct mem_ctl_info *edac_mc_del_mc(struct device *dev);
 extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
 				      unsigned long page);
-
-/*
- * The no info errors are used when error overflows are reported.
- * There are a limited number of error logging registers that can
- * be exausted.  When all registers are exhausted and an additional
- * error occurs then an error overflow register records that an
- * error occurred and the type of error, but doesn't have any
- * further information.  The ce/ue versions make for cleaner
- * reporting logic and function interface - reduces conditional
- * statement clutter and extra function arguments.
- */
-extern void edac_mc_handle_ce(struct mem_ctl_info *mci,
-			      unsigned long page_frame_number,
-			      unsigned long offset_in_page,
-			      unsigned long syndrome, int row, int channel,
-			      const char *msg);
-extern void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci,
-				      const char *msg);
-extern void edac_mc_handle_ue(struct mem_ctl_info *mci,
-			      unsigned long page_frame_number,
-			      unsigned long offset_in_page, int row,
-			      const char *msg);
-extern void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci,
-				      const char *msg);
-extern void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci, unsigned int csrow,
-				  unsigned int channel0, unsigned int channel1,
-				  char *msg);
-extern void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci, unsigned int csrow,
-				  unsigned int channel, char *msg);
+void edac_mc_handle_error(const enum hw_event_mc_err_type type,
+			  struct mem_ctl_info *mci,
+			  const unsigned long page_frame_number,
+			  const unsigned long offset_in_page,
+			  const unsigned long syndrome,
+			  const int layer0,
+			  const int layer1,
+			  const int layer2,
+			  const char *msg,
+			  const char *other_detail,
+			  const void *mcelog);
 
 /*
  * edac_device APIs
@@ -496,6 +480,7 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
 extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
 				int inst_nr, int block_nr, const char *msg);
 extern int edac_device_alloc_index(void);
+extern const char *edac_layer_name[];
 
 /*
  * edac_pci APIs
diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index 45b8f4bdd773..ee3f1f810c1e 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -79,7 +79,7 @@ struct edac_device_ctl_info *edac_device_alloc_ctl_info(
 	unsigned total_size;
 	unsigned count;
 	unsigned instance, block, attr;
-	void *pvt;
+	void *pvt, *p;
 	int err;
 
 	debugf4("%s() instances=%d blocks=%d\n",
@@ -92,35 +92,30 @@ struct edac_device_ctl_info *edac_device_alloc_ctl_info(
 	 * to be at least as stringent as what the compiler would
 	 * provide if we could simply hardcode everything into a single struct.
 	 */
-	dev_ctl = (struct edac_device_ctl_info *)NULL;
+	p = NULL;
+	dev_ctl = edac_align_ptr(&p, sizeof(*dev_ctl), 1);
 
 	/* Calc the 'end' offset past end of ONE ctl_info structure
 	 * which will become the start of the 'instance' array
 	 */
-	dev_inst = edac_align_ptr(&dev_ctl[1], sizeof(*dev_inst));
+	dev_inst = edac_align_ptr(&p, sizeof(*dev_inst), nr_instances);
 
 	/* Calc the 'end' offset past the instance array within the ctl_info
 	 * which will become the start of the block array
 	 */
-	dev_blk = edac_align_ptr(&dev_inst[nr_instances], sizeof(*dev_blk));
+	count = nr_instances * nr_blocks;
+	dev_blk = edac_align_ptr(&p, sizeof(*dev_blk), count);
 
 	/* Calc the 'end' offset past the dev_blk array
 	 * which will become the start of the attrib array, if any.
 	 */
-	count = nr_instances * nr_blocks;
-	dev_attrib = edac_align_ptr(&dev_blk[count], sizeof(*dev_attrib));
-
-	/* Check for case of when an attribute array is specified */
-	if (nr_attrib > 0) {
-		/* calc how many nr_attrib we need */
+	/* calc how many nr_attrib we need */
+	if (nr_attrib > 0)
 		count *= nr_attrib;
+	dev_attrib = edac_align_ptr(&p, sizeof(*dev_attrib), count);
 
-		/* Calc the 'end' offset past the attributes array */
-		pvt = edac_align_ptr(&dev_attrib[count], sz_private);
-	} else {
-		/* no attribute array specified */
-		pvt = edac_align_ptr(dev_attrib, sz_private);
-	}
+	/* Calc the 'end' offset past the attributes array */
+	pvt = edac_align_ptr(&p, sz_private, 1);
 
 	/* 'pvt' now points to where the private data area is.
 	 * At this point 'pvt' (like dev_inst,dev_blk and dev_attrib)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index feef7733fae7..10f375032e96 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -43,9 +43,26 @@ static void edac_mc_dump_channel(struct rank_info *chan)
 {
 	debugf4("\tchannel = %p\n", chan);
 	debugf4("\tchannel->chan_idx = %d\n", chan->chan_idx);
-	debugf4("\tchannel->ce_count = %d\n", chan->ce_count);
-	debugf4("\tchannel->label = '%s'\n", chan->label);
 	debugf4("\tchannel->csrow = %p\n\n", chan->csrow);
+	debugf4("\tchannel->dimm = %p\n", chan->dimm);
+}
+
+static void edac_mc_dump_dimm(struct dimm_info *dimm)
+{
+	int i;
+
+	debugf4("\tdimm = %p\n", dimm);
+	debugf4("\tdimm->label = '%s'\n", dimm->label);
+	debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
+	debugf4("\tdimm location ");
+	for (i = 0; i < dimm->mci->n_layers; i++) {
+		printk(KERN_CONT "%d", dimm->location[i]);
+		if (i < dimm->mci->n_layers - 1)
+			printk(KERN_CONT ".");
+	}
+	printk(KERN_CONT "\n");
+	debugf4("\tdimm->grain = %d\n", dimm->grain);
+	debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
 }
 
 static void edac_mc_dump_csrow(struct csrow_info *csrow)
@@ -55,7 +72,6 @@ static void edac_mc_dump_csrow(struct csrow_info *csrow)
 	debugf4("\tcsrow->first_page = 0x%lx\n", csrow->first_page);
 	debugf4("\tcsrow->last_page = 0x%lx\n", csrow->last_page);
 	debugf4("\tcsrow->page_mask = 0x%lx\n", csrow->page_mask);
-	debugf4("\tcsrow->nr_pages = 0x%x\n", csrow->nr_pages);
 	debugf4("\tcsrow->nr_channels = %d\n", csrow->nr_channels);
 	debugf4("\tcsrow->channels = %p\n", csrow->channels);
 	debugf4("\tcsrow->mci = %p\n\n", csrow->mci);
@@ -70,6 +86,8 @@ static void edac_mc_dump_mci(struct mem_ctl_info *mci)
 	debugf4("\tmci->edac_check = %p\n", mci->edac_check);
 	debugf3("\tmci->nr_csrows = %d, csrows = %p\n",
 		mci->nr_csrows, mci->csrows);
+	debugf3("\tmci->nr_dimms = %d, dimms = %p\n",
+		mci->tot_dimms, mci->dimms);
 	debugf3("\tdev = %p\n", mci->dev);
 	debugf3("\tmod_name:ctl_name = %s:%s\n", mci->mod_name, mci->ctl_name);
 	debugf3("\tpvt_info = %p\n\n", mci->pvt_info);
@@ -101,18 +119,37 @@ const char *edac_mem_types[] = {
 };
 EXPORT_SYMBOL_GPL(edac_mem_types);
 
-/* 'ptr' points to a possibly unaligned item X such that sizeof(X) is 'size'.
- * Adjust 'ptr' so that its alignment is at least as stringent as what the
- * compiler would provide for X and return the aligned result.
+/**
+ * edac_align_ptr - Prepares the pointer offsets for a single-shot allocation
+ * @p:		pointer to a pointer with the memory offset to be used. At
+ *		return, this will be incremented to point to the next offset
+ * @size:	Size of the data structure to be reserved
+ * @n_elems:	Number of elements that should be reserved
  *
  * If 'size' is a constant, the compiler will optimize this whole function
- * down to either a no-op or the addition of a constant to the value of 'ptr'.
+ * down to either a no-op or the addition of a constant to the value of '*p'.
+ *
+ * The 'p' pointer is absolutely needed to keep the proper advancing
+ * further in memory to the proper offsets when allocating the struct along
+ * with its embedded structs, as edac_device_alloc_ctl_info() does it
+ * above, for example.
+ *
+ * At return, the pointer 'p' will be incremented to be used on a next call
+ * to this function.
  */
-void *edac_align_ptr(void *ptr, unsigned size)
+void *edac_align_ptr(void **p, unsigned size, int n_elems)
 {
 	unsigned align, r;
+	void *ptr = *p;
+
+	*p += size * n_elems;
 
-	/* Here we assume that the alignment of a "long long" is the most
+	/*
+	 * 'p' can possibly be an unaligned item X such that sizeof(X) is
+	 * 'size'.  Adjust 'p' so that its alignment is at least as
+	 * stringent as what the compiler would provide for X and return
+	 * the aligned result.
+	 * Here we assume that the alignment of a "long long" is the most
 	 * stringent alignment that the compiler will ever provide by default.
 	 * As far as I know, this is a reasonable assumption.
 	 */
@@ -132,14 +169,18 @@ void *edac_align_ptr(void *ptr, unsigned size)
 	if (r == 0)
 		return (char *)ptr;
 
+	*p += align - r;
+
 	return (void *)(((unsigned long)ptr) + align - r);
 }
 
 /**
- * edac_mc_alloc: Allocate a struct mem_ctl_info structure
- * @size_pvt:	size of private storage needed
- * @nr_csrows:	Number of CWROWS needed for this MC
- * @nr_chans:	Number of channels for the MC
+ * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
+ * @mc_num:		Memory controller number
+ * @n_layers:		Number of MC hierarchy layers
+ * layers:		Describes each layer as seen by the Memory Controller
+ * @size_pvt:		size of private storage needed
+ *
  *
  * Everything is kmalloc'ed as one big chunk - more efficient.
  * Only can be used if all structures have the same lifetime - otherwise
@@ -147,32 +188,77 @@ void *edac_align_ptr(void *ptr, unsigned size)
  *
  * Use edac_mc_free() to free mc structures allocated by this function.
  *
+ * NOTE: drivers handle multi-rank memories in different ways: in some
+ * drivers, one multi-rank memory stick is mapped as one entry, while, in
+ * others, a single multi-rank memory stick would be mapped into several
+ * entries. Currently, this function will allocate multiple struct dimm_info
+ * on such scenarios, as grouping the multiple ranks require drivers change.
+ *
  * Returns:
- *	NULL allocation failed
- *	struct mem_ctl_info pointer
+ *	On failure: NULL
+ *	On success: struct mem_ctl_info pointer
  */
-struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
-				unsigned nr_chans, int edac_index)
+struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
+				   unsigned n_layers,
+				   struct edac_mc_layer *layers,
+				   unsigned sz_pvt)
 {
 	struct mem_ctl_info *mci;
-	struct csrow_info *csi, *csrow;
+	struct edac_mc_layer *layer;
+	struct csrow_info *csi, *csr;
 	struct rank_info *chi, *chp, *chan;
-	void *pvt;
-	unsigned size;
-	int row, chn;
-	int err;
+	struct dimm_info *dimm;
+	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
+	unsigned pos[EDAC_MAX_LAYERS];
+	unsigned size, tot_dimms = 1, count = 1;
+	unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
+	void *pvt, *p, *ptr = NULL;
+	int i, j, err, row, chn, n, len;
+	bool per_rank = false;
+
+	BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
+	/*
+	 * Calculate the total amount of dimms and csrows/cschannels while
+	 * in the old API emulation mode
+	 */
+	for (i = 0; i < n_layers; i++) {
+		tot_dimms *= layers[i].size;
+		if (layers[i].is_virt_csrow)
+			tot_csrows *= layers[i].size;
+		else
+			tot_channels *= layers[i].size;
+
+		if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
+			per_rank = true;
+	}
 
 	/* Figure out the offsets of the various items from the start of an mc
 	 * structure.  We want the alignment of each item to be at least as
 	 * stringent as what the compiler would provide if we could simply
 	 * hardcode everything into a single struct.
 	 */
-	mci = (struct mem_ctl_info *)0;
-	csi = edac_align_ptr(&mci[1], sizeof(*csi));
-	chi = edac_align_ptr(&csi[nr_csrows], sizeof(*chi));
-	pvt = edac_align_ptr(&chi[nr_chans * nr_csrows], sz_pvt);
+	mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
+	layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
+	csi = edac_align_ptr(&ptr, sizeof(*csi), tot_csrows);
+	chi = edac_align_ptr(&ptr, sizeof(*chi), tot_csrows * tot_channels);
+	dimm = edac_align_ptr(&ptr, sizeof(*dimm), tot_dimms);
+	for (i = 0; i < n_layers; i++) {
+		count *= layers[i].size;
+		debugf4("%s: errcount layer %d size %d\n", __func__, i, count);
+		ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
+		ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
+		tot_errcount += 2 * count;
+	}
+
+	debugf4("%s: allocating %d error counters\n", __func__, tot_errcount);
+	pvt = edac_align_ptr(&ptr, sz_pvt, 1);
 	size = ((unsigned long)pvt) + sz_pvt;
 
+	debugf1("%s(): allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
+		__func__, size,
+		tot_dimms,
+		per_rank ? "ranks" : "dimms",
+		tot_csrows * tot_channels);
 	mci = kzalloc(size, GFP_KERNEL);
 	if (mci == NULL)
 		return NULL;
@@ -180,28 +266,103 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
 	/* Adjust pointers so they point within the memory we just allocated
 	 * rather than an imaginary chunk of memory located at address 0.
 	 */
+	layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
 	csi = (struct csrow_info *)(((char *)mci) + ((unsigned long)csi));
 	chi = (struct rank_info *)(((char *)mci) + ((unsigned long)chi));
+	dimm = (struct dimm_info *)(((char *)mci) + ((unsigned long)dimm));
+	for (i = 0; i < n_layers; i++) {
+		mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
+		mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
+	}
 	pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
 
 	/* setup index and various internal pointers */
-	mci->mc_idx = edac_index;
+	mci->mc_idx = mc_num;
 	mci->csrows = csi;
+	mci->dimms  = dimm;
+	mci->tot_dimms = tot_dimms;
 	mci->pvt_info = pvt;
-	mci->nr_csrows = nr_csrows;
-
-	for (row = 0; row < nr_csrows; row++) {
-		csrow = &csi[row];
-		csrow->csrow_idx = row;
-		csrow->mci = mci;
-		csrow->nr_channels = nr_chans;
-		chp = &chi[row * nr_chans];
-		csrow->channels = chp;
+	mci->n_layers = n_layers;
+	mci->layers = layer;
+	memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
+	mci->nr_csrows = tot_csrows;
+	mci->num_cschannel = tot_channels;
+	mci->mem_is_per_rank = per_rank;
 
-		for (chn = 0; chn < nr_chans; chn++) {
+	/*
+	 * Fill the csrow struct
+	 */
+	for (row = 0; row < tot_csrows; row++) {
+		csr = &csi[row];
+		csr->csrow_idx = row;
+		csr->mci = mci;
+		csr->nr_channels = tot_channels;
+		chp = &chi[row * tot_channels];
+		csr->channels = chp;
+
+		for (chn = 0; chn < tot_channels; chn++) {
 			chan = &chp[chn];
 			chan->chan_idx = chn;
-			chan->csrow = csrow;
+			chan->csrow = csr;
+		}
+	}
+
+	/*
+	 * Fill the dimm struct
+	 */
+	memset(&pos, 0, sizeof(pos));
+	row = 0;
+	chn = 0;
+	debugf4("%s: initializing %d %s\n", __func__, tot_dimms,
+		per_rank ? "ranks" : "dimms");
+	for (i = 0; i < tot_dimms; i++) {
+		chan = &csi[row].channels[chn];
+		dimm = EDAC_DIMM_PTR(layer, mci->dimms, n_layers,
+			       pos[0], pos[1], pos[2]);
+		dimm->mci = mci;
+
+		debugf2("%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n", __func__,
+			i, per_rank ? "rank" : "dimm", (dimm - mci->dimms),
+			pos[0], pos[1], pos[2], row, chn);
+
+		/*
+		 * Copy DIMM location and initialize it.
+		 */
+		len = sizeof(dimm->label);
+		p = dimm->label;
+		n = snprintf(p, len, "mc#%u", mc_num);
+		p += n;
+		len -= n;
+		for (j = 0; j < n_layers; j++) {
+			n = snprintf(p, len, "%s#%u",
+				     edac_layer_name[layers[j].type],
+				     pos[j]);
+			p += n;
+			len -= n;
+			dimm->location[j] = pos[j];
+
+			if (len <= 0)
+				break;
+		}
+
+		/* Link it to the csrows old API data */
+		chan->dimm = dimm;
+		dimm->csrow = row;
+		dimm->cschannel = chn;
+
+		/* Increment csrow location */
+		row++;
+		if (row == tot_csrows) {
+			row = 0;
+			chn++;
+		}
+
+		/* Increment dimm location */
+		for (j = n_layers - 1; j >= 0; j--) {
+			pos[j]++;
+			if (pos[j] < layers[j].size)
+				break;
+			pos[j] = 0;
 		}
 	}
 
@@ -490,7 +651,6 @@ EXPORT_SYMBOL(edac_mc_find);
  * edac_mc_add_mc: Insert the 'mci' structure into the mci global list and
  *                 create sysfs entries associated with mci structure
  * @mci: pointer to the mci structure to be added to the list
- * @mc_idx: A unique numeric identifier to be assigned to the 'mci' structure.
  *
  * Return:
  *	0	Success
@@ -517,6 +677,8 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
 				edac_mc_dump_channel(&mci->csrows[i].
 						channels[j]);
 		}
+		for (i = 0; i < mci->tot_dimms; i++)
+			edac_mc_dump_dimm(&mci->dimms[i]);
 	}
 #endif
 	mutex_lock(&mem_ctls_mutex);
@@ -636,15 +798,19 @@ static void edac_mc_scrub_block(unsigned long page, unsigned long offset,
 int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page)
 {
 	struct csrow_info *csrows = mci->csrows;
-	int row, i;
+	int row, i, j, n;
 
 	debugf1("MC%d: %s(): 0x%lx\n", mci->mc_idx, __func__, page);
 	row = -1;
 
 	for (i = 0; i < mci->nr_csrows; i++) {
 		struct csrow_info *csrow = &csrows[i];
-
-		if (csrow->nr_pages == 0)
+		n = 0;
+		for (j = 0; j < csrow->nr_channels; j++) {
+			struct dimm_info *dimm = csrow->channels[j].dimm;
+			n += dimm->nr_pages;
+		}
+		if (n == 0)
 			continue;
 
 		debugf3("MC%d: %s(): first(0x%lx) page(0x%lx) last(0x%lx) "
@@ -670,249 +836,307 @@ int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page)
 }
 EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page);
 
-/* FIXME - setable log (warning/emerg) levels */
-/* FIXME - integrate with evlog: http://evlog.sourceforge.net/ */
-void edac_mc_handle_ce(struct mem_ctl_info *mci,
-		unsigned long page_frame_number,
-		unsigned long offset_in_page, unsigned long syndrome,
-		int row, int channel, const char *msg)
-{
-	unsigned long remapped_page;
+const char *edac_layer_name[] = {
+	[EDAC_MC_LAYER_BRANCH] = "branch",
+	[EDAC_MC_LAYER_CHANNEL] = "channel",
+	[EDAC_MC_LAYER_SLOT] = "slot",
+	[EDAC_MC_LAYER_CHIP_SELECT] = "csrow",
+};
+EXPORT_SYMBOL_GPL(edac_layer_name);
 
-	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
+static void edac_inc_ce_error(struct mem_ctl_info *mci,
+				    bool enable_per_layer_report,
+				    const int pos[EDAC_MAX_LAYERS])
+{
+	int i, index = 0;
 
-	/* FIXME - maybe make panic on INTERNAL ERROR an option */
-	if (row >= mci->nr_csrows || row < 0) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: row out of range "
-			"(%d >= %d)\n", row, mci->nr_csrows);
-		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
-		return;
-	}
+	mci->ce_mc++;
 
-	if (channel >= mci->csrows[row].nr_channels || channel < 0) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: channel out of range "
-			"(%d >= %d)\n", channel,
-			mci->csrows[row].nr_channels);
-		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
+	if (!enable_per_layer_report) {
+		mci->ce_noinfo_count++;
 		return;
 	}
 
-	if (edac_mc_get_log_ce())
-		/* FIXME - put in DIMM location */
-		edac_mc_printk(mci, KERN_WARNING,
-			"CE page 0x%lx, offset 0x%lx, grain %d, syndrome "
-			"0x%lx, row %d, channel %d, label \"%s\": %s\n",
-			page_frame_number, offset_in_page,
-			mci->csrows[row].grain, syndrome, row, channel,
-			mci->csrows[row].channels[channel].label, msg);
-
-	mci->ce_count++;
-	mci->csrows[row].ce_count++;
-	mci->csrows[row].channels[channel].ce_count++;
-
-	if (mci->scrub_mode & SCRUB_SW_SRC) {
-		/*
-		 * Some MC's can remap memory so that it is still available
-		 * at a different address when PCI devices map into memory.
-		 * MC's that can't do this lose the memory where PCI devices
-		 * are mapped.  This mapping is MC dependent and so we call
-		 * back into the MC driver for it to map the MC page to
-		 * a physical (CPU) page which can then be mapped to a virtual
-		 * page - which can then be scrubbed.
-		 */
-		remapped_page = mci->ctl_page_to_phys ?
-			mci->ctl_page_to_phys(mci, page_frame_number) :
-			page_frame_number;
+	for (i = 0; i < mci->n_layers; i++) {
+		if (pos[i] < 0)
+			break;
+		index += pos[i];
+		mci->ce_per_layer[i][index]++;
 
-		edac_mc_scrub_block(remapped_page, offset_in_page,
-				mci->csrows[row].grain);
+		if (i < mci->n_layers - 1)
+			index *= mci->layers[i + 1].size;
 	}
 }
-EXPORT_SYMBOL_GPL(edac_mc_handle_ce);
 
-void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, const char *msg)
+static void edac_inc_ue_error(struct mem_ctl_info *mci,
+				    bool enable_per_layer_report,
+				    const int pos[EDAC_MAX_LAYERS])
 {
-	if (edac_mc_get_log_ce())
-		edac_mc_printk(mci, KERN_WARNING,
-			"CE - no information available: %s\n", msg);
+	int i, index = 0;
 
-	mci->ce_noinfo_count++;
-	mci->ce_count++;
-}
-EXPORT_SYMBOL_GPL(edac_mc_handle_ce_no_info);
+	mci->ue_mc++;
 
-void edac_mc_handle_ue(struct mem_ctl_info *mci,
-		unsigned long page_frame_number,
-		unsigned long offset_in_page, int row, const char *msg)
-{
-	int len = EDAC_MC_LABEL_LEN * 4;
-	char labels[len + 1];
-	char *pos = labels;
-	int chan;
-	int chars;
-
-	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
-
-	/* FIXME - maybe make panic on INTERNAL ERROR an option */
-	if (row >= mci->nr_csrows || row < 0) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: row out of range "
-			"(%d >= %d)\n", row, mci->nr_csrows);
-		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
+	if (!enable_per_layer_report) {
+		mci->ce_noinfo_count++;
 		return;
 	}
 
-	chars = snprintf(pos, len + 1, "%s",
-			 mci->csrows[row].channels[0].label);
-	len -= chars;
-	pos += chars;
+	for (i = 0; i < mci->n_layers; i++) {
+		if (pos[i] < 0)
+			break;
+		index += pos[i];
+		mci->ue_per_layer[i][index]++;
 
-	for (chan = 1; (chan < mci->csrows[row].nr_channels) && (len > 0);
-		chan++) {
-		chars = snprintf(pos, len + 1, ":%s",
-				 mci->csrows[row].channels[chan].label);
-		len -= chars;
-		pos += chars;
+		if (i < mci->n_layers - 1)
+			index *= mci->layers[i + 1].size;
 	}
+}
 
-	if (edac_mc_get_log_ue())
-		edac_mc_printk(mci, KERN_EMERG,
-			"UE page 0x%lx, offset 0x%lx, grain %d, row %d, "
-			"labels \"%s\": %s\n", page_frame_number,
-			offset_in_page, mci->csrows[row].grain, row,
-			labels, msg);
+static void edac_ce_error(struct mem_ctl_info *mci,
+			  const int pos[EDAC_MAX_LAYERS],
+			  const char *msg,
+			  const char *location,
+			  const char *label,
+			  const char *detail,
+			  const char *other_detail,
+			  const bool enable_per_layer_report,
+			  const unsigned long page_frame_number,
+			  const unsigned long offset_in_page,
+			  u32 grain)
+{
+	unsigned long remapped_page;
 
-	if (edac_mc_get_panic_on_ue())
-		panic("EDAC MC%d: UE page 0x%lx, offset 0x%lx, grain %d, "
-			"row %d, labels \"%s\": %s\n", mci->mc_idx,
-			page_frame_number, offset_in_page,
-			mci->csrows[row].grain, row, labels, msg);
+	if (edac_mc_get_log_ce()) {
+		if (other_detail && *other_detail)
+			edac_mc_printk(mci, KERN_WARNING,
+				       "CE %s on %s (%s%s - %s)\n",
+				       msg, label, location,
+				       detail, other_detail);
+		else
+			edac_mc_printk(mci, KERN_WARNING,
+				       "CE %s on %s (%s%s)\n",
+				       msg, label, location,
+				       detail);
+	}
+	edac_inc_ce_error(mci, enable_per_layer_report, pos);
 
-	mci->ue_count++;
-	mci->csrows[row].ue_count++;
+	if (mci->scrub_mode & SCRUB_SW_SRC) {
+		/*
+			* Some memory controllers (called MCs below) can remap
+			* memory so that it is still available at a different
+			* address when PCI devices map into memory.
+			* MC's that can't do this, lose the memory where PCI
+			* devices are mapped. This mapping is MC-dependent
+			* and so we call back into the MC driver for it to
+			* map the MC page to a physical (CPU) page which can
+			* then be mapped to a virtual page - which can then
+			* be scrubbed.
+			*/
+		remapped_page = mci->ctl_page_to_phys ?
+			mci->ctl_page_to_phys(mci, page_frame_number) :
+			page_frame_number;
+
+		edac_mc_scrub_block(remapped_page,
+					offset_in_page, grain);
+	}
 }
-EXPORT_SYMBOL_GPL(edac_mc_handle_ue);
 
-void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, const char *msg)
+static void edac_ue_error(struct mem_ctl_info *mci,
+			  const int pos[EDAC_MAX_LAYERS],
+			  const char *msg,
+			  const char *location,
+			  const char *label,
+			  const char *detail,
+			  const char *other_detail,
+			  const bool enable_per_layer_report)
 {
-	if (edac_mc_get_panic_on_ue())
-		panic("EDAC MC%d: Uncorrected Error", mci->mc_idx);
+	if (edac_mc_get_log_ue()) {
+		if (other_detail && *other_detail)
+			edac_mc_printk(mci, KERN_WARNING,
+				       "UE %s on %s (%s%s - %s)\n",
+			               msg, label, location, detail,
+				       other_detail);
+		else
+			edac_mc_printk(mci, KERN_WARNING,
+				       "UE %s on %s (%s%s)\n",
+			               msg, label, location, detail);
+	}
 
-	if (edac_mc_get_log_ue())
-		edac_mc_printk(mci, KERN_WARNING,
-			"UE - no information available: %s\n", msg);
-	mci->ue_noinfo_count++;
-	mci->ue_count++;
+	if (edac_mc_get_panic_on_ue()) {
+		if (other_detail && *other_detail)
+			panic("UE %s on %s (%s%s - %s)\n",
+			      msg, label, location, detail, other_detail);
+		else
+			panic("UE %s on %s (%s%s)\n",
+			      msg, label, location, detail);
+	}
+
+	edac_inc_ue_error(mci, enable_per_layer_report, pos);
 }
-EXPORT_SYMBOL_GPL(edac_mc_handle_ue_no_info);
 
-/*************************************************************
- * On Fully Buffered DIMM modules, this help function is
- * called to process UE events
- */
-void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
-			unsigned int csrow,
-			unsigned int channela,
-			unsigned int channelb, char *msg)
+#define OTHER_LABEL " or "
+void edac_mc_handle_error(const enum hw_event_mc_err_type type,
+			  struct mem_ctl_info *mci,
+			  const unsigned long page_frame_number,
+			  const unsigned long offset_in_page,
+			  const unsigned long syndrome,
+			  const int layer0,
+			  const int layer1,
+			  const int layer2,
+			  const char *msg,
+			  const char *other_detail,
+			  const void *mcelog)
 {
-	int len = EDAC_MC_LABEL_LEN * 4;
-	char labels[len + 1];
-	char *pos = labels;
-	int chars;
+	/* FIXME: too much for stack: move it to some pre-alocated area */
+	char detail[80], location[80];
+	char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms];
+	char *p;
+	int row = -1, chan = -1;
+	int pos[EDAC_MAX_LAYERS] = { layer0, layer1, layer2 };
+	int i;
+	u32 grain;
+	bool enable_per_layer_report = false;
 
-	if (csrow >= mci->nr_csrows) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: row out of range (%d >= %d)\n",
-			csrow, mci->nr_csrows);
-		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
-		return;
-	}
+	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
 
-	if (channela >= mci->csrows[csrow].nr_channels) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: channel-a out of range "
-			"(%d >= %d)\n",
-			channela, mci->csrows[csrow].nr_channels);
-		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
-		return;
+	/*
+	 * Check if the event report is consistent and if the memory
+	 * location is known. If it is known, enable_per_layer_report will be
+	 * true, the DIMM(s) label info will be filled and the per-layer
+	 * error counters will be incremented.
+	 */
+	for (i = 0; i < mci->n_layers; i++) {
+		if (pos[i] >= (int)mci->layers[i].size) {
+			if (type == HW_EVENT_ERR_CORRECTED)
+				p = "CE";
+			else
+				p = "UE";
+
+			edac_mc_printk(mci, KERN_ERR,
+				       "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
+				       edac_layer_name[mci->layers[i].type],
+				       pos[i], mci->layers[i].size);
+			/*
+			 * Instead of just returning it, let's use what's
+			 * known about the error. The increment routines and
+			 * the DIMM filter logic will do the right thing by
+			 * pointing the likely damaged DIMMs.
+			 */
+			pos[i] = -1;
+		}
+		if (pos[i] >= 0)
+			enable_per_layer_report = true;
 	}
 
-	if (channelb >= mci->csrows[csrow].nr_channels) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: channel-b out of range "
-			"(%d >= %d)\n",
-			channelb, mci->csrows[csrow].nr_channels);
-		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
-		return;
-	}
+	/*
+	 * Get the dimm label/grain that applies to the match criteria.
+	 * As the error algorithm may not be able to point to just one memory
+	 * stick, the logic here will get all possible labels that could
+	 * pottentially be affected by the error.
+	 * On FB-DIMM memory controllers, for uncorrected errors, it is common
+	 * to have only the MC channel and the MC dimm (also called "branch")
+	 * but the channel is not known, as the memory is arranged in pairs,
+	 * where each memory belongs to a separate channel within the same
+	 * branch.
+	 */
+	grain = 0;
+	p = label;
+	*p = '\0';
+	for (i = 0; i < mci->tot_dimms; i++) {
+		struct dimm_info *dimm = &mci->dimms[i];
 
-	mci->ue_count++;
-	mci->csrows[csrow].ue_count++;
+		if (layer0 >= 0 && layer0 != dimm->location[0])
+			continue;
+		if (layer1 >= 0 && layer1 != dimm->location[1])
+			continue;
+		if (layer2 >= 0 && layer2 != dimm->location[2])
+			continue;
 
-	/* Generate the DIMM labels from the specified channels */
-	chars = snprintf(pos, len + 1, "%s",
-			 mci->csrows[csrow].channels[channela].label);
-	len -= chars;
-	pos += chars;
-	chars = snprintf(pos, len + 1, "-%s",
-			 mci->csrows[csrow].channels[channelb].label);
+		/* get the max grain, over the error match range */
+		if (dimm->grain > grain)
+			grain = dimm->grain;
 
-	if (edac_mc_get_log_ue())
-		edac_mc_printk(mci, KERN_EMERG,
-			"UE row %d, channel-a= %d channel-b= %d "
-			"labels \"%s\": %s\n", csrow, channela, channelb,
-			labels, msg);
+		/*
+		 * If the error is memory-controller wide, there's no need to
+		 * seek for the affected DIMMs because the whole
+		 * channel/memory controller/...  may be affected.
+		 * Also, don't show errors for empty DIMM slots.
+		 */
+		if (enable_per_layer_report && dimm->nr_pages) {
+			if (p != label) {
+				strcpy(p, OTHER_LABEL);
+				p += strlen(OTHER_LABEL);
+			}
+			strcpy(p, dimm->label);
+			p += strlen(p);
+			*p = '\0';
+
+			/*
+			 * get csrow/channel of the DIMM, in order to allow
+			 * incrementing the compat API counters
+			 */
+			debugf4("%s: %s csrows map: (%d,%d)\n",
+				__func__,
+				mci->mem_is_per_rank ? "rank" : "dimm",
+				dimm->csrow, dimm->cschannel);
+
+			if (row == -1)
+				row = dimm->csrow;
+			else if (row >= 0 && row != dimm->csrow)
+				row = -2;
+
+			if (chan == -1)
+				chan = dimm->cschannel;
+			else if (chan >= 0 && chan != dimm->cschannel)
+				chan = -2;
+		}
+	}
 
-	if (edac_mc_get_panic_on_ue())
-		panic("UE row %d, channel-a= %d channel-b= %d "
-			"labels \"%s\": %s\n", csrow, channela,
-			channelb, labels, msg);
-}
-EXPORT_SYMBOL(edac_mc_handle_fbd_ue);
+	if (!enable_per_layer_report) {
+		strcpy(label, "any memory");
+	} else {
+		debugf4("%s: csrow/channel to increment: (%d,%d)\n",
+			__func__, row, chan);
+		if (p == label)
+			strcpy(label, "unknown memory");
+		if (type == HW_EVENT_ERR_CORRECTED) {
+			if (row >= 0) {
+				mci->csrows[row].ce_count++;
+				if (chan >= 0)
+					mci->csrows[row].channels[chan].ce_count++;
+			}
+		} else
+			if (row >= 0)
+				mci->csrows[row].ue_count++;
+	}
 
-/*************************************************************
- * On Fully Buffered DIMM modules, this help function is
- * called to process CE events
- */
-void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
-			unsigned int csrow, unsigned int channel, char *msg)
-{
+	/* Fill the RAM location data */
+	p = location;
+	for (i = 0; i < mci->n_layers; i++) {
+		if (pos[i] < 0)
+			continue;
 
-	/* Ensure boundary values */
-	if (csrow >= mci->nr_csrows) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: row out of range (%d >= %d)\n",
-			csrow, mci->nr_csrows);
-		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
-		return;
-	}
-	if (channel >= mci->csrows[csrow].nr_channels) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: channel out of range (%d >= %d)\n",
-			channel, mci->csrows[csrow].nr_channels);
-		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
-		return;
+		p += sprintf(p, "%s:%d ",
+			     edac_layer_name[mci->layers[i].type],
+			     pos[i]);
 	}
 
-	if (edac_mc_get_log_ce())
-		/* FIXME - put in DIMM location */
-		edac_mc_printk(mci, KERN_WARNING,
-			"CE row %d, channel %d, label \"%s\": %s\n",
-			csrow, channel,
-			mci->csrows[csrow].channels[channel].label, msg);
+	/* Memory type dependent details about the error */
+	if (type == HW_EVENT_ERR_CORRECTED) {
+		snprintf(detail, sizeof(detail),
+			"page:0x%lx offset:0x%lx grain:%d syndrome:0x%lx",
+			page_frame_number, offset_in_page,
+			grain, syndrome);
+		edac_ce_error(mci, pos, msg, location, label, detail,
+			      other_detail, enable_per_layer_report,
+			      page_frame_number, offset_in_page, grain);
+	} else {
+		snprintf(detail, sizeof(detail),
+			"page:0x%lx offset:0x%lx grain:%d",
+			page_frame_number, offset_in_page, grain);
 
-	mci->ce_count++;
-	mci->csrows[csrow].ce_count++;
-	mci->csrows[csrow].channels[channel].ce_count++;
+		edac_ue_error(mci, pos, msg, location, label, detail,
+			      other_detail, enable_per_layer_report);
+	}
 }
-EXPORT_SYMBOL(edac_mc_handle_fbd_ce);
+EXPORT_SYMBOL_GPL(edac_mc_handle_error);
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index e9a28f576d14..f6a29b0eedc8 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -144,25 +144,31 @@ static ssize_t csrow_ce_count_show(struct csrow_info *csrow, char *data,
 static ssize_t csrow_size_show(struct csrow_info *csrow, char *data,
 				int private)
 {
-	return sprintf(data, "%u\n", PAGES_TO_MiB(csrow->nr_pages));
+	int i;
+	u32 nr_pages = 0;
+
+	for (i = 0; i < csrow->nr_channels; i++)
+		nr_pages += csrow->channels[i].dimm->nr_pages;
+
+	return sprintf(data, "%u\n", PAGES_TO_MiB(nr_pages));
 }
 
 static ssize_t csrow_mem_type_show(struct csrow_info *csrow, char *data,
 				int private)
 {
-	return sprintf(data, "%s\n", mem_types[csrow->mtype]);
+	return sprintf(data, "%s\n", mem_types[csrow->channels[0].dimm->mtype]);
 }
 
 static ssize_t csrow_dev_type_show(struct csrow_info *csrow, char *data,
 				int private)
 {
-	return sprintf(data, "%s\n", dev_types[csrow->dtype]);
+	return sprintf(data, "%s\n", dev_types[csrow->channels[0].dimm->dtype]);
 }
 
 static ssize_t csrow_edac_mode_show(struct csrow_info *csrow, char *data,
 				int private)
 {
-	return sprintf(data, "%s\n", edac_caps[csrow->edac_mode]);
+	return sprintf(data, "%s\n", edac_caps[csrow->channels[0].dimm->edac_mode]);
 }
 
 /* show/store functions for DIMM Label attributes */
@@ -170,11 +176,11 @@ static ssize_t channel_dimm_label_show(struct csrow_info *csrow,
 				char *data, int channel)
 {
 	/* if field has not been initialized, there is nothing to send */
-	if (!csrow->channels[channel].label[0])
+	if (!csrow->channels[channel].dimm->label[0])
 		return 0;
 
 	return snprintf(data, EDAC_MC_LABEL_LEN, "%s\n",
-			csrow->channels[channel].label);
+			csrow->channels[channel].dimm->label);
 }
 
 static ssize_t channel_dimm_label_store(struct csrow_info *csrow,
@@ -184,8 +190,8 @@ static ssize_t channel_dimm_label_store(struct csrow_info *csrow,
 	ssize_t max_size = 0;
 
 	max_size = min((ssize_t) count, (ssize_t) EDAC_MC_LABEL_LEN - 1);
-	strncpy(csrow->channels[channel].label, data, max_size);
-	csrow->channels[channel].label[max_size] = '\0';
+	strncpy(csrow->channels[channel].dimm->label, data, max_size);
+	csrow->channels[channel].dimm->label[max_size] = '\0';
 
 	return max_size;
 }
@@ -419,8 +425,8 @@ static ssize_t mci_reset_counters_store(struct mem_ctl_info *mci,
 
 	mci->ue_noinfo_count = 0;
 	mci->ce_noinfo_count = 0;
-	mci->ue_count = 0;
-	mci->ce_count = 0;
+	mci->ue_mc = 0;
+	mci->ce_mc = 0;
 
 	for (row = 0; row < mci->nr_csrows; row++) {
 		struct csrow_info *ri = &mci->csrows[row];
@@ -489,12 +495,12 @@ static ssize_t mci_sdram_scrub_rate_show(struct mem_ctl_info *mci, char *data)
 /* default attribute files for the MCI object */
 static ssize_t mci_ue_count_show(struct mem_ctl_info *mci, char *data)
 {
-	return sprintf(data, "%d\n", mci->ue_count);
+	return sprintf(data, "%d\n", mci->ue_mc);
 }
 
 static ssize_t mci_ce_count_show(struct mem_ctl_info *mci, char *data)
 {
-	return sprintf(data, "%d\n", mci->ce_count);
+	return sprintf(data, "%d\n", mci->ce_mc);
 }
 
 static ssize_t mci_ce_noinfo_show(struct mem_ctl_info *mci, char *data)
@@ -519,16 +525,16 @@ static ssize_t mci_ctl_name_show(struct mem_ctl_info *mci, char *data)
 
 static ssize_t mci_size_mb_show(struct mem_ctl_info *mci, char *data)
 {
-	int total_pages, csrow_idx;
+	int total_pages = 0, csrow_idx, j;
 
-	for (total_pages = csrow_idx = 0; csrow_idx < mci->nr_csrows;
-		csrow_idx++) {
+	for (csrow_idx = 0; csrow_idx < mci->nr_csrows; csrow_idx++) {
 		struct csrow_info *csrow = &mci->csrows[csrow_idx];
 
-		if (!csrow->nr_pages)
-			continue;
+		for (j = 0; j < csrow->nr_channels; j++) {
+			struct dimm_info *dimm = csrow->channels[j].dimm;
 
-		total_pages += csrow->nr_pages;
+			total_pages += dimm->nr_pages;
+		}
 	}
 
 	return sprintf(data, "%u\n", PAGES_TO_MiB(total_pages));
@@ -900,7 +906,7 @@ static void edac_remove_mci_instance_attributes(struct mem_ctl_info *mci,
  */
 int edac_create_sysfs_mci_device(struct mem_ctl_info *mci)
 {
-	int i;
+	int i, j;
 	int err;
 	struct csrow_info *csrow;
 	struct kobject *kobj_mci = &mci->edac_mci_kobj;
@@ -934,10 +940,13 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci)
 	/* Make directories for each CSROW object under the mc<id> kobject
 	 */
 	for (i = 0; i < mci->nr_csrows; i++) {
+		int nr_pages = 0;
+
 		csrow = &mci->csrows[i];
+		for (j = 0; j < csrow->nr_channels; j++)
+			nr_pages += csrow->channels[j].dimm->nr_pages;
 
-		/* Only expose populated CSROWs */
-		if (csrow->nr_pages > 0) {
+		if (nr_pages > 0) {
 			err = edac_create_csrow_object(mci, csrow, i);
 			if (err) {
 				debugf1("%s() failure: create csrow %d obj\n",
@@ -949,12 +958,15 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci)
 
 	return 0;
 
-	/* CSROW error: backout what has already been registered,  */
 fail1:
 	for (i--; i >= 0; i--) {
-		if (csrow->nr_pages > 0) {
+		int nr_pages = 0;
+
+		csrow = &mci->csrows[i];
+		for (j = 0; j < csrow->nr_channels; j++)
+			nr_pages += csrow->channels[j].dimm->nr_pages;
+		if (nr_pages > 0)
 			kobject_put(&mci->csrows[i].kobj);
-		}
 	}
 
 	/* remove the mci instance's attributes, if any */
@@ -973,14 +985,20 @@ fail0:
  */
 void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
 {
-	int i;
+	struct csrow_info *csrow;
+	int i, j;
 
 	debugf0("%s()\n", __func__);
 
 	/* remove all csrow kobjects */
 	debugf4("%s()  unregister this mci kobj\n", __func__);
 	for (i = 0; i < mci->nr_csrows; i++) {
-		if (mci->csrows[i].nr_pages > 0) {
+		int nr_pages = 0;
+
+		csrow = &mci->csrows[i];
+		for (j = 0; j < csrow->nr_channels; j++)
+			nr_pages += csrow->channels[j].dimm->nr_pages;
+		if (nr_pages > 0) {
 			debugf0("%s()  unreg csrow-%d\n", __func__, i);
 			kobject_put(&mci->csrows[i].kobj);
 		}
diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h
index 00f81b47a51f..0ea7d14cb930 100644
--- a/drivers/edac/edac_module.h
+++ b/drivers/edac/edac_module.h
@@ -50,7 +50,7 @@ extern void edac_device_reset_delay_period(struct edac_device_ctl_info
 					   *edac_dev, unsigned long value);
 extern void edac_mc_reset_delay_period(int value);
 
-extern void *edac_align_ptr(void *ptr, unsigned size);
+extern void *edac_align_ptr(void **p, unsigned size, int n_elems);
 
 /*
  * EDAC PCI functions
diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c
index 63af1c5673d1..f1ac86649886 100644
--- a/drivers/edac/edac_pci.c
+++ b/drivers/edac/edac_pci.c
@@ -42,13 +42,13 @@ struct edac_pci_ctl_info *edac_pci_alloc_ctl_info(unsigned int sz_pvt,
 						const char *edac_pci_name)
 {
 	struct edac_pci_ctl_info *pci;
-	void *pvt;
+	void *p = NULL, *pvt;
 	unsigned int size;
 
 	debugf1("%s()\n", __func__);
 
-	pci = (struct edac_pci_ctl_info *)0;
-	pvt = edac_align_ptr(&pci[1], sz_pvt);
+	pci = edac_align_ptr(&p, sizeof(*pci), 1);
+	pvt = edac_align_ptr(&p, 1, sz_pvt);
 	size = ((unsigned long)pvt) + sz_pvt;
 
 	/* Alloc the needed control struct memory */
diff --git a/drivers/edac/i3000_edac.c b/drivers/edac/i3000_edac.c
index 277689a68841..8ad1744faacd 100644
--- a/drivers/edac/i3000_edac.c
+++ b/drivers/edac/i3000_edac.c
@@ -245,7 +245,9 @@ static int i3000_process_error_info(struct mem_ctl_info *mci,
 		return 1;
 
 	if ((info->errsts ^ info->errsts2) & I3000_ERRSTS_BITS) {
-		edac_mc_handle_ce_no_info(mci, "UE overwrote CE");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0,
+				     -1, -1, -1,
+				     "UE overwrote CE", "", NULL);
 		info->errsts = info->errsts2;
 	}
 
@@ -256,10 +258,15 @@ static int i3000_process_error_info(struct mem_ctl_info *mci,
 	row = edac_mc_find_csrow_by_page(mci, pfn);
 
 	if (info->errsts & I3000_ERRSTS_UE)
-		edac_mc_handle_ue(mci, pfn, offset, row, "i3000 UE");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     pfn, offset, 0,
+				     row, -1, -1,
+				     "i3000 UE", "", NULL);
 	else
-		edac_mc_handle_ce(mci, pfn, offset, info->derrsyn, row,
-				multi_chan ? channel : 0, "i3000 CE");
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     pfn, offset, info->derrsyn,
+				     row, multi_chan ? channel : 0, -1,
+				     "i3000 CE", "", NULL);
 
 	return 1;
 }
@@ -304,9 +311,10 @@ static int i3000_is_interleaved(const unsigned char *c0dra,
 static int i3000_probe1(struct pci_dev *pdev, int dev_idx)
 {
 	int rc;
-	int i;
+	int i, j;
 	struct mem_ctl_info *mci = NULL;
-	unsigned long last_cumul_size;
+	struct edac_mc_layer layers[2];
+	unsigned long last_cumul_size, nr_pages;
 	int interleaved, nr_channels;
 	unsigned char dra[I3000_RANKS / 2], drb[I3000_RANKS];
 	unsigned char *c0dra = dra, *c1dra = &dra[I3000_RANKS_PER_CHANNEL / 2];
@@ -347,7 +355,14 @@ static int i3000_probe1(struct pci_dev *pdev, int dev_idx)
 	 */
 	interleaved = i3000_is_interleaved(c0dra, c1dra, c0drb, c1drb);
 	nr_channels = interleaved ? 2 : 1;
-	mci = edac_mc_alloc(0, I3000_RANKS / nr_channels, nr_channels, 0);
+
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = I3000_RANKS / nr_channels;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = nr_channels;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, 0);
 	if (!mci)
 		return -ENOMEM;
 
@@ -386,19 +401,23 @@ static int i3000_probe1(struct pci_dev *pdev, int dev_idx)
 			cumul_size <<= 1;
 		debugf3("MC: %s(): (%d) cumul_size 0x%x\n",
 			__func__, i, cumul_size);
-		if (cumul_size == last_cumul_size) {
-			csrow->mtype = MEM_EMPTY;
+		if (cumul_size == last_cumul_size)
 			continue;
-		}
 
 		csrow->first_page = last_cumul_size;
 		csrow->last_page = cumul_size - 1;
-		csrow->nr_pages = cumul_size - last_cumul_size;
+		nr_pages = cumul_size - last_cumul_size;
 		last_cumul_size = cumul_size;
-		csrow->grain = I3000_DEAP_GRAIN;
-		csrow->mtype = MEM_DDR2;
-		csrow->dtype = DEV_UNKNOWN;
-		csrow->edac_mode = EDAC_UNKNOWN;
+
+		for (j = 0; j < nr_channels; j++) {
+			struct dimm_info *dimm = csrow->channels[j].dimm;
+
+			dimm->nr_pages = nr_pages / nr_channels;
+			dimm->grain = I3000_DEAP_GRAIN;
+			dimm->mtype = MEM_DDR2;
+			dimm->dtype = DEV_UNKNOWN;
+			dimm->edac_mode = EDAC_UNKNOWN;
+		}
 	}
 
 	/*
diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c
index 046808c6357d..bbe43ef71823 100644
--- a/drivers/edac/i3200_edac.c
+++ b/drivers/edac/i3200_edac.c
@@ -23,6 +23,7 @@
 
 #define PCI_DEVICE_ID_INTEL_3200_HB    0x29f0
 
+#define I3200_DIMMS		4
 #define I3200_RANKS		8
 #define I3200_RANKS_PER_CHANNEL	4
 #define I3200_CHANNELS		2
@@ -217,21 +218,25 @@ static void i3200_process_error_info(struct mem_ctl_info *mci,
 		return;
 
 	if ((info->errsts ^ info->errsts2) & I3200_ERRSTS_BITS) {
-		edac_mc_handle_ce_no_info(mci, "UE overwrote CE");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0,
+				     -1, -1, -1, "UE overwrote CE", "", NULL);
 		info->errsts = info->errsts2;
 	}
 
 	for (channel = 0; channel < nr_channels; channel++) {
 		log = info->eccerrlog[channel];
 		if (log & I3200_ECCERRLOG_UE) {
-			edac_mc_handle_ue(mci, 0, 0,
-				eccerrlog_row(channel, log),
-				"i3200 UE");
+			edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+					     0, 0, 0,
+					     eccerrlog_row(channel, log),
+					     -1, -1,
+					     "i3000 UE", "", NULL);
 		} else if (log & I3200_ECCERRLOG_CE) {
-			edac_mc_handle_ce(mci, 0, 0,
-				eccerrlog_syndrome(log),
-				eccerrlog_row(channel, log), 0,
-				"i3200 CE");
+			edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+					     0, 0, eccerrlog_syndrome(log),
+					     eccerrlog_row(channel, log),
+					     -1, -1,
+					     "i3000 UE", "", NULL);
 		}
 	}
 }
@@ -319,9 +324,9 @@ static unsigned long drb_to_nr_pages(
 static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
 {
 	int rc;
-	int i;
+	int i, j;
 	struct mem_ctl_info *mci = NULL;
-	unsigned long last_page;
+	struct edac_mc_layer layers[2];
 	u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL];
 	bool stacked;
 	void __iomem *window;
@@ -336,8 +341,14 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
 	i3200_get_drbs(window, drbs);
 	nr_channels = how_many_channels(pdev);
 
-	mci = edac_mc_alloc(sizeof(struct i3200_priv), I3200_RANKS,
-		nr_channels, 0);
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = I3200_DIMMS;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = nr_channels;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers,
+			    sizeof(struct i3200_priv));
 	if (!mci)
 		return -ENOMEM;
 
@@ -366,7 +377,6 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
 	 * cumulative; the last one will contain the total memory
 	 * contained in all ranks.
 	 */
-	last_page = -1UL;
 	for (i = 0; i < mci->nr_csrows; i++) {
 		unsigned long nr_pages;
 		struct csrow_info *csrow = &mci->csrows[i];
@@ -375,20 +385,18 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
 			i / I3200_RANKS_PER_CHANNEL,
 			i % I3200_RANKS_PER_CHANNEL);
 
-		if (nr_pages == 0) {
-			csrow->mtype = MEM_EMPTY;
+		if (nr_pages == 0)
 			continue;
-		}
 
-		csrow->first_page = last_page + 1;
-		last_page += nr_pages;
-		csrow->last_page = last_page;
-		csrow->nr_pages = nr_pages;
+		for (j = 0; j < nr_channels; j++) {
+			struct dimm_info *dimm = csrow->channels[j].dimm;
 
-		csrow->grain = nr_pages << PAGE_SHIFT;
-		csrow->mtype = MEM_DDR2;
-		csrow->dtype = DEV_UNKNOWN;
-		csrow->edac_mode = EDAC_UNKNOWN;
+			dimm->nr_pages = nr_pages / nr_channels;
+			dimm->grain = nr_pages << PAGE_SHIFT;
+			dimm->mtype = MEM_DDR2;
+			dimm->dtype = DEV_UNKNOWN;
+			dimm->edac_mode = EDAC_UNKNOWN;
+		}
 	}
 
 	i3200_clear_error_info(mci);
diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c
index a2680d8e744b..11ea835f155a 100644
--- a/drivers/edac/i5000_edac.c
+++ b/drivers/edac/i5000_edac.c
@@ -270,7 +270,8 @@
 #define MTR3		0x8C
 
 #define NUM_MTRS		4
-#define CHANNELS_PER_BRANCH	(2)
+#define CHANNELS_PER_BRANCH	2
+#define MAX_BRANCHES		2
 
 /* Defines to extract the vaious fields from the
  *	MTRx - Memory Technology Registers
@@ -473,7 +474,6 @@ static void i5000_process_fatal_error_info(struct mem_ctl_info *mci,
 	char msg[EDAC_MC_LABEL_LEN + 1 + 160];
 	char *specific = NULL;
 	u32 allErrors;
-	int branch;
 	int channel;
 	int bank;
 	int rank;
@@ -485,8 +485,7 @@ static void i5000_process_fatal_error_info(struct mem_ctl_info *mci,
 	if (!allErrors)
 		return;		/* if no error, return now */
 
-	branch = EXTRACT_FBDCHAN_INDX(info->ferr_fat_fbd);
-	channel = branch;
+	channel = EXTRACT_FBDCHAN_INDX(info->ferr_fat_fbd);
 
 	/* Use the NON-Recoverable macros to extract data */
 	bank = NREC_BANK(info->nrecmema);
@@ -495,9 +494,9 @@ static void i5000_process_fatal_error_info(struct mem_ctl_info *mci,
 	ras = NREC_RAS(info->nrecmemb);
 	cas = NREC_CAS(info->nrecmemb);
 
-	debugf0("\t\tCSROW= %d  Channels= %d,%d  (Branch= %d "
-		"DRAM Bank= %d rdwr= %s ras= %d cas= %d)\n",
-		rank, channel, channel + 1, branch >> 1, bank,
+	debugf0("\t\tCSROW= %d  Channel= %d "
+		"(DRAM Bank= %d rdwr= %s ras= %d cas= %d)\n",
+		rank, channel, bank,
 		rdwr ? "Write" : "Read", ras, cas);
 
 	/* Only 1 bit will be on */
@@ -533,13 +532,14 @@ static void i5000_process_fatal_error_info(struct mem_ctl_info *mci,
 
 	/* Form out message */
 	snprintf(msg, sizeof(msg),
-		 "(Branch=%d DRAM-Bank=%d RDWR=%s RAS=%d CAS=%d "
-		 "FATAL Err=0x%x (%s))",
-		 branch >> 1, bank, rdwr ? "Write" : "Read", ras, cas,
-		 allErrors, specific);
+		 "Bank=%d RAS=%d CAS=%d FATAL Err=0x%x (%s)",
+		 bank, ras, cas, allErrors, specific);
 
 	/* Call the helper to output message */
-	edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg);
+	edac_mc_handle_error(HW_EVENT_ERR_FATAL, mci, 0, 0, 0,
+			     channel >> 1, channel & 1, rank,
+			     rdwr ? "Write error" : "Read error",
+			     msg, NULL);
 }
 
 /*
@@ -633,13 +633,14 @@ static void i5000_process_nonfatal_error_info(struct mem_ctl_info *mci,
 
 		/* Form out message */
 		snprintf(msg, sizeof(msg),
-			 "(Branch=%d DRAM-Bank=%d RDWR=%s RAS=%d "
-			 "CAS=%d, UE Err=0x%x (%s))",
-			 branch >> 1, bank, rdwr ? "Write" : "Read", ras, cas,
-			 ue_errors, specific);
+			 "Rank=%d Bank=%d RAS=%d CAS=%d, UE Err=0x%x (%s)",
+			 rank, bank, ras, cas, ue_errors, specific);
 
 		/* Call the helper to output message */
-		edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg);
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0,
+				channel >> 1, -1, rank,
+				rdwr ? "Write error" : "Read error",
+				msg, NULL);
 	}
 
 	/* Check correctable errors */
@@ -685,13 +686,16 @@ static void i5000_process_nonfatal_error_info(struct mem_ctl_info *mci,
 
 		/* Form out message */
 		snprintf(msg, sizeof(msg),
-			 "(Branch=%d DRAM-Bank=%d RDWR=%s RAS=%d "
+			 "Rank=%d Bank=%d RDWR=%s RAS=%d "
 			 "CAS=%d, CE Err=0x%x (%s))", branch >> 1, bank,
 			 rdwr ? "Write" : "Read", ras, cas, ce_errors,
 			 specific);
 
 		/* Call the helper to output message */
-		edac_mc_handle_fbd_ce(mci, rank, channel, msg);
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 0, 0, 0,
+				channel >> 1, channel % 2, rank,
+				rdwr ? "Write error" : "Read error",
+				msg, NULL);
 	}
 
 	if (!misc_messages)
@@ -731,11 +735,12 @@ static void i5000_process_nonfatal_error_info(struct mem_ctl_info *mci,
 
 		/* Form out message */
 		snprintf(msg, sizeof(msg),
-			 "(Branch=%d Err=%#x (%s))", branch >> 1,
-			 misc_errors, specific);
+			 "Err=%#x (%s)", misc_errors, specific);
 
 		/* Call the helper to output message */
-		edac_mc_handle_fbd_ce(mci, 0, 0, msg);
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 0, 0, 0,
+				branch >> 1, -1, -1,
+				"Misc error", msg, NULL);
 	}
 }
 
@@ -956,14 +961,14 @@ static int determine_amb_present_reg(struct i5000_pvt *pvt, int channel)
  *
  *	return the proper MTR register as determine by the csrow and channel desired
  */
-static int determine_mtr(struct i5000_pvt *pvt, int csrow, int channel)
+static int determine_mtr(struct i5000_pvt *pvt, int slot, int channel)
 {
 	int mtr;
 
 	if (channel < CHANNELS_PER_BRANCH)
-		mtr = pvt->b0_mtr[csrow >> 1];
+		mtr = pvt->b0_mtr[slot];
 	else
-		mtr = pvt->b1_mtr[csrow >> 1];
+		mtr = pvt->b1_mtr[slot];
 
 	return mtr;
 }
@@ -988,37 +993,34 @@ static void decode_mtr(int slot_row, u16 mtr)
 	debugf2("\t\tNUMCOL: %s\n", numcol_toString[MTR_DIMM_COLS(mtr)]);
 }
 
-static void handle_channel(struct i5000_pvt *pvt, int csrow, int channel,
+static void handle_channel(struct i5000_pvt *pvt, int slot, int channel,
 			struct i5000_dimm_info *dinfo)
 {
 	int mtr;
 	int amb_present_reg;
 	int addrBits;
 
-	mtr = determine_mtr(pvt, csrow, channel);
+	mtr = determine_mtr(pvt, slot, channel);
 	if (MTR_DIMMS_PRESENT(mtr)) {
 		amb_present_reg = determine_amb_present_reg(pvt, channel);
 
-		/* Determine if there is  a  DIMM present in this DIMM slot */
-		if (amb_present_reg & (1 << (csrow >> 1))) {
+		/* Determine if there is a DIMM present in this DIMM slot */
+		if (amb_present_reg) {
 			dinfo->dual_rank = MTR_DIMM_RANK(mtr);
 
-			if (!((dinfo->dual_rank == 0) &&
-				((csrow & 0x1) == 0x1))) {
-				/* Start with the number of bits for a Bank
-				 * on the DRAM */
-				addrBits = MTR_DRAM_BANKS_ADDR_BITS(mtr);
-				/* Add thenumber of ROW bits */
-				addrBits += MTR_DIMM_ROWS_ADDR_BITS(mtr);
-				/* add the number of COLUMN bits */
-				addrBits += MTR_DIMM_COLS_ADDR_BITS(mtr);
-
-				addrBits += 6;	/* add 64 bits per DIMM */
-				addrBits -= 20;	/* divide by 2^^20 */
-				addrBits -= 3;	/* 8 bits per bytes */
-
-				dinfo->megabytes = 1 << addrBits;
-			}
+			/* Start with the number of bits for a Bank
+				* on the DRAM */
+			addrBits = MTR_DRAM_BANKS_ADDR_BITS(mtr);
+			/* Add the number of ROW bits */
+			addrBits += MTR_DIMM_ROWS_ADDR_BITS(mtr);
+			/* add the number of COLUMN bits */
+			addrBits += MTR_DIMM_COLS_ADDR_BITS(mtr);
+
+			addrBits += 6;	/* add 64 bits per DIMM */
+			addrBits -= 20;	/* divide by 2^^20 */
+			addrBits -= 3;	/* 8 bits per bytes */
+
+			dinfo->megabytes = 1 << addrBits;
 		}
 	}
 }
@@ -1032,10 +1034,9 @@ static void handle_channel(struct i5000_pvt *pvt, int csrow, int channel,
 static void calculate_dimm_size(struct i5000_pvt *pvt)
 {
 	struct i5000_dimm_info *dinfo;
-	int csrow, max_csrows;
+	int slot, channel, branch;
 	char *p, *mem_buffer;
 	int space, n;
-	int channel;
 
 	/* ================= Generate some debug output ================= */
 	space = PAGE_SIZE;
@@ -1046,22 +1047,17 @@ static void calculate_dimm_size(struct i5000_pvt *pvt)
 		return;
 	}
 
-	n = snprintf(p, space, "\n");
-	p += n;
-	space -= n;
-
-	/* Scan all the actual CSROWS (which is # of DIMMS * 2)
+	/* Scan all the actual slots
 	 * and calculate the information for each DIMM
-	 * Start with the highest csrow first, to display it first
-	 * and work toward the 0th csrow
+	 * Start with the highest slot first, to display it first
+	 * and work toward the 0th slot
 	 */
-	max_csrows = pvt->maxdimmperch * 2;
-	for (csrow = max_csrows - 1; csrow >= 0; csrow--) {
+	for (slot = pvt->maxdimmperch - 1; slot >= 0; slot--) {
 
-		/* on an odd csrow, first output a 'boundary' marker,
+		/* on an odd slot, first output a 'boundary' marker,
 		 * then reset the message buffer  */
-		if (csrow & 0x1) {
-			n = snprintf(p, space, "---------------------------"
+		if (slot & 0x1) {
+			n = snprintf(p, space, "--------------------------"
 				"--------------------------------");
 			p += n;
 			space -= n;
@@ -1069,30 +1065,39 @@ static void calculate_dimm_size(struct i5000_pvt *pvt)
 			p = mem_buffer;
 			space = PAGE_SIZE;
 		}
-		n = snprintf(p, space, "csrow %2d    ", csrow);
+		n = snprintf(p, space, "slot %2d    ", slot);
 		p += n;
 		space -= n;
 
 		for (channel = 0; channel < pvt->maxch; channel++) {
-			dinfo = &pvt->dimm_info[csrow][channel];
-			handle_channel(pvt, csrow, channel, dinfo);
-			n = snprintf(p, space, "%4d MB   | ", dinfo->megabytes);
+			dinfo = &pvt->dimm_info[slot][channel];
+			handle_channel(pvt, slot, channel, dinfo);
+			if (dinfo->megabytes)
+				n = snprintf(p, space, "%4d MB %dR| ",
+					     dinfo->megabytes, dinfo->dual_rank + 1);
+			else
+				n = snprintf(p, space, "%4d MB   | ", 0);
 			p += n;
 			space -= n;
 		}
-		n = snprintf(p, space, "\n");
 		p += n;
 		space -= n;
+		debugf2("%s\n", mem_buffer);
+		p = mem_buffer;
+		space = PAGE_SIZE;
 	}
 
 	/* Output the last bottom 'boundary' marker */
-	n = snprintf(p, space, "---------------------------"
-		"--------------------------------\n");
+	n = snprintf(p, space, "--------------------------"
+		"--------------------------------");
 	p += n;
 	space -= n;
+	debugf2("%s\n", mem_buffer);
+	p = mem_buffer;
+	space = PAGE_SIZE;
 
 	/* now output the 'channel' labels */
-	n = snprintf(p, space, "            ");
+	n = snprintf(p, space, "           ");
 	p += n;
 	space -= n;
 	for (channel = 0; channel < pvt->maxch; channel++) {
@@ -1100,9 +1105,17 @@ static void calculate_dimm_size(struct i5000_pvt *pvt)
 		p += n;
 		space -= n;
 	}
-	n = snprintf(p, space, "\n");
+	debugf2("%s\n", mem_buffer);
+	p = mem_buffer;
+	space = PAGE_SIZE;
+
+	n = snprintf(p, space, "           ");
 	p += n;
-	space -= n;
+	for (branch = 0; branch < MAX_BRANCHES; branch++) {
+		n = snprintf(p, space, "       branch %d       | ", branch);
+		p += n;
+		space -= n;
+	}
 
 	/* output the last message and free buffer */
 	debugf2("%s\n", mem_buffer);
@@ -1235,13 +1248,13 @@ static void i5000_get_mc_regs(struct mem_ctl_info *mci)
 static int i5000_init_csrows(struct mem_ctl_info *mci)
 {
 	struct i5000_pvt *pvt;
-	struct csrow_info *p_csrow;
+	struct dimm_info *dimm;
 	int empty, channel_count;
 	int max_csrows;
-	int mtr, mtr1;
+	int mtr;
 	int csrow_megs;
 	int channel;
-	int csrow;
+	int slot;
 
 	pvt = mci->pvt_info;
 
@@ -1250,43 +1263,40 @@ static int i5000_init_csrows(struct mem_ctl_info *mci)
 
 	empty = 1;		/* Assume NO memory */
 
-	for (csrow = 0; csrow < max_csrows; csrow++) {
-		p_csrow = &mci->csrows[csrow];
-
-		p_csrow->csrow_idx = csrow;
-
-		/* use branch 0 for the basis */
-		mtr = pvt->b0_mtr[csrow >> 1];
-		mtr1 = pvt->b1_mtr[csrow >> 1];
-
-		/* if no DIMMS on this row, continue */
-		if (!MTR_DIMMS_PRESENT(mtr) && !MTR_DIMMS_PRESENT(mtr1))
-			continue;
+	/*
+	 * FIXME: The memory layout used to map slot/channel into the
+	 * real memory architecture is weird: branch+slot are "csrows"
+	 * and channel is channel. That required an extra array (dimm_info)
+	 * to map the dimms. A good cleanup would be to remove this array,
+	 * and do a loop here with branch, channel, slot
+	 */
+	for (slot = 0; slot < max_csrows; slot++) {
+		for (channel = 0; channel < pvt->maxch; channel++) {
 
-		/* FAKE OUT VALUES, FIXME */
-		p_csrow->first_page = 0 + csrow * 20;
-		p_csrow->last_page = 9 + csrow * 20;
-		p_csrow->page_mask = 0xFFF;
+			mtr = determine_mtr(pvt, slot, channel);
 
-		p_csrow->grain = 8;
+			if (!MTR_DIMMS_PRESENT(mtr))
+				continue;
 
-		csrow_megs = 0;
-		for (channel = 0; channel < pvt->maxch; channel++) {
-			csrow_megs += pvt->dimm_info[csrow][channel].megabytes;
-		}
+			dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
+				       channel / MAX_BRANCHES,
+				       channel % MAX_BRANCHES, slot);
 
-		p_csrow->nr_pages = csrow_megs << 8;
+			csrow_megs = pvt->dimm_info[slot][channel].megabytes;
+			dimm->grain = 8;
 
-		/* Assume DDR2 for now */
-		p_csrow->mtype = MEM_FB_DDR2;
+			/* Assume DDR2 for now */
+			dimm->mtype = MEM_FB_DDR2;
 
-		/* ask what device type on this row */
-		if (MTR_DRAM_WIDTH(mtr))
-			p_csrow->dtype = DEV_X8;
-		else
-			p_csrow->dtype = DEV_X4;
+			/* ask what device type on this row */
+			if (MTR_DRAM_WIDTH(mtr))
+				dimm->dtype = DEV_X8;
+			else
+				dimm->dtype = DEV_X4;
 
-		p_csrow->edac_mode = EDAC_S8ECD8ED;
+			dimm->edac_mode = EDAC_S8ECD8ED;
+			dimm->nr_pages = csrow_megs << 8;
+		}
 
 		empty = 0;
 	}
@@ -1317,7 +1327,7 @@ static void i5000_enable_error_reporting(struct mem_ctl_info *mci)
 }
 
 /*
- * i5000_get_dimm_and_channel_counts(pdev, &num_csrows, &num_channels)
+ * i5000_get_dimm_and_channel_counts(pdev, &nr_csrows, &num_channels)
  *
  *	ask the device how many channels are present and how many CSROWS
  *	 as well
@@ -1332,7 +1342,7 @@ static void i5000_get_dimm_and_channel_counts(struct pci_dev *pdev,
 	 * supported on this memory controller
 	 */
 	pci_read_config_byte(pdev, MAXDIMMPERCH, &value);
-	*num_dimms_per_channel = (int)value *2;
+	*num_dimms_per_channel = (int)value;
 
 	pci_read_config_byte(pdev, MAXCH, &value);
 	*num_channels = (int)value;
@@ -1348,10 +1358,10 @@ static void i5000_get_dimm_and_channel_counts(struct pci_dev *pdev,
 static int i5000_probe1(struct pci_dev *pdev, int dev_idx)
 {
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[3];
 	struct i5000_pvt *pvt;
 	int num_channels;
 	int num_dimms_per_channel;
-	int num_csrows;
 
 	debugf0("MC: %s: %s(), pdev bus %u dev=0x%x fn=0x%x\n",
 		__FILE__, __func__,
@@ -1377,14 +1387,22 @@ static int i5000_probe1(struct pci_dev *pdev, int dev_idx)
 	 */
 	i5000_get_dimm_and_channel_counts(pdev, &num_dimms_per_channel,
 					&num_channels);
-	num_csrows = num_dimms_per_channel * 2;
 
-	debugf0("MC: %s(): Number of - Channels= %d  DIMMS= %d  CSROWS= %d\n",
-		__func__, num_channels, num_dimms_per_channel, num_csrows);
+	debugf0("MC: %s(): Number of Branches=2 Channels= %d  DIMMS= %d\n",
+		__func__, num_channels, num_dimms_per_channel);
 
 	/* allocate a new MC control structure */
-	mci = edac_mc_alloc(sizeof(*pvt), num_csrows, num_channels, 0);
 
+	layers[0].type = EDAC_MC_LAYER_BRANCH;
+	layers[0].size = MAX_BRANCHES;
+	layers[0].is_virt_csrow = false;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = num_channels / MAX_BRANCHES;
+	layers[1].is_virt_csrow = false;
+	layers[2].type = EDAC_MC_LAYER_SLOT;
+	layers[2].size = num_dimms_per_channel;
+	layers[2].is_virt_csrow = true;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt));
 	if (mci == NULL)
 		return -ENOMEM;
 
diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c
index d500749464ea..e9e7c2a29dc3 100644
--- a/drivers/edac/i5100_edac.c
+++ b/drivers/edac/i5100_edac.c
@@ -14,6 +14,11 @@
  * rows for each respective channel are laid out one after another,
  * the first half belonging to channel 0, the second half belonging
  * to channel 1.
+ *
+ * This driver is for DDR2 DIMMs, and it uses chip select to select among the
+ * several ranks. However, instead of showing memories as ranks, it outputs
+ * them as DIMM's. An internal table creates the association between ranks
+ * and DIMM's.
  */
 #include <linux/module.h>
 #include <linux/init.h>
@@ -410,14 +415,6 @@ static int i5100_csrow_to_chan(const struct mem_ctl_info *mci, int csrow)
 	return csrow / priv->ranksperchan;
 }
 
-static unsigned i5100_rank_to_csrow(const struct mem_ctl_info *mci,
-				    int chan, int rank)
-{
-	const struct i5100_priv *priv = mci->pvt_info;
-
-	return chan * priv->ranksperchan + rank;
-}
-
 static void i5100_handle_ce(struct mem_ctl_info *mci,
 			    int chan,
 			    unsigned bank,
@@ -427,17 +424,17 @@ static void i5100_handle_ce(struct mem_ctl_info *mci,
 			    unsigned ras,
 			    const char *msg)
 {
-	const int csrow = i5100_rank_to_csrow(mci, chan, rank);
+	char detail[80];
 
-	printk(KERN_ERR
-		"CE chan %d, bank %u, rank %u, syndrome 0x%lx, "
-		"cas %u, ras %u, csrow %u, label \"%s\": %s\n",
-		chan, bank, rank, syndrome, cas, ras,
-		csrow, mci->csrows[csrow].channels[0].label, msg);
+	/* Form out message */
+	snprintf(detail, sizeof(detail),
+		 "bank %u, cas %u, ras %u\n",
+		 bank, cas, ras);
 
-	mci->ce_count++;
-	mci->csrows[csrow].ce_count++;
-	mci->csrows[csrow].channels[0].ce_count++;
+	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+			     0, 0, syndrome,
+			     chan, rank, -1,
+			     msg, detail, NULL);
 }
 
 static void i5100_handle_ue(struct mem_ctl_info *mci,
@@ -449,16 +446,17 @@ static void i5100_handle_ue(struct mem_ctl_info *mci,
 			    unsigned ras,
 			    const char *msg)
 {
-	const int csrow = i5100_rank_to_csrow(mci, chan, rank);
+	char detail[80];
 
-	printk(KERN_ERR
-		"UE chan %d, bank %u, rank %u, syndrome 0x%lx, "
-		"cas %u, ras %u, csrow %u, label \"%s\": %s\n",
-		chan, bank, rank, syndrome, cas, ras,
-		csrow, mci->csrows[csrow].channels[0].label, msg);
+	/* Form out message */
+	snprintf(detail, sizeof(detail),
+		 "bank %u, cas %u, ras %u\n",
+		 bank, cas, ras);
 
-	mci->ue_count++;
-	mci->csrows[csrow].ue_count++;
+	edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+			     0, 0, syndrome,
+			     chan, rank, -1,
+			     msg, detail, NULL);
 }
 
 static void i5100_read_log(struct mem_ctl_info *mci, int chan,
@@ -835,10 +833,10 @@ static void __devinit i5100_init_interleaving(struct pci_dev *pdev,
 static void __devinit i5100_init_csrows(struct mem_ctl_info *mci)
 {
 	int i;
-	unsigned long total_pages = 0UL;
 	struct i5100_priv *priv = mci->pvt_info;
 
-	for (i = 0; i < mci->nr_csrows; i++) {
+	for (i = 0; i < mci->tot_dimms; i++) {
+		struct dimm_info *dimm;
 		const unsigned long npages = i5100_npages(mci, i);
 		const unsigned chan = i5100_csrow_to_chan(mci, i);
 		const unsigned rank = i5100_csrow_to_rank(mci, i);
@@ -846,33 +844,23 @@ static void __devinit i5100_init_csrows(struct mem_ctl_info *mci)
 		if (!npages)
 			continue;
 
-		/*
-		 * FIXME: these two are totally bogus -- I don't see how to
-		 * map them correctly to this structure...
-		 */
-		mci->csrows[i].first_page = total_pages;
-		mci->csrows[i].last_page = total_pages + npages - 1;
-		mci->csrows[i].page_mask = 0UL;
-
-		mci->csrows[i].nr_pages = npages;
-		mci->csrows[i].grain = 32;
-		mci->csrows[i].csrow_idx = i;
-		mci->csrows[i].dtype =
-			(priv->mtr[chan][rank].width == 4) ? DEV_X4 : DEV_X8;
-		mci->csrows[i].ue_count = 0;
-		mci->csrows[i].ce_count = 0;
-		mci->csrows[i].mtype = MEM_RDDR2;
-		mci->csrows[i].edac_mode = EDAC_SECDED;
-		mci->csrows[i].mci = mci;
-		mci->csrows[i].nr_channels = 1;
-		mci->csrows[i].channels[0].chan_idx = 0;
-		mci->csrows[i].channels[0].ce_count = 0;
-		mci->csrows[i].channels[0].csrow = mci->csrows + i;
-		snprintf(mci->csrows[i].channels[0].label,
-			 sizeof(mci->csrows[i].channels[0].label),
-			 "DIMM%u", i5100_rank_to_slot(mci, chan, rank));
-
-		total_pages += npages;
+		dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
+			       chan, rank, 0);
+
+		dimm->nr_pages = npages;
+		if (npages) {
+			dimm->grain = 32;
+			dimm->dtype = (priv->mtr[chan][rank].width == 4) ?
+					DEV_X4 : DEV_X8;
+			dimm->mtype = MEM_RDDR2;
+			dimm->edac_mode = EDAC_SECDED;
+			snprintf(dimm->label, sizeof(dimm->label),
+				"DIMM%u",
+				i5100_rank_to_slot(mci, chan, rank));
+		}
+
+		debugf2("dimm channel %d, rank %d, size %ld\n",
+			chan, rank, (long)PAGES_TO_MiB(npages));
 	}
 }
 
@@ -881,6 +869,7 @@ static int __devinit i5100_init_one(struct pci_dev *pdev,
 {
 	int rc;
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[2];
 	struct i5100_priv *priv;
 	struct pci_dev *ch0mm, *ch1mm;
 	int ret = 0;
@@ -941,7 +930,14 @@ static int __devinit i5100_init_one(struct pci_dev *pdev,
 		goto bail_ch1;
 	}
 
-	mci = edac_mc_alloc(sizeof(*priv), ranksperch * 2, 1, 0);
+	layers[0].type = EDAC_MC_LAYER_CHANNEL;
+	layers[0].size = 2;
+	layers[0].is_virt_csrow = false;
+	layers[1].type = EDAC_MC_LAYER_SLOT;
+	layers[1].size = ranksperch;
+	layers[1].is_virt_csrow = true;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers,
+			    sizeof(*priv));
 	if (!mci) {
 		ret = -ENOMEM;
 		goto bail_disable_ch1;
diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c
index 1869a1018fb5..6640c29e1885 100644
--- a/drivers/edac/i5400_edac.c
+++ b/drivers/edac/i5400_edac.c
@@ -18,6 +18,10 @@
  * Intel 5400 Chipset Memory Controller Hub (MCH) - Datasheet
  * 	http://developer.intel.com/design/chipsets/datashts/313070.htm
  *
+ * This Memory Controller manages DDR2 FB-DIMMs. It has 2 branches, each with
+ * 2 channels operating in lockstep no-mirror mode. Each channel can have up to
+ * 4 dimm's, each with up to 8GB.
+ *
  */
 
 #include <linux/module.h>
@@ -44,12 +48,10 @@
 	edac_mc_chipset_printk(mci, level, "i5400", fmt, ##arg)
 
 /* Limits for i5400 */
-#define NUM_MTRS_PER_BRANCH	4
+#define MAX_BRANCHES		2
 #define CHANNELS_PER_BRANCH	2
-#define MAX_DIMMS_PER_CHANNEL	NUM_MTRS_PER_BRANCH
-#define	MAX_CHANNELS		4
-/* max possible csrows per channel */
-#define MAX_CSROWS		(MAX_DIMMS_PER_CHANNEL)
+#define DIMMS_PER_CHANNEL	4
+#define	MAX_CHANNELS		(MAX_BRANCHES * CHANNELS_PER_BRANCH)
 
 /* Device 16,
  * Function 0: System Address
@@ -347,16 +349,16 @@ struct i5400_pvt {
 
 	u16 mir0, mir1;
 
-	u16 b0_mtr[NUM_MTRS_PER_BRANCH];	/* Memory Technlogy Reg */
+	u16 b0_mtr[DIMMS_PER_CHANNEL];	/* Memory Technlogy Reg */
 	u16 b0_ambpresent0;			/* Branch 0, Channel 0 */
 	u16 b0_ambpresent1;			/* Brnach 0, Channel 1 */
 
-	u16 b1_mtr[NUM_MTRS_PER_BRANCH];	/* Memory Technlogy Reg */
+	u16 b1_mtr[DIMMS_PER_CHANNEL];	/* Memory Technlogy Reg */
 	u16 b1_ambpresent0;			/* Branch 1, Channel 8 */
 	u16 b1_ambpresent1;			/* Branch 1, Channel 1 */
 
 	/* DIMM information matrix, allocating architecture maximums */
-	struct i5400_dimm_info dimm_info[MAX_CSROWS][MAX_CHANNELS];
+	struct i5400_dimm_info dimm_info[DIMMS_PER_CHANNEL][MAX_CHANNELS];
 
 	/* Actual values for this controller */
 	int maxch;				/* Max channels */
@@ -532,13 +534,15 @@ static void i5400_proccess_non_recoverable_info(struct mem_ctl_info *mci,
 	int ras, cas;
 	int errnum;
 	char *type = NULL;
+	enum hw_event_mc_err_type tp_event = HW_EVENT_ERR_UNCORRECTED;
 
 	if (!allErrors)
 		return;		/* if no error, return now */
 
-	if (allErrors &  ERROR_FAT_MASK)
+	if (allErrors &  ERROR_FAT_MASK) {
 		type = "FATAL";
-	else if (allErrors & FERR_NF_UNCORRECTABLE)
+		tp_event = HW_EVENT_ERR_FATAL;
+	} else if (allErrors & FERR_NF_UNCORRECTABLE)
 		type = "NON-FATAL uncorrected";
 	else
 		type = "NON-FATAL recoverable";
@@ -556,7 +560,7 @@ static void i5400_proccess_non_recoverable_info(struct mem_ctl_info *mci,
 	ras = nrec_ras(info);
 	cas = nrec_cas(info);
 
-	debugf0("\t\tCSROW= %d  Channels= %d,%d  (Branch= %d "
+	debugf0("\t\tDIMM= %d  Channels= %d,%d  (Branch= %d "
 		"DRAM Bank= %d Buffer ID = %d rdwr= %s ras= %d cas= %d)\n",
 		rank, channel, channel + 1, branch >> 1, bank,
 		buf_id, rdwr_str(rdwr), ras, cas);
@@ -566,13 +570,13 @@ static void i5400_proccess_non_recoverable_info(struct mem_ctl_info *mci,
 
 	/* Form out message */
 	snprintf(msg, sizeof(msg),
-		 "%s (Branch=%d DRAM-Bank=%d Buffer ID = %d RDWR=%s "
-		 "RAS=%d CAS=%d %s Err=0x%lx (%s))",
-		 type, branch >> 1, bank, buf_id, rdwr_str(rdwr), ras, cas,
-		 type, allErrors, error_name[errnum]);
+		 "Bank=%d Buffer ID = %d RAS=%d CAS=%d Err=0x%lx (%s)",
+		 bank, buf_id, ras, cas, allErrors, error_name[errnum]);
 
-	/* Call the helper to output message */
-	edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg);
+	edac_mc_handle_error(tp_event, mci, 0, 0, 0,
+			     branch >> 1, -1, rank,
+			     rdwr ? "Write error" : "Read error",
+			     msg, NULL);
 }
 
 /*
@@ -630,7 +634,7 @@ static void i5400_process_nonfatal_error_info(struct mem_ctl_info *mci,
 		/* Only 1 bit will be on */
 		errnum = find_first_bit(&allErrors, ARRAY_SIZE(error_name));
 
-		debugf0("\t\tCSROW= %d Channel= %d  (Branch %d "
+		debugf0("\t\tDIMM= %d Channel= %d  (Branch %d "
 			"DRAM Bank= %d rdwr= %s ras= %d cas= %d)\n",
 			rank, channel, branch >> 1, bank,
 			rdwr_str(rdwr), ras, cas);
@@ -642,8 +646,10 @@ static void i5400_process_nonfatal_error_info(struct mem_ctl_info *mci,
 			 branch >> 1, bank, rdwr_str(rdwr), ras, cas,
 			 allErrors, error_name[errnum]);
 
-		/* Call the helper to output message */
-		edac_mc_handle_fbd_ce(mci, rank, channel, msg);
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 0, 0, 0,
+				     branch >> 1, channel % 2, rank,
+				     rdwr ? "Write error" : "Read error",
+				     msg, NULL);
 
 		return;
 	}
@@ -831,8 +837,8 @@ static int i5400_get_devices(struct mem_ctl_info *mci, int dev_idx)
 /*
  *	determine_amb_present
  *
- *		the information is contained in NUM_MTRS_PER_BRANCH different
- *		registers determining which of the NUM_MTRS_PER_BRANCH requires
+ *		the information is contained in DIMMS_PER_CHANNEL different
+ *		registers determining which of the DIMMS_PER_CHANNEL requires
  *              knowing which channel is in question
  *
  *	2 branches, each with 2 channels
@@ -861,11 +867,11 @@ static int determine_amb_present_reg(struct i5400_pvt *pvt, int channel)
 }
 
 /*
- * determine_mtr(pvt, csrow, channel)
+ * determine_mtr(pvt, dimm, channel)
  *
- * return the proper MTR register as determine by the csrow and desired channel
+ * return the proper MTR register as determine by the dimm and desired channel
  */
-static int determine_mtr(struct i5400_pvt *pvt, int csrow, int channel)
+static int determine_mtr(struct i5400_pvt *pvt, int dimm, int channel)
 {
 	int mtr;
 	int n;
@@ -873,11 +879,11 @@ static int determine_mtr(struct i5400_pvt *pvt, int csrow, int channel)
 	/* There is one MTR for each slot pair of FB-DIMMs,
 	   Each slot pair may be at branch 0 or branch 1.
 	 */
-	n = csrow;
+	n = dimm;
 
-	if (n >= NUM_MTRS_PER_BRANCH) {
-		debugf0("ERROR: trying to access an invalid csrow: %d\n",
-			csrow);
+	if (n >= DIMMS_PER_CHANNEL) {
+		debugf0("ERROR: trying to access an invalid dimm: %d\n",
+			dimm);
 		return 0;
 	}
 
@@ -913,19 +919,19 @@ static void decode_mtr(int slot_row, u16 mtr)
 	debugf2("\t\tNUMCOL: %s\n", numcol_toString[MTR_DIMM_COLS(mtr)]);
 }
 
-static void handle_channel(struct i5400_pvt *pvt, int csrow, int channel,
+static void handle_channel(struct i5400_pvt *pvt, int dimm, int channel,
 			struct i5400_dimm_info *dinfo)
 {
 	int mtr;
 	int amb_present_reg;
 	int addrBits;
 
-	mtr = determine_mtr(pvt, csrow, channel);
+	mtr = determine_mtr(pvt, dimm, channel);
 	if (MTR_DIMMS_PRESENT(mtr)) {
 		amb_present_reg = determine_amb_present_reg(pvt, channel);
 
 		/* Determine if there is a DIMM present in this DIMM slot */
-		if (amb_present_reg & (1 << csrow)) {
+		if (amb_present_reg & (1 << dimm)) {
 			/* Start with the number of bits for a Bank
 			 * on the DRAM */
 			addrBits = MTR_DRAM_BANKS_ADDR_BITS(mtr);
@@ -954,10 +960,10 @@ static void handle_channel(struct i5400_pvt *pvt, int csrow, int channel,
 static void calculate_dimm_size(struct i5400_pvt *pvt)
 {
 	struct i5400_dimm_info *dinfo;
-	int csrow, max_csrows;
+	int dimm, max_dimms;
 	char *p, *mem_buffer;
 	int space, n;
-	int channel;
+	int channel, branch;
 
 	/* ================= Generate some debug output ================= */
 	space = PAGE_SIZE;
@@ -968,32 +974,32 @@ static void calculate_dimm_size(struct i5400_pvt *pvt)
 		return;
 	}
 
-	/* Scan all the actual CSROWS
+	/* Scan all the actual DIMMS
 	 * and calculate the information for each DIMM
-	 * Start with the highest csrow first, to display it first
-	 * and work toward the 0th csrow
+	 * Start with the highest dimm first, to display it first
+	 * and work toward the 0th dimm
 	 */
-	max_csrows = pvt->maxdimmperch;
-	for (csrow = max_csrows - 1; csrow >= 0; csrow--) {
+	max_dimms = pvt->maxdimmperch;
+	for (dimm = max_dimms - 1; dimm >= 0; dimm--) {
 
-		/* on an odd csrow, first output a 'boundary' marker,
+		/* on an odd dimm, first output a 'boundary' marker,
 		 * then reset the message buffer  */
-		if (csrow & 0x1) {
+		if (dimm & 0x1) {
 			n = snprintf(p, space, "---------------------------"
-					"--------------------------------");
+					"-------------------------------");
 			p += n;
 			space -= n;
 			debugf2("%s\n", mem_buffer);
 			p = mem_buffer;
 			space = PAGE_SIZE;
 		}
-		n = snprintf(p, space, "csrow %2d    ", csrow);
+		n = snprintf(p, space, "dimm %2d    ", dimm);
 		p += n;
 		space -= n;
 
 		for (channel = 0; channel < pvt->maxch; channel++) {
-			dinfo = &pvt->dimm_info[csrow][channel];
-			handle_channel(pvt, csrow, channel, dinfo);
+			dinfo = &pvt->dimm_info[dimm][channel];
+			handle_channel(pvt, dimm, channel, dinfo);
 			n = snprintf(p, space, "%4d MB   | ", dinfo->megabytes);
 			p += n;
 			space -= n;
@@ -1005,7 +1011,7 @@ static void calculate_dimm_size(struct i5400_pvt *pvt)
 
 	/* Output the last bottom 'boundary' marker */
 	n = snprintf(p, space, "---------------------------"
-			"--------------------------------");
+			"-------------------------------");
 	p += n;
 	space -= n;
 	debugf2("%s\n", mem_buffer);
@@ -1013,7 +1019,7 @@ static void calculate_dimm_size(struct i5400_pvt *pvt)
 	space = PAGE_SIZE;
 
 	/* now output the 'channel' labels */
-	n = snprintf(p, space, "            ");
+	n = snprintf(p, space, "           ");
 	p += n;
 	space -= n;
 	for (channel = 0; channel < pvt->maxch; channel++) {
@@ -1022,6 +1028,19 @@ static void calculate_dimm_size(struct i5400_pvt *pvt)
 		space -= n;
 	}
 
+	space -= n;
+	debugf2("%s\n", mem_buffer);
+	p = mem_buffer;
+	space = PAGE_SIZE;
+
+	n = snprintf(p, space, "           ");
+	p += n;
+	for (branch = 0; branch < MAX_BRANCHES; branch++) {
+		n = snprintf(p, space, "       branch %d       | ", branch);
+		p += n;
+		space -= n;
+	}
+
 	/* output the last message and free buffer */
 	debugf2("%s\n", mem_buffer);
 	kfree(mem_buffer);
@@ -1080,7 +1099,7 @@ static void i5400_get_mc_regs(struct mem_ctl_info *mci)
 	debugf2("MIR1: limit= 0x%x  WAY1= %u  WAY0= %x\n", limit, way1, way0);
 
 	/* Get the set of MTR[0-3] regs by each branch */
-	for (slot_row = 0; slot_row < NUM_MTRS_PER_BRANCH; slot_row++) {
+	for (slot_row = 0; slot_row < DIMMS_PER_CHANNEL; slot_row++) {
 		int where = MTR0 + (slot_row * sizeof(u16));
 
 		/* Branch 0 set of MTR registers */
@@ -1105,7 +1124,7 @@ static void i5400_get_mc_regs(struct mem_ctl_info *mci)
 	/* Read and dump branch 0's MTRs */
 	debugf2("\nMemory Technology Registers:\n");
 	debugf2("   Branch 0:\n");
-	for (slot_row = 0; slot_row < NUM_MTRS_PER_BRANCH; slot_row++)
+	for (slot_row = 0; slot_row < DIMMS_PER_CHANNEL; slot_row++)
 		decode_mtr(slot_row, pvt->b0_mtr[slot_row]);
 
 	pci_read_config_word(pvt->branch_0, AMBPRESENT_0,
@@ -1122,7 +1141,7 @@ static void i5400_get_mc_regs(struct mem_ctl_info *mci)
 	} else {
 		/* Read and dump  branch 1's MTRs */
 		debugf2("   Branch 1:\n");
-		for (slot_row = 0; slot_row < NUM_MTRS_PER_BRANCH; slot_row++)
+		for (slot_row = 0; slot_row < DIMMS_PER_CHANNEL; slot_row++)
 			decode_mtr(slot_row, pvt->b1_mtr[slot_row]);
 
 		pci_read_config_word(pvt->branch_1, AMBPRESENT_0,
@@ -1141,7 +1160,7 @@ static void i5400_get_mc_regs(struct mem_ctl_info *mci)
 }
 
 /*
- *	i5400_init_csrows	Initialize the 'csrows' table within
+ *	i5400_init_dimms	Initialize the 'dimms' table within
  *				the mci control	structure with the
  *				addressing of memory.
  *
@@ -1149,64 +1168,68 @@ static void i5400_get_mc_regs(struct mem_ctl_info *mci)
  *		0	success
  *		1	no actual memory found on this MC
  */
-static int i5400_init_csrows(struct mem_ctl_info *mci)
+static int i5400_init_dimms(struct mem_ctl_info *mci)
 {
 	struct i5400_pvt *pvt;
-	struct csrow_info *p_csrow;
-	int empty, channel_count;
-	int max_csrows;
+	struct dimm_info *dimm;
+	int ndimms, channel_count;
+	int max_dimms;
 	int mtr;
-	int csrow_megs;
-	int channel;
-	int csrow;
+	int size_mb;
+	int  channel, slot;
 
 	pvt = mci->pvt_info;
 
 	channel_count = pvt->maxch;
-	max_csrows = pvt->maxdimmperch;
+	max_dimms = pvt->maxdimmperch;
 
-	empty = 1;		/* Assume NO memory */
+	ndimms = 0;
 
-	for (csrow = 0; csrow < max_csrows; csrow++) {
-		p_csrow = &mci->csrows[csrow];
-
-		p_csrow->csrow_idx = csrow;
-
-		/* use branch 0 for the basis */
-		mtr = determine_mtr(pvt, csrow, 0);
-
-		/* if no DIMMS on this row, continue */
-		if (!MTR_DIMMS_PRESENT(mtr))
-			continue;
-
-		/* FAKE OUT VALUES, FIXME */
-		p_csrow->first_page = 0 + csrow * 20;
-		p_csrow->last_page = 9 + csrow * 20;
-		p_csrow->page_mask = 0xFFF;
-
-		p_csrow->grain = 8;
-
-		csrow_megs = 0;
-		for (channel = 0; channel < pvt->maxch; channel++)
-			csrow_megs += pvt->dimm_info[csrow][channel].megabytes;
-
-		p_csrow->nr_pages = csrow_megs << 8;
-
-		/* Assume DDR2 for now */
-		p_csrow->mtype = MEM_FB_DDR2;
-
-		/* ask what device type on this row */
-		if (MTR_DRAM_WIDTH(mtr))
-			p_csrow->dtype = DEV_X8;
-		else
-			p_csrow->dtype = DEV_X4;
-
-		p_csrow->edac_mode = EDAC_S8ECD8ED;
-
-		empty = 0;
+	/*
+	 * FIXME: remove  pvt->dimm_info[slot][channel] and use the 3
+	 * layers here.
+	 */
+	for (channel = 0; channel < mci->layers[0].size * mci->layers[1].size;
+	     channel++) {
+		for (slot = 0; slot < mci->layers[2].size; slot++) {
+			mtr = determine_mtr(pvt, slot, channel);
+
+			/* if no DIMMS on this slot, continue */
+			if (!MTR_DIMMS_PRESENT(mtr))
+				continue;
+
+			dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
+				       channel / 2, channel % 2, slot);
+
+			size_mb =  pvt->dimm_info[slot][channel].megabytes;
+
+			debugf2("%s: dimm%zd (branch %d channel %d slot %d): %d.%03d GB\n",
+				__func__, dimm - mci->dimms,
+				channel / 2, channel % 2, slot,
+				size_mb / 1000, size_mb % 1000);
+
+			dimm->nr_pages = size_mb << 8;
+			dimm->grain = 8;
+			dimm->dtype = MTR_DRAM_WIDTH(mtr) ? DEV_X8 : DEV_X4;
+			dimm->mtype = MEM_FB_DDR2;
+			/*
+			 * The eccc mechanism is SDDC (aka SECC), with
+			 * is similar to Chipkill.
+			 */
+			dimm->edac_mode = MTR_DRAM_WIDTH(mtr) ?
+					  EDAC_S8ECD8ED : EDAC_S4ECD4ED;
+			ndimms++;
+		}
 	}
 
-	return empty;
+	/*
+	 * When just one memory is provided, it should be at location (0,0,0).
+	 * With such single-DIMM mode, the SDCC algorithm degrades to SECDEC+.
+	 */
+	if (ndimms == 1)
+		mci->dimms[0].edac_mode = EDAC_SECDED;
+
+	return (ndimms == 0);
 }
 
 /*
@@ -1242,9 +1265,7 @@ static int i5400_probe1(struct pci_dev *pdev, int dev_idx)
 {
 	struct mem_ctl_info *mci;
 	struct i5400_pvt *pvt;
-	int num_channels;
-	int num_dimms_per_channel;
-	int num_csrows;
+	struct edac_mc_layer layers[3];
 
 	if (dev_idx >= ARRAY_SIZE(i5400_devs))
 		return -EINVAL;
@@ -1258,23 +1279,21 @@ static int i5400_probe1(struct pci_dev *pdev, int dev_idx)
 	if (PCI_FUNC(pdev->devfn) != 0)
 		return -ENODEV;
 
-	/* As we don't have a motherboard identification routine to determine
-	 * actual number of slots/dimms per channel, we thus utilize the
-	 * resource as specified by the chipset. Thus, we might have
-	 * have more DIMMs per channel than actually on the mobo, but this
-	 * allows the driver to support up to the chipset max, without
-	 * some fancy mobo determination.
+	/*
+	 * allocate a new MC control structure
+	 *
+	 * This drivers uses the DIMM slot as "csrow" and the rest as "channel".
 	 */
-	num_dimms_per_channel = MAX_DIMMS_PER_CHANNEL;
-	num_channels = MAX_CHANNELS;
-	num_csrows = num_dimms_per_channel;
-
-	debugf0("MC: %s(): Number of - Channels= %d  DIMMS= %d  CSROWS= %d\n",
-		__func__, num_channels, num_dimms_per_channel, num_csrows);
-
-	/* allocate a new MC control structure */
-	mci = edac_mc_alloc(sizeof(*pvt), num_csrows, num_channels, 0);
-
+	layers[0].type = EDAC_MC_LAYER_BRANCH;
+	layers[0].size = MAX_BRANCHES;
+	layers[0].is_virt_csrow = false;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = CHANNELS_PER_BRANCH;
+	layers[1].is_virt_csrow = false;
+	layers[2].type = EDAC_MC_LAYER_SLOT;
+	layers[2].size = DIMMS_PER_CHANNEL;
+	layers[2].is_virt_csrow = true;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt));
 	if (mci == NULL)
 		return -ENOMEM;
 
@@ -1284,8 +1303,8 @@ static int i5400_probe1(struct pci_dev *pdev, int dev_idx)
 
 	pvt = mci->pvt_info;
 	pvt->system_address = pdev;	/* Record this device in our private */
-	pvt->maxch = num_channels;
-	pvt->maxdimmperch = num_dimms_per_channel;
+	pvt->maxch = MAX_CHANNELS;
+	pvt->maxdimmperch = DIMMS_PER_CHANNEL;
 
 	/* 'get' the pci devices we want to reserve for our use */
 	if (i5400_get_devices(mci, dev_idx))
@@ -1307,13 +1326,13 @@ static int i5400_probe1(struct pci_dev *pdev, int dev_idx)
 	/* Set the function pointer to an actual operation function */
 	mci->edac_check = i5400_check_error;
 
-	/* initialize the MC control structure 'csrows' table
+	/* initialize the MC control structure 'dimms' table
 	 * with the mapping and control information */
-	if (i5400_init_csrows(mci)) {
+	if (i5400_init_dimms(mci)) {
 		debugf0("MC: Setting mci->edac_cap to EDAC_FLAG_NONE\n"
-			"    because i5400_init_csrows() returned nonzero "
+			"    because i5400_init_dimms() returned nonzero "
 			"value\n");
-		mci->edac_cap = EDAC_FLAG_NONE;	/* no csrows found */
+		mci->edac_cap = EDAC_FLAG_NONE;	/* no dimms found */
 	} else {
 		debugf1("MC: Enable error reporting now\n");
 		i5400_enable_error_reporting(mci);
diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c
index 3bafa3bca148..97c22fd650ee 100644
--- a/drivers/edac/i7300_edac.c
+++ b/drivers/edac/i7300_edac.c
@@ -464,17 +464,14 @@ static void i7300_process_fbd_error(struct mem_ctl_info *mci)
 				FERR_FAT_FBD, error_reg);
 
 		snprintf(pvt->tmp_prt_buffer, PAGE_SIZE,
-			"FATAL (Branch=%d DRAM-Bank=%d %s "
-			"RAS=%d CAS=%d Err=0x%lx (%s))",
-			branch, bank,
-			is_wr ? "RDWR" : "RD",
-			ras, cas,
-			errors, specific);
-
-		/* Call the helper to output message */
-		edac_mc_handle_fbd_ue(mci, rank, branch << 1,
-				      (branch << 1) + 1,
-				      pvt->tmp_prt_buffer);
+			 "Bank=%d RAS=%d CAS=%d Err=0x%lx (%s))",
+			 bank, ras, cas, errors, specific);
+
+		edac_mc_handle_error(HW_EVENT_ERR_FATAL, mci, 0, 0, 0,
+				     branch, -1, rank,
+				     is_wr ? "Write error" : "Read error",
+				     pvt->tmp_prt_buffer, NULL);
+
 	}
 
 	/* read in the 1st NON-FATAL error register */
@@ -513,23 +510,14 @@ static void i7300_process_fbd_error(struct mem_ctl_info *mci)
 
 		/* Form out message */
 		snprintf(pvt->tmp_prt_buffer, PAGE_SIZE,
-			"Corrected error (Branch=%d, Channel %d), "
-			" DRAM-Bank=%d %s "
-			"RAS=%d CAS=%d, CE Err=0x%lx, Syndrome=0x%08x(%s))",
-			branch, channel,
-			bank,
-			is_wr ? "RDWR" : "RD",
-			ras, cas,
-			errors, syndrome, specific);
-
-		/*
-		 * Call the helper to output message
-		 * NOTE: Errors are reported per-branch, and not per-channel
-		 *	 Currently, we don't know how to identify the right
-		 *	 channel.
-		 */
-		edac_mc_handle_fbd_ce(mci, rank, channel,
-				      pvt->tmp_prt_buffer);
+			 "DRAM-Bank=%d RAS=%d CAS=%d, Err=0x%lx (%s))",
+			 bank, ras, cas, errors, specific);
+
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 0, 0,
+				     syndrome,
+				     branch >> 1, channel % 2, rank,
+				     is_wr ? "Write error" : "Read error",
+				     pvt->tmp_prt_buffer, NULL);
 	}
 	return;
 }
@@ -617,8 +605,7 @@ static void i7300_enable_error_reporting(struct mem_ctl_info *mci)
 static int decode_mtr(struct i7300_pvt *pvt,
 		      int slot, int ch, int branch,
 		      struct i7300_dimm_info *dinfo,
-		      struct csrow_info *p_csrow,
-		      u32 *nr_pages)
+		      struct dimm_info *dimm)
 {
 	int mtr, ans, addrBits, channel;
 
@@ -650,7 +637,6 @@ static int decode_mtr(struct i7300_pvt *pvt,
 	addrBits -= 3;	/* 8 bits per bytes */
 
 	dinfo->megabytes = 1 << addrBits;
-	*nr_pages = dinfo->megabytes << 8;
 
 	debugf2("\t\tWIDTH: x%d\n", MTR_DRAM_WIDTH(mtr));
 
@@ -663,11 +649,6 @@ static int decode_mtr(struct i7300_pvt *pvt,
 	debugf2("\t\tNUMCOL: %s\n", numcol_toString[MTR_DIMM_COLS(mtr)]);
 	debugf2("\t\tSIZE: %d MB\n", dinfo->megabytes);
 
-	p_csrow->grain = 8;
-	p_csrow->mtype = MEM_FB_DDR2;
-	p_csrow->csrow_idx = slot;
-	p_csrow->page_mask = 0;
-
 	/*
 	 * The type of error detection actually depends of the
 	 * mode of operation. When it is just one single memory chip, at
@@ -677,15 +658,18 @@ static int decode_mtr(struct i7300_pvt *pvt,
 	 * See datasheet Sections 7.3.6 to 7.3.8
 	 */
 
+	dimm->nr_pages = MiB_TO_PAGES(dinfo->megabytes);
+	dimm->grain = 8;
+	dimm->mtype = MEM_FB_DDR2;
 	if (IS_SINGLE_MODE(pvt->mc_settings_a)) {
-		p_csrow->edac_mode = EDAC_SECDED;
+		dimm->edac_mode = EDAC_SECDED;
 		debugf2("\t\tECC code is 8-byte-over-32-byte SECDED+ code\n");
 	} else {
 		debugf2("\t\tECC code is on Lockstep mode\n");
 		if (MTR_DRAM_WIDTH(mtr) == 8)
-			p_csrow->edac_mode = EDAC_S8ECD8ED;
+			dimm->edac_mode = EDAC_S8ECD8ED;
 		else
-			p_csrow->edac_mode = EDAC_S4ECD4ED;
+			dimm->edac_mode = EDAC_S4ECD4ED;
 	}
 
 	/* ask what device type on this row */
@@ -694,9 +678,9 @@ static int decode_mtr(struct i7300_pvt *pvt,
 			IS_SCRBALGO_ENHANCED(pvt->mc_settings) ?
 					    "enhanced" : "normal");
 
-		p_csrow->dtype = DEV_X8;
+		dimm->dtype = DEV_X8;
 	} else
-		p_csrow->dtype = DEV_X4;
+		dimm->dtype = DEV_X4;
 
 	return mtr;
 }
@@ -774,11 +758,10 @@ static int i7300_init_csrows(struct mem_ctl_info *mci)
 {
 	struct i7300_pvt *pvt;
 	struct i7300_dimm_info *dinfo;
-	struct csrow_info *p_csrow;
 	int rc = -ENODEV;
 	int mtr;
 	int ch, branch, slot, channel;
-	u32 last_page = 0, nr_pages;
+	struct dimm_info *dimm;
 
 	pvt = mci->pvt_info;
 
@@ -809,25 +792,23 @@ static int i7300_init_csrows(struct mem_ctl_info *mci)
 			pci_read_config_word(pvt->pci_dev_2x_0_fbd_branch[branch],
 					where,
 					&pvt->mtr[slot][branch]);
-			for (ch = 0; ch < MAX_BRANCHES; ch++) {
+			for (ch = 0; ch < MAX_CH_PER_BRANCH; ch++) {
 				int channel = to_channel(ch, branch);
 
+				dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
+					       mci->n_layers, branch, ch, slot);
+
 				dinfo = &pvt->dimm_info[slot][channel];
-				p_csrow = &mci->csrows[slot];
 
 				mtr = decode_mtr(pvt, slot, ch, branch,
-						 dinfo, p_csrow, &nr_pages);
+						 dinfo, dimm);
+
 				/* if no DIMMS on this row, continue */
 				if (!MTR_DIMMS_PRESENT(mtr))
 					continue;
 
-				/* Update per_csrow memory count */
-				p_csrow->nr_pages += nr_pages;
-				p_csrow->first_page = last_page;
-				last_page += nr_pages;
-				p_csrow->last_page = last_page;
-
 				rc = 0;
+
 			}
 		}
 	}
@@ -1042,10 +1023,8 @@ static int __devinit i7300_init_one(struct pci_dev *pdev,
 				    const struct pci_device_id *id)
 {
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[3];
 	struct i7300_pvt *pvt;
-	int num_channels;
-	int num_dimms_per_channel;
-	int num_csrows;
 	int rc;
 
 	/* wake up device */
@@ -1062,23 +1041,17 @@ static int __devinit i7300_init_one(struct pci_dev *pdev,
 	if (PCI_FUNC(pdev->devfn) != 0)
 		return -ENODEV;
 
-	/* As we don't have a motherboard identification routine to determine
-	 * actual number of slots/dimms per channel, we thus utilize the
-	 * resource as specified by the chipset. Thus, we might have
-	 * have more DIMMs per channel than actually on the mobo, but this
-	 * allows the driver to support up to the chipset max, without
-	 * some fancy mobo determination.
-	 */
-	num_dimms_per_channel = MAX_SLOTS;
-	num_channels = MAX_CHANNELS;
-	num_csrows = MAX_SLOTS * MAX_CHANNELS;
-
-	debugf0("MC: %s(): Number of - Channels= %d  DIMMS= %d  CSROWS= %d\n",
-		__func__, num_channels, num_dimms_per_channel, num_csrows);
-
 	/* allocate a new MC control structure */
-	mci = edac_mc_alloc(sizeof(*pvt), num_csrows, num_channels, 0);
-
+	layers[0].type = EDAC_MC_LAYER_BRANCH;
+	layers[0].size = MAX_BRANCHES;
+	layers[0].is_virt_csrow = false;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = MAX_CH_PER_BRANCH;
+	layers[1].is_virt_csrow = true;
+	layers[2].type = EDAC_MC_LAYER_SLOT;
+	layers[2].size = MAX_SLOTS;
+	layers[2].is_virt_csrow = true;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt));
 	if (mci == NULL)
 		return -ENOMEM;
 
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 7f1dfcc4e597..d27778f65a5d 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -221,7 +221,9 @@ struct i7core_inject {
 };
 
 struct i7core_channel {
-	u32		ranks;
+	bool		is_3dimms_present;
+	bool		is_single_4rank;
+	bool		has_4rank;
 	u32		dimms;
 };
 
@@ -257,7 +259,6 @@ struct i7core_pvt {
 	struct i7core_channel	channel[NUM_CHANS];
 
 	int		ce_count_available;
-	int 		csrow_map[NUM_CHANS][MAX_DIMMS];
 
 			/* ECC corrected errors counts per udimm */
 	unsigned long	udimm_ce_count[MAX_DIMMS];
@@ -492,116 +493,15 @@ static void free_i7core_dev(struct i7core_dev *i7core_dev)
 /****************************************************************************
 			Memory check routines
  ****************************************************************************/
-static struct pci_dev *get_pdev_slot_func(u8 socket, unsigned slot,
-					  unsigned func)
-{
-	struct i7core_dev *i7core_dev = get_i7core_dev(socket);
-	int i;
-
-	if (!i7core_dev)
-		return NULL;
-
-	for (i = 0; i < i7core_dev->n_devs; i++) {
-		if (!i7core_dev->pdev[i])
-			continue;
-
-		if (PCI_SLOT(i7core_dev->pdev[i]->devfn) == slot &&
-		    PCI_FUNC(i7core_dev->pdev[i]->devfn) == func) {
-			return i7core_dev->pdev[i];
-		}
-	}
-
-	return NULL;
-}
-
-/**
- * i7core_get_active_channels() - gets the number of channels and csrows
- * @socket:	Quick Path Interconnect socket
- * @channels:	Number of channels that will be returned
- * @csrows:	Number of csrows found
- *
- * Since EDAC core needs to know in advance the number of available channels
- * and csrows, in order to allocate memory for csrows/channels, it is needed
- * to run two similar steps. At the first step, implemented on this function,
- * it checks the number of csrows/channels present at one socket.
- * this is used in order to properly allocate the size of mci components.
- *
- * It should be noticed that none of the current available datasheets explain
- * or even mention how csrows are seen by the memory controller. So, we need
- * to add a fake description for csrows.
- * So, this driver is attributing one DIMM memory for one csrow.
- */
-static int i7core_get_active_channels(const u8 socket, unsigned *channels,
-				      unsigned *csrows)
-{
-	struct pci_dev *pdev = NULL;
-	int i, j;
-	u32 status, control;
-
-	*channels = 0;
-	*csrows = 0;
-
-	pdev = get_pdev_slot_func(socket, 3, 0);
-	if (!pdev) {
-		i7core_printk(KERN_ERR, "Couldn't find socket %d fn 3.0!!!\n",
-			      socket);
-		return -ENODEV;
-	}
-
-	/* Device 3 function 0 reads */
-	pci_read_config_dword(pdev, MC_STATUS, &status);
-	pci_read_config_dword(pdev, MC_CONTROL, &control);
-
-	for (i = 0; i < NUM_CHANS; i++) {
-		u32 dimm_dod[3];
-		/* Check if the channel is active */
-		if (!(control & (1 << (8 + i))))
-			continue;
-
-		/* Check if the channel is disabled */
-		if (status & (1 << i))
-			continue;
-
-		pdev = get_pdev_slot_func(socket, i + 4, 1);
-		if (!pdev) {
-			i7core_printk(KERN_ERR, "Couldn't find socket %d "
-						"fn %d.%d!!!\n",
-						socket, i + 4, 1);
-			return -ENODEV;
-		}
-		/* Devices 4-6 function 1 */
-		pci_read_config_dword(pdev,
-				MC_DOD_CH_DIMM0, &dimm_dod[0]);
-		pci_read_config_dword(pdev,
-				MC_DOD_CH_DIMM1, &dimm_dod[1]);
-		pci_read_config_dword(pdev,
-				MC_DOD_CH_DIMM2, &dimm_dod[2]);
 
-		(*channels)++;
-
-		for (j = 0; j < 3; j++) {
-			if (!DIMM_PRESENT(dimm_dod[j]))
-				continue;
-			(*csrows)++;
-		}
-	}
-
-	debugf0("Number of active channels on socket %d: %d\n",
-		socket, *channels);
-
-	return 0;
-}
-
-static int get_dimm_config(const struct mem_ctl_info *mci)
+static int get_dimm_config(struct mem_ctl_info *mci)
 {
 	struct i7core_pvt *pvt = mci->pvt_info;
-	struct csrow_info *csr;
 	struct pci_dev *pdev;
 	int i, j;
-	int csrow = 0;
-	unsigned long last_page = 0;
 	enum edac_type mode;
 	enum mem_type mtype;
+	struct dimm_info *dimm;
 
 	/* Get data from the MC register, function 0 */
 	pdev = pvt->pci_mcr[0];
@@ -657,21 +557,20 @@ static int get_dimm_config(const struct mem_ctl_info *mci)
 		pci_read_config_dword(pvt->pci_ch[i][0],
 				MC_CHANNEL_DIMM_INIT_PARAMS, &data);
 
-		pvt->channel[i].ranks = (data & QUAD_RANK_PRESENT) ?
-						4 : 2;
+
+		if (data & THREE_DIMMS_PRESENT)
+			pvt->channel[i].is_3dimms_present = true;
+
+		if (data & SINGLE_QUAD_RANK_PRESENT)
+			pvt->channel[i].is_single_4rank = true;
+
+		if (data & QUAD_RANK_PRESENT)
+			pvt->channel[i].has_4rank = true;
 
 		if (data & REGISTERED_DIMM)
 			mtype = MEM_RDDR3;
 		else
 			mtype = MEM_DDR3;
-#if 0
-		if (data & THREE_DIMMS_PRESENT)
-			pvt->channel[i].dimms = 3;
-		else if (data & SINGLE_QUAD_RANK_PRESENT)
-			pvt->channel[i].dimms = 1;
-		else
-			pvt->channel[i].dimms = 2;
-#endif
 
 		/* Devices 4-6 function 1 */
 		pci_read_config_dword(pvt->pci_ch[i][1],
@@ -682,11 +581,13 @@ static int get_dimm_config(const struct mem_ctl_info *mci)
 				MC_DOD_CH_DIMM2, &dimm_dod[2]);
 
 		debugf0("Ch%d phy rd%d, wr%d (0x%08x): "
-			"%d ranks, %cDIMMs\n",
+			"%s%s%s%cDIMMs\n",
 			i,
 			RDLCH(pvt->info.ch_map, i), WRLCH(pvt->info.ch_map, i),
 			data,
-			pvt->channel[i].ranks,
+			pvt->channel[i].is_3dimms_present ? "3DIMMS " : "",
+			pvt->channel[i].is_3dimms_present ? "SINGLE_4R " : "",
+			pvt->channel[i].has_4rank ? "HAS_4R " : "",
 			(data & REGISTERED_DIMM) ? 'R' : 'U');
 
 		for (j = 0; j < 3; j++) {
@@ -696,6 +597,8 @@ static int get_dimm_config(const struct mem_ctl_info *mci)
 			if (!DIMM_PRESENT(dimm_dod[j]))
 				continue;
 
+			dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
+				       i, j, 0);
 			banks = numbank(MC_DOD_NUMBANK(dimm_dod[j]));
 			ranks = numrank(MC_DOD_NUMRANK(dimm_dod[j]));
 			rows = numrow(MC_DOD_NUMROW(dimm_dod[j]));
@@ -704,8 +607,6 @@ static int get_dimm_config(const struct mem_ctl_info *mci)
 			/* DDR3 has 8 I/O banks */
 			size = (rows * cols * banks * ranks) >> (20 - 3);
 
-			pvt->channel[i].dimms++;
-
 			debugf0("\tdimm %d %d Mb offset: %x, "
 				"bank: %d, rank: %d, row: %#x, col: %#x\n",
 				j, size,
@@ -714,44 +615,28 @@ static int get_dimm_config(const struct mem_ctl_info *mci)
 
 			npages = MiB_TO_PAGES(size);
 
-			csr = &mci->csrows[csrow];
-			csr->first_page = last_page + 1;
-			last_page += npages;
-			csr->last_page = last_page;
-			csr->nr_pages = npages;
-
-			csr->page_mask = 0;
-			csr->grain = 8;
-			csr->csrow_idx = csrow;
-			csr->nr_channels = 1;
-
-			csr->channels[0].chan_idx = i;
-			csr->channels[0].ce_count = 0;
-
-			pvt->csrow_map[i][j] = csrow;
+			dimm->nr_pages = npages;
 
 			switch (banks) {
 			case 4:
-				csr->dtype = DEV_X4;
+				dimm->dtype = DEV_X4;
 				break;
 			case 8:
-				csr->dtype = DEV_X8;
+				dimm->dtype = DEV_X8;
 				break;
 			case 16:
-				csr->dtype = DEV_X16;
+				dimm->dtype = DEV_X16;
 				break;
 			default:
-				csr->dtype = DEV_UNKNOWN;
+				dimm->dtype = DEV_UNKNOWN;
 			}
 
-			csr->edac_mode = mode;
-			csr->mtype = mtype;
-			snprintf(csr->channels[0].label,
-					sizeof(csr->channels[0].label),
-					"CPU#%uChannel#%u_DIMM#%u",
-					pvt->i7core_dev->socket, i, j);
-
-			csrow++;
+			snprintf(dimm->label, sizeof(dimm->label),
+				 "CPU#%uChannel#%u_DIMM#%u",
+				 pvt->i7core_dev->socket, i, j);
+			dimm->grain = 8;
+			dimm->edac_mode = mode;
+			dimm->mtype = mtype;
 		}
 
 		pci_read_config_dword(pdev, MC_SAG_CH_0, &value[0]);
@@ -1567,22 +1452,16 @@ error:
 /****************************************************************************
 			Error check routines
  ****************************************************************************/
-static void i7core_rdimm_update_csrow(struct mem_ctl_info *mci,
+static void i7core_rdimm_update_errcount(struct mem_ctl_info *mci,
 				      const int chan,
 				      const int dimm,
 				      const int add)
 {
-	char *msg;
-	struct i7core_pvt *pvt = mci->pvt_info;
-	int row = pvt->csrow_map[chan][dimm], i;
+	int i;
 
 	for (i = 0; i < add; i++) {
-		msg = kasprintf(GFP_KERNEL, "Corrected error "
-				"(Socket=%d channel=%d dimm=%d)",
-				pvt->i7core_dev->socket, chan, dimm);
-
-		edac_mc_handle_fbd_ce(mci, row, 0, msg);
-		kfree (msg);
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 0, 0, 0,
+				     chan, dimm, -1, "error", "", NULL);
 	}
 }
 
@@ -1623,11 +1502,11 @@ static void i7core_rdimm_update_ce_count(struct mem_ctl_info *mci,
 
 	/*updated the edac core */
 	if (add0 != 0)
-		i7core_rdimm_update_csrow(mci, chan, 0, add0);
+		i7core_rdimm_update_errcount(mci, chan, 0, add0);
 	if (add1 != 0)
-		i7core_rdimm_update_csrow(mci, chan, 1, add1);
+		i7core_rdimm_update_errcount(mci, chan, 1, add1);
 	if (add2 != 0)
-		i7core_rdimm_update_csrow(mci, chan, 2, add2);
+		i7core_rdimm_update_errcount(mci, chan, 2, add2);
 
 }
 
@@ -1747,20 +1626,30 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci,
 				    const struct mce *m)
 {
 	struct i7core_pvt *pvt = mci->pvt_info;
-	char *type, *optype, *err, *msg;
+	char *type, *optype, *err, msg[80];
+	enum hw_event_mc_err_type tp_event;
 	unsigned long error = m->status & 0x1ff0000l;
+	bool uncorrected_error = m->mcgstatus & 1ll << 61;
+	bool ripv = m->mcgstatus & 1;
 	u32 optypenum = (m->status >> 4) & 0x07;
 	u32 core_err_cnt = (m->status >> 38) & 0x7fff;
 	u32 dimm = (m->misc >> 16) & 0x3;
 	u32 channel = (m->misc >> 18) & 0x3;
 	u32 syndrome = m->misc >> 32;
 	u32 errnum = find_first_bit(&error, 32);
-	int csrow;
 
-	if (m->mcgstatus & 1)
-		type = "FATAL";
-	else
-		type = "NON_FATAL";
+	if (uncorrected_error) {
+		if (ripv) {
+			type = "FATAL";
+			tp_event = HW_EVENT_ERR_FATAL;
+		} else {
+			type = "NON_FATAL";
+			tp_event = HW_EVENT_ERR_UNCORRECTED;
+		}
+	} else {
+		type = "CORRECTED";
+		tp_event = HW_EVENT_ERR_CORRECTED;
+	}
 
 	switch (optypenum) {
 	case 0:
@@ -1815,27 +1704,20 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci,
 		err = "unknown";
 	}
 
-	/* FIXME: should convert addr into bank and rank information */
-	msg = kasprintf(GFP_ATOMIC,
-		"%s (addr = 0x%08llx, cpu=%d, Dimm=%d, Channel=%d, "
-		"syndrome=0x%08x, count=%d, Err=%08llx:%08llx (%s: %s))\n",
-		type, (long long) m->addr, m->cpu, dimm, channel,
-		syndrome, core_err_cnt, (long long)m->status,
-		(long long)m->misc, optype, err);
-
-	debugf0("%s", msg);
-
-	csrow = pvt->csrow_map[channel][dimm];
+	snprintf(msg, sizeof(msg), "count=%d %s", core_err_cnt, optype);
 
-	/* Call the helper to output message */
-	if (m->mcgstatus & 1)
-		edac_mc_handle_fbd_ue(mci, csrow, 0,
-				0 /* FIXME: should be channel here */, msg);
-	else if (!pvt->is_registered)
-		edac_mc_handle_fbd_ce(mci, csrow,
-				0 /* FIXME: should be channel here */, msg);
-
-	kfree(msg);
+	/*
+	 * Call the helper to output message
+	 * FIXME: what to do if core_err_cnt > 1? Currently, it generates
+	 * only one event
+	 */
+	if (uncorrected_error || !pvt->is_registered)
+		edac_mc_handle_error(tp_event, mci,
+				     m->addr >> PAGE_SHIFT,
+				     m->addr & ~PAGE_MASK,
+				     syndrome,
+				     channel, dimm, -1,
+				     err, msg, m);
 }
 
 /*
@@ -2252,15 +2134,19 @@ static int i7core_register_mci(struct i7core_dev *i7core_dev)
 {
 	struct mem_ctl_info *mci;
 	struct i7core_pvt *pvt;
-	int rc, channels, csrows;
-
-	/* Check the number of active and not disabled channels */
-	rc = i7core_get_active_channels(i7core_dev->socket, &channels, &csrows);
-	if (unlikely(rc < 0))
-		return rc;
+	int rc;
+	struct edac_mc_layer layers[2];
 
 	/* allocate a new MC control structure */
-	mci = edac_mc_alloc(sizeof(*pvt), csrows, channels, i7core_dev->socket);
+
+	layers[0].type = EDAC_MC_LAYER_CHANNEL;
+	layers[0].size = NUM_CHANS;
+	layers[0].is_virt_csrow = false;
+	layers[1].type = EDAC_MC_LAYER_SLOT;
+	layers[1].size = MAX_DIMMS;
+	layers[1].is_virt_csrow = true;
+	mci = edac_mc_alloc(i7core_dev->socket, ARRAY_SIZE(layers), layers,
+			    sizeof(*pvt));
 	if (unlikely(!mci))
 		return -ENOMEM;
 
diff --git a/drivers/edac/i82443bxgx_edac.c b/drivers/edac/i82443bxgx_edac.c
index 3bf2b2f490e7..52072c28a8a6 100644
--- a/drivers/edac/i82443bxgx_edac.c
+++ b/drivers/edac/i82443bxgx_edac.c
@@ -12,7 +12,7 @@
  * 440GX fix by Jason Uhlenkott <[email protected]>.
  *
  * Written with reference to 82443BX Host Bridge Datasheet:
- * http://download.intel.com/design/chipsets/datashts/29063301.pdf 
+ * http://download.intel.com/design/chipsets/datashts/29063301.pdf
  * references to this document given in [].
  *
  * This module doesn't support the 440LX, but it may be possible to
@@ -156,19 +156,19 @@ static int i82443bxgx_edacmc_process_error_info(struct mem_ctl_info *mci,
 	if (info->eap & I82443BXGX_EAP_OFFSET_SBE) {
 		error_found = 1;
 		if (handle_errors)
-			edac_mc_handle_ce(mci, page, pageoffset,
-				/* 440BX/GX don't make syndrome information
-				 * available */
-				0, edac_mc_find_csrow_by_page(mci, page), 0,
-				mci->ctl_name);
+			edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+					     page, pageoffset, 0,
+					     edac_mc_find_csrow_by_page(mci, page),
+					     0, -1, mci->ctl_name, "", NULL);
 	}
 
 	if (info->eap & I82443BXGX_EAP_OFFSET_MBE) {
 		error_found = 1;
 		if (handle_errors)
-			edac_mc_handle_ue(mci, page, pageoffset,
-					edac_mc_find_csrow_by_page(mci, page),
-					mci->ctl_name);
+			edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+					     page, pageoffset, 0,
+					     edac_mc_find_csrow_by_page(mci, page),
+					     0, -1, mci->ctl_name, "", NULL);
 	}
 
 	return error_found;
@@ -189,6 +189,7 @@ static void i82443bxgx_init_csrows(struct mem_ctl_info *mci,
 				enum mem_type mtype)
 {
 	struct csrow_info *csrow;
+	struct dimm_info *dimm;
 	int index;
 	u8 drbar, dramc;
 	u32 row_base, row_high_limit, row_high_limit_last;
@@ -197,6 +198,8 @@ static void i82443bxgx_init_csrows(struct mem_ctl_info *mci,
 	row_high_limit_last = 0;
 	for (index = 0; index < mci->nr_csrows; index++) {
 		csrow = &mci->csrows[index];
+		dimm = csrow->channels[0].dimm;
+
 		pci_read_config_byte(pdev, I82443BXGX_DRB + index, &drbar);
 		debugf1("MC%d: %s: %s() Row=%d DRB = %#0x\n",
 			mci->mc_idx, __FILE__, __func__, index, drbar);
@@ -217,14 +220,14 @@ static void i82443bxgx_init_csrows(struct mem_ctl_info *mci,
 		row_base = row_high_limit_last;
 		csrow->first_page = row_base >> PAGE_SHIFT;
 		csrow->last_page = (row_high_limit >> PAGE_SHIFT) - 1;
-		csrow->nr_pages = csrow->last_page - csrow->first_page + 1;
+		dimm->nr_pages = csrow->last_page - csrow->first_page + 1;
 		/* EAP reports in 4kilobyte granularity [61] */
-		csrow->grain = 1 << 12;
-		csrow->mtype = mtype;
+		dimm->grain = 1 << 12;
+		dimm->mtype = mtype;
 		/* I don't think 440BX can tell you device type? FIXME? */
-		csrow->dtype = DEV_UNKNOWN;
+		dimm->dtype = DEV_UNKNOWN;
 		/* Mode is global to all rows on 440BX */
-		csrow->edac_mode = edac_mode;
+		dimm->edac_mode = edac_mode;
 		row_high_limit_last = row_high_limit;
 	}
 }
@@ -232,6 +235,7 @@ static void i82443bxgx_init_csrows(struct mem_ctl_info *mci,
 static int i82443bxgx_edacmc_probe1(struct pci_dev *pdev, int dev_idx)
 {
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[2];
 	u8 dramc;
 	u32 nbxcfg, ecc_mode;
 	enum mem_type mtype;
@@ -245,8 +249,13 @@ static int i82443bxgx_edacmc_probe1(struct pci_dev *pdev, int dev_idx)
 	if (pci_read_config_dword(pdev, I82443BXGX_NBXCFG, &nbxcfg))
 		return -EIO;
 
-	mci = edac_mc_alloc(0, I82443BXGX_NR_CSROWS, I82443BXGX_NR_CHANS, 0);
-
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = I82443BXGX_NR_CSROWS;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = I82443BXGX_NR_CHANS;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, 0);
 	if (mci == NULL)
 		return -ENOMEM;
 
diff --git a/drivers/edac/i82860_edac.c b/drivers/edac/i82860_edac.c
index c779092d18d1..08045059d10b 100644
--- a/drivers/edac/i82860_edac.c
+++ b/drivers/edac/i82860_edac.c
@@ -99,6 +99,7 @@ static int i82860_process_error_info(struct mem_ctl_info *mci,
 				struct i82860_error_info *info,
 				int handle_errors)
 {
+	struct dimm_info *dimm;
 	int row;
 
 	if (!(info->errsts2 & 0x0003))
@@ -108,18 +109,25 @@ static int i82860_process_error_info(struct mem_ctl_info *mci,
 		return 1;
 
 	if ((info->errsts ^ info->errsts2) & 0x0003) {
-		edac_mc_handle_ce_no_info(mci, "UE overwrote CE");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0,
+				     -1, -1, -1, "UE overwrote CE", "", NULL);
 		info->errsts = info->errsts2;
 	}
 
 	info->eap >>= PAGE_SHIFT;
 	row = edac_mc_find_csrow_by_page(mci, info->eap);
+	dimm = mci->csrows[row].channels[0].dimm;
 
 	if (info->errsts & 0x0002)
-		edac_mc_handle_ue(mci, info->eap, 0, row, "i82860 UE");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     info->eap, 0, 0,
+				     dimm->location[0], dimm->location[1], -1,
+				     "i82860 UE", "", NULL);
 	else
-		edac_mc_handle_ce(mci, info->eap, 0, info->derrsyn, row, 0,
-				"i82860 UE");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     info->eap, 0, info->derrsyn,
+				     dimm->location[0], dimm->location[1], -1,
+				     "i82860 CE", "", NULL);
 
 	return 1;
 }
@@ -140,6 +148,7 @@ static void i82860_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev)
 	u16 value;
 	u32 cumul_size;
 	struct csrow_info *csrow;
+	struct dimm_info *dimm;
 	int index;
 
 	pci_read_config_word(pdev, I82860_MCHCFG, &mchcfg_ddim);
@@ -153,6 +162,8 @@ static void i82860_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev)
 	 */
 	for (index = 0; index < mci->nr_csrows; index++) {
 		csrow = &mci->csrows[index];
+		dimm = csrow->channels[0].dimm;
+
 		pci_read_config_word(pdev, I82860_GBA + index * 2, &value);
 		cumul_size = (value & I82860_GBA_MASK) <<
 			(I82860_GBA_SHIFT - PAGE_SHIFT);
@@ -164,30 +175,38 @@ static void i82860_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev)
 
 		csrow->first_page = last_cumul_size;
 		csrow->last_page = cumul_size - 1;
-		csrow->nr_pages = cumul_size - last_cumul_size;
+		dimm->nr_pages = cumul_size - last_cumul_size;
 		last_cumul_size = cumul_size;
-		csrow->grain = 1 << 12;	/* I82860_EAP has 4KiB reolution */
-		csrow->mtype = MEM_RMBS;
-		csrow->dtype = DEV_UNKNOWN;
-		csrow->edac_mode = mchcfg_ddim ? EDAC_SECDED : EDAC_NONE;
+		dimm->grain = 1 << 12;	/* I82860_EAP has 4KiB reolution */
+		dimm->mtype = MEM_RMBS;
+		dimm->dtype = DEV_UNKNOWN;
+		dimm->edac_mode = mchcfg_ddim ? EDAC_SECDED : EDAC_NONE;
 	}
 }
 
 static int i82860_probe1(struct pci_dev *pdev, int dev_idx)
 {
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[2];
 	struct i82860_error_info discard;
 
-	/* RDRAM has channels but these don't map onto the abstractions that
-	   edac uses.
-	   The device groups from the GRA registers seem to map reasonably
-	   well onto the notion of a chip select row.
-	   There are 16 GRA registers and since the name is associated with
-	   the channel and the GRA registers map to physical devices so we are
-	   going to make 1 channel for group.
+	/*
+	 * RDRAM has channels but these don't map onto the csrow abstraction.
+	 * According with the datasheet, there are 2 Rambus channels, supporting
+	 * up to 16 direct RDRAM devices.
+	 * The device groups from the GRA registers seem to map reasonably
+	 * well onto the notion of a chip select row.
+	 * There are 16 GRA registers and since the name is associated with
+	 * the channel and the GRA registers map to physical devices so we are
+	 * going to make 1 channel for group.
 	 */
-	mci = edac_mc_alloc(0, 16, 1, 0);
-
+	layers[0].type = EDAC_MC_LAYER_CHANNEL;
+	layers[0].size = 2;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_SLOT;
+	layers[1].size = 8;
+	layers[1].is_virt_csrow = true;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, 0);
 	if (!mci)
 		return -ENOMEM;
 
diff --git a/drivers/edac/i82875p_edac.c b/drivers/edac/i82875p_edac.c
index 10f15d85fb5e..b613e31c16e5 100644
--- a/drivers/edac/i82875p_edac.c
+++ b/drivers/edac/i82875p_edac.c
@@ -38,7 +38,8 @@
 #endif				/* PCI_DEVICE_ID_INTEL_82875_6 */
 
 /* four csrows in dual channel, eight in single channel */
-#define I82875P_NR_CSROWS(nr_chans) (8/(nr_chans))
+#define I82875P_NR_DIMMS		8
+#define I82875P_NR_CSROWS(nr_chans)	(I82875P_NR_DIMMS / (nr_chans))
 
 /* Intel 82875p register addresses - device 0 function 0 - DRAM Controller */
 #define I82875P_EAP		0x58	/* Error Address Pointer (32b)
@@ -235,7 +236,9 @@ static int i82875p_process_error_info(struct mem_ctl_info *mci,
 		return 1;
 
 	if ((info->errsts ^ info->errsts2) & 0x0081) {
-		edac_mc_handle_ce_no_info(mci, "UE overwrote CE");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0,
+				     -1, -1, -1,
+				     "UE overwrote CE", "", NULL);
 		info->errsts = info->errsts2;
 	}
 
@@ -243,11 +246,15 @@ static int i82875p_process_error_info(struct mem_ctl_info *mci,
 	row = edac_mc_find_csrow_by_page(mci, info->eap);
 
 	if (info->errsts & 0x0080)
-		edac_mc_handle_ue(mci, info->eap, 0, row, "i82875p UE");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     info->eap, 0, 0,
+				     row, -1, -1,
+				     "i82875p UE", "", NULL);
 	else
-		edac_mc_handle_ce(mci, info->eap, 0, info->derrsyn, row,
-				multi_chan ? (info->des & 0x1) : 0,
-				"i82875p CE");
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     info->eap, 0, info->derrsyn,
+				     row, multi_chan ? (info->des & 0x1) : 0,
+				     -1, "i82875p CE", "", NULL);
 
 	return 1;
 }
@@ -342,11 +349,13 @@ static void i82875p_init_csrows(struct mem_ctl_info *mci,
 				void __iomem * ovrfl_window, u32 drc)
 {
 	struct csrow_info *csrow;
+	struct dimm_info *dimm;
+	unsigned nr_chans = dual_channel_active(drc) + 1;
 	unsigned long last_cumul_size;
 	u8 value;
 	u32 drc_ddim;		/* DRAM Data Integrity Mode 0=none,2=edac */
-	u32 cumul_size;
-	int index;
+	u32 cumul_size, nr_pages;
+	int index, j;
 
 	drc_ddim = (drc >> 18) & 0x1;
 	last_cumul_size = 0;
@@ -369,12 +378,18 @@ static void i82875p_init_csrows(struct mem_ctl_info *mci,
 
 		csrow->first_page = last_cumul_size;
 		csrow->last_page = cumul_size - 1;
-		csrow->nr_pages = cumul_size - last_cumul_size;
+		nr_pages = cumul_size - last_cumul_size;
 		last_cumul_size = cumul_size;
-		csrow->grain = 1 << 12;	/* I82875P_EAP has 4KiB reolution */
-		csrow->mtype = MEM_DDR;
-		csrow->dtype = DEV_UNKNOWN;
-		csrow->edac_mode = drc_ddim ? EDAC_SECDED : EDAC_NONE;
+
+		for (j = 0; j < nr_chans; j++) {
+			dimm = csrow->channels[j].dimm;
+
+			dimm->nr_pages = nr_pages / nr_chans;
+			dimm->grain = 1 << 12;	/* I82875P_EAP has 4KiB reolution */
+			dimm->mtype = MEM_DDR;
+			dimm->dtype = DEV_UNKNOWN;
+			dimm->edac_mode = drc_ddim ? EDAC_SECDED : EDAC_NONE;
+		}
 	}
 }
 
@@ -382,6 +397,7 @@ static int i82875p_probe1(struct pci_dev *pdev, int dev_idx)
 {
 	int rc = -ENODEV;
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[2];
 	struct i82875p_pvt *pvt;
 	struct pci_dev *ovrfl_pdev;
 	void __iomem *ovrfl_window;
@@ -397,9 +413,14 @@ static int i82875p_probe1(struct pci_dev *pdev, int dev_idx)
 		return -ENODEV;
 	drc = readl(ovrfl_window + I82875P_DRC);
 	nr_chans = dual_channel_active(drc) + 1;
-	mci = edac_mc_alloc(sizeof(*pvt), I82875P_NR_CSROWS(nr_chans),
-			nr_chans, 0);
 
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = I82875P_NR_CSROWS(nr_chans);
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = nr_chans;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt));
 	if (!mci) {
 		rc = -ENOMEM;
 		goto fail0;
diff --git a/drivers/edac/i82975x_edac.c b/drivers/edac/i82975x_edac.c
index 0cd8368f88f8..433332c7cdba 100644
--- a/drivers/edac/i82975x_edac.c
+++ b/drivers/edac/i82975x_edac.c
@@ -29,7 +29,8 @@
 #define PCI_DEVICE_ID_INTEL_82975_0	0x277c
 #endif				/* PCI_DEVICE_ID_INTEL_82975_0 */
 
-#define I82975X_NR_CSROWS(nr_chans)		(8/(nr_chans))
+#define I82975X_NR_DIMMS		8
+#define I82975X_NR_CSROWS(nr_chans)	(I82975X_NR_DIMMS / (nr_chans))
 
 /* Intel 82975X register addresses - device 0 function 0 - DRAM Controller */
 #define I82975X_EAP		0x58	/* Dram Error Address Pointer (32b)
@@ -287,7 +288,8 @@ static int i82975x_process_error_info(struct mem_ctl_info *mci,
 		return 1;
 
 	if ((info->errsts ^ info->errsts2) & 0x0003) {
-		edac_mc_handle_ce_no_info(mci, "UE overwrote CE");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0,
+				     -1, -1, -1, "UE overwrote CE", "", NULL);
 		info->errsts = info->errsts2;
 	}
 
@@ -309,13 +311,18 @@ static int i82975x_process_error_info(struct mem_ctl_info *mci,
 	chan = (mci->csrows[row].nr_channels == 1) ? 0 : info->eap & 1;
 	offst = info->eap
 			& ((1 << PAGE_SHIFT) -
-				(1 << mci->csrows[row].grain));
+			   (1 << mci->csrows[row].channels[chan].dimm->grain));
 
 	if (info->errsts & 0x0002)
-		edac_mc_handle_ue(mci, page, offst , row, "i82975x UE");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     page, offst, 0,
+				     row, -1, -1,
+				     "i82975x UE", "", NULL);
 	else
-		edac_mc_handle_ce(mci, page, offst, info->derrsyn, row,
-				chan, "i82975x CE");
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     page, offst, info->derrsyn,
+				     row, chan ? chan : 0, -1,
+				     "i82975x CE", "", NULL);
 
 	return 1;
 }
@@ -370,8 +377,10 @@ static void i82975x_init_csrows(struct mem_ctl_info *mci,
 	struct csrow_info *csrow;
 	unsigned long last_cumul_size;
 	u8 value;
-	u32 cumul_size;
+	u32 cumul_size, nr_pages;
 	int index, chan;
+	struct dimm_info *dimm;
+	enum dev_type dtype;
 
 	last_cumul_size = 0;
 
@@ -400,28 +409,33 @@ static void i82975x_init_csrows(struct mem_ctl_info *mci,
 		debugf3("%s(): (%d) cumul_size 0x%x\n", __func__, index,
 			cumul_size);
 
+		nr_pages = cumul_size - last_cumul_size;
+		if (!nr_pages)
+			continue;
+
 		/*
 		 * Initialise dram labels
 		 * index values:
 		 *   [0-7] for single-channel; i.e. csrow->nr_channels = 1
 		 *   [0-3] for dual-channel; i.e. csrow->nr_channels = 2
 		 */
-		for (chan = 0; chan < csrow->nr_channels; chan++)
-			strncpy(csrow->channels[chan].label,
+		dtype = i82975x_dram_type(mch_window, index);
+		for (chan = 0; chan < csrow->nr_channels; chan++) {
+			dimm = mci->csrows[index].channels[chan].dimm;
+
+			dimm->nr_pages = nr_pages / csrow->nr_channels;
+			strncpy(csrow->channels[chan].dimm->label,
 					labels[(index >> 1) + (chan * 2)],
 					EDAC_MC_LABEL_LEN);
-
-		if (cumul_size == last_cumul_size)
-			continue;	/* not populated */
+			dimm->grain = 1 << 7;	/* 128Byte cache-line resolution */
+			dimm->dtype = i82975x_dram_type(mch_window, index);
+			dimm->mtype = MEM_DDR2; /* I82975x supports only DDR2 */
+			dimm->edac_mode = EDAC_SECDED; /* only supported */
+		}
 
 		csrow->first_page = last_cumul_size;
 		csrow->last_page = cumul_size - 1;
-		csrow->nr_pages = cumul_size - last_cumul_size;
 		last_cumul_size = cumul_size;
-		csrow->grain = 1 << 7;	/* 128Byte cache-line resolution */
-		csrow->mtype = MEM_DDR2; /* I82975x supports only DDR2 */
-		csrow->dtype = i82975x_dram_type(mch_window, index);
-		csrow->edac_mode = EDAC_SECDED; /* only supported */
 	}
 }
 
@@ -463,6 +477,7 @@ static int i82975x_probe1(struct pci_dev *pdev, int dev_idx)
 {
 	int rc = -ENODEV;
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[2];
 	struct i82975x_pvt *pvt;
 	void __iomem *mch_window;
 	u32 mchbar;
@@ -531,8 +546,13 @@ static int i82975x_probe1(struct pci_dev *pdev, int dev_idx)
 	chans = dual_channel_active(mch_window) + 1;
 
 	/* assuming only one controller, index thus is 0 */
-	mci = edac_mc_alloc(sizeof(*pvt), I82975X_NR_CSROWS(chans),
-					chans, 0);
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = I82975X_NR_DIMMS;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = I82975X_NR_CSROWS(chans);
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt));
 	if (!mci) {
 		rc = -ENOMEM;
 		goto fail1;
diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c
index 73464a62adf7..4c402353ba98 100644
--- a/drivers/edac/mpc85xx_edac.c
+++ b/drivers/edac/mpc85xx_edac.c
@@ -854,12 +854,16 @@ static void mpc85xx_mc_check(struct mem_ctl_info *mci)
 		mpc85xx_mc_printk(mci, KERN_ERR, "PFN out of range!\n");
 
 	if (err_detect & DDR_EDE_SBE)
-		edac_mc_handle_ce(mci, pfn, err_addr & ~PAGE_MASK,
-				  syndrome, row_index, 0, mci->ctl_name);
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     pfn, err_addr & ~PAGE_MASK, syndrome,
+				     row_index, 0, -1,
+				     mci->ctl_name, "", NULL);
 
 	if (err_detect & DDR_EDE_MBE)
-		edac_mc_handle_ue(mci, pfn, err_addr & ~PAGE_MASK,
-				  row_index, mci->ctl_name);
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     pfn, err_addr & ~PAGE_MASK, syndrome,
+				     row_index, 0, -1,
+				     mci->ctl_name, "", NULL);
 
 	out_be32(pdata->mc_vbase + MPC85XX_MC_ERR_DETECT, err_detect);
 }
@@ -883,6 +887,7 @@ static void __devinit mpc85xx_init_csrows(struct mem_ctl_info *mci)
 {
 	struct mpc85xx_mc_pdata *pdata = mci->pvt_info;
 	struct csrow_info *csrow;
+	struct dimm_info *dimm;
 	u32 sdram_ctl;
 	u32 sdtype;
 	enum mem_type mtype;
@@ -929,6 +934,8 @@ static void __devinit mpc85xx_init_csrows(struct mem_ctl_info *mci)
 		u32 end;
 
 		csrow = &mci->csrows[index];
+		dimm = csrow->channels[0].dimm;
+
 		cs_bnds = in_be32(pdata->mc_vbase + MPC85XX_MC_CS_BNDS_0 +
 				  (index * MPC85XX_MC_CS_BNDS_OFS));
 
@@ -944,19 +951,21 @@ static void __devinit mpc85xx_init_csrows(struct mem_ctl_info *mci)
 
 		csrow->first_page = start;
 		csrow->last_page = end;
-		csrow->nr_pages = end + 1 - start;
-		csrow->grain = 8;
-		csrow->mtype = mtype;
-		csrow->dtype = DEV_UNKNOWN;
+
+		dimm->nr_pages = end + 1 - start;
+		dimm->grain = 8;
+		dimm->mtype = mtype;
+		dimm->dtype = DEV_UNKNOWN;
 		if (sdram_ctl & DSC_X32_EN)
-			csrow->dtype = DEV_X32;
-		csrow->edac_mode = EDAC_SECDED;
+			dimm->dtype = DEV_X32;
+		dimm->edac_mode = EDAC_SECDED;
 	}
 }
 
 static int __devinit mpc85xx_mc_err_probe(struct platform_device *op)
 {
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[2];
 	struct mpc85xx_mc_pdata *pdata;
 	struct resource r;
 	u32 sdram_ctl;
@@ -965,7 +974,13 @@ static int __devinit mpc85xx_mc_err_probe(struct platform_device *op)
 	if (!devres_open_group(&op->dev, mpc85xx_mc_err_probe, GFP_KERNEL))
 		return -ENOMEM;
 
-	mci = edac_mc_alloc(sizeof(*pdata), 4, 1, edac_mc_idx);
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = 4;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = 1;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(edac_mc_idx, ARRAY_SIZE(layers), sizeof(*pdata));
 	if (!mci) {
 		devres_release_group(&op->dev, mpc85xx_mc_err_probe);
 		return -ENOMEM;
diff --git a/drivers/edac/mv64x60_edac.c b/drivers/edac/mv64x60_edac.c
index 7e5ff367705c..b0bb5a3d2527 100644
--- a/drivers/edac/mv64x60_edac.c
+++ b/drivers/edac/mv64x60_edac.c
@@ -611,12 +611,17 @@ static void mv64x60_mc_check(struct mem_ctl_info *mci)
 
 	/* first bit clear in ECC Err Reg, 1 bit error, correctable by HW */
 	if (!(reg & 0x1))
-		edac_mc_handle_ce(mci, err_addr >> PAGE_SHIFT,
-				  err_addr & PAGE_MASK, syndrome, 0, 0,
-				  mci->ctl_name);
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     err_addr >> PAGE_SHIFT,
+				     err_addr & PAGE_MASK, syndrome,
+				     0, 0, -1,
+				     mci->ctl_name, "", NULL);
 	else	/* 2 bit error, UE */
-		edac_mc_handle_ue(mci, err_addr >> PAGE_SHIFT,
-				  err_addr & PAGE_MASK, 0, mci->ctl_name);
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     err_addr >> PAGE_SHIFT,
+				     err_addr & PAGE_MASK, 0,
+				     0, 0, -1,
+				     mci->ctl_name, "", NULL);
 
 	/* clear the error */
 	out_le32(pdata->mc_vbase + MV64X60_SDRAM_ERR_ADDR, 0);
@@ -656,6 +661,8 @@ static void mv64x60_init_csrows(struct mem_ctl_info *mci,
 				struct mv64x60_mc_pdata *pdata)
 {
 	struct csrow_info *csrow;
+	struct dimm_info *dimm;
+
 	u32 devtype;
 	u32 ctl;
 
@@ -664,35 +671,36 @@ static void mv64x60_init_csrows(struct mem_ctl_info *mci,
 	ctl = in_le32(pdata->mc_vbase + MV64X60_SDRAM_CONFIG);
 
 	csrow = &mci->csrows[0];
-	csrow->first_page = 0;
-	csrow->nr_pages = pdata->total_mem >> PAGE_SHIFT;
-	csrow->last_page = csrow->first_page + csrow->nr_pages - 1;
-	csrow->grain = 8;
+	dimm = csrow->channels[0].dimm;
+
+	dimm->nr_pages = pdata->total_mem >> PAGE_SHIFT;
+	dimm->grain = 8;
 
-	csrow->mtype = (ctl & MV64X60_SDRAM_REGISTERED) ? MEM_RDDR : MEM_DDR;
+	dimm->mtype = (ctl & MV64X60_SDRAM_REGISTERED) ? MEM_RDDR : MEM_DDR;
 
 	devtype = (ctl >> 20) & 0x3;
 	switch (devtype) {
 	case 0x0:
-		csrow->dtype = DEV_X32;
+		dimm->dtype = DEV_X32;
 		break;
 	case 0x2:		/* could be X8 too, but no way to tell */
-		csrow->dtype = DEV_X16;
+		dimm->dtype = DEV_X16;
 		break;
 	case 0x3:
-		csrow->dtype = DEV_X4;
+		dimm->dtype = DEV_X4;
 		break;
 	default:
-		csrow->dtype = DEV_UNKNOWN;
+		dimm->dtype = DEV_UNKNOWN;
 		break;
 	}
 
-	csrow->edac_mode = EDAC_SECDED;
+	dimm->edac_mode = EDAC_SECDED;
 }
 
 static int __devinit mv64x60_mc_err_probe(struct platform_device *pdev)
 {
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[2];
 	struct mv64x60_mc_pdata *pdata;
 	struct resource *r;
 	u32 ctl;
@@ -701,7 +709,14 @@ static int __devinit mv64x60_mc_err_probe(struct platform_device *pdev)
 	if (!devres_open_group(&pdev->dev, mv64x60_mc_err_probe, GFP_KERNEL))
 		return -ENOMEM;
 
-	mci = edac_mc_alloc(sizeof(struct mv64x60_mc_pdata), 1, 1, edac_mc_idx);
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = 1;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = 1;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(edac_mc_idx, ARRAY_SIZE(layers), layers,
+			    sizeof(struct mv64x60_mc_pdata));
 	if (!mci) {
 		printk(KERN_ERR "%s: No memory for CPU err\n", __func__);
 		devres_release_group(&pdev->dev, mv64x60_mc_err_probe);
diff --git a/drivers/edac/pasemi_edac.c b/drivers/edac/pasemi_edac.c
index 7f71ee436744..b095a906a994 100644
--- a/drivers/edac/pasemi_edac.c
+++ b/drivers/edac/pasemi_edac.c
@@ -110,15 +110,16 @@ static void pasemi_edac_process_error_info(struct mem_ctl_info *mci, u32 errsta)
 	/* uncorrectable/multi-bit errors */
 	if (errsta & (MCDEBUG_ERRSTA_MBE_STATUS |
 		      MCDEBUG_ERRSTA_RFL_STATUS)) {
-		edac_mc_handle_ue(mci, mci->csrows[cs].first_page, 0,
-				  cs, mci->ctl_name);
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     mci->csrows[cs].first_page, 0, 0,
+				     cs, 0, -1, mci->ctl_name, "", NULL);
 	}
 
 	/* correctable/single-bit errors */
-	if (errsta & MCDEBUG_ERRSTA_SBE_STATUS) {
-		edac_mc_handle_ce(mci, mci->csrows[cs].first_page, 0,
-				  0, cs, 0, mci->ctl_name);
-	}
+	if (errsta & MCDEBUG_ERRSTA_SBE_STATUS)
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     mci->csrows[cs].first_page, 0, 0,
+				     cs, 0, -1, mci->ctl_name, "", NULL);
 }
 
 static void pasemi_edac_check(struct mem_ctl_info *mci)
@@ -135,11 +136,13 @@ static int pasemi_edac_init_csrows(struct mem_ctl_info *mci,
 				   enum edac_type edac_mode)
 {
 	struct csrow_info *csrow;
+	struct dimm_info *dimm;
 	u32 rankcfg;
 	int index;
 
 	for (index = 0; index < mci->nr_csrows; index++) {
 		csrow = &mci->csrows[index];
+		dimm = csrow->channels[0].dimm;
 
 		pci_read_config_dword(pdev,
 				      MCDRAM_RANKCFG + (index * 12),
@@ -151,20 +154,20 @@ static int pasemi_edac_init_csrows(struct mem_ctl_info *mci,
 		switch ((rankcfg & MCDRAM_RANKCFG_TYPE_SIZE_M) >>
 			MCDRAM_RANKCFG_TYPE_SIZE_S) {
 		case 0:
-			csrow->nr_pages = 128 << (20 - PAGE_SHIFT);
+			dimm->nr_pages = 128 << (20 - PAGE_SHIFT);
 			break;
 		case 1:
-			csrow->nr_pages = 256 << (20 - PAGE_SHIFT);
+			dimm->nr_pages = 256 << (20 - PAGE_SHIFT);
 			break;
 		case 2:
 		case 3:
-			csrow->nr_pages = 512 << (20 - PAGE_SHIFT);
+			dimm->nr_pages = 512 << (20 - PAGE_SHIFT);
 			break;
 		case 4:
-			csrow->nr_pages = 1024 << (20 - PAGE_SHIFT);
+			dimm->nr_pages = 1024 << (20 - PAGE_SHIFT);
 			break;
 		case 5:
-			csrow->nr_pages = 2048 << (20 - PAGE_SHIFT);
+			dimm->nr_pages = 2048 << (20 - PAGE_SHIFT);
 			break;
 		default:
 			edac_mc_printk(mci, KERN_ERR,
@@ -174,13 +177,13 @@ static int pasemi_edac_init_csrows(struct mem_ctl_info *mci,
 		}
 
 		csrow->first_page = last_page_in_mmc;
-		csrow->last_page = csrow->first_page + csrow->nr_pages - 1;
-		last_page_in_mmc += csrow->nr_pages;
+		csrow->last_page = csrow->first_page + dimm->nr_pages - 1;
+		last_page_in_mmc += dimm->nr_pages;
 		csrow->page_mask = 0;
-		csrow->grain = PASEMI_EDAC_ERROR_GRAIN;
-		csrow->mtype = MEM_DDR;
-		csrow->dtype = DEV_UNKNOWN;
-		csrow->edac_mode = edac_mode;
+		dimm->grain = PASEMI_EDAC_ERROR_GRAIN;
+		dimm->mtype = MEM_DDR;
+		dimm->dtype = DEV_UNKNOWN;
+		dimm->edac_mode = edac_mode;
 	}
 	return 0;
 }
@@ -189,6 +192,7 @@ static int __devinit pasemi_edac_probe(struct pci_dev *pdev,
 		const struct pci_device_id *ent)
 {
 	struct mem_ctl_info *mci = NULL;
+	struct edac_mc_layer layers[2];
 	u32 errctl1, errcor, scrub, mcen;
 
 	pci_read_config_dword(pdev, MCCFG_MCEN, &mcen);
@@ -205,9 +209,14 @@ static int __devinit pasemi_edac_probe(struct pci_dev *pdev,
 		MCDEBUG_ERRCTL1_RFL_LOG_EN;
 	pci_write_config_dword(pdev, MCDEBUG_ERRCTL1, errctl1);
 
-	mci = edac_mc_alloc(0, PASEMI_EDAC_NR_CSROWS, PASEMI_EDAC_NR_CHANS,
-				system_mmc_id++);
-
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = PASEMI_EDAC_NR_CSROWS;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = PASEMI_EDAC_NR_CHANS;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(system_mmc_id++, ARRAY_SIZE(layers), layers,
+			    0);
 	if (mci == NULL)
 		return -ENOMEM;
 
diff --git a/drivers/edac/ppc4xx_edac.c b/drivers/edac/ppc4xx_edac.c
index d427c69bb8b1..f3f9fed06ad7 100644
--- a/drivers/edac/ppc4xx_edac.c
+++ b/drivers/edac/ppc4xx_edac.c
@@ -727,7 +727,10 @@ ppc4xx_edac_handle_ce(struct mem_ctl_info *mci,
 
 	for (row = 0; row < mci->nr_csrows; row++)
 		if (ppc4xx_edac_check_bank_error(status, row))
-			edac_mc_handle_ce_no_info(mci, message);
+			edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+					     0, 0, 0,
+					     row, 0, -1,
+					     message, "", NULL);
 }
 
 /**
@@ -755,7 +758,10 @@ ppc4xx_edac_handle_ue(struct mem_ctl_info *mci,
 
 	for (row = 0; row < mci->nr_csrows; row++)
 		if (ppc4xx_edac_check_bank_error(status, row))
-			edac_mc_handle_ue(mci, page, offset, row, message);
+			edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+					     page, offset, 0,
+					     row, 0, -1,
+					     message, "", NULL);
 }
 
 /**
@@ -895,9 +901,8 @@ ppc4xx_edac_init_csrows(struct mem_ctl_info *mci, u32 mcopt1)
 	enum mem_type mtype;
 	enum dev_type dtype;
 	enum edac_type edac_mode;
-	int row;
-	u32 mbxcf, size;
-	static u32 ppc4xx_last_page;
+	int row, j;
+	u32 mbxcf, size, nr_pages;
 
 	/* Establish the memory type and width */
 
@@ -948,7 +953,7 @@ ppc4xx_edac_init_csrows(struct mem_ctl_info *mci, u32 mcopt1)
 		case SDRAM_MBCF_SZ_2GB:
 		case SDRAM_MBCF_SZ_4GB:
 		case SDRAM_MBCF_SZ_8GB:
-			csi->nr_pages = SDRAM_MBCF_SZ_TO_PAGES(size);
+			nr_pages = SDRAM_MBCF_SZ_TO_PAGES(size);
 			break;
 		default:
 			ppc4xx_edac_mc_printk(KERN_ERR, mci,
@@ -959,10 +964,6 @@ ppc4xx_edac_init_csrows(struct mem_ctl_info *mci, u32 mcopt1)
 			goto done;
 		}
 
-		csi->first_page = ppc4xx_last_page;
-		csi->last_page	= csi->first_page + csi->nr_pages - 1;
-		csi->page_mask	= 0;
-
 		/*
 		 * It's unclear exactly what grain should be set to
 		 * here. The SDRAM_ECCES register allows resolution of
@@ -975,15 +976,17 @@ ppc4xx_edac_init_csrows(struct mem_ctl_info *mci, u32 mcopt1)
 		 * possible values would be the PLB width (16), the
 		 * page size (PAGE_SIZE) or the memory width (2 or 4).
 		 */
+		for (j = 0; j < csi->nr_channels; j++) {
+			struct dimm_info *dimm = csi->channels[j].dimm;
 
-		csi->grain	= 1;
-
-		csi->mtype	= mtype;
-		csi->dtype	= dtype;
+			dimm->nr_pages  = nr_pages / csi->nr_channels;
+			dimm->grain	= 1;
 
-		csi->edac_mode	= edac_mode;
+			dimm->mtype	= mtype;
+			dimm->dtype	= dtype;
 
-		ppc4xx_last_page += csi->nr_pages;
+			dimm->edac_mode	= edac_mode;
+		}
 	}
 
  done:
@@ -1236,6 +1239,7 @@ static int __devinit ppc4xx_edac_probe(struct platform_device *op)
 	dcr_host_t dcr_host;
 	const struct device_node *np = op->dev.of_node;
 	struct mem_ctl_info *mci = NULL;
+	struct edac_mc_layer layers[2];
 	static int ppc4xx_edac_instance;
 
 	/*
@@ -1281,12 +1285,14 @@ static int __devinit ppc4xx_edac_probe(struct platform_device *op)
 	 * controller instance and perform the appropriate
 	 * initialization.
 	 */
-
-	mci = edac_mc_alloc(sizeof(struct ppc4xx_edac_pdata),
-			    ppc4xx_edac_nr_csrows,
-			    ppc4xx_edac_nr_chans,
-			    ppc4xx_edac_instance);
-
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = ppc4xx_edac_nr_csrows;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = ppc4xx_edac_nr_chans;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(ppc4xx_edac_instance, ARRAY_SIZE(layers), layers,
+			    sizeof(struct ppc4xx_edac_pdata));
 	if (mci == NULL) {
 		ppc4xx_edac_printk(KERN_ERR, "%s: "
 				   "Failed to allocate EDAC MC instance!\n",
diff --git a/drivers/edac/r82600_edac.c b/drivers/edac/r82600_edac.c
index 6d908ad72d64..e1cacd164f31 100644
--- a/drivers/edac/r82600_edac.c
+++ b/drivers/edac/r82600_edac.c
@@ -179,10 +179,11 @@ static int r82600_process_error_info(struct mem_ctl_info *mci,
 		error_found = 1;
 
 		if (handle_errors)
-			edac_mc_handle_ce(mci, page, 0,	/* not avail */
-					syndrome,
-					edac_mc_find_csrow_by_page(mci, page),
-					0, mci->ctl_name);
+			edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+					     page, 0, syndrome,
+					     edac_mc_find_csrow_by_page(mci, page),
+					     0, -1,
+					     mci->ctl_name, "", NULL);
 	}
 
 	if (info->eapr & BIT(1)) {	/* UE? */
@@ -190,9 +191,11 @@ static int r82600_process_error_info(struct mem_ctl_info *mci,
 
 		if (handle_errors)
 			/* 82600 doesn't give enough info */
-			edac_mc_handle_ue(mci, page, 0,
-					edac_mc_find_csrow_by_page(mci, page),
-					mci->ctl_name);
+			edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+					     page, 0, 0,
+					     edac_mc_find_csrow_by_page(mci, page),
+					     0, -1,
+					     mci->ctl_name, "", NULL);
 	}
 
 	return error_found;
@@ -216,6 +219,7 @@ static void r82600_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev,
 			u8 dramcr)
 {
 	struct csrow_info *csrow;
+	struct dimm_info *dimm;
 	int index;
 	u8 drbar;		/* SDRAM Row Boundary Address Register */
 	u32 row_high_limit, row_high_limit_last;
@@ -227,6 +231,7 @@ static void r82600_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev,
 
 	for (index = 0; index < mci->nr_csrows; index++) {
 		csrow = &mci->csrows[index];
+		dimm = csrow->channels[0].dimm;
 
 		/* find the DRAM Chip Select Base address and mask */
 		pci_read_config_byte(pdev, R82600_DRBA + index, &drbar);
@@ -247,16 +252,17 @@ static void r82600_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev,
 
 		csrow->first_page = row_base >> PAGE_SHIFT;
 		csrow->last_page = (row_high_limit >> PAGE_SHIFT) - 1;
-		csrow->nr_pages = csrow->last_page - csrow->first_page + 1;
+
+		dimm->nr_pages = csrow->last_page - csrow->first_page + 1;
 		/* Error address is top 19 bits - so granularity is      *
 		 * 14 bits                                               */
-		csrow->grain = 1 << 14;
-		csrow->mtype = reg_sdram ? MEM_RDDR : MEM_DDR;
+		dimm->grain = 1 << 14;
+		dimm->mtype = reg_sdram ? MEM_RDDR : MEM_DDR;
 		/* FIXME - check that this is unknowable with this chipset */
-		csrow->dtype = DEV_UNKNOWN;
+		dimm->dtype = DEV_UNKNOWN;
 
 		/* Mode is global on 82600 */
-		csrow->edac_mode = ecc_on ? EDAC_SECDED : EDAC_NONE;
+		dimm->edac_mode = ecc_on ? EDAC_SECDED : EDAC_NONE;
 		row_high_limit_last = row_high_limit;
 	}
 }
@@ -264,6 +270,7 @@ static void r82600_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev,
 static int r82600_probe1(struct pci_dev *pdev, int dev_idx)
 {
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[2];
 	u8 dramcr;
 	u32 eapr;
 	u32 scrub_disabled;
@@ -278,8 +285,13 @@ static int r82600_probe1(struct pci_dev *pdev, int dev_idx)
 	debugf2("%s(): sdram refresh rate = %#0x\n", __func__,
 		sdram_refresh_rate);
 	debugf2("%s(): DRAMC register = %#0x\n", __func__, dramcr);
-	mci = edac_mc_alloc(0, R82600_NR_CSROWS, R82600_NR_CHANS, 0);
-
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = R82600_NR_CSROWS;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = R82600_NR_CHANS;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, 0);
 	if (mci == NULL)
 		return -ENOMEM;
 
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index 123204f8e23b..4adaf4b7da99 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -314,8 +314,6 @@ struct sbridge_pvt {
 	struct sbridge_info	info;
 	struct sbridge_channel	channel[NUM_CHANNELS];
 
-	int 			csrow_map[NUM_CHANNELS][MAX_DIMMS];
-
 	/* Memory type detection */
 	bool			is_mirrored, is_lockstep, is_close_pg;
 
@@ -487,29 +485,14 @@ static struct pci_dev *get_pdev_slot_func(u8 bus, unsigned slot,
 }
 
 /**
- * sbridge_get_active_channels() - gets the number of channels and csrows
+ * check_if_ecc_is_active() - Checks if ECC is active
  * bus:		Device bus
- * @channels:	Number of channels that will be returned
- * @csrows:	Number of csrows found
- *
- * Since EDAC core needs to know in advance the number of available channels
- * and csrows, in order to allocate memory for csrows/channels, it is needed
- * to run two similar steps. At the first step, implemented on this function,
- * it checks the number of csrows/channels present at one socket, identified
- * by the associated PCI bus.
- * this is used in order to properly allocate the size of mci components.
- * Note: one csrow is one dimm.
  */
-static int sbridge_get_active_channels(const u8 bus, unsigned *channels,
-				      unsigned *csrows)
+static int check_if_ecc_is_active(const u8 bus)
 {
 	struct pci_dev *pdev = NULL;
-	int i, j;
 	u32 mcmtr;
 
-	*channels = 0;
-	*csrows = 0;
-
 	pdev = get_pdev_slot_func(bus, 15, 0);
 	if (!pdev) {
 		sbridge_printk(KERN_ERR, "Couldn't find PCI device "
@@ -523,41 +506,14 @@ static int sbridge_get_active_channels(const u8 bus, unsigned *channels,
 		sbridge_printk(KERN_ERR, "ECC is disabled. Aborting\n");
 		return -ENODEV;
 	}
-
-	for (i = 0; i < NUM_CHANNELS; i++) {
-		u32 mtr;
-
-		/* Device 15 functions 2 - 5  */
-		pdev = get_pdev_slot_func(bus, 15, 2 + i);
-		if (!pdev) {
-			sbridge_printk(KERN_ERR, "Couldn't find PCI device "
-						 "%2x.%02d.%d!!!\n",
-						 bus, 15, 2 + i);
-			return -ENODEV;
-		}
-		(*channels)++;
-
-		for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) {
-			pci_read_config_dword(pdev, mtr_regs[j], &mtr);
-			debugf1("Bus#%02x channel #%d  MTR%d = %x\n", bus, i, j, mtr);
-			if (IS_DIMM_PRESENT(mtr))
-				(*csrows)++;
-		}
-	}
-
-	debugf0("Number of active channels: %d, number of active dimms: %d\n",
-		*channels, *csrows);
-
 	return 0;
 }
 
-static int get_dimm_config(const struct mem_ctl_info *mci)
+static int get_dimm_config(struct mem_ctl_info *mci)
 {
 	struct sbridge_pvt *pvt = mci->pvt_info;
-	struct csrow_info *csr;
+	struct dimm_info *dimm;
 	int i, j, banks, ranks, rows, cols, size, npages;
-	int csrow = 0;
-	unsigned long last_page = 0;
 	u32 reg;
 	enum edac_type mode;
 	enum mem_type mtype;
@@ -616,6 +572,8 @@ static int get_dimm_config(const struct mem_ctl_info *mci)
 		u32 mtr;
 
 		for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) {
+			dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
+				       i, j, 0);
 			pci_read_config_dword(pvt->pci_tad[i],
 					      mtr_regs[j], &mtr);
 			debugf4("Channel #%d  MTR%d = %x\n", i, j, mtr);
@@ -634,29 +592,15 @@ static int get_dimm_config(const struct mem_ctl_info *mci)
 					pvt->sbridge_dev->mc, i, j,
 					size, npages,
 					banks, ranks, rows, cols);
-				csr = &mci->csrows[csrow];
-
-				csr->first_page = last_page;
-				csr->last_page = last_page + npages - 1;
-				csr->page_mask = 0UL;	/* Unused */
-				csr->nr_pages = npages;
-				csr->grain = 32;
-				csr->csrow_idx = csrow;
-				csr->dtype = (banks == 8) ? DEV_X8 : DEV_X4;
-				csr->ce_count = 0;
-				csr->ue_count = 0;
-				csr->mtype = mtype;
-				csr->edac_mode = mode;
-				csr->nr_channels = 1;
-				csr->channels[0].chan_idx = i;
-				csr->channels[0].ce_count = 0;
-				pvt->csrow_map[i][j] = csrow;
-				snprintf(csr->channels[0].label,
-					 sizeof(csr->channels[0].label),
+
+				dimm->nr_pages = npages;
+				dimm->grain = 32;
+				dimm->dtype = (banks == 8) ? DEV_X8 : DEV_X4;
+				dimm->mtype = mtype;
+				dimm->edac_mode = mode;
+				snprintf(dimm->label, sizeof(dimm->label),
 					 "CPU_SrcID#%u_Channel#%u_DIMM#%u",
 					 pvt->sbridge_dev->source_id, i, j);
-				last_page += npages;
-				csrow++;
 			}
 		}
 	}
@@ -844,11 +788,10 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 				 u8 *socket,
 				 long *channel_mask,
 				 u8 *rank,
-				 char *area_type)
+				 char **area_type, char *msg)
 {
 	struct mem_ctl_info	*new_mci;
 	struct sbridge_pvt *pvt = mci->pvt_info;
-	char			msg[256];
 	int 			n_rir, n_sads, n_tads, sad_way, sck_xch;
 	int			sad_interl, idx, base_ch;
 	int			interleave_mode;
@@ -870,12 +813,10 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 	 */
 	if ((addr > (u64) pvt->tolm) && (addr < (1LL << 32))) {
 		sprintf(msg, "Error at TOLM area, on addr 0x%08Lx", addr);
-		edac_mc_handle_ce_no_info(mci, msg);
 		return -EINVAL;
 	}
 	if (addr >= (u64)pvt->tohm) {
 		sprintf(msg, "Error at MMIOH area, on addr 0x%016Lx", addr);
-		edac_mc_handle_ce_no_info(mci, msg);
 		return -EINVAL;
 	}
 
@@ -892,7 +833,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 		limit = SAD_LIMIT(reg);
 		if (limit <= prv) {
 			sprintf(msg, "Can't discover the memory socket");
-			edac_mc_handle_ce_no_info(mci, msg);
 			return -EINVAL;
 		}
 		if  (addr <= limit)
@@ -901,10 +841,9 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 	}
 	if (n_sads == MAX_SAD) {
 		sprintf(msg, "Can't discover the memory socket");
-		edac_mc_handle_ce_no_info(mci, msg);
 		return -EINVAL;
 	}
-	area_type = get_dram_attr(reg);
+	*area_type = get_dram_attr(reg);
 	interleave_mode = INTERLEAVE_MODE(reg);
 
 	pci_read_config_dword(pvt->pci_sad0, interleave_list[n_sads],
@@ -942,7 +881,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 		break;
 	default:
 		sprintf(msg, "Can't discover socket interleave");
-		edac_mc_handle_ce_no_info(mci, msg);
 		return -EINVAL;
 	}
 	*socket = sad_interleave[idx];
@@ -957,7 +895,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 	if (!new_mci) {
 		sprintf(msg, "Struct for socket #%u wasn't initialized",
 			*socket);
-		edac_mc_handle_ce_no_info(mci, msg);
 		return -EINVAL;
 	}
 	mci = new_mci;
@@ -973,7 +910,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 		limit = TAD_LIMIT(reg);
 		if (limit <= prv) {
 			sprintf(msg, "Can't discover the memory channel");
-			edac_mc_handle_ce_no_info(mci, msg);
 			return -EINVAL;
 		}
 		if  (addr <= limit)
@@ -1013,7 +949,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 		break;
 	default:
 		sprintf(msg, "Can't discover the TAD target");
-		edac_mc_handle_ce_no_info(mci, msg);
 		return -EINVAL;
 	}
 	*channel_mask = 1 << base_ch;
@@ -1027,7 +962,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 			break;
 		default:
 			sprintf(msg, "Invalid mirror set. Can't decode addr");
-			edac_mc_handle_ce_no_info(mci, msg);
 			return -EINVAL;
 		}
 	} else
@@ -1055,7 +989,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 	if (offset > addr) {
 		sprintf(msg, "Can't calculate ch addr: TAD offset 0x%08Lx is too high for addr 0x%08Lx!",
 			offset, addr);
-		edac_mc_handle_ce_no_info(mci, msg);
 		return -EINVAL;
 	}
 	addr -= offset;
@@ -1095,7 +1028,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 	if (n_rir == MAX_RIR_RANGES) {
 		sprintf(msg, "Can't discover the memory rank for ch addr 0x%08Lx",
 			ch_addr);
-		edac_mc_handle_ce_no_info(mci, msg);
 		return -EINVAL;
 	}
 	rir_way = RIR_WAY(reg);
@@ -1409,7 +1341,8 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
 {
 	struct mem_ctl_info *new_mci;
 	struct sbridge_pvt *pvt = mci->pvt_info;
-	char *type, *optype, *msg, *recoverable_msg;
+	enum hw_event_mc_err_type tp_event;
+	char *type, *optype, msg[256];
 	bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
 	bool overflow = GET_BITFIELD(m->status, 62, 62);
 	bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
@@ -1421,13 +1354,21 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
 	u32 optypenum = GET_BITFIELD(m->status, 4, 6);
 	long channel_mask, first_channel;
 	u8  rank, socket;
-	int csrow, rc, dimm;
-	char *area_type = "Unknown";
-
-	if (ripv)
-		type = "NON_FATAL";
-	else
-		type = "FATAL";
+	int rc, dimm;
+	char *area_type = NULL;
+
+	if (uncorrected_error) {
+		if (ripv) {
+			type = "FATAL";
+			tp_event = HW_EVENT_ERR_FATAL;
+		} else {
+			type = "NON_FATAL";
+			tp_event = HW_EVENT_ERR_UNCORRECTED;
+		}
+	} else {
+		type = "CORRECTED";
+		tp_event = HW_EVENT_ERR_CORRECTED;
+	}
 
 	/*
 	 * According with Table 15-9 of the Intel Architecture spec vol 3A,
@@ -1445,19 +1386,19 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
 	} else {
 		switch (optypenum) {
 		case 0:
-			optype = "generic undef request";
+			optype = "generic undef request error";
 			break;
 		case 1:
-			optype = "memory read";
+			optype = "memory read error";
 			break;
 		case 2:
-			optype = "memory write";
+			optype = "memory write error";
 			break;
 		case 3:
-			optype = "addr/cmd";
+			optype = "addr/cmd error";
 			break;
 		case 4:
-			optype = "memory scrubbing";
+			optype = "memory scrubbing error";
 			break;
 		default:
 			optype = "reserved";
@@ -1466,13 +1407,13 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
 	}
 
 	rc = get_memory_error_data(mci, m->addr, &socket,
-				   &channel_mask, &rank, area_type);
+				   &channel_mask, &rank, &area_type, msg);
 	if (rc < 0)
-		return;
+		goto err_parsing;
 	new_mci = get_mci_for_node_id(socket);
 	if (!new_mci) {
-		edac_mc_handle_ce_no_info(mci, "Error: socket got corrupted!");
-		return;
+		strcpy(msg, "Error: socket got corrupted!");
+		goto err_parsing;
 	}
 	mci = new_mci;
 	pvt = mci->pvt_info;
@@ -1486,45 +1427,39 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
 	else
 		dimm = 2;
 
-	csrow = pvt->csrow_map[first_channel][dimm];
-
-	if (uncorrected_error && recoverable)
-		recoverable_msg = " recoverable";
-	else
-		recoverable_msg = "";
 
 	/*
-	 * FIXME: What should we do with "channel" information on mcelog?
-	 * Probably, we can just discard it, as the channel information
-	 * comes from the get_memory_error_data() address decoding
+	 * FIXME: On some memory configurations (mirror, lockstep), the
+	 * Memory Controller can't point the error to a single DIMM. The
+	 * EDAC core should be handling the channel mask, in order to point
+	 * to the group of dimm's where the error may be happening.
 	 */
-	msg = kasprintf(GFP_ATOMIC,
-			"%d %s error(s): %s on %s area %s%s: cpu=%d Err=%04x:%04x (ch=%d), "
-			"addr = 0x%08llx => socket=%d, Channel=%ld(mask=%ld), rank=%d\n",
-			core_err_cnt,
-			area_type,
-			optype,
-			type,
-			recoverable_msg,
-			overflow ? "OVERFLOW" : "",
-			m->cpu,
-			mscod, errcode,
-			channel,		/* 1111b means not specified */
-			(long long) m->addr,
-			socket,
-			first_channel,		/* This is the real channel on SB */
-			channel_mask,
-			rank);
+	snprintf(msg, sizeof(msg),
+		 "count:%d%s%s area:%s err_code:%04x:%04x socket:%d channel_mask:%ld rank:%d",
+		 core_err_cnt,
+		 overflow ? " OVERFLOW" : "",
+		 (uncorrected_error && recoverable) ? " recoverable" : "",
+		 area_type,
+		 mscod, errcode,
+		 socket,
+		 channel_mask,
+		 rank);
 
 	debugf0("%s", msg);
 
+	/* FIXME: need support for channel mask */
+
 	/* Call the helper to output message */
-	if (uncorrected_error)
-		edac_mc_handle_fbd_ue(mci, csrow, 0, 0, msg);
-	else
-		edac_mc_handle_fbd_ce(mci, csrow, 0, msg);
+	edac_mc_handle_error(tp_event, mci,
+			     m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
+			     channel, dimm, -1,
+			     optype, msg, m);
+	return;
+err_parsing:
+	edac_mc_handle_error(tp_event, mci, 0, 0, 0,
+			     -1, -1, -1,
+			     msg, "", m);
 
-	kfree(msg);
 }
 
 /*
@@ -1683,16 +1618,25 @@ static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev)
 static int sbridge_register_mci(struct sbridge_dev *sbridge_dev)
 {
 	struct mem_ctl_info *mci;
+	struct edac_mc_layer layers[2];
 	struct sbridge_pvt *pvt;
-	int rc, channels, csrows;
+	int rc;
 
 	/* Check the number of active and not disabled channels */
-	rc = sbridge_get_active_channels(sbridge_dev->bus, &channels, &csrows);
+	rc = check_if_ecc_is_active(sbridge_dev->bus);
 	if (unlikely(rc < 0))
 		return rc;
 
 	/* allocate a new MC control structure */
-	mci = edac_mc_alloc(sizeof(*pvt), csrows, channels, sbridge_dev->mc);
+	layers[0].type = EDAC_MC_LAYER_CHANNEL;
+	layers[0].size = NUM_CHANNELS;
+	layers[0].is_virt_csrow = false;
+	layers[1].type = EDAC_MC_LAYER_SLOT;
+	layers[1].size = MAX_DIMMS;
+	layers[1].is_virt_csrow = true;
+	mci = edac_mc_alloc(sbridge_dev->mc, ARRAY_SIZE(layers), layers,
+			    sizeof(*pvt));
+
 	if (unlikely(!mci))
 		return -ENOMEM;
 
diff --git a/drivers/edac/tile_edac.c b/drivers/edac/tile_edac.c
index e99d00976189..7bb4614730db 100644
--- a/drivers/edac/tile_edac.c
+++ b/drivers/edac/tile_edac.c
@@ -71,7 +71,10 @@ static void tile_edac_check(struct mem_ctl_info *mci)
 	if (mem_error.sbe_count != priv->ce_count) {
 		dev_dbg(mci->dev, "ECC CE err on node %d\n", priv->node);
 		priv->ce_count = mem_error.sbe_count;
-		edac_mc_handle_ce(mci, 0, 0, 0, 0, 0, mci->ctl_name);
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     0, 0, 0,
+				     0, 0, -1,
+				     mci->ctl_name, "", NULL);
 	}
 }
 
@@ -84,6 +87,7 @@ static int __devinit tile_edac_init_csrows(struct mem_ctl_info *mci)
 	struct csrow_info	*csrow = &mci->csrows[0];
 	struct tile_edac_priv	*priv = mci->pvt_info;
 	struct mshim_mem_info	mem_info;
+	struct dimm_info *dimm = csrow->channels[0].dimm;
 
 	if (hv_dev_pread(priv->hv_devhdl, 0, (HV_VirtAddr)&mem_info,
 		sizeof(struct mshim_mem_info), MSHIM_MEM_INFO_OFF) !=
@@ -93,27 +97,25 @@ static int __devinit tile_edac_init_csrows(struct mem_ctl_info *mci)
 	}
 
 	if (mem_info.mem_ecc)
-		csrow->edac_mode = EDAC_SECDED;
+		dimm->edac_mode = EDAC_SECDED;
 	else
-		csrow->edac_mode = EDAC_NONE;
+		dimm->edac_mode = EDAC_NONE;
 	switch (mem_info.mem_type) {
 	case DDR2:
-		csrow->mtype = MEM_DDR2;
+		dimm->mtype = MEM_DDR2;
 		break;
 
 	case DDR3:
-		csrow->mtype = MEM_DDR3;
+		dimm->mtype = MEM_DDR3;
 		break;
 
 	default:
 		return -1;
 	}
 
-	csrow->first_page = 0;
-	csrow->nr_pages = mem_info.mem_size >> PAGE_SHIFT;
-	csrow->last_page = csrow->first_page + csrow->nr_pages - 1;
-	csrow->grain = TILE_EDAC_ERROR_GRAIN;
-	csrow->dtype = DEV_UNKNOWN;
+	dimm->nr_pages = mem_info.mem_size >> PAGE_SHIFT;
+	dimm->grain = TILE_EDAC_ERROR_GRAIN;
+	dimm->dtype = DEV_UNKNOWN;
 
 	return 0;
 }
@@ -123,6 +125,7 @@ static int __devinit tile_edac_mc_probe(struct platform_device *pdev)
 	char			hv_file[32];
 	int			hv_devhdl;
 	struct mem_ctl_info	*mci;
+	struct edac_mc_layer	layers[2];
 	struct tile_edac_priv	*priv;
 	int			rc;
 
@@ -132,8 +135,14 @@ static int __devinit tile_edac_mc_probe(struct platform_device *pdev)
 		return -EINVAL;
 
 	/* A TILE MC has a single channel and one chip-select row. */
-	mci = edac_mc_alloc(sizeof(struct tile_edac_priv),
-		TILE_EDAC_NR_CSROWS, TILE_EDAC_NR_CHANS, pdev->id);
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = TILE_EDAC_NR_CSROWS;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = TILE_EDAC_NR_CHANS;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(pdev->id, ARRAY_SIZE(layers), layers,
+			    sizeof(struct tile_edac_priv));
 	if (mci == NULL)
 		return -ENOMEM;
 	priv = mci->pvt_info;
diff --git a/drivers/edac/x38_edac.c b/drivers/edac/x38_edac.c
index a438297389e5..1ac7962d63ea 100644
--- a/drivers/edac/x38_edac.c
+++ b/drivers/edac/x38_edac.c
@@ -215,19 +215,26 @@ static void x38_process_error_info(struct mem_ctl_info *mci,
 		return;
 
 	if ((info->errsts ^ info->errsts2) & X38_ERRSTS_BITS) {
-		edac_mc_handle_ce_no_info(mci, "UE overwrote CE");
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0,
+				     -1, -1, -1,
+				     "UE overwrote CE", "", NULL);
 		info->errsts = info->errsts2;
 	}
 
 	for (channel = 0; channel < x38_channel_num; channel++) {
 		log = info->eccerrlog[channel];
 		if (log & X38_ECCERRLOG_UE) {
-			edac_mc_handle_ue(mci, 0, 0,
-				eccerrlog_row(channel, log), "x38 UE");
+			edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+					     0, 0, 0,
+					     eccerrlog_row(channel, log),
+					     -1, -1,
+					     "x38 UE", "", NULL);
 		} else if (log & X38_ECCERRLOG_CE) {
-			edac_mc_handle_ce(mci, 0, 0,
-				eccerrlog_syndrome(log),
-				eccerrlog_row(channel, log), 0, "x38 CE");
+			edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+					     0, 0, eccerrlog_syndrome(log),
+					     eccerrlog_row(channel, log),
+					     -1, -1,
+					     "x38 CE", "", NULL);
 		}
 	}
 }
@@ -317,9 +324,9 @@ static unsigned long drb_to_nr_pages(
 static int x38_probe1(struct pci_dev *pdev, int dev_idx)
 {
 	int rc;
-	int i;
+	int i, j;
 	struct mem_ctl_info *mci = NULL;
-	unsigned long last_page;
+	struct edac_mc_layer layers[2];
 	u16 drbs[X38_CHANNELS][X38_RANKS_PER_CHANNEL];
 	bool stacked;
 	void __iomem *window;
@@ -335,7 +342,13 @@ static int x38_probe1(struct pci_dev *pdev, int dev_idx)
 	how_many_channel(pdev);
 
 	/* FIXME: unconventional pvt_info usage */
-	mci = edac_mc_alloc(0, X38_RANKS, x38_channel_num, 0);
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = X38_RANKS;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = x38_channel_num;
+	layers[1].is_virt_csrow = false;
+	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, 0);
 	if (!mci)
 		return -ENOMEM;
 
@@ -363,7 +376,6 @@ static int x38_probe1(struct pci_dev *pdev, int dev_idx)
 	 * cumulative; the last one will contain the total memory
 	 * contained in all ranks.
 	 */
-	last_page = -1UL;
 	for (i = 0; i < mci->nr_csrows; i++) {
 		unsigned long nr_pages;
 		struct csrow_info *csrow = &mci->csrows[i];
@@ -372,20 +384,18 @@ static int x38_probe1(struct pci_dev *pdev, int dev_idx)
 			i / X38_RANKS_PER_CHANNEL,
 			i % X38_RANKS_PER_CHANNEL);
 
-		if (nr_pages == 0) {
-			csrow->mtype = MEM_EMPTY;
+		if (nr_pages == 0)
 			continue;
-		}
 
-		csrow->first_page = last_page + 1;
-		last_page += nr_pages;
-		csrow->last_page = last_page;
-		csrow->nr_pages = nr_pages;
+		for (j = 0; j < x38_channel_num; j++) {
+			struct dimm_info *dimm = csrow->channels[j].dimm;
 
-		csrow->grain = nr_pages << PAGE_SHIFT;
-		csrow->mtype = MEM_DDR2;
-		csrow->dtype = DEV_UNKNOWN;
-		csrow->edac_mode = EDAC_UNKNOWN;
+			dimm->nr_pages = nr_pages / x38_channel_num;
+			dimm->grain = nr_pages << PAGE_SHIFT;
+			dimm->mtype = MEM_DDR2;
+			dimm->dtype = DEV_UNKNOWN;
+			dimm->edac_mode = EDAC_UNKNOWN;
+		}
 	}
 
 	x38_clear_error_info(mci);
diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index aa3642cb8209..c4067d0141f7 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -114,6 +114,14 @@ config GPIO_EP93XX
 	depends on ARCH_EP93XX
 	select GPIO_GENERIC
 
+config GPIO_MM_LANTIQ
+	bool "Lantiq Memory mapped GPIOs"
+	depends on LANTIQ && SOC_XWAY
+	help
+	  This enables support for memory mapped GPIOs on the External Bus Unit
+	  (EBU) found on Lantiq SoCs. The gpios are output only as they are
+	  created by attaching a 16bit latch to the bus.
+
 config GPIO_MPC5200
 	def_bool y
 	depends on PPC_MPC52xx
@@ -167,6 +175,14 @@ config GPIO_PXA
 	help
 	  Say yes here to support the PXA GPIO device
 
+config GPIO_STA2X11
+	bool "STA2x11/ConneXt GPIO support"
+	depends on MFD_STA2X11
+	select GENERIC_IRQ_CHIP
+	help
+	  Say yes here to support the STA2x11/ConneXt GPIO device.
+	  The GPIO module has 128 GPIO pins with alternate functions.
+
 config GPIO_XILINX
 	bool "Xilinx GPIO support"
 	depends on PPC_OF || MICROBLAZE
@@ -180,13 +196,13 @@ config GPIO_VR41XX
 	  Say yes here to support the NEC VR4100 series General-purpose I/O Uint
 
 config GPIO_SCH
-	tristate "Intel SCH/TunnelCreek GPIO"
+	tristate "Intel SCH/TunnelCreek/Centerton GPIO"
 	depends on PCI && X86
 	select MFD_CORE
 	select LPC_SCH
 	help
-	  Say yes here to support GPIO interface on Intel Poulsbo SCH
-	  or Intel Tunnel Creek processor.
+	  Say yes here to support GPIO interface on Intel Poulsbo SCH,
+	  Intel Tunnel Creek processor or Intel Centerton processor.
 	  The Intel SCH contains a total of 14 GPIO pins. Ten GPIOs are
 	  powered by the core power rail and are turned off during sleep
 	  modes (S3 and higher). The remaining four GPIOs are powered by
@@ -195,6 +211,22 @@ config GPIO_SCH
 	  system from the Suspend-to-RAM state.
 	  The Intel Tunnel Creek processor has 5 GPIOs powered by the
 	  core power rail and 9 from suspend power supply.
+	  The Intel Centerton processor has a total of 30 GPIO pins.
+	  Twenty-one are powered by the core power rail and 9 from the
+	  suspend power supply.
+
+config GPIO_ICH
+	tristate "Intel ICH GPIO"
+	depends on PCI && X86
+	select MFD_CORE
+	select LPC_ICH
+	help
+	  Say yes here to support the GPIO functionality of a number of Intel
+	  ICH-based chipsets.  Currently supported devices: ICH6, ICH7, ICH8
+	  ICH9, ICH10, Series 5/3400 (eg Ibex Peak), Series 6/C200 (eg
+	  Cougar Point), NM10 (Tiger Point), and 3100 (Whitmore Lake).
+
+	  If unsure, say N.
 
 config GPIO_VX855
 	tristate "VIA VX855/VX875 GPIO"
@@ -334,6 +366,16 @@ config GPIO_STMPE
 	  This enables support for the GPIOs found on the STMPE I/O
 	  Expanders.
 
+config GPIO_STP_XWAY
+	bool "XWAY STP GPIOs"
+	depends on SOC_XWAY
+	help
+	  This enables support for the Serial To Parallel (STP) unit found on
+	  XWAY SoC. The STP allows the SoC to drive a shift registers cascade,
+	  that can be up to 24 bit. This peripheral is aimed at driving leds.
+	  Some of the gpios/leds can be auto updated by the soc with dsl and
+	  phy status.
+
 config GPIO_TC3589X
 	bool "TC3589X GPIOs"
 	depends on MFD_TC3589X
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index 07a79e245407..0f55662002c3 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_ARCH_DAVINCI)	+= gpio-davinci.o
 obj-$(CONFIG_GPIO_EM)		+= gpio-em.o
 obj-$(CONFIG_GPIO_EP93XX)	+= gpio-ep93xx.o
 obj-$(CONFIG_GPIO_GE_FPGA)	+= gpio-ge.o
+obj-$(CONFIG_GPIO_ICH)		+= gpio-ich.o
 obj-$(CONFIG_GPIO_IT8761E)	+= gpio-it8761e.o
 obj-$(CONFIG_GPIO_JANZ_TTL)	+= gpio-janz-ttl.o
 obj-$(CONFIG_ARCH_KS8695)	+= gpio-ks8695.o
@@ -32,6 +33,7 @@ obj-$(CONFIG_GPIO_MC33880)	+= gpio-mc33880.o
 obj-$(CONFIG_GPIO_MC9S08DZ60)	+= gpio-mc9s08dz60.o
 obj-$(CONFIG_GPIO_MCP23S08)	+= gpio-mcp23s08.o
 obj-$(CONFIG_GPIO_ML_IOH)	+= gpio-ml-ioh.o
+obj-$(CONFIG_GPIO_MM_LANTIQ)	+= gpio-mm-lantiq.o
 obj-$(CONFIG_GPIO_MPC5200)	+= gpio-mpc5200.o
 obj-$(CONFIG_GPIO_MPC8XXX)	+= gpio-mpc8xxx.o
 obj-$(CONFIG_GPIO_MSIC)		+= gpio-msic.o
@@ -51,7 +53,9 @@ obj-$(CONFIG_PLAT_SAMSUNG)	+= gpio-samsung.o
 obj-$(CONFIG_ARCH_SA1100)	+= gpio-sa1100.o
 obj-$(CONFIG_GPIO_SCH)		+= gpio-sch.o
 obj-$(CONFIG_GPIO_SODAVILLE)	+= gpio-sodaville.o
+obj-$(CONFIG_GPIO_STA2X11)	+= gpio-sta2x11.o
 obj-$(CONFIG_GPIO_STMPE)	+= gpio-stmpe.o
+obj-$(CONFIG_GPIO_STP_XWAY)	+= gpio-stp-xway.o
 obj-$(CONFIG_GPIO_SX150X)	+= gpio-sx150x.o
 obj-$(CONFIG_GPIO_TC3589X)	+= gpio-tc3589x.o
 obj-$(CONFIG_ARCH_TEGRA)	+= gpio-tegra.o
diff --git a/drivers/gpio/gpio-ich.c b/drivers/gpio/gpio-ich.c
new file mode 100644
index 000000000000..b7c06517403d
--- /dev/null
+++ b/drivers/gpio/gpio-ich.c
@@ -0,0 +1,419 @@
+/*
+ * Intel ICH6-10, Series 5 and 6 GPIO driver
+ *
+ * Copyright (C) 2010 Extreme Engineering Solutions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/gpio.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/lpc_ich.h>
+
+#define DRV_NAME "gpio_ich"
+
+/*
+ * GPIO register offsets in GPIO I/O space.
+ * Each chunk of 32 GPIOs is manipulated via its own USE_SELx, IO_SELx, and
+ * LVLx registers.  Logic in the read/write functions takes a register and
+ * an absolute bit number and determines the proper register offset and bit
+ * number in that register.  For example, to read the value of GPIO bit 50
+ * the code would access offset ichx_regs[2(=GPIO_LVL)][1(=50/32)],
+ * bit 18 (50%32).
+ */
+enum GPIO_REG {
+	GPIO_USE_SEL = 0,
+	GPIO_IO_SEL,
+	GPIO_LVL,
+};
+
+static const u8 ichx_regs[3][3] = {
+	{0x00, 0x30, 0x40},	/* USE_SEL[1-3] offsets */
+	{0x04, 0x34, 0x44},	/* IO_SEL[1-3] offsets */
+	{0x0c, 0x38, 0x48},	/* LVL[1-3] offsets */
+};
+
+#define ICHX_WRITE(val, reg, base_res)	outl(val, (reg) + (base_res)->start)
+#define ICHX_READ(reg, base_res)	inl((reg) + (base_res)->start)
+
+struct ichx_desc {
+	/* Max GPIO pins the chipset can have */
+	uint ngpio;
+
+	/* Whether the chipset has GPIO in GPE0_STS in the PM IO region */
+	bool uses_gpe0;
+
+	/* USE_SEL is bogus on some chipsets, eg 3100 */
+	u32 use_sel_ignore[3];
+
+	/* Some chipsets have quirks, let these use their own request/get */
+	int (*request)(struct gpio_chip *chip, unsigned offset);
+	int (*get)(struct gpio_chip *chip, unsigned offset);
+};
+
+static struct {
+	spinlock_t lock;
+	struct platform_device *dev;
+	struct gpio_chip chip;
+	struct resource *gpio_base;	/* GPIO IO base */
+	struct resource *pm_base;	/* Power Mangagment IO base */
+	struct ichx_desc *desc;	/* Pointer to chipset-specific description */
+	u32 orig_gpio_ctrl;	/* Orig CTRL value, used to restore on exit */
+} ichx_priv;
+
+static int modparam_gpiobase = -1;	/* dynamic */
+module_param_named(gpiobase, modparam_gpiobase, int, 0444);
+MODULE_PARM_DESC(gpiobase, "The GPIO number base. -1 means dynamic, "
+			   "which is the default.");
+
+static int ichx_write_bit(int reg, unsigned nr, int val, int verify)
+{
+	unsigned long flags;
+	u32 data, tmp;
+	int reg_nr = nr / 32;
+	int bit = nr & 0x1f;
+	int ret = 0;
+
+	spin_lock_irqsave(&ichx_priv.lock, flags);
+
+	data = ICHX_READ(ichx_regs[reg][reg_nr], ichx_priv.gpio_base);
+	if (val)
+		data |= 1 << bit;
+	else
+		data &= ~(1 << bit);
+	ICHX_WRITE(data, ichx_regs[reg][reg_nr], ichx_priv.gpio_base);
+	tmp = ICHX_READ(ichx_regs[reg][reg_nr], ichx_priv.gpio_base);
+	if (verify && data != tmp)
+		ret = -EPERM;
+
+	spin_unlock_irqrestore(&ichx_priv.lock, flags);
+
+	return ret;
+}
+
+static int ichx_read_bit(int reg, unsigned nr)
+{
+	unsigned long flags;
+	u32 data;
+	int reg_nr = nr / 32;
+	int bit = nr & 0x1f;
+
+	spin_lock_irqsave(&ichx_priv.lock, flags);
+
+	data = ICHX_READ(ichx_regs[reg][reg_nr], ichx_priv.gpio_base);
+
+	spin_unlock_irqrestore(&ichx_priv.lock, flags);
+
+	return data & (1 << bit) ? 1 : 0;
+}
+
+static int ichx_gpio_direction_input(struct gpio_chip *gpio, unsigned nr)
+{
+	/*
+	 * Try setting pin as an input and verify it worked since many pins
+	 * are output-only.
+	 */
+	if (ichx_write_bit(GPIO_IO_SEL, nr, 1, 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ichx_gpio_direction_output(struct gpio_chip *gpio, unsigned nr,
+					int val)
+{
+	/* Set GPIO output value. */
+	ichx_write_bit(GPIO_LVL, nr, val, 0);
+
+	/*
+	 * Try setting pin as an output and verify it worked since many pins
+	 * are input-only.
+	 */
+	if (ichx_write_bit(GPIO_IO_SEL, nr, 0, 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ichx_gpio_get(struct gpio_chip *chip, unsigned nr)
+{
+	return ichx_read_bit(GPIO_LVL, nr);
+}
+
+static int ich6_gpio_get(struct gpio_chip *chip, unsigned nr)
+{
+	unsigned long flags;
+	u32 data;
+
+	/*
+	 * GPI 0 - 15 need to be read from the power management registers on
+	 * a ICH6/3100 bridge.
+	 */
+	if (nr < 16) {
+		if (!ichx_priv.pm_base)
+			return -ENXIO;
+
+		spin_lock_irqsave(&ichx_priv.lock, flags);
+
+		/* GPI 0 - 15 are latched, write 1 to clear*/
+		ICHX_WRITE(1 << (16 + nr), 0, ichx_priv.pm_base);
+		data = ICHX_READ(0, ichx_priv.pm_base);
+
+		spin_unlock_irqrestore(&ichx_priv.lock, flags);
+
+		return (data >> 16) & (1 << nr) ? 1 : 0;
+	} else {
+		return ichx_gpio_get(chip, nr);
+	}
+}
+
+static int ichx_gpio_request(struct gpio_chip *chip, unsigned nr)
+{
+	/*
+	 * Note we assume the BIOS properly set a bridge's USE value.  Some
+	 * chips (eg Intel 3100) have bogus USE values though, so first see if
+	 * the chipset's USE value can be trusted for this specific bit.
+	 * If it can't be trusted, assume that the pin can be used as a GPIO.
+	 */
+	if (ichx_priv.desc->use_sel_ignore[nr / 32] & (1 << (nr & 0x1f)))
+		return 1;
+
+	return ichx_read_bit(GPIO_USE_SEL, nr) ? 0 : -ENODEV;
+}
+
+static int ich6_gpio_request(struct gpio_chip *chip, unsigned nr)
+{
+	/*
+	 * Fixups for bits 16 and 17 are necessary on the Intel ICH6/3100
+	 * bridge as they are controlled by USE register bits 0 and 1.  See
+	 * "Table 704 GPIO_USE_SEL1 register" in the i3100 datasheet for
+	 * additional info.
+	 */
+	if (nr == 16 || nr == 17)
+		nr -= 16;
+
+	return ichx_gpio_request(chip, nr);
+}
+
+static void ichx_gpio_set(struct gpio_chip *chip, unsigned nr, int val)
+{
+	ichx_write_bit(GPIO_LVL, nr, val, 0);
+}
+
+static void __devinit ichx_gpiolib_setup(struct gpio_chip *chip)
+{
+	chip->owner = THIS_MODULE;
+	chip->label = DRV_NAME;
+	chip->dev = &ichx_priv.dev->dev;
+
+	/* Allow chip-specific overrides of request()/get() */
+	chip->request = ichx_priv.desc->request ?
+		ichx_priv.desc->request : ichx_gpio_request;
+	chip->get = ichx_priv.desc->get ?
+		ichx_priv.desc->get : ichx_gpio_get;
+
+	chip->set = ichx_gpio_set;
+	chip->direction_input = ichx_gpio_direction_input;
+	chip->direction_output = ichx_gpio_direction_output;
+	chip->base = modparam_gpiobase;
+	chip->ngpio = ichx_priv.desc->ngpio;
+	chip->can_sleep = 0;
+	chip->dbg_show = NULL;
+}
+
+/* ICH6-based, 631xesb-based */
+static struct ichx_desc ich6_desc = {
+	/* Bridges using the ICH6 controller need fixups for GPIO 0 - 17 */
+	.request = ich6_gpio_request,
+	.get = ich6_gpio_get,
+
+	/* GPIO 0-15 are read in the GPE0_STS PM register */
+	.uses_gpe0 = true,
+
+	.ngpio = 50,
+};
+
+/* Intel 3100 */
+static struct ichx_desc i3100_desc = {
+	/*
+	 * Bits 16,17, 20 of USE_SEL and bit 16 of USE_SEL2 always read 0 on
+	 * the Intel 3100.  See "Table 712. GPIO Summary Table" of 3100
+	 * Datasheet for more info.
+	 */
+	.use_sel_ignore = {0x00130000, 0x00010000, 0x0},
+
+	/* The 3100 needs fixups for GPIO 0 - 17 */
+	.request = ich6_gpio_request,
+	.get = ich6_gpio_get,
+
+	/* GPIO 0-15 are read in the GPE0_STS PM register */
+	.uses_gpe0 = true,
+
+	.ngpio = 50,
+};
+
+/* ICH7 and ICH8-based */
+static struct ichx_desc ich7_desc = {
+	.ngpio = 50,
+};
+
+/* ICH9-based */
+static struct ichx_desc ich9_desc = {
+	.ngpio = 61,
+};
+
+/* ICH10-based - Consumer/corporate versions have different amount of GPIO */
+static struct ichx_desc ich10_cons_desc = {
+	.ngpio = 61,
+};
+static struct ichx_desc ich10_corp_desc = {
+	.ngpio = 72,
+};
+
+/* Intel 5 series, 6 series, 3400 series, and C200 series */
+static struct ichx_desc intel5_desc = {
+	.ngpio = 76,
+};
+
+static int __devinit ichx_gpio_probe(struct platform_device *pdev)
+{
+	struct resource *res_base, *res_pm;
+	int err;
+	struct lpc_ich_info *ich_info = pdev->dev.platform_data;
+
+	if (!ich_info)
+		return -ENODEV;
+
+	ichx_priv.dev = pdev;
+
+	switch (ich_info->gpio_version) {
+	case ICH_I3100_GPIO:
+		ichx_priv.desc = &i3100_desc;
+		break;
+	case ICH_V5_GPIO:
+		ichx_priv.desc = &intel5_desc;
+		break;
+	case ICH_V6_GPIO:
+		ichx_priv.desc = &ich6_desc;
+		break;
+	case ICH_V7_GPIO:
+		ichx_priv.desc = &ich7_desc;
+		break;
+	case ICH_V9_GPIO:
+		ichx_priv.desc = &ich9_desc;
+		break;
+	case ICH_V10CORP_GPIO:
+		ichx_priv.desc = &ich10_corp_desc;
+		break;
+	case ICH_V10CONS_GPIO:
+		ichx_priv.desc = &ich10_cons_desc;
+		break;
+	default:
+		return -ENODEV;
+	}
+
+	res_base = platform_get_resource(pdev, IORESOURCE_IO, ICH_RES_GPIO);
+	if (!res_base || !res_base->start || !res_base->end)
+		return -ENODEV;
+
+	if (!request_region(res_base->start, resource_size(res_base),
+				pdev->name))
+		return -EBUSY;
+
+	ichx_priv.gpio_base = res_base;
+
+	/*
+	 * If necessary, determine the I/O address of ACPI/power management
+	 * registers which are needed to read the the GPE0 register for GPI pins
+	 * 0 - 15 on some chipsets.
+	 */
+	if (!ichx_priv.desc->uses_gpe0)
+		goto init;
+
+	res_pm = platform_get_resource(pdev, IORESOURCE_IO, ICH_RES_GPE0);
+	if (!res_pm) {
+		pr_warn("ACPI BAR is unavailable, GPI 0 - 15 unavailable\n");
+		goto init;
+	}
+
+	if (!request_region(res_pm->start, resource_size(res_pm),
+			pdev->name)) {
+		pr_warn("ACPI BAR is busy, GPI 0 - 15 unavailable\n");
+		goto init;
+	}
+
+	ichx_priv.pm_base = res_pm;
+
+init:
+	ichx_gpiolib_setup(&ichx_priv.chip);
+	err = gpiochip_add(&ichx_priv.chip);
+	if (err) {
+		pr_err("Failed to register GPIOs\n");
+		goto add_err;
+	}
+
+	pr_info("GPIO from %d to %d on %s\n", ichx_priv.chip.base,
+	       ichx_priv.chip.base + ichx_priv.chip.ngpio - 1, DRV_NAME);
+
+	return 0;
+
+add_err:
+	release_region(ichx_priv.gpio_base->start,
+			resource_size(ichx_priv.gpio_base));
+	if (ichx_priv.pm_base)
+		release_region(ichx_priv.pm_base->start,
+				resource_size(ichx_priv.pm_base));
+	return err;
+}
+
+static int __devexit ichx_gpio_remove(struct platform_device *pdev)
+{
+	int err;
+
+	err = gpiochip_remove(&ichx_priv.chip);
+	if (err) {
+		dev_err(&pdev->dev, "%s failed, %d\n",
+				"gpiochip_remove()", err);
+		return err;
+	}
+
+	release_region(ichx_priv.gpio_base->start,
+				resource_size(ichx_priv.gpio_base));
+	if (ichx_priv.pm_base)
+		release_region(ichx_priv.pm_base->start,
+				resource_size(ichx_priv.pm_base));
+
+	return 0;
+}
+
+static struct platform_driver ichx_gpio_driver = {
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= DRV_NAME,
+	},
+	.probe		= ichx_gpio_probe,
+	.remove		= __devexit_p(ichx_gpio_remove),
+};
+
+module_platform_driver(ichx_gpio_driver);
+
+MODULE_AUTHOR("Peter Tyser <[email protected]>");
+MODULE_DESCRIPTION("GPIO interface for Intel ICH series");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:"DRV_NAME);
diff --git a/drivers/gpio/gpio-mm-lantiq.c b/drivers/gpio/gpio-mm-lantiq.c
new file mode 100644
index 000000000000..2983dfbd0668
--- /dev/null
+++ b/drivers/gpio/gpio-mm-lantiq.c
@@ -0,0 +1,158 @@
+/*
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ *
+ *  Copyright (C) 2012 John Crispin <[email protected]>
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/platform_device.h>
+#include <linux/mutex.h>
+#include <linux/gpio.h>
+#include <linux/of.h>
+#include <linux/of_gpio.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+
+#include <lantiq_soc.h>
+
+/*
+ * By attaching hardware latches to the EBU it is possible to create output
+ * only gpios. This driver configures a special memory address, which when
+ * written to outputs 16 bit to the latches.
+ */
+
+#define LTQ_EBU_BUSCON	0x1e7ff		/* 16 bit access, slowest timing */
+#define LTQ_EBU_WP	0x80000000	/* write protect bit */
+
+struct ltq_mm {
+	struct of_mm_gpio_chip mmchip;
+	u16 shadow;	/* shadow the latches state */
+};
+
+/**
+ * ltq_mm_apply() - write the shadow value to the ebu address.
+ * @chip:     Pointer to our private data structure.
+ *
+ * Write the shadow value to the EBU to set the gpios. We need to set the
+ * global EBU lock to make sure that PCI/MTD dont break.
+ */
+static void ltq_mm_apply(struct ltq_mm *chip)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ebu_lock, flags);
+	ltq_ebu_w32(LTQ_EBU_BUSCON, LTQ_EBU_BUSCON1);
+	__raw_writew(chip->shadow, chip->mmchip.regs);
+	ltq_ebu_w32(LTQ_EBU_BUSCON | LTQ_EBU_WP, LTQ_EBU_BUSCON1);
+	spin_unlock_irqrestore(&ebu_lock, flags);
+}
+
+/**
+ * ltq_mm_set() - gpio_chip->set - set gpios.
+ * @gc:     Pointer to gpio_chip device structure.
+ * @gpio:   GPIO signal number.
+ * @val:    Value to be written to specified signal.
+ *
+ * Set the shadow value and call ltq_mm_apply.
+ */
+static void ltq_mm_set(struct gpio_chip *gc, unsigned offset, int value)
+{
+	struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
+	struct ltq_mm *chip =
+		container_of(mm_gc, struct ltq_mm, mmchip);
+
+	if (value)
+		chip->shadow |= (1 << offset);
+	else
+		chip->shadow &= ~(1 << offset);
+	ltq_mm_apply(chip);
+}
+
+/**
+ * ltq_mm_dir_out() - gpio_chip->dir_out - set gpio direction.
+ * @gc:     Pointer to gpio_chip device structure.
+ * @gpio:   GPIO signal number.
+ * @val:    Value to be written to specified signal.
+ *
+ * Same as ltq_mm_set, always returns 0.
+ */
+static int ltq_mm_dir_out(struct gpio_chip *gc, unsigned offset, int value)
+{
+	ltq_mm_set(gc, offset, value);
+
+	return 0;
+}
+
+/**
+ * ltq_mm_save_regs() - Set initial values of GPIO pins
+ * @mm_gc: pointer to memory mapped GPIO chip structure
+ */
+static void ltq_mm_save_regs(struct of_mm_gpio_chip *mm_gc)
+{
+	struct ltq_mm *chip =
+		container_of(mm_gc, struct ltq_mm, mmchip);
+
+	/* tell the ebu controller which memory address we will be using */
+	ltq_ebu_w32(CPHYSADDR(chip->mmchip.regs) | 0x1, LTQ_EBU_ADDRSEL1);
+
+	ltq_mm_apply(chip);
+}
+
+static int ltq_mm_probe(struct platform_device *pdev)
+{
+	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	struct ltq_mm *chip;
+	const __be32 *shadow;
+	int ret = 0;
+
+	if (!res) {
+		dev_err(&pdev->dev, "failed to get memory resource\n");
+		return -ENOENT;
+	}
+
+	chip = kzalloc(sizeof(*chip), GFP_KERNEL);
+	if (!chip)
+		return -ENOMEM;
+
+	chip->mmchip.gc.ngpio = 16;
+	chip->mmchip.gc.label = "gpio-mm-ltq";
+	chip->mmchip.gc.direction_output = ltq_mm_dir_out;
+	chip->mmchip.gc.set = ltq_mm_set;
+	chip->mmchip.save_regs = ltq_mm_save_regs;
+
+	/* store the shadow value if one was passed by the devicetree */
+	shadow = of_get_property(pdev->dev.of_node, "lantiq,shadow", NULL);
+	if (shadow)
+		chip->shadow = be32_to_cpu(*shadow);
+
+	ret = of_mm_gpiochip_add(pdev->dev.of_node, &chip->mmchip);
+	if (ret)
+		kfree(chip);
+	return ret;
+}
+
+static const struct of_device_id ltq_mm_match[] = {
+	{ .compatible = "lantiq,gpio-mm" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, ltq_mm_match);
+
+static struct platform_driver ltq_mm_driver = {
+	.probe = ltq_mm_probe,
+	.driver = {
+		.name = "gpio-mm-ltq",
+		.owner = THIS_MODULE,
+		.of_match_table = ltq_mm_match,
+	},
+};
+
+static int __init ltq_mm_init(void)
+{
+	return platform_driver_register(&ltq_mm_driver);
+}
+
+subsys_initcall(ltq_mm_init);
diff --git a/drivers/gpio/gpio-sch.c b/drivers/gpio/gpio-sch.c
index 8cadf4d683a8..424dce8e3f30 100644
--- a/drivers/gpio/gpio-sch.c
+++ b/drivers/gpio/gpio-sch.c
@@ -232,6 +232,14 @@ static int __devinit sch_gpio_probe(struct platform_device *pdev)
 			sch_gpio_resume.ngpio = 9;
 			break;
 
+		case PCI_DEVICE_ID_INTEL_CENTERTON_ILB:
+			sch_gpio_core.base = 0;
+			sch_gpio_core.ngpio = 21;
+
+			sch_gpio_resume.base = 21;
+			sch_gpio_resume.ngpio = 9;
+			break;
+
 		default:
 			return -ENODEV;
 	}
diff --git a/drivers/gpio/gpio-sta2x11.c b/drivers/gpio/gpio-sta2x11.c
new file mode 100644
index 000000000000..38416be8ba11
--- /dev/null
+++ b/drivers/gpio/gpio-sta2x11.c
@@ -0,0 +1,435 @@
+/*
+ * STMicroelectronics ConneXt (STA2X11) GPIO driver
+ *
+ * Copyright 2012 ST Microelectronics (Alessandro Rubini)
+ * Based on gpio-ml-ioh.c, Copyright 2010 OKI Semiconductors Ltd.
+ * Also based on previous sta2x11 work, Copyright 2011 Wind River Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/gpio.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/sta2x11-mfd.h>
+
+struct gsta_regs {
+	u32 dat;		/* 0x00 */
+	u32 dats;
+	u32 datc;
+	u32 pdis;
+	u32 dir;		/* 0x10 */
+	u32 dirs;
+	u32 dirc;
+	u32 unused_1c;
+	u32 afsela;		/* 0x20 */
+	u32 unused_24[7];
+	u32 rimsc;		/* 0x40 */
+	u32 fimsc;
+	u32 is;
+	u32 ic;
+};
+
+struct gsta_gpio {
+	spinlock_t			lock;
+	struct device			*dev;
+	void __iomem			*reg_base;
+	struct gsta_regs __iomem	*regs[GSTA_NR_BLOCKS];
+	struct gpio_chip		gpio;
+	int				irq_base;
+	/* FIXME: save the whole config here (AF, ...) */
+	unsigned			irq_type[GSTA_NR_GPIO];
+};
+
+static inline struct gsta_regs __iomem *__regs(struct gsta_gpio *chip, int nr)
+{
+	return chip->regs[nr / GSTA_GPIO_PER_BLOCK];
+}
+
+static inline u32 __bit(int nr)
+{
+	return 1U << (nr % GSTA_GPIO_PER_BLOCK);
+}
+
+/*
+ * gpio methods
+ */
+
+static void gsta_gpio_set(struct gpio_chip *gpio, unsigned nr, int val)
+{
+	struct gsta_gpio *chip = container_of(gpio, struct gsta_gpio, gpio);
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	u32 bit = __bit(nr);
+
+	if (val)
+		writel(bit, &regs->dats);
+	else
+		writel(bit, &regs->datc);
+}
+
+static int gsta_gpio_get(struct gpio_chip *gpio, unsigned nr)
+{
+	struct gsta_gpio *chip = container_of(gpio, struct gsta_gpio, gpio);
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	u32 bit = __bit(nr);
+
+	return readl(&regs->dat) & bit;
+}
+
+static int gsta_gpio_direction_output(struct gpio_chip *gpio, unsigned nr,
+				      int val)
+{
+	struct gsta_gpio *chip = container_of(gpio, struct gsta_gpio, gpio);
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	u32 bit = __bit(nr);
+
+	writel(bit, &regs->dirs);
+	/* Data register after direction, otherwise pullup/down is selected */
+	if (val)
+		writel(bit, &regs->dats);
+	else
+		writel(bit, &regs->datc);
+	return 0;
+}
+
+static int gsta_gpio_direction_input(struct gpio_chip *gpio, unsigned nr)
+{
+	struct gsta_gpio *chip = container_of(gpio, struct gsta_gpio, gpio);
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	u32 bit = __bit(nr);
+
+	writel(bit, &regs->dirc);
+	return 0;
+}
+
+static int gsta_gpio_to_irq(struct gpio_chip *gpio, unsigned offset)
+{
+	struct gsta_gpio *chip = container_of(gpio, struct gsta_gpio, gpio);
+	return chip->irq_base + offset;
+}
+
+static void gsta_gpio_setup(struct gsta_gpio *chip) /* called from probe */
+{
+	struct gpio_chip *gpio = &chip->gpio;
+
+	/*
+	 * ARCH_NR_GPIOS is currently 256 and dynamic allocation starts
+	 * from the end. However, for compatibility, we need the first
+	 * ConneXt device to start from gpio 0: it's the main chipset
+	 * on most boards so documents and drivers assume gpio0..gpio127
+	 */
+	static int gpio_base;
+
+	gpio->label = dev_name(chip->dev);
+	gpio->owner = THIS_MODULE;
+	gpio->direction_input = gsta_gpio_direction_input;
+	gpio->get = gsta_gpio_get;
+	gpio->direction_output = gsta_gpio_direction_output;
+	gpio->set = gsta_gpio_set;
+	gpio->dbg_show = NULL;
+	gpio->base = gpio_base;
+	gpio->ngpio = GSTA_NR_GPIO;
+	gpio->can_sleep = 0;
+	gpio->to_irq = gsta_gpio_to_irq;
+
+	/*
+	 * After the first device, turn to dynamic gpio numbers.
+	 * For example, with ARCH_NR_GPIOS = 256 we can fit two cards
+	 */
+	if (!gpio_base)
+		gpio_base = -1;
+}
+
+/*
+ * Special method: alternate functions and pullup/pulldown. This is only
+ * invoked on startup to configure gpio's according to platform data.
+ * FIXME : this functionality shall be managed (and exported to other drivers)
+ * via the pin control subsystem.
+ */
+static void gsta_set_config(struct gsta_gpio *chip, int nr, unsigned cfg)
+{
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	unsigned long flags;
+	u32 bit = __bit(nr);
+	u32 val;
+	int err = 0;
+
+	pr_info("%s: %p %i %i\n", __func__, chip, nr, cfg);
+
+	if (cfg == PINMUX_TYPE_NONE)
+		return;
+
+	/* Alternate function or not? */
+	spin_lock_irqsave(&chip->lock, flags);
+	val = readl(&regs->afsela);
+	if (cfg == PINMUX_TYPE_FUNCTION)
+		val |= bit;
+	else
+		val &= ~bit;
+	writel(val | bit, &regs->afsela);
+	if (cfg == PINMUX_TYPE_FUNCTION) {
+		spin_unlock_irqrestore(&chip->lock, flags);
+		return;
+	}
+
+	/* not alternate function: set details */
+	switch (cfg) {
+	case PINMUX_TYPE_OUTPUT_LOW:
+		writel(bit, &regs->dirs);
+		writel(bit, &regs->datc);
+		break;
+	case PINMUX_TYPE_OUTPUT_HIGH:
+		writel(bit, &regs->dirs);
+		writel(bit, &regs->dats);
+		break;
+	case PINMUX_TYPE_INPUT:
+		writel(bit, &regs->dirc);
+		val = readl(&regs->pdis) | bit;
+		writel(val, &regs->pdis);
+		break;
+	case PINMUX_TYPE_INPUT_PULLUP:
+		writel(bit, &regs->dirc);
+		val = readl(&regs->pdis) & ~bit;
+		writel(val, &regs->pdis);
+		writel(bit, &regs->dats);
+		break;
+	case PINMUX_TYPE_INPUT_PULLDOWN:
+		writel(bit, &regs->dirc);
+		val = readl(&regs->pdis) & ~bit;
+		writel(val, &regs->pdis);
+		writel(bit, &regs->datc);
+		break;
+	default:
+		err = 1;
+	}
+	spin_unlock_irqrestore(&chip->lock, flags);
+	if (err)
+		pr_err("%s: chip %p, pin %i, cfg %i is invalid\n",
+		       __func__, chip, nr, cfg);
+}
+
+/*
+ * Irq methods
+ */
+
+static void gsta_irq_disable(struct irq_data *data)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
+	struct gsta_gpio *chip = gc->private;
+	int nr = data->irq - chip->irq_base;
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	u32 bit = __bit(nr);
+	u32 val;
+	unsigned long flags;
+
+	spin_lock_irqsave(&chip->lock, flags);
+	if (chip->irq_type[nr] & IRQ_TYPE_EDGE_RISING) {
+		val = readl(&regs->rimsc) & ~bit;
+		writel(val, &regs->rimsc);
+	}
+	if (chip->irq_type[nr] & IRQ_TYPE_EDGE_FALLING) {
+		val = readl(&regs->fimsc) & ~bit;
+		writel(val, &regs->fimsc);
+	}
+	spin_unlock_irqrestore(&chip->lock, flags);
+	return;
+}
+
+static void gsta_irq_enable(struct irq_data *data)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
+	struct gsta_gpio *chip = gc->private;
+	int nr = data->irq - chip->irq_base;
+	struct gsta_regs __iomem *regs = __regs(chip, nr);
+	u32 bit = __bit(nr);
+	u32 val;
+	int type;
+	unsigned long flags;
+
+	type = chip->irq_type[nr];
+
+	spin_lock_irqsave(&chip->lock, flags);
+	val = readl(&regs->rimsc);
+	if (type & IRQ_TYPE_EDGE_RISING)
+		writel(val | bit, &regs->rimsc);
+	else
+		writel(val & ~bit, &regs->rimsc);
+	val = readl(&regs->rimsc);
+	if (type & IRQ_TYPE_EDGE_FALLING)
+		writel(val | bit, &regs->fimsc);
+	else
+		writel(val & ~bit, &regs->fimsc);
+	spin_unlock_irqrestore(&chip->lock, flags);
+	return;
+}
+
+static int gsta_irq_type(struct irq_data *d, unsigned int type)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+	struct gsta_gpio *chip = gc->private;
+	int nr = d->irq - chip->irq_base;
+
+	/* We only support edge interrupts */
+	if (!(type & (IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING))) {
+		pr_debug("%s: unsupported type 0x%x\n", __func__, type);
+		return -EINVAL;
+	}
+
+	chip->irq_type[nr] = type; /* used for enable/disable */
+
+	gsta_irq_enable(d);
+	return 0;
+}
+
+static irqreturn_t gsta_gpio_handler(int irq, void *dev_id)
+{
+	struct gsta_gpio *chip = dev_id;
+	struct gsta_regs __iomem *regs;
+	u32 is;
+	int i, nr, base;
+	irqreturn_t ret = IRQ_NONE;
+
+	for (i = 0; i < GSTA_NR_BLOCKS; i++) {
+		regs = chip->regs[i];
+		base = chip->irq_base + i * GSTA_GPIO_PER_BLOCK;
+		while ((is = readl(&regs->is))) {
+			nr = __ffs(is);
+			irq = base + nr;
+			generic_handle_irq(irq);
+			writel(1 << nr, &regs->ic);
+			ret = IRQ_HANDLED;
+		}
+	}
+	return ret;
+}
+
+static __devinit void gsta_alloc_irq_chip(struct gsta_gpio *chip)
+{
+	struct irq_chip_generic *gc;
+	struct irq_chip_type *ct;
+
+	gc = irq_alloc_generic_chip(KBUILD_MODNAME, 1, chip->irq_base,
+				     chip->reg_base, handle_simple_irq);
+	gc->private = chip;
+	ct = gc->chip_types;
+
+	ct->chip.irq_set_type = gsta_irq_type;
+	ct->chip.irq_disable = gsta_irq_disable;
+	ct->chip.irq_enable = gsta_irq_enable;
+
+	/* FIXME: this makes at most 32 interrupts. Request 0 by now */
+	irq_setup_generic_chip(gc, 0 /* IRQ_MSK(GSTA_GPIO_PER_BLOCK) */, 0,
+			       IRQ_NOREQUEST | IRQ_NOPROBE, 0);
+
+	/* Set up all all 128 interrupts: code from setup_generic_chip */
+	{
+		struct irq_chip_type *ct = gc->chip_types;
+		int i, j;
+		for (j = 0; j < GSTA_NR_GPIO; j++) {
+			i = chip->irq_base + j;
+			irq_set_chip_and_handler(i, &ct->chip, ct->handler);
+			irq_set_chip_data(i, gc);
+			irq_modify_status(i, IRQ_NOREQUEST | IRQ_NOPROBE, 0);
+		}
+		gc->irq_cnt = i - gc->irq_base;
+	}
+}
+
+/* The platform device used here is instantiated by the MFD device */
+static int __devinit gsta_probe(struct platform_device *dev)
+{
+	int i, err;
+	struct pci_dev *pdev;
+	struct sta2x11_gpio_pdata *gpio_pdata;
+	struct gsta_gpio *chip;
+	struct resource *res;
+
+	pdev = *(struct pci_dev **)(dev->dev.platform_data);
+	gpio_pdata = dev_get_platdata(&pdev->dev);
+
+	if (gpio_pdata == NULL)
+		dev_err(&dev->dev, "no gpio config\n");
+	pr_debug("gpio config: %p\n", gpio_pdata);
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+
+	chip = devm_kzalloc(&dev->dev, sizeof(*chip), GFP_KERNEL);
+	chip->dev = &dev->dev;
+	chip->reg_base = devm_request_and_ioremap(&dev->dev, res);
+
+	for (i = 0; i < GSTA_NR_BLOCKS; i++) {
+		chip->regs[i] = chip->reg_base + i * 4096;
+		/* disable all irqs */
+		writel(0, &chip->regs[i]->rimsc);
+		writel(0, &chip->regs[i]->fimsc);
+		writel(~0, &chip->regs[i]->ic);
+	}
+	spin_lock_init(&chip->lock);
+	gsta_gpio_setup(chip);
+	for (i = 0; i < GSTA_NR_GPIO; i++)
+		gsta_set_config(chip, i, gpio_pdata->pinconfig[i]);
+
+	/* 384 was used in previous code: be compatible for other drivers */
+	err = irq_alloc_descs(-1, 384, GSTA_NR_GPIO, NUMA_NO_NODE);
+	if (err < 0) {
+		dev_warn(&dev->dev, "sta2x11 gpio: Can't get irq base (%i)\n",
+			 -err);
+		return err;
+	}
+	chip->irq_base = err;
+	gsta_alloc_irq_chip(chip);
+
+	err = request_irq(pdev->irq, gsta_gpio_handler,
+			     IRQF_SHARED, KBUILD_MODNAME, chip);
+	if (err < 0) {
+		dev_err(&dev->dev, "sta2x11 gpio: Can't request irq (%i)\n",
+			-err);
+		goto err_free_descs;
+	}
+
+	err = gpiochip_add(&chip->gpio);
+	if (err < 0) {
+		dev_err(&dev->dev, "sta2x11 gpio: Can't register (%i)\n",
+			-err);
+		goto err_free_irq;
+	}
+
+	platform_set_drvdata(dev, chip);
+	return 0;
+
+err_free_irq:
+	free_irq(pdev->irq, chip);
+err_free_descs:
+	irq_free_descs(chip->irq_base, GSTA_NR_GPIO);
+	return err;
+}
+
+static struct platform_driver sta2x11_gpio_platform_driver = {
+	.driver = {
+		.name	= "sta2x11-gpio",
+		.owner	= THIS_MODULE,
+	},
+	.probe = gsta_probe,
+};
+
+module_platform_driver(sta2x11_gpio_platform_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("sta2x11_gpio GPIO driver");
diff --git a/drivers/gpio/gpio-stp-xway.c b/drivers/gpio/gpio-stp-xway.c
new file mode 100644
index 000000000000..e35096bf3cfb
--- /dev/null
+++ b/drivers/gpio/gpio-stp-xway.c
@@ -0,0 +1,301 @@
+/*
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as published
+ *  by the Free Software Foundation.
+ *
+ *  Copyright (C) 2012 John Crispin <[email protected]>
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/of_platform.h>
+#include <linux/mutex.h>
+#include <linux/gpio.h>
+#include <linux/io.h>
+#include <linux/of_gpio.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+
+#include <lantiq_soc.h>
+
+/*
+ * The Serial To Parallel (STP) is found on MIPS based Lantiq socs. It is a
+ * peripheral controller used to drive external shift register cascades. At most
+ * 3 groups of 8 bits can be driven. The hardware is able to allow the DSL modem
+ * to drive the 2 LSBs of the cascade automatically.
+ */
+
+/* control register 0 */
+#define XWAY_STP_CON0		0x00
+/* control register 1 */
+#define XWAY_STP_CON1		0x04
+/* data register 0 */
+#define XWAY_STP_CPU0		0x08
+/* data register 1 */
+#define XWAY_STP_CPU1		0x0C
+/* access register */
+#define XWAY_STP_AR		0x10
+
+/* software or hardware update select bit */
+#define XWAY_STP_CON_SWU	BIT(31)
+
+/* automatic update rates */
+#define XWAY_STP_2HZ		0
+#define XWAY_STP_4HZ		BIT(23)
+#define XWAY_STP_8HZ		BIT(24)
+#define XWAY_STP_10HZ		(BIT(24) | BIT(23))
+#define XWAY_STP_SPEED_MASK	(0xf << 23)
+
+/* clock source for automatic update */
+#define XWAY_STP_UPD_FPI	BIT(31)
+#define XWAY_STP_UPD_MASK	(BIT(31) | BIT(30))
+
+/* let the adsl core drive the 2 LSBs */
+#define XWAY_STP_ADSL_SHIFT	24
+#define XWAY_STP_ADSL_MASK	0x3
+
+/* 2 groups of 3 bits can be driven by the phys */
+#define XWAY_STP_PHY_MASK	0x3
+#define XWAY_STP_PHY1_SHIFT	27
+#define XWAY_STP_PHY2_SHIFT	15
+
+/* STP has 3 groups of 8 bits */
+#define XWAY_STP_GROUP0		BIT(0)
+#define XWAY_STP_GROUP1		BIT(1)
+#define XWAY_STP_GROUP2		BIT(2)
+#define XWAY_STP_GROUP_MASK	(0x7)
+
+/* Edge configuration bits */
+#define XWAY_STP_FALLING	BIT(26)
+#define XWAY_STP_EDGE_MASK	BIT(26)
+
+#define xway_stp_r32(m, reg)		__raw_readl(m + reg)
+#define xway_stp_w32(m, val, reg)	__raw_writel(val, m + reg)
+#define xway_stp_w32_mask(m, clear, set, reg) \
+		ltq_w32((ltq_r32(m + reg) & ~(clear)) | (set), \
+		m + reg)
+
+struct xway_stp {
+	struct gpio_chip gc;
+	void __iomem *virt;
+	u32 edge;	/* rising or falling edge triggered shift register */
+	u16 shadow;	/* shadow the shift registers state */
+	u8 groups;	/* we can drive 1-3 groups of 8bit each */
+	u8 dsl;		/* the 2 LSBs can be driven by the dsl core */
+	u8 phy1;	/* 3 bits can be driven by phy1 */
+	u8 phy2;	/* 3 bits can be driven by phy2 */
+	u8 reserved;	/* mask out the hw driven bits in gpio_request */
+};
+
+/**
+ * xway_stp_set() - gpio_chip->set - set gpios.
+ * @gc:     Pointer to gpio_chip device structure.
+ * @gpio:   GPIO signal number.
+ * @val:    Value to be written to specified signal.
+ *
+ * Set the shadow value and call ltq_ebu_apply.
+ */
+static void xway_stp_set(struct gpio_chip *gc, unsigned gpio, int val)
+{
+	struct xway_stp *chip =
+		container_of(gc, struct xway_stp, gc);
+
+	if (val)
+		chip->shadow |= BIT(gpio);
+	else
+		chip->shadow &= ~BIT(gpio);
+	xway_stp_w32(chip->virt, chip->shadow, XWAY_STP_CPU0);
+	xway_stp_w32_mask(chip->virt, 0, XWAY_STP_CON_SWU, XWAY_STP_CON0);
+}
+
+/**
+ * xway_stp_dir_out() - gpio_chip->dir_out - set gpio direction.
+ * @gc:     Pointer to gpio_chip device structure.
+ * @gpio:   GPIO signal number.
+ * @val:    Value to be written to specified signal.
+ *
+ * Same as xway_stp_set, always returns 0.
+ */
+static int xway_stp_dir_out(struct gpio_chip *gc, unsigned gpio, int val)
+{
+	xway_stp_set(gc, gpio, val);
+
+	return 0;
+}
+
+/**
+ * xway_stp_request() - gpio_chip->request
+ * @gc:     Pointer to gpio_chip device structure.
+ * @gpio:   GPIO signal number.
+ *
+ * We mask out the HW driven pins
+ */
+static int xway_stp_request(struct gpio_chip *gc, unsigned gpio)
+{
+	struct xway_stp *chip =
+		container_of(gc, struct xway_stp, gc);
+
+	if ((gpio < 8) && (chip->reserved & BIT(gpio))) {
+		dev_err(gc->dev, "GPIO %d is driven by hardware\n", gpio);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/**
+ * xway_stp_hw_init() - Configure the STP unit and enable the clock gate
+ * @virt: pointer to the remapped register range
+ */
+static int xway_stp_hw_init(struct xway_stp *chip)
+{
+	/* sane defaults */
+	xway_stp_w32(chip->virt, 0, XWAY_STP_AR);
+	xway_stp_w32(chip->virt, 0, XWAY_STP_CPU0);
+	xway_stp_w32(chip->virt, 0, XWAY_STP_CPU1);
+	xway_stp_w32(chip->virt, XWAY_STP_CON_SWU, XWAY_STP_CON0);
+	xway_stp_w32(chip->virt, 0, XWAY_STP_CON1);
+
+	/* apply edge trigger settings for the shift register */
+	xway_stp_w32_mask(chip->virt, XWAY_STP_EDGE_MASK,
+				chip->edge, XWAY_STP_CON0);
+
+	/* apply led group settings */
+	xway_stp_w32_mask(chip->virt, XWAY_STP_GROUP_MASK,
+				chip->groups, XWAY_STP_CON1);
+
+	/* tell the hardware which pins are controlled by the dsl modem */
+	xway_stp_w32_mask(chip->virt,
+			XWAY_STP_ADSL_MASK << XWAY_STP_ADSL_SHIFT,
+			chip->dsl << XWAY_STP_ADSL_SHIFT,
+			XWAY_STP_CON0);
+
+	/* tell the hardware which pins are controlled by the phys */
+	xway_stp_w32_mask(chip->virt,
+			XWAY_STP_PHY_MASK << XWAY_STP_PHY1_SHIFT,
+			chip->phy1 << XWAY_STP_PHY1_SHIFT,
+			XWAY_STP_CON0);
+	xway_stp_w32_mask(chip->virt,
+			XWAY_STP_PHY_MASK << XWAY_STP_PHY2_SHIFT,
+			chip->phy2 << XWAY_STP_PHY2_SHIFT,
+			XWAY_STP_CON1);
+
+	/* mask out the hw driven bits in gpio_request */
+	chip->reserved = (chip->phy2 << 5) | (chip->phy1 << 2) | chip->dsl;
+
+	/*
+	 * if we have pins that are driven by hw, we need to tell the stp what
+	 * clock to use as a timer.
+	 */
+	if (chip->reserved)
+		xway_stp_w32_mask(chip->virt, XWAY_STP_UPD_MASK,
+			XWAY_STP_UPD_FPI, XWAY_STP_CON1);
+
+	return 0;
+}
+
+static int __devinit xway_stp_probe(struct platform_device *pdev)
+{
+	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	const __be32 *shadow, *groups, *dsl, *phy;
+	struct xway_stp *chip;
+	struct clk *clk;
+	int ret = 0;
+
+	if (!res) {
+		dev_err(&pdev->dev, "failed to request STP resource\n");
+		return -ENOENT;
+	}
+
+	chip = devm_kzalloc(&pdev->dev, sizeof(*chip), GFP_KERNEL);
+	if (!chip)
+		return -ENOMEM;
+
+	chip->virt = devm_request_and_ioremap(&pdev->dev, res);
+	if (!chip->virt) {
+		dev_err(&pdev->dev, "failed to remap STP memory\n");
+		return -ENOMEM;
+	}
+	chip->gc.dev = &pdev->dev;
+	chip->gc.label = "stp-xway";
+	chip->gc.direction_output = xway_stp_dir_out;
+	chip->gc.set = xway_stp_set;
+	chip->gc.request = xway_stp_request;
+	chip->gc.base = -1;
+	chip->gc.owner = THIS_MODULE;
+
+	/* store the shadow value if one was passed by the devicetree */
+	shadow = of_get_property(pdev->dev.of_node, "lantiq,shadow", NULL);
+	if (shadow)
+		chip->shadow = be32_to_cpu(*shadow);
+
+	/* find out which gpio groups should be enabled */
+	groups = of_get_property(pdev->dev.of_node, "lantiq,groups", NULL);
+	if (groups)
+		chip->groups = be32_to_cpu(*groups) & XWAY_STP_GROUP_MASK;
+	else
+		chip->groups = XWAY_STP_GROUP0;
+	chip->gc.ngpio = fls(chip->groups) * 8;
+
+	/* find out which gpios are controlled by the dsl core */
+	dsl = of_get_property(pdev->dev.of_node, "lantiq,dsl", NULL);
+	if (dsl)
+		chip->dsl = be32_to_cpu(*dsl) & XWAY_STP_ADSL_MASK;
+
+	/* find out which gpios are controlled by the phys */
+	if (of_machine_is_compatible("lantiq,ar9") ||
+			of_machine_is_compatible("lantiq,gr9") ||
+			of_machine_is_compatible("lantiq,vr9")) {
+		phy = of_get_property(pdev->dev.of_node, "lantiq,phy1", NULL);
+		if (phy)
+			chip->phy1 = be32_to_cpu(*phy) & XWAY_STP_PHY_MASK;
+		phy = of_get_property(pdev->dev.of_node, "lantiq,phy2", NULL);
+		if (phy)
+			chip->phy2 = be32_to_cpu(*phy) & XWAY_STP_PHY_MASK;
+	}
+
+	/* check which edge trigger we should use, default to a falling edge */
+	if (!of_find_property(pdev->dev.of_node, "lantiq,rising", NULL))
+		chip->edge = XWAY_STP_FALLING;
+
+	clk = clk_get(&pdev->dev, NULL);
+	if (IS_ERR(clk)) {
+		dev_err(&pdev->dev, "Failed to get clock\n");
+		return PTR_ERR(clk);
+	}
+	clk_enable(clk);
+
+	ret = xway_stp_hw_init(chip);
+	if (!ret)
+		ret = gpiochip_add(&chip->gc);
+
+	if (!ret)
+		dev_info(&pdev->dev, "Init done\n");
+
+	return ret;
+}
+
+static const struct of_device_id xway_stp_match[] = {
+	{ .compatible = "lantiq,gpio-stp-xway" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, xway_stp_match);
+
+static struct platform_driver xway_stp_driver = {
+	.probe = xway_stp_probe,
+	.driver = {
+		.name = "gpio-stp-xway",
+		.owner = THIS_MODULE,
+		.of_match_table = xway_stp_match,
+	},
+};
+
+int __init xway_stp_init(void)
+{
+	return platform_driver_register(&xway_stp_driver);
+}
+
+subsys_initcall(xway_stp_init);
diff --git a/drivers/gpio/gpio-tps65910.c b/drivers/gpio/gpio-tps65910.c
index 7eef648a3351..c1ad2884f2ed 100644
--- a/drivers/gpio/gpio-tps65910.c
+++ b/drivers/gpio/gpio-tps65910.c
@@ -18,14 +18,27 @@
 #include <linux/errno.h>
 #include <linux/gpio.h>
 #include <linux/i2c.h>
+#include <linux/platform_device.h>
 #include <linux/mfd/tps65910.h>
+#include <linux/of_device.h>
+
+struct tps65910_gpio {
+	struct gpio_chip gpio_chip;
+	struct tps65910 *tps65910;
+};
+
+static inline struct tps65910_gpio *to_tps65910_gpio(struct gpio_chip *chip)
+{
+	return container_of(chip, struct tps65910_gpio, gpio_chip);
+}
 
 static int tps65910_gpio_get(struct gpio_chip *gc, unsigned offset)
 {
-	struct tps65910 *tps65910 = container_of(gc, struct tps65910, gpio);
-	uint8_t val;
+	struct tps65910_gpio *tps65910_gpio = to_tps65910_gpio(gc);
+	struct tps65910 *tps65910 = tps65910_gpio->tps65910;
+	unsigned int val;
 
-	tps65910->read(tps65910, TPS65910_GPIO0 + offset, 1, &val);
+	tps65910_reg_read(tps65910, TPS65910_GPIO0 + offset, &val);
 
 	if (val & GPIO_STS_MASK)
 		return 1;
@@ -36,83 +49,170 @@ static int tps65910_gpio_get(struct gpio_chip *gc, unsigned offset)
 static void tps65910_gpio_set(struct gpio_chip *gc, unsigned offset,
 			      int value)
 {
-	struct tps65910 *tps65910 = container_of(gc, struct tps65910, gpio);
+	struct tps65910_gpio *tps65910_gpio = to_tps65910_gpio(gc);
+	struct tps65910 *tps65910 = tps65910_gpio->tps65910;
 
 	if (value)
-		tps65910_set_bits(tps65910, TPS65910_GPIO0 + offset,
+		tps65910_reg_set_bits(tps65910, TPS65910_GPIO0 + offset,
 						GPIO_SET_MASK);
 	else
-		tps65910_clear_bits(tps65910, TPS65910_GPIO0 + offset,
+		tps65910_reg_clear_bits(tps65910, TPS65910_GPIO0 + offset,
 						GPIO_SET_MASK);
 }
 
 static int tps65910_gpio_output(struct gpio_chip *gc, unsigned offset,
 				int value)
 {
-	struct tps65910 *tps65910 = container_of(gc, struct tps65910, gpio);
+	struct tps65910_gpio *tps65910_gpio = to_tps65910_gpio(gc);
+	struct tps65910 *tps65910 = tps65910_gpio->tps65910;
 
 	/* Set the initial value */
 	tps65910_gpio_set(gc, offset, value);
 
-	return tps65910_set_bits(tps65910, TPS65910_GPIO0 + offset,
+	return tps65910_reg_set_bits(tps65910, TPS65910_GPIO0 + offset,
 						GPIO_CFG_MASK);
 }
 
 static int tps65910_gpio_input(struct gpio_chip *gc, unsigned offset)
 {
-	struct tps65910 *tps65910 = container_of(gc, struct tps65910, gpio);
+	struct tps65910_gpio *tps65910_gpio = to_tps65910_gpio(gc);
+	struct tps65910 *tps65910 = tps65910_gpio->tps65910;
 
-	return tps65910_clear_bits(tps65910, TPS65910_GPIO0 + offset,
+	return tps65910_reg_clear_bits(tps65910, TPS65910_GPIO0 + offset,
 						GPIO_CFG_MASK);
 }
 
-void tps65910_gpio_init(struct tps65910 *tps65910, int gpio_base)
+#ifdef CONFIG_OF
+static struct tps65910_board *tps65910_parse_dt_for_gpio(struct device *dev,
+		struct tps65910 *tps65910, int chip_ngpio)
 {
+	struct tps65910_board *tps65910_board = tps65910->of_plat_data;
+	unsigned int prop_array[TPS6591X_MAX_NUM_GPIO];
+	int ngpio = min(chip_ngpio, TPS6591X_MAX_NUM_GPIO);
 	int ret;
-	struct tps65910_board *board_data;
+	int idx;
+
+	tps65910_board->gpio_base = -1;
+	ret = of_property_read_u32_array(tps65910->dev->of_node,
+			"ti,en-gpio-sleep", prop_array, ngpio);
+	if (ret < 0) {
+		dev_dbg(dev, "ti,en-gpio-sleep not specified\n");
+		return tps65910_board;
+	}
 
-	if (!gpio_base)
-		return;
+	for (idx = 0; idx < ngpio; idx++)
+		tps65910_board->en_gpio_sleep[idx] = (prop_array[idx] != 0);
 
-	tps65910->gpio.owner		= THIS_MODULE;
-	tps65910->gpio.label		= tps65910->i2c_client->name;
-	tps65910->gpio.dev		= tps65910->dev;
-	tps65910->gpio.base		= gpio_base;
+	return tps65910_board;
+}
+#else
+static struct tps65910_board *tps65910_parse_dt_for_gpio(struct device *dev,
+		struct tps65910 *tps65910, int chip_ngpio)
+{
+	return NULL;
+}
+#endif
+
+static int __devinit tps65910_gpio_probe(struct platform_device *pdev)
+{
+	struct tps65910 *tps65910 = dev_get_drvdata(pdev->dev.parent);
+	struct tps65910_board *pdata = dev_get_platdata(tps65910->dev);
+	struct tps65910_gpio *tps65910_gpio;
+	int ret;
+	int i;
+
+	tps65910_gpio = devm_kzalloc(&pdev->dev,
+				sizeof(*tps65910_gpio), GFP_KERNEL);
+	if (!tps65910_gpio) {
+		dev_err(&pdev->dev, "Could not allocate tps65910_gpio\n");
+		return -ENOMEM;
+	}
+
+	tps65910_gpio->tps65910 = tps65910;
+
+	tps65910_gpio->gpio_chip.owner = THIS_MODULE;
+	tps65910_gpio->gpio_chip.label = tps65910->i2c_client->name;
 
 	switch(tps65910_chip_id(tps65910)) {
 	case TPS65910:
-		tps65910->gpio.ngpio	= TPS65910_NUM_GPIO;
+		tps65910_gpio->gpio_chip.ngpio = TPS65910_NUM_GPIO;
 		break;
 	case TPS65911:
-		tps65910->gpio.ngpio	= TPS65911_NUM_GPIO;
+		tps65910_gpio->gpio_chip.ngpio = TPS65911_NUM_GPIO;
 		break;
 	default:
-		return;
+		return -EINVAL;
+	}
+	tps65910_gpio->gpio_chip.can_sleep = 1;
+	tps65910_gpio->gpio_chip.direction_input = tps65910_gpio_input;
+	tps65910_gpio->gpio_chip.direction_output = tps65910_gpio_output;
+	tps65910_gpio->gpio_chip.set	= tps65910_gpio_set;
+	tps65910_gpio->gpio_chip.get	= tps65910_gpio_get;
+	tps65910_gpio->gpio_chip.dev = &pdev->dev;
+	if (pdata && pdata->gpio_base)
+		tps65910_gpio->gpio_chip.base = pdata->gpio_base;
+	else
+		tps65910_gpio->gpio_chip.base = -1;
+
+	if (!pdata && tps65910->dev->of_node)
+		pdata = tps65910_parse_dt_for_gpio(&pdev->dev, tps65910,
+			tps65910_gpio->gpio_chip.ngpio);
+
+	if (!pdata)
+		goto skip_init;
+
+	/* Configure sleep control for gpios if provided */
+	for (i = 0; i < tps65910_gpio->gpio_chip.ngpio; ++i) {
+		if (!pdata->en_gpio_sleep[i])
+			continue;
+
+		ret = tps65910_reg_set_bits(tps65910,
+			TPS65910_GPIO0 + i, GPIO_SLEEP_MASK);
+		if (ret < 0)
+			dev_warn(tps65910->dev,
+				"GPIO Sleep setting failed with err %d\n", ret);
 	}
-	tps65910->gpio.can_sleep	= 1;
-
-	tps65910->gpio.direction_input	= tps65910_gpio_input;
-	tps65910->gpio.direction_output	= tps65910_gpio_output;
-	tps65910->gpio.set		= tps65910_gpio_set;
-	tps65910->gpio.get		= tps65910_gpio_get;
-
-	/* Configure sleep control for gpios */
-	board_data = dev_get_platdata(tps65910->dev);
-	if (board_data) {
-		int i;
-		for (i = 0; i < tps65910->gpio.ngpio; ++i) {
-			if (board_data->en_gpio_sleep[i]) {
-				ret = tps65910_set_bits(tps65910,
-					TPS65910_GPIO0 + i, GPIO_SLEEP_MASK);
-				if (ret < 0)
-					dev_warn(tps65910->dev,
-						"GPIO Sleep setting failed\n");
-			}
-		}
+
+skip_init:
+	ret = gpiochip_add(&tps65910_gpio->gpio_chip);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Could not register gpiochip, %d\n", ret);
+		return ret;
 	}
 
-	ret = gpiochip_add(&tps65910->gpio);
+	platform_set_drvdata(pdev, tps65910_gpio);
+
+	return ret;
+}
+
+static int __devexit tps65910_gpio_remove(struct platform_device *pdev)
+{
+	struct tps65910_gpio *tps65910_gpio = platform_get_drvdata(pdev);
 
-	if (ret)
-		dev_warn(tps65910->dev, "GPIO registration failed: %d\n", ret);
+	return gpiochip_remove(&tps65910_gpio->gpio_chip);
 }
+
+static struct platform_driver tps65910_gpio_driver = {
+	.driver.name    = "tps65910-gpio",
+	.driver.owner   = THIS_MODULE,
+	.probe		= tps65910_gpio_probe,
+	.remove		= __devexit_p(tps65910_gpio_remove),
+};
+
+static int __init tps65910_gpio_init(void)
+{
+	return platform_driver_register(&tps65910_gpio_driver);
+}
+subsys_initcall(tps65910_gpio_init);
+
+static void __exit tps65910_gpio_exit(void)
+{
+	platform_driver_unregister(&tps65910_gpio_driver);
+}
+module_exit(tps65910_gpio_exit);
+
+MODULE_AUTHOR("Graeme Gregory <[email protected]>");
+MODULE_AUTHOR("Jorge Eduardo Candelaria [email protected]>");
+MODULE_DESCRIPTION("GPIO interface for TPS65910/TPS6511 PMICs");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:tps65910-gpio");
diff --git a/drivers/gpio/gpio-wm831x.c b/drivers/gpio/gpio-wm831x.c
index deb949e75ec1..e56a2165641c 100644
--- a/drivers/gpio/gpio-wm831x.c
+++ b/drivers/gpio/gpio-wm831x.c
@@ -102,10 +102,8 @@ static int wm831x_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
 	struct wm831x_gpio *wm831x_gpio = to_wm831x_gpio(chip);
 	struct wm831x *wm831x = wm831x_gpio->wm831x;
 
-	if (!wm831x->irq_base)
-		return -EINVAL;
-
-	return wm831x->irq_base + WM831X_IRQ_GPIO_1 + offset;
+	return irq_create_mapping(wm831x->irq_domain,
+				  WM831X_IRQ_GPIO_1 + offset);
 }
 
 static int wm831x_gpio_set_debounce(struct gpio_chip *chip, unsigned offset,
diff --git a/drivers/input/misc/wm831x-on.c b/drivers/input/misc/wm831x-on.c
index 47f18d6bce46..6790a812a1db 100644
--- a/drivers/input/misc/wm831x-on.c
+++ b/drivers/input/misc/wm831x-on.c
@@ -73,7 +73,7 @@ static int __devinit wm831x_on_probe(struct platform_device *pdev)
 {
 	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
 	struct wm831x_on *wm831x_on;
-	int irq = platform_get_irq(pdev, 0);
+	int irq = wm831x_irq(wm831x, platform_get_irq(pdev, 0));
 	int ret;
 
 	wm831x_on = kzalloc(sizeof(struct wm831x_on), GFP_KERNEL);
diff --git a/drivers/input/touchscreen/wm831x-ts.c b/drivers/input/touchscreen/wm831x-ts.c
index 4bc851a9dc3d..e83410721e38 100644
--- a/drivers/input/touchscreen/wm831x-ts.c
+++ b/drivers/input/touchscreen/wm831x-ts.c
@@ -260,15 +260,16 @@ static __devinit int wm831x_ts_probe(struct platform_device *pdev)
 	 * If we have a direct IRQ use it, otherwise use the interrupt
 	 * from the WM831x IRQ controller.
 	 */
+	wm831x_ts->data_irq = wm831x_irq(wm831x,
+					 platform_get_irq_byname(pdev,
+								 "TCHDATA"));
 	if (pdata && pdata->data_irq)
 		wm831x_ts->data_irq = pdata->data_irq;
-	else
-		wm831x_ts->data_irq = platform_get_irq_byname(pdev, "TCHDATA");
 
+	wm831x_ts->pd_irq = wm831x_irq(wm831x,
+				       platform_get_irq_byname(pdev, "TCHPD"));
 	if (pdata && pdata->pd_irq)
 		wm831x_ts->pd_irq = pdata->pd_irq;
-	else
-		wm831x_ts->pd_irq = platform_get_irq_byname(pdev, "TCHPD");
 
 	if (pdata)
 		wm831x_ts->pressure = pdata->pressure;
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index a5bee8e2dfce..d90a421e9cac 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -450,12 +450,27 @@ static void dump_command(unsigned long phys_addr)
 
 static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
 {
-	u32 *event = __evt;
-	int type  = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
-	int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
-	int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
-	int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
-	u64 address = (u64)(((u64)event[3]) << 32) | event[2];
+	int type, devid, domid, flags;
+	volatile u32 *event = __evt;
+	int count = 0;
+	u64 address;
+
+retry:
+	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
+	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
+	domid   = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
+	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
+	address = (u64)(((u64)event[3]) << 32) | event[2];
+
+	if (type == 0) {
+		/* Did we hit the erratum? */
+		if (++count == LOOP_TIMEOUT) {
+			pr_err("AMD-Vi: No event written to event log\n");
+			return;
+		}
+		udelay(1);
+		goto retry;
+	}
 
 	printk(KERN_ERR "AMD-Vi: Event logged [");
 
@@ -508,6 +523,8 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
 	default:
 		printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
 	}
+
+	memset(__evt, 0, 4 * sizeof(u32));
 }
 
 static void iommu_poll_events(struct amd_iommu *iommu)
@@ -2035,20 +2052,20 @@ out_err:
 }
 
 /* FIXME: Move this to PCI code */
-#define PCI_PRI_TLP_OFF		(1 << 2)
+#define PCI_PRI_TLP_OFF		(1 << 15)
 
 bool pci_pri_tlp_required(struct pci_dev *pdev)
 {
-	u16 control;
+	u16 status;
 	int pos;
 
 	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
 	if (!pos)
 		return false;
 
-	pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
+	pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status);
 
-	return (control & PCI_PRI_TLP_OFF) ? true : false;
+	return (status & PCI_PRI_TLP_OFF) ? true : false;
 }
 
 /*
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 2198b2dbbcd3..8b9ded88e6f5 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -119,6 +119,7 @@ EXPORT_SYMBOL_GPL(iommu_present);
  * iommu_set_fault_handler() - set a fault handler for an iommu domain
  * @domain: iommu domain
  * @handler: fault handler
+ * @token: user data, will be passed back to the fault handler
  *
  * This function should be used by IOMMU users which want to be notified
  * whenever an IOMMU fault happens.
@@ -127,11 +128,13 @@ EXPORT_SYMBOL_GPL(iommu_present);
  * error code otherwise.
  */
 void iommu_set_fault_handler(struct iommu_domain *domain,
-					iommu_fault_handler_t handler)
+					iommu_fault_handler_t handler,
+					void *token)
 {
 	BUG_ON(!domain);
 
 	domain->handler = handler;
+	domain->handler_token = token;
 }
 EXPORT_SYMBOL_GPL(iommu_set_fault_handler);
 
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index 6899dcd02dfa..e70ee2b59df9 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -41,11 +41,13 @@
  * @pgtable:	the page table
  * @iommu_dev:	an omap iommu device attached to this domain. only a single
  *		iommu device can be attached for now.
+ * @dev:	Device using this domain.
  * @lock:	domain lock, should be taken when attaching/detaching
  */
 struct omap_iommu_domain {
 	u32 *pgtable;
 	struct omap_iommu *iommu_dev;
+	struct device *dev;
 	spinlock_t lock;
 };
 
@@ -1081,6 +1083,7 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
 	}
 
 	omap_domain->iommu_dev = arch_data->iommu_dev = oiommu;
+	omap_domain->dev = dev;
 	oiommu->domain = domain;
 
 out:
@@ -1088,19 +1091,16 @@ out:
 	return ret;
 }
 
-static void omap_iommu_detach_dev(struct iommu_domain *domain,
-				 struct device *dev)
+static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain,
+			struct device *dev)
 {
-	struct omap_iommu_domain *omap_domain = domain->priv;
-	struct omap_iommu_arch_data *arch_data = dev->archdata.iommu;
 	struct omap_iommu *oiommu = dev_to_omap_iommu(dev);
-
-	spin_lock(&omap_domain->lock);
+	struct omap_iommu_arch_data *arch_data = dev->archdata.iommu;
 
 	/* only a single device is supported per domain for now */
 	if (omap_domain->iommu_dev != oiommu) {
 		dev_err(dev, "invalid iommu device\n");
-		goto out;
+		return;
 	}
 
 	iopgtable_clear_entry_all(oiommu);
@@ -1108,8 +1108,16 @@ static void omap_iommu_detach_dev(struct iommu_domain *domain,
 	omap_iommu_detach(oiommu);
 
 	omap_domain->iommu_dev = arch_data->iommu_dev = NULL;
+	omap_domain->dev = NULL;
+}
 
-out:
+static void omap_iommu_detach_dev(struct iommu_domain *domain,
+				 struct device *dev)
+{
+	struct omap_iommu_domain *omap_domain = domain->priv;
+
+	spin_lock(&omap_domain->lock);
+	_omap_iommu_detach_dev(omap_domain, dev);
 	spin_unlock(&omap_domain->lock);
 }
 
@@ -1148,13 +1156,19 @@ out:
 	return -ENOMEM;
 }
 
-/* assume device was already detached */
 static void omap_iommu_domain_destroy(struct iommu_domain *domain)
 {
 	struct omap_iommu_domain *omap_domain = domain->priv;
 
 	domain->priv = NULL;
 
+	/*
+	 * An iommu device is still attached
+	 * (currently, only one device can be attached) ?
+	 */
+	if (omap_domain->iommu_dev)
+		_omap_iommu_detach_dev(omap_domain, omap_domain->dev);
+
 	kfree(omap_domain->pgtable);
 	kfree(omap_domain);
 }
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index 779306ee7b16..0c0a37792218 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -29,15 +29,17 @@
 #include <linux/device.h>
 #include <linux/io.h>
 #include <linux/iommu.h>
+#include <linux/of.h>
 
 #include <asm/cacheflush.h>
 
 /* bitmap of the page sizes currently supported */
 #define GART_IOMMU_PGSIZES	(SZ_4K)
 
-#define GART_CONFIG		0x24
-#define GART_ENTRY_ADDR		0x28
-#define GART_ENTRY_DATA		0x2c
+#define GART_REG_BASE		0x24
+#define GART_CONFIG		(0x24 - GART_REG_BASE)
+#define GART_ENTRY_ADDR		(0x28 - GART_REG_BASE)
+#define GART_ENTRY_DATA		(0x2c - GART_REG_BASE)
 #define GART_ENTRY_PHYS_ADDR_VALID	(1 << 31)
 
 #define GART_PAGE_SHIFT		12
@@ -158,7 +160,7 @@ static int gart_iommu_attach_dev(struct iommu_domain *domain,
 	struct gart_client *client, *c;
 	int err = 0;
 
-	gart = dev_get_drvdata(dev->parent);
+	gart = gart_handle;
 	if (!gart)
 		return -EINVAL;
 	domain->priv = gart;
@@ -422,6 +424,14 @@ const struct dev_pm_ops tegra_gart_pm_ops = {
 	.resume		= tegra_gart_resume,
 };
 
+#ifdef CONFIG_OF
+static struct of_device_id tegra_gart_of_match[] __devinitdata = {
+	{ .compatible = "nvidia,tegra20-gart", },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, tegra_gart_of_match);
+#endif
+
 static struct platform_driver tegra_gart_driver = {
 	.probe		= tegra_gart_probe,
 	.remove		= tegra_gart_remove,
@@ -429,6 +439,7 @@ static struct platform_driver tegra_gart_driver = {
 		.owner	= THIS_MODULE,
 		.name	= "tegra-gart",
 		.pm	= &tegra_gart_pm_ops,
+		.of_match_table = of_match_ptr(tegra_gart_of_match),
 	},
 };
 
@@ -448,4 +459,5 @@ module_exit(tegra_gart_exit);
 
 MODULE_DESCRIPTION("IOMMU API for GART in Tegra20");
 MODULE_AUTHOR("Hiroshi DOYU <[email protected]>");
+MODULE_ALIAS("platform:tegra-gart");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index eb93c821f592..ecd679043d77 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -733,7 +733,7 @@ static int smmu_iommu_attach_dev(struct iommu_domain *domain,
 		pr_info("Reserve \"page zero\" for AVP vectors using a common dummy\n");
 	}
 
-	dev_dbg(smmu->dev, "%s is attached\n", dev_name(c->dev));
+	dev_dbg(smmu->dev, "%s is attached\n", dev_name(dev));
 	return 0;
 
 err_client:
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index ff4b8cfda585..04cb8c88d74b 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -50,6 +50,19 @@ config LEDS_LM3530
 	  controlled manually or using PWM input or using ambient
 	  light automatically.
 
+config LEDS_LM3533
+	tristate "LED support for LM3533"
+	depends on LEDS_CLASS
+	depends on MFD_LM3533
+	help
+	  This option enables support for the LEDs on National Semiconductor /
+	  TI LM3533 Lighting Power chips.
+
+	  The LEDs can be controlled directly, through PWM input, or by the
+	  ambient-light-sensor interface. The chip supports
+	  hardware-accelerated blinking with maximum on and off periods of 9.8
+	  and 77 seconds respectively.
+
 config LEDS_LOCOMO
 	tristate "LED Support for Locomo device"
 	depends on LEDS_CLASS
@@ -259,6 +272,14 @@ config LEDS_DA903X
 	  This option enables support for on-chip LED drivers found
 	  on Dialog Semiconductor DA9030/DA9034 PMICs.
 
+config LEDS_DA9052
+	tristate "Dialog DA9052/DA9053 LEDS"
+	depends on LEDS_CLASS
+	depends on PMIC_DA9052
+	help
+	  This option enables support for on-chip LED drivers found
+	  on Dialog Semiconductor DA9052-BC and DA9053-AA/Bx PMICs.
+
 config LEDS_DAC124S085
 	tristate "LED Support for DAC124S085 SPI DAC"
 	depends on LEDS_CLASS
@@ -471,4 +492,12 @@ config LEDS_TRIGGER_DEFAULT_ON
 comment "iptables trigger is under Netfilter config (LED target)"
 	depends on LEDS_TRIGGERS
 
+config LEDS_TRIGGER_TRANSIENT
+	tristate "LED Transient Trigger"
+	depends on LEDS_TRIGGERS
+	help
+	  This allows one time activation of a transient state on
+	  GPIO/PWM based hadrware.
+	  If unsure, say Y.
+
 endif # NEW_LEDS
diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile
index 890481cb09f6..f8958cd6cf6e 100644
--- a/drivers/leds/Makefile
+++ b/drivers/leds/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_LEDS_ATMEL_PWM)		+= leds-atmel-pwm.o
 obj-$(CONFIG_LEDS_BD2802)		+= leds-bd2802.o
 obj-$(CONFIG_LEDS_LOCOMO)		+= leds-locomo.o
 obj-$(CONFIG_LEDS_LM3530)		+= leds-lm3530.o
+obj-$(CONFIG_LEDS_LM3533)		+= leds-lm3533.o
 obj-$(CONFIG_LEDS_MIKROTIK_RB532)	+= leds-rb532.o
 obj-$(CONFIG_LEDS_S3C24XX)		+= leds-s3c24xx.o
 obj-$(CONFIG_LEDS_NET48XX)		+= leds-net48xx.o
@@ -31,6 +32,7 @@ obj-$(CONFIG_LEDS_FSG)			+= leds-fsg.o
 obj-$(CONFIG_LEDS_PCA955X)		+= leds-pca955x.o
 obj-$(CONFIG_LEDS_PCA9633)		+= leds-pca9633.o
 obj-$(CONFIG_LEDS_DA903X)		+= leds-da903x.o
+obj-$(CONFIG_LEDS_DA9052)		+= leds-da9052.o
 obj-$(CONFIG_LEDS_WM831X_STATUS)	+= leds-wm831x-status.o
 obj-$(CONFIG_LEDS_WM8350)		+= leds-wm8350.o
 obj-$(CONFIG_LEDS_PWM)			+= leds-pwm.o
@@ -56,3 +58,4 @@ obj-$(CONFIG_LEDS_TRIGGER_HEARTBEAT)	+= ledtrig-heartbeat.o
 obj-$(CONFIG_LEDS_TRIGGER_BACKLIGHT)	+= ledtrig-backlight.o
 obj-$(CONFIG_LEDS_TRIGGER_GPIO)		+= ledtrig-gpio.o
 obj-$(CONFIG_LEDS_TRIGGER_DEFAULT_ON)	+= ledtrig-default-on.o
+obj-$(CONFIG_LEDS_TRIGGER_TRANSIENT)	+= ledtrig-transient.o
diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 5bff8439dc68..8ee92c81aec2 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -44,23 +44,18 @@ static ssize_t led_brightness_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t size)
 {
 	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	unsigned long state;
 	ssize_t ret = -EINVAL;
-	char *after;
-	unsigned long state = simple_strtoul(buf, &after, 10);
-	size_t count = after - buf;
 
-	if (isspace(*after))
-		count++;
+	ret = kstrtoul(buf, 10, &state);
+	if (ret)
+		return ret;
 
-	if (count == size) {
-		ret = count;
+	if (state == LED_OFF)
+		led_trigger_remove(led_cdev);
+	led_set_brightness(led_cdev, state);
 
-		if (state == LED_OFF)
-			led_trigger_remove(led_cdev);
-		led_set_brightness(led_cdev, state);
-	}
-
-	return ret;
+	return size;
 }
 
 static ssize_t led_max_brightness_show(struct device *dev,
diff --git a/drivers/leds/leds-da9052.c b/drivers/leds/leds-da9052.c
new file mode 100644
index 000000000000..58a5244c437e
--- /dev/null
+++ b/drivers/leds/leds-da9052.c
@@ -0,0 +1,214 @@
+/*
+ * LED Driver for Dialog DA9052 PMICs.
+ *
+ * Copyright(c) 2012 Dialog Semiconductor Ltd.
+ *
+ * Author: David Dajun Chen <[email protected]>
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/leds.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+
+#include <linux/mfd/da9052/reg.h>
+#include <linux/mfd/da9052/da9052.h>
+#include <linux/mfd/da9052/pdata.h>
+
+#define DA9052_OPENDRAIN_OUTPUT	2
+#define DA9052_SET_HIGH_LVL_OUTPUT	(1 << 3)
+#define DA9052_MASK_UPPER_NIBBLE	0xF0
+#define DA9052_MASK_LOWER_NIBBLE	0x0F
+#define DA9052_NIBBLE_SHIFT		4
+#define DA9052_MAX_BRIGHTNESS		0x5f
+
+struct da9052_led {
+	struct led_classdev cdev;
+	struct work_struct work;
+	struct da9052 *da9052;
+	unsigned char led_index;
+	unsigned char id;
+	int brightness;
+};
+
+static unsigned char led_reg[] = {
+	DA9052_LED_CONT_4_REG,
+	DA9052_LED_CONT_5_REG,
+};
+
+static int da9052_set_led_brightness(struct da9052_led *led)
+{
+	u8 val;
+	int error;
+
+	val = (led->brightness & 0x7f) | DA9052_LED_CONT_DIM;
+
+	error = da9052_reg_write(led->da9052, led_reg[led->led_index], val);
+	if (error < 0)
+		dev_err(led->da9052->dev, "Failed to set led brightness, %d\n",
+			error);
+	return error;
+}
+
+static void da9052_led_work(struct work_struct *work)
+{
+	struct da9052_led *led = container_of(work, struct da9052_led, work);
+
+	da9052_set_led_brightness(led);
+}
+
+static void da9052_led_set(struct led_classdev *led_cdev,
+			   enum led_brightness value)
+{
+	struct da9052_led *led;
+
+	led = container_of(led_cdev, struct da9052_led, cdev);
+	led->brightness = value;
+	schedule_work(&led->work);
+}
+
+static int da9052_configure_leds(struct da9052 *da9052)
+{
+	int error;
+	unsigned char register_value = DA9052_OPENDRAIN_OUTPUT
+				       | DA9052_SET_HIGH_LVL_OUTPUT;
+
+	error = da9052_reg_update(da9052, DA9052_GPIO_14_15_REG,
+				  DA9052_MASK_LOWER_NIBBLE,
+				  register_value);
+
+	if (error < 0) {
+		dev_err(da9052->dev, "Failed to write GPIO 14-15 reg, %d\n",
+			error);
+		return error;
+	}
+
+	error = da9052_reg_update(da9052, DA9052_GPIO_14_15_REG,
+				  DA9052_MASK_UPPER_NIBBLE,
+				  register_value << DA9052_NIBBLE_SHIFT);
+	if (error < 0)
+		dev_err(da9052->dev, "Failed to write GPIO 14-15 reg, %d\n",
+			error);
+
+	return error;
+}
+
+static int __devinit da9052_led_probe(struct platform_device *pdev)
+{
+	struct da9052_pdata *pdata;
+	struct da9052 *da9052;
+	struct led_platform_data *pled;
+	struct da9052_led *led = NULL;
+	int error = -ENODEV;
+	int i;
+
+	da9052 = dev_get_drvdata(pdev->dev.parent);
+	pdata = da9052->dev->platform_data;
+	if (pdata == NULL) {
+		dev_err(&pdev->dev, "No platform data\n");
+		goto err;
+	}
+
+	pled = pdata->pled;
+	if (pled == NULL) {
+		dev_err(&pdev->dev, "No platform data for LED\n");
+		goto err;
+	}
+
+	led = devm_kzalloc(&pdev->dev,
+			   sizeof(struct da9052_led) * pled->num_leds,
+			   GFP_KERNEL);
+	if (led == NULL) {
+		dev_err(&pdev->dev, "Failed to alloc memory\n");
+		error = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < pled->num_leds; i++) {
+		led[i].cdev.name = pled->leds[i].name;
+		led[i].cdev.brightness_set = da9052_led_set;
+		led[i].cdev.brightness = LED_OFF;
+		led[i].cdev.max_brightness = DA9052_MAX_BRIGHTNESS;
+		led[i].brightness = LED_OFF;
+		led[i].led_index = pled->leds[i].flags;
+		led[i].da9052 = dev_get_drvdata(pdev->dev.parent);
+		INIT_WORK(&led[i].work, da9052_led_work);
+
+		error = led_classdev_register(pdev->dev.parent, &led[i].cdev);
+		if (error) {
+			dev_err(&pdev->dev, "Failed to register led %d\n",
+				led[i].led_index);
+			goto err_register;
+		}
+
+		error = da9052_set_led_brightness(&led[i]);
+		if (error) {
+			dev_err(&pdev->dev, "Unable to init led %d\n",
+				led[i].led_index);
+			continue;
+		}
+	}
+	error = da9052_configure_leds(led->da9052);
+	if (error) {
+		dev_err(&pdev->dev, "Failed to configure GPIO LED%d\n", error);
+		goto err_register;
+	}
+
+	platform_set_drvdata(pdev, led);
+
+	return 0;
+
+err_register:
+	for (i = i - 1; i >= 0; i--) {
+		led_classdev_unregister(&led[i].cdev);
+		cancel_work_sync(&led[i].work);
+	}
+err:
+	return error;
+}
+
+static int __devexit da9052_led_remove(struct platform_device *pdev)
+{
+	struct da9052_led *led = platform_get_drvdata(pdev);
+	struct da9052_pdata *pdata;
+	struct da9052 *da9052;
+	struct led_platform_data *pled;
+	int i;
+
+	da9052 = dev_get_drvdata(pdev->dev.parent);
+	pdata = da9052->dev->platform_data;
+	pled = pdata->pled;
+
+	for (i = 0; i < pled->num_leds; i++) {
+		led[i].brightness = 0;
+		da9052_set_led_brightness(&led[i]);
+		led_classdev_unregister(&led[i].cdev);
+		cancel_work_sync(&led[i].work);
+	}
+
+	return 0;
+}
+
+static struct platform_driver da9052_led_driver = {
+	.driver		= {
+		.name	= "da9052-leds",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= da9052_led_probe,
+	.remove		= __devexit_p(da9052_led_remove),
+};
+
+module_platform_driver(da9052_led_driver);
+
+MODULE_AUTHOR("Dialog Semiconductor Ltd <[email protected]>");
+MODULE_DESCRIPTION("LED driver for Dialog DA9052 PMIC");
+MODULE_LICENSE("GPL");
diff --git a/drivers/leds/leds-lm3530.c b/drivers/leds/leds-lm3530.c
index 968fd5fef4fc..84ba6de8039c 100644
--- a/drivers/leds/leds-lm3530.c
+++ b/drivers/leds/leds-lm3530.c
@@ -113,6 +113,18 @@ struct lm3530_data {
 	bool enable;
 };
 
+/*
+ * struct lm3530_als_data
+ * @config  : value of ALS configuration register
+ * @imp_sel : value of ALS resistor select register
+ * @zone    : values of ALS ZB(Zone Boundary) registers
+ */
+struct lm3530_als_data {
+	u8 config;
+	u8 imp_sel;
+	u8 zones[LM3530_ALS_ZB_MAX];
+};
+
 static const u8 lm3530_reg[LM3530_REG_MAX] = {
 	LM3530_GEN_CONFIG,
 	LM3530_ALS_CONFIG,
@@ -141,29 +153,65 @@ static int lm3530_get_mode_from_str(const char *str)
 	return -1;
 }
 
+static void lm3530_als_configure(struct lm3530_platform_data *pdata,
+				struct lm3530_als_data *als)
+{
+	int i;
+	u32 als_vmin, als_vmax, als_vstep;
+
+	if (pdata->als_vmax == 0) {
+		pdata->als_vmin = 0;
+		pdata->als_vmax = LM3530_ALS_WINDOW_mV;
+	}
+
+	als_vmin = pdata->als_vmin;
+	als_vmax = pdata->als_vmax;
+
+	if ((als_vmax - als_vmin) > LM3530_ALS_WINDOW_mV)
+		pdata->als_vmax = als_vmax = als_vmin + LM3530_ALS_WINDOW_mV;
+
+	/* n zone boundary makes n+1 zones */
+	als_vstep = (als_vmax - als_vmin) / (LM3530_ALS_ZB_MAX + 1);
+
+	for (i = 0; i < LM3530_ALS_ZB_MAX; i++)
+		als->zones[i] = (((als_vmin + LM3530_ALS_OFFSET_mV) +
+			als_vstep + (i * als_vstep)) * LED_FULL) / 1000;
+
+	als->config =
+		(pdata->als_avrg_time << LM3530_ALS_AVG_TIME_SHIFT) |
+		(LM3530_ENABLE_ALS) |
+		(pdata->als_input_mode << LM3530_ALS_SEL_SHIFT);
+
+	als->imp_sel =
+		(pdata->als1_resistor_sel << LM3530_ALS1_IMP_SHIFT) |
+		(pdata->als2_resistor_sel << LM3530_ALS2_IMP_SHIFT);
+}
+
 static int lm3530_init_registers(struct lm3530_data *drvdata)
 {
 	int ret = 0;
 	int i;
 	u8 gen_config;
-	u8 als_config = 0;
 	u8 brt_ramp;
-	u8 als_imp_sel = 0;
 	u8 brightness;
 	u8 reg_val[LM3530_REG_MAX];
-	u8 zones[LM3530_ALS_ZB_MAX];
-	u32 als_vmin, als_vmax, als_vstep;
 	struct lm3530_platform_data *pdata = drvdata->pdata;
 	struct i2c_client *client = drvdata->client;
 	struct lm3530_pwm_data *pwm = &pdata->pwm_data;
+	struct lm3530_als_data als;
+
+	memset(&als, 0, sizeof(struct lm3530_als_data));
 
 	gen_config = (pdata->brt_ramp_law << LM3530_RAMP_LAW_SHIFT) |
 			((pdata->max_current & 7) << LM3530_MAX_CURR_SHIFT);
 
 	switch (drvdata->mode) {
 	case LM3530_BL_MODE_MANUAL:
+		gen_config |= LM3530_ENABLE_I2C;
+		break;
 	case LM3530_BL_MODE_ALS:
 		gen_config |= LM3530_ENABLE_I2C;
+		lm3530_als_configure(pdata, &als);
 		break;
 	case LM3530_BL_MODE_PWM:
 		gen_config |= LM3530_ENABLE_PWM | LM3530_ENABLE_PWM_SIMPLE |
@@ -171,38 +219,6 @@ static int lm3530_init_registers(struct lm3530_data *drvdata)
 		break;
 	}
 
-	if (drvdata->mode == LM3530_BL_MODE_ALS) {
-		if (pdata->als_vmax == 0) {
-			pdata->als_vmin = 0;
-			pdata->als_vmax = LM3530_ALS_WINDOW_mV;
-		}
-
-		als_vmin = pdata->als_vmin;
-		als_vmax = pdata->als_vmax;
-
-		if ((als_vmax - als_vmin) > LM3530_ALS_WINDOW_mV)
-			pdata->als_vmax = als_vmax =
-				als_vmin + LM3530_ALS_WINDOW_mV;
-
-		/* n zone boundary makes n+1 zones */
-		als_vstep = (als_vmax - als_vmin) / (LM3530_ALS_ZB_MAX + 1);
-
-		for (i = 0; i < LM3530_ALS_ZB_MAX; i++)
-			zones[i] = (((als_vmin + LM3530_ALS_OFFSET_mV) +
-					als_vstep + (i * als_vstep)) * LED_FULL)
-					/ 1000;
-
-		als_config =
-			(pdata->als_avrg_time << LM3530_ALS_AVG_TIME_SHIFT) |
-			(LM3530_ENABLE_ALS) |
-			(pdata->als_input_mode << LM3530_ALS_SEL_SHIFT);
-
-		als_imp_sel =
-			(pdata->als1_resistor_sel << LM3530_ALS1_IMP_SHIFT) |
-			(pdata->als2_resistor_sel << LM3530_ALS2_IMP_SHIFT);
-
-	}
-
 	brt_ramp = (pdata->brt_ramp_fall << LM3530_BRT_RAMP_FALL_SHIFT) |
 			(pdata->brt_ramp_rise << LM3530_BRT_RAMP_RISE_SHIFT);
 
@@ -215,14 +231,14 @@ static int lm3530_init_registers(struct lm3530_data *drvdata)
 		brightness = drvdata->led_dev.max_brightness;
 
 	reg_val[0] = gen_config;	/* LM3530_GEN_CONFIG */
-	reg_val[1] = als_config;	/* LM3530_ALS_CONFIG */
+	reg_val[1] = als.config;	/* LM3530_ALS_CONFIG */
 	reg_val[2] = brt_ramp;		/* LM3530_BRT_RAMP_RATE */
-	reg_val[3] = als_imp_sel;	/* LM3530_ALS_IMP_SELECT */
+	reg_val[3] = als.imp_sel;	/* LM3530_ALS_IMP_SELECT */
 	reg_val[4] = brightness;	/* LM3530_BRT_CTRL_REG */
-	reg_val[5] = zones[0];		/* LM3530_ALS_ZB0_REG */
-	reg_val[6] = zones[1];		/* LM3530_ALS_ZB1_REG */
-	reg_val[7] = zones[2];		/* LM3530_ALS_ZB2_REG */
-	reg_val[8] = zones[3];		/* LM3530_ALS_ZB3_REG */
+	reg_val[5] = als.zones[0];	/* LM3530_ALS_ZB0_REG */
+	reg_val[6] = als.zones[1];	/* LM3530_ALS_ZB1_REG */
+	reg_val[7] = als.zones[2];	/* LM3530_ALS_ZB2_REG */
+	reg_val[8] = als.zones[3];	/* LM3530_ALS_ZB3_REG */
 	reg_val[9] = LM3530_DEF_ZT_0;	/* LM3530_ALS_Z0T_REG */
 	reg_val[10] = LM3530_DEF_ZT_1;	/* LM3530_ALS_Z1T_REG */
 	reg_val[11] = LM3530_DEF_ZT_2;	/* LM3530_ALS_Z2T_REG */
diff --git a/drivers/leds/leds-lm3533.c b/drivers/leds/leds-lm3533.c
new file mode 100644
index 000000000000..f56b6e7ffdac
--- /dev/null
+++ b/drivers/leds/leds-lm3533.c
@@ -0,0 +1,785 @@
+/*
+ * leds-lm3533.c -- LM3533 LED driver
+ *
+ * Copyright (C) 2011-2012 Texas Instruments
+ *
+ * Author: Johan Hovold <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under  the terms of the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/leds.h>
+#include <linux/mfd/core.h>
+#include <linux/mutex.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include <linux/mfd/lm3533.h>
+
+
+#define LM3533_LVCTRLBANK_MIN		2
+#define LM3533_LVCTRLBANK_MAX		5
+#define LM3533_LVCTRLBANK_COUNT		4
+#define LM3533_RISEFALLTIME_MAX		7
+#define LM3533_ALS_CHANNEL_LV_MIN	1
+#define LM3533_ALS_CHANNEL_LV_MAX	2
+
+#define LM3533_REG_CTRLBANK_BCONF_BASE		0x1b
+#define LM3533_REG_PATTERN_ENABLE		0x28
+#define LM3533_REG_PATTERN_LOW_TIME_BASE	0x71
+#define LM3533_REG_PATTERN_HIGH_TIME_BASE	0x72
+#define LM3533_REG_PATTERN_RISETIME_BASE	0x74
+#define LM3533_REG_PATTERN_FALLTIME_BASE	0x75
+
+#define LM3533_REG_PATTERN_STEP			0x10
+
+#define LM3533_REG_CTRLBANK_BCONF_MAPPING_MASK		0x04
+#define LM3533_REG_CTRLBANK_BCONF_ALS_EN_MASK		0x02
+#define LM3533_REG_CTRLBANK_BCONF_ALS_CHANNEL_MASK	0x01
+
+#define LM3533_LED_FLAG_PATTERN_ENABLE		1
+
+
+struct lm3533_led {
+	struct lm3533 *lm3533;
+	struct lm3533_ctrlbank cb;
+	struct led_classdev cdev;
+	int id;
+
+	struct mutex mutex;
+	unsigned long flags;
+
+	struct work_struct work;
+	u8 new_brightness;
+};
+
+
+static inline struct lm3533_led *to_lm3533_led(struct led_classdev *cdev)
+{
+	return container_of(cdev, struct lm3533_led, cdev);
+}
+
+static inline int lm3533_led_get_ctrlbank_id(struct lm3533_led *led)
+{
+	return led->id + 2;
+}
+
+static inline u8 lm3533_led_get_lv_reg(struct lm3533_led *led, u8 base)
+{
+	return base + led->id;
+}
+
+static inline u8 lm3533_led_get_pattern(struct lm3533_led *led)
+{
+	return led->id;
+}
+
+static inline u8 lm3533_led_get_pattern_reg(struct lm3533_led *led,
+								u8 base)
+{
+	return base + lm3533_led_get_pattern(led) * LM3533_REG_PATTERN_STEP;
+}
+
+static int lm3533_led_pattern_enable(struct lm3533_led *led, int enable)
+{
+	u8 mask;
+	u8 val;
+	int pattern;
+	int state;
+	int ret = 0;
+
+	dev_dbg(led->cdev.dev, "%s - %d\n", __func__, enable);
+
+	mutex_lock(&led->mutex);
+
+	state = test_bit(LM3533_LED_FLAG_PATTERN_ENABLE, &led->flags);
+	if ((enable && state) || (!enable && !state))
+		goto out;
+
+	pattern = lm3533_led_get_pattern(led);
+	mask = 1 << (2 * pattern);
+
+	if (enable)
+		val = mask;
+	else
+		val = 0;
+
+	ret = lm3533_update(led->lm3533, LM3533_REG_PATTERN_ENABLE, val, mask);
+	if (ret) {
+		dev_err(led->cdev.dev, "failed to enable pattern %d (%d)\n",
+							pattern, enable);
+		goto out;
+	}
+
+	__change_bit(LM3533_LED_FLAG_PATTERN_ENABLE, &led->flags);
+out:
+	mutex_unlock(&led->mutex);
+
+	return ret;
+}
+
+static void lm3533_led_work(struct work_struct *work)
+{
+	struct lm3533_led *led = container_of(work, struct lm3533_led, work);
+
+	dev_dbg(led->cdev.dev, "%s - %u\n", __func__, led->new_brightness);
+
+	if (led->new_brightness == 0)
+		lm3533_led_pattern_enable(led, 0);	/* disable blink */
+
+	lm3533_ctrlbank_set_brightness(&led->cb, led->new_brightness);
+}
+
+static void lm3533_led_set(struct led_classdev *cdev,
+						enum led_brightness value)
+{
+	struct lm3533_led *led = to_lm3533_led(cdev);
+
+	dev_dbg(led->cdev.dev, "%s - %d\n", __func__, value);
+
+	led->new_brightness = value;
+	schedule_work(&led->work);
+}
+
+static enum led_brightness lm3533_led_get(struct led_classdev *cdev)
+{
+	struct lm3533_led *led = to_lm3533_led(cdev);
+	u8 val;
+	int ret;
+
+	ret = lm3533_ctrlbank_get_brightness(&led->cb, &val);
+	if (ret)
+		return ret;
+
+	dev_dbg(led->cdev.dev, "%s - %u\n", __func__, val);
+
+	return val;
+}
+
+/* Pattern generator defines (delays in us). */
+#define LM3533_LED_DELAY1_VMIN	0x00
+#define LM3533_LED_DELAY2_VMIN	0x3d
+#define LM3533_LED_DELAY3_VMIN	0x80
+
+#define LM3533_LED_DELAY1_VMAX	(LM3533_LED_DELAY2_VMIN - 1)
+#define LM3533_LED_DELAY2_VMAX	(LM3533_LED_DELAY3_VMIN - 1)
+#define LM3533_LED_DELAY3_VMAX	0xff
+
+#define LM3533_LED_DELAY1_TMIN	16384U
+#define LM3533_LED_DELAY2_TMIN	1130496U
+#define LM3533_LED_DELAY3_TMIN	10305536U
+
+#define LM3533_LED_DELAY1_TMAX	999424U
+#define LM3533_LED_DELAY2_TMAX	9781248U
+#define LM3533_LED_DELAY3_TMAX	76890112U
+
+/* t_step = (t_max - t_min) / (v_max - v_min) */
+#define LM3533_LED_DELAY1_TSTEP	16384
+#define LM3533_LED_DELAY2_TSTEP	131072
+#define LM3533_LED_DELAY3_TSTEP	524288
+
+/* Delay limits for hardware accelerated blinking (in ms). */
+#define LM3533_LED_DELAY_ON_MAX \
+	((LM3533_LED_DELAY2_TMAX + LM3533_LED_DELAY2_TSTEP / 2) / 1000)
+#define LM3533_LED_DELAY_OFF_MAX \
+	((LM3533_LED_DELAY3_TMAX + LM3533_LED_DELAY3_TSTEP / 2) / 1000)
+
+/*
+ * Returns linear map of *t from [t_min,t_max] to [v_min,v_max] with a step
+ * size of t_step, where
+ *
+ *	t_step = (t_max - t_min) / (v_max - v_min)
+ *
+ * and updates *t to reflect the mapped value.
+ */
+static u8 time_to_val(unsigned *t, unsigned t_min, unsigned t_step,
+							u8 v_min, u8 v_max)
+{
+	unsigned val;
+
+	val = (*t + t_step / 2 - t_min) / t_step + v_min;
+
+	*t = t_step * (val - v_min) + t_min;
+
+	return (u8)val;
+}
+
+/*
+ * Returns time code corresponding to *delay (in ms) and updates *delay to
+ * reflect actual hardware delay.
+ *
+ * Hardware supports 256 discrete delay times, divided into three groups with
+ * the following ranges and step-sizes:
+ *
+ *	[   16,   999]	[0x00, 0x3e]	step  16 ms
+ *	[ 1130,  9781]	[0x3d, 0x7f]	step 131 ms
+ *	[10306, 76890]	[0x80, 0xff]	step 524 ms
+ *
+ * Note that delay group 3 is only available for delay_off.
+ */
+static u8 lm3533_led_get_hw_delay(unsigned *delay)
+{
+	unsigned t;
+	u8 val;
+
+	t = *delay * 1000;
+
+	if (t >= (LM3533_LED_DELAY2_TMAX + LM3533_LED_DELAY3_TMIN) / 2) {
+		t = clamp(t, LM3533_LED_DELAY3_TMIN, LM3533_LED_DELAY3_TMAX);
+		val = time_to_val(&t,	LM3533_LED_DELAY3_TMIN,
+					LM3533_LED_DELAY3_TSTEP,
+					LM3533_LED_DELAY3_VMIN,
+					LM3533_LED_DELAY3_VMAX);
+	} else if (t >= (LM3533_LED_DELAY1_TMAX + LM3533_LED_DELAY2_TMIN) / 2) {
+		t = clamp(t, LM3533_LED_DELAY2_TMIN, LM3533_LED_DELAY2_TMAX);
+		val = time_to_val(&t,	LM3533_LED_DELAY2_TMIN,
+					LM3533_LED_DELAY2_TSTEP,
+					LM3533_LED_DELAY2_VMIN,
+					LM3533_LED_DELAY2_VMAX);
+	} else {
+		t = clamp(t, LM3533_LED_DELAY1_TMIN, LM3533_LED_DELAY1_TMAX);
+		val = time_to_val(&t,	LM3533_LED_DELAY1_TMIN,
+					LM3533_LED_DELAY1_TSTEP,
+					LM3533_LED_DELAY1_VMIN,
+					LM3533_LED_DELAY1_VMAX);
+	}
+
+	*delay = (t + 500) / 1000;
+
+	return val;
+}
+
+/*
+ * Set delay register base to *delay (in ms) and update *delay to reflect
+ * actual hardware delay used.
+ */
+static u8 lm3533_led_delay_set(struct lm3533_led *led, u8 base,
+							unsigned long *delay)
+{
+	unsigned t;
+	u8 val;
+	u8 reg;
+	int ret;
+
+	t = (unsigned)*delay;
+
+	/* Delay group 3 is only available for low time (delay off). */
+	if (base != LM3533_REG_PATTERN_LOW_TIME_BASE)
+		t = min(t, LM3533_LED_DELAY2_TMAX / 1000);
+
+	val = lm3533_led_get_hw_delay(&t);
+
+	dev_dbg(led->cdev.dev, "%s - %lu: %u (0x%02x)\n", __func__,
+							*delay, t, val);
+	reg = lm3533_led_get_pattern_reg(led, base);
+	ret = lm3533_write(led->lm3533, reg, val);
+	if (ret)
+		dev_err(led->cdev.dev, "failed to set delay (%02x)\n", reg);
+
+	*delay = t;
+
+	return ret;
+}
+
+static int lm3533_led_delay_on_set(struct lm3533_led *led, unsigned long *t)
+{
+	return lm3533_led_delay_set(led, LM3533_REG_PATTERN_HIGH_TIME_BASE, t);
+}
+
+static int lm3533_led_delay_off_set(struct lm3533_led *led, unsigned long *t)
+{
+	return lm3533_led_delay_set(led, LM3533_REG_PATTERN_LOW_TIME_BASE, t);
+}
+
+static int lm3533_led_blink_set(struct led_classdev *cdev,
+				unsigned long *delay_on,
+				unsigned long *delay_off)
+{
+	struct lm3533_led *led = to_lm3533_led(cdev);
+	int ret;
+
+	dev_dbg(led->cdev.dev, "%s - on = %lu, off = %lu\n", __func__,
+							*delay_on, *delay_off);
+
+	if (*delay_on > LM3533_LED_DELAY_ON_MAX ||
+					*delay_off > LM3533_LED_DELAY_OFF_MAX)
+		return -EINVAL;
+
+	if (*delay_on == 0 && *delay_off == 0) {
+		*delay_on = 500;
+		*delay_off = 500;
+	}
+
+	ret = lm3533_led_delay_on_set(led, delay_on);
+	if (ret)
+		return ret;
+
+	ret = lm3533_led_delay_off_set(led, delay_off);
+	if (ret)
+		return ret;
+
+	return lm3533_led_pattern_enable(led, 1);
+}
+
+static ssize_t show_id(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct lm3533_led *led = to_lm3533_led(led_cdev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", led->id);
+}
+
+/*
+ * Pattern generator rise/fall times:
+ *
+ *   0 - 2048 us (default)
+ *   1 - 262 ms
+ *   2 - 524 ms
+ *   3 - 1.049 s
+ *   4 - 2.097 s
+ *   5 - 4.194 s
+ *   6 - 8.389 s
+ *   7 - 16.78 s
+ */
+static ssize_t show_risefalltime(struct device *dev,
+					struct device_attribute *attr,
+					char *buf, u8 base)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct lm3533_led *led = to_lm3533_led(led_cdev);
+	ssize_t ret;
+	u8 reg;
+	u8 val;
+
+	reg = lm3533_led_get_pattern_reg(led, base);
+	ret = lm3533_read(led->lm3533, reg, &val);
+	if (ret)
+		return ret;
+
+	return scnprintf(buf, PAGE_SIZE, "%x\n", val);
+}
+
+static ssize_t show_risetime(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	return show_risefalltime(dev, attr, buf,
+					LM3533_REG_PATTERN_RISETIME_BASE);
+}
+
+static ssize_t show_falltime(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	return show_risefalltime(dev, attr, buf,
+					LM3533_REG_PATTERN_FALLTIME_BASE);
+}
+
+static ssize_t store_risefalltime(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len, u8 base)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct lm3533_led *led = to_lm3533_led(led_cdev);
+	u8 val;
+	u8 reg;
+	int ret;
+
+	if (kstrtou8(buf, 0, &val) || val > LM3533_RISEFALLTIME_MAX)
+		return -EINVAL;
+
+	reg = lm3533_led_get_pattern_reg(led, base);
+	ret = lm3533_write(led->lm3533, reg, val);
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+static ssize_t store_risetime(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	return store_risefalltime(dev, attr, buf, len,
+					LM3533_REG_PATTERN_RISETIME_BASE);
+}
+
+static ssize_t store_falltime(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	return store_risefalltime(dev, attr, buf, len,
+					LM3533_REG_PATTERN_FALLTIME_BASE);
+}
+
+static ssize_t show_als_channel(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct lm3533_led *led = to_lm3533_led(led_cdev);
+	unsigned channel;
+	u8 reg;
+	u8 val;
+	int ret;
+
+	reg = lm3533_led_get_lv_reg(led, LM3533_REG_CTRLBANK_BCONF_BASE);
+	ret = lm3533_read(led->lm3533, reg, &val);
+	if (ret)
+		return ret;
+
+	channel = (val & LM3533_REG_CTRLBANK_BCONF_ALS_CHANNEL_MASK) + 1;
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", channel);
+}
+
+static ssize_t store_als_channel(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct lm3533_led *led = to_lm3533_led(led_cdev);
+	unsigned channel;
+	u8 reg;
+	u8 val;
+	u8 mask;
+	int ret;
+
+	if (kstrtouint(buf, 0, &channel))
+		return -EINVAL;
+
+	if (channel < LM3533_ALS_CHANNEL_LV_MIN ||
+					channel > LM3533_ALS_CHANNEL_LV_MAX)
+		return -EINVAL;
+
+	reg = lm3533_led_get_lv_reg(led, LM3533_REG_CTRLBANK_BCONF_BASE);
+	mask = LM3533_REG_CTRLBANK_BCONF_ALS_CHANNEL_MASK;
+	val = channel - 1;
+
+	ret = lm3533_update(led->lm3533, reg, val, mask);
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+static ssize_t show_als_en(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct lm3533_led *led = to_lm3533_led(led_cdev);
+	bool enable;
+	u8 reg;
+	u8 val;
+	int ret;
+
+	reg = lm3533_led_get_lv_reg(led, LM3533_REG_CTRLBANK_BCONF_BASE);
+	ret = lm3533_read(led->lm3533, reg, &val);
+	if (ret)
+		return ret;
+
+	enable = val & LM3533_REG_CTRLBANK_BCONF_ALS_EN_MASK;
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", enable);
+}
+
+static ssize_t store_als_en(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct lm3533_led *led = to_lm3533_led(led_cdev);
+	unsigned enable;
+	u8 reg;
+	u8 mask;
+	u8 val;
+	int ret;
+
+	if (kstrtouint(buf, 0, &enable))
+		return -EINVAL;
+
+	reg = lm3533_led_get_lv_reg(led, LM3533_REG_CTRLBANK_BCONF_BASE);
+	mask = LM3533_REG_CTRLBANK_BCONF_ALS_EN_MASK;
+
+	if (enable)
+		val = mask;
+	else
+		val = 0;
+
+	ret = lm3533_update(led->lm3533, reg, val, mask);
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+static ssize_t show_linear(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct lm3533_led *led = to_lm3533_led(led_cdev);
+	u8 reg;
+	u8 val;
+	int linear;
+	int ret;
+
+	reg = lm3533_led_get_lv_reg(led, LM3533_REG_CTRLBANK_BCONF_BASE);
+	ret = lm3533_read(led->lm3533, reg, &val);
+	if (ret)
+		return ret;
+
+	if (val & LM3533_REG_CTRLBANK_BCONF_MAPPING_MASK)
+		linear = 1;
+	else
+		linear = 0;
+
+	return scnprintf(buf, PAGE_SIZE, "%x\n", linear);
+}
+
+static ssize_t store_linear(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct lm3533_led *led = to_lm3533_led(led_cdev);
+	unsigned long linear;
+	u8 reg;
+	u8 mask;
+	u8 val;
+	int ret;
+
+	if (kstrtoul(buf, 0, &linear))
+		return -EINVAL;
+
+	reg = lm3533_led_get_lv_reg(led, LM3533_REG_CTRLBANK_BCONF_BASE);
+	mask = LM3533_REG_CTRLBANK_BCONF_MAPPING_MASK;
+
+	if (linear)
+		val = mask;
+	else
+		val = 0;
+
+	ret = lm3533_update(led->lm3533, reg, val, mask);
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+static ssize_t show_pwm(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct lm3533_led *led = to_lm3533_led(led_cdev);
+	u8 val;
+	int ret;
+
+	ret = lm3533_ctrlbank_get_pwm(&led->cb, &val);
+	if (ret)
+		return ret;
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t store_pwm(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct lm3533_led *led = to_lm3533_led(led_cdev);
+	u8 val;
+	int ret;
+
+	if (kstrtou8(buf, 0, &val))
+		return -EINVAL;
+
+	ret = lm3533_ctrlbank_set_pwm(&led->cb, val);
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+static LM3533_ATTR_RW(als_channel);
+static LM3533_ATTR_RW(als_en);
+static LM3533_ATTR_RW(falltime);
+static LM3533_ATTR_RO(id);
+static LM3533_ATTR_RW(linear);
+static LM3533_ATTR_RW(pwm);
+static LM3533_ATTR_RW(risetime);
+
+static struct attribute *lm3533_led_attributes[] = {
+	&dev_attr_als_channel.attr,
+	&dev_attr_als_en.attr,
+	&dev_attr_falltime.attr,
+	&dev_attr_id.attr,
+	&dev_attr_linear.attr,
+	&dev_attr_pwm.attr,
+	&dev_attr_risetime.attr,
+	NULL,
+};
+
+static umode_t lm3533_led_attr_is_visible(struct kobject *kobj,
+					     struct attribute *attr, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct lm3533_led *led = to_lm3533_led(led_cdev);
+	umode_t mode = attr->mode;
+
+	if (attr == &dev_attr_als_channel.attr ||
+					attr == &dev_attr_als_en.attr) {
+		if (!led->lm3533->have_als)
+			mode = 0;
+	}
+
+	return mode;
+};
+
+static struct attribute_group lm3533_led_attribute_group = {
+	.is_visible	= lm3533_led_attr_is_visible,
+	.attrs		= lm3533_led_attributes
+};
+
+static int __devinit lm3533_led_setup(struct lm3533_led *led,
+					struct lm3533_led_platform_data *pdata)
+{
+	int ret;
+
+	ret = lm3533_ctrlbank_set_max_current(&led->cb, pdata->max_current);
+	if (ret)
+		return ret;
+
+	return lm3533_ctrlbank_set_pwm(&led->cb, pdata->pwm);
+}
+
+static int __devinit lm3533_led_probe(struct platform_device *pdev)
+{
+	struct lm3533 *lm3533;
+	struct lm3533_led_platform_data *pdata;
+	struct lm3533_led *led;
+	int ret;
+
+	dev_dbg(&pdev->dev, "%s\n", __func__);
+
+	lm3533 = dev_get_drvdata(pdev->dev.parent);
+	if (!lm3533)
+		return -EINVAL;
+
+	pdata = pdev->dev.platform_data;
+	if (!pdata) {
+		dev_err(&pdev->dev, "no platform data\n");
+		return -EINVAL;
+	}
+
+	if (pdev->id < 0 || pdev->id >= LM3533_LVCTRLBANK_COUNT) {
+		dev_err(&pdev->dev, "illegal LED id %d\n", pdev->id);
+		return -EINVAL;
+	}
+
+	led = devm_kzalloc(&pdev->dev, sizeof(*led), GFP_KERNEL);
+	if (!led)
+		return -ENOMEM;
+
+	led->lm3533 = lm3533;
+	led->cdev.name = pdata->name;
+	led->cdev.default_trigger = pdata->default_trigger;
+	led->cdev.brightness_set = lm3533_led_set;
+	led->cdev.brightness_get = lm3533_led_get;
+	led->cdev.blink_set = lm3533_led_blink_set;
+	led->cdev.brightness = LED_OFF;
+	led->id = pdev->id;
+
+	mutex_init(&led->mutex);
+	INIT_WORK(&led->work, lm3533_led_work);
+
+	/* The class framework makes a callback to get brightness during
+	 * registration so use parent device (for error reporting) until
+	 * registered.
+	 */
+	led->cb.lm3533 = lm3533;
+	led->cb.id = lm3533_led_get_ctrlbank_id(led);
+	led->cb.dev = lm3533->dev;
+
+	platform_set_drvdata(pdev, led);
+
+	ret = led_classdev_register(pdev->dev.parent, &led->cdev);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register LED %d\n", pdev->id);
+		return ret;
+	}
+
+	led->cb.dev = led->cdev.dev;
+
+	ret = sysfs_create_group(&led->cdev.dev->kobj,
+						&lm3533_led_attribute_group);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "failed to create sysfs attributes\n");
+		goto err_unregister;
+	}
+
+	ret = lm3533_led_setup(led, pdata);
+	if (ret)
+		goto err_sysfs_remove;
+
+	ret = lm3533_ctrlbank_enable(&led->cb);
+	if (ret)
+		goto err_sysfs_remove;
+
+	return 0;
+
+err_sysfs_remove:
+	sysfs_remove_group(&led->cdev.dev->kobj, &lm3533_led_attribute_group);
+err_unregister:
+	led_classdev_unregister(&led->cdev);
+	flush_work_sync(&led->work);
+
+	return ret;
+}
+
+static int __devexit lm3533_led_remove(struct platform_device *pdev)
+{
+	struct lm3533_led *led = platform_get_drvdata(pdev);
+
+	dev_dbg(&pdev->dev, "%s\n", __func__);
+
+	lm3533_ctrlbank_disable(&led->cb);
+	sysfs_remove_group(&led->cdev.dev->kobj, &lm3533_led_attribute_group);
+	led_classdev_unregister(&led->cdev);
+	flush_work_sync(&led->work);
+
+	return 0;
+}
+
+static void lm3533_led_shutdown(struct platform_device *pdev)
+{
+
+	struct lm3533_led *led = platform_get_drvdata(pdev);
+
+	dev_dbg(&pdev->dev, "%s\n", __func__);
+
+	lm3533_ctrlbank_disable(&led->cb);
+	lm3533_led_set(&led->cdev, LED_OFF);		/* disable blink */
+	flush_work_sync(&led->work);
+}
+
+static struct platform_driver lm3533_led_driver = {
+	.driver = {
+		.name = "lm3533-leds",
+		.owner = THIS_MODULE,
+	},
+	.probe		= lm3533_led_probe,
+	.remove		= __devexit_p(lm3533_led_remove),
+	.shutdown	= lm3533_led_shutdown,
+};
+module_platform_driver(lm3533_led_driver);
+
+MODULE_AUTHOR("Johan Hovold <[email protected]>");
+MODULE_DESCRIPTION("LM3533 LED driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:lm3533-leds");
diff --git a/drivers/leds/leds-lp5521.c b/drivers/leds/leds-lp5521.c
index 410a723b8691..23815624f35e 100644
--- a/drivers/leds/leds-lp5521.c
+++ b/drivers/leds/leds-lp5521.c
@@ -193,9 +193,14 @@ static int lp5521_load_program(struct lp5521_engine *eng, const u8 *pattern)
 
 	/* move current engine to direct mode and remember the state */
 	ret = lp5521_set_engine_mode(eng, LP5521_CMD_DIRECT);
+	if (ret)
+		return ret;
+
 	/* Mode change requires min 500 us delay. 1 - 2 ms  with margin */
 	usleep_range(1000, 2000);
-	ret |= lp5521_read(client, LP5521_REG_OP_MODE, &mode);
+	ret = lp5521_read(client, LP5521_REG_OP_MODE, &mode);
+	if (ret)
+		return ret;
 
 	/* For loading, all the engines to load mode */
 	lp5521_write(client, LP5521_REG_OP_MODE, LP5521_CMD_DIRECT);
@@ -211,8 +216,7 @@ static int lp5521_load_program(struct lp5521_engine *eng, const u8 *pattern)
 				LP5521_PROG_MEM_SIZE,
 				pattern);
 
-	ret |= lp5521_write(client, LP5521_REG_OP_MODE, mode);
-	return ret;
+	return lp5521_write(client, LP5521_REG_OP_MODE, mode);
 }
 
 static int lp5521_set_led_current(struct lp5521_chip *chip, int led, u8 curr)
@@ -785,7 +789,7 @@ static int __devinit lp5521_probe(struct i2c_client *client,
 	 * LP5521_REG_ENABLE register will not have any effect - strange!
 	 */
 	ret = lp5521_read(client, LP5521_REG_R_CURRENT, &buf);
-	if (buf != LP5521_REG_R_CURR_DEFAULT) {
+	if (ret || buf != LP5521_REG_R_CURR_DEFAULT) {
 		dev_err(&client->dev, "error in resetting chip\n");
 		goto fail2;
 	}
diff --git a/drivers/leds/leds-mc13783.c b/drivers/leds/leds-mc13783.c
index 8bc491541550..4cc6a2e3df34 100644
--- a/drivers/leds/leds-mc13783.c
+++ b/drivers/leds/leds-mc13783.c
@@ -280,7 +280,7 @@ static int __devinit mc13783_led_probe(struct platform_device *pdev)
 		return -EINVAL;
 	}
 
-	led = kzalloc(sizeof(*led) * pdata->num_leds, GFP_KERNEL);
+	led = kcalloc(pdata->num_leds, sizeof(*led), GFP_KERNEL);
 	if (led == NULL) {
 		dev_err(&pdev->dev, "failed to alloc memory\n");
 		return -ENOMEM;
diff --git a/drivers/leds/leds-pca955x.c b/drivers/leds/leds-pca955x.c
index dcc3bc3d38db..5f462dbf0dbb 100644
--- a/drivers/leds/leds-pca955x.c
+++ b/drivers/leds/leds-pca955x.c
@@ -101,11 +101,16 @@ static const struct i2c_device_id pca955x_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, pca955x_id);
 
-struct pca955x_led {
+struct pca955x {
+	struct mutex lock;
+	struct pca955x_led *leds;
 	struct pca955x_chipdef	*chipdef;
 	struct i2c_client	*client;
+};
+
+struct pca955x_led {
+	struct pca955x	*pca955x;
 	struct work_struct	work;
-	spinlock_t		lock;
 	enum led_brightness	brightness;
 	struct led_classdev	led_cdev;
 	int			led_num;	/* 0 .. 15 potentially */
@@ -140,7 +145,7 @@ static inline u8 pca955x_ledsel(u8 oldval, int led_num, int state)
  */
 static void pca955x_write_psc(struct i2c_client *client, int n, u8 val)
 {
-	struct pca955x_led *pca955x = i2c_get_clientdata(client);
+	struct pca955x *pca955x = i2c_get_clientdata(client);
 
 	i2c_smbus_write_byte_data(client,
 		pca95xx_num_input_regs(pca955x->chipdef->bits) + 2*n,
@@ -156,7 +161,7 @@ static void pca955x_write_psc(struct i2c_client *client, int n, u8 val)
  */
 static void pca955x_write_pwm(struct i2c_client *client, int n, u8 val)
 {
-	struct pca955x_led *pca955x = i2c_get_clientdata(client);
+	struct pca955x *pca955x = i2c_get_clientdata(client);
 
 	i2c_smbus_write_byte_data(client,
 		pca95xx_num_input_regs(pca955x->chipdef->bits) + 1 + 2*n,
@@ -169,7 +174,7 @@ static void pca955x_write_pwm(struct i2c_client *client, int n, u8 val)
  */
 static void pca955x_write_ls(struct i2c_client *client, int n, u8 val)
 {
-	struct pca955x_led *pca955x = i2c_get_clientdata(client);
+	struct pca955x *pca955x = i2c_get_clientdata(client);
 
 	i2c_smbus_write_byte_data(client,
 		pca95xx_num_input_regs(pca955x->chipdef->bits) + 4 + n,
@@ -182,7 +187,7 @@ static void pca955x_write_ls(struct i2c_client *client, int n, u8 val)
  */
 static u8 pca955x_read_ls(struct i2c_client *client, int n)
 {
-	struct pca955x_led *pca955x = i2c_get_clientdata(client);
+	struct pca955x *pca955x = i2c_get_clientdata(client);
 
 	return (u8) i2c_smbus_read_byte_data(client,
 		pca95xx_num_input_regs(pca955x->chipdef->bits) + 4 + n);
@@ -190,18 +195,23 @@ static u8 pca955x_read_ls(struct i2c_client *client, int n)
 
 static void pca955x_led_work(struct work_struct *work)
 {
-	struct pca955x_led *pca955x;
+	struct pca955x_led *pca955x_led;
+	struct pca955x *pca955x;
 	u8 ls;
 	int chip_ls;	/* which LSx to use (0-3 potentially) */
 	int ls_led;	/* which set of bits within LSx to use (0-3) */
 
-	pca955x = container_of(work, struct pca955x_led, work);
-	chip_ls = pca955x->led_num / 4;
-	ls_led = pca955x->led_num % 4;
+	pca955x_led = container_of(work, struct pca955x_led, work);
+	pca955x = pca955x_led->pca955x;
+
+	chip_ls = pca955x_led->led_num / 4;
+	ls_led = pca955x_led->led_num % 4;
+
+	mutex_lock(&pca955x->lock);
 
 	ls = pca955x_read_ls(pca955x->client, chip_ls);
 
-	switch (pca955x->brightness) {
+	switch (pca955x_led->brightness) {
 	case LED_FULL:
 		ls = pca955x_ledsel(ls, ls_led, PCA955X_LS_LED_ON);
 		break;
@@ -219,12 +229,15 @@ static void pca955x_led_work(struct work_struct *work)
 		 * OFF, HALF, or FULL.  But, this is probably better than
 		 * just turning off for all other values.
 		 */
-		pca955x_write_pwm(pca955x->client, 1, 255-pca955x->brightness);
+		pca955x_write_pwm(pca955x->client, 1,
+				255 - pca955x_led->brightness);
 		ls = pca955x_ledsel(ls, ls_led, PCA955X_LS_BLINK1);
 		break;
 	}
 
 	pca955x_write_ls(pca955x->client, chip_ls, ls);
+
+	mutex_unlock(&pca955x->lock);
 }
 
 static void pca955x_led_set(struct led_classdev *led_cdev, enum led_brightness value)
@@ -233,7 +246,6 @@ static void pca955x_led_set(struct led_classdev *led_cdev, enum led_brightness v
 
 	pca955x = container_of(led_cdev, struct pca955x_led, led_cdev);
 
-	spin_lock(&pca955x->lock);
 	pca955x->brightness = value;
 
 	/*
@@ -241,14 +253,13 @@ static void pca955x_led_set(struct led_classdev *led_cdev, enum led_brightness v
 	 * can sleep.
 	 */
 	schedule_work(&pca955x->work);
-
-	spin_unlock(&pca955x->lock);
 }
 
 static int __devinit pca955x_probe(struct i2c_client *client,
 					const struct i2c_device_id *id)
 {
-	struct pca955x_led *pca955x;
+	struct pca955x *pca955x;
+	struct pca955x_led *pca955x_led;
 	struct pca955x_chipdef *chip;
 	struct i2c_adapter *adapter;
 	struct led_platform_data *pdata;
@@ -282,39 +293,48 @@ static int __devinit pca955x_probe(struct i2c_client *client,
 		}
 	}
 
-	pca955x = kzalloc(sizeof(*pca955x) * chip->bits, GFP_KERNEL);
+	pca955x = kzalloc(sizeof(*pca955x), GFP_KERNEL);
 	if (!pca955x)
 		return -ENOMEM;
 
+	pca955x->leds = kzalloc(sizeof(*pca955x_led) * chip->bits, GFP_KERNEL);
+	if (!pca955x->leds) {
+		err = -ENOMEM;
+		goto exit_nomem;
+	}
+
 	i2c_set_clientdata(client, pca955x);
 
+	mutex_init(&pca955x->lock);
+	pca955x->client = client;
+	pca955x->chipdef = chip;
+
 	for (i = 0; i < chip->bits; i++) {
-		pca955x[i].chipdef = chip;
-		pca955x[i].client = client;
-		pca955x[i].led_num = i;
+		pca955x_led = &pca955x->leds[i];
+		pca955x_led->led_num = i;
+		pca955x_led->pca955x = pca955x;
 
 		/* Platform data can specify LED names and default triggers */
 		if (pdata) {
 			if (pdata->leds[i].name)
-				snprintf(pca955x[i].name,
-					 sizeof(pca955x[i].name), "pca955x:%s",
-					 pdata->leds[i].name);
+				snprintf(pca955x_led->name,
+					sizeof(pca955x_led->name), "pca955x:%s",
+					pdata->leds[i].name);
 			if (pdata->leds[i].default_trigger)
-				pca955x[i].led_cdev.default_trigger =
+				pca955x_led->led_cdev.default_trigger =
 					pdata->leds[i].default_trigger;
 		} else {
-			snprintf(pca955x[i].name, sizeof(pca955x[i].name),
+			snprintf(pca955x_led->name, sizeof(pca955x_led->name),
 				 "pca955x:%d", i);
 		}
 
-		spin_lock_init(&pca955x[i].lock);
-
-		pca955x[i].led_cdev.name = pca955x[i].name;
-		pca955x[i].led_cdev.brightness_set = pca955x_led_set;
+		pca955x_led->led_cdev.name = pca955x_led->name;
+		pca955x_led->led_cdev.brightness_set = pca955x_led_set;
 
-		INIT_WORK(&pca955x[i].work, pca955x_led_work);
+		INIT_WORK(&pca955x_led->work, pca955x_led_work);
 
-		err = led_classdev_register(&client->dev, &pca955x[i].led_cdev);
+		err = led_classdev_register(&client->dev,
+					&pca955x_led->led_cdev);
 		if (err < 0)
 			goto exit;
 	}
@@ -337,10 +357,12 @@ static int __devinit pca955x_probe(struct i2c_client *client,
 
 exit:
 	while (i--) {
-		led_classdev_unregister(&pca955x[i].led_cdev);
-		cancel_work_sync(&pca955x[i].work);
+		led_classdev_unregister(&pca955x->leds[i].led_cdev);
+		cancel_work_sync(&pca955x->leds[i].work);
 	}
 
+	kfree(pca955x->leds);
+exit_nomem:
 	kfree(pca955x);
 
 	return err;
@@ -348,14 +370,15 @@ exit:
 
 static int __devexit pca955x_remove(struct i2c_client *client)
 {
-	struct pca955x_led *pca955x = i2c_get_clientdata(client);
+	struct pca955x *pca955x = i2c_get_clientdata(client);
 	int i;
 
 	for (i = 0; i < pca955x->chipdef->bits; i++) {
-		led_classdev_unregister(&pca955x[i].led_cdev);
-		cancel_work_sync(&pca955x[i].work);
+		led_classdev_unregister(&pca955x->leds[i].led_cdev);
+		cancel_work_sync(&pca955x->leds[i].work);
 	}
 
+	kfree(pca955x->leds);
 	kfree(pca955x);
 
 	return 0;
diff --git a/drivers/leds/ledtrig-backlight.c b/drivers/leds/ledtrig-backlight.c
index 2b513a2ad7de..e2726867c5d4 100644
--- a/drivers/leds/ledtrig-backlight.c
+++ b/drivers/leds/ledtrig-backlight.c
@@ -120,6 +120,7 @@ static void bl_trig_activate(struct led_classdev *led)
 	ret = fb_register_client(&n->notifier);
 	if (ret)
 		dev_err(led->dev, "unable to register backlight trigger\n");
+	led->activated = true;
 
 	return;
 
@@ -133,10 +134,11 @@ static void bl_trig_deactivate(struct led_classdev *led)
 	struct bl_trig_notifier *n =
 		(struct bl_trig_notifier *) led->trigger_data;
 
-	if (n) {
+	if (led->activated) {
 		device_remove_file(led->dev, &dev_attr_inverted);
 		fb_unregister_client(&n->notifier);
 		kfree(n);
+		led->activated = false;
 	}
 }
 
diff --git a/drivers/leds/ledtrig-gpio.c b/drivers/leds/ledtrig-gpio.c
index ecc4bf3f37a9..f057c101b896 100644
--- a/drivers/leds/ledtrig-gpio.c
+++ b/drivers/leds/ledtrig-gpio.c
@@ -200,6 +200,7 @@ static void gpio_trig_activate(struct led_classdev *led)
 	gpio_data->led = led;
 	led->trigger_data = gpio_data;
 	INIT_WORK(&gpio_data->work, gpio_trig_work);
+	led->activated = true;
 
 	return;
 
@@ -217,7 +218,7 @@ static void gpio_trig_deactivate(struct led_classdev *led)
 {
 	struct gpio_trig_data *gpio_data = led->trigger_data;
 
-	if (gpio_data) {
+	if (led->activated) {
 		device_remove_file(led->dev, &dev_attr_gpio);
 		device_remove_file(led->dev, &dev_attr_inverted);
 		device_remove_file(led->dev, &dev_attr_desired_brightness);
@@ -225,6 +226,7 @@ static void gpio_trig_deactivate(struct led_classdev *led)
 		if (gpio_data->gpio != 0)
 			free_irq(gpio_to_irq(gpio_data->gpio), led);
 		kfree(gpio_data);
+		led->activated = false;
 	}
 }
 
diff --git a/drivers/leds/ledtrig-heartbeat.c b/drivers/leds/ledtrig-heartbeat.c
index 759c0bba4a8f..41dc76db4311 100644
--- a/drivers/leds/ledtrig-heartbeat.c
+++ b/drivers/leds/ledtrig-heartbeat.c
@@ -18,6 +18,7 @@
 #include <linux/timer.h>
 #include <linux/sched.h>
 #include <linux/leds.h>
+#include <linux/reboot.h>
 #include "leds.h"
 
 struct heartbeat_trig_data {
@@ -83,15 +84,17 @@ static void heartbeat_trig_activate(struct led_classdev *led_cdev)
 		    led_heartbeat_function, (unsigned long) led_cdev);
 	heartbeat_data->phase = 0;
 	led_heartbeat_function(heartbeat_data->timer.data);
+	led_cdev->activated = true;
 }
 
 static void heartbeat_trig_deactivate(struct led_classdev *led_cdev)
 {
 	struct heartbeat_trig_data *heartbeat_data = led_cdev->trigger_data;
 
-	if (heartbeat_data) {
+	if (led_cdev->activated) {
 		del_timer_sync(&heartbeat_data->timer);
 		kfree(heartbeat_data);
+		led_cdev->activated = false;
 	}
 }
 
@@ -101,13 +104,38 @@ static struct led_trigger heartbeat_led_trigger = {
 	.deactivate = heartbeat_trig_deactivate,
 };
 
+static int heartbeat_reboot_notifier(struct notifier_block *nb,
+				     unsigned long code, void *unused)
+{
+	led_trigger_unregister(&heartbeat_led_trigger);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block heartbeat_reboot_nb = {
+	.notifier_call = heartbeat_reboot_notifier,
+};
+
+static struct notifier_block heartbeat_panic_nb = {
+	.notifier_call = heartbeat_reboot_notifier,
+};
+
 static int __init heartbeat_trig_init(void)
 {
-	return led_trigger_register(&heartbeat_led_trigger);
+	int rc = led_trigger_register(&heartbeat_led_trigger);
+
+	if (!rc) {
+		atomic_notifier_chain_register(&panic_notifier_list,
+					       &heartbeat_panic_nb);
+		register_reboot_notifier(&heartbeat_reboot_nb);
+	}
+	return rc;
 }
 
 static void __exit heartbeat_trig_exit(void)
 {
+	unregister_reboot_notifier(&heartbeat_reboot_nb);
+	atomic_notifier_chain_unregister(&panic_notifier_list,
+					 &heartbeat_panic_nb);
 	led_trigger_unregister(&heartbeat_led_trigger);
 }
 
diff --git a/drivers/leds/ledtrig-timer.c b/drivers/leds/ledtrig-timer.c
index 328c64c0841c..9010f7abaf2c 100644
--- a/drivers/leds/ledtrig-timer.c
+++ b/drivers/leds/ledtrig-timer.c
@@ -31,21 +31,17 @@ static ssize_t led_delay_on_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t size)
 {
 	struct led_classdev *led_cdev = dev_get_drvdata(dev);
-	int ret = -EINVAL;
-	char *after;
-	unsigned long state = simple_strtoul(buf, &after, 10);
-	size_t count = after - buf;
-
-	if (isspace(*after))
-		count++;
-
-	if (count == size) {
-		led_blink_set(led_cdev, &state, &led_cdev->blink_delay_off);
-		led_cdev->blink_delay_on = state;
-		ret = count;
-	}
+	unsigned long state;
+	ssize_t ret = -EINVAL;
+
+	ret = kstrtoul(buf, 10, &state);
+	if (ret)
+		return ret;
 
-	return ret;
+	led_blink_set(led_cdev, &state, &led_cdev->blink_delay_off);
+	led_cdev->blink_delay_on = state;
+
+	return size;
 }
 
 static ssize_t led_delay_off_show(struct device *dev,
@@ -60,21 +56,17 @@ static ssize_t led_delay_off_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t size)
 {
 	struct led_classdev *led_cdev = dev_get_drvdata(dev);
-	int ret = -EINVAL;
-	char *after;
-	unsigned long state = simple_strtoul(buf, &after, 10);
-	size_t count = after - buf;
-
-	if (isspace(*after))
-		count++;
-
-	if (count == size) {
-		led_blink_set(led_cdev, &led_cdev->blink_delay_on, &state);
-		led_cdev->blink_delay_off = state;
-		ret = count;
-	}
+	unsigned long state;
+	ssize_t ret = -EINVAL;
 
-	return ret;
+	ret = kstrtoul(buf, 10, &state);
+	if (ret)
+		return ret;
+
+	led_blink_set(led_cdev, &led_cdev->blink_delay_on, &state);
+	led_cdev->blink_delay_off = state;
+
+	return size;
 }
 
 static DEVICE_ATTR(delay_on, 0644, led_delay_on_show, led_delay_on_store);
@@ -95,8 +87,7 @@ static void timer_trig_activate(struct led_classdev *led_cdev)
 
 	led_blink_set(led_cdev, &led_cdev->blink_delay_on,
 		      &led_cdev->blink_delay_off);
-
-	led_cdev->trigger_data = (void *)1;
+	led_cdev->activated = true;
 
 	return;
 
@@ -106,9 +97,10 @@ err_out_delayon:
 
 static void timer_trig_deactivate(struct led_classdev *led_cdev)
 {
-	if (led_cdev->trigger_data) {
+	if (led_cdev->activated) {
 		device_remove_file(led_cdev->dev, &dev_attr_delay_on);
 		device_remove_file(led_cdev->dev, &dev_attr_delay_off);
+		led_cdev->activated = false;
 	}
 
 	/* Stop blinking */
diff --git a/drivers/leds/ledtrig-transient.c b/drivers/leds/ledtrig-transient.c
new file mode 100644
index 000000000000..83179f435e1e
--- /dev/null
+++ b/drivers/leds/ledtrig-transient.c
@@ -0,0 +1,237 @@
+/*
+ * LED Kernel Transient Trigger
+ *
+ * Copyright (C) 2012 Shuah Khan <[email protected]>
+ *
+ * Based on Richard Purdie's ledtrig-timer.c and Atsushi Nemoto's
+ * ledtrig-heartbeat.c
+ * Design and use-case input from Jonas Bonn <[email protected]> and
+ * Neil Brown <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+/*
+ * Transient trigger allows one shot timer activation. Please refer to
+ * Documentation/leds/ledtrig-transient.txt for details
+*/
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/leds.h>
+#include "leds.h"
+
+struct transient_trig_data {
+	int activate;
+	int state;
+	int restore_state;
+	unsigned long duration;
+	struct timer_list timer;
+};
+
+static void transient_timer_function(unsigned long data)
+{
+	struct led_classdev *led_cdev = (struct led_classdev *) data;
+	struct transient_trig_data *transient_data = led_cdev->trigger_data;
+
+	transient_data->activate = 0;
+	led_set_brightness(led_cdev, transient_data->restore_state);
+}
+
+static ssize_t transient_activate_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct transient_trig_data *transient_data = led_cdev->trigger_data;
+
+	return sprintf(buf, "%d\n", transient_data->activate);
+}
+
+static ssize_t transient_activate_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct transient_trig_data *transient_data = led_cdev->trigger_data;
+	unsigned long state;
+	ssize_t ret;
+
+	ret = kstrtoul(buf, 10, &state);
+	if (ret)
+		return ret;
+
+	if (state != 1 && state != 0)
+		return -EINVAL;
+
+	/* cancel the running timer */
+	if (state == 0 && transient_data->activate == 1) {
+		del_timer(&transient_data->timer);
+		transient_data->activate = state;
+		led_set_brightness(led_cdev, transient_data->restore_state);
+		return size;
+	}
+
+	/* start timer if there is no active timer */
+	if (state == 1 && transient_data->activate == 0 &&
+	    transient_data->duration != 0) {
+		transient_data->activate = state;
+		led_set_brightness(led_cdev, transient_data->state);
+		transient_data->restore_state =
+		    (transient_data->state == LED_FULL) ? LED_OFF : LED_FULL;
+		mod_timer(&transient_data->timer,
+			  jiffies + transient_data->duration);
+	}
+
+	/* state == 0 && transient_data->activate == 0
+		timer is not active - just return */
+	/* state == 1 && transient_data->activate == 1
+		timer is already active - just return */
+
+	return size;
+}
+
+static ssize_t transient_duration_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct transient_trig_data *transient_data = led_cdev->trigger_data;
+
+	return sprintf(buf, "%lu\n", transient_data->duration);
+}
+
+static ssize_t transient_duration_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct transient_trig_data *transient_data = led_cdev->trigger_data;
+	unsigned long state;
+	ssize_t ret;
+
+	ret = kstrtoul(buf, 10, &state);
+	if (ret)
+		return ret;
+
+	transient_data->duration = state;
+	return size;
+}
+
+static ssize_t transient_state_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct transient_trig_data *transient_data = led_cdev->trigger_data;
+	int state;
+
+	state = (transient_data->state == LED_FULL) ? 1 : 0;
+	return sprintf(buf, "%d\n", state);
+}
+
+static ssize_t transient_state_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
+{
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
+	struct transient_trig_data *transient_data = led_cdev->trigger_data;
+	unsigned long state;
+	ssize_t ret;
+
+	ret = kstrtoul(buf, 10, &state);
+	if (ret)
+		return ret;
+
+	if (state != 1 && state != 0)
+		return -EINVAL;
+
+	transient_data->state = (state == 1) ? LED_FULL : LED_OFF;
+	return size;
+}
+
+static DEVICE_ATTR(activate, 0644, transient_activate_show,
+		   transient_activate_store);
+static DEVICE_ATTR(duration, 0644, transient_duration_show,
+		   transient_duration_store);
+static DEVICE_ATTR(state, 0644, transient_state_show, transient_state_store);
+
+static void transient_trig_activate(struct led_classdev *led_cdev)
+{
+	int rc;
+	struct transient_trig_data *tdata;
+
+	tdata = kzalloc(sizeof(struct transient_trig_data), GFP_KERNEL);
+	if (!tdata) {
+		dev_err(led_cdev->dev,
+			"unable to allocate transient trigger\n");
+		return;
+	}
+	led_cdev->trigger_data = tdata;
+
+	rc = device_create_file(led_cdev->dev, &dev_attr_activate);
+	if (rc)
+		goto err_out;
+
+	rc = device_create_file(led_cdev->dev, &dev_attr_duration);
+	if (rc)
+		goto err_out_duration;
+
+	rc = device_create_file(led_cdev->dev, &dev_attr_state);
+	if (rc)
+		goto err_out_state;
+
+	setup_timer(&tdata->timer, transient_timer_function,
+		    (unsigned long) led_cdev);
+	led_cdev->activated = true;
+
+	return;
+
+err_out_state:
+	device_remove_file(led_cdev->dev, &dev_attr_duration);
+err_out_duration:
+	device_remove_file(led_cdev->dev, &dev_attr_activate);
+err_out:
+	dev_err(led_cdev->dev, "unable to register transient trigger\n");
+	led_cdev->trigger_data = NULL;
+	kfree(tdata);
+}
+
+static void transient_trig_deactivate(struct led_classdev *led_cdev)
+{
+	struct transient_trig_data *transient_data = led_cdev->trigger_data;
+
+	if (led_cdev->activated) {
+		del_timer_sync(&transient_data->timer);
+		led_set_brightness(led_cdev, transient_data->restore_state);
+		device_remove_file(led_cdev->dev, &dev_attr_activate);
+		device_remove_file(led_cdev->dev, &dev_attr_duration);
+		device_remove_file(led_cdev->dev, &dev_attr_state);
+		led_cdev->trigger_data = NULL;
+		led_cdev->activated = false;
+		kfree(transient_data);
+	}
+}
+
+static struct led_trigger transient_trigger = {
+	.name     = "transient",
+	.activate = transient_trig_activate,
+	.deactivate = transient_trig_deactivate,
+};
+
+static int __init transient_trig_init(void)
+{
+	return led_trigger_register(&transient_trigger);
+}
+
+static void __exit transient_trig_exit(void)
+{
+	led_trigger_unregister(&transient_trigger);
+}
+
+module_init(transient_trig_init);
+module_exit(transient_trig_exit);
+
+MODULE_AUTHOR("Shuah Khan <[email protected]>");
+MODULE_DESCRIPTION("Transient LED trigger");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index f4b4dad77391..e129c820df7d 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -106,6 +106,19 @@ config UCB1400_CORE
 	  To compile this driver as a module, choose M here: the
 	  module will be called ucb1400_core.
 
+config MFD_LM3533
+	tristate "LM3533 Lighting Power chip"
+	depends on I2C
+	select MFD_CORE
+	select REGMAP_I2C
+	help
+	  Say yes here to enable support for National Semiconductor / TI
+	  LM3533 Lighting Power chips.
+
+	  This driver provides common support for accessing the device;
+	  additional drivers must be enabled in order to use the LED,
+	  backlight or ambient-light-sensor functionality of the device.
+
 config TPS6105X
 	tristate "TPS61050/61052 Boost Converters"
 	depends on I2C
@@ -177,8 +190,8 @@ config MFD_TPS65910
 	bool "TPS65910 Power Management chip"
 	depends on I2C=y && GPIOLIB
 	select MFD_CORE
-	select GPIO_TPS65910
 	select REGMAP_I2C
+	select IRQ_DOMAIN
 	help
 	  if you say yes here you get support for the TPS65910 series of
 	  Power Management chips.
@@ -409,6 +422,19 @@ config PMIC_ADP5520
 	  individual components like LCD backlight, LEDs, GPIOs and Kepad
 	  under the corresponding menus.
 
+config MFD_MAX77693
+	bool "Maxim Semiconductor MAX77693 PMIC Support"
+	depends on I2C=y && GENERIC_HARDIRQS
+	select MFD_CORE
+	select REGMAP_I2C
+	help
+	  Say yes here to support for Maxim Semiconductor MAX77693.
+	  This is a companion Power Management IC with Flash, Haptic, Charger,
+	  and MUIC(Micro USB Interface Controller) controls on chip.
+	  This driver provides common support for accessing the device;
+	  additional drivers must be enabled in order to use the functionality
+	  of the device.
+
 config MFD_MAX8925
 	bool "Maxim Semiconductor MAX8925 PMIC Support"
 	depends on I2C=y && GENERIC_HARDIRQS
@@ -454,9 +480,9 @@ config MFD_S5M_CORE
 	 of the device
 
 config MFD_WM8400
-	tristate "Support Wolfson Microelectronics WM8400"
+	bool "Support Wolfson Microelectronics WM8400"
 	select MFD_CORE
-	depends on I2C
+	depends on I2C=y
 	select REGMAP_I2C
 	help
 	  Support for the Wolfson Microelecronics WM8400 PMIC and audio
@@ -473,6 +499,7 @@ config MFD_WM831X_I2C
 	select MFD_CORE
 	select MFD_WM831X
 	select REGMAP_I2C
+	select IRQ_DOMAIN
 	depends on I2C=y && GENERIC_HARDIRQS
 	help
 	  Support for the Wolfson Microelecronics WM831x and WM832x PMICs
@@ -485,6 +512,7 @@ config MFD_WM831X_SPI
 	select MFD_CORE
 	select MFD_WM831X
 	select REGMAP_SPI
+	select IRQ_DOMAIN
 	depends on SPI_MASTER && GENERIC_HARDIRQS
 	help
 	  Support for the Wolfson Microelecronics WM831x and WM832x PMICs
@@ -597,17 +625,32 @@ config MFD_MC13783
 	tristate
 
 config MFD_MC13XXX
-	tristate "Support Freescale MC13783 and MC13892"
-	depends on SPI_MASTER
+	tristate
+	depends on SPI_MASTER || I2C
 	select MFD_CORE
 	select MFD_MC13783
 	help
-	  Support for the Freescale (Atlas) PMIC and audio CODECs
-	  MC13783 and MC13892.
-	  This driver provides common support for accessing  the device,
+	  Enable support for the Freescale MC13783 and MC13892 PMICs.
+	  This driver provides common support for accessing the device,
 	  additional drivers must be enabled in order to use the
 	  functionality of the device.
 
+config MFD_MC13XXX_SPI
+	tristate "Freescale MC13783 and MC13892 SPI interface"
+	depends on SPI_MASTER
+	select REGMAP_SPI
+	select MFD_MC13XXX
+	help
+	  Select this if your MC13xxx is connected via an SPI bus.
+
+config MFD_MC13XXX_I2C
+	tristate "Freescale MC13892 I2C interface"
+	depends on I2C
+	select REGMAP_I2C
+	select MFD_MC13XXX
+	help
+	  Select this if your MC13xxx is connected via an I2C bus.
+
 config ABX500_CORE
 	bool "ST-Ericsson ABX500 Mixed Signal Circuit register functions"
 	default y if ARCH_U300 || ARCH_U8500
@@ -651,7 +694,7 @@ config EZX_PCAP
 
 config AB8500_CORE
 	bool "ST-Ericsson AB8500 Mixed Signal Power Management chip"
-	depends on GENERIC_HARDIRQS && ABX500_CORE
+	depends on GENERIC_HARDIRQS && ABX500_CORE && MFD_DB8500_PRCMU
 	select MFD_CORE
 	help
 	  Select this option to enable access to AB8500 power management
@@ -722,6 +765,16 @@ config LPC_SCH
 	  LPC bridge function of the Intel SCH provides support for
 	  System Management Bus and General Purpose I/O.
 
+config LPC_ICH
+	tristate "Intel ICH LPC"
+	depends on PCI
+	select MFD_CORE
+	help
+	  The LPC bridge function of the Intel ICH provides support for
+	  many functional units. This driver provides needed support for
+	  other drivers to control these functions, currently GPIO and
+	  watchdog.
+
 config MFD_RDC321X
 	tristate "Support for RDC-R321x southbridge"
 	select MFD_CORE
@@ -854,6 +907,11 @@ config MFD_RC5T583
 	  Additional drivers must be enabled in order to use the
 	  different functionality of the device.
 
+config MFD_STA2X11
+	bool "STA2X11 multi function device support"
+	depends on STA2X11
+	select MFD_CORE
+
 config MFD_ANATOP
 	bool "Support for Freescale i.MX on-chip ANATOP controller"
 	depends on SOC_IMX6Q
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 43672b87805a..75f6ed68a4b9 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_MFD_DAVINCI_VOICECODEC)	+= davinci_voicecodec.o
 obj-$(CONFIG_MFD_DM355EVM_MSP)	+= dm355evm_msp.o
 obj-$(CONFIG_MFD_TI_SSP)	+= ti-ssp.o
 
+obj-$(CONFIG_MFD_STA2X11)	+= sta2x11-mfd.o
 obj-$(CONFIG_MFD_STMPE)		+= stmpe.o
 obj-$(CONFIG_STMPE_I2C)		+= stmpe-i2c.o
 obj-$(CONFIG_STMPE_SPI)		+= stmpe-spi.o
@@ -54,6 +55,8 @@ obj-$(CONFIG_TWL6030_PWM)	+= twl6030-pwm.o
 obj-$(CONFIG_TWL6040_CORE)	+= twl6040-core.o twl6040-irq.o
 
 obj-$(CONFIG_MFD_MC13XXX)	+= mc13xxx-core.o
+obj-$(CONFIG_MFD_MC13XXX_SPI)	+= mc13xxx-spi.o
+obj-$(CONFIG_MFD_MC13XXX_I2C)	+= mc13xxx-i2c.o
 
 obj-$(CONFIG_MFD_CORE)		+= mfd-core.o
 
@@ -75,6 +78,7 @@ obj-$(CONFIG_PMIC_DA9052)	+= da9052-core.o
 obj-$(CONFIG_MFD_DA9052_SPI)	+= da9052-spi.o
 obj-$(CONFIG_MFD_DA9052_I2C)	+= da9052-i2c.o
 
+obj-$(CONFIG_MFD_MAX77693)	+= max77693.o max77693-irq.o
 max8925-objs			:= max8925-core.o max8925-i2c.o
 obj-$(CONFIG_MFD_MAX8925)	+= max8925.o
 obj-$(CONFIG_MFD_MAX8997)	+= max8997.o max8997-irq.o
@@ -87,15 +91,15 @@ obj-$(CONFIG_PCF50633_GPIO)	+= pcf50633-gpio.o
 obj-$(CONFIG_ABX500_CORE)	+= abx500-core.o
 obj-$(CONFIG_AB3100_CORE)	+= ab3100-core.o
 obj-$(CONFIG_AB3100_OTP)	+= ab3100-otp.o
-obj-$(CONFIG_AB8500_CORE)	+= ab8500-core.o ab8500-sysctrl.o
 obj-$(CONFIG_AB8500_DEBUG)	+= ab8500-debugfs.o
 obj-$(CONFIG_AB8500_GPADC)	+= ab8500-gpadc.o
 obj-$(CONFIG_MFD_DB8500_PRCMU)	+= db8500-prcmu.o
-# ab8500-i2c need to come after db8500-prcmu (which provides the channel)
-obj-$(CONFIG_AB8500_I2C_CORE)	+= ab8500-i2c.o
+# ab8500-core need to come after db8500-prcmu (which provides the channel)
+obj-$(CONFIG_AB8500_CORE)	+= ab8500-core.o ab8500-sysctrl.o
 obj-$(CONFIG_MFD_TIMBERDALE)    += timberdale.o
 obj-$(CONFIG_PMIC_ADP5520)	+= adp5520.o
 obj-$(CONFIG_LPC_SCH)		+= lpc_sch.o
+obj-$(CONFIG_LPC_ICH)		+= lpc_ich.o
 obj-$(CONFIG_MFD_RDC321X)	+= rdc321x-southbridge.o
 obj-$(CONFIG_MFD_JANZ_CMODIO)	+= janz-cmodio.o
 obj-$(CONFIG_MFD_JZ4740_ADC)	+= jz4740-adc.o
diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index 1f08704f7ae8..dac0e2998603 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -18,7 +18,10 @@
 #include <linux/mfd/core.h>
 #include <linux/mfd/abx500.h>
 #include <linux/mfd/abx500/ab8500.h>
+#include <linux/mfd/dbx500-prcmu.h>
 #include <linux/regulator/ab8500.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
 
 /*
  * Interrupt register offsets
@@ -91,12 +94,24 @@
 #define AB8500_IT_MASK23_REG		0x56
 #define AB8500_IT_MASK24_REG		0x57
 
+/*
+ * latch hierarchy registers
+ */
+#define AB8500_IT_LATCHHIER1_REG	0x60
+#define AB8500_IT_LATCHHIER2_REG	0x61
+#define AB8500_IT_LATCHHIER3_REG	0x62
+
+#define AB8500_IT_LATCHHIER_NUM		3
+
 #define AB8500_REV_REG			0x80
 #define AB8500_IC_NAME_REG		0x82
 #define AB8500_SWITCH_OFF_STATUS	0x00
 
 #define AB8500_TURN_ON_STATUS		0x00
 
+static bool no_bm; /* No battery management */
+module_param(no_bm, bool, S_IRUGO);
+
 #define AB9540_MODEM_CTRL2_REG			0x23
 #define AB9540_MODEM_CTRL2_SWDBBRSTN_BIT	BIT(2)
 
@@ -125,6 +140,41 @@ static const char ab8500_version_str[][7] = {
 	[AB8500_VERSION_AB8540] = "AB8540",
 };
 
+static int ab8500_i2c_write(struct ab8500 *ab8500, u16 addr, u8 data)
+{
+	int ret;
+
+	ret = prcmu_abb_write((u8)(addr >> 8), (u8)(addr & 0xFF), &data, 1);
+	if (ret < 0)
+		dev_err(ab8500->dev, "prcmu i2c error %d\n", ret);
+	return ret;
+}
+
+static int ab8500_i2c_write_masked(struct ab8500 *ab8500, u16 addr, u8 mask,
+	u8 data)
+{
+	int ret;
+
+	ret = prcmu_abb_write_masked((u8)(addr >> 8), (u8)(addr & 0xFF), &data,
+		&mask, 1);
+	if (ret < 0)
+		dev_err(ab8500->dev, "prcmu i2c error %d\n", ret);
+	return ret;
+}
+
+static int ab8500_i2c_read(struct ab8500 *ab8500, u16 addr)
+{
+	int ret;
+	u8 data;
+
+	ret = prcmu_abb_read((u8)(addr >> 8), (u8)(addr & 0xFF), &data, 1);
+	if (ret < 0) {
+		dev_err(ab8500->dev, "prcmu i2c error %d\n", ret);
+		return ret;
+	}
+	return (int)data;
+}
+
 static int ab8500_get_chip_id(struct device *dev)
 {
 	struct ab8500 *ab8500;
@@ -161,9 +211,13 @@ static int set_register_interruptible(struct ab8500 *ab8500, u8 bank,
 static int ab8500_set_register(struct device *dev, u8 bank,
 	u8 reg, u8 value)
 {
+	int ret;
 	struct ab8500 *ab8500 = dev_get_drvdata(dev->parent);
 
-	return set_register_interruptible(ab8500, bank, reg, value);
+	atomic_inc(&ab8500->transfer_ongoing);
+	ret = set_register_interruptible(ab8500, bank, reg, value);
+	atomic_dec(&ab8500->transfer_ongoing);
+	return ret;
 }
 
 static int get_register_interruptible(struct ab8500 *ab8500, u8 bank,
@@ -192,9 +246,13 @@ static int get_register_interruptible(struct ab8500 *ab8500, u8 bank,
 static int ab8500_get_register(struct device *dev, u8 bank,
 	u8 reg, u8 *value)
 {
+	int ret;
 	struct ab8500 *ab8500 = dev_get_drvdata(dev->parent);
 
-	return get_register_interruptible(ab8500, bank, reg, value);
+	atomic_inc(&ab8500->transfer_ongoing);
+	ret = get_register_interruptible(ab8500, bank, reg, value);
+	atomic_dec(&ab8500->transfer_ongoing);
+	return ret;
 }
 
 static int mask_and_set_register_interruptible(struct ab8500 *ab8500, u8 bank,
@@ -241,11 +299,14 @@ out:
 static int ab8500_mask_and_set_register(struct device *dev,
 	u8 bank, u8 reg, u8 bitmask, u8 bitvalues)
 {
+	int ret;
 	struct ab8500 *ab8500 = dev_get_drvdata(dev->parent);
 
-	return mask_and_set_register_interruptible(ab8500, bank, reg,
-		bitmask, bitvalues);
-
+	atomic_inc(&ab8500->transfer_ongoing);
+	ret= mask_and_set_register_interruptible(ab8500, bank, reg,
+						 bitmask, bitvalues);
+	atomic_dec(&ab8500->transfer_ongoing);
+	return ret;
 }
 
 static struct abx500_ops ab8500_ops = {
@@ -264,6 +325,7 @@ static void ab8500_irq_lock(struct irq_data *data)
 	struct ab8500 *ab8500 = irq_data_get_irq_chip_data(data);
 
 	mutex_lock(&ab8500->irq_lock);
+	atomic_inc(&ab8500->transfer_ongoing);
 }
 
 static void ab8500_irq_sync_unlock(struct irq_data *data)
@@ -292,7 +354,7 @@ static void ab8500_irq_sync_unlock(struct irq_data *data)
 		reg = AB8500_IT_MASK1_REG + ab8500->irq_reg_offset[i];
 		set_register_interruptible(ab8500, AB8500_INTERRUPT, reg, new);
 	}
-
+	atomic_dec(&ab8500->transfer_ongoing);
 	mutex_unlock(&ab8500->irq_lock);
 }
 
@@ -325,6 +387,90 @@ static struct irq_chip ab8500_irq_chip = {
 	.irq_unmask		= ab8500_irq_unmask,
 };
 
+static int ab8500_handle_hierarchical_line(struct ab8500 *ab8500,
+					int latch_offset, u8 latch_val)
+{
+	int int_bit = __ffs(latch_val);
+	int line, i;
+
+	do {
+		int_bit = __ffs(latch_val);
+
+		for (i = 0; i < ab8500->mask_size; i++)
+			if (ab8500->irq_reg_offset[i] == latch_offset)
+				break;
+
+		if (i >= ab8500->mask_size) {
+			dev_err(ab8500->dev, "Register offset 0x%2x not declared\n",
+					latch_offset);
+			return -ENXIO;
+		}
+
+		line = (i << 3) + int_bit;
+		latch_val &= ~(1 << int_bit);
+
+		handle_nested_irq(ab8500->irq_base + line);
+	} while (latch_val);
+
+	return 0;
+}
+
+static int ab8500_handle_hierarchical_latch(struct ab8500 *ab8500,
+					int hier_offset, u8 hier_val)
+{
+	int latch_bit, status;
+	u8 latch_offset, latch_val;
+
+	do {
+		latch_bit = __ffs(hier_val);
+		latch_offset = (hier_offset << 3) + latch_bit;
+
+		/* Fix inconsistent ITFromLatch25 bit mapping... */
+		if (unlikely(latch_offset == 17))
+			latch_offset = 24;
+
+		status = get_register_interruptible(ab8500,
+				AB8500_INTERRUPT,
+				AB8500_IT_LATCH1_REG + latch_offset,
+				&latch_val);
+		if (status < 0 || latch_val == 0)
+			goto discard;
+
+		status = ab8500_handle_hierarchical_line(ab8500,
+				latch_offset, latch_val);
+		if (status < 0)
+			return status;
+discard:
+		hier_val &= ~(1 << latch_bit);
+	} while (hier_val);
+
+	return 0;
+}
+
+static irqreturn_t ab8500_hierarchical_irq(int irq, void *dev)
+{
+	struct ab8500 *ab8500 = dev;
+	u8 i;
+
+	dev_vdbg(ab8500->dev, "interrupt\n");
+
+	/*  Hierarchical interrupt version */
+	for (i = 0; i < AB8500_IT_LATCHHIER_NUM; i++) {
+		int status;
+		u8 hier_val;
+
+		status = get_register_interruptible(ab8500, AB8500_INTERRUPT,
+			AB8500_IT_LATCHHIER1_REG + i, &hier_val);
+		if (status < 0 || hier_val == 0)
+			continue;
+
+		status = ab8500_handle_hierarchical_latch(ab8500, i, hier_val);
+		if (status < 0)
+			break;
+	}
+	return IRQ_HANDLED;
+}
+
 static irqreturn_t ab8500_irq(int irq, void *dev)
 {
 	struct ab8500 *ab8500 = dev;
@@ -332,6 +478,8 @@ static irqreturn_t ab8500_irq(int irq, void *dev)
 
 	dev_vdbg(ab8500->dev, "interrupt\n");
 
+	atomic_inc(&ab8500->transfer_ongoing);
+
 	for (i = 0; i < ab8500->mask_size; i++) {
 		int regoffset = ab8500->irq_reg_offset[i];
 		int status;
@@ -355,9 +503,10 @@ static irqreturn_t ab8500_irq(int irq, void *dev)
 
 			handle_nested_irq(ab8500->irq_base + line);
 			value &= ~(1 << bit);
+
 		} while (value);
 	}
-
+	atomic_dec(&ab8500->transfer_ongoing);
 	return IRQ_HANDLED;
 }
 
@@ -411,6 +560,14 @@ static void ab8500_irq_remove(struct ab8500 *ab8500)
 	}
 }
 
+int ab8500_suspend(struct ab8500 *ab8500)
+{
+	if (atomic_read(&ab8500->transfer_ongoing))
+		return -EINVAL;
+	else
+		return 0;
+}
+
 /* AB8500 GPIO Resources */
 static struct resource __devinitdata ab8500_gpio_resources[] = {
 	{
@@ -744,6 +901,39 @@ static struct resource __devinitdata ab8500_usb_resources[] = {
 	},
 };
 
+static struct resource __devinitdata ab8505_iddet_resources[] = {
+	{
+		.name  = "KeyDeglitch",
+		.start = AB8505_INT_KEYDEGLITCH,
+		.end   = AB8505_INT_KEYDEGLITCH,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name  = "KP",
+		.start = AB8505_INT_KP,
+		.end   = AB8505_INT_KP,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name  = "IKP",
+		.start = AB8505_INT_IKP,
+		.end   = AB8505_INT_IKP,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name  = "IKR",
+		.start = AB8505_INT_IKR,
+		.end   = AB8505_INT_IKR,
+		.flags = IORESOURCE_IRQ,
+	},
+	{
+		.name  = "KeyStuck",
+		.start = AB8505_INT_KEYSTUCK,
+		.end   = AB8505_INT_KEYSTUCK,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+
 static struct resource __devinitdata ab8500_temp_resources[] = {
 	{
 		.name  = "AB8500_TEMP_WARM",
@@ -778,35 +968,11 @@ static struct mfd_cell __devinitdata abx500_common_devs[] = {
 		.resources = ab8500_rtc_resources,
 	},
 	{
-		.name = "ab8500-charger",
-		.num_resources = ARRAY_SIZE(ab8500_charger_resources),
-		.resources = ab8500_charger_resources,
-	},
-	{
-		.name = "ab8500-btemp",
-		.num_resources = ARRAY_SIZE(ab8500_btemp_resources),
-		.resources = ab8500_btemp_resources,
-	},
-	{
-		.name = "ab8500-fg",
-		.num_resources = ARRAY_SIZE(ab8500_fg_resources),
-		.resources = ab8500_fg_resources,
-	},
-	{
-		.name = "ab8500-chargalg",
-		.num_resources = ARRAY_SIZE(ab8500_chargalg_resources),
-		.resources = ab8500_chargalg_resources,
-	},
-	{
 		.name = "ab8500-acc-det",
 		.num_resources = ARRAY_SIZE(ab8500_av_acc_detect_resources),
 		.resources = ab8500_av_acc_detect_resources,
 	},
 	{
-		.name = "ab8500-codec",
-	},
-
-	{
 		.name = "ab8500-poweron-key",
 		.num_resources = ARRAY_SIZE(ab8500_poweronkey_db_resources),
 		.resources = ab8500_poweronkey_db_resources,
@@ -834,6 +1000,29 @@ static struct mfd_cell __devinitdata abx500_common_devs[] = {
 	},
 };
 
+static struct mfd_cell __devinitdata ab8500_bm_devs[] = {
+	{
+		.name = "ab8500-charger",
+		.num_resources = ARRAY_SIZE(ab8500_charger_resources),
+		.resources = ab8500_charger_resources,
+	},
+	{
+		.name = "ab8500-btemp",
+		.num_resources = ARRAY_SIZE(ab8500_btemp_resources),
+		.resources = ab8500_btemp_resources,
+	},
+	{
+		.name = "ab8500-fg",
+		.num_resources = ARRAY_SIZE(ab8500_fg_resources),
+		.resources = ab8500_fg_resources,
+	},
+	{
+		.name = "ab8500-chargalg",
+		.num_resources = ARRAY_SIZE(ab8500_chargalg_resources),
+		.resources = ab8500_chargalg_resources,
+	},
+};
+
 static struct mfd_cell __devinitdata ab8500_devs[] = {
 	{
 		.name = "ab8500-gpio",
@@ -845,6 +1034,9 @@ static struct mfd_cell __devinitdata ab8500_devs[] = {
 		.num_resources = ARRAY_SIZE(ab8500_usb_resources),
 		.resources = ab8500_usb_resources,
 	},
+	{
+		.name = "ab8500-codec",
+	},
 };
 
 static struct mfd_cell __devinitdata ab9540_devs[] = {
@@ -858,6 +1050,18 @@ static struct mfd_cell __devinitdata ab9540_devs[] = {
 		.num_resources = ARRAY_SIZE(ab8500_usb_resources),
 		.resources = ab8500_usb_resources,
 	},
+	{
+		.name = "ab9540-codec",
+	},
+};
+
+/* Device list common to ab9540 and ab8505 */
+static struct mfd_cell __devinitdata ab9540_ab8505_devs[] = {
+	{
+		.name = "ab-iddet",
+		.num_resources = ARRAY_SIZE(ab8505_iddet_resources),
+		.resources = ab8505_iddet_resources,
+	},
 };
 
 static ssize_t show_chip_id(struct device *dev,
@@ -1003,18 +1207,66 @@ static struct attribute_group ab9540_attr_group = {
 	.attrs	= ab9540_sysfs_entries,
 };
 
-int __devinit ab8500_init(struct ab8500 *ab8500, enum ab8500_version version)
+static const struct of_device_id ab8500_match[] = {
+	{
+		.compatible = "stericsson,ab8500",
+		.data = (void *)AB8500_VERSION_AB8500,
+	},
+	{},
+};
+
+static int __devinit ab8500_probe(struct platform_device *pdev)
 {
-	struct ab8500_platform_data *plat = dev_get_platdata(ab8500->dev);
+	struct ab8500_platform_data *plat = dev_get_platdata(&pdev->dev);
+	const struct platform_device_id *platid = platform_get_device_id(pdev);
+	enum ab8500_version version = AB8500_VERSION_UNDEFINED;
+	struct device_node *np = pdev->dev.of_node;
+	struct ab8500 *ab8500;
+	struct resource *resource;
 	int ret;
 	int i;
 	u8 value;
 
+	ab8500 = kzalloc(sizeof *ab8500, GFP_KERNEL);
+	if (!ab8500)
+		return -ENOMEM;
+
 	if (plat)
 		ab8500->irq_base = plat->irq_base;
+	else if (np)
+		ret = of_property_read_u32(np, "stericsson,irq-base", &ab8500->irq_base);
+
+	if (!ab8500->irq_base) {
+		dev_info(&pdev->dev, "couldn't find irq-base\n");
+		ret = -EINVAL;
+		goto out_free_ab8500;
+	}
+
+	ab8500->dev = &pdev->dev;
+
+	resource = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!resource) {
+		ret = -ENODEV;
+		goto out_free_ab8500;
+	}
+
+	ab8500->irq = resource->start;
+
+	ab8500->read = ab8500_i2c_read;
+	ab8500->write = ab8500_i2c_write;
+	ab8500->write_masked = ab8500_i2c_write_masked;
 
 	mutex_init(&ab8500->lock);
 	mutex_init(&ab8500->irq_lock);
+	atomic_set(&ab8500->transfer_ongoing, 0);
+
+	platform_set_drvdata(pdev, ab8500);
+
+	if (platid)
+		version = platid->driver_data;
+	else if (np)
+		version = (unsigned int)
+			of_match_device(ab8500_match, &pdev->dev)->data;
 
 	if (version != AB8500_VERSION_UNDEFINED)
 		ab8500->version = version;
@@ -1022,7 +1274,7 @@ int __devinit ab8500_init(struct ab8500 *ab8500, enum ab8500_version version)
 		ret = get_register_interruptible(ab8500, AB8500_MISC,
 			AB8500_IC_NAME_REG, &value);
 		if (ret < 0)
-			return ret;
+			goto out_free_ab8500;
 
 		ab8500->version = value;
 	}
@@ -1030,7 +1282,7 @@ int __devinit ab8500_init(struct ab8500 *ab8500, enum ab8500_version version)
 	ret = get_register_interruptible(ab8500, AB8500_MISC,
 		AB8500_REV_REG, &value);
 	if (ret < 0)
-		return ret;
+		goto out_free_ab8500;
 
 	ab8500->chip_id = value;
 
@@ -1105,30 +1357,57 @@ int __devinit ab8500_init(struct ab8500 *ab8500, enum ab8500_version version)
 		if (ret)
 			goto out_freeoldmask;
 
-		ret = request_threaded_irq(ab8500->irq, NULL, ab8500_irq,
-					   IRQF_ONESHOT | IRQF_NO_SUSPEND,
-					   "ab8500", ab8500);
+		/*  Activate this feature only in ab9540 */
+		/*  till tests are done on ab8500 1p2 or later*/
+		if (is_ab9540(ab8500))
+			ret = request_threaded_irq(ab8500->irq, NULL,
+					ab8500_hierarchical_irq,
+					IRQF_ONESHOT | IRQF_NO_SUSPEND,
+					"ab8500", ab8500);
+		else
+			ret = request_threaded_irq(ab8500->irq, NULL,
+					ab8500_irq,
+					IRQF_ONESHOT | IRQF_NO_SUSPEND,
+					"ab8500", ab8500);
 		if (ret)
 			goto out_removeirq;
 	}
 
-	ret = mfd_add_devices(ab8500->dev, 0, abx500_common_devs,
-			      ARRAY_SIZE(abx500_common_devs), NULL,
-			      ab8500->irq_base);
+	if (!np) {
+		ret = mfd_add_devices(ab8500->dev, 0, abx500_common_devs,
+				ARRAY_SIZE(abx500_common_devs), NULL,
+				ab8500->irq_base);
 
-	if (ret)
-		goto out_freeirq;
+		if (ret)
+			goto out_freeirq;
+
+		if (is_ab9540(ab8500))
+			ret = mfd_add_devices(ab8500->dev, 0, ab9540_devs,
+					ARRAY_SIZE(ab9540_devs), NULL,
+					ab8500->irq_base);
+		else
+			ret = mfd_add_devices(ab8500->dev, 0, ab8500_devs,
+					ARRAY_SIZE(ab8500_devs), NULL,
+					ab8500->irq_base);
+		if (ret)
+			goto out_freeirq;
 
-	if (is_ab9540(ab8500))
-		ret = mfd_add_devices(ab8500->dev, 0, ab9540_devs,
-			      ARRAY_SIZE(ab9540_devs), NULL,
-			      ab8500->irq_base);
-	else
-		ret = mfd_add_devices(ab8500->dev, 0, ab8500_devs,
-			      ARRAY_SIZE(ab9540_devs), NULL,
-			      ab8500->irq_base);
-	if (ret)
-		goto out_freeirq;
+		if (is_ab9540(ab8500) || is_ab8505(ab8500))
+			ret = mfd_add_devices(ab8500->dev, 0, ab9540_ab8505_devs,
+					ARRAY_SIZE(ab9540_ab8505_devs), NULL,
+					ab8500->irq_base);
+		if (ret)
+			goto out_freeirq;
+	}
+
+	if (!no_bm) {
+		/* Add battery management devices */
+		ret = mfd_add_devices(ab8500->dev, 0, ab8500_bm_devs,
+				      ARRAY_SIZE(ab8500_bm_devs), NULL,
+				      ab8500->irq_base);
+		if (ret)
+			dev_err(ab8500->dev, "error adding bm devices\n");
+	}
 
 	if (is_ab9540(ab8500))
 		ret = sysfs_create_group(&ab8500->dev->kobj,
@@ -1151,12 +1430,16 @@ out_freeoldmask:
 	kfree(ab8500->oldmask);
 out_freemask:
 	kfree(ab8500->mask);
+out_free_ab8500:
+	kfree(ab8500);
 
 	return ret;
 }
 
-int __devexit ab8500_exit(struct ab8500 *ab8500)
+static int __devexit ab8500_remove(struct platform_device *pdev)
 {
+	struct ab8500 *ab8500 = platform_get_drvdata(pdev);
+
 	if (is_ab9540(ab8500))
 		sysfs_remove_group(&ab8500->dev->kobj, &ab9540_attr_group);
 	else
@@ -1168,10 +1451,42 @@ int __devexit ab8500_exit(struct ab8500 *ab8500)
 	}
 	kfree(ab8500->oldmask);
 	kfree(ab8500->mask);
+	kfree(ab8500);
 
 	return 0;
 }
 
+static const struct platform_device_id ab8500_id[] = {
+	{ "ab8500-core", AB8500_VERSION_AB8500 },
+	{ "ab8505-i2c", AB8500_VERSION_AB8505 },
+	{ "ab9540-i2c", AB8500_VERSION_AB9540 },
+	{ "ab8540-i2c", AB8500_VERSION_AB8540 },
+	{ }
+};
+
+static struct platform_driver ab8500_core_driver = {
+	.driver = {
+		.name = "ab8500-core",
+		.owner = THIS_MODULE,
+		.of_match_table = ab8500_match,
+	},
+	.probe	= ab8500_probe,
+	.remove	= __devexit_p(ab8500_remove),
+	.id_table = ab8500_id,
+};
+
+static int __init ab8500_core_init(void)
+{
+	return platform_driver_register(&ab8500_core_driver);
+}
+
+static void __exit ab8500_core_exit(void)
+{
+	platform_driver_unregister(&ab8500_core_driver);
+}
+arch_initcall(ab8500_core_init);
+module_exit(ab8500_core_exit);
+
 MODULE_AUTHOR("Mattias Wallin, Srinidhi Kasagar, Rabin Vincent");
 MODULE_DESCRIPTION("AB8500 MFD core");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c
index 9a0211aa8897..50c4c89ab220 100644
--- a/drivers/mfd/ab8500-debugfs.c
+++ b/drivers/mfd/ab8500-debugfs.c
@@ -608,10 +608,16 @@ static int __devexit ab8500_debug_remove(struct platform_device *plf)
 	return 0;
 }
 
+static const struct of_device_id ab8500_debug_match[] = {
+        { .compatible = "stericsson,ab8500-debug", },
+        {}
+};
+
 static struct platform_driver ab8500_debug_driver = {
 	.driver = {
 		.name = "ab8500-debug",
 		.owner = THIS_MODULE,
+		.of_match_table = ab8500_debug_match,
 	},
 	.probe  = ab8500_debug_probe,
 	.remove = __devexit_p(ab8500_debug_remove)
diff --git a/drivers/mfd/ab8500-gpadc.c b/drivers/mfd/ab8500-gpadc.c
index c39fc716e1dc..b86fd8e1ec3f 100644
--- a/drivers/mfd/ab8500-gpadc.c
+++ b/drivers/mfd/ab8500-gpadc.c
@@ -584,7 +584,7 @@ static int __devinit ab8500_gpadc_probe(struct platform_device *pdev)
 
 	gpadc->irq = platform_get_irq_byname(pdev, "SW_CONV_END");
 	if (gpadc->irq < 0) {
-		dev_err(gpadc->dev, "failed to get platform irq-%d\n",
+		dev_err(&pdev->dev, "failed to get platform irq-%d\n",
 			gpadc->irq);
 		ret = gpadc->irq;
 		goto fail;
@@ -648,12 +648,18 @@ static int __devexit ab8500_gpadc_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct of_device_id ab8500_gpadc_match[] = {
+	{ .compatible = "stericsson,ab8500-gpadc", },
+	{}
+};
+
 static struct platform_driver ab8500_gpadc_driver = {
 	.probe = ab8500_gpadc_probe,
 	.remove = __devexit_p(ab8500_gpadc_remove),
 	.driver = {
 		.name = "ab8500-gpadc",
 		.owner = THIS_MODULE,
+		.of_match_table = ab8500_gpadc_match,
 	},
 };
 
diff --git a/drivers/mfd/ab8500-i2c.c b/drivers/mfd/ab8500-i2c.c
deleted file mode 100644
index b83045f102be..000000000000
--- a/drivers/mfd/ab8500-i2c.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (C) ST-Ericsson SA 2010
- * Author: Mattias Wallin <[email protected]> for ST-Ericsson.
- * License Terms: GNU General Public License v2
- * This file was based on drivers/mfd/ab8500-spi.c
- */
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/mfd/abx500/ab8500.h>
-#include <linux/mfd/dbx500-prcmu.h>
-
-static int ab8500_i2c_write(struct ab8500 *ab8500, u16 addr, u8 data)
-{
-	int ret;
-
-	ret = prcmu_abb_write((u8)(addr >> 8), (u8)(addr & 0xFF), &data, 1);
-	if (ret < 0)
-		dev_err(ab8500->dev, "prcmu i2c error %d\n", ret);
-	return ret;
-}
-
-static int ab8500_i2c_write_masked(struct ab8500 *ab8500, u16 addr, u8 mask,
-	u8 data)
-{
-	int ret;
-
-	ret = prcmu_abb_write_masked((u8)(addr >> 8), (u8)(addr & 0xFF), &data,
-		&mask, 1);
-	if (ret < 0)
-		dev_err(ab8500->dev, "prcmu i2c error %d\n", ret);
-	return ret;
-}
-
-static int ab8500_i2c_read(struct ab8500 *ab8500, u16 addr)
-{
-	int ret;
-	u8 data;
-
-	ret = prcmu_abb_read((u8)(addr >> 8), (u8)(addr & 0xFF), &data, 1);
-	if (ret < 0) {
-		dev_err(ab8500->dev, "prcmu i2c error %d\n", ret);
-		return ret;
-	}
-	return (int)data;
-}
-
-static int __devinit ab8500_i2c_probe(struct platform_device *plf)
-{
-	const struct platform_device_id *platid = platform_get_device_id(plf);
-	struct ab8500 *ab8500;
-	struct resource *resource;
-	int ret;
-
-	ab8500 = kzalloc(sizeof *ab8500, GFP_KERNEL);
-	if (!ab8500)
-		return -ENOMEM;
-
-	ab8500->dev = &plf->dev;
-
-	resource = platform_get_resource(plf, IORESOURCE_IRQ, 0);
-	if (!resource) {
-		kfree(ab8500);
-		return -ENODEV;
-	}
-
-	ab8500->irq = resource->start;
-
-	ab8500->read = ab8500_i2c_read;
-	ab8500->write = ab8500_i2c_write;
-	ab8500->write_masked = ab8500_i2c_write_masked;
-
-	platform_set_drvdata(plf, ab8500);
-
-	ret = ab8500_init(ab8500, platid->driver_data);
-	if (ret)
-		kfree(ab8500);
-
-
-	return ret;
-}
-
-static int __devexit ab8500_i2c_remove(struct platform_device *plf)
-{
-	struct ab8500 *ab8500 = platform_get_drvdata(plf);
-
-	ab8500_exit(ab8500);
-	kfree(ab8500);
-
-	return 0;
-}
-
-static const struct platform_device_id ab8500_id[] = {
-	{ "ab8500-i2c", AB8500_VERSION_AB8500 },
-	{ "ab8505-i2c", AB8500_VERSION_AB8505 },
-	{ "ab9540-i2c", AB8500_VERSION_AB9540 },
-	{ "ab8540-i2c", AB8500_VERSION_AB8540 },
-	{ }
-};
-
-static struct platform_driver ab8500_i2c_driver = {
-	.driver = {
-		.name = "ab8500-i2c",
-		.owner = THIS_MODULE,
-	},
-	.probe	= ab8500_i2c_probe,
-	.remove	= __devexit_p(ab8500_i2c_remove),
-	.id_table = ab8500_id,
-};
-
-static int __init ab8500_i2c_init(void)
-{
-	return platform_driver_register(&ab8500_i2c_driver);
-}
-
-static void __exit ab8500_i2c_exit(void)
-{
-	platform_driver_unregister(&ab8500_i2c_driver);
-}
-arch_initcall(ab8500_i2c_init);
-module_exit(ab8500_i2c_exit);
-
-MODULE_AUTHOR("Mattias WALLIN <[email protected]");
-MODULE_DESCRIPTION("AB8500 Core access via PRCMU I2C");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/mfd/ab8500-sysctrl.c b/drivers/mfd/ab8500-sysctrl.c
index c28d4eb1eff0..5a3e51ccf258 100644
--- a/drivers/mfd/ab8500-sysctrl.c
+++ b/drivers/mfd/ab8500-sysctrl.c
@@ -61,10 +61,16 @@ static int __devexit ab8500_sysctrl_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct of_device_id ab8500_sysctrl_match[] = {
+	{ .compatible = "stericsson,ab8500-sysctrl", },
+	{}
+};
+
 static struct platform_driver ab8500_sysctrl_driver = {
 	.driver = {
 		.name = "ab8500-sysctrl",
 		.owner = THIS_MODULE,
+		.of_match_table = ab8500_sysctrl_match,
 	},
 	.probe = ab8500_sysctrl_probe,
 	.remove = __devexit_p(ab8500_sysctrl_remove),
diff --git a/drivers/mfd/anatop-mfd.c b/drivers/mfd/anatop-mfd.c
index 2af42480635e..6da06341f6c9 100644
--- a/drivers/mfd/anatop-mfd.c
+++ b/drivers/mfd/anatop-mfd.c
@@ -41,39 +41,26 @@
 #include <linux/of_address.h>
 #include <linux/mfd/anatop.h>
 
-u32 anatop_get_bits(struct anatop *adata, u32 addr, int bit_shift,
-		    int bit_width)
+u32 anatop_read_reg(struct anatop *adata, u32 addr)
 {
-	u32 val, mask;
-
-	if (bit_width == 32)
-		mask = ~0;
-	else
-		mask = (1 << bit_width) - 1;
-
-	val = readl(adata->ioreg + addr);
-	val = (val >> bit_shift) & mask;
-
-	return val;
+	return readl(adata->ioreg + addr);
 }
-EXPORT_SYMBOL_GPL(anatop_get_bits);
+EXPORT_SYMBOL_GPL(anatop_read_reg);
 
-void anatop_set_bits(struct anatop *adata, u32 addr, int bit_shift,
-		     int bit_width, u32 data)
+void anatop_write_reg(struct anatop *adata, u32 addr, u32 data, u32 mask)
 {
-	u32 val, mask;
+	u32 val;
 
-	if (bit_width == 32)
-		mask = ~0;
-	else
-		mask = (1 << bit_width) - 1;
+	data &= mask;
 
 	spin_lock(&adata->reglock);
-	val = readl(adata->ioreg + addr) & ~(mask << bit_shift);
-	writel((data << bit_shift) | val, adata->ioreg + addr);
+	val = readl(adata->ioreg + addr);
+	val &= ~mask;
+	val |= data;
+	writel(val, adata->ioreg + addr);
 	spin_unlock(&adata->reglock);
 }
-EXPORT_SYMBOL_GPL(anatop_set_bits);
+EXPORT_SYMBOL_GPL(anatop_write_reg);
 
 static const struct of_device_id of_anatop_match[] = {
 	{ .compatible = "fsl,imx6q-anatop", },
diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c
index 1582c3d95257..383421bf5760 100644
--- a/drivers/mfd/asic3.c
+++ b/drivers/mfd/asic3.c
@@ -353,12 +353,28 @@ static int asic3_gpio_irq_type(struct irq_data *data, unsigned int type)
 	return 0;
 }
 
+static int asic3_gpio_irq_set_wake(struct irq_data *data, unsigned int on)
+{
+	struct asic3 *asic = irq_data_get_irq_chip_data(data);
+	u32 bank, index;
+	u16 bit;
+
+	bank = asic3_irq_to_bank(asic, data->irq);
+	index = asic3_irq_to_index(asic, data->irq);
+	bit = 1<<index;
+
+	asic3_set_register(asic, bank + ASIC3_GPIO_SLEEP_MASK, bit, !on);
+
+	return 0;
+}
+
 static struct irq_chip asic3_gpio_irq_chip = {
 	.name		= "ASIC3-GPIO",
 	.irq_ack	= asic3_mask_gpio_irq,
 	.irq_mask	= asic3_mask_gpio_irq,
 	.irq_unmask	= asic3_unmask_gpio_irq,
 	.irq_set_type	= asic3_gpio_irq_type,
+	.irq_set_wake	= asic3_gpio_irq_set_wake,
 };
 
 static struct irq_chip asic3_irq_chip = {
@@ -529,7 +545,7 @@ static int asic3_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
 {
 	struct asic3 *asic = container_of(chip, struct asic3, gpio);
 
-	return (offset < ASIC3_NUM_GPIOS) ? asic->irq_base + offset : -ENXIO;
+	return asic->irq_base + offset;
 }
 
 static __init int asic3_gpio_probe(struct platform_device *pdev,
@@ -894,10 +910,13 @@ static int __init asic3_mfd_probe(struct platform_device *pdev,
 	asic3_mmc_resources[0].start >>= asic->bus_shift;
 	asic3_mmc_resources[0].end   >>= asic->bus_shift;
 
-	ret = mfd_add_devices(&pdev->dev, pdev->id,
+	if (pdata->clock_rate) {
+		ds1wm_pdata.clock_rate = pdata->clock_rate;
+		ret = mfd_add_devices(&pdev->dev, pdev->id,
 			&asic3_cell_ds1wm, 1, mem, asic->irq_base);
-	if (ret < 0)
-		goto out;
+		if (ret < 0)
+			goto out;
+	}
 
 	if (mem_sdio && (irq >= 0)) {
 		ret = mfd_add_devices(&pdev->dev, pdev->id,
@@ -1000,6 +1019,9 @@ static int __init asic3_probe(struct platform_device *pdev)
 
 	asic3_mfd_probe(pdev, pdata, mem);
 
+	asic3_set_register(asic, ASIC3_OFFSET(EXTCF, SELECT),
+		(ASIC3_EXTCF_CF0_BUF_EN|ASIC3_EXTCF_CF0_PWAIT_EN), 1);
+
 	dev_info(asic->dev, "ASIC3 Core driver\n");
 
 	return 0;
@@ -1021,6 +1043,9 @@ static int __devexit asic3_remove(struct platform_device *pdev)
 	int ret;
 	struct asic3 *asic = platform_get_drvdata(pdev);
 
+	asic3_set_register(asic, ASIC3_OFFSET(EXTCF, SELECT),
+		(ASIC3_EXTCF_CF0_BUF_EN|ASIC3_EXTCF_CF0_PWAIT_EN), 0);
+
 	asic3_mfd_remove(pdev);
 
 	ret = asic3_gpio_remove(pdev);
diff --git a/drivers/mfd/cs5535-mfd.c b/drivers/mfd/cs5535-mfd.c
index 315fef5d466a..3419e726de47 100644
--- a/drivers/mfd/cs5535-mfd.c
+++ b/drivers/mfd/cs5535-mfd.c
@@ -186,18 +186,7 @@ static struct pci_driver cs5535_mfd_driver = {
 	.remove = __devexit_p(cs5535_mfd_remove),
 };
 
-static int __init cs5535_mfd_init(void)
-{
-	return pci_register_driver(&cs5535_mfd_driver);
-}
-
-static void __exit cs5535_mfd_exit(void)
-{
-	pci_unregister_driver(&cs5535_mfd_driver);
-}
-
-module_init(cs5535_mfd_init);
-module_exit(cs5535_mfd_exit);
+module_pci_driver(cs5535_mfd_driver);
 
 MODULE_AUTHOR("Andres Salomon <[email protected]>");
 MODULE_DESCRIPTION("MFD driver for CS5535/CS5536 southbridge's ISA PCI device");
diff --git a/drivers/mfd/da9052-core.c b/drivers/mfd/da9052-core.c
index 7776aff46269..1f1313c90573 100644
--- a/drivers/mfd/da9052-core.c
+++ b/drivers/mfd/da9052-core.c
@@ -318,6 +318,135 @@ static bool da9052_reg_volatile(struct device *dev, unsigned int reg)
 	}
 }
 
+/*
+ * TBAT look-up table is computed from the R90 reg (8 bit register)
+ * reading as below. The battery temperature is in milliCentigrade
+ * TBAT = (1/(t1+1/298) - 273) * 1000 mC
+ * where t1 = (1/B)* ln(( ADCval * 2.5)/(R25*ITBAT*255))
+ * Default values are R25 = 10e3, B = 3380, ITBAT = 50e-6
+ * Example:
+ * R25=10E3, B=3380, ITBAT=50e-6, ADCVAL=62d calculates
+ * TBAT = 20015 mili degrees Centrigrade
+ *
+*/
+static const int32_t tbat_lookup[255] = {
+	183258, 144221, 124334, 111336, 101826, 94397, 88343, 83257,
+	78889, 75071, 71688, 68656, 65914, 63414, 61120, 59001,
+	570366, 55204, 53490, 51881, 50364, 48931, 47574, 46285,
+	45059, 43889, 42772, 41703, 40678, 39694, 38748, 37838,
+	36961, 36115, 35297, 34507, 33743, 33002, 32284, 31588,
+	30911, 30254, 29615, 28994, 28389, 27799, 27225, 26664,
+	26117, 25584, 25062, 24553, 24054, 23567, 23091, 22624,
+	22167, 21719, 21281, 20851, 20429, 20015, 19610, 19211,
+	18820, 18436, 18058, 17688, 17323, 16965, 16612, 16266,
+	15925, 15589, 15259, 14933, 14613, 14298, 13987, 13681,
+	13379, 13082, 12788, 12499, 12214, 11933, 11655, 11382,
+	11112, 10845, 10582, 10322, 10066, 9812, 9562, 9315,
+	9071, 8830, 8591, 8356, 8123, 7893, 7665, 7440,
+	7218, 6998, 6780, 6565, 6352, 6141, 5933, 5726,
+	5522, 5320, 5120, 4922, 4726, 4532, 4340, 4149,
+	3961, 3774, 3589, 3406, 3225, 3045, 2867, 2690,
+	2516, 2342, 2170, 2000, 1831, 1664, 1498, 1334,
+	1171, 1009, 849, 690, 532, 376, 221, 67,
+	-84, -236, -386, -535, -683, -830, -975, -1119,
+	-1263, -1405, -1546, -1686, -1825, -1964, -2101, -2237,
+	-2372, -2506, -2639, -2771, -2902, -3033, -3162, -3291,
+	-3418, -3545, -3671, -3796, -3920, -4044, -4166, -4288,
+	-4409, -4529, -4649, -4767, -4885, -5002, -5119, -5235,
+	-5349, -5464, -5577, -5690, -5802, -5913, -6024, -6134,
+	-6244, -6352, -6461, -6568, -6675, -6781, -6887, -6992,
+	-7096, -7200, -7303, -7406, -7508, -7609, -7710, -7810,
+	-7910, -8009, -8108, -8206, -8304, -8401, -8497, -8593,
+	-8689, -8784, -8878, -8972, -9066, -9159, -9251, -9343,
+	-9435, -9526, -9617, -9707, -9796, -9886, -9975, -10063,
+	-10151, -10238, -10325, -10412, -10839, -10923, -11007, -11090,
+	-11173, -11256, -11338, -11420, -11501, -11583, -11663, -11744,
+	-11823, -11903, -11982
+};
+
+static const u8 chan_mux[DA9052_ADC_VBBAT + 1] = {
+	[DA9052_ADC_VDDOUT]	= DA9052_ADC_MAN_MUXSEL_VDDOUT,
+	[DA9052_ADC_ICH]	= DA9052_ADC_MAN_MUXSEL_ICH,
+	[DA9052_ADC_TBAT]	= DA9052_ADC_MAN_MUXSEL_TBAT,
+	[DA9052_ADC_VBAT]	= DA9052_ADC_MAN_MUXSEL_VBAT,
+	[DA9052_ADC_IN4]	= DA9052_ADC_MAN_MUXSEL_AD4,
+	[DA9052_ADC_IN5]	= DA9052_ADC_MAN_MUXSEL_AD5,
+	[DA9052_ADC_IN6]	= DA9052_ADC_MAN_MUXSEL_AD6,
+	[DA9052_ADC_VBBAT]	= DA9052_ADC_MAN_MUXSEL_VBBAT
+};
+
+int da9052_adc_manual_read(struct da9052 *da9052, unsigned char channel)
+{
+	int ret;
+	unsigned short calc_data;
+	unsigned short data;
+	unsigned char mux_sel;
+
+	if (channel > DA9052_ADC_VBBAT)
+		return -EINVAL;
+
+	mutex_lock(&da9052->auxadc_lock);
+
+	/* Channel gets activated on enabling the Conversion bit */
+	mux_sel = chan_mux[channel] | DA9052_ADC_MAN_MAN_CONV;
+
+	ret = da9052_reg_write(da9052, DA9052_ADC_MAN_REG, mux_sel);
+	if (ret < 0)
+		goto err;
+
+	/* Wait for an interrupt */
+	if (!wait_for_completion_timeout(&da9052->done,
+					 msecs_to_jiffies(500))) {
+		dev_err(da9052->dev,
+			"timeout waiting for ADC conversion interrupt\n");
+		ret = -ETIMEDOUT;
+		goto err;
+	}
+
+	ret = da9052_reg_read(da9052, DA9052_ADC_RES_H_REG);
+	if (ret < 0)
+		goto err;
+
+	calc_data = (unsigned short)ret;
+	data = calc_data << 2;
+
+	ret = da9052_reg_read(da9052, DA9052_ADC_RES_L_REG);
+	if (ret < 0)
+		goto err;
+
+	calc_data = (unsigned short)(ret & DA9052_ADC_RES_LSB);
+	data |= calc_data;
+
+	ret = data;
+
+err:
+	mutex_unlock(&da9052->auxadc_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(da9052_adc_manual_read);
+
+static irqreturn_t da9052_auxadc_irq(int irq, void *irq_data)
+{
+	struct da9052 *da9052 = irq_data;
+
+	complete(&da9052->done);
+
+	return IRQ_HANDLED;
+}
+
+int da9052_adc_read_temp(struct da9052 *da9052)
+{
+	int tbat;
+
+	tbat = da9052_reg_read(da9052, DA9052_TBAT_RES_REG);
+	if (tbat <= 0)
+		return tbat;
+
+	/* ARRAY_SIZE check is not needed since TBAT is a 8-bit register */
+	return tbat_lookup[tbat - 1];
+}
+EXPORT_SYMBOL_GPL(da9052_adc_read_temp);
+
 static struct resource da9052_rtc_resource = {
 	.name = "ALM",
 	.start = DA9052_IRQ_ALARM,
@@ -646,6 +775,9 @@ int __devinit da9052_device_init(struct da9052 *da9052, u8 chip_id)
 	struct irq_desc *desc;
 	int ret;
 
+	mutex_init(&da9052->auxadc_lock);
+	init_completion(&da9052->done);
+
 	if (pdata && pdata->init != NULL)
 		pdata->init(da9052);
 
@@ -665,6 +797,12 @@ int __devinit da9052_device_init(struct da9052 *da9052, u8 chip_id)
 
 	da9052->irq_base = regmap_irq_chip_get_base(da9052->irq_data);
 
+	ret = request_threaded_irq(DA9052_IRQ_ADC_EOM, NULL, da9052_auxadc_irq,
+				   IRQF_TRIGGER_LOW | IRQF_ONESHOT,
+				   "adc irq", da9052);
+	if (ret != 0)
+		dev_err(da9052->dev, "DA9052 ADC IRQ failed ret=%d\n", ret);
+
 	ret = mfd_add_devices(da9052->dev, -1, da9052_subdev_info,
 			      ARRAY_SIZE(da9052_subdev_info), NULL, 0);
 	if (ret)
@@ -673,6 +811,7 @@ int __devinit da9052_device_init(struct da9052 *da9052, u8 chip_id)
 	return 0;
 
 err:
+	free_irq(DA9052_IRQ_ADC_EOM, da9052);
 	mfd_remove_devices(da9052->dev);
 regmap_err:
 	return ret;
@@ -680,6 +819,7 @@ regmap_err:
 
 void da9052_device_exit(struct da9052 *da9052)
 {
+	free_irq(DA9052_IRQ_ADC_EOM, da9052);
 	regmap_del_irq_chip(da9052->chip_irq, da9052->irq_data);
 	mfd_remove_devices(da9052->dev);
 }
diff --git a/drivers/mfd/da9052-i2c.c b/drivers/mfd/da9052-i2c.c
index 36b88e395499..82c9d6450286 100644
--- a/drivers/mfd/da9052-i2c.c
+++ b/drivers/mfd/da9052-i2c.c
@@ -22,6 +22,11 @@
 #include <linux/mfd/da9052/da9052.h>
 #include <linux/mfd/da9052/reg.h>
 
+#ifdef CONFIG_OF
+#include <linux/of.h>
+#include <linux/of_device.h>
+#endif
+
 static int da9052_i2c_enable_multiwrite(struct da9052 *da9052)
 {
 	int reg_val, ret;
@@ -41,13 +46,31 @@ static int da9052_i2c_enable_multiwrite(struct da9052 *da9052)
 	return 0;
 }
 
+static struct i2c_device_id da9052_i2c_id[] = {
+	{"da9052", DA9052},
+	{"da9053-aa", DA9053_AA},
+	{"da9053-ba", DA9053_BA},
+	{"da9053-bb", DA9053_BB},
+	{}
+};
+
+#ifdef CONFIG_OF
+static const struct of_device_id dialog_dt_ids[] = {
+	{ .compatible = "dlg,da9052", .data = &da9052_i2c_id[0] },
+	{ .compatible = "dlg,da9053-aa", .data = &da9052_i2c_id[1] },
+	{ .compatible = "dlg,da9053-ab", .data = &da9052_i2c_id[2] },
+	{ .compatible = "dlg,da9053-bb", .data = &da9052_i2c_id[3] },
+	{ /* sentinel */ }
+};
+#endif
+
 static int __devinit da9052_i2c_probe(struct i2c_client *client,
 				       const struct i2c_device_id *id)
 {
 	struct da9052 *da9052;
 	int ret;
 
-	da9052 = kzalloc(sizeof(struct da9052), GFP_KERNEL);
+	da9052 = devm_kzalloc(&client->dev, sizeof(struct da9052), GFP_KERNEL);
 	if (!da9052)
 		return -ENOMEM;
 
@@ -55,8 +78,7 @@ static int __devinit da9052_i2c_probe(struct i2c_client *client,
 				     I2C_FUNC_SMBUS_BYTE_DATA)) {
 		dev_info(&client->dev, "Error in %s:i2c_check_functionality\n",
 			 __func__);
-		ret = -ENODEV;
-		goto err;
+		return  -ENODEV;
 	}
 
 	da9052->dev = &client->dev;
@@ -64,29 +86,39 @@ static int __devinit da9052_i2c_probe(struct i2c_client *client,
 
 	i2c_set_clientdata(client, da9052);
 
-	da9052->regmap = regmap_init_i2c(client, &da9052_regmap_config);
+	da9052->regmap = devm_regmap_init_i2c(client, &da9052_regmap_config);
 	if (IS_ERR(da9052->regmap)) {
 		ret = PTR_ERR(da9052->regmap);
 		dev_err(&client->dev, "Failed to allocate register map: %d\n",
 			ret);
-		goto err;
+		return ret;
 	}
 
 	ret = da9052_i2c_enable_multiwrite(da9052);
 	if (ret < 0)
-		goto err_regmap;
+		return ret;
+
+#ifdef CONFIG_OF
+	if (!id) {
+		struct device_node *np = client->dev.of_node;
+		const struct of_device_id *deviceid;
+
+		deviceid = of_match_node(dialog_dt_ids, np);
+		id = (const struct i2c_device_id *)deviceid->data;
+	}
+#endif
+
+	if (!id) {
+		ret = -ENODEV;
+		dev_err(&client->dev, "id is null.\n");
+		return ret;
+	}
 
 	ret = da9052_device_init(da9052, id->driver_data);
 	if (ret != 0)
-		goto err_regmap;
+		return ret;
 
 	return 0;
-
-err_regmap:
-	regmap_exit(da9052->regmap);
-err:
-	kfree(da9052);
-	return ret;
 }
 
 static int __devexit da9052_i2c_remove(struct i2c_client *client)
@@ -94,20 +126,9 @@ static int __devexit da9052_i2c_remove(struct i2c_client *client)
 	struct da9052 *da9052 = i2c_get_clientdata(client);
 
 	da9052_device_exit(da9052);
-	regmap_exit(da9052->regmap);
-	kfree(da9052);
-
 	return 0;
 }
 
-static struct i2c_device_id da9052_i2c_id[] = {
-	{"da9052", DA9052},
-	{"da9053-aa", DA9053_AA},
-	{"da9053-ba", DA9053_BA},
-	{"da9053-bb", DA9053_BB},
-	{}
-};
-
 static struct i2c_driver da9052_i2c_driver = {
 	.probe = da9052_i2c_probe,
 	.remove = __devexit_p(da9052_i2c_remove),
@@ -115,6 +136,9 @@ static struct i2c_driver da9052_i2c_driver = {
 	.driver = {
 		.name = "da9052",
 		.owner = THIS_MODULE,
+#ifdef CONFIG_OF
+		.of_match_table = dialog_dt_ids,
+#endif
 	},
 };
 
diff --git a/drivers/mfd/da9052-spi.c b/drivers/mfd/da9052-spi.c
index 6faf149e8d94..dbeadc5a6436 100644
--- a/drivers/mfd/da9052-spi.c
+++ b/drivers/mfd/da9052-spi.c
@@ -25,8 +25,9 @@ static int __devinit da9052_spi_probe(struct spi_device *spi)
 {
 	int ret;
 	const struct spi_device_id *id = spi_get_device_id(spi);
-	struct da9052 *da9052 = kzalloc(sizeof(struct da9052), GFP_KERNEL);
+	struct da9052 *da9052;
 
+	da9052 = devm_kzalloc(&spi->dev, sizeof(struct da9052), GFP_KERNEL);
 	if (!da9052)
 		return -ENOMEM;
 
@@ -42,25 +43,19 @@ static int __devinit da9052_spi_probe(struct spi_device *spi)
 	da9052_regmap_config.read_flag_mask = 1;
 	da9052_regmap_config.write_flag_mask = 0;
 
-	da9052->regmap = regmap_init_spi(spi, &da9052_regmap_config);
+	da9052->regmap = devm_regmap_init_spi(spi, &da9052_regmap_config);
 	if (IS_ERR(da9052->regmap)) {
 		ret = PTR_ERR(da9052->regmap);
 		dev_err(&spi->dev, "Failed to allocate register map: %d\n",
 			ret);
-		goto err;
+		return ret;
 	}
 
 	ret = da9052_device_init(da9052, id->driver_data);
 	if (ret != 0)
-		goto err_regmap;
+		return ret;
 
 	return 0;
-
-err_regmap:
-	regmap_exit(da9052->regmap);
-err:
-	kfree(da9052);
-	return ret;
 }
 
 static int __devexit da9052_spi_remove(struct spi_device *spi)
@@ -68,9 +63,6 @@ static int __devexit da9052_spi_remove(struct spi_device *spi)
 	struct da9052 *da9052 = dev_get_drvdata(&spi->dev);
 
 	da9052_device_exit(da9052);
-	regmap_exit(da9052->regmap);
-	kfree(da9052);
-
 	return 0;
 }
 
@@ -88,7 +80,6 @@ static struct spi_driver da9052_spi_driver = {
 	.id_table = da9052_spi_id,
 	.driver = {
 		.name = "da9052",
-		.bus = &spi_bus_type,
 		.owner = THIS_MODULE,
 	},
 };
diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c
index 5be32489714f..671c8bc14bbc 100644
--- a/drivers/mfd/db8500-prcmu.c
+++ b/drivers/mfd/db8500-prcmu.c
@@ -2720,6 +2720,7 @@ static struct regulator_consumer_supply db8500_vape_consumers[] = {
 	REGULATOR_SUPPLY("v-i2c", "nmk-i2c.1"),
 	REGULATOR_SUPPLY("v-i2c", "nmk-i2c.2"),
 	REGULATOR_SUPPLY("v-i2c", "nmk-i2c.3"),
+	REGULATOR_SUPPLY("v-i2c", "nmk-i2c.4"),
 	/* "v-mmc" changed to "vcore" in the mainline kernel */
 	REGULATOR_SUPPLY("vcore", "sdi0"),
 	REGULATOR_SUPPLY("vcore", "sdi1"),
@@ -2958,9 +2959,10 @@ static struct mfd_cell db8500_prcmu_devs[] = {
  * prcmu_fw_init - arch init call for the Linux PRCMU fw init logic
  *
  */
-static int __init db8500_prcmu_probe(struct platform_device *pdev)
+static int __devinit db8500_prcmu_probe(struct platform_device *pdev)
 {
-	int err = 0;
+	struct device_node *np = pdev->dev.of_node;
+	int irq = 0, err = 0;
 
 	if (ux500_is_svp())
 		return -ENODEV;
@@ -2970,8 +2972,14 @@ static int __init db8500_prcmu_probe(struct platform_device *pdev)
 	/* Clean up the mailbox interrupts after pre-kernel code. */
 	writel(ALL_MBOX_BITS, PRCM_ARM_IT1_CLR);
 
-	err = request_threaded_irq(IRQ_DB8500_PRCMU1, prcmu_irq_handler,
-		prcmu_irq_thread_fn, IRQF_NO_SUSPEND, "prcmu", NULL);
+	if (np)
+		irq = platform_get_irq(pdev, 0);
+
+	if (!np || irq <= 0)
+		irq = IRQ_DB8500_PRCMU1;
+
+	err = request_threaded_irq(irq, prcmu_irq_handler,
+	        prcmu_irq_thread_fn, IRQF_NO_SUSPEND, "prcmu", NULL);
 	if (err < 0) {
 		pr_err("prcmu: Failed to allocate IRQ_DB8500_PRCMU1.\n");
 		err = -EBUSY;
@@ -2981,14 +2989,16 @@ static int __init db8500_prcmu_probe(struct platform_device *pdev)
 	if (cpu_is_u8500v20_or_later())
 		prcmu_config_esram0_deep_sleep(ESRAM0_DEEP_SLEEP_STATE_RET);
 
-	err = mfd_add_devices(&pdev->dev, 0, db8500_prcmu_devs,
-			      ARRAY_SIZE(db8500_prcmu_devs), NULL,
-			      0);
+	if (!np) {
+		err = mfd_add_devices(&pdev->dev, 0, db8500_prcmu_devs,
+				ARRAY_SIZE(db8500_prcmu_devs), NULL, 0);
+		if (err) {
+			pr_err("prcmu: Failed to add subdevices\n");
+			return err;
+		}
+	}
 
-	if (err)
-		pr_err("prcmu: Failed to add subdevices\n");
-	else
-		pr_info("DB8500 PRCMU initialized\n");
+	pr_info("DB8500 PRCMU initialized\n");
 
 no_irq_return:
 	return err;
@@ -2999,11 +3009,12 @@ static struct platform_driver db8500_prcmu_driver = {
 		.name = "db8500-prcmu",
 		.owner = THIS_MODULE,
 	},
+	.probe = db8500_prcmu_probe,
 };
 
 static int __init db8500_prcmu_init(void)
 {
-	return platform_driver_probe(&db8500_prcmu_driver, db8500_prcmu_probe);
+	return platform_driver_register(&db8500_prcmu_driver);
 }
 
 arch_initcall(db8500_prcmu_init);
diff --git a/drivers/mfd/intel_msic.c b/drivers/mfd/intel_msic.c
index b76657eb0c51..59df5584cb58 100644
--- a/drivers/mfd/intel_msic.c
+++ b/drivers/mfd/intel_msic.c
@@ -406,7 +406,7 @@ static int __devinit intel_msic_probe(struct platform_device *pdev)
 		return -ENXIO;
 	}
 
-	msic = kzalloc(sizeof(*msic), GFP_KERNEL);
+	msic = devm_kzalloc(&pdev->dev, sizeof(*msic), GFP_KERNEL);
 	if (!msic)
 		return -ENOMEM;
 
@@ -421,21 +421,13 @@ static int __devinit intel_msic_probe(struct platform_device *pdev)
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!res) {
 		dev_err(&pdev->dev, "failed to get SRAM iomem resource\n");
-		ret = -ENODEV;
-		goto fail_free_msic;
+		return -ENODEV;
 	}
 
-	res = request_mem_region(res->start, resource_size(res), pdev->name);
-	if (!res) {
-		ret = -EBUSY;
-		goto fail_free_msic;
-	}
-
-	msic->irq_base = ioremap_nocache(res->start, resource_size(res));
+	msic->irq_base = devm_request_and_ioremap(&pdev->dev, res);
 	if (!msic->irq_base) {
 		dev_err(&pdev->dev, "failed to map SRAM memory\n");
-		ret = -ENOMEM;
-		goto fail_release_region;
+		return -ENOMEM;
 	}
 
 	platform_set_drvdata(pdev, msic);
@@ -443,7 +435,7 @@ static int __devinit intel_msic_probe(struct platform_device *pdev)
 	ret = intel_msic_init_devices(msic);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to initialize MSIC devices\n");
-		goto fail_unmap_mem;
+		return ret;
 	}
 
 	dev_info(&pdev->dev, "Intel MSIC version %c%d (vendor %#x)\n",
@@ -451,27 +443,14 @@ static int __devinit intel_msic_probe(struct platform_device *pdev)
 		 msic->vendor);
 
 	return 0;
-
-fail_unmap_mem:
-	iounmap(msic->irq_base);
-fail_release_region:
-	release_mem_region(res->start, resource_size(res));
-fail_free_msic:
-	kfree(msic);
-
-	return ret;
 }
 
 static int __devexit intel_msic_remove(struct platform_device *pdev)
 {
 	struct intel_msic *msic = platform_get_drvdata(pdev);
-	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 
 	intel_msic_remove_devices(msic);
 	platform_set_drvdata(pdev, NULL);
-	iounmap(msic->irq_base);
-	release_mem_region(res->start, resource_size(res));
-	kfree(msic);
 
 	return 0;
 }
diff --git a/drivers/mfd/janz-cmodio.c b/drivers/mfd/janz-cmodio.c
index a9223ed1b7c5..2ea99989551a 100644
--- a/drivers/mfd/janz-cmodio.c
+++ b/drivers/mfd/janz-cmodio.c
@@ -283,23 +283,8 @@ static struct pci_driver cmodio_pci_driver = {
 	.remove   = __devexit_p(cmodio_pci_remove),
 };
 
-/*
- * Module Init / Exit
- */
-
-static int __init cmodio_init(void)
-{
-	return pci_register_driver(&cmodio_pci_driver);
-}
-
-static void __exit cmodio_exit(void)
-{
-	pci_unregister_driver(&cmodio_pci_driver);
-}
+module_pci_driver(cmodio_pci_driver);
 
 MODULE_AUTHOR("Ira W. Snyder <[email protected]>");
 MODULE_DESCRIPTION("Janz CMOD-IO PCI MODULbus Carrier Board Driver");
 MODULE_LICENSE("GPL");
-
-module_init(cmodio_init);
-module_exit(cmodio_exit);
diff --git a/drivers/mfd/lm3533-core.c b/drivers/mfd/lm3533-core.c
new file mode 100644
index 000000000000..0b2879b87fd9
--- /dev/null
+++ b/drivers/mfd/lm3533-core.c
@@ -0,0 +1,667 @@
+/*
+ * lm3533-core.c -- LM3533 Core
+ *
+ * Copyright (C) 2011-2012 Texas Instruments
+ *
+ * Author: Johan Hovold <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under  the terms of the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/gpio.h>
+#include <linux/i2c.h>
+#include <linux/mfd/core.h>
+#include <linux/regmap.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <linux/mfd/lm3533.h>
+
+
+#define LM3533_BOOST_OVP_MASK		0x06
+#define LM3533_BOOST_OVP_SHIFT		1
+
+#define LM3533_BOOST_FREQ_MASK		0x01
+#define LM3533_BOOST_FREQ_SHIFT		0
+
+#define LM3533_BL_ID_MASK		1
+#define LM3533_LED_ID_MASK		3
+#define LM3533_BL_ID_MAX		1
+#define LM3533_LED_ID_MAX		3
+
+#define LM3533_HVLED_ID_MAX		2
+#define LM3533_LVLED_ID_MAX		5
+
+#define LM3533_REG_OUTPUT_CONF1		0x10
+#define LM3533_REG_OUTPUT_CONF2		0x11
+#define LM3533_REG_BOOST_PWM		0x2c
+
+#define LM3533_REG_MAX			0xb2
+
+
+static struct mfd_cell lm3533_als_devs[] = {
+	{
+		.name	= "lm3533-als",
+		.id	= -1,
+	},
+};
+
+static struct mfd_cell lm3533_bl_devs[] = {
+	{
+		.name	= "lm3533-backlight",
+		.id	= 0,
+	},
+	{
+		.name	= "lm3533-backlight",
+		.id	= 1,
+	},
+};
+
+static struct mfd_cell lm3533_led_devs[] = {
+	{
+		.name	= "lm3533-leds",
+		.id	= 0,
+	},
+	{
+		.name	= "lm3533-leds",
+		.id	= 1,
+	},
+	{
+		.name	= "lm3533-leds",
+		.id	= 2,
+	},
+	{
+		.name	= "lm3533-leds",
+		.id	= 3,
+	},
+};
+
+int lm3533_read(struct lm3533 *lm3533, u8 reg, u8 *val)
+{
+	int tmp;
+	int ret;
+
+	ret = regmap_read(lm3533->regmap, reg, &tmp);
+	if (ret < 0) {
+		dev_err(lm3533->dev, "failed to read register %02x: %d\n",
+								reg, ret);
+		return ret;
+	}
+
+	*val = tmp;
+
+	dev_dbg(lm3533->dev, "read [%02x]: %02x\n", reg, *val);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lm3533_read);
+
+int lm3533_write(struct lm3533 *lm3533, u8 reg, u8 val)
+{
+	int ret;
+
+	dev_dbg(lm3533->dev, "write [%02x]: %02x\n", reg, val);
+
+	ret = regmap_write(lm3533->regmap, reg, val);
+	if (ret < 0) {
+		dev_err(lm3533->dev, "failed to write register %02x: %d\n",
+								reg, ret);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lm3533_write);
+
+int lm3533_update(struct lm3533 *lm3533, u8 reg, u8 val, u8 mask)
+{
+	int ret;
+
+	dev_dbg(lm3533->dev, "update [%02x]: %02x/%02x\n", reg, val, mask);
+
+	ret = regmap_update_bits(lm3533->regmap, reg, mask, val);
+	if (ret < 0) {
+		dev_err(lm3533->dev, "failed to update register %02x: %d\n",
+								reg, ret);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lm3533_update);
+
+static int lm3533_set_boost_freq(struct lm3533 *lm3533,
+						enum lm3533_boost_freq freq)
+{
+	int ret;
+
+	ret = lm3533_update(lm3533, LM3533_REG_BOOST_PWM,
+					freq << LM3533_BOOST_FREQ_SHIFT,
+					LM3533_BOOST_FREQ_MASK);
+	if (ret)
+		dev_err(lm3533->dev, "failed to set boost frequency\n");
+
+	return ret;
+}
+
+
+static int lm3533_set_boost_ovp(struct lm3533 *lm3533,
+						enum lm3533_boost_ovp ovp)
+{
+	int ret;
+
+	ret = lm3533_update(lm3533, LM3533_REG_BOOST_PWM,
+					ovp << LM3533_BOOST_OVP_SHIFT,
+					LM3533_BOOST_OVP_MASK);
+	if (ret)
+		dev_err(lm3533->dev, "failed to set boost ovp\n");
+
+	return ret;
+}
+
+/*
+ * HVLED output config -- output hvled controlled by backlight bl
+ */
+static int lm3533_set_hvled_config(struct lm3533 *lm3533, u8 hvled, u8 bl)
+{
+	u8 val;
+	u8 mask;
+	int shift;
+	int ret;
+
+	if (hvled == 0 || hvled > LM3533_HVLED_ID_MAX)
+		return -EINVAL;
+
+	if (bl > LM3533_BL_ID_MAX)
+		return -EINVAL;
+
+	shift = hvled - 1;
+	mask = LM3533_BL_ID_MASK << shift;
+	val = bl << shift;
+
+	ret = lm3533_update(lm3533, LM3533_REG_OUTPUT_CONF1, val, mask);
+	if (ret)
+		dev_err(lm3533->dev, "failed to set hvled config\n");
+
+	return ret;
+}
+
+/*
+ * LVLED output config -- output lvled controlled by LED led
+ */
+static int lm3533_set_lvled_config(struct lm3533 *lm3533, u8 lvled, u8 led)
+{
+	u8 reg;
+	u8 val;
+	u8 mask;
+	int shift;
+	int ret;
+
+	if (lvled == 0 || lvled > LM3533_LVLED_ID_MAX)
+		return -EINVAL;
+
+	if (led > LM3533_LED_ID_MAX)
+		return -EINVAL;
+
+	if (lvled < 4) {
+		reg = LM3533_REG_OUTPUT_CONF1;
+		shift = 2 * lvled;
+	} else {
+		reg = LM3533_REG_OUTPUT_CONF2;
+		shift = 2 * (lvled - 4);
+	}
+
+	mask = LM3533_LED_ID_MASK << shift;
+	val = led << shift;
+
+	ret = lm3533_update(lm3533, reg, val, mask);
+	if (ret)
+		dev_err(lm3533->dev, "failed to set lvled config\n");
+
+	return ret;
+}
+
+static void lm3533_enable(struct lm3533 *lm3533)
+{
+	if (gpio_is_valid(lm3533->gpio_hwen))
+		gpio_set_value(lm3533->gpio_hwen, 1);
+}
+
+static void lm3533_disable(struct lm3533 *lm3533)
+{
+	if (gpio_is_valid(lm3533->gpio_hwen))
+		gpio_set_value(lm3533->gpio_hwen, 0);
+}
+
+enum lm3533_attribute_type {
+	LM3533_ATTR_TYPE_BACKLIGHT,
+	LM3533_ATTR_TYPE_LED,
+};
+
+struct lm3533_device_attribute {
+	struct device_attribute dev_attr;
+	enum lm3533_attribute_type type;
+	union {
+		struct {
+			u8 id;
+		} output;
+	} u;
+};
+
+#define to_lm3533_dev_attr(_attr) \
+	container_of(_attr, struct lm3533_device_attribute, dev_attr)
+
+static ssize_t show_output(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct lm3533 *lm3533 = dev_get_drvdata(dev);
+	struct lm3533_device_attribute *lattr = to_lm3533_dev_attr(attr);
+	int id = lattr->u.output.id;
+	u8 reg;
+	u8 val;
+	u8 mask;
+	int shift;
+	int ret;
+
+	if (lattr->type == LM3533_ATTR_TYPE_BACKLIGHT) {
+		reg = LM3533_REG_OUTPUT_CONF1;
+		shift = id - 1;
+		mask = LM3533_BL_ID_MASK << shift;
+	} else {
+		if (id < 4) {
+			reg = LM3533_REG_OUTPUT_CONF1;
+			shift = 2 * id;
+		} else {
+			reg = LM3533_REG_OUTPUT_CONF2;
+			shift = 2 * (id - 4);
+		}
+		mask = LM3533_LED_ID_MASK << shift;
+	}
+
+	ret = lm3533_read(lm3533, reg, &val);
+	if (ret)
+		return ret;
+
+	val = (val & mask) >> shift;
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t store_output(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	struct lm3533 *lm3533 = dev_get_drvdata(dev);
+	struct lm3533_device_attribute *lattr = to_lm3533_dev_attr(attr);
+	int id = lattr->u.output.id;
+	u8 val;
+	int ret;
+
+	if (kstrtou8(buf, 0, &val))
+		return -EINVAL;
+
+	if (lattr->type == LM3533_ATTR_TYPE_BACKLIGHT)
+		ret = lm3533_set_hvled_config(lm3533, id, val);
+	else
+		ret = lm3533_set_lvled_config(lm3533, id, val);
+
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+#define LM3533_OUTPUT_ATTR(_name, _mode, _show, _store, _type, _id) \
+	struct lm3533_device_attribute lm3533_dev_attr_##_name = \
+		{ .dev_attr	= __ATTR(_name, _mode, _show, _store), \
+		  .type		= _type, \
+		  .u.output	= { .id = _id }, }
+
+#define LM3533_OUTPUT_ATTR_RW(_name, _type, _id) \
+	LM3533_OUTPUT_ATTR(output_##_name, S_IRUGO | S_IWUSR, \
+					show_output, store_output, _type, _id)
+
+#define LM3533_OUTPUT_HVLED_ATTR_RW(_nr) \
+	LM3533_OUTPUT_ATTR_RW(hvled##_nr, LM3533_ATTR_TYPE_BACKLIGHT, _nr)
+#define LM3533_OUTPUT_LVLED_ATTR_RW(_nr) \
+	LM3533_OUTPUT_ATTR_RW(lvled##_nr, LM3533_ATTR_TYPE_LED, _nr)
+/*
+ * Output config:
+ *
+ * output_hvled<nr>	0-1
+ * output_lvled<nr>	0-3
+ */
+static LM3533_OUTPUT_HVLED_ATTR_RW(1);
+static LM3533_OUTPUT_HVLED_ATTR_RW(2);
+static LM3533_OUTPUT_LVLED_ATTR_RW(1);
+static LM3533_OUTPUT_LVLED_ATTR_RW(2);
+static LM3533_OUTPUT_LVLED_ATTR_RW(3);
+static LM3533_OUTPUT_LVLED_ATTR_RW(4);
+static LM3533_OUTPUT_LVLED_ATTR_RW(5);
+
+static struct attribute *lm3533_attributes[] = {
+	&lm3533_dev_attr_output_hvled1.dev_attr.attr,
+	&lm3533_dev_attr_output_hvled2.dev_attr.attr,
+	&lm3533_dev_attr_output_lvled1.dev_attr.attr,
+	&lm3533_dev_attr_output_lvled2.dev_attr.attr,
+	&lm3533_dev_attr_output_lvled3.dev_attr.attr,
+	&lm3533_dev_attr_output_lvled4.dev_attr.attr,
+	&lm3533_dev_attr_output_lvled5.dev_attr.attr,
+	NULL,
+};
+
+#define to_dev_attr(_attr) \
+	container_of(_attr, struct device_attribute, attr)
+
+static umode_t lm3533_attr_is_visible(struct kobject *kobj,
+					     struct attribute *attr, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct lm3533 *lm3533 = dev_get_drvdata(dev);
+	struct device_attribute *dattr = to_dev_attr(attr);
+	struct lm3533_device_attribute *lattr = to_lm3533_dev_attr(dattr);
+	enum lm3533_attribute_type type = lattr->type;
+	umode_t mode = attr->mode;
+
+	if (!lm3533->have_backlights && type == LM3533_ATTR_TYPE_BACKLIGHT)
+		mode = 0;
+	else if (!lm3533->have_leds && type == LM3533_ATTR_TYPE_LED)
+		mode = 0;
+
+	return mode;
+};
+
+static struct attribute_group lm3533_attribute_group = {
+	.is_visible	= lm3533_attr_is_visible,
+	.attrs		= lm3533_attributes
+};
+
+static int __devinit lm3533_device_als_init(struct lm3533 *lm3533)
+{
+	struct lm3533_platform_data *pdata = lm3533->dev->platform_data;
+	int ret;
+
+	if (!pdata->als)
+		return 0;
+
+	lm3533_als_devs[0].platform_data = pdata->als;
+	lm3533_als_devs[0].pdata_size = sizeof(*pdata->als);
+
+	ret = mfd_add_devices(lm3533->dev, 0, lm3533_als_devs, 1, NULL, 0);
+	if (ret) {
+		dev_err(lm3533->dev, "failed to add ALS device\n");
+		return ret;
+	}
+
+	lm3533->have_als = 1;
+
+	return 0;
+}
+
+static int __devinit lm3533_device_bl_init(struct lm3533 *lm3533)
+{
+	struct lm3533_platform_data *pdata = lm3533->dev->platform_data;
+	int i;
+	int ret;
+
+	if (!pdata->backlights || pdata->num_backlights == 0)
+		return 0;
+
+	if (pdata->num_backlights > ARRAY_SIZE(lm3533_bl_devs))
+		pdata->num_backlights = ARRAY_SIZE(lm3533_bl_devs);
+
+	for (i = 0; i < pdata->num_backlights; ++i) {
+		lm3533_bl_devs[i].platform_data = &pdata->backlights[i];
+		lm3533_bl_devs[i].pdata_size = sizeof(pdata->backlights[i]);
+	}
+
+	ret = mfd_add_devices(lm3533->dev, 0, lm3533_bl_devs,
+					pdata->num_backlights, NULL, 0);
+	if (ret) {
+		dev_err(lm3533->dev, "failed to add backlight devices\n");
+		return ret;
+	}
+
+	lm3533->have_backlights = 1;
+
+	return 0;
+}
+
+static int __devinit lm3533_device_led_init(struct lm3533 *lm3533)
+{
+	struct lm3533_platform_data *pdata = lm3533->dev->platform_data;
+	int i;
+	int ret;
+
+	if (!pdata->leds || pdata->num_leds == 0)
+		return 0;
+
+	if (pdata->num_leds > ARRAY_SIZE(lm3533_led_devs))
+		pdata->num_leds = ARRAY_SIZE(lm3533_led_devs);
+
+	for (i = 0; i < pdata->num_leds; ++i) {
+		lm3533_led_devs[i].platform_data = &pdata->leds[i];
+		lm3533_led_devs[i].pdata_size = sizeof(pdata->leds[i]);
+	}
+
+	ret = mfd_add_devices(lm3533->dev, 0, lm3533_led_devs,
+						pdata->num_leds, NULL, 0);
+	if (ret) {
+		dev_err(lm3533->dev, "failed to add LED devices\n");
+		return ret;
+	}
+
+	lm3533->have_leds = 1;
+
+	return 0;
+}
+
+static int __devinit lm3533_device_setup(struct lm3533 *lm3533,
+					struct lm3533_platform_data *pdata)
+{
+	int ret;
+
+	ret = lm3533_set_boost_freq(lm3533, pdata->boost_freq);
+	if (ret)
+		return ret;
+
+	ret = lm3533_set_boost_ovp(lm3533, pdata->boost_ovp);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int __devinit lm3533_device_init(struct lm3533 *lm3533)
+{
+	struct lm3533_platform_data *pdata = lm3533->dev->platform_data;
+	int ret;
+
+	dev_dbg(lm3533->dev, "%s\n", __func__);
+
+	if (!pdata) {
+		dev_err(lm3533->dev, "no platform data\n");
+		return -EINVAL;
+	}
+
+	lm3533->gpio_hwen = pdata->gpio_hwen;
+
+	dev_set_drvdata(lm3533->dev, lm3533);
+
+	if (gpio_is_valid(lm3533->gpio_hwen)) {
+		ret = gpio_request_one(lm3533->gpio_hwen, GPIOF_OUT_INIT_LOW,
+								"lm3533-hwen");
+		if (ret < 0) {
+			dev_err(lm3533->dev,
+				"failed to request HWEN GPIO %d\n",
+				lm3533->gpio_hwen);
+			return ret;
+		}
+	}
+
+	lm3533_enable(lm3533);
+
+	ret = lm3533_device_setup(lm3533, pdata);
+	if (ret)
+		goto err_disable;
+
+	lm3533_device_als_init(lm3533);
+	lm3533_device_bl_init(lm3533);
+	lm3533_device_led_init(lm3533);
+
+	ret = sysfs_create_group(&lm3533->dev->kobj, &lm3533_attribute_group);
+	if (ret < 0) {
+		dev_err(lm3533->dev, "failed to create sysfs attributes\n");
+		goto err_unregister;
+	}
+
+	return 0;
+
+err_unregister:
+	mfd_remove_devices(lm3533->dev);
+err_disable:
+	lm3533_disable(lm3533);
+	if (gpio_is_valid(lm3533->gpio_hwen))
+		gpio_free(lm3533->gpio_hwen);
+
+	return ret;
+}
+
+static void __devexit lm3533_device_exit(struct lm3533 *lm3533)
+{
+	dev_dbg(lm3533->dev, "%s\n", __func__);
+
+	sysfs_remove_group(&lm3533->dev->kobj, &lm3533_attribute_group);
+
+	mfd_remove_devices(lm3533->dev);
+	lm3533_disable(lm3533);
+	if (gpio_is_valid(lm3533->gpio_hwen))
+		gpio_free(lm3533->gpio_hwen);
+}
+
+static bool lm3533_readable_register(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case 0x10 ... 0x2c:
+	case 0x30 ... 0x38:
+	case 0x40 ... 0x45:
+	case 0x50 ... 0x57:
+	case 0x60 ... 0x6e:
+	case 0x70 ... 0x75:
+	case 0x80 ... 0x85:
+	case 0x90 ... 0x95:
+	case 0xa0 ... 0xa5:
+	case 0xb0 ... 0xb2:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool lm3533_volatile_register(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case 0x34 ... 0x36:	/* zone */
+	case 0x37 ... 0x38:	/* adc */
+	case 0xb0 ... 0xb1:	/* fault */
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool lm3533_precious_register(struct device *dev, unsigned int reg)
+{
+	switch (reg) {
+	case 0x34:		/* zone */
+		return true;
+	default:
+		return false;
+	}
+}
+
+static struct regmap_config regmap_config = {
+	.reg_bits	= 8,
+	.val_bits	= 8,
+	.max_register	= LM3533_REG_MAX,
+	.readable_reg	= lm3533_readable_register,
+	.volatile_reg	= lm3533_volatile_register,
+	.precious_reg	= lm3533_precious_register,
+};
+
+static int __devinit lm3533_i2c_probe(struct i2c_client *i2c,
+					const struct i2c_device_id *id)
+{
+	struct lm3533 *lm3533;
+	int ret;
+
+	dev_dbg(&i2c->dev, "%s\n", __func__);
+
+	lm3533 = devm_kzalloc(&i2c->dev, sizeof(*lm3533), GFP_KERNEL);
+	if (!lm3533)
+		return -ENOMEM;
+
+	i2c_set_clientdata(i2c, lm3533);
+
+	lm3533->regmap = devm_regmap_init_i2c(i2c, &regmap_config);
+	if (IS_ERR(lm3533->regmap))
+		return PTR_ERR(lm3533->regmap);
+
+	lm3533->dev = &i2c->dev;
+	lm3533->irq = i2c->irq;
+
+	ret = lm3533_device_init(lm3533);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int __devexit lm3533_i2c_remove(struct i2c_client *i2c)
+{
+	struct lm3533 *lm3533 = i2c_get_clientdata(i2c);
+
+	dev_dbg(&i2c->dev, "%s\n", __func__);
+
+	lm3533_device_exit(lm3533);
+
+	return 0;
+}
+
+static const struct i2c_device_id lm3533_i2c_ids[] = {
+	{ "lm3533", 0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(i2c, lm3533_i2c_ids);
+
+static struct i2c_driver lm3533_i2c_driver = {
+	.driver = {
+		   .name = "lm3533",
+		   .owner = THIS_MODULE,
+	},
+	.id_table	= lm3533_i2c_ids,
+	.probe		= lm3533_i2c_probe,
+	.remove		= __devexit_p(lm3533_i2c_remove),
+};
+
+static int __init lm3533_i2c_init(void)
+{
+	return i2c_add_driver(&lm3533_i2c_driver);
+}
+subsys_initcall(lm3533_i2c_init);
+
+static void __exit lm3533_i2c_exit(void)
+{
+	i2c_del_driver(&lm3533_i2c_driver);
+}
+module_exit(lm3533_i2c_exit);
+
+MODULE_AUTHOR("Johan Hovold <[email protected]>");
+MODULE_DESCRIPTION("LM3533 Core");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/lm3533-ctrlbank.c b/drivers/mfd/lm3533-ctrlbank.c
new file mode 100644
index 000000000000..a4cb7a5220a7
--- /dev/null
+++ b/drivers/mfd/lm3533-ctrlbank.c
@@ -0,0 +1,148 @@
+/*
+ * lm3533-ctrlbank.c -- LM3533 Generic Control Bank interface
+ *
+ * Copyright (C) 2011-2012 Texas Instruments
+ *
+ * Author: Johan Hovold <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under  the terms of the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+
+#include <linux/mfd/lm3533.h>
+
+
+#define LM3533_MAX_CURRENT_MIN		5000
+#define LM3533_MAX_CURRENT_MAX		29800
+#define LM3533_MAX_CURRENT_STEP		800
+
+#define LM3533_BRIGHTNESS_MAX		255
+#define LM3533_PWM_MAX			0x3f
+
+#define LM3533_REG_PWM_BASE		0x14
+#define LM3533_REG_MAX_CURRENT_BASE	0x1f
+#define LM3533_REG_CTRLBANK_ENABLE	0x27
+#define LM3533_REG_BRIGHTNESS_BASE	0x40
+
+
+static inline u8 lm3533_ctrlbank_get_reg(struct lm3533_ctrlbank *cb, u8 base)
+{
+	return base + cb->id;
+}
+
+int lm3533_ctrlbank_enable(struct lm3533_ctrlbank *cb)
+{
+	u8 mask;
+	int ret;
+
+	dev_dbg(cb->dev, "%s - %d\n", __func__, cb->id);
+
+	mask = 1 << cb->id;
+	ret = lm3533_update(cb->lm3533, LM3533_REG_CTRLBANK_ENABLE,
+								mask, mask);
+	if (ret)
+		dev_err(cb->dev, "failed to enable ctrlbank %d\n", cb->id);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lm3533_ctrlbank_enable);
+
+int lm3533_ctrlbank_disable(struct lm3533_ctrlbank *cb)
+{
+	u8 mask;
+	int ret;
+
+	dev_dbg(cb->dev, "%s - %d\n", __func__, cb->id);
+
+	mask = 1 << cb->id;
+	ret = lm3533_update(cb->lm3533, LM3533_REG_CTRLBANK_ENABLE, 0, mask);
+	if (ret)
+		dev_err(cb->dev, "failed to disable ctrlbank %d\n", cb->id);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lm3533_ctrlbank_disable);
+
+/*
+ * Full-scale current.
+ *
+ * imax		5000 - 29800 uA (800 uA step)
+ */
+int lm3533_ctrlbank_set_max_current(struct lm3533_ctrlbank *cb, u16 imax)
+{
+	u8 reg;
+	u8 val;
+	int ret;
+
+	if (imax < LM3533_MAX_CURRENT_MIN || imax > LM3533_MAX_CURRENT_MAX)
+		return -EINVAL;
+
+	val = (imax - LM3533_MAX_CURRENT_MIN) / LM3533_MAX_CURRENT_STEP;
+
+	reg = lm3533_ctrlbank_get_reg(cb, LM3533_REG_MAX_CURRENT_BASE);
+	ret = lm3533_write(cb->lm3533, reg, val);
+	if (ret)
+		dev_err(cb->dev, "failed to set max current\n");
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lm3533_ctrlbank_set_max_current);
+
+#define lm3533_ctrlbank_set(_name, _NAME)				\
+int lm3533_ctrlbank_set_##_name(struct lm3533_ctrlbank *cb, u8 val)	\
+{									\
+	u8 reg;								\
+	int ret;							\
+									\
+	if (val > LM3533_##_NAME##_MAX)					\
+		return -EINVAL;						\
+									\
+	reg = lm3533_ctrlbank_get_reg(cb, LM3533_REG_##_NAME##_BASE);	\
+	ret = lm3533_write(cb->lm3533, reg, val);			\
+	if (ret)							\
+		dev_err(cb->dev, "failed to set " #_name "\n");		\
+									\
+	return ret;							\
+}									\
+EXPORT_SYMBOL_GPL(lm3533_ctrlbank_set_##_name);
+
+#define lm3533_ctrlbank_get(_name, _NAME)				\
+int lm3533_ctrlbank_get_##_name(struct lm3533_ctrlbank *cb, u8 *val)	\
+{									\
+	u8 reg;								\
+	int ret;							\
+									\
+	reg = lm3533_ctrlbank_get_reg(cb, LM3533_REG_##_NAME##_BASE);	\
+	ret = lm3533_read(cb->lm3533, reg, val);			\
+	if (ret)							\
+		dev_err(cb->dev, "failed to get " #_name "\n");		\
+									\
+	return ret;							\
+}									\
+EXPORT_SYMBOL_GPL(lm3533_ctrlbank_get_##_name);
+
+lm3533_ctrlbank_set(brightness, BRIGHTNESS);
+lm3533_ctrlbank_get(brightness, BRIGHTNESS);
+
+/*
+ * PWM-input control mask:
+ *
+ *   bit 5 - PWM-input enabled in Zone 4
+ *   bit 4 - PWM-input enabled in Zone 3
+ *   bit 3 - PWM-input enabled in Zone 2
+ *   bit 2 - PWM-input enabled in Zone 1
+ *   bit 1 - PWM-input enabled in Zone 0
+ *   bit 0 - PWM-input enabled
+ */
+lm3533_ctrlbank_set(pwm, PWM);
+lm3533_ctrlbank_get(pwm, PWM);
+
+
+MODULE_AUTHOR("Johan Hovold <[email protected]>");
+MODULE_DESCRIPTION("LM3533 Control Bank interface");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/lpc_ich.c b/drivers/mfd/lpc_ich.c
new file mode 100644
index 000000000000..027cc8f86132
--- /dev/null
+++ b/drivers/mfd/lpc_ich.c
@@ -0,0 +1,888 @@
+/*
+ *  lpc_ich.c - LPC interface for Intel ICH
+ *
+ *  LPC bridge function of the Intel ICH contains many other
+ *  functional units, such as Interrupt controllers, Timers,
+ *  Power Management, System Management, GPIO, RTC, and LPC
+ *  Configuration Registers.
+ *
+ *  This driver is derived from lpc_sch.
+
+ *  Copyright (c) 2011 Extreme Engineering Solution, Inc.
+ *  Author: Aaron Sierra <[email protected]>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License 2 as published
+ *  by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *  This driver supports the following I/O Controller hubs:
+ *	(See the intel documentation on http://developer.intel.com.)
+ *	document number 290655-003, 290677-014: 82801AA (ICH), 82801AB (ICHO)
+ *	document number 290687-002, 298242-027: 82801BA (ICH2)
+ *	document number 290733-003, 290739-013: 82801CA (ICH3-S)
+ *	document number 290716-001, 290718-007: 82801CAM (ICH3-M)
+ *	document number 290744-001, 290745-025: 82801DB (ICH4)
+ *	document number 252337-001, 252663-008: 82801DBM (ICH4-M)
+ *	document number 273599-001, 273645-002: 82801E (C-ICH)
+ *	document number 252516-001, 252517-028: 82801EB (ICH5), 82801ER (ICH5R)
+ *	document number 300641-004, 300884-013: 6300ESB
+ *	document number 301473-002, 301474-026: 82801F (ICH6)
+ *	document number 313082-001, 313075-006: 631xESB, 632xESB
+ *	document number 307013-003, 307014-024: 82801G (ICH7)
+ *	document number 322896-001, 322897-001: NM10
+ *	document number 313056-003, 313057-017: 82801H (ICH8)
+ *	document number 316972-004, 316973-012: 82801I (ICH9)
+ *	document number 319973-002, 319974-002: 82801J (ICH10)
+ *	document number 322169-001, 322170-003: 5 Series, 3400 Series (PCH)
+ *	document number 320066-003, 320257-008: EP80597 (IICH)
+ *	document number 324645-001, 324646-001: Cougar Point (CPT)
+ *	document number TBD : Patsburg (PBG)
+ *	document number TBD : DH89xxCC
+ *	document number TBD : Panther Point
+ *	document number TBD : Lynx Point
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/acpi.h>
+#include <linux/pci.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/lpc_ich.h>
+
+#define ACPIBASE		0x40
+#define ACPIBASE_GPE_OFF	0x28
+#define ACPIBASE_GPE_END	0x2f
+#define ACPIBASE_SMI_OFF	0x30
+#define ACPIBASE_SMI_END	0x33
+#define ACPIBASE_TCO_OFF	0x60
+#define ACPIBASE_TCO_END	0x7f
+#define ACPICTRL		0x44
+
+#define ACPIBASE_GCS_OFF	0x3410
+#define ACPIBASE_GCS_END	0x3414
+
+#define GPIOBASE		0x48
+#define GPIOCTRL		0x4C
+
+#define RCBABASE		0xf0
+
+#define wdt_io_res(i) wdt_res(0, i)
+#define wdt_mem_res(i) wdt_res(ICH_RES_MEM_OFF, i)
+#define wdt_res(b, i) (&wdt_ich_res[(b) + (i)])
+
+static int lpc_ich_acpi_save = -1;
+static int lpc_ich_gpio_save = -1;
+
+static struct resource wdt_ich_res[] = {
+	/* ACPI - TCO */
+	{
+		.flags = IORESOURCE_IO,
+	},
+	/* ACPI - SMI */
+	{
+		.flags = IORESOURCE_IO,
+	},
+	/* GCS */
+	{
+		.flags = IORESOURCE_MEM,
+	},
+};
+
+static struct resource gpio_ich_res[] = {
+	/* GPIO */
+	{
+		.flags = IORESOURCE_IO,
+	},
+	/* ACPI - GPE0 */
+	{
+		.flags = IORESOURCE_IO,
+	},
+};
+
+enum lpc_cells {
+	LPC_WDT = 0,
+	LPC_GPIO,
+};
+
+static struct mfd_cell lpc_ich_cells[] = {
+	[LPC_WDT] = {
+		.name = "iTCO_wdt",
+		.num_resources = ARRAY_SIZE(wdt_ich_res),
+		.resources = wdt_ich_res,
+		.ignore_resource_conflicts = true,
+	},
+	[LPC_GPIO] = {
+		.name = "gpio_ich",
+		.num_resources = ARRAY_SIZE(gpio_ich_res),
+		.resources = gpio_ich_res,
+		.ignore_resource_conflicts = true,
+	},
+};
+
+/* chipset related info */
+enum lpc_chipsets {
+	LPC_ICH = 0,	/* ICH */
+	LPC_ICH0,	/* ICH0 */
+	LPC_ICH2,	/* ICH2 */
+	LPC_ICH2M,	/* ICH2-M */
+	LPC_ICH3,	/* ICH3-S */
+	LPC_ICH3M,	/* ICH3-M */
+	LPC_ICH4,	/* ICH4 */
+	LPC_ICH4M,	/* ICH4-M */
+	LPC_CICH,	/* C-ICH */
+	LPC_ICH5,	/* ICH5 & ICH5R */
+	LPC_6300ESB,	/* 6300ESB */
+	LPC_ICH6,	/* ICH6 & ICH6R */
+	LPC_ICH6M,	/* ICH6-M */
+	LPC_ICH6W,	/* ICH6W & ICH6RW */
+	LPC_631XESB,	/* 631xESB/632xESB */
+	LPC_ICH7,	/* ICH7 & ICH7R */
+	LPC_ICH7DH,	/* ICH7DH */
+	LPC_ICH7M,	/* ICH7-M & ICH7-U */
+	LPC_ICH7MDH,	/* ICH7-M DH */
+	LPC_NM10,	/* NM10 */
+	LPC_ICH8,	/* ICH8 & ICH8R */
+	LPC_ICH8DH,	/* ICH8DH */
+	LPC_ICH8DO,	/* ICH8DO */
+	LPC_ICH8M,	/* ICH8M */
+	LPC_ICH8ME,	/* ICH8M-E */
+	LPC_ICH9,	/* ICH9 */
+	LPC_ICH9R,	/* ICH9R */
+	LPC_ICH9DH,	/* ICH9DH */
+	LPC_ICH9DO,	/* ICH9DO */
+	LPC_ICH9M,	/* ICH9M */
+	LPC_ICH9ME,	/* ICH9M-E */
+	LPC_ICH10,	/* ICH10 */
+	LPC_ICH10R,	/* ICH10R */
+	LPC_ICH10D,	/* ICH10D */
+	LPC_ICH10DO,	/* ICH10DO */
+	LPC_PCH,	/* PCH Desktop Full Featured */
+	LPC_PCHM,	/* PCH Mobile Full Featured */
+	LPC_P55,	/* P55 */
+	LPC_PM55,	/* PM55 */
+	LPC_H55,	/* H55 */
+	LPC_QM57,	/* QM57 */
+	LPC_H57,	/* H57 */
+	LPC_HM55,	/* HM55 */
+	LPC_Q57,	/* Q57 */
+	LPC_HM57,	/* HM57 */
+	LPC_PCHMSFF,	/* PCH Mobile SFF Full Featured */
+	LPC_QS57,	/* QS57 */
+	LPC_3400,	/* 3400 */
+	LPC_3420,	/* 3420 */
+	LPC_3450,	/* 3450 */
+	LPC_EP80579,	/* EP80579 */
+	LPC_CPT,	/* Cougar Point */
+	LPC_CPTD,	/* Cougar Point Desktop */
+	LPC_CPTM,	/* Cougar Point Mobile */
+	LPC_PBG,	/* Patsburg */
+	LPC_DH89XXCC,	/* DH89xxCC */
+	LPC_PPT,	/* Panther Point */
+	LPC_LPT,	/* Lynx Point */
+};
+
+struct lpc_ich_info lpc_chipset_info[] __devinitdata = {
+	[LPC_ICH] = {
+		.name = "ICH",
+		.iTCO_version = 1,
+	},
+	[LPC_ICH0] = {
+		.name = "ICH0",
+		.iTCO_version = 1,
+	},
+	[LPC_ICH2] = {
+		.name = "ICH2",
+		.iTCO_version = 1,
+	},
+	[LPC_ICH2M] = {
+		.name = "ICH2-M",
+		.iTCO_version = 1,
+	},
+	[LPC_ICH3] = {
+		.name = "ICH3-S",
+		.iTCO_version = 1,
+	},
+	[LPC_ICH3M] = {
+		.name = "ICH3-M",
+		.iTCO_version = 1,
+	},
+	[LPC_ICH4] = {
+		.name = "ICH4",
+		.iTCO_version = 1,
+	},
+	[LPC_ICH4M] = {
+		.name = "ICH4-M",
+		.iTCO_version = 1,
+	},
+	[LPC_CICH] = {
+		.name = "C-ICH",
+		.iTCO_version = 1,
+	},
+	[LPC_ICH5] = {
+		.name = "ICH5 or ICH5R",
+		.iTCO_version = 1,
+	},
+	[LPC_6300ESB] = {
+		.name = "6300ESB",
+		.iTCO_version = 1,
+	},
+	[LPC_ICH6] = {
+		.name = "ICH6 or ICH6R",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V6_GPIO,
+	},
+	[LPC_ICH6M] = {
+		.name = "ICH6-M",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V6_GPIO,
+	},
+	[LPC_ICH6W] = {
+		.name = "ICH6W or ICH6RW",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V6_GPIO,
+	},
+	[LPC_631XESB] = {
+		.name = "631xESB/632xESB",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V6_GPIO,
+	},
+	[LPC_ICH7] = {
+		.name = "ICH7 or ICH7R",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH7DH] = {
+		.name = "ICH7DH",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH7M] = {
+		.name = "ICH7-M or ICH7-U",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH7MDH] = {
+		.name = "ICH7-M DH",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_NM10] = {
+		.name = "NM10",
+		.iTCO_version = 2,
+	},
+	[LPC_ICH8] = {
+		.name = "ICH8 or ICH8R",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH8DH] = {
+		.name = "ICH8DH",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH8DO] = {
+		.name = "ICH8DO",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH8M] = {
+		.name = "ICH8M",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH8ME] = {
+		.name = "ICH8M-E",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V7_GPIO,
+	},
+	[LPC_ICH9] = {
+		.name = "ICH9",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V9_GPIO,
+	},
+	[LPC_ICH9R] = {
+		.name = "ICH9R",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V9_GPIO,
+	},
+	[LPC_ICH9DH] = {
+		.name = "ICH9DH",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V9_GPIO,
+	},
+	[LPC_ICH9DO] = {
+		.name = "ICH9DO",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V9_GPIO,
+	},
+	[LPC_ICH9M] = {
+		.name = "ICH9M",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V9_GPIO,
+	},
+	[LPC_ICH9ME] = {
+		.name = "ICH9M-E",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V9_GPIO,
+	},
+	[LPC_ICH10] = {
+		.name = "ICH10",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V10CONS_GPIO,
+	},
+	[LPC_ICH10R] = {
+		.name = "ICH10R",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V10CONS_GPIO,
+	},
+	[LPC_ICH10D] = {
+		.name = "ICH10D",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V10CORP_GPIO,
+	},
+	[LPC_ICH10DO] = {
+		.name = "ICH10DO",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V10CORP_GPIO,
+	},
+	[LPC_PCH] = {
+		.name = "PCH Desktop Full Featured",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_PCHM] = {
+		.name = "PCH Mobile Full Featured",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_P55] = {
+		.name = "P55",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_PM55] = {
+		.name = "PM55",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_H55] = {
+		.name = "H55",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_QM57] = {
+		.name = "QM57",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_H57] = {
+		.name = "H57",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_HM55] = {
+		.name = "HM55",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_Q57] = {
+		.name = "Q57",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_HM57] = {
+		.name = "HM57",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_PCHMSFF] = {
+		.name = "PCH Mobile SFF Full Featured",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_QS57] = {
+		.name = "QS57",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_3400] = {
+		.name = "3400",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_3420] = {
+		.name = "3420",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_3450] = {
+		.name = "3450",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_EP80579] = {
+		.name = "EP80579",
+		.iTCO_version = 2,
+	},
+	[LPC_CPT] = {
+		.name = "Cougar Point",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_CPTD] = {
+		.name = "Cougar Point Desktop",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_CPTM] = {
+		.name = "Cougar Point Mobile",
+		.iTCO_version = 2,
+		.gpio_version = ICH_V5_GPIO,
+	},
+	[LPC_PBG] = {
+		.name = "Patsburg",
+		.iTCO_version = 2,
+	},
+	[LPC_DH89XXCC] = {
+		.name = "DH89xxCC",
+		.iTCO_version = 2,
+	},
+	[LPC_PPT] = {
+		.name = "Panther Point",
+		.iTCO_version = 2,
+	},
+	[LPC_LPT] = {
+		.name = "Lynx Point",
+		.iTCO_version = 2,
+	},
+};
+
+/*
+ * This data only exists for exporting the supported PCI ids
+ * via MODULE_DEVICE_TABLE.  We do not actually register a
+ * pci_driver, because the I/O Controller Hub has also other
+ * functions that probably will be registered by other drivers.
+ */
+static DEFINE_PCI_DEVICE_TABLE(lpc_ich_ids) = {
+	{ PCI_VDEVICE(INTEL, 0x2410), LPC_ICH},
+	{ PCI_VDEVICE(INTEL, 0x2420), LPC_ICH0},
+	{ PCI_VDEVICE(INTEL, 0x2440), LPC_ICH2},
+	{ PCI_VDEVICE(INTEL, 0x244c), LPC_ICH2M},
+	{ PCI_VDEVICE(INTEL, 0x2480), LPC_ICH3},
+	{ PCI_VDEVICE(INTEL, 0x248c), LPC_ICH3M},
+	{ PCI_VDEVICE(INTEL, 0x24c0), LPC_ICH4},
+	{ PCI_VDEVICE(INTEL, 0x24cc), LPC_ICH4M},
+	{ PCI_VDEVICE(INTEL, 0x2450), LPC_CICH},
+	{ PCI_VDEVICE(INTEL, 0x24d0), LPC_ICH5},
+	{ PCI_VDEVICE(INTEL, 0x25a1), LPC_6300ESB},
+	{ PCI_VDEVICE(INTEL, 0x2640), LPC_ICH6},
+	{ PCI_VDEVICE(INTEL, 0x2641), LPC_ICH6M},
+	{ PCI_VDEVICE(INTEL, 0x2642), LPC_ICH6W},
+	{ PCI_VDEVICE(INTEL, 0x2670), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2671), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2672), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2673), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2674), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2675), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2676), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2677), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2678), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x2679), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x267a), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x267b), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x267c), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x267d), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x267e), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x267f), LPC_631XESB},
+	{ PCI_VDEVICE(INTEL, 0x27b8), LPC_ICH7},
+	{ PCI_VDEVICE(INTEL, 0x27b0), LPC_ICH7DH},
+	{ PCI_VDEVICE(INTEL, 0x27b9), LPC_ICH7M},
+	{ PCI_VDEVICE(INTEL, 0x27bd), LPC_ICH7MDH},
+	{ PCI_VDEVICE(INTEL, 0x27bc), LPC_NM10},
+	{ PCI_VDEVICE(INTEL, 0x2810), LPC_ICH8},
+	{ PCI_VDEVICE(INTEL, 0x2812), LPC_ICH8DH},
+	{ PCI_VDEVICE(INTEL, 0x2814), LPC_ICH8DO},
+	{ PCI_VDEVICE(INTEL, 0x2815), LPC_ICH8M},
+	{ PCI_VDEVICE(INTEL, 0x2811), LPC_ICH8ME},
+	{ PCI_VDEVICE(INTEL, 0x2918), LPC_ICH9},
+	{ PCI_VDEVICE(INTEL, 0x2916), LPC_ICH9R},
+	{ PCI_VDEVICE(INTEL, 0x2912), LPC_ICH9DH},
+	{ PCI_VDEVICE(INTEL, 0x2914), LPC_ICH9DO},
+	{ PCI_VDEVICE(INTEL, 0x2919), LPC_ICH9M},
+	{ PCI_VDEVICE(INTEL, 0x2917), LPC_ICH9ME},
+	{ PCI_VDEVICE(INTEL, 0x3a18), LPC_ICH10},
+	{ PCI_VDEVICE(INTEL, 0x3a16), LPC_ICH10R},
+	{ PCI_VDEVICE(INTEL, 0x3a1a), LPC_ICH10D},
+	{ PCI_VDEVICE(INTEL, 0x3a14), LPC_ICH10DO},
+	{ PCI_VDEVICE(INTEL, 0x3b00), LPC_PCH},
+	{ PCI_VDEVICE(INTEL, 0x3b01), LPC_PCHM},
+	{ PCI_VDEVICE(INTEL, 0x3b02), LPC_P55},
+	{ PCI_VDEVICE(INTEL, 0x3b03), LPC_PM55},
+	{ PCI_VDEVICE(INTEL, 0x3b06), LPC_H55},
+	{ PCI_VDEVICE(INTEL, 0x3b07), LPC_QM57},
+	{ PCI_VDEVICE(INTEL, 0x3b08), LPC_H57},
+	{ PCI_VDEVICE(INTEL, 0x3b09), LPC_HM55},
+	{ PCI_VDEVICE(INTEL, 0x3b0a), LPC_Q57},
+	{ PCI_VDEVICE(INTEL, 0x3b0b), LPC_HM57},
+	{ PCI_VDEVICE(INTEL, 0x3b0d), LPC_PCHMSFF},
+	{ PCI_VDEVICE(INTEL, 0x3b0f), LPC_QS57},
+	{ PCI_VDEVICE(INTEL, 0x3b12), LPC_3400},
+	{ PCI_VDEVICE(INTEL, 0x3b14), LPC_3420},
+	{ PCI_VDEVICE(INTEL, 0x3b16), LPC_3450},
+	{ PCI_VDEVICE(INTEL, 0x5031), LPC_EP80579},
+	{ PCI_VDEVICE(INTEL, 0x1c41), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c42), LPC_CPTD},
+	{ PCI_VDEVICE(INTEL, 0x1c43), LPC_CPTM},
+	{ PCI_VDEVICE(INTEL, 0x1c44), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c45), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c46), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c47), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c48), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c49), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c4a), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c4b), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c4c), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c4d), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c4e), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c4f), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c50), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c51), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c52), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c53), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c54), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c55), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c56), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c57), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c58), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c59), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c5a), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c5b), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c5c), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c5d), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c5e), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1c5f), LPC_CPT},
+	{ PCI_VDEVICE(INTEL, 0x1d40), LPC_PBG},
+	{ PCI_VDEVICE(INTEL, 0x1d41), LPC_PBG},
+	{ PCI_VDEVICE(INTEL, 0x2310), LPC_DH89XXCC},
+	{ PCI_VDEVICE(INTEL, 0x1e40), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e41), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e42), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e43), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e44), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e45), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e46), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e47), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e48), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e49), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e4a), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e4b), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e4c), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e4d), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e4e), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e4f), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e50), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e51), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e52), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e53), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e54), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e55), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e56), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e57), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e58), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e59), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e5a), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e5b), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e5c), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e5d), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e5e), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x1e5f), LPC_PPT},
+	{ PCI_VDEVICE(INTEL, 0x8c40), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c41), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c42), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c43), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c44), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c45), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c46), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c47), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c48), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c49), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c4a), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c4b), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c4c), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c4d), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c4e), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c4f), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c50), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c51), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c52), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c53), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c54), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c55), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c56), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c57), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c58), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c59), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c5a), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c5b), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c5c), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c5d), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c5e), LPC_LPT},
+	{ PCI_VDEVICE(INTEL, 0x8c5f), LPC_LPT},
+	{ 0, },			/* End of list */
+};
+MODULE_DEVICE_TABLE(pci, lpc_ich_ids);
+
+static void lpc_ich_restore_config_space(struct pci_dev *dev)
+{
+	if (lpc_ich_acpi_save >= 0) {
+		pci_write_config_byte(dev, ACPICTRL, lpc_ich_acpi_save);
+		lpc_ich_acpi_save = -1;
+	}
+
+	if (lpc_ich_gpio_save >= 0) {
+		pci_write_config_byte(dev, GPIOCTRL, lpc_ich_gpio_save);
+		lpc_ich_gpio_save = -1;
+	}
+}
+
+static void __devinit lpc_ich_enable_acpi_space(struct pci_dev *dev)
+{
+	u8 reg_save;
+
+	pci_read_config_byte(dev, ACPICTRL, &reg_save);
+	pci_write_config_byte(dev, ACPICTRL, reg_save | 0x10);
+	lpc_ich_acpi_save = reg_save;
+}
+
+static void __devinit lpc_ich_enable_gpio_space(struct pci_dev *dev)
+{
+	u8 reg_save;
+
+	pci_read_config_byte(dev, GPIOCTRL, &reg_save);
+	pci_write_config_byte(dev, GPIOCTRL, reg_save | 0x10);
+	lpc_ich_gpio_save = reg_save;
+}
+
+static void __devinit lpc_ich_finalize_cell(struct mfd_cell *cell,
+					const struct pci_device_id *id)
+{
+	cell->platform_data = &lpc_chipset_info[id->driver_data];
+	cell->pdata_size = sizeof(struct lpc_ich_info);
+}
+
+static int __devinit lpc_ich_init_gpio(struct pci_dev *dev,
+				const struct pci_device_id *id)
+{
+	u32 base_addr_cfg;
+	u32 base_addr;
+	int ret;
+	bool acpi_conflict = false;
+	struct resource *res;
+
+	/* Setup power management base register */
+	pci_read_config_dword(dev, ACPIBASE, &base_addr_cfg);
+	base_addr = base_addr_cfg & 0x0000ff80;
+	if (!base_addr) {
+		dev_err(&dev->dev, "I/O space for ACPI uninitialized\n");
+		lpc_ich_cells[LPC_GPIO].num_resources--;
+		goto gpe0_done;
+	}
+
+	res = &gpio_ich_res[ICH_RES_GPE0];
+	res->start = base_addr + ACPIBASE_GPE_OFF;
+	res->end = base_addr + ACPIBASE_GPE_END;
+	ret = acpi_check_resource_conflict(res);
+	if (ret) {
+		/*
+		 * This isn't fatal for the GPIO, but we have to make sure that
+		 * the platform_device subsystem doesn't see this resource
+		 * or it will register an invalid region.
+		 */
+		lpc_ich_cells[LPC_GPIO].num_resources--;
+		acpi_conflict = true;
+	} else {
+		lpc_ich_enable_acpi_space(dev);
+	}
+
+gpe0_done:
+	/* Setup GPIO base register */
+	pci_read_config_dword(dev, GPIOBASE, &base_addr_cfg);
+	base_addr = base_addr_cfg & 0x0000ff80;
+	if (!base_addr) {
+		dev_err(&dev->dev, "I/O space for GPIO uninitialized\n");
+		ret = -ENODEV;
+		goto gpio_done;
+	}
+
+	/* Older devices provide fewer GPIO and have a smaller resource size. */
+	res = &gpio_ich_res[ICH_RES_GPIO];
+	res->start = base_addr;
+	switch (lpc_chipset_info[id->driver_data].gpio_version) {
+	case ICH_V5_GPIO:
+	case ICH_V10CORP_GPIO:
+		res->end = res->start + 128 - 1;
+		break;
+	default:
+		res->end = res->start + 64 - 1;
+		break;
+	}
+
+	ret = acpi_check_resource_conflict(res);
+	if (ret) {
+		/* this isn't necessarily fatal for the GPIO */
+		acpi_conflict = true;
+		goto gpio_done;
+	}
+	lpc_ich_enable_gpio_space(dev);
+
+	lpc_ich_finalize_cell(&lpc_ich_cells[LPC_GPIO], id);
+	ret = mfd_add_devices(&dev->dev, -1, &lpc_ich_cells[LPC_GPIO],
+				1, NULL, 0);
+
+gpio_done:
+	if (acpi_conflict)
+		pr_warn("Resource conflict(s) found affecting %s\n",
+				lpc_ich_cells[LPC_GPIO].name);
+	return ret;
+}
+
+static int __devinit lpc_ich_init_wdt(struct pci_dev *dev,
+				const struct pci_device_id *id)
+{
+	u32 base_addr_cfg;
+	u32 base_addr;
+	int ret;
+	bool acpi_conflict = false;
+	struct resource *res;
+
+	/* Setup power management base register */
+	pci_read_config_dword(dev, ACPIBASE, &base_addr_cfg);
+	base_addr = base_addr_cfg & 0x0000ff80;
+	if (!base_addr) {
+		dev_err(&dev->dev, "I/O space for ACPI uninitialized\n");
+		ret = -ENODEV;
+		goto wdt_done;
+	}
+
+	res = wdt_io_res(ICH_RES_IO_TCO);
+	res->start = base_addr + ACPIBASE_TCO_OFF;
+	res->end = base_addr + ACPIBASE_TCO_END;
+	ret = acpi_check_resource_conflict(res);
+	if (ret) {
+		acpi_conflict = true;
+		goto wdt_done;
+	}
+
+	res = wdt_io_res(ICH_RES_IO_SMI);
+	res->start = base_addr + ACPIBASE_SMI_OFF;
+	res->end = base_addr + ACPIBASE_SMI_END;
+	ret = acpi_check_resource_conflict(res);
+	if (ret) {
+		acpi_conflict = true;
+		goto wdt_done;
+	}
+	lpc_ich_enable_acpi_space(dev);
+
+	/*
+	 * Get the Memory-Mapped GCS register. To get access to it
+	 * we have to read RCBA from PCI Config space 0xf0 and use
+	 * it as base. GCS = RCBA + ICH6_GCS(0x3410).
+	 */
+	if (lpc_chipset_info[id->driver_data].iTCO_version == 2) {
+		pci_read_config_dword(dev, RCBABASE, &base_addr_cfg);
+		base_addr = base_addr_cfg & 0xffffc000;
+		if (!(base_addr_cfg & 1)) {
+			pr_err("RCBA is disabled by hardware/BIOS, "
+					"device disabled\n");
+			ret = -ENODEV;
+			goto wdt_done;
+		}
+		res = wdt_mem_res(ICH_RES_MEM_GCS);
+		res->start = base_addr + ACPIBASE_GCS_OFF;
+		res->end = base_addr + ACPIBASE_GCS_END;
+		ret = acpi_check_resource_conflict(res);
+		if (ret) {
+			acpi_conflict = true;
+			goto wdt_done;
+		}
+	}
+
+	lpc_ich_finalize_cell(&lpc_ich_cells[LPC_WDT], id);
+	ret = mfd_add_devices(&dev->dev, -1, &lpc_ich_cells[LPC_WDT],
+				1, NULL, 0);
+
+wdt_done:
+	if (acpi_conflict)
+		pr_warn("Resource conflict(s) found affecting %s\n",
+				lpc_ich_cells[LPC_WDT].name);
+	return ret;
+}
+
+static int __devinit lpc_ich_probe(struct pci_dev *dev,
+				const struct pci_device_id *id)
+{
+	int ret;
+	bool cell_added = false;
+
+	ret = lpc_ich_init_wdt(dev, id);
+	if (!ret)
+		cell_added = true;
+
+	ret = lpc_ich_init_gpio(dev, id);
+	if (!ret)
+		cell_added = true;
+
+	/*
+	 * We only care if at least one or none of the cells registered
+	 * successfully.
+	 */
+	if (!cell_added) {
+		lpc_ich_restore_config_space(dev);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static void __devexit lpc_ich_remove(struct pci_dev *dev)
+{
+	mfd_remove_devices(&dev->dev);
+	lpc_ich_restore_config_space(dev);
+}
+
+static struct pci_driver lpc_ich_driver = {
+	.name		= "lpc_ich",
+	.id_table	= lpc_ich_ids,
+	.probe		= lpc_ich_probe,
+	.remove		= __devexit_p(lpc_ich_remove),
+};
+
+static int __init lpc_ich_init(void)
+{
+	return pci_register_driver(&lpc_ich_driver);
+}
+
+static void __exit lpc_ich_exit(void)
+{
+	pci_unregister_driver(&lpc_ich_driver);
+}
+
+module_init(lpc_ich_init);
+module_exit(lpc_ich_exit);
+
+MODULE_AUTHOR("Aaron Sierra <[email protected]>");
+MODULE_DESCRIPTION("LPC interface for Intel ICH");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/lpc_sch.c b/drivers/mfd/lpc_sch.c
index abc421364a45..9f20abc5e393 100644
--- a/drivers/mfd/lpc_sch.c
+++ b/drivers/mfd/lpc_sch.c
@@ -36,6 +36,7 @@
 
 #define GPIOBASE	0x44
 #define GPIO_IO_SIZE	64
+#define GPIO_IO_SIZE_CENTERTON	128
 
 #define WDTBASE		0x84
 #define WDT_IO_SIZE	64
@@ -68,7 +69,7 @@ static struct resource wdt_sch_resource = {
 
 static struct mfd_cell tunnelcreek_cells[] = {
 	{
-		.name = "tunnelcreek_wdt",
+		.name = "ie6xx_wdt",
 		.num_resources = 1,
 		.resources = &wdt_sch_resource,
 	},
@@ -77,6 +78,7 @@ static struct mfd_cell tunnelcreek_cells[] = {
 static DEFINE_PCI_DEVICE_TABLE(lpc_sch_ids) = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SCH_LPC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ITC_LPC) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CENTERTON_ILB) },
 	{ 0, }
 };
 MODULE_DEVICE_TABLE(pci, lpc_sch_ids);
@@ -115,7 +117,11 @@ static int __devinit lpc_sch_probe(struct pci_dev *dev,
 	}
 
 	gpio_sch_resource.start = base_addr;
-	gpio_sch_resource.end = base_addr + GPIO_IO_SIZE - 1;
+
+	if (id->device == PCI_DEVICE_ID_INTEL_CENTERTON_ILB)
+		gpio_sch_resource.end = base_addr + GPIO_IO_SIZE_CENTERTON - 1;
+	else
+		gpio_sch_resource.end = base_addr + GPIO_IO_SIZE - 1;
 
 	for (i=0; i < ARRAY_SIZE(lpc_sch_cells); i++)
 		lpc_sch_cells[i].id = id->device;
@@ -125,7 +131,8 @@ static int __devinit lpc_sch_probe(struct pci_dev *dev,
 	if (ret)
 		goto out_dev;
 
-	if (id->device == PCI_DEVICE_ID_INTEL_ITC_LPC) {
+	if (id->device == PCI_DEVICE_ID_INTEL_ITC_LPC
+	 || id->device == PCI_DEVICE_ID_INTEL_CENTERTON_ILB) {
 		pci_read_config_dword(dev, WDTBASE, &base_addr_cfg);
 		if (!(base_addr_cfg & (1 << 31))) {
 			dev_err(&dev->dev, "Decode of the WDT I/O range disabled\n");
@@ -167,18 +174,7 @@ static struct pci_driver lpc_sch_driver = {
 	.remove		= __devexit_p(lpc_sch_remove),
 };
 
-static int __init lpc_sch_init(void)
-{
-	return pci_register_driver(&lpc_sch_driver);
-}
-
-static void __exit lpc_sch_exit(void)
-{
-	pci_unregister_driver(&lpc_sch_driver);
-}
-
-module_init(lpc_sch_init);
-module_exit(lpc_sch_exit);
+module_pci_driver(lpc_sch_driver);
 
 MODULE_AUTHOR("Denis Turischev <[email protected]>");
 MODULE_DESCRIPTION("LPC interface for Intel Poulsbo SCH");
diff --git a/drivers/mfd/max77693-irq.c b/drivers/mfd/max77693-irq.c
new file mode 100644
index 000000000000..2b403569e0a6
--- /dev/null
+++ b/drivers/mfd/max77693-irq.c
@@ -0,0 +1,309 @@
+/*
+ * max77693-irq.c - Interrupt controller support for MAX77693
+ *
+ * Copyright (C) 2012 Samsung Electronics Co.Ltd
+ * SangYoung Son <[email protected]>
+ *
+ * This program is not provided / owned by Maxim Integrated Products.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * This driver is based on max8997-irq.c
+ */
+
+#include <linux/err.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/irqdomain.h>
+#include <linux/mfd/max77693.h>
+#include <linux/mfd/max77693-private.h>
+
+static const u8 max77693_mask_reg[] = {
+	[LED_INT] = MAX77693_LED_REG_FLASH_INT_MASK,
+	[TOPSYS_INT] = MAX77693_PMIC_REG_TOPSYS_INT_MASK,
+	[CHG_INT] = MAX77693_CHG_REG_CHG_INT_MASK,
+	[MUIC_INT1] = MAX77693_MUIC_REG_INTMASK1,
+	[MUIC_INT2] = MAX77693_MUIC_REG_INTMASK2,
+	[MUIC_INT3] = MAX77693_MUIC_REG_INTMASK3,
+};
+
+static struct regmap *max77693_get_regmap(struct max77693_dev *max77693,
+				enum max77693_irq_source src)
+{
+	switch (src) {
+	case LED_INT ... CHG_INT:
+		return max77693->regmap;
+	case MUIC_INT1 ... MUIC_INT3:
+		return max77693->regmap_muic;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+}
+
+struct max77693_irq_data {
+	int mask;
+	enum max77693_irq_source group;
+};
+
+#define DECLARE_IRQ(idx, _group, _mask)		\
+	[(idx)] = { .group = (_group), .mask = (_mask) }
+static const struct max77693_irq_data max77693_irqs[] = {
+	DECLARE_IRQ(MAX77693_LED_IRQ_FLED2_OPEN,	LED_INT, 1 << 0),
+	DECLARE_IRQ(MAX77693_LED_IRQ_FLED2_SHORT,	LED_INT, 1 << 1),
+	DECLARE_IRQ(MAX77693_LED_IRQ_FLED1_OPEN,	LED_INT, 1 << 2),
+	DECLARE_IRQ(MAX77693_LED_IRQ_FLED1_SHORT,	LED_INT, 1 << 3),
+	DECLARE_IRQ(MAX77693_LED_IRQ_MAX_FLASH,		LED_INT, 1 << 4),
+
+	DECLARE_IRQ(MAX77693_TOPSYS_IRQ_T120C_INT,	TOPSYS_INT, 1 << 0),
+	DECLARE_IRQ(MAX77693_TOPSYS_IRQ_T140C_INT,	TOPSYS_INT, 1 << 1),
+	DECLARE_IRQ(MAX77693_TOPSYS_IRQ_LOWSYS_INT,	TOPSYS_INT, 1 << 3),
+
+	DECLARE_IRQ(MAX77693_CHG_IRQ_BYP_I,		CHG_INT, 1 << 0),
+	DECLARE_IRQ(MAX77693_CHG_IRQ_THM_I,		CHG_INT, 1 << 2),
+	DECLARE_IRQ(MAX77693_CHG_IRQ_BAT_I,		CHG_INT, 1 << 3),
+	DECLARE_IRQ(MAX77693_CHG_IRQ_CHG_I,		CHG_INT, 1 << 4),
+	DECLARE_IRQ(MAX77693_CHG_IRQ_CHGIN_I,		CHG_INT, 1 << 6),
+
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT1_ADC,		MUIC_INT1, 1 << 0),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT1_ADC_LOW,	MUIC_INT1, 1 << 1),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT1_ADC_ERR,	MUIC_INT1, 1 << 2),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT1_ADC1K,	MUIC_INT1, 1 << 3),
+
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT2_CHGTYP,	MUIC_INT2, 1 << 0),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT2_CHGDETREUN,	MUIC_INT2, 1 << 1),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT2_DCDTMR,	MUIC_INT2, 1 << 2),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT2_DXOVP,	MUIC_INT2, 1 << 3),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT2_VBVOLT,	MUIC_INT2, 1 << 4),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT2_VIDRM,	MUIC_INT2, 1 << 5),
+
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT3_EOC,		MUIC_INT3, 1 << 0),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT3_CGMBC,	MUIC_INT3, 1 << 1),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT3_OVP,		MUIC_INT3, 1 << 2),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT3_MBCCHG_ERR,	MUIC_INT3, 1 << 3),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT3_CHG_ENABLED,	MUIC_INT3, 1 << 4),
+	DECLARE_IRQ(MAX77693_MUIC_IRQ_INT3_BAT_DET,	MUIC_INT3, 1 << 5),
+};
+
+static void max77693_irq_lock(struct irq_data *data)
+{
+	struct max77693_dev *max77693 = irq_get_chip_data(data->irq);
+
+	mutex_lock(&max77693->irqlock);
+}
+
+static void max77693_irq_sync_unlock(struct irq_data *data)
+{
+	struct max77693_dev *max77693 = irq_get_chip_data(data->irq);
+	int i;
+
+	for (i = 0; i < MAX77693_IRQ_GROUP_NR; i++) {
+		u8 mask_reg = max77693_mask_reg[i];
+		struct regmap *map = max77693_get_regmap(max77693, i);
+
+		if (mask_reg == MAX77693_REG_INVALID ||
+				IS_ERR_OR_NULL(map))
+			continue;
+		max77693->irq_masks_cache[i] = max77693->irq_masks_cur[i];
+
+		max77693_write_reg(map, max77693_mask_reg[i],
+				max77693->irq_masks_cur[i]);
+	}
+
+	mutex_unlock(&max77693->irqlock);
+}
+
+static const inline struct max77693_irq_data *
+irq_to_max77693_irq(struct max77693_dev *max77693, int irq)
+{
+	return &max77693_irqs[irq];
+}
+
+static void max77693_irq_mask(struct irq_data *data)
+{
+	struct max77693_dev *max77693 = irq_get_chip_data(data->irq);
+	const struct max77693_irq_data *irq_data =
+				irq_to_max77693_irq(max77693, data->irq);
+
+	if (irq_data->group >= MUIC_INT1 && irq_data->group <= MUIC_INT3)
+		max77693->irq_masks_cur[irq_data->group] &= ~irq_data->mask;
+	else
+		max77693->irq_masks_cur[irq_data->group] |= irq_data->mask;
+}
+
+static void max77693_irq_unmask(struct irq_data *data)
+{
+	struct max77693_dev *max77693 = irq_get_chip_data(data->irq);
+	const struct max77693_irq_data *irq_data =
+	    irq_to_max77693_irq(max77693, data->irq);
+
+	if (irq_data->group >= MUIC_INT1 && irq_data->group <= MUIC_INT3)
+		max77693->irq_masks_cur[irq_data->group] |= irq_data->mask;
+	else
+		max77693->irq_masks_cur[irq_data->group] &= ~irq_data->mask;
+}
+
+static struct irq_chip max77693_irq_chip = {
+	.name			= "max77693",
+	.irq_bus_lock		= max77693_irq_lock,
+	.irq_bus_sync_unlock	= max77693_irq_sync_unlock,
+	.irq_mask		= max77693_irq_mask,
+	.irq_unmask		= max77693_irq_unmask,
+};
+
+#define MAX77693_IRQSRC_CHG		(1 << 0)
+#define MAX77693_IRQSRC_TOP		(1 << 1)
+#define MAX77693_IRQSRC_FLASH		(1 << 2)
+#define MAX77693_IRQSRC_MUIC		(1 << 3)
+static irqreturn_t max77693_irq_thread(int irq, void *data)
+{
+	struct max77693_dev *max77693 = data;
+	u8 irq_reg[MAX77693_IRQ_GROUP_NR] = {};
+	u8 irq_src;
+	int ret;
+	int i, cur_irq;
+
+	ret = max77693_read_reg(max77693->regmap, MAX77693_PMIC_REG_INTSRC,
+				&irq_src);
+	if (ret < 0) {
+		dev_err(max77693->dev, "Failed to read interrupt source: %d\n",
+				ret);
+		return IRQ_NONE;
+	}
+
+	if (irq_src & MAX77693_IRQSRC_CHG)
+		/* CHG_INT */
+		ret = max77693_read_reg(max77693->regmap, MAX77693_CHG_REG_CHG_INT,
+				&irq_reg[CHG_INT]);
+
+	if (irq_src & MAX77693_IRQSRC_TOP)
+		/* TOPSYS_INT */
+		ret = max77693_read_reg(max77693->regmap,
+			MAX77693_PMIC_REG_TOPSYS_INT, &irq_reg[TOPSYS_INT]);
+
+	if (irq_src & MAX77693_IRQSRC_FLASH)
+		/* LED_INT */
+		ret = max77693_read_reg(max77693->regmap,
+			MAX77693_LED_REG_FLASH_INT, &irq_reg[LED_INT]);
+
+	if (irq_src & MAX77693_IRQSRC_MUIC)
+		/* MUIC INT1 ~ INT3 */
+		max77693_bulk_read(max77693->regmap, MAX77693_MUIC_REG_INT1,
+			MAX77693_NUM_IRQ_MUIC_REGS, &irq_reg[MUIC_INT1]);
+
+	/* Apply masking */
+	for (i = 0; i < MAX77693_IRQ_GROUP_NR; i++) {
+		if (i >= MUIC_INT1 && i <= MUIC_INT3)
+			irq_reg[i] &= max77693->irq_masks_cur[i];
+		else
+			irq_reg[i] &= ~max77693->irq_masks_cur[i];
+	}
+
+	/* Report */
+	for (i = 0; i < MAX77693_IRQ_NR; i++) {
+		if (irq_reg[max77693_irqs[i].group] & max77693_irqs[i].mask) {
+			cur_irq = irq_find_mapping(max77693->irq_domain, i);
+			if (cur_irq)
+				handle_nested_irq(cur_irq);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+int max77693_irq_resume(struct max77693_dev *max77693)
+{
+	if (max77693->irq)
+		max77693_irq_thread(0, max77693);
+
+	return 0;
+}
+
+static int max77693_irq_domain_map(struct irq_domain *d, unsigned int irq,
+				irq_hw_number_t hw)
+{
+	struct max77693_dev *max77693 = d->host_data;
+
+	irq_set_chip_data(irq, max77693);
+	irq_set_chip_and_handler(irq, &max77693_irq_chip, handle_edge_irq);
+	irq_set_nested_thread(irq, 1);
+#ifdef CONFIG_ARM
+	set_irq_flags(irq, IRQF_VALID);
+#else
+	irq_set_noprobe(irq);
+#endif
+	return 0;
+}
+
+static struct irq_domain_ops max77693_irq_domain_ops = {
+	.map = max77693_irq_domain_map,
+};
+
+int max77693_irq_init(struct max77693_dev *max77693)
+{
+	struct irq_domain *domain;
+	int i;
+	int ret;
+
+	mutex_init(&max77693->irqlock);
+
+	/* Mask individual interrupt sources */
+	for (i = 0; i < MAX77693_IRQ_GROUP_NR; i++) {
+		struct regmap *map;
+		/* MUIC IRQ  0:MASK 1:NOT MASK */
+		/* Other IRQ 1:MASK 0:NOT MASK */
+		if (i >= MUIC_INT1 && i <= MUIC_INT3) {
+			max77693->irq_masks_cur[i] = 0x00;
+			max77693->irq_masks_cache[i] = 0x00;
+		} else {
+			max77693->irq_masks_cur[i] = 0xff;
+			max77693->irq_masks_cache[i] = 0xff;
+		}
+		map = max77693_get_regmap(max77693, i);
+
+		if (IS_ERR_OR_NULL(map))
+			continue;
+		if (max77693_mask_reg[i] == MAX77693_REG_INVALID)
+			continue;
+		if (i >= MUIC_INT1 && i <= MUIC_INT3)
+			max77693_write_reg(map, max77693_mask_reg[i], 0x00);
+		else
+			max77693_write_reg(map, max77693_mask_reg[i], 0xff);
+	}
+
+	domain = irq_domain_add_linear(NULL, MAX77693_IRQ_NR,
+					&max77693_irq_domain_ops, max77693);
+	if (!domain) {
+		dev_err(max77693->dev, "could not create irq domain\n");
+		return -ENODEV;
+	}
+	max77693->irq_domain = domain;
+
+	ret = request_threaded_irq(max77693->irq, NULL, max77693_irq_thread,
+				   IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+				   "max77693-irq", max77693);
+
+	if (ret)
+		dev_err(max77693->dev, "Failed to request IRQ %d: %d\n",
+			max77693->irq, ret);
+
+	return 0;
+}
+
+void max77693_irq_exit(struct max77693_dev *max77693)
+{
+	if (max77693->irq)
+		free_irq(max77693->irq, max77693);
+}
diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c
new file mode 100644
index 000000000000..e9e4278722f3
--- /dev/null
+++ b/drivers/mfd/max77693.c
@@ -0,0 +1,249 @@
+/*
+ * max77693.c - mfd core driver for the MAX 77693
+ *
+ * Copyright (C) 2012 Samsung Electronics
+ * SangYoung Son <[email protected]>
+ *
+ * This program is not provided / owned by Maxim Integrated Products.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * This driver is based on max8997.c
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/pm_runtime.h>
+#include <linux/mutex.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/max77693.h>
+#include <linux/mfd/max77693-private.h>
+#include <linux/regulator/machine.h>
+#include <linux/regmap.h>
+
+#define I2C_ADDR_PMIC	(0xCC >> 1)	/* Charger, Flash LED */
+#define I2C_ADDR_MUIC	(0x4A >> 1)
+#define I2C_ADDR_HAPTIC	(0x90 >> 1)
+
+static struct mfd_cell max77693_devs[] = {
+	{ .name = "max77693-pmic", },
+	{ .name = "max77693-charger", },
+	{ .name = "max77693-flash", },
+	{ .name = "max77693-muic", },
+	{ .name = "max77693-haptic", },
+};
+
+int max77693_read_reg(struct regmap *map, u8 reg, u8 *dest)
+{
+	unsigned int val;
+	int ret;
+
+	ret = regmap_read(map, reg, &val);
+	*dest = val;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(max77693_read_reg);
+
+int max77693_bulk_read(struct regmap *map, u8 reg, int count, u8 *buf)
+{
+	int ret;
+
+	ret = regmap_bulk_read(map, reg, buf, count);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(max77693_bulk_read);
+
+int max77693_write_reg(struct regmap *map, u8 reg, u8 value)
+{
+	int ret;
+
+	ret = regmap_write(map, reg, value);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(max77693_write_reg);
+
+int max77693_bulk_write(struct regmap *map, u8 reg, int count, u8 *buf)
+{
+	int ret;
+
+	ret = regmap_bulk_write(map, reg, buf, count);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(max77693_bulk_write);
+
+int max77693_update_reg(struct regmap *map, u8 reg, u8 val, u8 mask)
+{
+	int ret;
+
+	ret = regmap_update_bits(map, reg, mask, val);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(max77693_update_reg);
+
+static const struct regmap_config max77693_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = MAX77693_PMIC_REG_END,
+};
+
+static int max77693_i2c_probe(struct i2c_client *i2c,
+			      const struct i2c_device_id *id)
+{
+	struct max77693_dev *max77693;
+	struct max77693_platform_data *pdata = i2c->dev.platform_data;
+	u8 reg_data;
+	int ret = 0;
+
+	max77693 = devm_kzalloc(&i2c->dev,
+			sizeof(struct max77693_dev), GFP_KERNEL);
+	if (max77693 == NULL)
+		return -ENOMEM;
+
+	max77693->regmap = devm_regmap_init_i2c(i2c, &max77693_regmap_config);
+	if (IS_ERR(max77693->regmap)) {
+		ret = PTR_ERR(max77693->regmap);
+		dev_err(max77693->dev,"failed to allocate register map: %d\n",
+				ret);
+		goto err_regmap;
+	}
+
+	i2c_set_clientdata(i2c, max77693);
+	max77693->dev = &i2c->dev;
+	max77693->i2c = i2c;
+	max77693->irq = i2c->irq;
+	max77693->type = id->driver_data;
+
+	if (!pdata)
+		goto err_regmap;
+
+	max77693->wakeup = pdata->wakeup;
+
+	mutex_init(&max77693->iolock);
+
+	if (max77693_read_reg(max77693->regmap,
+				MAX77693_PMIC_REG_PMIC_ID2, &reg_data) < 0) {
+		dev_err(max77693->dev, "device not found on this channel\n");
+		ret = -ENODEV;
+		goto err_regmap;
+	} else
+		dev_info(max77693->dev, "device ID: 0x%x\n", reg_data);
+
+	max77693->muic = i2c_new_dummy(i2c->adapter, I2C_ADDR_MUIC);
+	i2c_set_clientdata(max77693->muic, max77693);
+
+	max77693->haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC);
+	i2c_set_clientdata(max77693->haptic, max77693);
+
+	ret = max77693_irq_init(max77693);
+	if (ret < 0)
+		goto err_mfd;
+
+	pm_runtime_set_active(max77693->dev);
+
+	ret = mfd_add_devices(max77693->dev, -1, max77693_devs,
+			ARRAY_SIZE(max77693_devs), NULL, 0);
+	if (ret < 0)
+		goto err_mfd;
+
+	device_init_wakeup(max77693->dev, pdata->wakeup);
+
+	return ret;
+
+err_mfd:
+	i2c_unregister_device(max77693->muic);
+	i2c_unregister_device(max77693->haptic);
+err_regmap:
+	kfree(max77693);
+
+	return ret;
+}
+
+static int max77693_i2c_remove(struct i2c_client *i2c)
+{
+	struct max77693_dev *max77693 = i2c_get_clientdata(i2c);
+
+	mfd_remove_devices(max77693->dev);
+	i2c_unregister_device(max77693->muic);
+	i2c_unregister_device(max77693->haptic);
+
+	return 0;
+}
+
+static const struct i2c_device_id max77693_i2c_id[] = {
+	{ "max77693", TYPE_MAX77693 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, max77693_i2c_id);
+
+static int max77693_suspend(struct device *dev)
+{
+	struct i2c_client *i2c = container_of(dev, struct i2c_client, dev);
+	struct max77693_dev *max77693 = i2c_get_clientdata(i2c);
+
+	if (device_may_wakeup(dev))
+		irq_set_irq_wake(max77693->irq, 1);
+	return 0;
+}
+
+static int max77693_resume(struct device *dev)
+{
+	struct i2c_client *i2c = container_of(dev, struct i2c_client, dev);
+	struct max77693_dev *max77693 = i2c_get_clientdata(i2c);
+
+	if (device_may_wakeup(dev))
+		irq_set_irq_wake(max77693->irq, 0);
+	return max77693_irq_resume(max77693);
+}
+
+const struct dev_pm_ops max77693_pm = {
+	.suspend = max77693_suspend,
+	.resume = max77693_resume,
+};
+
+static struct i2c_driver max77693_i2c_driver = {
+	.driver = {
+		   .name = "max77693",
+		   .owner = THIS_MODULE,
+		   .pm = &max77693_pm,
+	},
+	.probe = max77693_i2c_probe,
+	.remove = max77693_i2c_remove,
+	.id_table = max77693_i2c_id,
+};
+
+static int __init max77693_i2c_init(void)
+{
+	return i2c_add_driver(&max77693_i2c_driver);
+}
+/* init early so consumer devices can complete system boot */
+subsys_initcall(max77693_i2c_init);
+
+static void __exit max77693_i2c_exit(void)
+{
+	i2c_del_driver(&max77693_i2c_driver);
+}
+module_exit(max77693_i2c_exit);
+
+MODULE_DESCRIPTION("MAXIM 77693 multi-function core driver");
+MODULE_AUTHOR("SangYoung, Son <[email protected]>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/mc13xxx-core.c b/drivers/mfd/mc13xxx-core.c
index 738722cdecaa..f0ea3b8b3e4a 100644
--- a/drivers/mfd/mc13xxx-core.c
+++ b/drivers/mfd/mc13xxx-core.c
@@ -15,24 +15,13 @@
 #include <linux/platform_device.h>
 #include <linux/mutex.h>
 #include <linux/interrupt.h>
-#include <linux/spi/spi.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/mc13xxx.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
 
-struct mc13xxx {
-	struct spi_device *spidev;
-	struct mutex lock;
-	int irq;
-	int flags;
-
-	irq_handler_t irqhandler[MC13XXX_NUM_IRQ];
-	void *irqdata[MC13XXX_NUM_IRQ];
-
-	int adcflags;
-};
+#include "mc13xxx.h"
 
 #define MC13XXX_IRQSTAT0	0
 #define MC13XXX_IRQSTAT0_ADCDONEI	(1 << 0)
@@ -139,34 +128,29 @@ struct mc13xxx {
 
 #define MC13XXX_ADC2		45
 
-#define MC13XXX_NUMREGS 0x3f
-
 void mc13xxx_lock(struct mc13xxx *mc13xxx)
 {
 	if (!mutex_trylock(&mc13xxx->lock)) {
-		dev_dbg(&mc13xxx->spidev->dev, "wait for %s from %pf\n",
+		dev_dbg(mc13xxx->dev, "wait for %s from %pf\n",
 				__func__, __builtin_return_address(0));
 
 		mutex_lock(&mc13xxx->lock);
 	}
-	dev_dbg(&mc13xxx->spidev->dev, "%s from %pf\n",
+	dev_dbg(mc13xxx->dev, "%s from %pf\n",
 			__func__, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(mc13xxx_lock);
 
 void mc13xxx_unlock(struct mc13xxx *mc13xxx)
 {
-	dev_dbg(&mc13xxx->spidev->dev, "%s from %pf\n",
+	dev_dbg(mc13xxx->dev, "%s from %pf\n",
 			__func__, __builtin_return_address(0));
 	mutex_unlock(&mc13xxx->lock);
 }
 EXPORT_SYMBOL(mc13xxx_unlock);
 
-#define MC13XXX_REGOFFSET_SHIFT 25
 int mc13xxx_reg_read(struct mc13xxx *mc13xxx, unsigned int offset, u32 *val)
 {
-	struct spi_transfer t;
-	struct spi_message m;
 	int ret;
 
 	BUG_ON(!mutex_is_locked(&mc13xxx->lock));
@@ -174,84 +158,35 @@ int mc13xxx_reg_read(struct mc13xxx *mc13xxx, unsigned int offset, u32 *val)
 	if (offset > MC13XXX_NUMREGS)
 		return -EINVAL;
 
-	*val = offset << MC13XXX_REGOFFSET_SHIFT;
-
-	memset(&t, 0, sizeof(t));
-
-	t.tx_buf = val;
-	t.rx_buf = val;
-	t.len = sizeof(u32);
-
-	spi_message_init(&m);
-	spi_message_add_tail(&t, &m);
-
-	ret = spi_sync(mc13xxx->spidev, &m);
-
-	/* error in message.status implies error return from spi_sync */
-	BUG_ON(!ret && m.status);
+	ret = regmap_read(mc13xxx->regmap, offset, val);
+	dev_vdbg(mc13xxx->dev, "[0x%02x] -> 0x%06x\n", offset, *val);
 
-	if (ret)
-		return ret;
-
-	*val &= 0xffffff;
-
-	dev_vdbg(&mc13xxx->spidev->dev, "[0x%02x] -> 0x%06x\n", offset, *val);
-
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(mc13xxx_reg_read);
 
 int mc13xxx_reg_write(struct mc13xxx *mc13xxx, unsigned int offset, u32 val)
 {
-	u32 buf;
-	struct spi_transfer t;
-	struct spi_message m;
-	int ret;
-
 	BUG_ON(!mutex_is_locked(&mc13xxx->lock));
 
-	dev_vdbg(&mc13xxx->spidev->dev, "[0x%02x] <- 0x%06x\n", offset, val);
+	dev_vdbg(mc13xxx->dev, "[0x%02x] <- 0x%06x\n", offset, val);
 
 	if (offset > MC13XXX_NUMREGS || val > 0xffffff)
 		return -EINVAL;
 
-	buf = 1 << 31 | offset << MC13XXX_REGOFFSET_SHIFT | val;
-
-	memset(&t, 0, sizeof(t));
-
-	t.tx_buf = &buf;
-	t.rx_buf = &buf;
-	t.len = sizeof(u32);
-
-	spi_message_init(&m);
-	spi_message_add_tail(&t, &m);
-
-	ret = spi_sync(mc13xxx->spidev, &m);
-
-	BUG_ON(!ret && m.status);
-
-	if (ret)
-		return ret;
-
-	return 0;
+	return regmap_write(mc13xxx->regmap, offset, val);
 }
 EXPORT_SYMBOL(mc13xxx_reg_write);
 
 int mc13xxx_reg_rmw(struct mc13xxx *mc13xxx, unsigned int offset,
 		u32 mask, u32 val)
 {
-	int ret;
-	u32 valread;
-
+	BUG_ON(!mutex_is_locked(&mc13xxx->lock));
 	BUG_ON(val & ~mask);
+	dev_vdbg(mc13xxx->dev, "[0x%02x] <- 0x%06x (mask: 0x%06x)\n",
+			offset, val, mask);
 
-	ret = mc13xxx_reg_read(mc13xxx, offset, &valread);
-	if (ret)
-		return ret;
-
-	valread = (valread & ~mask) | val;
-
-	return mc13xxx_reg_write(mc13xxx, offset, valread);
+	return regmap_update_bits(mc13xxx->regmap, offset, mask, val);
 }
 EXPORT_SYMBOL(mc13xxx_reg_rmw);
 
@@ -439,7 +374,7 @@ static int mc13xxx_irq_handle(struct mc13xxx *mc13xxx,
 			if (handled == IRQ_HANDLED)
 				num_handled++;
 		} else {
-			dev_err(&mc13xxx->spidev->dev,
+			dev_err(mc13xxx->dev,
 					"BUG: irq %u but no handler\n",
 					baseirq + irq);
 
@@ -475,25 +410,23 @@ static irqreturn_t mc13xxx_irq_thread(int irq, void *data)
 	return IRQ_RETVAL(handled);
 }
 
-enum mc13xxx_id {
-	MC13XXX_ID_MC13783,
-	MC13XXX_ID_MC13892,
-	MC13XXX_ID_INVALID,
-};
-
 static const char *mc13xxx_chipname[] = {
 	[MC13XXX_ID_MC13783] = "mc13783",
 	[MC13XXX_ID_MC13892] = "mc13892",
 };
 
 #define maskval(reg, mask)	(((reg) & (mask)) >> __ffs(mask))
-static int mc13xxx_identify(struct mc13xxx *mc13xxx, enum mc13xxx_id *id)
+static int mc13xxx_identify(struct mc13xxx *mc13xxx)
 {
 	u32 icid;
 	u32 revision;
-	const char *name;
 	int ret;
 
+	/*
+	 * Get the generation ID from register 46, as apparently some older
+	 * IC revisions only have this info at this location. Newer ICs seem to
+	 * have both.
+	 */
 	ret = mc13xxx_reg_read(mc13xxx, 46, &icid);
 	if (ret)
 		return ret;
@@ -502,26 +435,23 @@ static int mc13xxx_identify(struct mc13xxx *mc13xxx, enum mc13xxx_id *id)
 
 	switch (icid) {
 	case 2:
-		*id = MC13XXX_ID_MC13783;
-		name = "mc13783";
+		mc13xxx->ictype = MC13XXX_ID_MC13783;
 		break;
 	case 7:
-		*id = MC13XXX_ID_MC13892;
-		name = "mc13892";
+		mc13xxx->ictype = MC13XXX_ID_MC13892;
 		break;
 	default:
-		*id = MC13XXX_ID_INVALID;
+		mc13xxx->ictype = MC13XXX_ID_INVALID;
 		break;
 	}
 
-	if (*id == MC13XXX_ID_MC13783 || *id == MC13XXX_ID_MC13892) {
+	if (mc13xxx->ictype == MC13XXX_ID_MC13783 ||
+			mc13xxx->ictype == MC13XXX_ID_MC13892) {
 		ret = mc13xxx_reg_read(mc13xxx, MC13XXX_REVISION, &revision);
-		if (ret)
-			return ret;
 
-		dev_info(&mc13xxx->spidev->dev, "%s: rev: %d.%d, "
+		dev_info(mc13xxx->dev, "%s: rev: %d.%d, "
 				"fin: %d, fab: %d, icid: %d/%d\n",
-				mc13xxx_chipname[*id],
+				mc13xxx_chipname[mc13xxx->ictype],
 				maskval(revision, MC13XXX_REVISION_REVFULL),
 				maskval(revision, MC13XXX_REVISION_REVMETAL),
 				maskval(revision, MC13XXX_REVISION_FIN),
@@ -530,26 +460,12 @@ static int mc13xxx_identify(struct mc13xxx *mc13xxx, enum mc13xxx_id *id)
 				maskval(revision, MC13XXX_REVISION_ICIDCODE));
 	}
 
-	if (*id != MC13XXX_ID_INVALID) {
-		const struct spi_device_id *devid =
-			spi_get_device_id(mc13xxx->spidev);
-		if (!devid || devid->driver_data != *id)
-			dev_warn(&mc13xxx->spidev->dev, "device id doesn't "
-					"match auto detection!\n");
-	}
-
-	return 0;
+	return (mc13xxx->ictype == MC13XXX_ID_INVALID) ? -ENODEV : 0;
 }
 
 static const char *mc13xxx_get_chipname(struct mc13xxx *mc13xxx)
 {
-	const struct spi_device_id *devid =
-		spi_get_device_id(mc13xxx->spidev);
-
-	if (!devid)
-		return NULL;
-
-	return mc13xxx_chipname[devid->driver_data];
+	return mc13xxx_chipname[mc13xxx->ictype];
 }
 
 int mc13xxx_get_flags(struct mc13xxx *mc13xxx)
@@ -592,7 +508,7 @@ int mc13xxx_adc_do_conversion(struct mc13xxx *mc13xxx, unsigned int mode,
 	};
 	init_completion(&adcdone_data.done);
 
-	dev_dbg(&mc13xxx->spidev->dev, "%s\n", __func__);
+	dev_dbg(mc13xxx->dev, "%s\n", __func__);
 
 	mc13xxx_lock(mc13xxx);
 
@@ -637,7 +553,8 @@ int mc13xxx_adc_do_conversion(struct mc13xxx *mc13xxx, unsigned int mode,
 	adc1 |= ato << MC13783_ADC1_ATO_SHIFT;
 	if (atox)
 		adc1 |= MC13783_ADC1_ATOX;
-	dev_dbg(&mc13xxx->spidev->dev, "%s: request irq\n", __func__);
+
+	dev_dbg(mc13xxx->dev, "%s: request irq\n", __func__);
 	mc13xxx_irq_request(mc13xxx, MC13XXX_IRQ_ADCDONE,
 			mc13xxx_handler_adcdone, __func__, &adcdone_data);
 	mc13xxx_irq_ack(mc13xxx, MC13XXX_IRQ_ADCDONE);
@@ -695,7 +612,7 @@ static int mc13xxx_add_subdevice_pdata(struct mc13xxx *mc13xxx,
 	if (!cell.name)
 		return -ENOMEM;
 
-	return mfd_add_devices(&mc13xxx->spidev->dev, -1, &cell, 1, NULL, 0);
+	return mfd_add_devices(mc13xxx->dev, -1, &cell, 1, NULL, 0);
 }
 
 static int mc13xxx_add_subdevice(struct mc13xxx *mc13xxx, const char *format)
@@ -706,7 +623,7 @@ static int mc13xxx_add_subdevice(struct mc13xxx *mc13xxx, const char *format)
 #ifdef CONFIG_OF
 static int mc13xxx_probe_flags_dt(struct mc13xxx *mc13xxx)
 {
-	struct device_node *np = mc13xxx->spidev->dev.of_node;
+	struct device_node *np = mc13xxx->dev->of_node;
 
 	if (!np)
 		return -ENODEV;
@@ -732,55 +649,15 @@ static inline int mc13xxx_probe_flags_dt(struct mc13xxx *mc13xxx)
 }
 #endif
 
-static const struct spi_device_id mc13xxx_device_id[] = {
-	{
-		.name = "mc13783",
-		.driver_data = MC13XXX_ID_MC13783,
-	}, {
-		.name = "mc13892",
-		.driver_data = MC13XXX_ID_MC13892,
-	}, {
-		/* sentinel */
-	}
-};
-MODULE_DEVICE_TABLE(spi, mc13xxx_device_id);
-
-static const struct of_device_id mc13xxx_dt_ids[] = {
-	{ .compatible = "fsl,mc13783", .data = (void *) MC13XXX_ID_MC13783, },
-	{ .compatible = "fsl,mc13892", .data = (void *) MC13XXX_ID_MC13892, },
-	{ /* sentinel */ }
-};
-MODULE_DEVICE_TABLE(of, mc13xxx_dt_ids);
-
-static int mc13xxx_probe(struct spi_device *spi)
+int mc13xxx_common_init(struct mc13xxx *mc13xxx,
+		struct mc13xxx_platform_data *pdata, int irq)
 {
-	const struct of_device_id *of_id;
-	struct spi_driver *sdrv = to_spi_driver(spi->dev.driver);
-	struct mc13xxx *mc13xxx;
-	struct mc13xxx_platform_data *pdata = dev_get_platdata(&spi->dev);
-	enum mc13xxx_id id;
 	int ret;
 
-	of_id = of_match_device(mc13xxx_dt_ids, &spi->dev);
-	if (of_id)
-		sdrv->id_table = &mc13xxx_device_id[(enum mc13xxx_id) of_id->data];
-
-	mc13xxx = kzalloc(sizeof(*mc13xxx), GFP_KERNEL);
-	if (!mc13xxx)
-		return -ENOMEM;
-
-	dev_set_drvdata(&spi->dev, mc13xxx);
-	spi->mode = SPI_MODE_0 | SPI_CS_HIGH;
-	spi->bits_per_word = 32;
-	spi_setup(spi);
-
-	mc13xxx->spidev = spi;
-
-	mutex_init(&mc13xxx->lock);
 	mc13xxx_lock(mc13xxx);
 
-	ret = mc13xxx_identify(mc13xxx, &id);
-	if (ret || id == MC13XXX_ID_INVALID)
+	ret = mc13xxx_identify(mc13xxx);
+	if (ret)
 		goto err_revision;
 
 	/* mask all irqs */
@@ -792,18 +669,19 @@ static int mc13xxx_probe(struct spi_device *spi)
 	if (ret)
 		goto err_mask;
 
-	ret = request_threaded_irq(spi->irq, NULL, mc13xxx_irq_thread,
+	ret = request_threaded_irq(irq, NULL, mc13xxx_irq_thread,
 			IRQF_ONESHOT | IRQF_TRIGGER_HIGH, "mc13xxx", mc13xxx);
 
 	if (ret) {
 err_mask:
 err_revision:
 		mc13xxx_unlock(mc13xxx);
-		dev_set_drvdata(&spi->dev, NULL);
 		kfree(mc13xxx);
 		return ret;
 	}
 
+	mc13xxx->irq = irq;
+
 	mc13xxx_unlock(mc13xxx);
 
 	if (mc13xxx_probe_flags_dt(mc13xxx) < 0 && pdata)
@@ -838,42 +716,19 @@ err_revision:
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(mc13xxx_common_init);
 
-static int __devexit mc13xxx_remove(struct spi_device *spi)
+void mc13xxx_common_cleanup(struct mc13xxx *mc13xxx)
 {
-	struct mc13xxx *mc13xxx = dev_get_drvdata(&spi->dev);
+	free_irq(mc13xxx->irq, mc13xxx);
 
-	free_irq(mc13xxx->spidev->irq, mc13xxx);
+	mfd_remove_devices(mc13xxx->dev);
 
-	mfd_remove_devices(&spi->dev);
+	regmap_exit(mc13xxx->regmap);
 
 	kfree(mc13xxx);
-
-	return 0;
-}
-
-static struct spi_driver mc13xxx_driver = {
-	.id_table = mc13xxx_device_id,
-	.driver = {
-		.name = "mc13xxx",
-		.owner = THIS_MODULE,
-		.of_match_table = mc13xxx_dt_ids,
-	},
-	.probe = mc13xxx_probe,
-	.remove = __devexit_p(mc13xxx_remove),
-};
-
-static int __init mc13xxx_init(void)
-{
-	return spi_register_driver(&mc13xxx_driver);
-}
-subsys_initcall(mc13xxx_init);
-
-static void __exit mc13xxx_exit(void)
-{
-	spi_unregister_driver(&mc13xxx_driver);
 }
-module_exit(mc13xxx_exit);
+EXPORT_SYMBOL_GPL(mc13xxx_common_cleanup);
 
 MODULE_DESCRIPTION("Core driver for Freescale MC13XXX PMIC");
 MODULE_AUTHOR("Uwe Kleine-Koenig <[email protected]>");
diff --git a/drivers/mfd/mc13xxx-i2c.c b/drivers/mfd/mc13xxx-i2c.c
new file mode 100644
index 000000000000..d22501dad6a6
--- /dev/null
+++ b/drivers/mfd/mc13xxx-i2c.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2009-2010 Creative Product Design
+ * Marc Reilly [email protected]
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/mutex.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/mc13xxx.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_gpio.h>
+#include <linux/i2c.h>
+#include <linux/err.h>
+
+#include "mc13xxx.h"
+
+static const struct i2c_device_id mc13xxx_i2c_device_id[] = {
+	{
+		.name = "mc13892",
+		.driver_data = MC13XXX_ID_MC13892,
+	}, {
+		/* sentinel */
+	}
+};
+MODULE_DEVICE_TABLE(i2c, mc13xxx_i2c_device_id);
+
+static const struct of_device_id mc13xxx_dt_ids[] = {
+	{
+		.compatible = "fsl,mc13892",
+		.data = (void *) &mc13xxx_i2c_device_id[0],
+	}, {
+		/* sentinel */
+	}
+};
+MODULE_DEVICE_TABLE(of, mc13xxx_dt_ids);
+
+static struct regmap_config mc13xxx_regmap_i2c_config = {
+	.reg_bits = 8,
+	.val_bits = 24,
+
+	.max_register = MC13XXX_NUMREGS,
+
+	.cache_type = REGCACHE_NONE,
+};
+
+static int mc13xxx_i2c_probe(struct i2c_client *client,
+		const struct i2c_device_id *id)
+{
+	const struct of_device_id *of_id;
+	struct i2c_driver *idrv = to_i2c_driver(client->dev.driver);
+	struct mc13xxx *mc13xxx;
+	struct mc13xxx_platform_data *pdata = dev_get_platdata(&client->dev);
+	int ret;
+
+	of_id = of_match_device(mc13xxx_dt_ids, &client->dev);
+	if (of_id)
+		idrv->id_table = (const struct i2c_device_id*) of_id->data;
+
+	mc13xxx = kzalloc(sizeof(*mc13xxx), GFP_KERNEL);
+	if (!mc13xxx)
+		return -ENOMEM;
+
+	dev_set_drvdata(&client->dev, mc13xxx);
+
+	mc13xxx->dev = &client->dev;
+	mutex_init(&mc13xxx->lock);
+
+	mc13xxx->regmap = regmap_init_i2c(client, &mc13xxx_regmap_i2c_config);
+	if (IS_ERR(mc13xxx->regmap)) {
+		ret = PTR_ERR(mc13xxx->regmap);
+		dev_err(mc13xxx->dev, "Failed to initialize register map: %d\n",
+				ret);
+		dev_set_drvdata(&client->dev, NULL);
+		kfree(mc13xxx);
+		return ret;
+	}
+
+	ret = mc13xxx_common_init(mc13xxx, pdata, client->irq);
+
+	if (ret == 0 && (id->driver_data != mc13xxx->ictype))
+		dev_warn(mc13xxx->dev,
+				"device id doesn't match auto detection!\n");
+
+	return ret;
+}
+
+static int __devexit mc13xxx_i2c_remove(struct i2c_client *client)
+{
+	struct mc13xxx *mc13xxx = dev_get_drvdata(&client->dev);
+
+	mc13xxx_common_cleanup(mc13xxx);
+
+	return 0;
+}
+
+static struct i2c_driver mc13xxx_i2c_driver = {
+	.id_table = mc13xxx_i2c_device_id,
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "mc13xxx",
+		.of_match_table = mc13xxx_dt_ids,
+	},
+	.probe = mc13xxx_i2c_probe,
+	.remove = __devexit_p(mc13xxx_i2c_remove),
+};
+
+static int __init mc13xxx_i2c_init(void)
+{
+	return i2c_add_driver(&mc13xxx_i2c_driver);
+}
+subsys_initcall(mc13xxx_i2c_init);
+
+static void __exit mc13xxx_i2c_exit(void)
+{
+	i2c_del_driver(&mc13xxx_i2c_driver);
+}
+module_exit(mc13xxx_i2c_exit);
+
+MODULE_DESCRIPTION("i2c driver for Freescale MC13XXX PMIC");
+MODULE_AUTHOR("Marc Reilly <[email protected]");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mfd/mc13xxx-spi.c b/drivers/mfd/mc13xxx-spi.c
new file mode 100644
index 000000000000..3fcdab3eb8eb
--- /dev/null
+++ b/drivers/mfd/mc13xxx-spi.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright 2009-2010 Pengutronix
+ * Uwe Kleine-Koenig <[email protected]>
+ *
+ * loosely based on an earlier driver that has
+ * Copyright 2009 Pengutronix, Sascha Hauer <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/mutex.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/mc13xxx.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_gpio.h>
+#include <linux/err.h>
+#include <linux/spi/spi.h>
+
+#include "mc13xxx.h"
+
+static const struct spi_device_id mc13xxx_device_id[] = {
+	{
+		.name = "mc13783",
+		.driver_data = MC13XXX_ID_MC13783,
+	}, {
+		.name = "mc13892",
+		.driver_data = MC13XXX_ID_MC13892,
+	}, {
+		/* sentinel */
+	}
+};
+MODULE_DEVICE_TABLE(spi, mc13xxx_device_id);
+
+static const struct of_device_id mc13xxx_dt_ids[] = {
+	{ .compatible = "fsl,mc13783", .data = (void *) MC13XXX_ID_MC13783, },
+	{ .compatible = "fsl,mc13892", .data = (void *) MC13XXX_ID_MC13892, },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, mc13xxx_dt_ids);
+
+static struct regmap_config mc13xxx_regmap_spi_config = {
+	.reg_bits = 7,
+	.pad_bits = 1,
+	.val_bits = 24,
+
+	.max_register = MC13XXX_NUMREGS,
+
+	.cache_type = REGCACHE_NONE,
+};
+
+static int mc13xxx_spi_probe(struct spi_device *spi)
+{
+	const struct of_device_id *of_id;
+	struct spi_driver *sdrv = to_spi_driver(spi->dev.driver);
+	struct mc13xxx *mc13xxx;
+	struct mc13xxx_platform_data *pdata = dev_get_platdata(&spi->dev);
+	int ret;
+
+	of_id = of_match_device(mc13xxx_dt_ids, &spi->dev);
+	if (of_id)
+		sdrv->id_table = &mc13xxx_device_id[(enum mc13xxx_id) of_id->data];
+
+	mc13xxx = kzalloc(sizeof(*mc13xxx), GFP_KERNEL);
+	if (!mc13xxx)
+		return -ENOMEM;
+
+	dev_set_drvdata(&spi->dev, mc13xxx);
+	spi->mode = SPI_MODE_0 | SPI_CS_HIGH;
+	spi->bits_per_word = 32;
+
+	mc13xxx->dev = &spi->dev;
+	mutex_init(&mc13xxx->lock);
+
+	mc13xxx->regmap = regmap_init_spi(spi, &mc13xxx_regmap_spi_config);
+	if (IS_ERR(mc13xxx->regmap)) {
+		ret = PTR_ERR(mc13xxx->regmap);
+		dev_err(mc13xxx->dev, "Failed to initialize register map: %d\n",
+				ret);
+		dev_set_drvdata(&spi->dev, NULL);
+		kfree(mc13xxx);
+		return ret;
+	}
+
+	ret = mc13xxx_common_init(mc13xxx, pdata, spi->irq);
+
+	if (ret) {
+		dev_set_drvdata(&spi->dev, NULL);
+	} else {
+		const struct spi_device_id *devid =
+			spi_get_device_id(spi);
+		if (!devid || devid->driver_data != mc13xxx->ictype)
+			dev_warn(mc13xxx->dev,
+				"device id doesn't match auto detection!\n");
+	}
+
+	return ret;
+}
+
+static int __devexit mc13xxx_spi_remove(struct spi_device *spi)
+{
+	struct mc13xxx *mc13xxx = dev_get_drvdata(&spi->dev);
+
+	mc13xxx_common_cleanup(mc13xxx);
+
+	return 0;
+}
+
+static struct spi_driver mc13xxx_spi_driver = {
+	.id_table = mc13xxx_device_id,
+	.driver = {
+		.name = "mc13xxx",
+		.owner = THIS_MODULE,
+		.of_match_table = mc13xxx_dt_ids,
+	},
+	.probe = mc13xxx_spi_probe,
+	.remove = __devexit_p(mc13xxx_spi_remove),
+};
+
+static int __init mc13xxx_init(void)
+{
+	return spi_register_driver(&mc13xxx_spi_driver);
+}
+subsys_initcall(mc13xxx_init);
+
+static void __exit mc13xxx_exit(void)
+{
+	spi_unregister_driver(&mc13xxx_spi_driver);
+}
+module_exit(mc13xxx_exit);
+
+MODULE_DESCRIPTION("Core driver for Freescale MC13XXX PMIC");
+MODULE_AUTHOR("Uwe Kleine-Koenig <[email protected]>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mfd/mc13xxx.h b/drivers/mfd/mc13xxx.h
new file mode 100644
index 000000000000..bbba06feea06
--- /dev/null
+++ b/drivers/mfd/mc13xxx.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2012 Creative Product Design
+ * Marc Reilly <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation.
+ */
+#ifndef __DRIVERS_MFD_MC13XXX_H
+#define __DRIVERS_MFD_MC13XXX_H
+
+#include <linux/mutex.h>
+#include <linux/regmap.h>
+#include <linux/mfd/mc13xxx.h>
+
+enum mc13xxx_id {
+	MC13XXX_ID_MC13783,
+	MC13XXX_ID_MC13892,
+	MC13XXX_ID_INVALID,
+};
+
+#define MC13XXX_NUMREGS 0x3f
+
+struct mc13xxx {
+	struct regmap *regmap;
+
+	struct device *dev;
+	enum mc13xxx_id ictype;
+
+	struct mutex lock;
+	int irq;
+	int flags;
+
+	irq_handler_t irqhandler[MC13XXX_NUM_IRQ];
+	void *irqdata[MC13XXX_NUM_IRQ];
+
+	int adcflags;
+};
+
+int mc13xxx_common_init(struct mc13xxx *mc13xxx,
+		struct mc13xxx_platform_data *pdata, int irq);
+
+void mc13xxx_common_cleanup(struct mc13xxx *mc13xxx);
+
+#endif /* __DRIVERS_MFD_MC13XXX_H */
diff --git a/drivers/mfd/pcf50633-core.c b/drivers/mfd/pcf50633-core.c
index 189c2f07b83f..29c122bf28ea 100644
--- a/drivers/mfd/pcf50633-core.c
+++ b/drivers/mfd/pcf50633-core.c
@@ -204,7 +204,7 @@ static int __devinit pcf50633_probe(struct i2c_client *client,
 		return -ENOENT;
 	}
 
-	pcf = kzalloc(sizeof(*pcf), GFP_KERNEL);
+	pcf = devm_kzalloc(&client->dev, sizeof(*pcf), GFP_KERNEL);
 	if (!pcf)
 		return -ENOMEM;
 
@@ -212,12 +212,11 @@ static int __devinit pcf50633_probe(struct i2c_client *client,
 
 	mutex_init(&pcf->lock);
 
-	pcf->regmap = regmap_init_i2c(client, &pcf50633_regmap_config);
+	pcf->regmap = devm_regmap_init_i2c(client, &pcf50633_regmap_config);
 	if (IS_ERR(pcf->regmap)) {
 		ret = PTR_ERR(pcf->regmap);
-		dev_err(pcf->dev, "Failed to allocate register map: %d\n",
-			ret);
-		goto err_free;
+		dev_err(pcf->dev, "Failed to allocate register map: %d\n", ret);
+		return ret;
 	}
 
 	i2c_set_clientdata(client, pcf);
@@ -228,7 +227,7 @@ static int __devinit pcf50633_probe(struct i2c_client *client,
 	if (version < 0 || variant < 0) {
 		dev_err(pcf->dev, "Unable to probe pcf50633\n");
 		ret = -ENODEV;
-		goto err_regmap;
+		return ret;
 	}
 
 	dev_info(pcf->dev, "Probed device version %d variant %d\n",
@@ -237,16 +236,11 @@ static int __devinit pcf50633_probe(struct i2c_client *client,
 	pcf50633_irq_init(pcf, client->irq);
 
 	/* Create sub devices */
-	pcf50633_client_dev_register(pcf, "pcf50633-input",
-						&pcf->input_pdev);
-	pcf50633_client_dev_register(pcf, "pcf50633-rtc",
-						&pcf->rtc_pdev);
-	pcf50633_client_dev_register(pcf, "pcf50633-mbc",
-						&pcf->mbc_pdev);
-	pcf50633_client_dev_register(pcf, "pcf50633-adc",
-						&pcf->adc_pdev);
-	pcf50633_client_dev_register(pcf, "pcf50633-backlight",
-						&pcf->bl_pdev);
+	pcf50633_client_dev_register(pcf, "pcf50633-input", &pcf->input_pdev);
+	pcf50633_client_dev_register(pcf, "pcf50633-rtc", &pcf->rtc_pdev);
+	pcf50633_client_dev_register(pcf, "pcf50633-mbc", &pcf->mbc_pdev);
+	pcf50633_client_dev_register(pcf, "pcf50633-adc", &pcf->adc_pdev);
+	pcf50633_client_dev_register(pcf, "pcf50633-backlight", &pcf->bl_pdev);
 
 
 	for (i = 0; i < PCF50633_NUM_REGULATORS; i++) {
@@ -274,13 +268,6 @@ static int __devinit pcf50633_probe(struct i2c_client *client,
 		pdata->probe_done(pcf);
 
 	return 0;
-
-err_regmap:
-	regmap_exit(pcf->regmap);
-err_free:
-	kfree(pcf);
-
-	return ret;
 }
 
 static int __devexit pcf50633_remove(struct i2c_client *client)
@@ -300,9 +287,6 @@ static int __devexit pcf50633_remove(struct i2c_client *client)
 	for (i = 0; i < PCF50633_NUM_REGULATORS; i++)
 		platform_device_unregister(pcf->regulator_pdev[i]);
 
-	regmap_exit(pcf->regmap);
-	kfree(pcf);
-
 	return 0;
 }
 
diff --git a/drivers/mfd/rc5t583.c b/drivers/mfd/rc5t583.c
index 44afae0a69ce..cdc1df7fa0e9 100644
--- a/drivers/mfd/rc5t583.c
+++ b/drivers/mfd/rc5t583.c
@@ -75,6 +75,7 @@ static struct deepsleep_control_data deepsleep_data[] = {
 	(RC5T583_EXT_PWRREQ1_CONTROL | RC5T583_EXT_PWRREQ2_CONTROL)
 
 static struct mfd_cell rc5t583_subdevs[] = {
+	{.name = "rc5t583-gpio",},
 	{.name = "rc5t583-regulator",},
 	{.name = "rc5t583-rtc",      },
 	{.name = "rc5t583-key",      }
@@ -267,7 +268,7 @@ static int __devinit rc5t583_i2c_probe(struct i2c_client *i2c,
 	rc5t583->dev = &i2c->dev;
 	i2c_set_clientdata(i2c, rc5t583);
 
-	rc5t583->regmap = regmap_init_i2c(i2c, &rc5t583_regmap_config);
+	rc5t583->regmap = devm_regmap_init_i2c(i2c, &rc5t583_regmap_config);
 	if (IS_ERR(rc5t583->regmap)) {
 		ret = PTR_ERR(rc5t583->regmap);
 		dev_err(&i2c->dev, "regmap initialization failed: %d\n", ret);
@@ -276,7 +277,7 @@ static int __devinit rc5t583_i2c_probe(struct i2c_client *i2c,
 
 	ret = rc5t583_clear_ext_power_req(rc5t583, pdata);
 	if (ret < 0)
-		goto err_irq_init;
+		return ret;
 
 	if (i2c->irq) {
 		ret = rc5t583_irq_init(rc5t583, i2c->irq, pdata->irq_base);
@@ -299,8 +300,6 @@ static int __devinit rc5t583_i2c_probe(struct i2c_client *i2c,
 err_add_devs:
 	if (irq_init_success)
 		rc5t583_irq_exit(rc5t583);
-err_irq_init:
-	regmap_exit(rc5t583->regmap);
 	return ret;
 }
 
@@ -310,7 +309,6 @@ static int  __devexit rc5t583_i2c_remove(struct i2c_client *i2c)
 
 	mfd_remove_devices(rc5t583->dev);
 	rc5t583_irq_exit(rc5t583);
-	regmap_exit(rc5t583->regmap);
 	return 0;
 }
 
diff --git a/drivers/mfd/rdc321x-southbridge.c b/drivers/mfd/rdc321x-southbridge.c
index 809bd4a61089..685d61e431ad 100644
--- a/drivers/mfd/rdc321x-southbridge.c
+++ b/drivers/mfd/rdc321x-southbridge.c
@@ -108,18 +108,7 @@ static struct pci_driver rdc321x_sb_driver = {
 	.remove		= __devexit_p(rdc321x_sb_remove),
 };
 
-static int __init rdc321x_sb_init(void)
-{
-	return pci_register_driver(&rdc321x_sb_driver);
-}
-
-static void __exit rdc321x_sb_exit(void)
-{
-	pci_unregister_driver(&rdc321x_sb_driver);
-}
-
-module_init(rdc321x_sb_init);
-module_exit(rdc321x_sb_exit);
+module_pci_driver(rdc321x_sb_driver);
 
 MODULE_AUTHOR("Florian Fainelli <[email protected]>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/s5m-core.c b/drivers/mfd/s5m-core.c
index 48949d998d10..dd170307e60e 100644
--- a/drivers/mfd/s5m-core.c
+++ b/drivers/mfd/s5m-core.c
@@ -114,12 +114,12 @@ static int s5m87xx_i2c_probe(struct i2c_client *i2c,
 		s5m87xx->wakeup = pdata->wakeup;
 	}
 
-	s5m87xx->regmap = regmap_init_i2c(i2c, &s5m_regmap_config);
+	s5m87xx->regmap = devm_regmap_init_i2c(i2c, &s5m_regmap_config);
 	if (IS_ERR(s5m87xx->regmap)) {
 		ret = PTR_ERR(s5m87xx->regmap);
 		dev_err(&i2c->dev, "Failed to allocate register map: %d\n",
 			ret);
-		goto err;
+		return ret;
 	}
 
 	s5m87xx->rtc = i2c_new_dummy(i2c->adapter, RTC_I2C_ADDR);
@@ -159,7 +159,6 @@ err:
 	mfd_remove_devices(s5m87xx->dev);
 	s5m_irq_exit(s5m87xx);
 	i2c_unregister_device(s5m87xx->rtc);
-	regmap_exit(s5m87xx->regmap);
 	return ret;
 }
 
@@ -170,7 +169,6 @@ static int s5m87xx_i2c_remove(struct i2c_client *i2c)
 	mfd_remove_devices(s5m87xx->dev);
 	s5m_irq_exit(s5m87xx);
 	i2c_unregister_device(s5m87xx->rtc);
-	regmap_exit(s5m87xx->regmap);
 	return 0;
 }
 
diff --git a/drivers/mfd/sta2x11-mfd.c b/drivers/mfd/sta2x11-mfd.c
new file mode 100644
index 000000000000..d31fed07aefb
--- /dev/null
+++ b/drivers/mfd/sta2x11-mfd.c
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2009-2011 Wind River Systems, Inc.
+ * Copyright (c) 2011 ST Microelectronics (Alessandro Rubini)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/pci.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/sta2x11-mfd.h>
+
+#include <asm/sta2x11.h>
+
+/* This describes STA2X11 MFD chip for us, we may have several */
+struct sta2x11_mfd {
+	struct sta2x11_instance *instance;
+	spinlock_t lock;
+	struct list_head list;
+	void __iomem *sctl_regs;
+	void __iomem *apbreg_regs;
+};
+
+static LIST_HEAD(sta2x11_mfd_list);
+
+/* Three functions to act on the list */
+static struct sta2x11_mfd *sta2x11_mfd_find(struct pci_dev *pdev)
+{
+	struct sta2x11_instance *instance;
+	struct sta2x11_mfd *mfd;
+
+	if (!pdev && !list_empty(&sta2x11_mfd_list)) {
+		pr_warning("%s: Unspecified device, "
+			    "using first instance\n", __func__);
+		return list_entry(sta2x11_mfd_list.next,
+				  struct sta2x11_mfd, list);
+	}
+
+	instance = sta2x11_get_instance(pdev);
+	if (!instance)
+		return NULL;
+	list_for_each_entry(mfd, &sta2x11_mfd_list, list) {
+		if (mfd->instance == instance)
+			return mfd;
+	}
+	return NULL;
+}
+
+static int __devinit sta2x11_mfd_add(struct pci_dev *pdev, gfp_t flags)
+{
+	struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev);
+	struct sta2x11_instance *instance;
+
+	if (mfd)
+		return -EBUSY;
+	instance = sta2x11_get_instance(pdev);
+	if (!instance)
+		return -EINVAL;
+	mfd = kzalloc(sizeof(*mfd), flags);
+	if (!mfd)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&mfd->list);
+	spin_lock_init(&mfd->lock);
+	mfd->instance = instance;
+	list_add(&mfd->list, &sta2x11_mfd_list);
+	return 0;
+}
+
+static int __devexit mfd_remove(struct pci_dev *pdev)
+{
+	struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev);
+
+	if (!mfd)
+		return -ENODEV;
+	list_del(&mfd->list);
+	kfree(mfd);
+	return 0;
+}
+
+/* These two functions are exported and are not expected to fail */
+u32 sta2x11_sctl_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val)
+{
+	struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev);
+	u32 r;
+	unsigned long flags;
+
+	if (!mfd) {
+		dev_warn(&pdev->dev, ": can't access sctl regs\n");
+		return 0;
+	}
+	if (!mfd->sctl_regs) {
+		dev_warn(&pdev->dev, ": system ctl not initialized\n");
+		return 0;
+	}
+	spin_lock_irqsave(&mfd->lock, flags);
+	r = readl(mfd->sctl_regs + reg);
+	r &= ~mask;
+	r |= val;
+	if (mask)
+		writel(r, mfd->sctl_regs + reg);
+	spin_unlock_irqrestore(&mfd->lock, flags);
+	return r;
+}
+EXPORT_SYMBOL(sta2x11_sctl_mask);
+
+u32 sta2x11_apbreg_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val)
+{
+	struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev);
+	u32 r;
+	unsigned long flags;
+
+	if (!mfd) {
+		dev_warn(&pdev->dev, ": can't access apb regs\n");
+		return 0;
+	}
+	if (!mfd->apbreg_regs) {
+		dev_warn(&pdev->dev, ": apb bridge not initialized\n");
+		return 0;
+	}
+	spin_lock_irqsave(&mfd->lock, flags);
+	r = readl(mfd->apbreg_regs + reg);
+	r &= ~mask;
+	r |= val;
+	if (mask)
+		writel(r, mfd->apbreg_regs + reg);
+	spin_unlock_irqrestore(&mfd->lock, flags);
+	return r;
+}
+EXPORT_SYMBOL(sta2x11_apbreg_mask);
+
+/* Two debugfs files, for our registers (FIXME: one instance only) */
+#define REG(regname) {.name = #regname, .offset = SCTL_ ## regname}
+static struct debugfs_reg32 sta2x11_sctl_regs[] = {
+	REG(SCCTL), REG(ARMCFG), REG(SCPLLCTL), REG(SCPLLFCTRL),
+	REG(SCRESFRACT), REG(SCRESCTRL1), REG(SCRESXTRL2), REG(SCPEREN0),
+	REG(SCPEREN1), REG(SCPEREN2), REG(SCGRST), REG(SCPCIPMCR1),
+	REG(SCPCIPMCR2), REG(SCPCIPMSR1), REG(SCPCIPMSR2), REG(SCPCIPMSR3),
+	REG(SCINTREN), REG(SCRISR), REG(SCCLKSTAT0), REG(SCCLKSTAT1),
+	REG(SCCLKSTAT2), REG(SCRSTSTA),
+};
+#undef REG
+
+static struct debugfs_regset32 sctl_regset = {
+	.regs = sta2x11_sctl_regs,
+	.nregs = ARRAY_SIZE(sta2x11_sctl_regs),
+};
+
+#define REG(regname) {.name = #regname, .offset = regname}
+static struct debugfs_reg32 sta2x11_apbreg_regs[] = {
+	REG(APBREG_BSR), REG(APBREG_PAER), REG(APBREG_PWAC), REG(APBREG_PRAC),
+	REG(APBREG_PCG), REG(APBREG_PUR), REG(APBREG_EMU_PCG),
+};
+#undef REG
+
+static struct debugfs_regset32 apbreg_regset = {
+	.regs = sta2x11_apbreg_regs,
+	.nregs = ARRAY_SIZE(sta2x11_apbreg_regs),
+};
+
+static struct dentry *sta2x11_sctl_debugfs;
+static struct dentry *sta2x11_apbreg_debugfs;
+
+/* Probe for the two platform devices */
+static int sta2x11_sctl_probe(struct platform_device *dev)
+{
+	struct pci_dev **pdev;
+	struct sta2x11_mfd *mfd;
+	struct resource *res;
+
+	pdev = dev->dev.platform_data;
+	mfd = sta2x11_mfd_find(*pdev);
+	if (!mfd)
+		return -ENODEV;
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENOMEM;
+
+	if (!request_mem_region(res->start, resource_size(res),
+				"sta2x11-sctl"))
+		return -EBUSY;
+
+	mfd->sctl_regs = ioremap(res->start, resource_size(res));
+	if (!mfd->sctl_regs) {
+		release_mem_region(res->start, resource_size(res));
+		return -ENOMEM;
+	}
+	sctl_regset.base = mfd->sctl_regs;
+	sta2x11_sctl_debugfs = debugfs_create_regset32("sta2x11-sctl",
+						  S_IFREG | S_IRUGO,
+						  NULL, &sctl_regset);
+	return 0;
+}
+
+static int sta2x11_apbreg_probe(struct platform_device *dev)
+{
+	struct pci_dev **pdev;
+	struct sta2x11_mfd *mfd;
+	struct resource *res;
+
+	pdev = dev->dev.platform_data;
+	dev_dbg(&dev->dev, "%s: pdata is %p\n", __func__, pdev);
+	dev_dbg(&dev->dev, "%s: *pdata is %p\n", __func__, *pdev);
+
+	mfd = sta2x11_mfd_find(*pdev);
+	if (!mfd)
+		return -ENODEV;
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENOMEM;
+
+	if (!request_mem_region(res->start, resource_size(res),
+				"sta2x11-apbreg"))
+		return -EBUSY;
+
+	mfd->apbreg_regs = ioremap(res->start, resource_size(res));
+	if (!mfd->apbreg_regs) {
+		release_mem_region(res->start, resource_size(res));
+		return -ENOMEM;
+	}
+	dev_dbg(&dev->dev, "%s: regbase %p\n", __func__, mfd->apbreg_regs);
+
+	apbreg_regset.base = mfd->apbreg_regs;
+	sta2x11_apbreg_debugfs = debugfs_create_regset32("sta2x11-apbreg",
+						  S_IFREG | S_IRUGO,
+						  NULL, &apbreg_regset);
+	return 0;
+}
+
+/* The two platform drivers */
+static struct platform_driver sta2x11_sctl_platform_driver = {
+	.driver = {
+		.name	= "sta2x11-sctl",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= sta2x11_sctl_probe,
+};
+
+static int __init sta2x11_sctl_init(void)
+{
+	pr_info("%s\n", __func__);
+	return platform_driver_register(&sta2x11_sctl_platform_driver);
+}
+
+static struct platform_driver sta2x11_platform_driver = {
+	.driver = {
+		.name	= "sta2x11-apbreg",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= sta2x11_apbreg_probe,
+};
+
+static int __init sta2x11_apbreg_init(void)
+{
+	pr_info("%s\n", __func__);
+	return platform_driver_register(&sta2x11_platform_driver);
+}
+
+/*
+ * What follows is the PCI device that hosts the above two pdevs.
+ * Each logic block is 4kB and they are all consecutive: we use this info.
+ */
+
+/* Bar 0 */
+enum bar0_cells {
+	STA2X11_GPIO_0 = 0,
+	STA2X11_GPIO_1,
+	STA2X11_GPIO_2,
+	STA2X11_GPIO_3,
+	STA2X11_SCTL,
+	STA2X11_SCR,
+	STA2X11_TIME,
+};
+/* Bar 1 */
+enum bar1_cells {
+	STA2X11_APBREG = 0,
+};
+#define CELL_4K(_name, _cell) { \
+		.name = _name, \
+		.start = _cell * 4096, .end = _cell * 4096 + 4095, \
+		.flags = IORESOURCE_MEM, \
+		}
+
+static const __devinitconst struct resource gpio_resources[] = {
+	{
+		.name = "sta2x11_gpio", /* 4 consecutive cells, 1 driver */
+		.start = 0,
+		.end = (4 * 4096) - 1,
+		.flags = IORESOURCE_MEM,
+	}
+};
+static const __devinitconst struct resource sctl_resources[] = {
+	CELL_4K("sta2x11-sctl", STA2X11_SCTL),
+};
+static const __devinitconst struct resource scr_resources[] = {
+	CELL_4K("sta2x11-scr", STA2X11_SCR),
+};
+static const __devinitconst struct resource time_resources[] = {
+	CELL_4K("sta2x11-time", STA2X11_TIME),
+};
+
+static const __devinitconst struct resource apbreg_resources[] = {
+	CELL_4K("sta2x11-apbreg", STA2X11_APBREG),
+};
+
+#define DEV(_name, _r) \
+	{ .name = _name, .num_resources = ARRAY_SIZE(_r), .resources = _r, }
+
+static __devinitdata struct mfd_cell sta2x11_mfd_bar0[] = {
+	DEV("sta2x11-gpio", gpio_resources), /* offset 0: we add pdata later */
+	DEV("sta2x11-sctl", sctl_resources),
+	DEV("sta2x11-scr", scr_resources),
+	DEV("sta2x11-time", time_resources),
+};
+
+static __devinitdata struct mfd_cell sta2x11_mfd_bar1[] = {
+	DEV("sta2x11-apbreg", apbreg_resources),
+};
+
+static int sta2x11_mfd_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	pci_save_state(pdev);
+	pci_disable_device(pdev);
+	pci_set_power_state(pdev, pci_choose_state(pdev, state));
+
+	return 0;
+}
+
+static int sta2x11_mfd_resume(struct pci_dev *pdev)
+{
+	int err;
+
+	pci_set_power_state(pdev, 0);
+	err = pci_enable_device(pdev);
+	if (err)
+		return err;
+	pci_restore_state(pdev);
+
+	return 0;
+}
+
+static int __devinit sta2x11_mfd_probe(struct pci_dev *pdev,
+				       const struct pci_device_id *pci_id)
+{
+	int err, i;
+	struct sta2x11_gpio_pdata *gpio_data;
+
+	dev_info(&pdev->dev, "%s\n", __func__);
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Can't enable device.\n");
+		return err;
+	}
+
+	err = pci_enable_msi(pdev);
+	if (err)
+		dev_info(&pdev->dev, "Enable msi failed\n");
+
+	/* Read gpio config data as pci device's platform data */
+	gpio_data = dev_get_platdata(&pdev->dev);
+	if (!gpio_data)
+		dev_warn(&pdev->dev, "no gpio configuration\n");
+
+	dev_dbg(&pdev->dev, "%s, gpio_data = %p (%p)\n", __func__,
+		gpio_data, &gpio_data);
+	dev_dbg(&pdev->dev, "%s, pdev = %p (%p)\n", __func__,
+		pdev, &pdev);
+
+	/* platform data is the pci device for all of them */
+	for (i = 0; i < ARRAY_SIZE(sta2x11_mfd_bar0); i++) {
+		sta2x11_mfd_bar0[i].pdata_size = sizeof(pdev);
+		sta2x11_mfd_bar0[i].platform_data = &pdev;
+	}
+	sta2x11_mfd_bar1[0].pdata_size = sizeof(pdev);
+	sta2x11_mfd_bar1[0].platform_data = &pdev;
+
+	/* Record this pdev before mfd_add_devices: their probe looks for it */
+	sta2x11_mfd_add(pdev, GFP_ATOMIC);
+
+
+	err = mfd_add_devices(&pdev->dev, -1,
+			      sta2x11_mfd_bar0,
+			      ARRAY_SIZE(sta2x11_mfd_bar0),
+			      &pdev->resource[0],
+			      0);
+	if (err) {
+		dev_err(&pdev->dev, "mfd_add_devices[0] failed: %d\n", err);
+		goto err_disable;
+	}
+
+	err = mfd_add_devices(&pdev->dev, -1,
+			      sta2x11_mfd_bar1,
+			      ARRAY_SIZE(sta2x11_mfd_bar1),
+			      &pdev->resource[1],
+			      0);
+	if (err) {
+		dev_err(&pdev->dev, "mfd_add_devices[1] failed: %d\n", err);
+		goto err_disable;
+	}
+
+	return 0;
+
+err_disable:
+	mfd_remove_devices(&pdev->dev);
+	pci_disable_device(pdev);
+	pci_disable_msi(pdev);
+	return err;
+}
+
+static DEFINE_PCI_DEVICE_TABLE(sta2x11_mfd_tbl) = {
+	{PCI_DEVICE(PCI_VENDOR_ID_STMICRO, PCI_DEVICE_ID_STMICRO_GPIO)},
+	{0,},
+};
+
+static struct pci_driver sta2x11_mfd_driver = {
+	.name =		"sta2x11-mfd",
+	.id_table =	sta2x11_mfd_tbl,
+	.probe =	sta2x11_mfd_probe,
+	.suspend =	sta2x11_mfd_suspend,
+	.resume =	sta2x11_mfd_resume,
+};
+
+static int __init sta2x11_mfd_init(void)
+{
+	pr_info("%s\n", __func__);
+	return pci_register_driver(&sta2x11_mfd_driver);
+}
+
+/*
+ * All of this must be ready before "normal" devices like MMCI appear.
+ * But MFD (the pci device) can't be too early. The following choice
+ * prepares platform drivers very early and probe the PCI device later,
+ * but before other PCI devices.
+ */
+subsys_initcall(sta2x11_apbreg_init);
+subsys_initcall(sta2x11_sctl_init);
+rootfs_initcall(sta2x11_mfd_init);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Wind River");
+MODULE_DESCRIPTION("STA2x11 mfd for GPIO, SCTL and APBREG");
+MODULE_DEVICE_TABLE(pci, sta2x11_mfd_tbl);
diff --git a/drivers/mfd/stmpe-spi.c b/drivers/mfd/stmpe-spi.c
index b58c43c7ea93..afd459013ecb 100644
--- a/drivers/mfd/stmpe-spi.c
+++ b/drivers/mfd/stmpe-spi.c
@@ -122,7 +122,6 @@ MODULE_DEVICE_TABLE(spi, stmpe_id);
 static struct spi_driver stmpe_spi_driver = {
 	.driver = {
 		.name	= "stmpe-spi",
-		.bus	= &spi_bus_type,
 		.owner	= THIS_MODULE,
 #ifdef CONFIG_PM
 		.pm	= &stmpe_dev_pm_ops,
diff --git a/drivers/mfd/tps65090.c b/drivers/mfd/tps65090.c
index 47f802bf1848..396b9d1b6bd6 100644
--- a/drivers/mfd/tps65090.c
+++ b/drivers/mfd/tps65090.c
@@ -283,27 +283,24 @@ static int __devinit tps65090_i2c_probe(struct i2c_client *client,
 		}
 	}
 
-	tps65090->rmap = regmap_init_i2c(tps65090->client,
-		&tps65090_regmap_config);
+	tps65090->rmap = devm_regmap_init_i2c(tps65090->client,
+					      &tps65090_regmap_config);
 	if (IS_ERR(tps65090->rmap)) {
-		dev_err(&client->dev, "regmap_init failed with err: %ld\n",
-			PTR_ERR(tps65090->rmap));
+		ret = PTR_ERR(tps65090->rmap);
+		dev_err(&client->dev, "regmap_init failed with err: %d\n", ret);
 		goto err_irq_exit;
-	};
+	}
 
 	ret = mfd_add_devices(tps65090->dev, -1, tps65090s,
 		ARRAY_SIZE(tps65090s), NULL, 0);
 	if (ret) {
 		dev_err(&client->dev, "add mfd devices failed with err: %d\n",
 			ret);
-		goto err_regmap_exit;
+		goto err_irq_exit;
 	}
 
 	return 0;
 
-err_regmap_exit:
-	regmap_exit(tps65090->rmap);
-
 err_irq_exit:
 	if (client->irq)
 		free_irq(client->irq, tps65090);
@@ -316,29 +313,34 @@ static int __devexit tps65090_i2c_remove(struct i2c_client *client)
 	struct tps65090 *tps65090 = i2c_get_clientdata(client);
 
 	mfd_remove_devices(tps65090->dev);
-	regmap_exit(tps65090->rmap);
 	if (client->irq)
 		free_irq(client->irq, tps65090);
 
 	return 0;
 }
 
-#ifdef CONFIG_PM
-static int tps65090_i2c_suspend(struct i2c_client *client, pm_message_t state)
+#ifdef CONFIG_PM_SLEEP
+static int tps65090_suspend(struct device *dev)
 {
+	struct i2c_client *client = to_i2c_client(dev);
 	if (client->irq)
 		disable_irq(client->irq);
 	return 0;
 }
 
-static int tps65090_i2c_resume(struct i2c_client *client)
+static int tps65090_resume(struct device *dev)
 {
+	struct i2c_client *client = to_i2c_client(dev);
 	if (client->irq)
 		enable_irq(client->irq);
 	return 0;
 }
 #endif
 
+static const struct dev_pm_ops tps65090_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(tps65090_suspend, tps65090_resume)
+};
+
 static const struct i2c_device_id tps65090_id_table[] = {
 	{ "tps65090", 0 },
 	{ },
@@ -349,13 +351,10 @@ static struct i2c_driver tps65090_driver = {
 	.driver	= {
 		.name	= "tps65090",
 		.owner	= THIS_MODULE,
+		.pm	= &tps65090_pm_ops,
 	},
 	.probe		= tps65090_i2c_probe,
 	.remove		= __devexit_p(tps65090_i2c_remove),
-#ifdef CONFIG_PM
-	.suspend	= tps65090_i2c_suspend,
-	.resume		= tps65090_i2c_resume,
-#endif
 	.id_table	= tps65090_id_table,
 };
 
diff --git a/drivers/mfd/tps65217.c b/drivers/mfd/tps65217.c
index f7d854e4cc62..db194e433c08 100644
--- a/drivers/mfd/tps65217.c
+++ b/drivers/mfd/tps65217.c
@@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(tps65217_reg_write);
  * @val: Value to write.
  * @level: Password protected level
  */
-int tps65217_update_bits(struct tps65217 *tps, unsigned int reg,
+static int tps65217_update_bits(struct tps65217 *tps, unsigned int reg,
 		unsigned int mask, unsigned int val, unsigned int level)
 {
 	int ret;
@@ -150,7 +150,7 @@ static int __devinit tps65217_probe(struct i2c_client *client,
 		return -ENOMEM;
 
 	tps->pdata = pdata;
-	tps->regmap = regmap_init_i2c(client, &tps65217_regmap_config);
+	tps->regmap = devm_regmap_init_i2c(client, &tps65217_regmap_config);
 	if (IS_ERR(tps->regmap)) {
 		ret = PTR_ERR(tps->regmap);
 		dev_err(tps->dev, "Failed to allocate register map: %d\n",
@@ -163,9 +163,9 @@ static int __devinit tps65217_probe(struct i2c_client *client,
 
 	ret = tps65217_reg_read(tps, TPS65217_REG_CHIPID, &version);
 	if (ret < 0) {
-		dev_err(tps->dev, "Failed to read revision"
-					" register: %d\n", ret);
-		goto err_regmap;
+		dev_err(tps->dev, "Failed to read revision register: %d\n",
+			ret);
+		return ret;
 	}
 
 	dev_info(tps->dev, "TPS65217 ID %#x version 1.%d\n",
@@ -190,11 +190,6 @@ static int __devinit tps65217_probe(struct i2c_client *client,
 	}
 
 	return 0;
-
-err_regmap:
-	regmap_exit(tps->regmap);
-
-	return ret;
 }
 
 static int __devexit tps65217_remove(struct i2c_client *client)
@@ -205,8 +200,6 @@ static int __devexit tps65217_remove(struct i2c_client *client)
 	for (i = 0; i < TPS65217_NUM_REGULATOR; i++)
 		platform_device_unregister(tps->regulator_pdev[i]);
 
-	regmap_exit(tps->regmap);
-
 	return 0;
 }
 
diff --git a/drivers/mfd/tps65910-irq.c b/drivers/mfd/tps65910-irq.c
index c9ed5c00a621..09aab3e4776d 100644
--- a/drivers/mfd/tps65910-irq.c
+++ b/drivers/mfd/tps65910-irq.c
@@ -20,15 +20,10 @@
 #include <linux/device.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
+#include <linux/irqdomain.h>
 #include <linux/gpio.h>
 #include <linux/mfd/tps65910.h>
 
-static inline int irq_to_tps65910_irq(struct tps65910 *tps65910,
-							int irq)
-{
-	return (irq - tps65910->irq_base);
-}
-
 /*
  * This is a threaded IRQ handler so can access I2C/SPI.  Since all
  * interrupts are clear on read the IRQ line will be reasserted and
@@ -41,28 +36,28 @@ static inline int irq_to_tps65910_irq(struct tps65910 *tps65910,
 static irqreturn_t tps65910_irq(int irq, void *irq_data)
 {
 	struct tps65910 *tps65910 = irq_data;
+	unsigned int reg;
 	u32 irq_sts;
 	u32 irq_mask;
-	u8 reg;
 	int i;
 
-	tps65910->read(tps65910, TPS65910_INT_STS, 1, &reg);
+	tps65910_reg_read(tps65910, TPS65910_INT_STS, &reg);
 	irq_sts = reg;
-	tps65910->read(tps65910, TPS65910_INT_STS2, 1, &reg);
+	tps65910_reg_read(tps65910, TPS65910_INT_STS2, &reg);
 	irq_sts |= reg << 8;
 	switch (tps65910_chip_id(tps65910)) {
 	case TPS65911:
-		tps65910->read(tps65910, TPS65910_INT_STS3, 1, &reg);
+		tps65910_reg_read(tps65910, TPS65910_INT_STS3, &reg);
 		irq_sts |= reg << 16;
 	}
 
-	tps65910->read(tps65910, TPS65910_INT_MSK, 1, &reg);
+	tps65910_reg_read(tps65910, TPS65910_INT_MSK, &reg);
 	irq_mask = reg;
-	tps65910->read(tps65910, TPS65910_INT_MSK2, 1, &reg);
+	tps65910_reg_read(tps65910, TPS65910_INT_MSK2, &reg);
 	irq_mask |= reg << 8;
 	switch (tps65910_chip_id(tps65910)) {
 	case TPS65911:
-		tps65910->read(tps65910, TPS65910_INT_MSK3, 1, &reg);
+		tps65910_reg_read(tps65910, TPS65910_INT_MSK3, &reg);
 		irq_mask |= reg << 16;
 	}
 
@@ -76,19 +71,19 @@ static irqreturn_t tps65910_irq(int irq, void *irq_data)
 		if (!(irq_sts & (1 << i)))
 			continue;
 
-		handle_nested_irq(tps65910->irq_base + i);
+		handle_nested_irq(irq_find_mapping(tps65910->domain, i));
 	}
 
 	/* Write the STS register back to clear IRQs we handled */
 	reg = irq_sts & 0xFF;
 	irq_sts >>= 8;
-	tps65910->write(tps65910, TPS65910_INT_STS, 1, &reg);
+	tps65910_reg_write(tps65910, TPS65910_INT_STS, reg);
 	reg = irq_sts & 0xFF;
-	tps65910->write(tps65910, TPS65910_INT_STS2, 1, &reg);
+	tps65910_reg_write(tps65910, TPS65910_INT_STS2, reg);
 	switch (tps65910_chip_id(tps65910)) {
 	case TPS65911:
 		reg = irq_sts >> 8;
-		tps65910->write(tps65910, TPS65910_INT_STS3, 1, &reg);
+		tps65910_reg_write(tps65910, TPS65910_INT_STS3, reg);
 	}
 
 	return IRQ_HANDLED;
@@ -105,27 +100,27 @@ static void tps65910_irq_sync_unlock(struct irq_data *data)
 {
 	struct tps65910 *tps65910 = irq_data_get_irq_chip_data(data);
 	u32 reg_mask;
-	u8 reg;
+	unsigned int reg;
 
-	tps65910->read(tps65910, TPS65910_INT_MSK, 1, &reg);
+	tps65910_reg_read(tps65910, TPS65910_INT_MSK, &reg);
 	reg_mask = reg;
-	tps65910->read(tps65910, TPS65910_INT_MSK2, 1, &reg);
+	tps65910_reg_read(tps65910, TPS65910_INT_MSK2, &reg);
 	reg_mask |= reg << 8;
 	switch (tps65910_chip_id(tps65910)) {
 	case TPS65911:
-		tps65910->read(tps65910, TPS65910_INT_MSK3, 1, &reg);
+		tps65910_reg_read(tps65910, TPS65910_INT_MSK3, &reg);
 		reg_mask |= reg << 16;
 	}
 
 	if (tps65910->irq_mask != reg_mask) {
 		reg = tps65910->irq_mask & 0xFF;
-		tps65910->write(tps65910, TPS65910_INT_MSK, 1, &reg);
+		tps65910_reg_write(tps65910, TPS65910_INT_MSK, reg);
 		reg = tps65910->irq_mask >> 8 & 0xFF;
-		tps65910->write(tps65910, TPS65910_INT_MSK2, 1, &reg);
+		tps65910_reg_write(tps65910, TPS65910_INT_MSK2, reg);
 		switch (tps65910_chip_id(tps65910)) {
 		case TPS65911:
 			reg = tps65910->irq_mask >> 16;
-			tps65910->write(tps65910, TPS65910_INT_MSK3, 1, &reg);
+			tps65910_reg_write(tps65910, TPS65910_INT_MSK3, reg);
 		}
 	}
 	mutex_unlock(&tps65910->irq_lock);
@@ -135,14 +130,14 @@ static void tps65910_irq_enable(struct irq_data *data)
 {
 	struct tps65910 *tps65910 = irq_data_get_irq_chip_data(data);
 
-	tps65910->irq_mask &= ~( 1 << irq_to_tps65910_irq(tps65910, data->irq));
+	tps65910->irq_mask &= ~(1 << data->hwirq);
 }
 
 static void tps65910_irq_disable(struct irq_data *data)
 {
 	struct tps65910 *tps65910 = irq_data_get_irq_chip_data(data);
 
-	tps65910->irq_mask |= ( 1 << irq_to_tps65910_irq(tps65910, data->irq));
+	tps65910->irq_mask |= (1 << data->hwirq);
 }
 
 #ifdef CONFIG_PM_SLEEP
@@ -164,10 +159,35 @@ static struct irq_chip tps65910_irq_chip = {
 	.irq_set_wake = tps65910_irq_set_wake,
 };
 
+static int tps65910_irq_map(struct irq_domain *h, unsigned int virq,
+				irq_hw_number_t hw)
+{
+	struct tps65910 *tps65910 = h->host_data;
+
+	irq_set_chip_data(virq, tps65910);
+	irq_set_chip_and_handler(virq, &tps65910_irq_chip, handle_edge_irq);
+	irq_set_nested_thread(virq, 1);
+
+	/* ARM needs us to explicitly flag the IRQ as valid
+	 * and will set them noprobe when we do so. */
+#ifdef CONFIG_ARM
+	set_irq_flags(virq, IRQF_VALID);
+#else
+	irq_set_noprobe(virq);
+#endif
+
+	return 0;
+}
+
+static struct irq_domain_ops tps65910_domain_ops = {
+	.map	= tps65910_irq_map,
+	.xlate	= irq_domain_xlate_twocell,
+};
+
 int tps65910_irq_init(struct tps65910 *tps65910, int irq,
 		    struct tps65910_platform_data *pdata)
 {
-	int ret, cur_irq;
+	int ret;
 	int flags = IRQF_ONESHOT;
 
 	if (!irq) {
@@ -175,17 +195,11 @@ int tps65910_irq_init(struct tps65910 *tps65910, int irq,
 		return -EINVAL;
 	}
 
-	if (!pdata || !pdata->irq_base) {
-		dev_warn(tps65910->dev, "No interrupt support, no IRQ base\n");
+	if (!pdata) {
+		dev_warn(tps65910->dev, "No interrupt support, no pdata\n");
 		return -EINVAL;
 	}
 
-	tps65910->irq_mask = 0xFFFFFF;
-
-	mutex_init(&tps65910->irq_lock);
-	tps65910->chip_irq = irq;
-	tps65910->irq_base = pdata->irq_base;
-
 	switch (tps65910_chip_id(tps65910)) {
 	case TPS65910:
 		tps65910->irq_num = TPS65910_NUM_IRQ;
@@ -195,22 +209,36 @@ int tps65910_irq_init(struct tps65910 *tps65910, int irq,
 		break;
 	}
 
-	/* Register with genirq */
-	for (cur_irq = tps65910->irq_base;
-	     cur_irq < tps65910->irq_num + tps65910->irq_base;
-	     cur_irq++) {
-		irq_set_chip_data(cur_irq, tps65910);
-		irq_set_chip_and_handler(cur_irq, &tps65910_irq_chip,
-					 handle_edge_irq);
-		irq_set_nested_thread(cur_irq, 1);
-
-		/* ARM needs us to explicitly flag the IRQ as valid
-		 * and will set them noprobe when we do so. */
-#ifdef CONFIG_ARM
-		set_irq_flags(cur_irq, IRQF_VALID);
-#else
-		irq_set_noprobe(cur_irq);
-#endif
+	if (pdata->irq_base > 0) {
+		pdata->irq_base = irq_alloc_descs(pdata->irq_base, 0,
+					tps65910->irq_num, -1);
+		if (pdata->irq_base < 0) {
+			dev_warn(tps65910->dev, "Failed to alloc IRQs: %d\n",
+					pdata->irq_base);
+			return pdata->irq_base;
+		}
+	}
+
+	tps65910->irq_mask = 0xFFFFFF;
+
+	mutex_init(&tps65910->irq_lock);
+	tps65910->chip_irq = irq;
+	tps65910->irq_base = pdata->irq_base;
+
+	if (pdata->irq_base > 0)
+		tps65910->domain = irq_domain_add_legacy(tps65910->dev->of_node,
+					tps65910->irq_num,
+					pdata->irq_base,
+					0,
+					&tps65910_domain_ops, tps65910);
+	else
+		tps65910->domain = irq_domain_add_linear(tps65910->dev->of_node,
+					tps65910->irq_num,
+					&tps65910_domain_ops, tps65910);
+
+	if (!tps65910->domain) {
+		dev_err(tps65910->dev, "Failed to create IRQ domain\n");
+		return -ENOMEM;
 	}
 
 	ret = request_threaded_irq(irq, NULL, tps65910_irq, flags,
diff --git a/drivers/mfd/tps65910.c b/drivers/mfd/tps65910.c
index bf2b25ebf2ca..be9e07b77325 100644
--- a/drivers/mfd/tps65910.c
+++ b/drivers/mfd/tps65910.c
@@ -19,13 +19,16 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/i2c.h>
-#include <linux/gpio.h>
 #include <linux/mfd/core.h>
 #include <linux/regmap.h>
 #include <linux/mfd/tps65910.h>
+#include <linux/of_device.h>
 
 static struct mfd_cell tps65910s[] = {
 	{
+		.name = "tps65910-gpio",
+	},
+	{
 		.name = "tps65910-pmic",
 	},
 	{
@@ -37,30 +40,6 @@ static struct mfd_cell tps65910s[] = {
 };
 
 
-static int tps65910_i2c_read(struct tps65910 *tps65910, u8 reg,
-				  int bytes, void *dest)
-{
-	return regmap_bulk_read(tps65910->regmap, reg, dest, bytes);
-}
-
-static int tps65910_i2c_write(struct tps65910 *tps65910, u8 reg,
-				  int bytes, void *src)
-{
-	return regmap_bulk_write(tps65910->regmap, reg, src, bytes);
-}
-
-int tps65910_set_bits(struct tps65910 *tps65910, u8 reg, u8 mask)
-{
-	return regmap_update_bits(tps65910->regmap, reg, mask, mask);
-}
-EXPORT_SYMBOL_GPL(tps65910_set_bits);
-
-int tps65910_clear_bits(struct tps65910 *tps65910, u8 reg, u8 mask)
-{
-	return regmap_update_bits(tps65910->regmap, reg, mask, 0);
-}
-EXPORT_SYMBOL_GPL(tps65910_clear_bits);
-
 static bool is_volatile_reg(struct device *dev, unsigned int reg)
 {
 	struct tps65910 *tps65910 = dev_get_drvdata(dev);
@@ -85,80 +64,197 @@ static const struct regmap_config tps65910_regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 8,
 	.volatile_reg = is_volatile_reg,
-	.max_register = TPS65910_MAX_REGISTER,
-	.num_reg_defaults_raw = TPS65910_MAX_REGISTER,
+	.max_register = TPS65910_MAX_REGISTER - 1,
 	.cache_type = REGCACHE_RBTREE,
 };
 
-static int tps65910_i2c_probe(struct i2c_client *i2c,
-			    const struct i2c_device_id *id)
+static int __devinit tps65910_sleepinit(struct tps65910 *tps65910,
+		struct tps65910_board *pmic_pdata)
+{
+	struct device *dev = NULL;
+	int ret = 0;
+
+	dev = tps65910->dev;
+
+	if (!pmic_pdata->en_dev_slp)
+		return 0;
+
+	/* enabling SLEEP device state */
+	ret = tps65910_reg_set_bits(tps65910, TPS65910_DEVCTRL,
+				DEVCTRL_DEV_SLP_MASK);
+	if (ret < 0) {
+		dev_err(dev, "set dev_slp failed: %d\n", ret);
+		goto err_sleep_init;
+	}
+
+	/* Return if there is no sleep keepon data. */
+	if (!pmic_pdata->slp_keepon)
+		return 0;
+
+	if (pmic_pdata->slp_keepon->therm_keepon) {
+		ret = tps65910_reg_set_bits(tps65910,
+				TPS65910_SLEEP_KEEP_RES_ON,
+				SLEEP_KEEP_RES_ON_THERM_KEEPON_MASK);
+		if (ret < 0) {
+			dev_err(dev, "set therm_keepon failed: %d\n", ret);
+			goto disable_dev_slp;
+		}
+	}
+
+	if (pmic_pdata->slp_keepon->clkout32k_keepon) {
+		ret = tps65910_reg_set_bits(tps65910,
+				TPS65910_SLEEP_KEEP_RES_ON,
+				SLEEP_KEEP_RES_ON_CLKOUT32K_KEEPON_MASK);
+		if (ret < 0) {
+			dev_err(dev, "set clkout32k_keepon failed: %d\n", ret);
+			goto disable_dev_slp;
+		}
+	}
+
+	if (pmic_pdata->slp_keepon->i2chs_keepon) {
+		ret = tps65910_reg_set_bits(tps65910,
+				TPS65910_SLEEP_KEEP_RES_ON,
+				SLEEP_KEEP_RES_ON_I2CHS_KEEPON_MASK);
+		if (ret < 0) {
+			dev_err(dev, "set i2chs_keepon failed: %d\n", ret);
+			goto disable_dev_slp;
+		}
+	}
+
+	return 0;
+
+disable_dev_slp:
+	tps65910_reg_clear_bits(tps65910, TPS65910_DEVCTRL,
+				DEVCTRL_DEV_SLP_MASK);
+
+err_sleep_init:
+	return ret;
+}
+
+#ifdef CONFIG_OF
+static struct of_device_id tps65910_of_match[] = {
+	{ .compatible = "ti,tps65910", .data = (void *)TPS65910},
+	{ .compatible = "ti,tps65911", .data = (void *)TPS65911},
+	{ },
+};
+MODULE_DEVICE_TABLE(of, tps65910_of_match);
+
+static struct tps65910_board *tps65910_parse_dt(struct i2c_client *client,
+						int *chip_id)
+{
+	struct device_node *np = client->dev.of_node;
+	struct tps65910_board *board_info;
+	unsigned int prop;
+	const struct of_device_id *match;
+	int ret = 0;
+
+	match = of_match_device(tps65910_of_match, &client->dev);
+	if (!match) {
+		dev_err(&client->dev, "Failed to find matching dt id\n");
+		return NULL;
+	}
+
+	*chip_id  = (int)match->data;
+
+	board_info = devm_kzalloc(&client->dev, sizeof(*board_info),
+			GFP_KERNEL);
+	if (!board_info) {
+		dev_err(&client->dev, "Failed to allocate pdata\n");
+		return NULL;
+	}
+
+	ret = of_property_read_u32(np, "ti,vmbch-threshold", &prop);
+	if (!ret)
+		board_info->vmbch_threshold = prop;
+	else if (*chip_id == TPS65911)
+		dev_warn(&client->dev, "VMBCH-Threshold not specified");
+
+	ret = of_property_read_u32(np, "ti,vmbch2-threshold", &prop);
+	if (!ret)
+		board_info->vmbch2_threshold = prop;
+	else if (*chip_id == TPS65911)
+		dev_warn(&client->dev, "VMBCH2-Threshold not specified");
+
+	board_info->irq = client->irq;
+	board_info->irq_base = -1;
+
+	return board_info;
+}
+#else
+static inline
+struct tps65910_board *tps65910_parse_dt(struct i2c_client *client,
+					 int *chip_id)
+{
+	return NULL;
+}
+#endif
+
+static __devinit int tps65910_i2c_probe(struct i2c_client *i2c,
+					const struct i2c_device_id *id)
 {
 	struct tps65910 *tps65910;
 	struct tps65910_board *pmic_plat_data;
+	struct tps65910_board *of_pmic_plat_data = NULL;
 	struct tps65910_platform_data *init_data;
 	int ret = 0;
+	int chip_id = id->driver_data;
 
 	pmic_plat_data = dev_get_platdata(&i2c->dev);
+
+	if (!pmic_plat_data && i2c->dev.of_node) {
+		pmic_plat_data = tps65910_parse_dt(i2c, &chip_id);
+		of_pmic_plat_data = pmic_plat_data;
+	}
+
 	if (!pmic_plat_data)
 		return -EINVAL;
 
-	init_data = kzalloc(sizeof(struct tps65910_platform_data), GFP_KERNEL);
+	init_data = devm_kzalloc(&i2c->dev, sizeof(*init_data), GFP_KERNEL);
 	if (init_data == NULL)
 		return -ENOMEM;
 
-	tps65910 = kzalloc(sizeof(struct tps65910), GFP_KERNEL);
-	if (tps65910 == NULL) {
-		kfree(init_data);
+	tps65910 = devm_kzalloc(&i2c->dev, sizeof(*tps65910), GFP_KERNEL);
+	if (tps65910 == NULL)
 		return -ENOMEM;
-	}
 
+	tps65910->of_plat_data = of_pmic_plat_data;
 	i2c_set_clientdata(i2c, tps65910);
 	tps65910->dev = &i2c->dev;
 	tps65910->i2c_client = i2c;
-	tps65910->id = id->driver_data;
-	tps65910->read = tps65910_i2c_read;
-	tps65910->write = tps65910_i2c_write;
+	tps65910->id = chip_id;
 	mutex_init(&tps65910->io_mutex);
 
-	tps65910->regmap = regmap_init_i2c(i2c, &tps65910_regmap_config);
+	tps65910->regmap = devm_regmap_init_i2c(i2c, &tps65910_regmap_config);
 	if (IS_ERR(tps65910->regmap)) {
 		ret = PTR_ERR(tps65910->regmap);
 		dev_err(&i2c->dev, "regmap initialization failed: %d\n", ret);
-		goto regmap_err;
+		return ret;
 	}
 
 	ret = mfd_add_devices(tps65910->dev, -1,
 			      tps65910s, ARRAY_SIZE(tps65910s),
 			      NULL, 0);
-	if (ret < 0)
-		goto err;
+	if (ret < 0) {
+		dev_err(&i2c->dev, "mfd_add_devices failed: %d\n", ret);
+		return ret;
+	}
 
 	init_data->irq = pmic_plat_data->irq;
 	init_data->irq_base = pmic_plat_data->irq_base;
 
-	tps65910_gpio_init(tps65910, pmic_plat_data->gpio_base);
-
 	tps65910_irq_init(tps65910, init_data->irq, init_data);
 
-	kfree(init_data);
-	return ret;
+	tps65910_sleepinit(tps65910, pmic_plat_data);
 
-err:
-	regmap_exit(tps65910->regmap);
-regmap_err:
-	kfree(tps65910);
-	kfree(init_data);
 	return ret;
 }
 
-static int tps65910_i2c_remove(struct i2c_client *i2c)
+static __devexit int tps65910_i2c_remove(struct i2c_client *i2c)
 {
 	struct tps65910 *tps65910 = i2c_get_clientdata(i2c);
 
 	tps65910_irq_exit(tps65910);
 	mfd_remove_devices(tps65910->dev);
-	regmap_exit(tps65910->regmap);
-	kfree(tps65910);
 
 	return 0;
 }
@@ -175,9 +271,10 @@ static struct i2c_driver tps65910_i2c_driver = {
 	.driver = {
 		   .name = "tps65910",
 		   .owner = THIS_MODULE,
+		   .of_match_table = of_match_ptr(tps65910_of_match),
 	},
 	.probe = tps65910_i2c_probe,
-	.remove = tps65910_i2c_remove,
+	.remove = __devexit_p(tps65910_i2c_remove),
 	.id_table = tps65910_i2c_id,
 };
 
diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c
index 5d656e814358..ad733d76207a 100644
--- a/drivers/mfd/twl4030-irq.c
+++ b/drivers/mfd/twl4030-irq.c
@@ -757,6 +757,7 @@ int twl4030_init_irq(struct device *dev, int irq_num)
 		dev_err(dev, "could not claim irq%d: %d\n", irq_num, status);
 		goto fail_rqirq;
 	}
+	enable_irq_wake(irq_num);
 
 	return irq_base;
 fail_rqirq:
diff --git a/drivers/mfd/twl6040-core.c b/drivers/mfd/twl6040-core.c
index 2d6bedadca09..4ded9e7aa246 100644
--- a/drivers/mfd/twl6040-core.c
+++ b/drivers/mfd/twl6040-core.c
@@ -27,7 +27,12 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
+#include <linux/err.h>
 #include <linux/platform_device.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/of_gpio.h>
+#include <linux/of_platform.h>
 #include <linux/gpio.h>
 #include <linux/delay.h>
 #include <linux/i2c.h>
@@ -35,8 +40,24 @@
 #include <linux/err.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/twl6040.h>
+#include <linux/regulator/consumer.h>
 
 #define VIBRACTRL_MEMBER(reg) ((reg == TWL6040_REG_VIBCTLL) ? 0 : 1)
+#define TWL6040_NUM_SUPPLIES	(2)
+
+static bool twl6040_has_vibra(struct twl6040_platform_data *pdata,
+			      struct device_node *node)
+{
+	if (pdata && pdata->vibra)
+		return true;
+
+#ifdef CONFIG_OF
+	if (of_find_node_by_name(node, "vibra"))
+		return true;
+#endif
+
+	return false;
+}
 
 int twl6040_reg_read(struct twl6040 *twl6040, unsigned int reg)
 {
@@ -502,17 +523,18 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 				     const struct i2c_device_id *id)
 {
 	struct twl6040_platform_data *pdata = client->dev.platform_data;
+	struct device_node *node = client->dev.of_node;
 	struct twl6040 *twl6040;
 	struct mfd_cell *cell = NULL;
-	int ret, children = 0;
+	int irq, ret, children = 0;
 
-	if (!pdata) {
+	if (!pdata && !node) {
 		dev_err(&client->dev, "Platform data is missing\n");
 		return -EINVAL;
 	}
 
 	/* In order to operate correctly we need valid interrupt config */
-	if (!client->irq || !pdata->irq_base) {
+	if (!client->irq) {
 		dev_err(&client->dev, "Invalid IRQ configuration\n");
 		return -EINVAL;
 	}
@@ -524,7 +546,7 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 		goto err;
 	}
 
-	twl6040->regmap = regmap_init_i2c(client, &twl6040_regmap_config);
+	twl6040->regmap = devm_regmap_init_i2c(client, &twl6040_regmap_config);
 	if (IS_ERR(twl6040->regmap)) {
 		ret = PTR_ERR(twl6040->regmap);
 		goto err;
@@ -532,9 +554,23 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 
 	i2c_set_clientdata(client, twl6040);
 
+	twl6040->supplies[0].supply = "vio";
+	twl6040->supplies[1].supply = "v2v1";
+	ret = regulator_bulk_get(&client->dev, TWL6040_NUM_SUPPLIES,
+				 twl6040->supplies);
+	if (ret != 0) {
+		dev_err(&client->dev, "Failed to get supplies: %d\n", ret);
+		goto regulator_get_err;
+	}
+
+	ret = regulator_bulk_enable(TWL6040_NUM_SUPPLIES, twl6040->supplies);
+	if (ret != 0) {
+		dev_err(&client->dev, "Failed to enable supplies: %d\n", ret);
+		goto power_err;
+	}
+
 	twl6040->dev = &client->dev;
 	twl6040->irq = client->irq;
-	twl6040->irq_base = pdata->irq_base;
 
 	mutex_init(&twl6040->mutex);
 	mutex_init(&twl6040->io_mutex);
@@ -543,22 +579,26 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 	twl6040->rev = twl6040_reg_read(twl6040, TWL6040_REG_ASICREV);
 
 	/* ERRATA: Automatic power-up is not possible in ES1.0 */
-	if (twl6040_get_revid(twl6040) > TWL6040_REV_ES1_0)
-		twl6040->audpwron = pdata->audpwron_gpio;
-	else
+	if (twl6040_get_revid(twl6040) > TWL6040_REV_ES1_0) {
+		if (pdata)
+			twl6040->audpwron = pdata->audpwron_gpio;
+		else
+			twl6040->audpwron = of_get_named_gpio(node,
+						"ti,audpwron-gpio", 0);
+	} else
 		twl6040->audpwron = -EINVAL;
 
 	if (gpio_is_valid(twl6040->audpwron)) {
 		ret = gpio_request_one(twl6040->audpwron, GPIOF_OUT_INIT_LOW,
 				       "audpwron");
 		if (ret)
-			goto gpio1_err;
+			goto gpio_err;
 	}
 
 	/* codec interrupt */
 	ret = twl6040_irq_init(twl6040);
 	if (ret)
-		goto gpio2_err;
+		goto irq_init_err;
 
 	ret = request_threaded_irq(twl6040->irq_base + TWL6040_IRQ_READY,
 				   NULL, twl6040_naudint_handler, 0,
@@ -572,22 +612,27 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 	/* dual-access registers controlled by I2C only */
 	twl6040_set_bits(twl6040, TWL6040_REG_ACCCTL, TWL6040_I2CSEL);
 
-	if (pdata->codec) {
-		int irq = twl6040->irq_base + TWL6040_IRQ_PLUG;
-
-		cell = &twl6040->cells[children];
-		cell->name = "twl6040-codec";
-		twl6040_codec_rsrc[0].start = irq;
-		twl6040_codec_rsrc[0].end = irq;
-		cell->resources = twl6040_codec_rsrc;
-		cell->num_resources = ARRAY_SIZE(twl6040_codec_rsrc);
+	/*
+	 * The main functionality of twl6040 to provide audio on OMAP4+ systems.
+	 * We can add the ASoC codec child whenever this driver has been loaded.
+	 * The ASoC codec can work without pdata, pass the platform_data only if
+	 * it has been provided.
+	 */
+	irq = twl6040->irq_base + TWL6040_IRQ_PLUG;
+	cell = &twl6040->cells[children];
+	cell->name = "twl6040-codec";
+	twl6040_codec_rsrc[0].start = irq;
+	twl6040_codec_rsrc[0].end = irq;
+	cell->resources = twl6040_codec_rsrc;
+	cell->num_resources = ARRAY_SIZE(twl6040_codec_rsrc);
+	if (pdata && pdata->codec) {
 		cell->platform_data = pdata->codec;
 		cell->pdata_size = sizeof(*pdata->codec);
-		children++;
 	}
+	children++;
 
-	if (pdata->vibra) {
-		int irq = twl6040->irq_base + TWL6040_IRQ_VIB;
+	if (twl6040_has_vibra(pdata, node)) {
+		irq = twl6040->irq_base + TWL6040_IRQ_VIB;
 
 		cell = &twl6040->cells[children];
 		cell->name = "twl6040-vibra";
@@ -596,21 +641,17 @@ static int __devinit twl6040_probe(struct i2c_client *client,
 		cell->resources = twl6040_vibra_rsrc;
 		cell->num_resources = ARRAY_SIZE(twl6040_vibra_rsrc);
 
-		cell->platform_data = pdata->vibra;
-		cell->pdata_size = sizeof(*pdata->vibra);
+		if (pdata && pdata->vibra) {
+			cell->platform_data = pdata->vibra;
+			cell->pdata_size = sizeof(*pdata->vibra);
+		}
 		children++;
 	}
 
-	if (children) {
-		ret = mfd_add_devices(&client->dev, -1, twl6040->cells,
-				      children, NULL, 0);
-		if (ret)
-			goto mfd_err;
-	} else {
-		dev_err(&client->dev, "No platform data found for children\n");
-		ret = -ENODEV;
+	ret = mfd_add_devices(&client->dev, -1, twl6040->cells, children,
+			      NULL, 0);
+	if (ret)
 		goto mfd_err;
-	}
 
 	return 0;
 
@@ -618,12 +659,15 @@ mfd_err:
 	free_irq(twl6040->irq_base + TWL6040_IRQ_READY, twl6040);
 irq_err:
 	twl6040_irq_exit(twl6040);
-gpio2_err:
+irq_init_err:
 	if (gpio_is_valid(twl6040->audpwron))
 		gpio_free(twl6040->audpwron);
-gpio1_err:
+gpio_err:
+	regulator_bulk_disable(TWL6040_NUM_SUPPLIES, twl6040->supplies);
+power_err:
+	regulator_bulk_free(TWL6040_NUM_SUPPLIES, twl6040->supplies);
+regulator_get_err:
 	i2c_set_clientdata(client, NULL);
-	regmap_exit(twl6040->regmap);
 err:
 	return ret;
 }
@@ -643,7 +687,9 @@ static int __devexit twl6040_remove(struct i2c_client *client)
 
 	mfd_remove_devices(&client->dev);
 	i2c_set_clientdata(client, NULL);
-	regmap_exit(twl6040->regmap);
+
+	regulator_bulk_disable(TWL6040_NUM_SUPPLIES, twl6040->supplies);
+	regulator_bulk_free(TWL6040_NUM_SUPPLIES, twl6040->supplies);
 
 	return 0;
 }
diff --git a/drivers/mfd/twl6040-irq.c b/drivers/mfd/twl6040-irq.c
index b3f8ddaa28a8..4b42543da228 100644
--- a/drivers/mfd/twl6040-irq.c
+++ b/drivers/mfd/twl6040-irq.c
@@ -23,7 +23,10 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/err.h>
 #include <linux/irq.h>
+#include <linux/of.h>
+#include <linux/irqdomain.h>
 #include <linux/interrupt.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/twl6040.h>
@@ -138,7 +141,8 @@ static irqreturn_t twl6040_irq_thread(int irq, void *data)
 
 int twl6040_irq_init(struct twl6040 *twl6040)
 {
-	int cur_irq, ret;
+	struct device_node *node = twl6040->dev->of_node;
+	int i, nr_irqs, irq_base, ret;
 	u8 val;
 
 	mutex_init(&twl6040->irq_mutex);
@@ -148,21 +152,31 @@ int twl6040_irq_init(struct twl6040 *twl6040)
 	twl6040->irq_masks_cache = TWL6040_ALLINT_MSK;
 	twl6040_reg_write(twl6040, TWL6040_REG_INTMR, TWL6040_ALLINT_MSK);
 
+	nr_irqs = ARRAY_SIZE(twl6040_irqs);
+
+	irq_base = irq_alloc_descs(-1, 0, nr_irqs, 0);
+	if (IS_ERR_VALUE(irq_base)) {
+		dev_err(twl6040->dev, "Fail to allocate IRQ descs\n");
+		return irq_base;
+	}
+	twl6040->irq_base = irq_base;
+
+	irq_domain_add_legacy(node, ARRAY_SIZE(twl6040_irqs), irq_base, 0,
+			      &irq_domain_simple_ops, NULL);
+
 	/* Register them with genirq */
-	for (cur_irq = twl6040->irq_base;
-	     cur_irq < twl6040->irq_base + ARRAY_SIZE(twl6040_irqs);
-	     cur_irq++) {
-		irq_set_chip_data(cur_irq, twl6040);
-		irq_set_chip_and_handler(cur_irq, &twl6040_irq_chip,
+	for (i = irq_base; i < irq_base + nr_irqs; i++) {
+		irq_set_chip_data(i, twl6040);
+		irq_set_chip_and_handler(i, &twl6040_irq_chip,
 					 handle_level_irq);
-		irq_set_nested_thread(cur_irq, 1);
+		irq_set_nested_thread(i, 1);
 
 		/* ARM needs us to explicitly flag the IRQ as valid
 		 * and will set them noprobe when we do so. */
 #ifdef CONFIG_ARM
-		set_irq_flags(cur_irq, IRQF_VALID);
+		set_irq_flags(i, IRQF_VALID);
 #else
-		irq_set_noprobe(cur_irq);
+		irq_set_noprobe(i);
 #endif
 	}
 
diff --git a/drivers/mfd/vx855.c b/drivers/mfd/vx855.c
index b73cc15e0081..872aff21e4be 100644
--- a/drivers/mfd/vx855.c
+++ b/drivers/mfd/vx855.c
@@ -131,17 +131,7 @@ static struct pci_driver vx855_pci_driver = {
 	.remove		= __devexit_p(vx855_remove),
 };
 
-static int vx855_init(void)
-{
-	return pci_register_driver(&vx855_pci_driver);
-}
-module_init(vx855_init);
-
-static void vx855_exit(void)
-{
-	pci_unregister_driver(&vx855_pci_driver);
-}
-module_exit(vx855_exit);
+module_pci_driver(vx855_pci_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <[email protected]>");
diff --git a/drivers/mfd/wm831x-auxadc.c b/drivers/mfd/wm831x-auxadc.c
index 87210954a066..6ee3018d8653 100644
--- a/drivers/mfd/wm831x-auxadc.c
+++ b/drivers/mfd/wm831x-auxadc.c
@@ -280,11 +280,11 @@ void wm831x_auxadc_init(struct wm831x *wm831x)
 	mutex_init(&wm831x->auxadc_lock);
 	INIT_LIST_HEAD(&wm831x->auxadc_pending);
 
-	if (wm831x->irq && wm831x->irq_base) {
+	if (wm831x->irq) {
 		wm831x->auxadc_read = wm831x_auxadc_read_irq;
 
-		ret = request_threaded_irq(wm831x->irq_base +
-					   WM831X_IRQ_AUXADC_DATA,
+		ret = request_threaded_irq(wm831x_irq(wm831x,
+						      WM831X_IRQ_AUXADC_DATA),
 					   NULL, wm831x_auxadc_irq, 0,
 					   "auxadc", wm831x);
 		if (ret < 0) {
diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c
index 838056c3493a..946698fd2dc6 100644
--- a/drivers/mfd/wm831x-core.c
+++ b/drivers/mfd/wm831x-core.c
@@ -614,8 +614,15 @@ int wm831x_set_bits(struct wm831x *wm831x, unsigned short reg,
 }
 EXPORT_SYMBOL_GPL(wm831x_set_bits);
 
+static struct resource wm831x_io_parent = {
+	.start = 0,
+	.end   = 0xffffffff,
+	.flags = IORESOURCE_IO,
+};
+
 static struct resource wm831x_dcdc1_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_DC1_CONTROL_1,
 		.end   = WM831X_DC1_DVS_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -637,6 +644,7 @@ static struct resource wm831x_dcdc1_resources[] = {
 
 static struct resource wm831x_dcdc2_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_DC2_CONTROL_1,
 		.end   = WM831X_DC2_DVS_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -657,6 +665,7 @@ static struct resource wm831x_dcdc2_resources[] = {
 
 static struct resource wm831x_dcdc3_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_DC3_CONTROL_1,
 		.end   = WM831X_DC3_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -671,6 +680,7 @@ static struct resource wm831x_dcdc3_resources[] = {
 
 static struct resource wm831x_dcdc4_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_DC4_CONTROL,
 		.end   = WM831X_DC4_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -685,6 +695,7 @@ static struct resource wm831x_dcdc4_resources[] = {
 
 static struct resource wm8320_dcdc4_buck_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_DC4_CONTROL,
 		.end   = WM832X_DC4_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -707,6 +718,7 @@ static struct resource wm831x_gpio_resources[] = {
 
 static struct resource wm831x_isink1_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_CURRENT_SINK_1,
 		.end   = WM831X_CURRENT_SINK_1,
 		.flags = IORESOURCE_IO,
@@ -720,6 +732,7 @@ static struct resource wm831x_isink1_resources[] = {
 
 static struct resource wm831x_isink2_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_CURRENT_SINK_2,
 		.end   = WM831X_CURRENT_SINK_2,
 		.flags = IORESOURCE_IO,
@@ -733,6 +746,7 @@ static struct resource wm831x_isink2_resources[] = {
 
 static struct resource wm831x_ldo1_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO1_CONTROL,
 		.end   = WM831X_LDO1_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -747,6 +761,7 @@ static struct resource wm831x_ldo1_resources[] = {
 
 static struct resource wm831x_ldo2_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO2_CONTROL,
 		.end   = WM831X_LDO2_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -761,6 +776,7 @@ static struct resource wm831x_ldo2_resources[] = {
 
 static struct resource wm831x_ldo3_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO3_CONTROL,
 		.end   = WM831X_LDO3_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -775,6 +791,7 @@ static struct resource wm831x_ldo3_resources[] = {
 
 static struct resource wm831x_ldo4_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO4_CONTROL,
 		.end   = WM831X_LDO4_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -789,6 +806,7 @@ static struct resource wm831x_ldo4_resources[] = {
 
 static struct resource wm831x_ldo5_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO5_CONTROL,
 		.end   = WM831X_LDO5_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -803,6 +821,7 @@ static struct resource wm831x_ldo5_resources[] = {
 
 static struct resource wm831x_ldo6_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO6_CONTROL,
 		.end   = WM831X_LDO6_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -817,6 +836,7 @@ static struct resource wm831x_ldo6_resources[] = {
 
 static struct resource wm831x_ldo7_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO7_CONTROL,
 		.end   = WM831X_LDO7_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -831,6 +851,7 @@ static struct resource wm831x_ldo7_resources[] = {
 
 static struct resource wm831x_ldo8_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO8_CONTROL,
 		.end   = WM831X_LDO8_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -845,6 +866,7 @@ static struct resource wm831x_ldo8_resources[] = {
 
 static struct resource wm831x_ldo9_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO9_CONTROL,
 		.end   = WM831X_LDO9_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -859,6 +881,7 @@ static struct resource wm831x_ldo9_resources[] = {
 
 static struct resource wm831x_ldo10_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO10_CONTROL,
 		.end   = WM831X_LDO10_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -873,6 +896,7 @@ static struct resource wm831x_ldo10_resources[] = {
 
 static struct resource wm831x_ldo11_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_LDO11_ON_CONTROL,
 		.end   = WM831X_LDO11_SLEEP_CONTROL,
 		.flags = IORESOURCE_IO,
@@ -974,6 +998,7 @@ static struct resource wm831x_rtc_resources[] = {
 
 static struct resource wm831x_status1_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_STATUS_LED_1,
 		.end   = WM831X_STATUS_LED_1,
 		.flags = IORESOURCE_IO,
@@ -982,6 +1007,7 @@ static struct resource wm831x_status1_resources[] = {
 
 static struct resource wm831x_status2_resources[] = {
 	{
+		.parent = &wm831x_io_parent,
 		.start = WM831X_STATUS_LED_2,
 		.end   = WM831X_STATUS_LED_2,
 		.flags = IORESOURCE_IO,
@@ -1787,27 +1813,27 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 	case WM8310:
 		ret = mfd_add_devices(wm831x->dev, wm831x_num,
 				      wm8310_devs, ARRAY_SIZE(wm8310_devs),
-				      NULL, wm831x->irq_base);
+				      NULL, 0);
 		break;
 
 	case WM8311:
 		ret = mfd_add_devices(wm831x->dev, wm831x_num,
 				      wm8311_devs, ARRAY_SIZE(wm8311_devs),
-				      NULL, wm831x->irq_base);
+				      NULL, 0);
 		if (!pdata || !pdata->disable_touch)
 			mfd_add_devices(wm831x->dev, wm831x_num,
 					touch_devs, ARRAY_SIZE(touch_devs),
-					NULL, wm831x->irq_base);
+					NULL, 0);
 		break;
 
 	case WM8312:
 		ret = mfd_add_devices(wm831x->dev, wm831x_num,
 				      wm8312_devs, ARRAY_SIZE(wm8312_devs),
-				      NULL, wm831x->irq_base);
+				      NULL, 0);
 		if (!pdata || !pdata->disable_touch)
 			mfd_add_devices(wm831x->dev, wm831x_num,
 					touch_devs, ARRAY_SIZE(touch_devs),
-					NULL, wm831x->irq_base);
+					NULL, 0);
 		break;
 
 	case WM8320:
@@ -1816,7 +1842,7 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 	case WM8326:
 		ret = mfd_add_devices(wm831x->dev, wm831x_num,
 				      wm8320_devs, ARRAY_SIZE(wm8320_devs),
-				      NULL, wm831x->irq_base);
+				      NULL, 0);
 		break;
 
 	default:
@@ -1841,7 +1867,7 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 	if (ret & WM831X_XTAL_ENA) {
 		ret = mfd_add_devices(wm831x->dev, wm831x_num,
 				      rtc_devs, ARRAY_SIZE(rtc_devs),
-				      NULL, wm831x->irq_base);
+				      NULL, 0);
 		if (ret != 0) {
 			dev_err(wm831x->dev, "Failed to add RTC: %d\n", ret);
 			goto err_irq;
@@ -1854,7 +1880,7 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq)
 		/* Treat errors as non-critical */
 		ret = mfd_add_devices(wm831x->dev, wm831x_num, backlight_devs,
 				      ARRAY_SIZE(backlight_devs), NULL,
-				      wm831x->irq_base);
+				      0);
 		if (ret < 0)
 			dev_err(wm831x->dev, "Failed to add backlight: %d\n",
 				ret);
@@ -1883,8 +1909,7 @@ void wm831x_device_exit(struct wm831x *wm831x)
 {
 	wm831x_otp_exit(wm831x);
 	mfd_remove_devices(wm831x->dev);
-	if (wm831x->irq_base)
-		free_irq(wm831x->irq_base + WM831X_IRQ_AUXADC_DATA, wm831x);
+	free_irq(wm831x_irq(wm831x, WM831X_IRQ_AUXADC_DATA), wm831x);
 	wm831x_irq_exit(wm831x);
 }
 
diff --git a/drivers/mfd/wm831x-irq.c b/drivers/mfd/wm831x-irq.c
index bec4d0539160..804e56ec99eb 100644
--- a/drivers/mfd/wm831x-irq.c
+++ b/drivers/mfd/wm831x-irq.c
@@ -18,6 +18,7 @@
 #include <linux/irq.h>
 #include <linux/mfd/core.h>
 #include <linux/interrupt.h>
+#include <linux/irqdomain.h>
 
 #include <linux/mfd/wm831x/core.h>
 #include <linux/mfd/wm831x/pdata.h>
@@ -328,7 +329,7 @@ static inline int irq_data_to_status_reg(struct wm831x_irq_data *irq_data)
 static inline struct wm831x_irq_data *irq_to_wm831x_irq(struct wm831x *wm831x,
 							int irq)
 {
-	return &wm831x_irqs[irq - wm831x->irq_base];
+	return &wm831x_irqs[irq];
 }
 
 static void wm831x_irq_lock(struct irq_data *data)
@@ -374,7 +375,7 @@ static void wm831x_irq_enable(struct irq_data *data)
 {
 	struct wm831x *wm831x = irq_data_get_irq_chip_data(data);
 	struct wm831x_irq_data *irq_data = irq_to_wm831x_irq(wm831x,
-							     data->irq);
+							     data->hwirq);
 
 	wm831x->irq_masks_cur[irq_data->reg - 1] &= ~irq_data->mask;
 }
@@ -383,7 +384,7 @@ static void wm831x_irq_disable(struct irq_data *data)
 {
 	struct wm831x *wm831x = irq_data_get_irq_chip_data(data);
 	struct wm831x_irq_data *irq_data = irq_to_wm831x_irq(wm831x,
-							     data->irq);
+							     data->hwirq);
 
 	wm831x->irq_masks_cur[irq_data->reg - 1] |= irq_data->mask;
 }
@@ -393,7 +394,7 @@ static int wm831x_irq_set_type(struct irq_data *data, unsigned int type)
 	struct wm831x *wm831x = irq_data_get_irq_chip_data(data);
 	int irq;
 
-	irq = data->irq - wm831x->irq_base;
+	irq = data->hwirq;
 
 	if (irq < WM831X_IRQ_GPIO_1 || irq > WM831X_IRQ_GPIO_11) {
 		/* Ignore internal-only IRQs */
@@ -412,22 +413,25 @@ static int wm831x_irq_set_type(struct irq_data *data, unsigned int type)
 	 * do the update here as we can be called with the bus lock
 	 * held.
 	 */
+	wm831x->gpio_level_low[irq] = false;
+	wm831x->gpio_level_high[irq] = false;
 	switch (type) {
 	case IRQ_TYPE_EDGE_BOTH:
 		wm831x->gpio_update[irq] = 0x10000 | WM831X_GPN_INT_MODE;
-		wm831x->gpio_level[irq] = false;
 		break;
 	case IRQ_TYPE_EDGE_RISING:
 		wm831x->gpio_update[irq] = 0x10000 | WM831X_GPN_POL;
-		wm831x->gpio_level[irq] = false;
 		break;
 	case IRQ_TYPE_EDGE_FALLING:
 		wm831x->gpio_update[irq] = 0x10000;
-		wm831x->gpio_level[irq] = false;
 		break;
 	case IRQ_TYPE_LEVEL_HIGH:
 		wm831x->gpio_update[irq] = 0x10000 | WM831X_GPN_POL;
-		wm831x->gpio_level[irq] = true;
+		wm831x->gpio_level_high[irq] = true;
+		break;
+	case IRQ_TYPE_LEVEL_LOW:
+		wm831x->gpio_update[irq] = 0x10000;
+		wm831x->gpio_level_low[irq] = true;
 		break;
 	default:
 		return -EINVAL;
@@ -469,9 +473,11 @@ static irqreturn_t wm831x_irq_thread(int irq, void *data)
 	 * descriptors.
 	 */
 	if (primary & WM831X_TCHPD_INT)
-		handle_nested_irq(wm831x->irq_base + WM831X_IRQ_TCHPD);
+		handle_nested_irq(irq_find_mapping(wm831x->irq_domain,
+						   WM831X_IRQ_TCHPD));
 	if (primary & WM831X_TCHDATA_INT)
-		handle_nested_irq(wm831x->irq_base + WM831X_IRQ_TCHDATA);
+		handle_nested_irq(irq_find_mapping(wm831x->irq_domain,
+						   WM831X_IRQ_TCHDATA));
 	primary &= ~(WM831X_TCHDATA_EINT | WM831X_TCHPD_EINT);
 
 	for (i = 0; i < ARRAY_SIZE(wm831x_irqs); i++) {
@@ -507,16 +513,29 @@ static irqreturn_t wm831x_irq_thread(int irq, void *data)
 		}
 
 		if (*status & wm831x_irqs[i].mask)
-			handle_nested_irq(wm831x->irq_base + i);
+			handle_nested_irq(irq_find_mapping(wm831x->irq_domain,
+							   i));
 
 		/* Simulate an edge triggered IRQ by polling the input
 		 * status.  This is sucky but improves interoperability.
 		 */
 		if (primary == WM831X_GP_INT &&
-		    wm831x->gpio_level[i - WM831X_IRQ_GPIO_1]) {
+		    wm831x->gpio_level_high[i - WM831X_IRQ_GPIO_1]) {
 			ret = wm831x_reg_read(wm831x, WM831X_GPIO_LEVEL);
 			while (ret & 1 << (i - WM831X_IRQ_GPIO_1)) {
-				handle_nested_irq(wm831x->irq_base + i);
+				handle_nested_irq(irq_find_mapping(wm831x->irq_domain,
+								   i));
+				ret = wm831x_reg_read(wm831x,
+						      WM831X_GPIO_LEVEL);
+			}
+		}
+
+		if (primary == WM831X_GP_INT &&
+		    wm831x->gpio_level_low[i - WM831X_IRQ_GPIO_1]) {
+			ret = wm831x_reg_read(wm831x, WM831X_GPIO_LEVEL);
+			while (!(ret & 1 << (i - WM831X_IRQ_GPIO_1))) {
+				handle_nested_irq(irq_find_mapping(wm831x->irq_domain,
+								   i));
 				ret = wm831x_reg_read(wm831x,
 						      WM831X_GPIO_LEVEL);
 			}
@@ -527,10 +546,34 @@ out:
 	return IRQ_HANDLED;
 }
 
+static int wm831x_irq_map(struct irq_domain *h, unsigned int virq,
+			  irq_hw_number_t hw)
+{
+	irq_set_chip_data(virq, h->host_data);
+	irq_set_chip_and_handler(virq, &wm831x_irq_chip, handle_edge_irq);
+	irq_set_nested_thread(virq, 1);
+
+	/* ARM needs us to explicitly flag the IRQ as valid
+	 * and will set them noprobe when we do so. */
+#ifdef CONFIG_ARM
+	set_irq_flags(virq, IRQF_VALID);
+#else
+	irq_set_noprobe(virq);
+#endif
+
+	return 0;
+}
+
+static struct irq_domain_ops wm831x_irq_domain_ops = {
+	.map	= wm831x_irq_map,
+	.xlate	= irq_domain_xlate_twocell,
+};
+
 int wm831x_irq_init(struct wm831x *wm831x, int irq)
 {
 	struct wm831x_pdata *pdata = wm831x->dev->platform_data;
-	int i, cur_irq, ret;
+	struct irq_domain *domain;
+	int i, ret, irq_base;
 
 	mutex_init(&wm831x->irq_lock);
 
@@ -543,18 +586,33 @@ int wm831x_irq_init(struct wm831x *wm831x, int irq)
 	}
 
 	/* Try to dynamically allocate IRQs if no base is specified */
-	if (!pdata || !pdata->irq_base)
-		wm831x->irq_base = -1;
+	if (pdata && pdata->irq_base) {
+		irq_base = irq_alloc_descs(pdata->irq_base, 0,
+					   WM831X_NUM_IRQS, 0);
+		if (irq_base < 0) {
+			dev_warn(wm831x->dev, "Failed to allocate IRQs: %d\n",
+				 irq_base);
+			irq_base = 0;
+		}
+	} else {
+		irq_base = 0;
+	}
+
+	if (irq_base)
+		domain = irq_domain_add_legacy(wm831x->dev->of_node,
+					       ARRAY_SIZE(wm831x_irqs),
+					       irq_base, 0,
+					       &wm831x_irq_domain_ops,
+					       wm831x);
 	else
-		wm831x->irq_base = pdata->irq_base;
+		domain = irq_domain_add_linear(wm831x->dev->of_node,
+					       ARRAY_SIZE(wm831x_irqs),
+					       &wm831x_irq_domain_ops,
+					       wm831x);
 
-	wm831x->irq_base = irq_alloc_descs(wm831x->irq_base, 0,
-					   WM831X_NUM_IRQS, 0);
-	if (wm831x->irq_base < 0) {
-		dev_warn(wm831x->dev, "Failed to allocate IRQs: %d\n",
-			 wm831x->irq_base);
-		wm831x->irq_base = 0;
-		return 0;
+	if (!domain) {
+		dev_warn(wm831x->dev, "Failed to allocate IRQ domain\n");
+		return -EINVAL;
 	}
 
 	if (pdata && pdata->irq_cmos)
@@ -565,38 +623,22 @@ int wm831x_irq_init(struct wm831x *wm831x, int irq)
 	wm831x_set_bits(wm831x, WM831X_IRQ_CONFIG,
 			WM831X_IRQ_OD, i);
 
-	/* Try to flag /IRQ as a wake source; there are a number of
-	 * unconditional wake sources in the PMIC so this isn't
-	 * conditional but we don't actually care *too* much if it
-	 * fails.
-	 */
-	ret = enable_irq_wake(irq);
-	if (ret != 0) {
-		dev_warn(wm831x->dev, "Can't enable IRQ as wake source: %d\n",
-			 ret);
-	}
-
 	wm831x->irq = irq;
-
-	/* Register them with genirq */
-	for (cur_irq = wm831x->irq_base;
-	     cur_irq < ARRAY_SIZE(wm831x_irqs) + wm831x->irq_base;
-	     cur_irq++) {
-		irq_set_chip_data(cur_irq, wm831x);
-		irq_set_chip_and_handler(cur_irq, &wm831x_irq_chip,
-					 handle_edge_irq);
-		irq_set_nested_thread(cur_irq, 1);
-
-		/* ARM needs us to explicitly flag the IRQ as valid
-		 * and will set them noprobe when we do so. */
-#ifdef CONFIG_ARM
-		set_irq_flags(cur_irq, IRQF_VALID);
-#else
-		irq_set_noprobe(cur_irq);
-#endif
-	}
+	wm831x->irq_domain = domain;
 
 	if (irq) {
+		/* Try to flag /IRQ as a wake source; there are a number of
+		 * unconditional wake sources in the PMIC so this isn't
+		 * conditional but we don't actually care *too* much if it
+		 * fails.
+		 */
+		ret = enable_irq_wake(irq);
+		if (ret != 0) {
+			dev_warn(wm831x->dev,
+				 "Can't enable IRQ as wake source: %d\n",
+				 ret);
+		}
+
 		ret = request_threaded_irq(irq, NULL, wm831x_irq_thread,
 					   IRQF_TRIGGER_LOW | IRQF_ONESHOT,
 					   "wm831x", wm831x);
diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c
index dd1caaac55e4..8a9b11ca076a 100644
--- a/drivers/mfd/wm8350-core.c
+++ b/drivers/mfd/wm8350-core.c
@@ -20,6 +20,7 @@
 #include <linux/device.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/regmap.h>
 #include <linux/workqueue.h>
 
 #include <linux/mfd/wm8350/core.h>
@@ -74,7 +75,7 @@ static int wm8350_phys_read(struct wm8350 *wm8350, u8 reg, int num_regs,
 	int bytes = num_regs * 2;
 
 	dev_dbg(wm8350->dev, "volatile read\n");
-	ret = wm8350->read_dev(wm8350, reg, bytes, (char *)dest);
+	ret = regmap_raw_read(wm8350->regmap, reg, dest, bytes);
 
 	for (i = reg; i < reg + num_regs; i++) {
 		/* Cache is CPU endian */
@@ -96,9 +97,6 @@ static int wm8350_read(struct wm8350 *wm8350, u8 reg, int num_regs, u16 *dest)
 	int ret = 0;
 	int bytes = num_regs * 2;
 
-	if (wm8350->read_dev == NULL)
-		return -ENODEV;
-
 	if ((reg + num_regs - 1) > WM8350_MAX_REGISTER) {
 		dev_err(wm8350->dev, "invalid reg %x\n",
 			reg + num_regs - 1);
@@ -149,9 +147,6 @@ static int wm8350_write(struct wm8350 *wm8350, u8 reg, int num_regs, u16 *src)
 	int end = reg + num_regs;
 	int bytes = num_regs * 2;
 
-	if (wm8350->write_dev == NULL)
-		return -ENODEV;
-
 	if ((reg + num_regs - 1) > WM8350_MAX_REGISTER) {
 		dev_err(wm8350->dev, "invalid reg %x\n",
 			reg + num_regs - 1);
@@ -182,7 +177,7 @@ static int wm8350_write(struct wm8350 *wm8350, u8 reg, int num_regs, u16 *src)
 	}
 
 	/* Actually write it out */
-	return wm8350->write_dev(wm8350, reg, bytes, (char *)src);
+	return regmap_raw_write(wm8350->regmap, reg, src, bytes);
 }
 
 /*
@@ -515,9 +510,8 @@ static int wm8350_create_cache(struct wm8350 *wm8350, int type, int mode)
 	 * a PMIC so the device many not be in a virgin state and we
 	 * can't rely on the silicon values.
 	 */
-	ret = wm8350->read_dev(wm8350, 0,
-			       sizeof(u16) * (WM8350_MAX_REGISTER + 1),
-			       wm8350->reg_cache);
+	ret = regmap_raw_read(wm8350->regmap, 0, wm8350->reg_cache,
+			      sizeof(u16) * (WM8350_MAX_REGISTER + 1));
 	if (ret < 0) {
 		dev_err(wm8350->dev,
 			"failed to read initial cache values\n");
@@ -570,35 +564,30 @@ int wm8350_device_init(struct wm8350 *wm8350, int irq,
 		       struct wm8350_platform_data *pdata)
 {
 	int ret;
-	u16 id1, id2, mask_rev;
-	u16 cust_id, mode, chip_rev;
+	unsigned int id1, id2, mask_rev;
+	unsigned int cust_id, mode, chip_rev;
 
 	dev_set_drvdata(wm8350->dev, wm8350);
 
 	/* get WM8350 revision and config mode */
-	ret = wm8350->read_dev(wm8350, WM8350_RESET_ID, sizeof(id1), &id1);
+	ret = regmap_read(wm8350->regmap, WM8350_RESET_ID, &id1);
 	if (ret != 0) {
 		dev_err(wm8350->dev, "Failed to read ID: %d\n", ret);
 		goto err;
 	}
 
-	ret = wm8350->read_dev(wm8350, WM8350_ID, sizeof(id2), &id2);
+	ret = regmap_read(wm8350->regmap, WM8350_ID, &id2);
 	if (ret != 0) {
 		dev_err(wm8350->dev, "Failed to read ID: %d\n", ret);
 		goto err;
 	}
 
-	ret = wm8350->read_dev(wm8350, WM8350_REVISION, sizeof(mask_rev),
-			       &mask_rev);
+	ret = regmap_read(wm8350->regmap, WM8350_REVISION, &mask_rev);
 	if (ret != 0) {
 		dev_err(wm8350->dev, "Failed to read revision: %d\n", ret);
 		goto err;
 	}
 
-	id1 = be16_to_cpu(id1);
-	id2 = be16_to_cpu(id2);
-	mask_rev = be16_to_cpu(mask_rev);
-
 	if (id1 != 0x6143) {
 		dev_err(wm8350->dev,
 			"Device with ID %x is not a WM8350\n", id1);
diff --git a/drivers/mfd/wm8350-i2c.c b/drivers/mfd/wm8350-i2c.c
index d955faaf27c4..a68aceb4e48c 100644
--- a/drivers/mfd/wm8350-i2c.c
+++ b/drivers/mfd/wm8350-i2c.c
@@ -15,47 +15,18 @@
 
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/err.h>
 #include <linux/init.h>
 #include <linux/i2c.h>
 #include <linux/platform_device.h>
 #include <linux/mfd/wm8350/core.h>
+#include <linux/regmap.h>
 #include <linux/slab.h>
 
-static int wm8350_i2c_read_device(struct wm8350 *wm8350, char reg,
-				  int bytes, void *dest)
-{
-	int ret;
-
-	ret = i2c_master_send(wm8350->i2c_client, &reg, 1);
-	if (ret < 0)
-		return ret;
-	ret = i2c_master_recv(wm8350->i2c_client, dest, bytes);
-	if (ret < 0)
-		return ret;
-	if (ret != bytes)
-		return -EIO;
-	return 0;
-}
-
-static int wm8350_i2c_write_device(struct wm8350 *wm8350, char reg,
-				   int bytes, void *src)
-{
-	/* we add 1 byte for device register */
-	u8 msg[(WM8350_MAX_REGISTER << 1) + 1];
-	int ret;
-
-	if (bytes > ((WM8350_MAX_REGISTER << 1) + 1))
-		return -EINVAL;
-
-	msg[0] = reg;
-	memcpy(&msg[1], src, bytes);
-	ret = i2c_master_send(wm8350->i2c_client, msg, bytes + 1);
-	if (ret < 0)
-		return ret;
-	if (ret != bytes + 1)
-		return -EIO;
-	return 0;
-}
+static const struct regmap_config wm8350_regmap = {
+	.reg_bits = 8,
+	.val_bits = 16,
+};
 
 static int wm8350_i2c_probe(struct i2c_client *i2c,
 			    const struct i2c_device_id *id)
@@ -67,20 +38,18 @@ static int wm8350_i2c_probe(struct i2c_client *i2c,
 	if (wm8350 == NULL)
 		return -ENOMEM;
 
+	wm8350->regmap = devm_regmap_init_i2c(i2c, &wm8350_regmap);
+	if (IS_ERR(wm8350->regmap)) {
+		ret = PTR_ERR(wm8350->regmap);
+		dev_err(&i2c->dev, "Failed to allocate register map: %d\n",
+			ret);
+		return ret;
+	}
+
 	i2c_set_clientdata(i2c, wm8350);
 	wm8350->dev = &i2c->dev;
-	wm8350->i2c_client = i2c;
-	wm8350->read_dev = wm8350_i2c_read_device;
-	wm8350->write_dev = wm8350_i2c_write_device;
-
-	ret = wm8350_device_init(wm8350, i2c->irq, i2c->dev.platform_data);
-	if (ret < 0)
-		goto err;
-
-	return ret;
 
-err:
-	return ret;
+	return wm8350_device_init(wm8350, i2c->irq, i2c->dev.platform_data);
 }
 
 static int wm8350_i2c_remove(struct i2c_client *i2c)
diff --git a/drivers/mfd/wm8400-core.c b/drivers/mfd/wm8400-core.c
index 1189a17f0f25..4b7d378551d5 100644
--- a/drivers/mfd/wm8400-core.c
+++ b/drivers/mfd/wm8400-core.c
@@ -23,136 +23,16 @@
 #include <linux/regmap.h>
 #include <linux/slab.h>
 
-static struct {
-	u16  readable;    /* Mask of readable bits */
-	u16  writable;    /* Mask of writable bits */
-	u16  vol;         /* Mask of volatile bits */
-	int  is_codec;    /* Register controlled by codec reset */
-	u16  default_val; /* Value on reset */
-} reg_data[] = {
-	{ 0xFFFF, 0xFFFF, 0x0000, 0, 0x6172 }, /* R0 */
-	{ 0x7000, 0x0000, 0x8000, 0, 0x0000 }, /* R1 */
-	{ 0xFF17, 0xFF17, 0x0000, 0, 0x0000 }, /* R2 */
-	{ 0xEBF3, 0xEBF3, 0x0000, 1, 0x6000 }, /* R3 */
-	{ 0x3CF3, 0x3CF3, 0x0000, 1, 0x0000 }, /* R4  */
-	{ 0xF1F8, 0xF1F8, 0x0000, 1, 0x4050 }, /* R5  */
-	{ 0xFC1F, 0xFC1F, 0x0000, 1, 0x4000 }, /* R6  */
-	{ 0xDFDE, 0xDFDE, 0x0000, 1, 0x01C8 }, /* R7  */
-	{ 0xFCFC, 0xFCFC, 0x0000, 1, 0x0000 }, /* R8  */
-	{ 0xEFFF, 0xEFFF, 0x0000, 1, 0x0040 }, /* R9  */
-	{ 0xEFFF, 0xEFFF, 0x0000, 1, 0x0040 }, /* R10 */
-	{ 0x27F7, 0x27F7, 0x0000, 1, 0x0004 }, /* R11 */
-	{ 0x01FF, 0x01FF, 0x0000, 1, 0x00C0 }, /* R12 */
-	{ 0x01FF, 0x01FF, 0x0000, 1, 0x00C0 }, /* R13 */
-	{ 0x1FEF, 0x1FEF, 0x0000, 1, 0x0000 }, /* R14 */
-	{ 0x0163, 0x0163, 0x0000, 1, 0x0100 }, /* R15 */
-	{ 0x01FF, 0x01FF, 0x0000, 1, 0x00C0 }, /* R16 */
-	{ 0x01FF, 0x01FF, 0x0000, 1, 0x00C0 }, /* R17 */
-	{ 0x1FFF, 0x0FFF, 0x0000, 1, 0x0000 }, /* R18 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 1, 0x1000 }, /* R19 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 1, 0x1010 }, /* R20 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 1, 0x1010 }, /* R21 */
-	{ 0x0FDD, 0x0FDD, 0x0000, 1, 0x8000 }, /* R22 */
-	{ 0x1FFF, 0x1FFF, 0x0000, 1, 0x0800 }, /* R23 */
-	{ 0x0000, 0x01DF, 0x0000, 1, 0x008B }, /* R24 */
-	{ 0x0000, 0x01DF, 0x0000, 1, 0x008B }, /* R25 */
-	{ 0x0000, 0x01DF, 0x0000, 1, 0x008B }, /* R26 */
-	{ 0x0000, 0x01DF, 0x0000, 1, 0x008B }, /* R27 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R28 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R29 */
-	{ 0x0000, 0x0077, 0x0000, 1, 0x0066 }, /* R30 */
-	{ 0x0000, 0x0033, 0x0000, 1, 0x0022 }, /* R31 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0079 }, /* R32 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0079 }, /* R33 */
-	{ 0x0000, 0x0003, 0x0000, 1, 0x0003 }, /* R34 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0003 }, /* R35 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R36 */
-	{ 0x0000, 0x003F, 0x0000, 1, 0x0100 }, /* R37 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R38 */
-	{ 0x0000, 0x000F, 0x0000, 0, 0x0000 }, /* R39 */
-	{ 0x0000, 0x00FF, 0x0000, 1, 0x0000 }, /* R40 */
-	{ 0x0000, 0x01B7, 0x0000, 1, 0x0000 }, /* R41 */
-	{ 0x0000, 0x01B7, 0x0000, 1, 0x0000 }, /* R42 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R43 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R44 */
-	{ 0x0000, 0x00FD, 0x0000, 1, 0x0000 }, /* R45 */
-	{ 0x0000, 0x00FD, 0x0000, 1, 0x0000 }, /* R46 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R47 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R48 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R49 */
-	{ 0x0000, 0x01FF, 0x0000, 1, 0x0000 }, /* R50 */
-	{ 0x0000, 0x01B3, 0x0000, 1, 0x0180 }, /* R51 */
-	{ 0x0000, 0x0077, 0x0000, 1, 0x0000 }, /* R52 */
-	{ 0x0000, 0x0077, 0x0000, 1, 0x0000 }, /* R53 */
-	{ 0x0000, 0x00FF, 0x0000, 1, 0x0000 }, /* R54 */
-	{ 0x0000, 0x0001, 0x0000, 1, 0x0000 }, /* R55 */
-	{ 0x0000, 0x003F, 0x0000, 1, 0x0000 }, /* R56 */
-	{ 0x0000, 0x004F, 0x0000, 1, 0x0000 }, /* R57 */
-	{ 0x0000, 0x00FD, 0x0000, 1, 0x0000 }, /* R58 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R59 */
-	{ 0x1FFF, 0x1FFF, 0x0000, 1, 0x0000 }, /* R60 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 1, 0x0000 }, /* R61 */
-	{ 0x03FF, 0x03FF, 0x0000, 1, 0x0000 }, /* R62 */
-	{ 0x007F, 0x007F, 0x0000, 1, 0x0000 }, /* R63 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R64 */
-	{ 0xDFFF, 0xDFFF, 0x0000, 0, 0x0000 }, /* R65 */
-	{ 0xDFFF, 0xDFFF, 0x0000, 0, 0x0000 }, /* R66 */
-	{ 0xDFFF, 0xDFFF, 0x0000, 0, 0x0000 }, /* R67 */
-	{ 0xDFFF, 0xDFFF, 0x0000, 0, 0x0000 }, /* R68 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R69 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 0, 0x4400 }, /* R70 */
-	{ 0x23FF, 0x23FF, 0x0000, 0, 0x0000 }, /* R71 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 0, 0x4400 }, /* R72 */
-	{ 0x23FF, 0x23FF, 0x0000, 0, 0x0000 }, /* R73 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R74 */
-	{ 0x000E, 0x000E, 0x0000, 0, 0x0008 }, /* R75 */
-	{ 0xE00F, 0xE00F, 0x0000, 0, 0x0000 }, /* R76 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R77 */
-	{ 0x03C0, 0x03C0, 0x0000, 0, 0x02C0 }, /* R78 */
-	{ 0xFFFF, 0x0000, 0xffff, 0, 0x0000 }, /* R79 */
-	{ 0xFFFF, 0xFFFF, 0x0000, 0, 0x0000 }, /* R80 */
-	{ 0xFFFF, 0x0000, 0xffff, 0, 0x0000 }, /* R81 */
-	{ 0x2BFF, 0x0000, 0xffff, 0, 0x0000 }, /* R82 */
-	{ 0x0000, 0x0000, 0x0000, 0, 0x0000 }, /* R83 */
-	{ 0x80FF, 0x80FF, 0x0000, 0, 0x00ff }, /* R84 */
-};
-
-static int wm8400_read(struct wm8400 *wm8400, u8 reg, int num_regs, u16 *dest)
+static bool wm8400_volatile(struct device *dev, unsigned int reg)
 {
-	int i, ret = 0;
-
-	BUG_ON(reg + num_regs > ARRAY_SIZE(wm8400->reg_cache));
-
-	/* If there are any volatile reads then read back the entire block */
-	for (i = reg; i < reg + num_regs; i++)
-		if (reg_data[i].vol) {
-			ret = regmap_bulk_read(wm8400->regmap, reg, dest,
-					       num_regs);
-			return ret;
-		}
-
-	/* Otherwise use the cache */
-	memcpy(dest, &wm8400->reg_cache[reg], num_regs * sizeof(u16));
-
-	return 0;
-}
-
-static int wm8400_write(struct wm8400 *wm8400, u8 reg, int num_regs,
-			u16 *src)
-{
-	int ret, i;
-
-	BUG_ON(reg + num_regs > ARRAY_SIZE(wm8400->reg_cache));
-
-	for (i = 0; i < num_regs; i++) {
-		BUG_ON(!reg_data[reg + i].writable);
-		wm8400->reg_cache[reg + i] = src[i];
-		ret = regmap_write(wm8400->regmap, reg, src[i]);
-		if (ret != 0)
-			return ret;
+	switch (reg) {
+	case WM8400_INTERRUPT_STATUS_1:
+	case WM8400_INTERRUPT_LEVELS:
+	case WM8400_SHUTDOWN_REASON:
+		return true;
+	default:
+		return false;
 	}
-
-	return 0;
 }
 
 /**
@@ -165,13 +45,12 @@ static int wm8400_write(struct wm8400 *wm8400, u8 reg, int num_regs,
  */
 u16 wm8400_reg_read(struct wm8400 *wm8400, u8 reg)
 {
-	u16 val;
-
-	mutex_lock(&wm8400->io_lock);
-
-	wm8400_read(wm8400, reg, 1, &val);
+	unsigned int val;
+	int ret;
 
-	mutex_unlock(&wm8400->io_lock);
+	ret = regmap_read(wm8400->regmap, reg, &val);
+	if (ret < 0)
+		return ret;
 
 	return val;
 }
@@ -179,63 +58,10 @@ EXPORT_SYMBOL_GPL(wm8400_reg_read);
 
 int wm8400_block_read(struct wm8400 *wm8400, u8 reg, int count, u16 *data)
 {
-	int ret;
-
-	mutex_lock(&wm8400->io_lock);
-
-	ret = wm8400_read(wm8400, reg, count, data);
-
-	mutex_unlock(&wm8400->io_lock);
-
-	return ret;
+	return regmap_bulk_read(wm8400->regmap, reg, data, count);
 }
 EXPORT_SYMBOL_GPL(wm8400_block_read);
 
-/**
- * wm8400_set_bits - Bitmask write
- *
- * @wm8400: Pointer to wm8400 control structure
- * @reg:    Register to access
- * @mask:   Mask of bits to change
- * @val:    Value to set for masked bits
- */
-int wm8400_set_bits(struct wm8400 *wm8400, u8 reg, u16 mask, u16 val)
-{
-	u16 tmp;
-	int ret;
-
-	mutex_lock(&wm8400->io_lock);
-
-	ret = wm8400_read(wm8400, reg, 1, &tmp);
-	tmp = (tmp & ~mask) | val;
-	if (ret == 0)
-		ret = wm8400_write(wm8400, reg, 1, &tmp);
-
-	mutex_unlock(&wm8400->io_lock);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(wm8400_set_bits);
-
-/**
- * wm8400_reset_codec_reg_cache - Reset cached codec registers to
- * their default values.
- */
-void wm8400_reset_codec_reg_cache(struct wm8400 *wm8400)
-{
-	int i;
-
-	mutex_lock(&wm8400->io_lock);
-
-	/* Reset all codec registers to their initial value */
-	for (i = 0; i < ARRAY_SIZE(wm8400->reg_cache); i++)
-		if (reg_data[i].is_codec)
-			wm8400->reg_cache[i] = reg_data[i].default_val;
-
-	mutex_unlock(&wm8400->io_lock);
-}
-EXPORT_SYMBOL_GPL(wm8400_reset_codec_reg_cache);
-
 static int wm8400_register_codec(struct wm8400 *wm8400)
 {
 	struct mfd_cell cell = {
@@ -257,44 +83,24 @@ static int wm8400_register_codec(struct wm8400 *wm8400)
 static int wm8400_init(struct wm8400 *wm8400,
 		       struct wm8400_platform_data *pdata)
 {
-	u16 reg;
-	int ret, i;
-
-	mutex_init(&wm8400->io_lock);
+	unsigned int reg;
+	int ret;
 
 	dev_set_drvdata(wm8400->dev, wm8400);
 
 	/* Check that this is actually a WM8400 */
-	ret = regmap_read(wm8400->regmap, WM8400_RESET_ID, &i);
+	ret = regmap_read(wm8400->regmap, WM8400_RESET_ID, &reg);
 	if (ret != 0) {
 		dev_err(wm8400->dev, "Chip ID register read failed\n");
 		return -EIO;
 	}
-	if (i != reg_data[WM8400_RESET_ID].default_val) {
-		dev_err(wm8400->dev, "Device is not a WM8400, ID is %x\n", i);
+	if (reg != 0x6172) {
+		dev_err(wm8400->dev, "Device is not a WM8400, ID is %x\n",
+			reg);
 		return -ENODEV;
 	}
 
-	/* We don't know what state the hardware is in and since this
-	 * is a PMIC we can't reset it safely so initialise the register
-	 * cache from the hardware.
-	 */
-	ret = regmap_raw_read(wm8400->regmap, 0, wm8400->reg_cache,
-			      ARRAY_SIZE(wm8400->reg_cache));
-	if (ret != 0) {
-		dev_err(wm8400->dev, "Register cache read failed\n");
-		return -EIO;
-	}
-	for (i = 0; i < ARRAY_SIZE(wm8400->reg_cache); i++)
-		wm8400->reg_cache[i] = be16_to_cpu(wm8400->reg_cache[i]);
-
-	/* If the codec is in reset use hard coded values */
-	if (!(wm8400->reg_cache[WM8400_POWER_MANAGEMENT_1] & WM8400_CODEC_ENA))
-		for (i = 0; i < ARRAY_SIZE(wm8400->reg_cache); i++)
-			if (reg_data[i].is_codec)
-				wm8400->reg_cache[i] = reg_data[i].default_val;
-
-	ret = wm8400_read(wm8400, WM8400_ID, 1, &reg);
+	ret = regmap_read(wm8400->regmap, WM8400_ID, &reg);
 	if (ret != 0) {
 		dev_err(wm8400->dev, "ID register read failed: %d\n", ret);
 		return ret;
@@ -334,8 +140,22 @@ static const struct regmap_config wm8400_regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 16,
 	.max_register = WM8400_REGISTER_COUNT - 1,
+
+	.volatile_reg = wm8400_volatile,
+
+	.cache_type = REGCACHE_RBTREE,
 };
 
+/**
+ * wm8400_reset_codec_reg_cache - Reset cached codec registers to
+ * their default values.
+ */
+void wm8400_reset_codec_reg_cache(struct wm8400 *wm8400)
+{
+	regmap_reinit_cache(wm8400->regmap, &wm8400_regmap_config);
+}
+EXPORT_SYMBOL_GPL(wm8400_reset_codec_reg_cache);
+
 #if defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE)
 static int wm8400_i2c_probe(struct i2c_client *i2c,
 			    const struct i2c_device_id *id)
diff --git a/drivers/mfd/wm8994-core.c b/drivers/mfd/wm8994-core.c
index 9d7ca1e978fa..1e321d349777 100644
--- a/drivers/mfd/wm8994-core.c
+++ b/drivers/mfd/wm8994-core.c
@@ -500,7 +500,8 @@ static __devinit int wm8994_device_init(struct wm8994 *wm8994, int irq)
 			ret);
 		goto err_enable;
 	}
-	wm8994->revision = ret;
+	wm8994->revision = ret & WM8994_CHIP_REV_MASK;
+	wm8994->cust_id = (ret & WM8994_CUST_ID_MASK) >> WM8994_CUST_ID_SHIFT;
 
 	switch (wm8994->type) {
 	case WM8994:
@@ -553,8 +554,8 @@ static __devinit int wm8994_device_init(struct wm8994 *wm8994, int irq)
 		break;
 	}
 
-	dev_info(wm8994->dev, "%s revision %c\n", devname,
-		 'A' + wm8994->revision);
+	dev_info(wm8994->dev, "%s revision %c CUST_ID %02x\n", devname,
+		 'A' + wm8994->revision, wm8994->cust_id);
 
 	switch (wm8994->type) {
 	case WM1811:
@@ -732,23 +733,7 @@ static struct i2c_driver wm8994_i2c_driver = {
 	.id_table = wm8994_i2c_id,
 };
 
-static int __init wm8994_i2c_init(void)
-{
-	int ret;
-
-	ret = i2c_add_driver(&wm8994_i2c_driver);
-	if (ret != 0)
-		pr_err("Failed to register wm8994 I2C driver: %d\n", ret);
-
-	return ret;
-}
-module_init(wm8994_i2c_init);
-
-static void __exit wm8994_i2c_exit(void)
-{
-	i2c_del_driver(&wm8994_i2c_driver);
-}
-module_exit(wm8994_i2c_exit);
+module_i2c_driver(wm8994_i2c_driver);
 
 MODULE_DESCRIPTION("Core support for the WM8994 audio CODEC");
 MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/wm8994-regmap.c b/drivers/mfd/wm8994-regmap.c
index bfd25af6ecb1..52e9e2944940 100644
--- a/drivers/mfd/wm8994-regmap.c
+++ b/drivers/mfd/wm8994-regmap.c
@@ -1122,7 +1122,6 @@ static bool wm8994_volatile_register(struct device *dev, unsigned int reg)
 	case WM8994_RATE_STATUS:
 	case WM8958_MIC_DETECT_3:
 	case WM8994_DC_SERVO_4E:
-	case WM8994_CHIP_REVISION:
 	case WM8994_INTERRUPT_STATUS_1:
 	case WM8994_INTERRUPT_STATUS_2:
 		return true;
diff --git a/drivers/misc/ab8500-pwm.c b/drivers/misc/ab8500-pwm.c
index d7a9aa14e5d5..042a8fe4efaa 100644
--- a/drivers/misc/ab8500-pwm.c
+++ b/drivers/misc/ab8500-pwm.c
@@ -142,10 +142,16 @@ static int __devexit ab8500_pwm_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct of_device_id ab8500_pwm_match[] = {
+	{ .compatible = "stericsson,ab8500-pwm", },
+	{}
+};
+
 static struct platform_driver ab8500_pwm_driver = {
 	.driver = {
 		.name = "ab8500-pwm",
 		.owner = THIS_MODULE,
+		.of_match_table = ab8500_pwm_match,
 	},
 	.probe = ab8500_pwm_probe,
 	.remove = __devexit_p(ab8500_pwm_remove),
diff --git a/drivers/mtd/maps/lantiq-flash.c b/drivers/mtd/maps/lantiq-flash.c
index b5401e355745..c03456f17004 100644
--- a/drivers/mtd/maps/lantiq-flash.c
+++ b/drivers/mtd/maps/lantiq-flash.c
@@ -19,9 +19,9 @@
 #include <linux/mtd/cfi.h>
 #include <linux/platform_device.h>
 #include <linux/mtd/physmap.h>
+#include <linux/of.h>
 
 #include <lantiq_soc.h>
-#include <lantiq_platform.h>
 
 /*
  * The NOR flash is connected to the same external bus unit (EBU) as PCI.
@@ -44,8 +44,9 @@ struct ltq_mtd {
 	struct map_info *map;
 };
 
-static char ltq_map_name[] = "ltq_nor";
-static const char *ltq_probe_types[] __devinitconst = { "cmdlinepart", NULL };
+static const char ltq_map_name[] = "ltq_nor";
+static const char *ltq_probe_types[] __devinitconst = {
+					"cmdlinepart", "ofpart", NULL };
 
 static map_word
 ltq_read16(struct map_info *map, unsigned long adr)
@@ -108,42 +109,38 @@ ltq_copy_to(struct map_info *map, unsigned long to,
 	spin_unlock_irqrestore(&ebu_lock, flags);
 }
 
-static int __init
+static int __devinit
 ltq_mtd_probe(struct platform_device *pdev)
 {
-	struct physmap_flash_data *ltq_mtd_data = dev_get_platdata(&pdev->dev);
+	struct mtd_part_parser_data ppdata;
 	struct ltq_mtd *ltq_mtd;
-	struct resource *res;
 	struct cfi_private *cfi;
 	int err;
 
+	if (of_machine_is_compatible("lantiq,falcon") &&
+			(ltq_boot_select() != BS_FLASH)) {
+		dev_err(&pdev->dev, "invalid bootstrap options\n");
+		return -ENODEV;
+	}
+
 	ltq_mtd = kzalloc(sizeof(struct ltq_mtd), GFP_KERNEL);
 	platform_set_drvdata(pdev, ltq_mtd);
 
 	ltq_mtd->res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!ltq_mtd->res) {
-		dev_err(&pdev->dev, "failed to get memory resource");
+		dev_err(&pdev->dev, "failed to get memory resource\n");
 		err = -ENOENT;
 		goto err_out;
 	}
 
-	res = devm_request_mem_region(&pdev->dev, ltq_mtd->res->start,
-		resource_size(ltq_mtd->res), dev_name(&pdev->dev));
-	if (!ltq_mtd->res) {
-		dev_err(&pdev->dev, "failed to request mem resource");
-		err = -EBUSY;
-		goto err_out;
-	}
-
 	ltq_mtd->map = kzalloc(sizeof(struct map_info), GFP_KERNEL);
-	ltq_mtd->map->phys = res->start;
-	ltq_mtd->map->size = resource_size(res);
-	ltq_mtd->map->virt = devm_ioremap_nocache(&pdev->dev,
-				ltq_mtd->map->phys, ltq_mtd->map->size);
+	ltq_mtd->map->phys = ltq_mtd->res->start;
+	ltq_mtd->map->size = resource_size(ltq_mtd->res);
+	ltq_mtd->map->virt = devm_request_and_ioremap(&pdev->dev, ltq_mtd->res);
 	if (!ltq_mtd->map->virt) {
-		dev_err(&pdev->dev, "failed to ioremap!\n");
-		err = -ENOMEM;
-		goto err_free;
+		dev_err(&pdev->dev, "failed to remap mem resource\n");
+		err = -EBUSY;
+		goto err_out;
 	}
 
 	ltq_mtd->map->name = ltq_map_name;
@@ -169,9 +166,9 @@ ltq_mtd_probe(struct platform_device *pdev)
 	cfi->addr_unlock1 ^= 1;
 	cfi->addr_unlock2 ^= 1;
 
-	err = mtd_device_parse_register(ltq_mtd->mtd, ltq_probe_types, NULL,
-					ltq_mtd_data->parts,
-					ltq_mtd_data->nr_parts);
+	ppdata.of_node = pdev->dev.of_node;
+	err = mtd_device_parse_register(ltq_mtd->mtd, ltq_probe_types,
+					&ppdata, NULL, 0);
 	if (err) {
 		dev_err(&pdev->dev, "failed to add partitions\n");
 		goto err_destroy;
@@ -204,32 +201,23 @@ ltq_mtd_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct of_device_id ltq_mtd_match[] = {
+	{ .compatible = "lantiq,nor" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, ltq_mtd_match);
+
 static struct platform_driver ltq_mtd_driver = {
+	.probe = ltq_mtd_probe,
 	.remove = __devexit_p(ltq_mtd_remove),
 	.driver = {
-		.name = "ltq_nor",
+		.name = "ltq-nor",
 		.owner = THIS_MODULE,
+		.of_match_table = ltq_mtd_match,
 	},
 };
 
-static int __init
-init_ltq_mtd(void)
-{
-	int ret = platform_driver_probe(&ltq_mtd_driver, ltq_mtd_probe);
-
-	if (ret)
-		pr_err("ltq_nor: error registering platform driver");
-	return ret;
-}
-
-static void __exit
-exit_ltq_mtd(void)
-{
-	platform_driver_unregister(&ltq_mtd_driver);
-}
-
-module_init(init_ltq_mtd);
-module_exit(exit_ltq_mtd);
+module_platform_driver(ltq_mtd_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("John Crispin <[email protected]>");
diff --git a/drivers/of/of_pci_irq.c b/drivers/of/of_pci_irq.c
index 93125163dea2..677053813211 100644
--- a/drivers/of/of_pci_irq.c
+++ b/drivers/of/of_pci_irq.c
@@ -15,7 +15,7 @@
  * PCI tree until an device-node is found, at which point it will finish
  * resolving using the OF tree walking.
  */
-int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
+int of_irq_map_pci(const struct pci_dev *pdev, struct of_irq *out_irq)
 {
 	struct device_node *dn, *ppnode;
 	struct pci_dev *ppdev;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8f169002dc7e..447e83472c01 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2370,7 +2370,7 @@ void pci_enable_acs(struct pci_dev *dev)
  * number is always 0 (see the Implementation Note in section 2.2.8.1 of
  * the PCI Express Base Specification, Revision 2.1)
  */
-u8 pci_swizzle_interrupt_pin(struct pci_dev *dev, u8 pin)
+u8 pci_swizzle_interrupt_pin(const struct pci_dev *dev, u8 pin)
 {
 	int slot;
 
diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c
index ee79ce64d9df..57787d87d9a4 100644
--- a/drivers/platform/x86/toshiba_acpi.c
+++ b/drivers/platform/x86/toshiba_acpi.c
@@ -1104,6 +1104,7 @@ static int __devinit toshiba_acpi_add(struct acpi_device *acpi_dev)
 
 	mutex_init(&dev->mutex);
 
+	memset(&props, 0, sizeof(props));
 	props.type = BACKLIGHT_PLATFORM;
 	props.max_brightness = HCI_LCD_BRIGHTNESS_LEVELS - 1;
 	dev->backlight_dev = backlight_device_register("toshiba",
diff --git a/drivers/power/wm831x_power.c b/drivers/power/wm831x_power.c
index 987332b71d8d..fc1ad9551182 100644
--- a/drivers/power/wm831x_power.c
+++ b/drivers/power/wm831x_power.c
@@ -565,7 +565,7 @@ static __devinit int wm831x_power_probe(struct platform_device *pdev)
 			    goto err_usb;
 	}
 
-	irq = platform_get_irq_byname(pdev, "SYSLO");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "SYSLO"));
 	ret = request_threaded_irq(irq, NULL, wm831x_syslo_irq,
 				   IRQF_TRIGGER_RISING, "System power low",
 				   power);
@@ -575,7 +575,7 @@ static __devinit int wm831x_power_probe(struct platform_device *pdev)
 		goto err_battery;
 	}
 
-	irq = platform_get_irq_byname(pdev, "PWR SRC");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "PWR SRC"));
 	ret = request_threaded_irq(irq, NULL, wm831x_pwr_src_irq,
 				   IRQF_TRIGGER_RISING, "Power source",
 				   power);
@@ -586,7 +586,9 @@ static __devinit int wm831x_power_probe(struct platform_device *pdev)
 	}
 
 	for (i = 0; i < ARRAY_SIZE(wm831x_bat_irqs); i++) {
-		irq = platform_get_irq_byname(pdev, wm831x_bat_irqs[i]);
+		irq = wm831x_irq(wm831x,
+				 platform_get_irq_byname(pdev,
+							 wm831x_bat_irqs[i]));
 		ret = request_threaded_irq(irq, NULL, wm831x_bat_irq,
 					   IRQF_TRIGGER_RISING,
 					   wm831x_bat_irqs[i],
@@ -606,10 +608,10 @@ err_bat_irq:
 		irq = platform_get_irq_byname(pdev, wm831x_bat_irqs[i]);
 		free_irq(irq, power);
 	}
-	irq = platform_get_irq_byname(pdev, "PWR SRC");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "PWR SRC"));
 	free_irq(irq, power);
 err_syslo:
-	irq = platform_get_irq_byname(pdev, "SYSLO");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "SYSLO"));
 	free_irq(irq, power);
 err_battery:
 	if (power->have_battery)
@@ -626,17 +628,20 @@ err_kmalloc:
 static __devexit int wm831x_power_remove(struct platform_device *pdev)
 {
 	struct wm831x_power *wm831x_power = platform_get_drvdata(pdev);
+	struct wm831x *wm831x = wm831x_power->wm831x;
 	int irq, i;
 
 	for (i = 0; i < ARRAY_SIZE(wm831x_bat_irqs); i++) {
-		irq = platform_get_irq_byname(pdev, wm831x_bat_irqs[i]);
+		irq = wm831x_irq(wm831x, 
+				 platform_get_irq_byname(pdev,
+							 wm831x_bat_irqs[i]));
 		free_irq(irq, wm831x_power);
 	}
 
-	irq = platform_get_irq_byname(pdev, "PWR SRC");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "PWR SRC"));
 	free_irq(irq, wm831x_power);
 
-	irq = platform_get_irq_byname(pdev, "SYSLO");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "SYSLO"));
 	free_irq(irq, wm831x_power);
 
 	if (wm831x_power->have_battery)
diff --git a/drivers/regulator/anatop-regulator.c b/drivers/regulator/anatop-regulator.c
index 49b2112b0486..3660bace123c 100644
--- a/drivers/regulator/anatop-regulator.c
+++ b/drivers/regulator/anatop-regulator.c
@@ -47,7 +47,7 @@ static int anatop_set_voltage(struct regulator_dev *reg, int min_uV,
 				  int max_uV, unsigned *selector)
 {
 	struct anatop_regulator *anatop_reg = rdev_get_drvdata(reg);
-	u32 val, sel;
+	u32 val, sel, mask;
 	int uv;
 
 	uv = min_uV;
@@ -71,11 +71,10 @@ static int anatop_set_voltage(struct regulator_dev *reg, int min_uV,
 	val = anatop_reg->min_bit_val + sel;
 	*selector = sel;
 	dev_dbg(&reg->dev, "%s: calculated val %d\n", __func__, val);
-	anatop_set_bits(anatop_reg->mfd,
-			anatop_reg->control_reg,
-			anatop_reg->vol_bit_shift,
-			anatop_reg->vol_bit_width,
-			val);
+	mask = ((1 << anatop_reg->vol_bit_width) - 1) <<
+		anatop_reg->vol_bit_shift;
+	val <<= anatop_reg->vol_bit_shift;
+	anatop_write_reg(anatop_reg->mfd, anatop_reg->control_reg, val, mask);
 
 	return 0;
 }
@@ -88,10 +87,9 @@ static int anatop_get_voltage_sel(struct regulator_dev *reg)
 	if (!anatop_reg->control_reg)
 		return -ENOTSUPP;
 
-	val = anatop_get_bits(anatop_reg->mfd,
-			      anatop_reg->control_reg,
-			      anatop_reg->vol_bit_shift,
-			      anatop_reg->vol_bit_width);
+	val = anatop_read_reg(anatop_reg->mfd, anatop_reg->control_reg);
+	val = (val & ((1 << anatop_reg->vol_bit_width) - 1)) >>
+		anatop_reg->vol_bit_shift;
 
 	return val - anatop_reg->min_bit_val;
 }
diff --git a/drivers/regulator/tps65910-regulator.c b/drivers/regulator/tps65910-regulator.c
index 4e01a423471b..6bf864b4bdf6 100644
--- a/drivers/regulator/tps65910-regulator.c
+++ b/drivers/regulator/tps65910-regulator.c
@@ -331,21 +331,16 @@ struct tps65910_reg {
 
 static inline int tps65910_read(struct tps65910_reg *pmic, u8 reg)
 {
-	u8 val;
+	unsigned int val;
 	int err;
 
-	err = pmic->mfd->read(pmic->mfd, reg, 1, &val);
+	err = tps65910_reg_read(pmic->mfd, reg, &val);
 	if (err)
 		return err;
 
 	return val;
 }
 
-static inline int tps65910_write(struct tps65910_reg *pmic, u8 reg, u8 val)
-{
-	return pmic->mfd->write(pmic->mfd, reg, 1, &val);
-}
-
 static int tps65910_modify_bits(struct tps65910_reg *pmic, u8 reg,
 					u8 set_mask, u8 clear_mask)
 {
@@ -362,7 +357,7 @@ static int tps65910_modify_bits(struct tps65910_reg *pmic, u8 reg,
 
 	data &= ~clear_mask;
 	data |= set_mask;
-	err = tps65910_write(pmic, reg, data);
+	err = tps65910_reg_write(pmic->mfd, reg, data);
 	if (err)
 		dev_err(pmic->mfd->dev, "Write for reg 0x%x failed\n", reg);
 
@@ -371,7 +366,7 @@ out:
 	return err;
 }
 
-static int tps65910_reg_read(struct tps65910_reg *pmic, u8 reg)
+static int tps65910_reg_read_locked(struct tps65910_reg *pmic, u8 reg)
 {
 	int data;
 
@@ -385,13 +380,13 @@ static int tps65910_reg_read(struct tps65910_reg *pmic, u8 reg)
 	return data;
 }
 
-static int tps65910_reg_write(struct tps65910_reg *pmic, u8 reg, u8 val)
+static int tps65910_reg_write_locked(struct tps65910_reg *pmic, u8 reg, u8 val)
 {
 	int err;
 
 	mutex_lock(&pmic->mutex);
 
-	err = tps65910_write(pmic, reg, val);
+	err = tps65910_reg_write(pmic->mfd, reg, val);
 	if (err < 0)
 		dev_err(pmic->mfd->dev, "Write for reg 0x%x failed\n", reg);
 
@@ -490,9 +485,9 @@ static int tps65910_set_mode(struct regulator_dev *dev, unsigned int mode)
 							LDO_ST_MODE_BIT);
 	case REGULATOR_MODE_IDLE:
 		value = LDO_ST_ON_BIT | LDO_ST_MODE_BIT;
-		return tps65910_set_bits(mfd, reg, value);
+		return tps65910_reg_set_bits(mfd, reg, value);
 	case REGULATOR_MODE_STANDBY:
-		return tps65910_clear_bits(mfd, reg, LDO_ST_ON_BIT);
+		return tps65910_reg_clear_bits(mfd, reg, LDO_ST_ON_BIT);
 	}
 
 	return -EINVAL;
@@ -507,7 +502,7 @@ static unsigned int tps65910_get_mode(struct regulator_dev *dev)
 	if (reg < 0)
 		return reg;
 
-	value = tps65910_reg_read(pmic, reg);
+	value = tps65910_reg_read_locked(pmic, reg);
 	if (value < 0)
 		return value;
 
@@ -527,28 +522,28 @@ static int tps65910_get_voltage_dcdc_sel(struct regulator_dev *dev)
 
 	switch (id) {
 	case TPS65910_REG_VDD1:
-		opvsel = tps65910_reg_read(pmic, TPS65910_VDD1_OP);
-		mult = tps65910_reg_read(pmic, TPS65910_VDD1);
+		opvsel = tps65910_reg_read_locked(pmic, TPS65910_VDD1_OP);
+		mult = tps65910_reg_read_locked(pmic, TPS65910_VDD1);
 		mult = (mult & VDD1_VGAIN_SEL_MASK) >> VDD1_VGAIN_SEL_SHIFT;
-		srvsel = tps65910_reg_read(pmic, TPS65910_VDD1_SR);
+		srvsel = tps65910_reg_read_locked(pmic, TPS65910_VDD1_SR);
 		sr = opvsel & VDD1_OP_CMD_MASK;
 		opvsel &= VDD1_OP_SEL_MASK;
 		srvsel &= VDD1_SR_SEL_MASK;
 		vselmax = 75;
 		break;
 	case TPS65910_REG_VDD2:
-		opvsel = tps65910_reg_read(pmic, TPS65910_VDD2_OP);
-		mult = tps65910_reg_read(pmic, TPS65910_VDD2);
+		opvsel = tps65910_reg_read_locked(pmic, TPS65910_VDD2_OP);
+		mult = tps65910_reg_read_locked(pmic, TPS65910_VDD2);
 		mult = (mult & VDD2_VGAIN_SEL_MASK) >> VDD2_VGAIN_SEL_SHIFT;
-		srvsel = tps65910_reg_read(pmic, TPS65910_VDD2_SR);
+		srvsel = tps65910_reg_read_locked(pmic, TPS65910_VDD2_SR);
 		sr = opvsel & VDD2_OP_CMD_MASK;
 		opvsel &= VDD2_OP_SEL_MASK;
 		srvsel &= VDD2_SR_SEL_MASK;
 		vselmax = 75;
 		break;
 	case TPS65911_REG_VDDCTRL:
-		opvsel = tps65910_reg_read(pmic, TPS65911_VDDCTRL_OP);
-		srvsel = tps65910_reg_read(pmic, TPS65911_VDDCTRL_SR);
+		opvsel = tps65910_reg_read_locked(pmic, TPS65911_VDDCTRL_OP);
+		srvsel = tps65910_reg_read_locked(pmic, TPS65911_VDDCTRL_SR);
 		sr = opvsel & VDDCTRL_OP_CMD_MASK;
 		opvsel &= VDDCTRL_OP_SEL_MASK;
 		srvsel &= VDDCTRL_SR_SEL_MASK;
@@ -588,7 +583,7 @@ static int tps65910_get_voltage_sel(struct regulator_dev *dev)
 	if (reg < 0)
 		return reg;
 
-	value = tps65910_reg_read(pmic, reg);
+	value = tps65910_reg_read_locked(pmic, reg);
 	if (value < 0)
 		return value;
 
@@ -625,7 +620,7 @@ static int tps65911_get_voltage_sel(struct regulator_dev *dev)
 
 	reg = pmic->get_ctrl_reg(id);
 
-	value = tps65910_reg_read(pmic, reg);
+	value = tps65910_reg_read_locked(pmic, reg);
 
 	switch (id) {
 	case TPS65911_REG_LDO1:
@@ -670,7 +665,7 @@ static int tps65910_set_voltage_dcdc_sel(struct regulator_dev *dev,
 		tps65910_modify_bits(pmic, TPS65910_VDD1,
 				(dcdc_mult << VDD1_VGAIN_SEL_SHIFT),
 						VDD1_VGAIN_SEL_MASK);
-		tps65910_reg_write(pmic, TPS65910_VDD1_OP, vsel);
+		tps65910_reg_write_locked(pmic, TPS65910_VDD1_OP, vsel);
 		break;
 	case TPS65910_REG_VDD2:
 		dcdc_mult = (selector / VDD1_2_NUM_VOLT_FINE) + 1;
@@ -681,11 +676,11 @@ static int tps65910_set_voltage_dcdc_sel(struct regulator_dev *dev,
 		tps65910_modify_bits(pmic, TPS65910_VDD2,
 				(dcdc_mult << VDD2_VGAIN_SEL_SHIFT),
 						VDD1_VGAIN_SEL_MASK);
-		tps65910_reg_write(pmic, TPS65910_VDD2_OP, vsel);
+		tps65910_reg_write_locked(pmic, TPS65910_VDD2_OP, vsel);
 		break;
 	case TPS65911_REG_VDDCTRL:
 		vsel = selector + 3;
-		tps65910_reg_write(pmic, TPS65911_VDDCTRL_OP, vsel);
+		tps65910_reg_write_locked(pmic, TPS65911_VDDCTRL_OP, vsel);
 	}
 
 	return 0;
@@ -936,10 +931,10 @@ static int tps65910_set_ext_sleep_config(struct tps65910_reg *pmic,
 
 	/* External EN1 control */
 	if (ext_sleep_config & TPS65910_SLEEP_CONTROL_EXT_INPUT_EN1)
-		ret = tps65910_set_bits(mfd,
+		ret = tps65910_reg_set_bits(mfd,
 				TPS65910_EN1_LDO_ASS + regoffs, bit_pos);
 	else
-		ret = tps65910_clear_bits(mfd,
+		ret = tps65910_reg_clear_bits(mfd,
 				TPS65910_EN1_LDO_ASS + regoffs, bit_pos);
 	if (ret < 0) {
 		dev_err(mfd->dev,
@@ -949,10 +944,10 @@ static int tps65910_set_ext_sleep_config(struct tps65910_reg *pmic,
 
 	/* External EN2 control */
 	if (ext_sleep_config & TPS65910_SLEEP_CONTROL_EXT_INPUT_EN2)
-		ret = tps65910_set_bits(mfd,
+		ret = tps65910_reg_set_bits(mfd,
 				TPS65910_EN2_LDO_ASS + regoffs, bit_pos);
 	else
-		ret = tps65910_clear_bits(mfd,
+		ret = tps65910_reg_clear_bits(mfd,
 				TPS65910_EN2_LDO_ASS + regoffs, bit_pos);
 	if (ret < 0) {
 		dev_err(mfd->dev,
@@ -964,10 +959,10 @@ static int tps65910_set_ext_sleep_config(struct tps65910_reg *pmic,
 	if ((tps65910_chip_id(mfd) == TPS65910) &&
 			(id >= TPS65910_REG_VDIG1)) {
 		if (ext_sleep_config & TPS65910_SLEEP_CONTROL_EXT_INPUT_EN3)
-			ret = tps65910_set_bits(mfd,
+			ret = tps65910_reg_set_bits(mfd,
 				TPS65910_EN3_LDO_ASS + regoffs, bit_pos);
 		else
-			ret = tps65910_clear_bits(mfd,
+			ret = tps65910_reg_clear_bits(mfd,
 				TPS65910_EN3_LDO_ASS + regoffs, bit_pos);
 		if (ret < 0) {
 			dev_err(mfd->dev,
@@ -979,10 +974,10 @@ static int tps65910_set_ext_sleep_config(struct tps65910_reg *pmic,
 	/* Return if no external control is selected */
 	if (!(ext_sleep_config & EXT_SLEEP_CONTROL)) {
 		/* Clear all sleep controls */
-		ret = tps65910_clear_bits(mfd,
+		ret = tps65910_reg_clear_bits(mfd,
 			TPS65910_SLEEP_KEEP_LDO_ON + regoffs, bit_pos);
 		if (!ret)
-			ret = tps65910_clear_bits(mfd,
+			ret = tps65910_reg_clear_bits(mfd,
 				TPS65910_SLEEP_SET_LDO_OFF + regoffs, bit_pos);
 		if (ret < 0)
 			dev_err(mfd->dev,
@@ -1001,32 +996,33 @@ static int tps65910_set_ext_sleep_config(struct tps65910_reg *pmic,
 				(tps65910_chip_id(mfd) == TPS65911))) {
 		int op_reg_add = pmic->get_ctrl_reg(id) + 1;
 		int sr_reg_add = pmic->get_ctrl_reg(id) + 2;
-		int opvsel = tps65910_reg_read(pmic, op_reg_add);
-		int srvsel = tps65910_reg_read(pmic, sr_reg_add);
+		int opvsel = tps65910_reg_read_locked(pmic, op_reg_add);
+		int srvsel = tps65910_reg_read_locked(pmic, sr_reg_add);
 		if (opvsel & VDD1_OP_CMD_MASK) {
 			u8 reg_val = srvsel & VDD1_OP_SEL_MASK;
-			ret = tps65910_reg_write(pmic, op_reg_add, reg_val);
+			ret = tps65910_reg_write_locked(pmic, op_reg_add,
+							reg_val);
 			if (ret < 0) {
 				dev_err(mfd->dev,
 					"Error in configuring op register\n");
 				return ret;
 			}
 		}
-		ret = tps65910_reg_write(pmic, sr_reg_add, 0);
+		ret = tps65910_reg_write_locked(pmic, sr_reg_add, 0);
 		if (ret < 0) {
 			dev_err(mfd->dev, "Error in settting sr register\n");
 			return ret;
 		}
 	}
 
-	ret = tps65910_clear_bits(mfd,
+	ret = tps65910_reg_clear_bits(mfd,
 			TPS65910_SLEEP_KEEP_LDO_ON + regoffs, bit_pos);
 	if (!ret) {
 		if (ext_sleep_config & TPS65911_SLEEP_CONTROL_EXT_INPUT_SLEEP)
-			ret = tps65910_set_bits(mfd,
+			ret = tps65910_reg_set_bits(mfd,
 				TPS65910_SLEEP_SET_LDO_OFF + regoffs, bit_pos);
 		else
-			ret = tps65910_clear_bits(mfd,
+			ret = tps65910_reg_clear_bits(mfd,
 				TPS65910_SLEEP_SET_LDO_OFF + regoffs, bit_pos);
 	}
 	if (ret < 0)
@@ -1177,7 +1173,7 @@ static __devinit int tps65910_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, pmic);
 
 	/* Give control of all register to control port */
-	tps65910_set_bits(pmic->mfd, TPS65910_DEVCTRL,
+	tps65910_reg_set_bits(pmic->mfd, TPS65910_DEVCTRL,
 				DEVCTRL_SR_CTL_I2C_SEL_MASK);
 
 	switch(tps65910_chip_id(tps65910)) {
diff --git a/drivers/regulator/wm831x-dcdc.c b/drivers/regulator/wm831x-dcdc.c
index a885911bb5fc..099da11e989f 100644
--- a/drivers/regulator/wm831x-dcdc.c
+++ b/drivers/regulator/wm831x-dcdc.c
@@ -535,7 +535,7 @@ static __devinit int wm831x_buckv_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	irq = platform_get_irq_byname(pdev, "UV");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV"));
 	ret = request_threaded_irq(irq, NULL, wm831x_dcdc_uv_irq,
 				   IRQF_TRIGGER_RISING, dcdc->name, dcdc);
 	if (ret != 0) {
@@ -544,7 +544,7 @@ static __devinit int wm831x_buckv_probe(struct platform_device *pdev)
 		goto err_regulator;
 	}
 
-	irq = platform_get_irq_byname(pdev, "HC");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "HC"));
 	ret = request_threaded_irq(irq, NULL, wm831x_dcdc_oc_irq,
 				   IRQF_TRIGGER_RISING, dcdc->name, dcdc);
 	if (ret != 0) {
@@ -558,7 +558,8 @@ static __devinit int wm831x_buckv_probe(struct platform_device *pdev)
 	return 0;
 
 err_uv:
-	free_irq(platform_get_irq_byname(pdev, "UV"), dcdc);
+	free_irq(wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV")),
+		 dcdc);
 err_regulator:
 	regulator_unregister(dcdc->regulator);
 err:
@@ -570,11 +571,14 @@ err:
 static __devexit int wm831x_buckv_remove(struct platform_device *pdev)
 {
 	struct wm831x_dcdc *dcdc = platform_get_drvdata(pdev);
+	struct wm831x *wm831x = dcdc->wm831x;
 
 	platform_set_drvdata(pdev, NULL);
 
-	free_irq(platform_get_irq_byname(pdev, "HC"), dcdc);
-	free_irq(platform_get_irq_byname(pdev, "UV"), dcdc);
+	free_irq(wm831x_irq(wm831x, platform_get_irq_byname(pdev, "HC")),
+			    dcdc);
+	free_irq(wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV")),
+			    dcdc);
 	regulator_unregister(dcdc->regulator);
 	if (dcdc->dvs_gpio)
 		gpio_free(dcdc->dvs_gpio);
@@ -726,7 +730,7 @@ static __devinit int wm831x_buckp_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	irq = platform_get_irq_byname(pdev, "UV");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV"));
 	ret = request_threaded_irq(irq, NULL, wm831x_dcdc_uv_irq,
 				   IRQF_TRIGGER_RISING,	dcdc->name, dcdc);
 	if (ret != 0) {
@@ -751,7 +755,8 @@ static __devexit int wm831x_buckp_remove(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, NULL);
 
-	free_irq(platform_get_irq_byname(pdev, "UV"), dcdc);
+	free_irq(wm831x_irq(dcdc->wm831x, platform_get_irq_byname(pdev, "UV")),
+			    dcdc);
 	regulator_unregister(dcdc->regulator);
 
 	return 0;
@@ -859,7 +864,7 @@ static __devinit int wm831x_boostp_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	irq = platform_get_irq_byname(pdev, "UV");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV"));
 	ret = request_threaded_irq(irq, NULL, wm831x_dcdc_uv_irq,
 				   IRQF_TRIGGER_RISING, dcdc->name,
 				   dcdc);
@@ -885,7 +890,8 @@ static __devexit int wm831x_boostp_remove(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, NULL);
 
-	free_irq(platform_get_irq_byname(pdev, "UV"), dcdc);
+	free_irq(wm831x_irq(dcdc->wm831x, platform_get_irq_byname(pdev, "UV")),
+		 dcdc);
 	regulator_unregister(dcdc->regulator);
 
 	return 0;
diff --git a/drivers/regulator/wm831x-isink.c b/drivers/regulator/wm831x-isink.c
index b50ab778b098..0d207c297714 100644
--- a/drivers/regulator/wm831x-isink.c
+++ b/drivers/regulator/wm831x-isink.c
@@ -202,7 +202,7 @@ static __devinit int wm831x_isink_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	irq = platform_get_irq(pdev, 0);
+	irq = wm831x_irq(wm831x, platform_get_irq(pdev, 0));
 	ret = request_threaded_irq(irq, NULL, wm831x_isink_irq,
 				   IRQF_TRIGGER_RISING, isink->name, isink);
 	if (ret != 0) {
@@ -227,7 +227,7 @@ static __devexit int wm831x_isink_remove(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, NULL);
 
-	free_irq(platform_get_irq(pdev, 0), isink);
+	free_irq(wm831x_irq(isink->wm831x, platform_get_irq(pdev, 0)), isink);
 
 	regulator_unregister(isink->regulator);
 
diff --git a/drivers/regulator/wm831x-ldo.c b/drivers/regulator/wm831x-ldo.c
index aa1f8b3fbe16..a9a28d8ac185 100644
--- a/drivers/regulator/wm831x-ldo.c
+++ b/drivers/regulator/wm831x-ldo.c
@@ -321,7 +321,7 @@ static __devinit int wm831x_gp_ldo_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	irq = platform_get_irq_byname(pdev, "UV");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV"));
 	ret = request_threaded_irq(irq, NULL, wm831x_ldo_uv_irq,
 				   IRQF_TRIGGER_RISING, ldo->name,
 				   ldo);
@@ -347,7 +347,8 @@ static __devexit int wm831x_gp_ldo_remove(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, NULL);
 
-	free_irq(platform_get_irq_byname(pdev, "UV"), ldo);
+	free_irq(wm831x_irq(ldo->wm831x,
+			    platform_get_irq_byname(pdev, "UV")), ldo);
 	regulator_unregister(ldo->regulator);
 
 	return 0;
@@ -582,7 +583,7 @@ static __devinit int wm831x_aldo_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	irq = platform_get_irq_byname(pdev, "UV");
+	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "UV"));
 	ret = request_threaded_irq(irq, NULL, wm831x_ldo_uv_irq,
 				   IRQF_TRIGGER_RISING, ldo->name, ldo);
 	if (ret != 0) {
@@ -605,7 +606,8 @@ static __devexit int wm831x_aldo_remove(struct platform_device *pdev)
 {
 	struct wm831x_ldo *ldo = platform_get_drvdata(pdev);
 
-	free_irq(platform_get_irq_byname(pdev, "UV"), ldo);
+	free_irq(wm831x_irq(ldo->wm831x, platform_get_irq_byname(pdev, "UV")),
+		 ldo);
 	regulator_unregister(ldo->regulator);
 
 	return 0;
diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index d6f8adaa26ef..8ea7bccc7100 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -78,7 +78,7 @@ typedef int (*rproc_handle_resource_t)(struct rproc *rproc, void *, int avail);
  * the recovery of the remote processor.
  */
 static int rproc_iommu_fault(struct iommu_domain *domain, struct device *dev,
-		unsigned long iova, int flags)
+		unsigned long iova, int flags, void *token)
 {
 	dev_err(dev, "iommu fault: da 0x%lx flags 0x%x\n", iova, flags);
 
@@ -117,7 +117,7 @@ static int rproc_enable_iommu(struct rproc *rproc)
 		return -ENOMEM;
 	}
 
-	iommu_set_fault_handler(domain, rproc_iommu_fault);
+	iommu_set_fault_handler(domain, rproc_iommu_fault, rproc);
 
 	ret = iommu_attach_device(domain, dev);
 	if (ret) {
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 4161bfe462cd..08cbdb900a18 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -620,27 +620,6 @@ config RTC_DRV_MSM6242
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-msm6242.
 
-config RTC_DRV_IMXDI
-	tristate "Freescale IMX DryIce Real Time Clock"
-	depends on ARCH_MX25
-	depends on RTC_CLASS
-	help
-	   Support for Freescale IMX DryIce RTC
-
-	   This driver can also be built as a module, if so, the module
-	   will be called "rtc-imxdi".
-
-config RTC_MXC
-	tristate "Freescale MXC Real Time Clock"
-	depends on ARCH_MXC
-	depends on RTC_CLASS
-	help
-	   If you say yes here you get support for the Freescale MXC
-	   RTC module.
-
-	   This driver can also be built as a module, if so, the module
-	   will be called "rtc-mxc".
-
 config RTC_DRV_BQ4802
 	tristate "TI BQ4802"
 	help
@@ -738,6 +717,16 @@ config RTC_DRV_DAVINCI
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-davinci.
 
+config RTC_DRV_IMXDI
+	tristate "Freescale IMX DryIce Real Time Clock"
+	depends on SOC_IMX25
+	depends on RTC_CLASS
+	help
+	   Support for Freescale IMX DryIce RTC
+
+	   This driver can also be built as a module, if so, the module
+	   will be called "rtc-imxdi".
+
 config RTC_DRV_OMAP
 	tristate "TI OMAP1"
 	depends on ARCH_OMAP15XX || ARCH_OMAP16XX || ARCH_OMAP730 || ARCH_DAVINCI_DA8XX
@@ -1087,4 +1076,15 @@ config RTC_DRV_LOONGSON1
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-ls1x.
 
+config RTC_DRV_MXC
+	tristate "Freescale MXC Real Time Clock"
+	depends on ARCH_MXC
+	depends on RTC_CLASS
+	help
+	   If you say yes here you get support for the Freescale MXC
+	   RTC module.
+
+	   This driver can also be built as a module, if so, the module
+	   will be called "rtc-mxc".
+
 endif # RTC_CLASS
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 727ae7786e6c..2973921c30d8 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -61,7 +61,7 @@ obj-$(CONFIG_RTC_DRV_M41T94)	+= rtc-m41t94.o
 obj-$(CONFIG_RTC_DRV_M48T35)	+= rtc-m48t35.o
 obj-$(CONFIG_RTC_DRV_M48T59)	+= rtc-m48t59.o
 obj-$(CONFIG_RTC_DRV_M48T86)	+= rtc-m48t86.o
-obj-$(CONFIG_RTC_MXC)		+= rtc-mxc.o
+obj-$(CONFIG_RTC_DRV_MXC)	+= rtc-mxc.o
 obj-$(CONFIG_RTC_DRV_MAX6900)	+= rtc-max6900.o
 obj-$(CONFIG_RTC_DRV_MAX8925)	+= rtc-max8925.o
 obj-$(CONFIG_RTC_DRV_MAX8998)	+= rtc-max8998.o
diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c
index c293d0cdb104..836710ce750e 100644
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -17,8 +17,7 @@
 #include <linux/string.h>
 #include <linux/rtc.h>
 #include <linux/bcd.h>
-
-
+#include <linux/rtc/ds1307.h>
 
 /*
  * We can't determine type by probing, but if we expect pre-Linux code
@@ -92,7 +91,8 @@ enum ds_type {
 #	define DS1337_BIT_A2I		0x02
 #	define DS1337_BIT_A1I		0x01
 #define DS1339_REG_ALARM1_SECS	0x07
-#define DS1339_REG_TRICKLE	0x10
+
+#define DS13XX_TRICKLE_CHARGER_MAGIC	0xa0
 
 #define RX8025_REG_CTRL1	0x0e
 #	define RX8025_BIT_2412		0x20
@@ -124,6 +124,7 @@ struct chip_desc {
 	unsigned		alarm:1;
 	u16			nvram_offset;
 	u16			nvram_size;
+	u16			trickle_charger_reg;
 };
 
 static const struct chip_desc chips[last_ds_type] = {
@@ -140,6 +141,13 @@ static const struct chip_desc chips[last_ds_type] = {
 	},
 	[ds_1339] = {
 		.alarm		= 1,
+		.trickle_charger_reg = 0x10,
+	},
+	[ds_1340] = {
+		.trickle_charger_reg = 0x08,
+	},
+	[ds_1388] = {
+		.trickle_charger_reg = 0x0a,
 	},
 	[ds_3231] = {
 		.alarm		= 1,
@@ -619,6 +627,7 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 	struct i2c_adapter	*adapter = to_i2c_adapter(client->dev.parent);
 	int			want_irq = false;
 	unsigned char		*buf;
+	struct ds1307_platform_data *pdata = client->dev.platform_data;
 	static const int	bbsqi_bitpos[] = {
 		[ds_1337] = 0,
 		[ds_1339] = DS1339_BIT_BBSQI,
@@ -637,7 +646,10 @@ static int __devinit ds1307_probe(struct i2c_client *client,
 
 	ds1307->client	= client;
 	ds1307->type	= id->driver_data;
-	ds1307->offset	= 0;
+
+	if (pdata && pdata->trickle_charger_setup && chip->trickle_charger_reg)
+		i2c_smbus_write_byte_data(client, chip->trickle_charger_reg,
+			DS13XX_TRICKLE_CHARGER_MAGIC | pdata->trickle_charger_setup);
 
 	buf = ds1307->regs;
 	if (i2c_check_functionality(adapter, I2C_FUNC_SMBUS_I2C_BLOCK)) {
diff --git a/drivers/rtc/rtc-ep93xx.c b/drivers/rtc/rtc-ep93xx.c
index 14a42a1edc66..9602278ff988 100644
--- a/drivers/rtc/rtc-ep93xx.c
+++ b/drivers/rtc/rtc-ep93xx.c
@@ -127,7 +127,7 @@ static const struct attribute_group ep93xx_rtc_sysfs_files = {
 	.attrs	= ep93xx_rtc_attrs,
 };
 
-static int __init ep93xx_rtc_probe(struct platform_device *pdev)
+static int __devinit ep93xx_rtc_probe(struct platform_device *pdev)
 {
 	struct ep93xx_rtc *ep93xx_rtc;
 	struct resource *res;
@@ -174,7 +174,7 @@ exit:
 	return err;
 }
 
-static int __exit ep93xx_rtc_remove(struct platform_device *pdev)
+static int __devexit ep93xx_rtc_remove(struct platform_device *pdev)
 {
 	struct ep93xx_rtc *ep93xx_rtc = platform_get_drvdata(pdev);
 
@@ -186,31 +186,19 @@ static int __exit ep93xx_rtc_remove(struct platform_device *pdev)
 	return 0;
 }
 
-/* work with hotplug and coldplug */
-MODULE_ALIAS("platform:ep93xx-rtc");
-
 static struct platform_driver ep93xx_rtc_driver = {
 	.driver		= {
 		.name	= "ep93xx-rtc",
 		.owner	= THIS_MODULE,
 	},
-	.remove		= __exit_p(ep93xx_rtc_remove),
+	.probe		= ep93xx_rtc_probe,
+	.remove		= __devexit_p(ep93xx_rtc_remove),
 };
 
-static int __init ep93xx_rtc_init(void)
-{
-        return platform_driver_probe(&ep93xx_rtc_driver, ep93xx_rtc_probe);
-}
-
-static void __exit ep93xx_rtc_exit(void)
-{
-	platform_driver_unregister(&ep93xx_rtc_driver);
-}
+module_platform_driver(ep93xx_rtc_driver);
 
 MODULE_AUTHOR("Alessandro Zummo <[email protected]>");
 MODULE_DESCRIPTION("EP93XX RTC driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_VERSION);
-
-module_init(ep93xx_rtc_init);
-module_exit(ep93xx_rtc_exit);
+MODULE_ALIAS("platform:ep93xx-rtc");
diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c
index 63c72189c64b..d5218553741f 100644
--- a/drivers/rtc/rtc-lpc32xx.c
+++ b/drivers/rtc/rtc-lpc32xx.c
@@ -19,6 +19,7 @@
 #include <linux/rtc.h>
 #include <linux/slab.h>
 #include <linux/io.h>
+#include <linux/of.h>
 
 /*
  * Clock and Power control register offsets
@@ -386,13 +387,22 @@ static const struct dev_pm_ops lpc32xx_rtc_pm_ops = {
 #define LPC32XX_RTC_PM_OPS NULL
 #endif
 
+#ifdef CONFIG_OF
+static const struct of_device_id lpc32xx_rtc_match[] = {
+	{ .compatible = "nxp,lpc3220-rtc" },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, lpc32xx_rtc_match);
+#endif
+
 static struct platform_driver lpc32xx_rtc_driver = {
 	.probe		= lpc32xx_rtc_probe,
 	.remove		= __devexit_p(lpc32xx_rtc_remove),
 	.driver = {
 		.name	= RTC_NAME,
 		.owner	= THIS_MODULE,
-		.pm	= LPC32XX_RTC_PM_OPS
+		.pm	= LPC32XX_RTC_PM_OPS,
+		.of_match_table = of_match_ptr(lpc32xx_rtc_match),
 	},
 };
 
diff --git a/drivers/rtc/rtc-m41t93.c b/drivers/rtc/rtc-m41t93.c
index 10f1c29436ec..efab3d48cb15 100644
--- a/drivers/rtc/rtc-m41t93.c
+++ b/drivers/rtc/rtc-m41t93.c
@@ -48,6 +48,7 @@ static inline int m41t93_set_reg(struct spi_device *spi, u8 addr, u8 data)
 static int m41t93_set_time(struct device *dev, struct rtc_time *tm)
 {
 	struct spi_device *spi = to_spi_device(dev);
+	int tmp;
 	u8 buf[9] = {0x80};        /* write cmd + 8 data bytes */
 	u8 * const data = &buf[1]; /* ptr to first data byte */
 
@@ -62,6 +63,30 @@ static int m41t93_set_time(struct device *dev, struct rtc_time *tm)
 		return -EINVAL;
 	}
 
+	tmp = spi_w8r8(spi, M41T93_REG_FLAGS);
+	if (tmp < 0)
+		return tmp;
+
+	if (tmp & M41T93_FLAG_OF) {
+		dev_warn(&spi->dev, "OF bit is set, resetting.\n");
+		m41t93_set_reg(spi, M41T93_REG_FLAGS, tmp & ~M41T93_FLAG_OF);
+
+		tmp = spi_w8r8(spi, M41T93_REG_FLAGS);
+		if (tmp < 0) {
+			return tmp;
+		} else if (tmp & M41T93_FLAG_OF) {
+			/* OF cannot be immediately reset: oscillator has to be
+			 * restarted. */
+			u8 reset_osc = buf[M41T93_REG_ST_SEC] | M41T93_FLAG_ST;
+
+			dev_warn(&spi->dev,
+				 "OF bit is still set, kickstarting clock.\n");
+			m41t93_set_reg(spi, M41T93_REG_ST_SEC, reset_osc);
+			reset_osc &= ~M41T93_FLAG_ST;
+			m41t93_set_reg(spi, M41T93_REG_ST_SEC, reset_osc);
+		}
+	}
+
 	data[M41T93_REG_SSEC]		= 0;
 	data[M41T93_REG_ST_SEC]		= bin2bcd(tm->tm_sec);
 	data[M41T93_REG_MIN]		= bin2bcd(tm->tm_min);
@@ -89,10 +114,7 @@ static int m41t93_get_time(struct device *dev, struct rtc_time *tm)
 	   1. halt bit (HT) is set: the clock is running but update of readout
 	      registers has been disabled due to power failure. This is normal
 	      case after poweron. Time is valid after resetting HT bit.
-	   2. oscillator fail bit (OF) is set. Oscillator has be stopped and
-	      time is invalid:
-	      a) OF can be immeditely reset.
-	      b) OF cannot be immediately reset: oscillator has to be restarted.
+	   2. oscillator fail bit (OF) is set: time is invalid.
 	*/
 	tmp = spi_w8r8(spi, M41T93_REG_ALM_HOUR_HT);
 	if (tmp < 0)
@@ -110,21 +132,7 @@ static int m41t93_get_time(struct device *dev, struct rtc_time *tm)
 
 	if (tmp & M41T93_FLAG_OF) {
 		ret = -EINVAL;
-		dev_warn(&spi->dev, "OF bit is set, resetting.\n");
-		m41t93_set_reg(spi, M41T93_REG_FLAGS, tmp & ~M41T93_FLAG_OF);
-
-		tmp = spi_w8r8(spi, M41T93_REG_FLAGS);
-		if (tmp < 0)
-			return tmp;
-		else if (tmp & M41T93_FLAG_OF) {
-			u8 reset_osc = buf[M41T93_REG_ST_SEC] | M41T93_FLAG_ST;
-
-			dev_warn(&spi->dev,
-				 "OF bit is still set, kickstarting clock.\n");
-			m41t93_set_reg(spi, M41T93_REG_ST_SEC, reset_osc);
-			reset_osc &= ~M41T93_FLAG_ST;
-			m41t93_set_reg(spi, M41T93_REG_ST_SEC, reset_osc);
-		}
+		dev_warn(&spi->dev, "OF bit is set, write time to restart.\n");
 	}
 
 	if (tmp & M41T93_FLAG_BL)
diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c
index bc0677de1996..97a3284bb7c6 100644
--- a/drivers/rtc/rtc-pcf8563.c
+++ b/drivers/rtc/rtc-pcf8563.c
@@ -64,6 +64,7 @@ struct pcf8563 {
 	 * 1970...2069.
 	 */
 	int c_polarity;	/* 0: MO_C=1 means 19xx, otherwise MO_C=1 means 20xx */
+	int voltage_low; /* incicates if a low_voltage was detected */
 };
 
 /*
@@ -86,9 +87,11 @@ static int pcf8563_get_datetime(struct i2c_client *client, struct rtc_time *tm)
 		return -EIO;
 	}
 
-	if (buf[PCF8563_REG_SC] & PCF8563_SC_LV)
+	if (buf[PCF8563_REG_SC] & PCF8563_SC_LV) {
+		pcf8563->voltage_low = 1;
 		dev_info(&client->dev,
 			"low voltage detected, date/time is not reliable.\n");
+	}
 
 	dev_dbg(&client->dev,
 		"%s: raw data is st1=%02x, st2=%02x, sec=%02x, min=%02x, hr=%02x, "
@@ -173,6 +176,44 @@ static int pcf8563_set_datetime(struct i2c_client *client, struct rtc_time *tm)
 	return 0;
 }
 
+#ifdef CONFIG_RTC_INTF_DEV
+static int pcf8563_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
+{
+	struct pcf8563 *pcf8563 = i2c_get_clientdata(to_i2c_client(dev));
+	struct rtc_time tm;
+
+	switch (cmd) {
+	case RTC_VL_READ:
+		if (pcf8563->voltage_low)
+			dev_info(dev, "low voltage detected, date/time is not reliable.\n");
+
+		if (copy_to_user((void __user *)arg, &pcf8563->voltage_low,
+					sizeof(int)))
+			return -EFAULT;
+		return 0;
+	case RTC_VL_CLR:
+		/*
+		 * Clear the VL bit in the seconds register in case
+		 * the time has not been set already (which would
+		 * have cleared it). This does not really matter
+		 * because of the cached voltage_low value but do it
+		 * anyway for consistency.
+		 */
+		if (pcf8563_get_datetime(to_i2c_client(dev), &tm))
+			pcf8563_set_datetime(to_i2c_client(dev), &tm);
+
+		/* Clear the cached value. */
+		pcf8563->voltage_low = 0;
+
+		return 0;
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+#else
+#define pcf8563_rtc_ioctl NULL
+#endif
+
 static int pcf8563_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
 	return pcf8563_get_datetime(to_i2c_client(dev), tm);
@@ -184,6 +225,7 @@ static int pcf8563_rtc_set_time(struct device *dev, struct rtc_time *tm)
 }
 
 static const struct rtc_class_ops pcf8563_rtc_ops = {
+	.ioctl		= pcf8563_rtc_ioctl,
 	.read_time	= pcf8563_rtc_read_time,
 	.set_time	= pcf8563_rtc_set_time,
 };
diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c
index f027c063fb20..cc0533994f6e 100644
--- a/drivers/rtc/rtc-pl031.c
+++ b/drivers/rtc/rtc-pl031.c
@@ -220,17 +220,9 @@ static irqreturn_t pl031_interrupt(int irq, void *dev_id)
 	unsigned long events = 0;
 
 	rtcmis = readl(ldata->base + RTC_MIS);
-	if (rtcmis) {
-		writel(rtcmis, ldata->base + RTC_ICR);
-
-		if (rtcmis & RTC_BIT_AI)
-			events |= (RTC_AF | RTC_IRQF);
-
-		/* Timer interrupt is only available in ST variants */
-		if ((rtcmis & RTC_BIT_PI) &&
-			(ldata->hw_designer == AMBA_VENDOR_ST))
-			events |= (RTC_PF | RTC_IRQF);
-
+	if (rtcmis & RTC_BIT_AI) {
+		writel(RTC_BIT_AI, ldata->base + RTC_ICR);
+		events |= (RTC_AF | RTC_IRQF);
 		rtc_update_irq(ldata->rtc, 1, events);
 
 		return IRQ_HANDLED;
diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c
index 3f3a29752369..7e6af0b22f17 100644
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -670,6 +670,7 @@ static int s3c_rtc_resume(struct platform_device *pdev)
 #define s3c_rtc_resume  NULL
 #endif
 
+#ifdef CONFIG_OF
 static struct s3c_rtc_drv_data s3c_rtc_drv_data_array[] = {
 	[TYPE_S3C2410] = { TYPE_S3C2410 },
 	[TYPE_S3C2416] = { TYPE_S3C2416 },
@@ -677,7 +678,6 @@ static struct s3c_rtc_drv_data s3c_rtc_drv_data_array[] = {
 	[TYPE_S3C64XX] = { TYPE_S3C64XX },
 };
 
-#ifdef CONFIG_OF
 static const struct of_device_id s3c_rtc_dt_match[] = {
 	{
 		.compatible = "samsung,s3c2410-rtc",
diff --git a/drivers/rtc/rtc-spear.c b/drivers/rtc/rtc-spear.c
index e38da0dc4187..1f76320e545b 100644
--- a/drivers/rtc/rtc-spear.c
+++ b/drivers/rtc/rtc-spear.c
@@ -16,6 +16,7 @@
 #include <linux/io.h>
 #include <linux/irq.h>
 #include <linux/module.h>
+#include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/rtc.h>
 #include <linux/slab.h>
@@ -519,6 +520,14 @@ static void spear_rtc_shutdown(struct platform_device *pdev)
 	clk_disable(config->clk);
 }
 
+#ifdef CONFIG_OF
+static const struct of_device_id spear_rtc_id_table[] = {
+	{ .compatible = "st,spear600-rtc" },
+	{}
+};
+MODULE_DEVICE_TABLE(of, spear_rtc_id_table);
+#endif
+
 static struct platform_driver spear_rtc_driver = {
 	.probe = spear_rtc_probe,
 	.remove = __devexit_p(spear_rtc_remove),
@@ -527,6 +536,7 @@ static struct platform_driver spear_rtc_driver = {
 	.shutdown = spear_rtc_shutdown,
 	.driver = {
 		.name = "rtc-spear",
+		.of_match_table = of_match_ptr(spear_rtc_id_table),
 	},
 };
 
diff --git a/drivers/rtc/rtc-tegra.c b/drivers/rtc/rtc-tegra.c
index 75259fe38602..c006025cecc8 100644
--- a/drivers/rtc/rtc-tegra.c
+++ b/drivers/rtc/rtc-tegra.c
@@ -309,7 +309,8 @@ static int __devinit tegra_rtc_probe(struct platform_device *pdev)
 	struct resource *res;
 	int ret;
 
-	info = kzalloc(sizeof(struct tegra_rtc_info), GFP_KERNEL);
+	info = devm_kzalloc(&pdev->dev, sizeof(struct tegra_rtc_info),
+		GFP_KERNEL);
 	if (!info)
 		return -ENOMEM;
 
@@ -317,29 +318,18 @@ static int __devinit tegra_rtc_probe(struct platform_device *pdev)
 	if (!res) {
 		dev_err(&pdev->dev,
 			"Unable to allocate resources for device.\n");
-		ret = -EBUSY;
-		goto err_free_info;
+		return -EBUSY;
 	}
 
-	if (!request_mem_region(res->start, resource_size(res), pdev->name)) {
-		dev_err(&pdev->dev,
-			"Unable to request mem region for device.\n");
-		ret = -EBUSY;
-		goto err_free_info;
+	info->rtc_base = devm_request_and_ioremap(&pdev->dev, res);
+	if (!info->rtc_base) {
+		dev_err(&pdev->dev, "Unable to request mem region and grab IOs for device.\n");
+		return -EBUSY;
 	}
 
 	info->tegra_rtc_irq = platform_get_irq(pdev, 0);
-	if (info->tegra_rtc_irq <= 0) {
-		ret = -EBUSY;
-		goto err_release_mem_region;
-	}
-
-	info->rtc_base = ioremap_nocache(res->start, resource_size(res));
-	if (!info->rtc_base) {
-		dev_err(&pdev->dev, "Unable to grab IOs for device.\n");
-		ret = -EBUSY;
-		goto err_release_mem_region;
-	}
+	if (info->tegra_rtc_irq <= 0)
+		return -EBUSY;
 
 	/* set context info. */
 	info->pdev = pdev;
@@ -362,11 +352,12 @@ static int __devinit tegra_rtc_probe(struct platform_device *pdev)
 		dev_err(&pdev->dev,
 			"Unable to register device (err=%d).\n",
 			ret);
-		goto err_iounmap;
+		return ret;
 	}
 
-	ret = request_irq(info->tegra_rtc_irq, tegra_rtc_irq_handler,
-		IRQF_TRIGGER_HIGH, "rtc alarm", &pdev->dev);
+	ret = devm_request_irq(&pdev->dev, info->tegra_rtc_irq,
+			tegra_rtc_irq_handler, IRQF_TRIGGER_HIGH,
+			"rtc alarm", &pdev->dev);
 	if (ret) {
 		dev_err(&pdev->dev,
 			"Unable to request interrupt for device (err=%d).\n",
@@ -380,12 +371,6 @@ static int __devinit tegra_rtc_probe(struct platform_device *pdev)
 
 err_dev_unreg:
 	rtc_device_unregister(info->rtc_dev);
-err_iounmap:
-	iounmap(info->rtc_base);
-err_release_mem_region:
-	release_mem_region(res->start, resource_size(res));
-err_free_info:
-	kfree(info);
 
 	return ret;
 }
@@ -393,17 +378,8 @@ err_free_info:
 static int __devexit tegra_rtc_remove(struct platform_device *pdev)
 {
 	struct tegra_rtc_info *info = platform_get_drvdata(pdev);
-	struct resource *res;
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!res)
-		return -EBUSY;
 
-	free_irq(info->tegra_rtc_irq, &pdev->dev);
 	rtc_device_unregister(info->rtc_dev);
-	iounmap(info->rtc_base);
-	release_mem_region(res->start, resource_size(res));
-	kfree(info);
 
 	platform_set_drvdata(pdev, NULL);
 
diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c
index 3b6e6a67e765..59c6245e0421 100644
--- a/drivers/rtc/rtc-wm831x.c
+++ b/drivers/rtc/rtc-wm831x.c
@@ -396,7 +396,7 @@ static int wm831x_rtc_probe(struct platform_device *pdev)
 {
 	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
 	struct wm831x_rtc *wm831x_rtc;
-	int alm_irq = platform_get_irq_byname(pdev, "ALM");
+	int alm_irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "ALM"));
 	int ret = 0;
 
 	wm831x_rtc = devm_kzalloc(&pdev->dev, sizeof(*wm831x_rtc), GFP_KERNEL);
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index 4511420849bc..e84dbecd0991 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/falloc.h>
 #include <linux/miscdevice.h>
 #include <linux/security.h>
 #include <linux/mm.h>
@@ -363,11 +364,12 @@ static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc)
 
 	mutex_lock(&ashmem_mutex);
 	list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) {
-		struct inode *inode = range->asma->file->f_dentry->d_inode;
 		loff_t start = range->pgstart * PAGE_SIZE;
-		loff_t end = (range->pgend + 1) * PAGE_SIZE - 1;
+		loff_t end = (range->pgend + 1) * PAGE_SIZE;
 
-		vmtruncate_range(inode, start, end);
+		do_fallocate(range->asma->file,
+				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+				start, end - start);
 		range->purged = ASHMEM_WAS_PURGED;
 		lru_del(range);
 
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 59af3945ea85..65c7c62c7aae 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -633,7 +633,6 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 	mutex_unlock(&devpts_mutex);
 
 	mutex_lock(&tty_mutex);
-	mutex_lock(&devpts_mutex);
 	tty = tty_init_dev(ptm_driver, index);
 
 	if (IS_ERR(tty)) {
@@ -643,7 +642,6 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 
 	/* The tty returned here is locked so we can safely
 	   drop the mutex */
-	mutex_unlock(&devpts_mutex);
 	mutex_unlock(&tty_mutex);
 
 	set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */
diff --git a/drivers/tty/serial/lantiq.c b/drivers/tty/serial/lantiq.c
index 96c1cacc7360..02da071fe1e7 100644
--- a/drivers/tty/serial/lantiq.c
+++ b/drivers/tty/serial/lantiq.c
@@ -31,16 +31,19 @@
 #include <linux/tty_flip.h>
 #include <linux/serial_core.h>
 #include <linux/serial.h>
-#include <linux/platform_device.h>
+#include <linux/of_platform.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
 #include <linux/io.h>
 #include <linux/clk.h>
+#include <linux/gpio.h>
 
 #include <lantiq_soc.h>
 
 #define PORT_LTQ_ASC		111
 #define MAXPORTS		2
 #define UART_DUMMY_UER_RX	1
-#define DRVNAME			"ltq_asc"
+#define DRVNAME			"lantiq,asc"
 #ifdef __BIG_ENDIAN
 #define LTQ_ASC_TBUF		(0x0020 + 3)
 #define LTQ_ASC_RBUF		(0x0024 + 3)
@@ -114,6 +117,9 @@ static DEFINE_SPINLOCK(ltq_asc_lock);
 
 struct ltq_uart_port {
 	struct uart_port	port;
+	/* clock used to derive divider */
+	struct clk		*fpiclk;
+	/* clock gating of the ASC core */
 	struct clk		*clk;
 	unsigned int		tx_irq;
 	unsigned int		rx_irq;
@@ -316,7 +322,9 @@ lqasc_startup(struct uart_port *port)
 	struct ltq_uart_port *ltq_port = to_ltq_uart_port(port);
 	int retval;
 
-	port->uartclk = clk_get_rate(ltq_port->clk);
+	if (ltq_port->clk)
+		clk_enable(ltq_port->clk);
+	port->uartclk = clk_get_rate(ltq_port->fpiclk);
 
 	ltq_w32_mask(ASCCLC_DISS | ASCCLC_RMCMASK, (1 << ASCCLC_RMCOFFSET),
 		port->membase + LTQ_ASC_CLC);
@@ -382,6 +390,8 @@ lqasc_shutdown(struct uart_port *port)
 		port->membase + LTQ_ASC_RXFCON);
 	ltq_w32_mask(ASCTXFCON_TXFEN, ASCTXFCON_TXFFLU,
 		port->membase + LTQ_ASC_TXFCON);
+	if (ltq_port->clk)
+		clk_disable(ltq_port->clk);
 }
 
 static void
@@ -630,7 +640,7 @@ lqasc_console_setup(struct console *co, char *options)
 
 	port = &ltq_port->port;
 
-	port->uartclk = clk_get_rate(ltq_port->clk);
+	port->uartclk = clk_get_rate(ltq_port->fpiclk);
 
 	if (options)
 		uart_parse_options(options, &baud, &parity, &bits, &flow);
@@ -668,37 +678,32 @@ static struct uart_driver lqasc_reg = {
 static int __init
 lqasc_probe(struct platform_device *pdev)
 {
+	struct device_node *node = pdev->dev.of_node;
 	struct ltq_uart_port *ltq_port;
 	struct uart_port *port;
-	struct resource *mmres, *irqres;
-	int tx_irq, rx_irq, err_irq;
-	struct clk *clk;
+	struct resource *mmres, irqres[3];
+	int line = 0;
 	int ret;
 
 	mmres = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	irqres = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
-	if (!mmres || !irqres)
+	ret = of_irq_to_resource_table(node, irqres, 3);
+	if (!mmres || (ret != 3)) {
+		dev_err(&pdev->dev,
+			"failed to get memory/irq for serial port\n");
 		return -ENODEV;
+	}
 
-	if (pdev->id >= MAXPORTS)
-		return -EBUSY;
+	/* check if this is the console port */
+	if (mmres->start != CPHYSADDR(LTQ_EARLY_ASC))
+		line = 1;
 
-	if (lqasc_port[pdev->id] != NULL)
+	if (lqasc_port[line]) {
+		dev_err(&pdev->dev, "port %d already allocated\n", line);
 		return -EBUSY;
-
-	clk = clk_get(&pdev->dev, "fpi");
-	if (IS_ERR(clk)) {
-		pr_err("failed to get fpi clk\n");
-		return -ENOENT;
 	}
 
-	tx_irq = platform_get_irq_byname(pdev, "tx");
-	rx_irq = platform_get_irq_byname(pdev, "rx");
-	err_irq = platform_get_irq_byname(pdev, "err");
-	if ((tx_irq < 0) | (rx_irq < 0) | (err_irq < 0))
-		return -ENODEV;
-
-	ltq_port = kzalloc(sizeof(struct ltq_uart_port), GFP_KERNEL);
+	ltq_port = devm_kzalloc(&pdev->dev, sizeof(struct ltq_uart_port),
+			GFP_KERNEL);
 	if (!ltq_port)
 		return -ENOMEM;
 
@@ -709,19 +714,26 @@ lqasc_probe(struct platform_device *pdev)
 	port->ops	= &lqasc_pops;
 	port->fifosize	= 16;
 	port->type	= PORT_LTQ_ASC,
-	port->line	= pdev->id;
+	port->line	= line;
 	port->dev	= &pdev->dev;
-
-	port->irq	= tx_irq; /* unused, just to be backward-compatibe */
+	/* unused, just to be backward-compatible */
+	port->irq	= irqres[0].start;
 	port->mapbase	= mmres->start;
 
-	ltq_port->clk	= clk;
+	ltq_port->fpiclk = clk_get_fpi();
+	if (IS_ERR(ltq_port->fpiclk)) {
+		pr_err("failed to get fpi clk\n");
+		return -ENOENT;
+	}
 
-	ltq_port->tx_irq = tx_irq;
-	ltq_port->rx_irq = rx_irq;
-	ltq_port->err_irq = err_irq;
+	/* not all asc ports have clock gates, lets ignore the return code */
+	ltq_port->clk = clk_get(&pdev->dev, NULL);
 
-	lqasc_port[pdev->id] = ltq_port;
+	ltq_port->tx_irq = irqres[0].start;
+	ltq_port->rx_irq = irqres[1].start;
+	ltq_port->err_irq = irqres[2].start;
+
+	lqasc_port[line] = ltq_port;
 	platform_set_drvdata(pdev, ltq_port);
 
 	ret = uart_add_one_port(&lqasc_reg, port);
@@ -729,10 +741,17 @@ lqasc_probe(struct platform_device *pdev)
 	return ret;
 }
 
+static const struct of_device_id ltq_asc_match[] = {
+	{ .compatible = DRVNAME },
+	{},
+};
+MODULE_DEVICE_TABLE(of, ltq_asc_match);
+
 static struct platform_driver lqasc_driver = {
 	.driver		= {
 		.name	= DRVNAME,
 		.owner	= THIS_MODULE,
+		.of_match_table = ltq_asc_match,
 	},
 };
 
diff --git a/drivers/tty/serial/sb1250-duart.c b/drivers/tty/serial/sb1250-duart.c
index 0be8a2f00d0b..f76b1688c5c8 100644
--- a/drivers/tty/serial/sb1250-duart.c
+++ b/drivers/tty/serial/sb1250-duart.c
@@ -31,6 +31,7 @@
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/major.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
diff --git a/drivers/tty/serial/zs.c b/drivers/tty/serial/zs.c
index 4001eee6c08d..92c00b24d0df 100644
--- a/drivers/tty/serial/zs.c
+++ b/drivers/tty/serial/zs.c
@@ -57,6 +57,7 @@
 #include <linux/ioport.h>
 #include <linux/irqflags.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/major.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index 173a9000a6cb..ba8be396a621 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -894,6 +894,23 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
 	tty_ldisc_enable(tty);
 	return 0;
 }
+
+static void tty_ldisc_kill(struct tty_struct *tty)
+{
+	mutex_lock(&tty->ldisc_mutex);
+	/*
+	 * Now kill off the ldisc
+	 */
+	tty_ldisc_close(tty, tty->ldisc);
+	tty_ldisc_put(tty->ldisc);
+	/* Force an oops if we mess this up */
+	tty->ldisc = NULL;
+
+	/* Ensure the next open requests the N_TTY ldisc */
+	tty_set_termios_ldisc(tty, N_TTY);
+	mutex_unlock(&tty->ldisc_mutex);
+}
+
 /**
  *	tty_ldisc_release		-	release line discipline
  *	@tty: tty being shut down
@@ -912,27 +929,19 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 	 * race with the set_ldisc code path.
 	 */
 
-	tty_unlock(tty);
+	tty_unlock_pair(tty, o_tty);
 	tty_ldisc_halt(tty);
 	tty_ldisc_flush_works(tty);
-	tty_lock(tty);
-
-	mutex_lock(&tty->ldisc_mutex);
-	/*
-	 * Now kill off the ldisc
-	 */
-	tty_ldisc_close(tty, tty->ldisc);
-	tty_ldisc_put(tty->ldisc);
-	/* Force an oops if we mess this up */
-	tty->ldisc = NULL;
+	if (o_tty) {
+		tty_ldisc_halt(o_tty);
+		tty_ldisc_flush_works(o_tty);
+	}
+	tty_lock_pair(tty, o_tty);
 
-	/* Ensure the next open requests the N_TTY ldisc */
-	tty_set_termios_ldisc(tty, N_TTY);
-	mutex_unlock(&tty->ldisc_mutex);
 
-	/* This will need doing differently if we need to lock */
+	tty_ldisc_kill(tty);
 	if (o_tty)
-		tty_ldisc_release(o_tty, NULL);
+		tty_ldisc_kill(o_tty);
 
 	/* And the memory resources remaining (buffers, termios) will be
 	   disposed of when the kref hits zero */
diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig
index af16884491ed..fa2b03750316 100644
--- a/drivers/video/backlight/Kconfig
+++ b/drivers/video/backlight/Kconfig
@@ -184,6 +184,18 @@ config BACKLIGHT_GENERIC
 	  known as the Corgi backlight driver. If you have a Sharp Zaurus
 	  SL-C7xx, SL-Cxx00 or SL-6000x say y.
 
+config BACKLIGHT_LM3533
+	tristate "Backlight Driver for LM3533"
+	depends on BACKLIGHT_CLASS_DEVICE
+	depends on MFD_LM3533
+	help
+	  Say Y to enable the backlight driver for National Semiconductor / TI
+	  LM3533 Lighting Power chips.
+
+	  The backlights can be controlled directly, through PWM input, or by
+	  the ambient-light-sensor interface. The chip supports 256 brightness
+	  levels.
+
 config BACKLIGHT_LOCOMO
 	tristate "Sharp LOCOMO LCD/Backlight Driver"
 	depends on SHARP_LOCOMO
diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile
index 36855ae887d6..a2ac9cfbaf6b 100644
--- a/drivers/video/backlight/Makefile
+++ b/drivers/video/backlight/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_BACKLIGHT_EP93XX)	+= ep93xx_bl.o
 obj-$(CONFIG_BACKLIGHT_GENERIC)	+= generic_bl.o
 obj-$(CONFIG_BACKLIGHT_HP700)	+= jornada720_bl.o
 obj-$(CONFIG_BACKLIGHT_HP680)	+= hp680_bl.o
+obj-$(CONFIG_BACKLIGHT_LM3533)	+= lm3533_bl.o
 obj-$(CONFIG_BACKLIGHT_LOCOMO)	+= locomolcd.o
 obj-$(CONFIG_BACKLIGHT_LP855X)	+= lp855x_bl.o
 obj-$(CONFIG_BACKLIGHT_OMAP1)	+= omap1_bl.o
diff --git a/drivers/video/backlight/adp5520_bl.c b/drivers/video/backlight/adp5520_bl.c
index 4911ea7989c8..df5db99af23d 100644
--- a/drivers/video/backlight/adp5520_bl.c
+++ b/drivers/video/backlight/adp5520_bl.c
@@ -160,7 +160,7 @@ static ssize_t adp5520_store(struct device *dev, const char *buf,
 	unsigned long val;
 	int ret;
 
-	ret = strict_strtoul(buf, 10, &val);
+	ret = kstrtoul(buf, 10, &val);
 	if (ret)
 		return ret;
 
@@ -214,7 +214,7 @@ static ssize_t adp5520_bl_daylight_max_store(struct device *dev,
 	struct adp5520_bl *data = dev_get_drvdata(dev);
 	int ret;
 
-	ret = strict_strtoul(buf, 10, &data->cached_daylight_max);
+	ret = kstrtoul(buf, 10, &data->cached_daylight_max);
 	if (ret < 0)
 		return ret;
 
diff --git a/drivers/video/backlight/adp8860_bl.c b/drivers/video/backlight/adp8860_bl.c
index 550dbf0bb896..77d1fdba597f 100644
--- a/drivers/video/backlight/adp8860_bl.c
+++ b/drivers/video/backlight/adp8860_bl.c
@@ -222,7 +222,8 @@ static int __devinit adp8860_led_probe(struct i2c_client *client)
 	struct led_info *cur_led;
 	int ret, i;
 
-	led = kzalloc(sizeof(*led) * pdata->num_leds, GFP_KERNEL);
+	led = devm_kzalloc(&client->dev, sizeof(*led) * pdata->num_leds,
+				GFP_KERNEL);
 	if (led == NULL) {
 		dev_err(&client->dev, "failed to alloc memory\n");
 		return -ENOMEM;
@@ -236,7 +237,7 @@ static int __devinit adp8860_led_probe(struct i2c_client *client)
 
 	if (ret) {
 		dev_err(&client->dev, "failed to write\n");
-		goto err_free;
+		return ret;
 	}
 
 	for (i = 0; i < pdata->num_leds; ++i) {
@@ -291,9 +292,6 @@ static int __devinit adp8860_led_probe(struct i2c_client *client)
 		cancel_work_sync(&led[i].work);
 	}
 
- err_free:
-	kfree(led);
-
 	return ret;
 }
 
@@ -309,7 +307,6 @@ static int __devexit adp8860_led_remove(struct i2c_client *client)
 		cancel_work_sync(&data->led[i].work);
 	}
 
-	kfree(data->led);
 	return 0;
 }
 #else
@@ -451,7 +448,7 @@ static ssize_t adp8860_store(struct device *dev, const char *buf,
 	unsigned long val;
 	int ret;
 
-	ret = strict_strtoul(buf, 10, &val);
+	ret = kstrtoul(buf, 10, &val);
 	if (ret)
 		return ret;
 
@@ -501,7 +498,7 @@ static ssize_t adp8860_bl_l1_daylight_max_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
 	struct adp8860_bl *data = dev_get_drvdata(dev);
-	int ret = strict_strtoul(buf, 10, &data->cached_daylight_max);
+	int ret = kstrtoul(buf, 10, &data->cached_daylight_max);
 	if (ret)
 		return ret;
 
@@ -608,7 +605,7 @@ static ssize_t adp8860_bl_ambient_light_zone_store(struct device *dev,
 	uint8_t reg_val;
 	int ret;
 
-	ret = strict_strtoul(buf, 10, &val);
+	ret = kstrtoul(buf, 10, &val);
 	if (ret)
 		return ret;
 
@@ -675,13 +672,13 @@ static int __devinit adp8860_probe(struct i2c_client *client,
 		return -EINVAL;
 	}
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = devm_kzalloc(&client->dev, sizeof(*data), GFP_KERNEL);
 	if (data == NULL)
 		return -ENOMEM;
 
 	ret = adp8860_read(client, ADP8860_MFDVID, &reg_val);
 	if (ret < 0)
-		goto out2;
+		return ret;
 
 	switch (ADP8860_MANID(reg_val)) {
 	case ADP8863_MANUFID:
@@ -694,8 +691,7 @@ static int __devinit adp8860_probe(struct i2c_client *client,
 		break;
 	default:
 		dev_err(&client->dev, "failed to probe\n");
-		ret = -ENODEV;
-		goto out2;
+		return -ENODEV;
 	}
 
 	/* It's confirmed that the DEVID field is actually a REVID */
@@ -717,8 +713,7 @@ static int __devinit adp8860_probe(struct i2c_client *client,
 			&client->dev, data, &adp8860_bl_ops, &props);
 	if (IS_ERR(bl)) {
 		dev_err(&client->dev, "failed to register backlight\n");
-		ret = PTR_ERR(bl);
-		goto out2;
+		return PTR_ERR(bl);
 	}
 
 	bl->props.brightness = ADP8860_MAX_BRIGHTNESS;
@@ -756,8 +751,6 @@ out:
 			&adp8860_bl_attr_group);
 out1:
 	backlight_device_unregister(bl);
-out2:
-	kfree(data);
 
 	return ret;
 }
@@ -776,7 +769,6 @@ static int __devexit adp8860_remove(struct i2c_client *client)
 			&adp8860_bl_attr_group);
 
 	backlight_device_unregister(data->bl);
-	kfree(data);
 
 	return 0;
 }
diff --git a/drivers/video/backlight/adp8870_bl.c b/drivers/video/backlight/adp8870_bl.c
index 9be58c6f18f1..edf7f91c8e61 100644
--- a/drivers/video/backlight/adp8870_bl.c
+++ b/drivers/video/backlight/adp8870_bl.c
@@ -244,8 +244,8 @@ static int __devinit adp8870_led_probe(struct i2c_client *client)
 	struct led_info *cur_led;
 	int ret, i;
 
-
-	led = kcalloc(pdata->num_leds, sizeof(*led), GFP_KERNEL);
+	led = devm_kzalloc(&client->dev, pdata->num_leds * sizeof(*led),
+				GFP_KERNEL);
 	if (led == NULL) {
 		dev_err(&client->dev, "failed to alloc memory\n");
 		return -ENOMEM;
@@ -253,17 +253,17 @@ static int __devinit adp8870_led_probe(struct i2c_client *client)
 
 	ret = adp8870_write(client, ADP8870_ISCLAW, pdata->led_fade_law);
 	if (ret)
-		goto err_free;
+		return ret;
 
 	ret = adp8870_write(client, ADP8870_ISCT1,
 			(pdata->led_on_time & 0x3) << 6);
 	if (ret)
-		goto err_free;
+		return ret;
 
 	ret = adp8870_write(client, ADP8870_ISCF,
 			FADE_VAL(pdata->led_fade_in, pdata->led_fade_out));
 	if (ret)
-		goto err_free;
+		return ret;
 
 	for (i = 0; i < pdata->num_leds; ++i) {
 		cur_led = &pdata->leds[i];
@@ -317,9 +317,6 @@ static int __devinit adp8870_led_probe(struct i2c_client *client)
 		cancel_work_sync(&led[i].work);
 	}
 
- err_free:
-	kfree(led);
-
 	return ret;
 }
 
@@ -335,7 +332,6 @@ static int __devexit adp8870_led_remove(struct i2c_client *client)
 		cancel_work_sync(&data->led[i].work);
 	}
 
-	kfree(data->led);
 	return 0;
 }
 #else
@@ -572,7 +568,7 @@ static ssize_t adp8870_store(struct device *dev, const char *buf,
 	unsigned long val;
 	int ret;
 
-	ret = strict_strtoul(buf, 10, &val);
+	ret = kstrtoul(buf, 10, &val);
 	if (ret)
 		return ret;
 
@@ -652,7 +648,7 @@ static ssize_t adp8870_bl_l1_daylight_max_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
 	struct adp8870_bl *data = dev_get_drvdata(dev);
-	int ret = strict_strtoul(buf, 10, &data->cached_daylight_max);
+	int ret = kstrtoul(buf, 10, &data->cached_daylight_max);
 	if (ret)
 		return ret;
 
@@ -794,7 +790,7 @@ static ssize_t adp8870_bl_ambient_light_zone_store(struct device *dev,
 	uint8_t reg_val;
 	int ret;
 
-	ret = strict_strtoul(buf, 10, &val);
+	ret = kstrtoul(buf, 10, &val);
 	if (ret)
 		return ret;
 
@@ -874,7 +870,7 @@ static int __devinit adp8870_probe(struct i2c_client *client,
 		return -ENODEV;
 	}
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = devm_kzalloc(&client->dev, sizeof(*data), GFP_KERNEL);
 	if (data == NULL)
 		return -ENOMEM;
 
@@ -894,8 +890,7 @@ static int __devinit adp8870_probe(struct i2c_client *client,
 			&client->dev, data, &adp8870_bl_ops, &props);
 	if (IS_ERR(bl)) {
 		dev_err(&client->dev, "failed to register backlight\n");
-		ret = PTR_ERR(bl);
-		goto out2;
+		return PTR_ERR(bl);
 	}
 
 	data->bl = bl;
@@ -930,8 +925,6 @@ out:
 			&adp8870_bl_attr_group);
 out1:
 	backlight_device_unregister(bl);
-out2:
-	kfree(data);
 
 	return ret;
 }
@@ -950,7 +943,6 @@ static int __devexit adp8870_remove(struct i2c_client *client)
 			&adp8870_bl_attr_group);
 
 	backlight_device_unregister(data->bl);
-	kfree(data);
 
 	return 0;
 }
diff --git a/drivers/video/backlight/ams369fg06.c b/drivers/video/backlight/ams369fg06.c
index 7bdadc790117..3729238e7096 100644
--- a/drivers/video/backlight/ams369fg06.c
+++ b/drivers/video/backlight/ams369fg06.c
@@ -482,7 +482,7 @@ static int __devinit ams369fg06_probe(struct spi_device *spi)
 	struct backlight_device *bd = NULL;
 	struct backlight_properties props;
 
-	lcd = kzalloc(sizeof(struct ams369fg06), GFP_KERNEL);
+	lcd = devm_kzalloc(&spi->dev, sizeof(struct ams369fg06), GFP_KERNEL);
 	if (!lcd)
 		return -ENOMEM;
 
@@ -492,7 +492,7 @@ static int __devinit ams369fg06_probe(struct spi_device *spi)
 	ret = spi_setup(spi);
 	if (ret < 0) {
 		dev_err(&spi->dev, "spi setup failed.\n");
-		goto out_free_lcd;
+		return ret;
 	}
 
 	lcd->spi = spi;
@@ -501,15 +501,13 @@ static int __devinit ams369fg06_probe(struct spi_device *spi)
 	lcd->lcd_pd = spi->dev.platform_data;
 	if (!lcd->lcd_pd) {
 		dev_err(&spi->dev, "platform data is NULL\n");
-		goto out_free_lcd;
+		return -EFAULT;
 	}
 
 	ld = lcd_device_register("ams369fg06", &spi->dev, lcd,
 		&ams369fg06_lcd_ops);
-	if (IS_ERR(ld)) {
-		ret = PTR_ERR(ld);
-		goto out_free_lcd;
-	}
+	if (IS_ERR(ld))
+		return PTR_ERR(ld);
 
 	lcd->ld = ld;
 
@@ -547,8 +545,6 @@ static int __devinit ams369fg06_probe(struct spi_device *spi)
 
 out_lcd_unregister:
 	lcd_device_unregister(ld);
-out_free_lcd:
-	kfree(lcd);
 	return ret;
 }
 
@@ -559,7 +555,6 @@ static int __devexit ams369fg06_remove(struct spi_device *spi)
 	ams369fg06_power(lcd, FB_BLANK_POWERDOWN);
 	backlight_device_unregister(lcd->bd);
 	lcd_device_unregister(lcd->ld);
-	kfree(lcd);
 
 	return 0;
 }
@@ -619,7 +614,6 @@ static void ams369fg06_shutdown(struct spi_device *spi)
 static struct spi_driver ams369fg06_driver = {
 	.driver = {
 		.name	= "ams369fg06",
-		.bus	= &spi_bus_type,
 		.owner	= THIS_MODULE,
 	},
 	.probe		= ams369fg06_probe,
diff --git a/drivers/video/backlight/apple_bl.c b/drivers/video/backlight/apple_bl.c
index a523b255e124..9dc73ac3709a 100644
--- a/drivers/video/backlight/apple_bl.c
+++ b/drivers/video/backlight/apple_bl.c
@@ -16,6 +16,8 @@
  *  get at the firmware code in order to figure out what it's actually doing.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -25,6 +27,7 @@
 #include <linux/pci.h>
 #include <linux/acpi.h>
 #include <linux/atomic.h>
+#include <linux/apple_bl.h>
 
 static struct backlight_device *apple_backlight_device;
 
@@ -39,8 +42,6 @@ struct hw_data {
 
 static const struct hw_data *hw_data;
 
-#define DRIVER "apple_backlight: "
-
 /* Module parameters. */
 static int debug;
 module_param_named(debug, debug, int, 0644);
@@ -60,8 +61,7 @@ static int intel_chipset_send_intensity(struct backlight_device *bd)
 	int intensity = bd->props.brightness;
 
 	if (debug)
-		printk(KERN_DEBUG DRIVER "setting brightness to %d\n",
-		       intensity);
+		pr_debug("setting brightness to %d\n", intensity);
 
 	intel_chipset_set_brightness(intensity);
 	return 0;
@@ -76,8 +76,7 @@ static int intel_chipset_get_intensity(struct backlight_device *bd)
 	intensity = inb(0xb3) >> 4;
 
 	if (debug)
-		printk(KERN_DEBUG DRIVER "read brightness of %d\n",
-		       intensity);
+		pr_debug("read brightness of %d\n", intensity);
 
 	return intensity;
 }
@@ -107,8 +106,7 @@ static int nvidia_chipset_send_intensity(struct backlight_device *bd)
 	int intensity = bd->props.brightness;
 
 	if (debug)
-		printk(KERN_DEBUG DRIVER "setting brightness to %d\n",
-		       intensity);
+		pr_debug("setting brightness to %d\n", intensity);
 
 	nvidia_chipset_set_brightness(intensity);
 	return 0;
@@ -123,8 +121,7 @@ static int nvidia_chipset_get_intensity(struct backlight_device *bd)
 	intensity = inb(0x52f) >> 4;
 
 	if (debug)
-		printk(KERN_DEBUG DRIVER "read brightness of %d\n",
-		       intensity);
+		pr_debug("read brightness of %d\n", intensity);
 
 	return intensity;
 }
@@ -149,7 +146,7 @@ static int __devinit apple_bl_add(struct acpi_device *dev)
 	host = pci_get_bus_and_slot(0, 0);
 
 	if (!host) {
-		printk(KERN_ERR DRIVER "unable to find PCI host\n");
+		pr_err("unable to find PCI host\n");
 		return -ENODEV;
 	}
 
@@ -161,7 +158,7 @@ static int __devinit apple_bl_add(struct acpi_device *dev)
 	pci_dev_put(host);
 
 	if (!hw_data) {
-		printk(KERN_ERR DRIVER "unknown hardware\n");
+		pr_err("unknown hardware\n");
 		return -ENODEV;
 	}
 
diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index bf5b1ece7160..297db2fa91f5 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -5,6 +5,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/device.h>
@@ -123,7 +125,7 @@ static ssize_t backlight_store_power(struct device *dev,
 	rc = -ENXIO;
 	mutex_lock(&bd->ops_lock);
 	if (bd->ops) {
-		pr_debug("backlight: set power to %lu\n", power);
+		pr_debug("set power to %lu\n", power);
 		if (bd->props.power != power) {
 			bd->props.power = power;
 			backlight_update_status(bd);
@@ -161,8 +163,7 @@ static ssize_t backlight_store_brightness(struct device *dev,
 		if (brightness > bd->props.max_brightness)
 			rc = -EINVAL;
 		else {
-			pr_debug("backlight: set brightness to %lu\n",
-				 brightness);
+			pr_debug("set brightness to %lu\n", brightness);
 			bd->props.brightness = brightness;
 			backlight_update_status(bd);
 			rc = count;
@@ -378,8 +379,8 @@ static int __init backlight_class_init(void)
 {
 	backlight_class = class_create(THIS_MODULE, "backlight");
 	if (IS_ERR(backlight_class)) {
-		printk(KERN_WARNING "Unable to create backlight class; errno = %ld\n",
-				PTR_ERR(backlight_class));
+		pr_warn("Unable to create backlight class; errno = %ld\n",
+			PTR_ERR(backlight_class));
 		return PTR_ERR(backlight_class);
 	}
 
diff --git a/drivers/video/backlight/corgi_lcd.c b/drivers/video/backlight/corgi_lcd.c
index 6dab13fe562e..23d732677ba1 100644
--- a/drivers/video/backlight/corgi_lcd.c
+++ b/drivers/video/backlight/corgi_lcd.c
@@ -544,7 +544,7 @@ static int __devinit corgi_lcd_probe(struct spi_device *spi)
 		return -EINVAL;
 	}
 
-	lcd = kzalloc(sizeof(struct corgi_lcd), GFP_KERNEL);
+	lcd = devm_kzalloc(&spi->dev, sizeof(struct corgi_lcd), GFP_KERNEL);
 	if (!lcd) {
 		dev_err(&spi->dev, "failed to allocate memory\n");
 		return -ENOMEM;
@@ -554,10 +554,9 @@ static int __devinit corgi_lcd_probe(struct spi_device *spi)
 
 	lcd->lcd_dev = lcd_device_register("corgi_lcd", &spi->dev,
 					lcd, &corgi_lcd_ops);
-	if (IS_ERR(lcd->lcd_dev)) {
-		ret = PTR_ERR(lcd->lcd_dev);
-		goto err_free_lcd;
-	}
+	if (IS_ERR(lcd->lcd_dev))
+		return PTR_ERR(lcd->lcd_dev);
+
 	lcd->power = FB_BLANK_POWERDOWN;
 	lcd->mode = (pdata) ? pdata->init_mode : CORGI_LCD_MODE_VGA;
 
@@ -591,8 +590,6 @@ err_unregister_bl:
 	backlight_device_unregister(lcd->bl_dev);
 err_unregister_lcd:
 	lcd_device_unregister(lcd->lcd_dev);
-err_free_lcd:
-	kfree(lcd);
 	return ret;
 }
 
@@ -613,7 +610,6 @@ static int __devexit corgi_lcd_remove(struct spi_device *spi)
 
 	corgi_lcd_set_power(lcd->lcd_dev, FB_BLANK_POWERDOWN);
 	lcd_device_unregister(lcd->lcd_dev);
-	kfree(lcd);
 
 	return 0;
 }
diff --git a/drivers/video/backlight/cr_bllcd.c b/drivers/video/backlight/cr_bllcd.c
index 22489eb5f3e0..37bae801e23b 100644
--- a/drivers/video/backlight/cr_bllcd.c
+++ b/drivers/video/backlight/cr_bllcd.c
@@ -27,6 +27,8 @@
  *   Alan Hourihane <alanh-at-tungstengraphics-dot-com>
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -180,14 +182,13 @@ static int cr_backlight_probe(struct platform_device *pdev)
 	lpc_dev = pci_get_device(PCI_VENDOR_ID_INTEL,
 					CRVML_DEVICE_LPC, NULL);
 	if (!lpc_dev) {
-		printk("INTEL CARILLO RANCH LPC not found.\n");
+		pr_err("INTEL CARILLO RANCH LPC not found.\n");
 		return -ENODEV;
 	}
 
 	pci_read_config_byte(lpc_dev, CRVML_REG_GPIOEN, &dev_en);
 	if (!(dev_en & CRVML_GPIOEN_BIT)) {
-		printk(KERN_ERR
-		       "Carillo Ranch GPIO device was not enabled.\n");
+		pr_err("Carillo Ranch GPIO device was not enabled.\n");
 		pci_dev_put(lpc_dev);
 		return -ENODEV;
 	}
@@ -270,7 +271,7 @@ static int __init cr_backlight_init(void)
 		return PTR_ERR(crp);
 	}
 
-	printk("Carillo Ranch Backlight Driver Initialized.\n");
+	pr_info("Carillo Ranch Backlight Driver Initialized.\n");
 
 	return 0;
 }
diff --git a/drivers/video/backlight/da903x_bl.c b/drivers/video/backlight/da903x_bl.c
index 30e19681a30b..573c7ece0fde 100644
--- a/drivers/video/backlight/da903x_bl.c
+++ b/drivers/video/backlight/da903x_bl.c
@@ -136,6 +136,7 @@ static int da903x_backlight_probe(struct platform_device *pdev)
 		da903x_write(data->da903x_dev, DA9034_WLED_CONTROL2,
 				DA9034_WLED_ISET(pdata->output_current));
 
+	memset(&props, 0, sizeof(props));
 	props.type = BACKLIGHT_RAW;
 	props.max_brightness = max_brightness;
 	bl = backlight_device_register(pdev->name, data->da903x_dev, data,
diff --git a/drivers/video/backlight/generic_bl.c b/drivers/video/backlight/generic_bl.c
index 9ce6170c1860..8c660fcd250d 100644
--- a/drivers/video/backlight/generic_bl.c
+++ b/drivers/video/backlight/generic_bl.c
@@ -9,6 +9,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -106,7 +108,7 @@ static int genericbl_probe(struct platform_device *pdev)
 
 	generic_backlight_device = bd;
 
-	printk("Generic Backlight Driver Initialized.\n");
+	pr_info("Generic Backlight Driver Initialized.\n");
 	return 0;
 }
 
@@ -120,7 +122,7 @@ static int genericbl_remove(struct platform_device *pdev)
 
 	backlight_device_unregister(bd);
 
-	printk("Generic Backlight Driver Unloaded\n");
+	pr_info("Generic Backlight Driver Unloaded\n");
 	return 0;
 }
 
diff --git a/drivers/video/backlight/ili9320.c b/drivers/video/backlight/ili9320.c
index 5118a9f029ab..6c9399341bcf 100644
--- a/drivers/video/backlight/ili9320.c
+++ b/drivers/video/backlight/ili9320.c
@@ -220,7 +220,7 @@ int __devinit ili9320_probe_spi(struct spi_device *spi,
 
 	/* allocate and initialse our state */
 
-	ili = kzalloc(sizeof(struct ili9320), GFP_KERNEL);
+	ili = devm_kzalloc(&spi->dev, sizeof(struct ili9320), GFP_KERNEL);
 	if (ili == NULL) {
 		dev_err(dev, "no memory for device\n");
 		return -ENOMEM;
@@ -240,8 +240,7 @@ int __devinit ili9320_probe_spi(struct spi_device *spi,
 	lcd = lcd_device_register("ili9320", dev, ili, &ili9320_ops);
 	if (IS_ERR(lcd)) {
 		dev_err(dev, "failed to register lcd device\n");
-		ret = PTR_ERR(lcd);
-		goto err_free;
+		return PTR_ERR(lcd);
 	}
 
 	ili->lcd = lcd;
@@ -259,9 +258,6 @@ int __devinit ili9320_probe_spi(struct spi_device *spi,
  err_unregister:
 	lcd_device_unregister(lcd);
 
- err_free:
-	kfree(ili);
-
 	return ret;
 }
 
@@ -272,7 +268,6 @@ int __devexit ili9320_remove(struct ili9320 *ili)
 	ili9320_power(ili, FB_BLANK_POWERDOWN);
 
 	lcd_device_unregister(ili->lcd);
-	kfree(ili);
 
 	return 0;
 }
diff --git a/drivers/video/backlight/jornada720_bl.c b/drivers/video/backlight/jornada720_bl.c
index 2f8af5d786ab..16f593b64427 100644
--- a/drivers/video/backlight/jornada720_bl.c
+++ b/drivers/video/backlight/jornada720_bl.c
@@ -9,6 +9,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/backlight.h>
 #include <linux/device.h>
 #include <linux/fb.h>
@@ -38,7 +40,7 @@ static int jornada_bl_get_brightness(struct backlight_device *bd)
 	ret = jornada_ssp_byte(GETBRIGHTNESS);
 
 	if (jornada_ssp_byte(GETBRIGHTNESS) != TXDUMMY) {
-		printk(KERN_ERR "bl : get brightness timeout\n");
+		pr_err("get brightness timeout\n");
 		jornada_ssp_end();
 		return -ETIMEDOUT;
 	} else /* exchange txdummy for value */
@@ -59,7 +61,7 @@ static int jornada_bl_update_status(struct backlight_device *bd)
 	if ((bd->props.power != FB_BLANK_UNBLANK) || (bd->props.fb_blank != FB_BLANK_UNBLANK)) {
 		ret = jornada_ssp_byte(BRIGHTNESSOFF);
 		if (ret != TXDUMMY) {
-			printk(KERN_INFO "bl : brightness off timeout\n");
+			pr_info("brightness off timeout\n");
 			/* turn off backlight */
 			PPSR &= ~PPC_LDD1;
 			PPDR |= PPC_LDD1;
@@ -70,7 +72,7 @@ static int jornada_bl_update_status(struct backlight_device *bd)
 
 		/* send command to our mcu */
 		if (jornada_ssp_byte(SETBRIGHTNESS) != TXDUMMY) {
-			printk(KERN_INFO "bl : failed to set brightness\n");
+			pr_info("failed to set brightness\n");
 			ret = -ETIMEDOUT;
 			goto out;
 		}
@@ -81,7 +83,7 @@ static int jornada_bl_update_status(struct backlight_device *bd)
 		   but due to physical layout it is equal to 0, so we simply
 		   invert the value (MAX VALUE - NEW VALUE). */
 		if (jornada_ssp_byte(BL_MAX_BRIGHT - bd->props.brightness) != TXDUMMY) {
-			printk(KERN_ERR "bl : set brightness failed\n");
+			pr_err("set brightness failed\n");
 			ret = -ETIMEDOUT;
 		}
 
@@ -113,7 +115,7 @@ static int jornada_bl_probe(struct platform_device *pdev)
 
 	if (IS_ERR(bd)) {
 		ret = PTR_ERR(bd);
-		printk(KERN_ERR "bl : failed to register device, err=%x\n", ret);
+		pr_err("failed to register device, err=%x\n", ret);
 		return ret;
 	}
 
@@ -125,7 +127,7 @@ static int jornada_bl_probe(struct platform_device *pdev)
 	jornada_bl_update_status(bd);
 
 	platform_set_drvdata(pdev, bd);
-	printk(KERN_INFO "HP Jornada 700 series backlight driver\n");
+	pr_info("HP Jornada 700 series backlight driver\n");
 
 	return 0;
 }
diff --git a/drivers/video/backlight/jornada720_lcd.c b/drivers/video/backlight/jornada720_lcd.c
index 22d231a17e3c..635b30523fd5 100644
--- a/drivers/video/backlight/jornada720_lcd.c
+++ b/drivers/video/backlight/jornada720_lcd.c
@@ -9,6 +9,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/device.h>
 #include <linux/fb.h>
 #include <linux/kernel.h>
@@ -44,7 +46,7 @@ static int jornada_lcd_get_contrast(struct lcd_device *dev)
 	jornada_ssp_start();
 
 	if (jornada_ssp_byte(GETCONTRAST) != TXDUMMY) {
-		printk(KERN_ERR "lcd: get contrast failed\n");
+		pr_err("get contrast failed\n");
 		jornada_ssp_end();
 		return -ETIMEDOUT;
 	} else {
@@ -65,7 +67,7 @@ static int jornada_lcd_set_contrast(struct lcd_device *dev, int value)
 
 	/* push the new value */
 	if (jornada_ssp_byte(value) != TXDUMMY) {
-		printk(KERN_ERR "lcd : set contrast failed\n");
+		pr_err("set contrast failed\n");
 		jornada_ssp_end();
 		return -ETIMEDOUT;
 	}
@@ -103,7 +105,7 @@ static int jornada_lcd_probe(struct platform_device *pdev)
 
 	if (IS_ERR(lcd_device)) {
 		ret = PTR_ERR(lcd_device);
-		printk(KERN_ERR "lcd : failed to register device\n");
+		pr_err("failed to register device\n");
 		return ret;
 	}
 
diff --git a/drivers/video/backlight/l4f00242t03.c b/drivers/video/backlight/l4f00242t03.c
index 6022b67285ec..40f606a86093 100644
--- a/drivers/video/backlight/l4f00242t03.c
+++ b/drivers/video/backlight/l4f00242t03.c
@@ -11,6 +11,8 @@
  * published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
@@ -159,7 +161,8 @@ static int __devinit l4f00242t03_probe(struct spi_device *spi)
 		return -EINVAL;
 	}
 
-	priv = kzalloc(sizeof(struct l4f00242t03_priv), GFP_KERNEL);
+	priv = devm_kzalloc(&spi->dev, sizeof(struct l4f00242t03_priv),
+				GFP_KERNEL);
 
 	if (priv == NULL) {
 		dev_err(&spi->dev, "No memory for this device.\n");
@@ -177,7 +180,7 @@ static int __devinit l4f00242t03_probe(struct spi_device *spi)
 	if (ret) {
 		dev_err(&spi->dev,
 			"Unable to get the lcd l4f00242t03 reset gpio.\n");
-		goto err;
+		return ret;
 	}
 
 	ret = gpio_request_one(pdata->data_enable_gpio, GPIOF_OUT_INIT_LOW,
@@ -185,7 +188,7 @@ static int __devinit l4f00242t03_probe(struct spi_device *spi)
 	if (ret) {
 		dev_err(&spi->dev,
 			"Unable to get the lcd l4f00242t03 data en gpio.\n");
-		goto err2;
+		goto err;
 	}
 
 	priv->io_reg = regulator_get(&spi->dev, "vdd");
@@ -193,7 +196,7 @@ static int __devinit l4f00242t03_probe(struct spi_device *spi)
 		ret = PTR_ERR(priv->io_reg);
 		dev_err(&spi->dev, "%s: Unable to get the IO regulator\n",
 		       __func__);
-		goto err3;
+		goto err2;
 	}
 
 	priv->core_reg = regulator_get(&spi->dev, "vcore");
@@ -201,14 +204,14 @@ static int __devinit l4f00242t03_probe(struct spi_device *spi)
 		ret = PTR_ERR(priv->core_reg);
 		dev_err(&spi->dev, "%s: Unable to get the core regulator\n",
 		       __func__);
-		goto err4;
+		goto err3;
 	}
 
 	priv->ld = lcd_device_register("l4f00242t03",
 					&spi->dev, priv, &l4f_ops);
 	if (IS_ERR(priv->ld)) {
 		ret = PTR_ERR(priv->ld);
-		goto err5;
+		goto err4;
 	}
 
 	/* Init the LCD */
@@ -220,16 +223,14 @@ static int __devinit l4f00242t03_probe(struct spi_device *spi)
 
 	return 0;
 
-err5:
-	regulator_put(priv->core_reg);
 err4:
-	regulator_put(priv->io_reg);
+	regulator_put(priv->core_reg);
 err3:
-	gpio_free(pdata->data_enable_gpio);
+	regulator_put(priv->io_reg);
 err2:
-	gpio_free(pdata->reset_gpio);
+	gpio_free(pdata->data_enable_gpio);
 err:
-	kfree(priv);
+	gpio_free(pdata->reset_gpio);
 
 	return ret;
 }
@@ -250,8 +251,6 @@ static int __devexit l4f00242t03_remove(struct spi_device *spi)
 	regulator_put(priv->io_reg);
 	regulator_put(priv->core_reg);
 
-	kfree(priv);
-
 	return 0;
 }
 
diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index 79c1b0d609a8..a5d0d024bb92 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -5,6 +5,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/device.h>
@@ -32,6 +34,8 @@ static int fb_notifier_callback(struct notifier_block *self,
 	case FB_EVENT_BLANK:
 	case FB_EVENT_MODE_CHANGE:
 	case FB_EVENT_MODE_CHANGE_ALL:
+	case FB_EARLY_EVENT_BLANK:
+	case FB_R_EARLY_EVENT_BLANK:
 		break;
 	default:
 		return 0;
@@ -46,6 +50,14 @@ static int fb_notifier_callback(struct notifier_block *self,
 		if (event == FB_EVENT_BLANK) {
 			if (ld->ops->set_power)
 				ld->ops->set_power(ld, *(int *)evdata->data);
+		} else if (event == FB_EARLY_EVENT_BLANK) {
+			if (ld->ops->early_set_power)
+				ld->ops->early_set_power(ld,
+						*(int *)evdata->data);
+		} else if (event == FB_R_EARLY_EVENT_BLANK) {
+			if (ld->ops->r_early_set_power)
+				ld->ops->r_early_set_power(ld,
+						*(int *)evdata->data);
 		} else {
 			if (ld->ops->set_mode)
 				ld->ops->set_mode(ld, evdata->data);
@@ -106,7 +118,7 @@ static ssize_t lcd_store_power(struct device *dev,
 
 	mutex_lock(&ld->ops_lock);
 	if (ld->ops && ld->ops->set_power) {
-		pr_debug("lcd: set power to %lu\n", power);
+		pr_debug("set power to %lu\n", power);
 		ld->ops->set_power(ld, power);
 		rc = count;
 	}
@@ -142,7 +154,7 @@ static ssize_t lcd_store_contrast(struct device *dev,
 
 	mutex_lock(&ld->ops_lock);
 	if (ld->ops && ld->ops->set_contrast) {
-		pr_debug("lcd: set contrast to %lu\n", contrast);
+		pr_debug("set contrast to %lu\n", contrast);
 		ld->ops->set_contrast(ld, contrast);
 		rc = count;
 	}
@@ -253,8 +265,8 @@ static int __init lcd_class_init(void)
 {
 	lcd_class = class_create(THIS_MODULE, "lcd");
 	if (IS_ERR(lcd_class)) {
-		printk(KERN_WARNING "Unable to create backlight class; errno = %ld\n",
-				PTR_ERR(lcd_class));
+		pr_warn("Unable to create backlight class; errno = %ld\n",
+			PTR_ERR(lcd_class));
 		return PTR_ERR(lcd_class);
 	}
 
diff --git a/drivers/video/backlight/ld9040.c b/drivers/video/backlight/ld9040.c
index efd352be21ae..58f517fb7d40 100644
--- a/drivers/video/backlight/ld9040.c
+++ b/drivers/video/backlight/ld9040.c
@@ -707,7 +707,7 @@ static int ld9040_probe(struct spi_device *spi)
 	struct backlight_device *bd = NULL;
 	struct backlight_properties props;
 
-	lcd = kzalloc(sizeof(struct ld9040), GFP_KERNEL);
+	lcd = devm_kzalloc(&spi->dev, sizeof(struct ld9040), GFP_KERNEL);
 	if (!lcd)
 		return -ENOMEM;
 
@@ -717,7 +717,7 @@ static int ld9040_probe(struct spi_device *spi)
 	ret = spi_setup(spi);
 	if (ret < 0) {
 		dev_err(&spi->dev, "spi setup failed.\n");
-		goto out_free_lcd;
+		return ret;
 	}
 
 	lcd->spi = spi;
@@ -726,7 +726,7 @@ static int ld9040_probe(struct spi_device *spi)
 	lcd->lcd_pd = spi->dev.platform_data;
 	if (!lcd->lcd_pd) {
 		dev_err(&spi->dev, "platform data is NULL.\n");
-		goto out_free_lcd;
+		return -EFAULT;
 	}
 
 	mutex_init(&lcd->lock);
@@ -734,13 +734,13 @@ static int ld9040_probe(struct spi_device *spi)
 	ret = regulator_bulk_get(lcd->dev, ARRAY_SIZE(supplies), supplies);
 	if (ret) {
 		dev_err(lcd->dev, "Failed to get regulators: %d\n", ret);
-		goto out_free_lcd;
+		return ret;
 	}
 
 	ld = lcd_device_register("ld9040", &spi->dev, lcd, &ld9040_lcd_ops);
 	if (IS_ERR(ld)) {
 		ret = PTR_ERR(ld);
-		goto out_free_lcd;
+		goto out_free_regulator;
 	}
 
 	lcd->ld = ld;
@@ -782,10 +782,9 @@ static int ld9040_probe(struct spi_device *spi)
 
 out_unregister_lcd:
 	lcd_device_unregister(lcd->ld);
-out_free_lcd:
+out_free_regulator:
 	regulator_bulk_free(ARRAY_SIZE(supplies), supplies);
 
-	kfree(lcd);
 	return ret;
 }
 
@@ -797,7 +796,6 @@ static int __devexit ld9040_remove(struct spi_device *spi)
 	backlight_device_unregister(lcd->bd);
 	lcd_device_unregister(lcd->ld);
 	regulator_bulk_free(ARRAY_SIZE(supplies), supplies);
-	kfree(lcd);
 
 	return 0;
 }
@@ -846,7 +844,6 @@ static void ld9040_shutdown(struct spi_device *spi)
 static struct spi_driver ld9040_driver = {
 	.driver = {
 		.name	= "ld9040",
-		.bus	= &spi_bus_type,
 		.owner	= THIS_MODULE,
 	},
 	.probe		= ld9040_probe,
diff --git a/drivers/video/backlight/lm3533_bl.c b/drivers/video/backlight/lm3533_bl.c
new file mode 100644
index 000000000000..bebeb63607db
--- /dev/null
+++ b/drivers/video/backlight/lm3533_bl.c
@@ -0,0 +1,423 @@
+/*
+ * lm3533-bl.c -- LM3533 Backlight driver
+ *
+ * Copyright (C) 2011-2012 Texas Instruments
+ *
+ * Author: Johan Hovold <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under  the terms of the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/backlight.h>
+#include <linux/fb.h>
+#include <linux/slab.h>
+
+#include <linux/mfd/lm3533.h>
+
+
+#define LM3533_HVCTRLBANK_COUNT		2
+#define LM3533_BL_MAX_BRIGHTNESS	255
+
+#define LM3533_REG_CTRLBANK_AB_BCONF	0x1a
+
+
+struct lm3533_bl {
+	struct lm3533 *lm3533;
+	struct lm3533_ctrlbank cb;
+	struct backlight_device *bd;
+	int id;
+};
+
+
+static inline int lm3533_bl_get_ctrlbank_id(struct lm3533_bl *bl)
+{
+	return bl->id;
+}
+
+static int lm3533_bl_update_status(struct backlight_device *bd)
+{
+	struct lm3533_bl *bl = bl_get_data(bd);
+	int brightness = bd->props.brightness;
+
+	if (bd->props.power != FB_BLANK_UNBLANK)
+		brightness = 0;
+	if (bd->props.fb_blank != FB_BLANK_UNBLANK)
+		brightness = 0;
+
+	return lm3533_ctrlbank_set_brightness(&bl->cb, (u8)brightness);
+}
+
+static int lm3533_bl_get_brightness(struct backlight_device *bd)
+{
+	struct lm3533_bl *bl = bl_get_data(bd);
+	u8 val;
+	int ret;
+
+	ret = lm3533_ctrlbank_get_brightness(&bl->cb, &val);
+	if (ret)
+		return ret;
+
+	return val;
+}
+
+static const struct backlight_ops lm3533_bl_ops = {
+	.get_brightness	= lm3533_bl_get_brightness,
+	.update_status	= lm3533_bl_update_status,
+};
+
+static ssize_t show_id(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct lm3533_bl *bl = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", bl->id);
+}
+
+static ssize_t show_als_channel(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct lm3533_bl *bl = dev_get_drvdata(dev);
+	unsigned channel = lm3533_bl_get_ctrlbank_id(bl);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", channel);
+}
+
+static ssize_t show_als_en(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct lm3533_bl *bl = dev_get_drvdata(dev);
+	int ctrlbank = lm3533_bl_get_ctrlbank_id(bl);
+	u8 val;
+	u8 mask;
+	bool enable;
+	int ret;
+
+	ret = lm3533_read(bl->lm3533, LM3533_REG_CTRLBANK_AB_BCONF, &val);
+	if (ret)
+		return ret;
+
+	mask = 1 << (2 * ctrlbank);
+	enable = val & mask;
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", enable);
+}
+
+static ssize_t store_als_en(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	struct lm3533_bl *bl = dev_get_drvdata(dev);
+	int ctrlbank = lm3533_bl_get_ctrlbank_id(bl);
+	int enable;
+	u8 val;
+	u8 mask;
+	int ret;
+
+	if (kstrtoint(buf, 0, &enable))
+		return -EINVAL;
+
+	mask = 1 << (2 * ctrlbank);
+
+	if (enable)
+		val = mask;
+	else
+		val = 0;
+
+	ret = lm3533_update(bl->lm3533, LM3533_REG_CTRLBANK_AB_BCONF, val,
+									mask);
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+static ssize_t show_linear(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct lm3533_bl *bl = dev_get_drvdata(dev);
+	u8 val;
+	u8 mask;
+	int linear;
+	int ret;
+
+	ret = lm3533_read(bl->lm3533, LM3533_REG_CTRLBANK_AB_BCONF, &val);
+	if (ret)
+		return ret;
+
+	mask = 1 << (2 * lm3533_bl_get_ctrlbank_id(bl) + 1);
+
+	if (val & mask)
+		linear = 1;
+	else
+		linear = 0;
+
+	return scnprintf(buf, PAGE_SIZE, "%x\n", linear);
+}
+
+static ssize_t store_linear(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	struct lm3533_bl *bl = dev_get_drvdata(dev);
+	unsigned long linear;
+	u8 mask;
+	u8 val;
+	int ret;
+
+	if (kstrtoul(buf, 0, &linear))
+		return -EINVAL;
+
+	mask = 1 << (2 * lm3533_bl_get_ctrlbank_id(bl) + 1);
+
+	if (linear)
+		val = mask;
+	else
+		val = 0;
+
+	ret = lm3533_update(bl->lm3533, LM3533_REG_CTRLBANK_AB_BCONF, val,
+									mask);
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+static ssize_t show_pwm(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct lm3533_bl *bl = dev_get_drvdata(dev);
+	u8 val;
+	int ret;
+
+	ret = lm3533_ctrlbank_get_pwm(&bl->cb, &val);
+	if (ret)
+		return ret;
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t store_pwm(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	struct lm3533_bl *bl = dev_get_drvdata(dev);
+	u8 val;
+	int ret;
+
+	if (kstrtou8(buf, 0, &val))
+		return -EINVAL;
+
+	ret = lm3533_ctrlbank_set_pwm(&bl->cb, val);
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+static LM3533_ATTR_RO(als_channel);
+static LM3533_ATTR_RW(als_en);
+static LM3533_ATTR_RO(id);
+static LM3533_ATTR_RW(linear);
+static LM3533_ATTR_RW(pwm);
+
+static struct attribute *lm3533_bl_attributes[] = {
+	&dev_attr_als_channel.attr,
+	&dev_attr_als_en.attr,
+	&dev_attr_id.attr,
+	&dev_attr_linear.attr,
+	&dev_attr_pwm.attr,
+	NULL,
+};
+
+static umode_t lm3533_bl_attr_is_visible(struct kobject *kobj,
+					     struct attribute *attr, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct lm3533_bl *bl = dev_get_drvdata(dev);
+	umode_t mode = attr->mode;
+
+	if (attr == &dev_attr_als_channel.attr ||
+					attr == &dev_attr_als_en.attr) {
+		if (!bl->lm3533->have_als)
+			mode = 0;
+	}
+
+	return mode;
+};
+
+static struct attribute_group lm3533_bl_attribute_group = {
+	.is_visible	= lm3533_bl_attr_is_visible,
+	.attrs		= lm3533_bl_attributes
+};
+
+static int __devinit lm3533_bl_setup(struct lm3533_bl *bl,
+					struct lm3533_bl_platform_data *pdata)
+{
+	int ret;
+
+	ret = lm3533_ctrlbank_set_max_current(&bl->cb, pdata->max_current);
+	if (ret)
+		return ret;
+
+	return lm3533_ctrlbank_set_pwm(&bl->cb, pdata->pwm);
+}
+
+static int __devinit lm3533_bl_probe(struct platform_device *pdev)
+{
+	struct lm3533 *lm3533;
+	struct lm3533_bl_platform_data *pdata;
+	struct lm3533_bl *bl;
+	struct backlight_device *bd;
+	struct backlight_properties props;
+	int ret;
+
+	dev_dbg(&pdev->dev, "%s\n", __func__);
+
+	lm3533 = dev_get_drvdata(pdev->dev.parent);
+	if (!lm3533)
+		return -EINVAL;
+
+	pdata = pdev->dev.platform_data;
+	if (!pdata) {
+		dev_err(&pdev->dev, "no platform data\n");
+		return -EINVAL;
+	}
+
+	if (pdev->id < 0 || pdev->id >= LM3533_HVCTRLBANK_COUNT) {
+		dev_err(&pdev->dev, "illegal backlight id %d\n", pdev->id);
+		return -EINVAL;
+	}
+
+	bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+	if (!bl) {
+		dev_err(&pdev->dev,
+				"failed to allocate memory for backlight\n");
+		return -ENOMEM;
+	}
+
+	bl->lm3533 = lm3533;
+	bl->id = pdev->id;
+
+	bl->cb.lm3533 = lm3533;
+	bl->cb.id = lm3533_bl_get_ctrlbank_id(bl);
+	bl->cb.dev = NULL;			/* until registered */
+
+	memset(&props, 0, sizeof(props));
+	props.type = BACKLIGHT_RAW;
+	props.max_brightness = LM3533_BL_MAX_BRIGHTNESS;
+	props.brightness = pdata->default_brightness;
+	bd = backlight_device_register(pdata->name, pdev->dev.parent, bl,
+						&lm3533_bl_ops, &props);
+	if (IS_ERR(bd)) {
+		dev_err(&pdev->dev, "failed to register backlight device\n");
+		ret = PTR_ERR(bd);
+		goto err_free;
+	}
+
+	bl->bd = bd;
+	bl->cb.dev = &bl->bd->dev;
+
+	platform_set_drvdata(pdev, bl);
+
+	ret = sysfs_create_group(&bd->dev.kobj, &lm3533_bl_attribute_group);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "failed to create sysfs attributes\n");
+		goto err_unregister;
+	}
+
+	backlight_update_status(bd);
+
+	ret = lm3533_bl_setup(bl, pdata);
+	if (ret)
+		goto err_sysfs_remove;
+
+	ret = lm3533_ctrlbank_enable(&bl->cb);
+	if (ret)
+		goto err_sysfs_remove;
+
+	return 0;
+
+err_sysfs_remove:
+	sysfs_remove_group(&bd->dev.kobj, &lm3533_bl_attribute_group);
+err_unregister:
+	backlight_device_unregister(bd);
+err_free:
+	kfree(bl);
+
+	return ret;
+}
+
+static int __devexit lm3533_bl_remove(struct platform_device *pdev)
+{
+	struct lm3533_bl *bl = platform_get_drvdata(pdev);
+	struct backlight_device *bd = bl->bd;
+
+	dev_dbg(&bd->dev, "%s\n", __func__);
+
+	bd->props.power = FB_BLANK_POWERDOWN;
+	bd->props.brightness = 0;
+
+	lm3533_ctrlbank_disable(&bl->cb);
+	sysfs_remove_group(&bd->dev.kobj, &lm3533_bl_attribute_group);
+	backlight_device_unregister(bd);
+	kfree(bl);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int lm3533_bl_suspend(struct platform_device *pdev, pm_message_t state)
+{
+	struct lm3533_bl *bl = platform_get_drvdata(pdev);
+
+	dev_dbg(&pdev->dev, "%s\n", __func__);
+
+	return lm3533_ctrlbank_disable(&bl->cb);
+}
+
+static int lm3533_bl_resume(struct platform_device *pdev)
+{
+	struct lm3533_bl *bl = platform_get_drvdata(pdev);
+
+	dev_dbg(&pdev->dev, "%s\n", __func__);
+
+	return lm3533_ctrlbank_enable(&bl->cb);
+}
+#else
+#define lm3533_bl_suspend	NULL
+#define lm3533_bl_resume	NULL
+#endif
+
+static void lm3533_bl_shutdown(struct platform_device *pdev)
+{
+	struct lm3533_bl *bl = platform_get_drvdata(pdev);
+
+	dev_dbg(&pdev->dev, "%s\n", __func__);
+
+	lm3533_ctrlbank_disable(&bl->cb);
+}
+
+static struct platform_driver lm3533_bl_driver = {
+	.driver = {
+		.name	= "lm3533-backlight",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= lm3533_bl_probe,
+	.remove		= __devexit_p(lm3533_bl_remove),
+	.shutdown	= lm3533_bl_shutdown,
+	.suspend	= lm3533_bl_suspend,
+	.resume		= lm3533_bl_resume,
+};
+module_platform_driver(lm3533_bl_driver);
+
+MODULE_AUTHOR("Johan Hovold <[email protected]>");
+MODULE_DESCRIPTION("LM3533 Backlight driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:lm3533-backlight");
diff --git a/drivers/video/backlight/lms283gf05.c b/drivers/video/backlight/lms283gf05.c
index 4161f9e3982a..a9f2c36966f1 100644
--- a/drivers/video/backlight/lms283gf05.c
+++ b/drivers/video/backlight/lms283gf05.c
@@ -168,7 +168,8 @@ static int __devinit lms283gf05_probe(struct spi_device *spi)
 			goto err;
 	}
 
-	st = kzalloc(sizeof(struct lms283gf05_state), GFP_KERNEL);
+	st = devm_kzalloc(&spi->dev, sizeof(struct lms283gf05_state),
+				GFP_KERNEL);
 	if (st == NULL) {
 		dev_err(&spi->dev, "No memory for device state\n");
 		ret = -ENOMEM;
@@ -178,7 +179,7 @@ static int __devinit lms283gf05_probe(struct spi_device *spi)
 	ld = lcd_device_register("lms283gf05", &spi->dev, st, &lms_ops);
 	if (IS_ERR(ld)) {
 		ret = PTR_ERR(ld);
-		goto err2;
+		goto err;
 	}
 
 	st->spi = spi;
@@ -193,8 +194,6 @@ static int __devinit lms283gf05_probe(struct spi_device *spi)
 
 	return 0;
 
-err2:
-	kfree(st);
 err:
 	if (pdata != NULL)
 		gpio_free(pdata->reset_gpio);
@@ -212,8 +211,6 @@ static int __devexit lms283gf05_remove(struct spi_device *spi)
 	if (pdata != NULL)
 		gpio_free(pdata->reset_gpio);
 
-	kfree(st);
-
 	return 0;
 }
 
diff --git a/drivers/video/backlight/ltv350qv.c b/drivers/video/backlight/ltv350qv.c
index 333949ff3265..6c0f1ac0d32a 100644
--- a/drivers/video/backlight/ltv350qv.c
+++ b/drivers/video/backlight/ltv350qv.c
@@ -232,23 +232,20 @@ static int __devinit ltv350qv_probe(struct spi_device *spi)
 	struct lcd_device *ld;
 	int ret;
 
-	lcd = kzalloc(sizeof(struct ltv350qv), GFP_KERNEL);
+	lcd = devm_kzalloc(&spi->dev, sizeof(struct ltv350qv), GFP_KERNEL);
 	if (!lcd)
 		return -ENOMEM;
 
 	lcd->spi = spi;
 	lcd->power = FB_BLANK_POWERDOWN;
-	lcd->buffer = kzalloc(8, GFP_KERNEL);
-	if (!lcd->buffer) {
-		ret = -ENOMEM;
-		goto out_free_lcd;
-	}
+	lcd->buffer = devm_kzalloc(&spi->dev, 8, GFP_KERNEL);
+	if (!lcd->buffer)
+		return -ENOMEM;
 
 	ld = lcd_device_register("ltv350qv", &spi->dev, lcd, &ltv_ops);
-	if (IS_ERR(ld)) {
-		ret = PTR_ERR(ld);
-		goto out_free_buffer;
-	}
+	if (IS_ERR(ld))
+		return PTR_ERR(ld);
+
 	lcd->ld = ld;
 
 	ret = ltv350qv_power(lcd, FB_BLANK_UNBLANK);
@@ -261,10 +258,6 @@ static int __devinit ltv350qv_probe(struct spi_device *spi)
 
 out_unregister:
 	lcd_device_unregister(ld);
-out_free_buffer:
-	kfree(lcd->buffer);
-out_free_lcd:
-	kfree(lcd);
 	return ret;
 }
 
@@ -274,8 +267,6 @@ static int __devexit ltv350qv_remove(struct spi_device *spi)
 
 	ltv350qv_power(lcd, FB_BLANK_POWERDOWN);
 	lcd_device_unregister(lcd->ld);
-	kfree(lcd->buffer);
-	kfree(lcd);
 
 	return 0;
 }
@@ -310,7 +301,6 @@ static void ltv350qv_shutdown(struct spi_device *spi)
 static struct spi_driver ltv350qv_driver = {
 	.driver = {
 		.name		= "ltv350qv",
-		.bus		= &spi_bus_type,
 		.owner		= THIS_MODULE,
 	},
 
diff --git a/drivers/video/backlight/omap1_bl.c b/drivers/video/backlight/omap1_bl.c
index 0175bfb08a1c..bfdc5fbeaa11 100644
--- a/drivers/video/backlight/omap1_bl.c
+++ b/drivers/video/backlight/omap1_bl.c
@@ -18,6 +18,8 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -168,7 +170,7 @@ static int omapbl_probe(struct platform_device *pdev)
 	dev->props.brightness = pdata->default_intensity;
 	omapbl_update_status(dev);
 
-	printk(KERN_INFO "OMAP LCD backlight initialised\n");
+	pr_info("OMAP LCD backlight initialised\n");
 
 	return 0;
 }
diff --git a/drivers/video/backlight/pcf50633-backlight.c b/drivers/video/backlight/pcf50633-backlight.c
index c65853cb9740..c092159f4383 100644
--- a/drivers/video/backlight/pcf50633-backlight.c
+++ b/drivers/video/backlight/pcf50633-backlight.c
@@ -111,6 +111,7 @@ static int __devinit pcf50633_bl_probe(struct platform_device *pdev)
 	if (!pcf_bl)
 		return -ENOMEM;
 
+	memset(&bl_props, 0, sizeof(bl_props));
 	bl_props.type = BACKLIGHT_RAW;
 	bl_props.max_brightness = 0x3f;
 	bl_props.power = FB_BLANK_UNBLANK;
diff --git a/drivers/video/backlight/progear_bl.c b/drivers/video/backlight/progear_bl.c
index 6af183d6465e..69b35f02929e 100644
--- a/drivers/video/backlight/progear_bl.c
+++ b/drivers/video/backlight/progear_bl.c
@@ -15,6 +15,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -68,13 +70,13 @@ static int progearbl_probe(struct platform_device *pdev)
 
 	pmu_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M7101, NULL);
 	if (!pmu_dev) {
-		printk("ALI M7101 PMU not found.\n");
+		pr_err("ALI M7101 PMU not found.\n");
 		return -ENODEV;
 	}
 
 	sb_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
 	if (!sb_dev) {
-		printk("ALI 1533 SB not found.\n");
+		pr_err("ALI 1533 SB not found.\n");
 		ret = -ENODEV;
 		goto put_pmu;
 	}
diff --git a/drivers/video/backlight/s6e63m0.c b/drivers/video/backlight/s6e63m0.c
index e264f55b2574..6437ae474cf2 100644
--- a/drivers/video/backlight/s6e63m0.c
+++ b/drivers/video/backlight/s6e63m0.c
@@ -741,7 +741,7 @@ static int __devinit s6e63m0_probe(struct spi_device *spi)
 	struct backlight_device *bd = NULL;
 	struct backlight_properties props;
 
-	lcd = kzalloc(sizeof(struct s6e63m0), GFP_KERNEL);
+	lcd = devm_kzalloc(&spi->dev, sizeof(struct s6e63m0), GFP_KERNEL);
 	if (!lcd)
 		return -ENOMEM;
 
@@ -751,7 +751,7 @@ static int __devinit s6e63m0_probe(struct spi_device *spi)
 	ret = spi_setup(spi);
 	if (ret < 0) {
 		dev_err(&spi->dev, "spi setup failed.\n");
-		goto out_free_lcd;
+		return ret;
 	}
 
 	lcd->spi = spi;
@@ -760,14 +760,12 @@ static int __devinit s6e63m0_probe(struct spi_device *spi)
 	lcd->lcd_pd = (struct lcd_platform_data *)spi->dev.platform_data;
 	if (!lcd->lcd_pd) {
 		dev_err(&spi->dev, "platform data is NULL.\n");
-		goto out_free_lcd;
+		return -EFAULT;
 	}
 
 	ld = lcd_device_register("s6e63m0", &spi->dev, lcd, &s6e63m0_lcd_ops);
-	if (IS_ERR(ld)) {
-		ret = PTR_ERR(ld);
-		goto out_free_lcd;
-	}
+	if (IS_ERR(ld))
+		return PTR_ERR(ld);
 
 	lcd->ld = ld;
 
@@ -824,8 +822,6 @@ static int __devinit s6e63m0_probe(struct spi_device *spi)
 
 out_lcd_unregister:
 	lcd_device_unregister(ld);
-out_free_lcd:
-	kfree(lcd);
 	return ret;
 }
 
@@ -838,7 +834,6 @@ static int __devexit s6e63m0_remove(struct spi_device *spi)
 	device_remove_file(&spi->dev, &dev_attr_gamma_mode);
 	backlight_device_unregister(lcd->bd);
 	lcd_device_unregister(lcd->ld);
-	kfree(lcd);
 
 	return 0;
 }
@@ -899,7 +894,6 @@ static void s6e63m0_shutdown(struct spi_device *spi)
 static struct spi_driver s6e63m0_driver = {
 	.driver = {
 		.name	= "s6e63m0",
-		.bus	= &spi_bus_type,
 		.owner	= THIS_MODULE,
 	},
 	.probe		= s6e63m0_probe,
diff --git a/drivers/video/backlight/tdo24m.c b/drivers/video/backlight/tdo24m.c
index 2368b8e5f89e..02444d042cd5 100644
--- a/drivers/video/backlight/tdo24m.c
+++ b/drivers/video/backlight/tdo24m.c
@@ -349,7 +349,7 @@ static int __devinit tdo24m_probe(struct spi_device *spi)
 	if (err)
 		return err;
 
-	lcd = kzalloc(sizeof(struct tdo24m), GFP_KERNEL);
+	lcd = devm_kzalloc(&spi->dev, sizeof(struct tdo24m), GFP_KERNEL);
 	if (!lcd)
 		return -ENOMEM;
 
@@ -357,11 +357,9 @@ static int __devinit tdo24m_probe(struct spi_device *spi)
 	lcd->power = FB_BLANK_POWERDOWN;
 	lcd->mode = MODE_VGA;	/* default to VGA */
 
-	lcd->buf = kmalloc(TDO24M_SPI_BUFF_SIZE, GFP_KERNEL);
-	if (lcd->buf == NULL) {
-		kfree(lcd);
+	lcd->buf = devm_kzalloc(&spi->dev, TDO24M_SPI_BUFF_SIZE, GFP_KERNEL);
+	if (lcd->buf == NULL)
 		return -ENOMEM;
-	}
 
 	m = &lcd->msg;
 	x = &lcd->xfer;
@@ -383,15 +381,13 @@ static int __devinit tdo24m_probe(struct spi_device *spi)
 		break;
 	default:
 		dev_err(&spi->dev, "Unsupported model");
-		goto out_free;
+		return -EINVAL;
 	}
 
 	lcd->lcd_dev = lcd_device_register("tdo24m", &spi->dev,
 					lcd, &tdo24m_ops);
-	if (IS_ERR(lcd->lcd_dev)) {
-		err = PTR_ERR(lcd->lcd_dev);
-		goto out_free;
-	}
+	if (IS_ERR(lcd->lcd_dev))
+		return PTR_ERR(lcd->lcd_dev);
 
 	dev_set_drvdata(&spi->dev, lcd);
 	err = tdo24m_power(lcd, FB_BLANK_UNBLANK);
@@ -402,9 +398,6 @@ static int __devinit tdo24m_probe(struct spi_device *spi)
 
 out_unregister:
 	lcd_device_unregister(lcd->lcd_dev);
-out_free:
-	kfree(lcd->buf);
-	kfree(lcd);
 	return err;
 }
 
@@ -414,8 +407,6 @@ static int __devexit tdo24m_remove(struct spi_device *spi)
 
 	tdo24m_power(lcd, FB_BLANK_POWERDOWN);
 	lcd_device_unregister(lcd->lcd_dev);
-	kfree(lcd->buf);
-	kfree(lcd);
 
 	return 0;
 }
diff --git a/drivers/video/backlight/tosa_bl.c b/drivers/video/backlight/tosa_bl.c
index 2b241abced43..0d54e607e82d 100644
--- a/drivers/video/backlight/tosa_bl.c
+++ b/drivers/video/backlight/tosa_bl.c
@@ -82,8 +82,11 @@ static int __devinit tosa_bl_probe(struct i2c_client *client,
 		const struct i2c_device_id *id)
 {
 	struct backlight_properties props;
-	struct tosa_bl_data *data = kzalloc(sizeof(struct tosa_bl_data), GFP_KERNEL);
+	struct tosa_bl_data *data;
 	int ret = 0;
+
+	data = devm_kzalloc(&client->dev, sizeof(struct tosa_bl_data),
+				GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
 
@@ -92,7 +95,7 @@ static int __devinit tosa_bl_probe(struct i2c_client *client,
 	ret = gpio_request(TOSA_GPIO_BL_C20MA, "backlight");
 	if (ret) {
 		dev_dbg(&data->bl->dev, "Unable to request gpio!\n");
-		goto err_gpio_bl;
+		return ret;
 	}
 	ret = gpio_direction_output(TOSA_GPIO_BL_C20MA, 0);
 	if (ret)
@@ -122,8 +125,6 @@ err_reg:
 	data->bl = NULL;
 err_gpio_dir:
 	gpio_free(TOSA_GPIO_BL_C20MA);
-err_gpio_bl:
-	kfree(data);
 	return ret;
 }
 
@@ -136,8 +137,6 @@ static int __devexit tosa_bl_remove(struct i2c_client *client)
 
 	gpio_free(TOSA_GPIO_BL_C20MA);
 
-	kfree(data);
-
 	return 0;
 }
 
diff --git a/drivers/video/backlight/tosa_lcd.c b/drivers/video/backlight/tosa_lcd.c
index 2231aec23918..47823b8efff0 100644
--- a/drivers/video/backlight/tosa_lcd.c
+++ b/drivers/video/backlight/tosa_lcd.c
@@ -174,7 +174,8 @@ static int __devinit tosa_lcd_probe(struct spi_device *spi)
 	int ret;
 	struct tosa_lcd_data *data;
 
-	data = kzalloc(sizeof(struct tosa_lcd_data), GFP_KERNEL);
+	data = devm_kzalloc(&spi->dev, sizeof(struct tosa_lcd_data),
+				GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
 
@@ -187,7 +188,7 @@ static int __devinit tosa_lcd_probe(struct spi_device *spi)
 
 	ret = spi_setup(spi);
 	if (ret < 0)
-		goto err_spi;
+		return ret;
 
 	data->spi = spi;
 	dev_set_drvdata(&spi->dev, data);
@@ -224,8 +225,6 @@ err_gpio_dir:
 	gpio_free(TOSA_GPIO_TG_ON);
 err_gpio_tg:
 	dev_set_drvdata(&spi->dev, NULL);
-err_spi:
-	kfree(data);
 	return ret;
 }
 
@@ -242,7 +241,6 @@ static int __devexit tosa_lcd_remove(struct spi_device *spi)
 
 	gpio_free(TOSA_GPIO_TG_ON);
 	dev_set_drvdata(&spi->dev, NULL);
-	kfree(data);
 
 	return 0;
 }
diff --git a/drivers/video/backlight/wm831x_bl.c b/drivers/video/backlight/wm831x_bl.c
index 5d365deb5f82..9e5517a3a52b 100644
--- a/drivers/video/backlight/wm831x_bl.c
+++ b/drivers/video/backlight/wm831x_bl.c
@@ -194,6 +194,7 @@ static int wm831x_backlight_probe(struct platform_device *pdev)
 	data->current_brightness = 0;
 	data->isink_reg = isink_reg;
 
+	memset(&props, 0, sizeof(props));
 	props.type = BACKLIGHT_RAW;
 	props.max_brightness = max_isel;
 	bl = backlight_device_register("wm831x", &pdev->dev, data,
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index c6ce416ab587..0dff12a1daef 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1046,20 +1046,29 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var)
 int
 fb_blank(struct fb_info *info, int blank)
 {	
- 	int ret = -EINVAL;
+	struct fb_event event;
+	int ret = -EINVAL, early_ret;
 
  	if (blank > FB_BLANK_POWERDOWN)
  		blank = FB_BLANK_POWERDOWN;
 
+	event.info = info;
+	event.data = &blank;
+
+	early_ret = fb_notifier_call_chain(FB_EARLY_EVENT_BLANK, &event);
+
 	if (info->fbops->fb_blank)
  		ret = info->fbops->fb_blank(blank, info);
 
- 	if (!ret) {
-		struct fb_event event;
-
-		event.info = info;
-		event.data = &blank;
+	if (!ret)
 		fb_notifier_call_chain(FB_EVENT_BLANK, &event);
+	else {
+		/*
+		 * if fb_blank is failed then revert effects of
+		 * the early blank event.
+		 */
+		if (!early_ret)
+			fb_notifier_call_chain(FB_R_EARLY_EVENT_BLANK, &event);
 	}
 
  	return ret;
diff --git a/drivers/video/omap2/displays/panel-acx565akm.c b/drivers/video/omap2/displays/panel-acx565akm.c
index d26f37ac69d8..74e7cf078505 100644
--- a/drivers/video/omap2/displays/panel-acx565akm.c
+++ b/drivers/video/omap2/displays/panel-acx565akm.c
@@ -532,6 +532,7 @@ static int acx_panel_probe(struct omap_dss_device *dssdev)
 
 	/*------- Backlight control --------*/
 
+	memset(&props, 0, sizeof(props));
 	props.fb_blank = FB_BLANK_UNBLANK;
 	props.power = FB_BLANK_UNBLANK;
 	props.type = BACKLIGHT_RAW;
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index a18bf6358eb8..d92d7488be16 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -565,6 +565,7 @@ config INTEL_SCU_WATCHDOG
 config ITCO_WDT
 	tristate "Intel TCO Timer/Watchdog"
 	depends on (X86 || IA64) && PCI
+	select LPC_ICH
 	---help---
 	  Hardware driver for the intel TCO timer based watchdog devices.
 	  These drivers are included in the Intel 82801 I/O Controller
diff --git a/drivers/watchdog/iTCO_vendor.h b/drivers/watchdog/iTCO_vendor.h
index 9e27e6422f66..3c57b45537a2 100644
--- a/drivers/watchdog/iTCO_vendor.h
+++ b/drivers/watchdog/iTCO_vendor.h
@@ -1,8 +1,8 @@
 /* iTCO Vendor Specific Support hooks */
 #ifdef CONFIG_ITCO_VENDOR_SUPPORT
-extern void iTCO_vendor_pre_start(unsigned long, unsigned int);
-extern void iTCO_vendor_pre_stop(unsigned long);
-extern void iTCO_vendor_pre_keepalive(unsigned long, unsigned int);
+extern void iTCO_vendor_pre_start(struct resource *, unsigned int);
+extern void iTCO_vendor_pre_stop(struct resource *);
+extern void iTCO_vendor_pre_keepalive(struct resource *, unsigned int);
 extern void iTCO_vendor_pre_set_heartbeat(unsigned int);
 extern int iTCO_vendor_check_noreboot_on(void);
 #else
diff --git a/drivers/watchdog/iTCO_vendor_support.c b/drivers/watchdog/iTCO_vendor_support.c
index 2721d29ce243..b6b2f90b5d44 100644
--- a/drivers/watchdog/iTCO_vendor_support.c
+++ b/drivers/watchdog/iTCO_vendor_support.c
@@ -35,11 +35,6 @@
 
 #include "iTCO_vendor.h"
 
-/* iTCO defines */
-#define	SMI_EN		(acpibase + 0x30) /* SMI Control and Enable Register */
-#define	TCOBASE		(acpibase + 0x60) /* TCO base address */
-#define	TCO1_STS	(TCOBASE + 0x04)  /* TCO1 Status Register */
-
 /* List of vendor support modes */
 /* SuperMicro Pentium 3 Era 370SSE+-OEM1/P3TSSE */
 #define SUPERMICRO_OLD_BOARD	1
@@ -82,24 +77,24 @@ MODULE_PARM_DESC(vendorsupport, "iTCO vendor specific support mode, default="
  *	    20.6 seconds.
  */
 
-static void supermicro_old_pre_start(unsigned long acpibase)
+static void supermicro_old_pre_start(struct resource *smires)
 {
 	unsigned long val32;
 
 	/* Bit 13: TCO_EN -> 0 = Disables TCO logic generating an SMI# */
-	val32 = inl(SMI_EN);
+	val32 = inl(smires->start);
 	val32 &= 0xffffdfff;	/* Turn off SMI clearing watchdog */
-	outl(val32, SMI_EN);	/* Needed to activate watchdog */
+	outl(val32, smires->start);	/* Needed to activate watchdog */
 }
 
-static void supermicro_old_pre_stop(unsigned long acpibase)
+static void supermicro_old_pre_stop(struct resource *smires)
 {
 	unsigned long val32;
 
 	/* Bit 13: TCO_EN -> 1 = Enables the TCO logic to generate SMI# */
-	val32 = inl(SMI_EN);
+	val32 = inl(smires->start);
 	val32 |= 0x00002000;	/* Turn on SMI clearing watchdog */
-	outl(val32, SMI_EN);	/* Needed to deactivate watchdog */
+	outl(val32, smires->start);	/* Needed to deactivate watchdog */
 }
 
 /*
@@ -270,66 +265,66 @@ static void supermicro_new_pre_set_heartbeat(unsigned int heartbeat)
  *	Don't use this fix if you don't need to!!!
  */
 
-static void broken_bios_start(unsigned long acpibase)
+static void broken_bios_start(struct resource *smires)
 {
 	unsigned long val32;
 
-	val32 = inl(SMI_EN);
+	val32 = inl(smires->start);
 	/* Bit 13: TCO_EN     -> 0 = Disables TCO logic generating an SMI#
 	   Bit  0: GBL_SMI_EN -> 0 = No SMI# will be generated by ICH. */
 	val32 &= 0xffffdffe;
-	outl(val32, SMI_EN);
+	outl(val32, smires->start);
 }
 
-static void broken_bios_stop(unsigned long acpibase)
+static void broken_bios_stop(struct resource *smires)
 {
 	unsigned long val32;
 
-	val32 = inl(SMI_EN);
+	val32 = inl(smires->start);
 	/* Bit 13: TCO_EN     -> 1 = Enables TCO logic generating an SMI#
 	   Bit  0: GBL_SMI_EN -> 1 = Turn global SMI on again. */
 	val32 |= 0x00002001;
-	outl(val32, SMI_EN);
+	outl(val32, smires->start);
 }
 
 /*
  *	Generic Support Functions
  */
 
-void iTCO_vendor_pre_start(unsigned long acpibase,
+void iTCO_vendor_pre_start(struct resource *smires,
 			   unsigned int heartbeat)
 {
 	switch (vendorsupport) {
 	case SUPERMICRO_OLD_BOARD:
-		supermicro_old_pre_start(acpibase);
+		supermicro_old_pre_start(smires);
 		break;
 	case SUPERMICRO_NEW_BOARD:
 		supermicro_new_pre_start(heartbeat);
 		break;
 	case BROKEN_BIOS:
-		broken_bios_start(acpibase);
+		broken_bios_start(smires);
 		break;
 	}
 }
 EXPORT_SYMBOL(iTCO_vendor_pre_start);
 
-void iTCO_vendor_pre_stop(unsigned long acpibase)
+void iTCO_vendor_pre_stop(struct resource *smires)
 {
 	switch (vendorsupport) {
 	case SUPERMICRO_OLD_BOARD:
-		supermicro_old_pre_stop(acpibase);
+		supermicro_old_pre_stop(smires);
 		break;
 	case SUPERMICRO_NEW_BOARD:
 		supermicro_new_pre_stop();
 		break;
 	case BROKEN_BIOS:
-		broken_bios_stop(acpibase);
+		broken_bios_stop(smires);
 		break;
 	}
 }
 EXPORT_SYMBOL(iTCO_vendor_pre_stop);
 
-void iTCO_vendor_pre_keepalive(unsigned long acpibase, unsigned int heartbeat)
+void iTCO_vendor_pre_keepalive(struct resource *smires, unsigned int heartbeat)
 {
 	if (vendorsupport == SUPERMICRO_NEW_BOARD)
 		supermicro_new_pre_set_heartbeat(heartbeat);
diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index 9fecb95645a3..741528b032e2 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -66,316 +66,16 @@
 #include <linux/spinlock.h>		/* For spin_lock/spin_unlock/... */
 #include <linux/uaccess.h>		/* For copy_to_user/put_user/... */
 #include <linux/io.h>			/* For inb/outb/... */
+#include <linux/mfd/core.h>
+#include <linux/mfd/lpc_ich.h>
 
 #include "iTCO_vendor.h"
 
-/* TCO related info */
-enum iTCO_chipsets {
-	TCO_ICH = 0,	/* ICH */
-	TCO_ICH0,	/* ICH0 */
-	TCO_ICH2,	/* ICH2 */
-	TCO_ICH2M,	/* ICH2-M */
-	TCO_ICH3,	/* ICH3-S */
-	TCO_ICH3M,	/* ICH3-M */
-	TCO_ICH4,	/* ICH4 */
-	TCO_ICH4M,	/* ICH4-M */
-	TCO_CICH,	/* C-ICH */
-	TCO_ICH5,	/* ICH5 & ICH5R */
-	TCO_6300ESB,	/* 6300ESB */
-	TCO_ICH6,	/* ICH6 & ICH6R */
-	TCO_ICH6M,	/* ICH6-M */
-	TCO_ICH6W,	/* ICH6W & ICH6RW */
-	TCO_631XESB,	/* 631xESB/632xESB */
-	TCO_ICH7,	/* ICH7 & ICH7R */
-	TCO_ICH7DH,	/* ICH7DH */
-	TCO_ICH7M,	/* ICH7-M & ICH7-U */
-	TCO_ICH7MDH,	/* ICH7-M DH */
-	TCO_NM10,	/* NM10 */
-	TCO_ICH8,	/* ICH8 & ICH8R */
-	TCO_ICH8DH,	/* ICH8DH */
-	TCO_ICH8DO,	/* ICH8DO */
-	TCO_ICH8M,	/* ICH8M */
-	TCO_ICH8ME,	/* ICH8M-E */
-	TCO_ICH9,	/* ICH9 */
-	TCO_ICH9R,	/* ICH9R */
-	TCO_ICH9DH,	/* ICH9DH */
-	TCO_ICH9DO,	/* ICH9DO */
-	TCO_ICH9M,	/* ICH9M */
-	TCO_ICH9ME,	/* ICH9M-E */
-	TCO_ICH10,	/* ICH10 */
-	TCO_ICH10R,	/* ICH10R */
-	TCO_ICH10D,	/* ICH10D */
-	TCO_ICH10DO,	/* ICH10DO */
-	TCO_PCH,	/* PCH Desktop Full Featured */
-	TCO_PCHM,	/* PCH Mobile Full Featured */
-	TCO_P55,	/* P55 */
-	TCO_PM55,	/* PM55 */
-	TCO_H55,	/* H55 */
-	TCO_QM57,	/* QM57 */
-	TCO_H57,	/* H57 */
-	TCO_HM55,	/* HM55 */
-	TCO_Q57,	/* Q57 */
-	TCO_HM57,	/* HM57 */
-	TCO_PCHMSFF,	/* PCH Mobile SFF Full Featured */
-	TCO_QS57,	/* QS57 */
-	TCO_3400,	/* 3400 */
-	TCO_3420,	/* 3420 */
-	TCO_3450,	/* 3450 */
-	TCO_EP80579,	/* EP80579 */
-	TCO_CPT,	/* Cougar Point */
-	TCO_CPTD,	/* Cougar Point Desktop */
-	TCO_CPTM,	/* Cougar Point Mobile */
-	TCO_PBG,	/* Patsburg */
-	TCO_DH89XXCC,	/* DH89xxCC */
-	TCO_PPT,	/* Panther Point */
-	TCO_LPT,	/* Lynx Point */
-};
-
-static struct {
-	char *name;
-	unsigned int iTCO_version;
-} iTCO_chipset_info[] __devinitdata = {
-	{"ICH", 1},
-	{"ICH0", 1},
-	{"ICH2", 1},
-	{"ICH2-M", 1},
-	{"ICH3-S", 1},
-	{"ICH3-M", 1},
-	{"ICH4", 1},
-	{"ICH4-M", 1},
-	{"C-ICH", 1},
-	{"ICH5 or ICH5R", 1},
-	{"6300ESB", 1},
-	{"ICH6 or ICH6R", 2},
-	{"ICH6-M", 2},
-	{"ICH6W or ICH6RW", 2},
-	{"631xESB/632xESB", 2},
-	{"ICH7 or ICH7R", 2},
-	{"ICH7DH", 2},
-	{"ICH7-M or ICH7-U", 2},
-	{"ICH7-M DH", 2},
-	{"NM10", 2},
-	{"ICH8 or ICH8R", 2},
-	{"ICH8DH", 2},
-	{"ICH8DO", 2},
-	{"ICH8M", 2},
-	{"ICH8M-E", 2},
-	{"ICH9", 2},
-	{"ICH9R", 2},
-	{"ICH9DH", 2},
-	{"ICH9DO", 2},
-	{"ICH9M", 2},
-	{"ICH9M-E", 2},
-	{"ICH10", 2},
-	{"ICH10R", 2},
-	{"ICH10D", 2},
-	{"ICH10DO", 2},
-	{"PCH Desktop Full Featured", 2},
-	{"PCH Mobile Full Featured", 2},
-	{"P55", 2},
-	{"PM55", 2},
-	{"H55", 2},
-	{"QM57", 2},
-	{"H57", 2},
-	{"HM55", 2},
-	{"Q57", 2},
-	{"HM57", 2},
-	{"PCH Mobile SFF Full Featured", 2},
-	{"QS57", 2},
-	{"3400", 2},
-	{"3420", 2},
-	{"3450", 2},
-	{"EP80579", 2},
-	{"Cougar Point", 2},
-	{"Cougar Point Desktop", 2},
-	{"Cougar Point Mobile", 2},
-	{"Patsburg", 2},
-	{"DH89xxCC", 2},
-	{"Panther Point", 2},
-	{"Lynx Point", 2},
-	{NULL, 0}
-};
-
-/*
- * This data only exists for exporting the supported PCI ids
- * via MODULE_DEVICE_TABLE.  We do not actually register a
- * pci_driver, because the I/O Controller Hub has also other
- * functions that probably will be registered by other drivers.
- */
-static DEFINE_PCI_DEVICE_TABLE(iTCO_wdt_pci_tbl) = {
-	{ PCI_VDEVICE(INTEL, 0x2410), TCO_ICH},
-	{ PCI_VDEVICE(INTEL, 0x2420), TCO_ICH0},
-	{ PCI_VDEVICE(INTEL, 0x2440), TCO_ICH2},
-	{ PCI_VDEVICE(INTEL, 0x244c), TCO_ICH2M},
-	{ PCI_VDEVICE(INTEL, 0x2480), TCO_ICH3},
-	{ PCI_VDEVICE(INTEL, 0x248c), TCO_ICH3M},
-	{ PCI_VDEVICE(INTEL, 0x24c0), TCO_ICH4},
-	{ PCI_VDEVICE(INTEL, 0x24cc), TCO_ICH4M},
-	{ PCI_VDEVICE(INTEL, 0x2450), TCO_CICH},
-	{ PCI_VDEVICE(INTEL, 0x24d0), TCO_ICH5},
-	{ PCI_VDEVICE(INTEL, 0x25a1), TCO_6300ESB},
-	{ PCI_VDEVICE(INTEL, 0x2640), TCO_ICH6},
-	{ PCI_VDEVICE(INTEL, 0x2641), TCO_ICH6M},
-	{ PCI_VDEVICE(INTEL, 0x2642), TCO_ICH6W},
-	{ PCI_VDEVICE(INTEL, 0x2670), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2671), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2672), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2673), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2674), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2675), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2676), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2677), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2678), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x2679), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x267a), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x267b), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x267c), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x267d), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x267e), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x267f), TCO_631XESB},
-	{ PCI_VDEVICE(INTEL, 0x27b8), TCO_ICH7},
-	{ PCI_VDEVICE(INTEL, 0x27b0), TCO_ICH7DH},
-	{ PCI_VDEVICE(INTEL, 0x27b9), TCO_ICH7M},
-	{ PCI_VDEVICE(INTEL, 0x27bd), TCO_ICH7MDH},
-	{ PCI_VDEVICE(INTEL, 0x27bc), TCO_NM10},
-	{ PCI_VDEVICE(INTEL, 0x2810), TCO_ICH8},
-	{ PCI_VDEVICE(INTEL, 0x2812), TCO_ICH8DH},
-	{ PCI_VDEVICE(INTEL, 0x2814), TCO_ICH8DO},
-	{ PCI_VDEVICE(INTEL, 0x2815), TCO_ICH8M},
-	{ PCI_VDEVICE(INTEL, 0x2811), TCO_ICH8ME},
-	{ PCI_VDEVICE(INTEL, 0x2918), TCO_ICH9},
-	{ PCI_VDEVICE(INTEL, 0x2916), TCO_ICH9R},
-	{ PCI_VDEVICE(INTEL, 0x2912), TCO_ICH9DH},
-	{ PCI_VDEVICE(INTEL, 0x2914), TCO_ICH9DO},
-	{ PCI_VDEVICE(INTEL, 0x2919), TCO_ICH9M},
-	{ PCI_VDEVICE(INTEL, 0x2917), TCO_ICH9ME},
-	{ PCI_VDEVICE(INTEL, 0x3a18), TCO_ICH10},
-	{ PCI_VDEVICE(INTEL, 0x3a16), TCO_ICH10R},
-	{ PCI_VDEVICE(INTEL, 0x3a1a), TCO_ICH10D},
-	{ PCI_VDEVICE(INTEL, 0x3a14), TCO_ICH10DO},
-	{ PCI_VDEVICE(INTEL, 0x3b00), TCO_PCH},
-	{ PCI_VDEVICE(INTEL, 0x3b01), TCO_PCHM},
-	{ PCI_VDEVICE(INTEL, 0x3b02), TCO_P55},
-	{ PCI_VDEVICE(INTEL, 0x3b03), TCO_PM55},
-	{ PCI_VDEVICE(INTEL, 0x3b06), TCO_H55},
-	{ PCI_VDEVICE(INTEL, 0x3b07), TCO_QM57},
-	{ PCI_VDEVICE(INTEL, 0x3b08), TCO_H57},
-	{ PCI_VDEVICE(INTEL, 0x3b09), TCO_HM55},
-	{ PCI_VDEVICE(INTEL, 0x3b0a), TCO_Q57},
-	{ PCI_VDEVICE(INTEL, 0x3b0b), TCO_HM57},
-	{ PCI_VDEVICE(INTEL, 0x3b0d), TCO_PCHMSFF},
-	{ PCI_VDEVICE(INTEL, 0x3b0f), TCO_QS57},
-	{ PCI_VDEVICE(INTEL, 0x3b12), TCO_3400},
-	{ PCI_VDEVICE(INTEL, 0x3b14), TCO_3420},
-	{ PCI_VDEVICE(INTEL, 0x3b16), TCO_3450},
-	{ PCI_VDEVICE(INTEL, 0x5031), TCO_EP80579},
-	{ PCI_VDEVICE(INTEL, 0x1c41), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c42), TCO_CPTD},
-	{ PCI_VDEVICE(INTEL, 0x1c43), TCO_CPTM},
-	{ PCI_VDEVICE(INTEL, 0x1c44), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c45), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c46), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c47), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c48), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c49), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c4a), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c4b), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c4c), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c4d), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c4e), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c4f), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c50), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c51), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c52), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c53), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c54), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c55), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c56), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c57), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c58), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c59), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c5a), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c5b), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c5c), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c5d), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c5e), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1c5f), TCO_CPT},
-	{ PCI_VDEVICE(INTEL, 0x1d40), TCO_PBG},
-	{ PCI_VDEVICE(INTEL, 0x1d41), TCO_PBG},
-	{ PCI_VDEVICE(INTEL, 0x2310), TCO_DH89XXCC},
-	{ PCI_VDEVICE(INTEL, 0x1e40), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e41), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e42), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e43), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e44), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e45), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e46), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e47), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e48), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e49), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e4a), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e4b), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e4c), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e4d), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e4e), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e4f), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e50), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e51), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e52), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e53), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e54), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e55), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e56), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e57), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e58), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e59), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e5a), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e5b), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e5c), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e5d), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e5e), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x1e5f), TCO_PPT},
-	{ PCI_VDEVICE(INTEL, 0x8c40), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c41), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c42), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c43), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c44), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c45), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c46), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c47), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c48), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c49), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c4a), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c4b), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c4c), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c4d), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c4e), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c4f), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c50), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c51), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c52), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c53), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c54), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c55), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c56), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c57), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c58), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c59), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c5a), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c5b), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c5c), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c5d), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c5e), TCO_LPT},
-	{ PCI_VDEVICE(INTEL, 0x8c5f), TCO_LPT},
-	{ 0, },			/* End of list */
-};
-MODULE_DEVICE_TABLE(pci, iTCO_wdt_pci_tbl);
-
 /* Address definitions for the TCO */
 /* TCO base address */
-#define TCOBASE		(iTCO_wdt_private.ACPIBASE + 0x60)
+#define TCOBASE		(iTCO_wdt_private.tco_res->start)
 /* SMI Control and Enable Register */
-#define SMI_EN		(iTCO_wdt_private.ACPIBASE + 0x30)
+#define SMI_EN		(iTCO_wdt_private.smi_res->start)
 
 #define TCO_RLD		(TCOBASE + 0x00) /* TCO Timer Reload and Curr. Value */
 #define TCOv1_TMR	(TCOBASE + 0x01) /* TCOv1 Timer Initial Value	*/
@@ -393,19 +93,18 @@ static char expect_release;
 static struct {		/* this is private data for the iTCO_wdt device */
 	/* TCO version/generation */
 	unsigned int iTCO_version;
-	/* The device's ACPIBASE address (TCOBASE = ACPIBASE+0x60) */
-	unsigned long ACPIBASE;
+	struct resource *tco_res;
+	struct resource *smi_res;
+	struct resource *gcs_res;
 	/* NO_REBOOT flag is Memory-Mapped GCS register bit 5 (TCO version 2)*/
 	unsigned long __iomem *gcs;
 	/* the lock for io operations */
 	spinlock_t io_lock;
+	struct platform_device *dev;
 	/* the PCI-device */
 	struct pci_dev *pdev;
 } iTCO_wdt_private;
 
-/* the watchdog platform device */
-static struct platform_device *iTCO_wdt_platform_device;
-
 /* module parameters */
 #define WATCHDOG_HEARTBEAT 30	/* 30 sec default heartbeat */
 static int heartbeat = WATCHDOG_HEARTBEAT;  /* in seconds */
@@ -485,7 +184,7 @@ static int iTCO_wdt_start(void)
 
 	spin_lock(&iTCO_wdt_private.io_lock);
 
-	iTCO_vendor_pre_start(iTCO_wdt_private.ACPIBASE, heartbeat);
+	iTCO_vendor_pre_start(iTCO_wdt_private.smi_res, heartbeat);
 
 	/* disable chipset's NO_REBOOT bit */
 	if (iTCO_wdt_unset_NO_REBOOT_bit()) {
@@ -519,7 +218,7 @@ static int iTCO_wdt_stop(void)
 
 	spin_lock(&iTCO_wdt_private.io_lock);
 
-	iTCO_vendor_pre_stop(iTCO_wdt_private.ACPIBASE);
+	iTCO_vendor_pre_stop(iTCO_wdt_private.smi_res);
 
 	/* Bit 11: TCO Timer Halt -> 1 = The TCO timer is disabled */
 	val = inw(TCO1_CNT);
@@ -541,7 +240,7 @@ static int iTCO_wdt_keepalive(void)
 {
 	spin_lock(&iTCO_wdt_private.io_lock);
 
-	iTCO_vendor_pre_keepalive(iTCO_wdt_private.ACPIBASE, heartbeat);
+	iTCO_vendor_pre_keepalive(iTCO_wdt_private.smi_res, heartbeat);
 
 	/* Reload the timer by writing to the TCO Timer Counter register */
 	if (iTCO_wdt_private.iTCO_version == 2)
@@ -786,83 +485,120 @@ static struct miscdevice iTCO_wdt_miscdev = {
  *	Init & exit routines
  */
 
-static int __devinit iTCO_wdt_init(struct pci_dev *pdev,
-		const struct pci_device_id *ent, struct platform_device *dev)
+static void __devexit iTCO_wdt_cleanup(void)
+{
+	/* Stop the timer before we leave */
+	if (!nowayout)
+		iTCO_wdt_stop();
+
+	/* Deregister */
+	misc_deregister(&iTCO_wdt_miscdev);
+
+	/* release resources */
+	release_region(iTCO_wdt_private.tco_res->start,
+			resource_size(iTCO_wdt_private.tco_res));
+	release_region(iTCO_wdt_private.smi_res->start,
+			resource_size(iTCO_wdt_private.smi_res));
+	if (iTCO_wdt_private.iTCO_version == 2) {
+		iounmap(iTCO_wdt_private.gcs);
+		release_mem_region(iTCO_wdt_private.gcs_res->start,
+				resource_size(iTCO_wdt_private.gcs_res));
+	}
+
+	iTCO_wdt_private.tco_res = NULL;
+	iTCO_wdt_private.smi_res = NULL;
+	iTCO_wdt_private.gcs_res = NULL;
+	iTCO_wdt_private.gcs = NULL;
+}
+
+static int __devinit iTCO_wdt_probe(struct platform_device *dev)
 {
-	int ret;
-	u32 base_address;
-	unsigned long RCBA;
+	int ret = -ENODEV;
 	unsigned long val32;
+	struct lpc_ich_info *ich_info = dev->dev.platform_data;
+
+	if (!ich_info)
+		goto out;
+
+	spin_lock_init(&iTCO_wdt_private.io_lock);
+
+	iTCO_wdt_private.tco_res =
+		platform_get_resource(dev, IORESOURCE_IO, ICH_RES_IO_TCO);
+	if (!iTCO_wdt_private.tco_res)
+		goto out;
+
+	iTCO_wdt_private.smi_res =
+		platform_get_resource(dev, IORESOURCE_IO, ICH_RES_IO_SMI);
+	if (!iTCO_wdt_private.smi_res)
+		goto out;
+
+	iTCO_wdt_private.iTCO_version = ich_info->iTCO_version;
+	iTCO_wdt_private.dev = dev;
+	iTCO_wdt_private.pdev = to_pci_dev(dev->dev.parent);
 
 	/*
-	 *      Find the ACPI/PM base I/O address which is the base
-	 *      for the TCO registers (TCOBASE=ACPIBASE + 0x60)
-	 *      ACPIBASE is bits [15:7] from 0x40-0x43
+	 * Get the Memory-Mapped GCS register, we need it for the
+	 * NO_REBOOT flag (TCO v2).
 	 */
-	pci_read_config_dword(pdev, 0x40, &base_address);
-	base_address &= 0x0000ff80;
-	if (base_address == 0x00000000) {
-		/* Something's wrong here, ACPIBASE has to be set */
-		pr_err("failed to get TCOBASE address, device disabled by hardware/BIOS\n");
-		return -ENODEV;
-	}
-	iTCO_wdt_private.iTCO_version =
-			iTCO_chipset_info[ent->driver_data].iTCO_version;
-	iTCO_wdt_private.ACPIBASE = base_address;
-	iTCO_wdt_private.pdev = pdev;
-
-	/* Get the Memory-Mapped GCS register, we need it for the
-	   NO_REBOOT flag (TCO v2). To get access to it you have to
-	   read RCBA from PCI Config space 0xf0 and use it as base.
-	   GCS = RCBA + ICH6_GCS(0x3410). */
 	if (iTCO_wdt_private.iTCO_version == 2) {
-		pci_read_config_dword(pdev, 0xf0, &base_address);
-		if ((base_address & 1) == 0) {
-			pr_err("RCBA is disabled by hardware/BIOS, device disabled\n");
-			ret = -ENODEV;
+		iTCO_wdt_private.gcs_res = platform_get_resource(dev,
+							IORESOURCE_MEM,
+							ICH_RES_MEM_GCS);
+
+		if (!iTCO_wdt_private.gcs_res)
+			goto out;
+
+		if (!request_mem_region(iTCO_wdt_private.gcs_res->start,
+			resource_size(iTCO_wdt_private.gcs_res), dev->name)) {
+			ret = -EBUSY;
 			goto out;
 		}
-		RCBA = base_address & 0xffffc000;
-		iTCO_wdt_private.gcs = ioremap((RCBA + 0x3410), 4);
+		iTCO_wdt_private.gcs = ioremap(iTCO_wdt_private.gcs_res->start,
+			resource_size(iTCO_wdt_private.gcs_res));
+		if (!iTCO_wdt_private.gcs) {
+			ret = -EIO;
+			goto unreg_gcs;
+		}
 	}
 
 	/* Check chipset's NO_REBOOT bit */
 	if (iTCO_wdt_unset_NO_REBOOT_bit() && iTCO_vendor_check_noreboot_on()) {
 		pr_info("unable to reset NO_REBOOT flag, device disabled by hardware/BIOS\n");
 		ret = -ENODEV;	/* Cannot reset NO_REBOOT bit */
-		goto out_unmap;
+		goto unmap_gcs;
 	}
 
 	/* Set the NO_REBOOT bit to prevent later reboots, just for sure */
 	iTCO_wdt_set_NO_REBOOT_bit();
 
 	/* The TCO logic uses the TCO_EN bit in the SMI_EN register */
-	if (!request_region(SMI_EN, 4, "iTCO_wdt")) {
-		pr_err("I/O address 0x%04lx already in use, device disabled\n",
+	if (!request_region(iTCO_wdt_private.smi_res->start,
+			resource_size(iTCO_wdt_private.smi_res), dev->name)) {
+		pr_err("I/O address 0x%04llx already in use, device disabled\n",
 		       SMI_EN);
-		ret = -EIO;
-		goto out_unmap;
+		ret = -EBUSY;
+		goto unmap_gcs;
 	}
 	if (turn_SMI_watchdog_clear_off >= iTCO_wdt_private.iTCO_version) {
-		/* Bit 13: TCO_EN -> 0 = Disables TCO logic generating an SMI# */
+		/*
+		 * Bit 13: TCO_EN -> 0
+		 * Disables TCO logic generating an SMI#
+		 */
 		val32 = inl(SMI_EN);
 		val32 &= 0xffffdfff;	/* Turn off SMI clearing watchdog */
 		outl(val32, SMI_EN);
 	}
 
-	/* The TCO I/O registers reside in a 32-byte range pointed to
-	   by the TCOBASE value */
-	if (!request_region(TCOBASE, 0x20, "iTCO_wdt")) {
-		pr_err("I/O address 0x%04lx already in use, device disabled\n",
+	if (!request_region(iTCO_wdt_private.tco_res->start,
+			resource_size(iTCO_wdt_private.tco_res), dev->name)) {
+		pr_err("I/O address 0x%04llx already in use, device disabled\n",
 		       TCOBASE);
-		ret = -EIO;
-		goto unreg_smi_en;
+		ret = -EBUSY;
+		goto unreg_smi;
 	}
 
-	pr_info("Found a %s TCO device (Version=%d, TCOBASE=0x%04lx)\n",
-		iTCO_chipset_info[ent->driver_data].name,
-		iTCO_chipset_info[ent->driver_data].iTCO_version,
-		TCOBASE);
+	pr_info("Found a %s TCO device (Version=%d, TCOBASE=0x%04llx)\n",
+		ich_info->name, ich_info->iTCO_version, TCOBASE);
 
 	/* Clear out the (probably old) status */
 	outw(0x0008, TCO1_STS);	/* Clear the Time Out Status bit */
@@ -883,7 +619,7 @@ static int __devinit iTCO_wdt_init(struct pci_dev *pdev,
 	if (ret != 0) {
 		pr_err("cannot register miscdev on minor=%d (err=%d)\n",
 		       WATCHDOG_MINOR, ret);
-		goto unreg_region;
+		goto unreg_tco;
 	}
 
 	pr_info("initialized. heartbeat=%d sec (nowayout=%d)\n",
@@ -891,62 +627,31 @@ static int __devinit iTCO_wdt_init(struct pci_dev *pdev,
 
 	return 0;
 
-unreg_region:
-	release_region(TCOBASE, 0x20);
-unreg_smi_en:
-	release_region(SMI_EN, 4);
-out_unmap:
+unreg_tco:
+	release_region(iTCO_wdt_private.tco_res->start,
+			resource_size(iTCO_wdt_private.tco_res));
+unreg_smi:
+	release_region(iTCO_wdt_private.smi_res->start,
+			resource_size(iTCO_wdt_private.smi_res));
+unmap_gcs:
 	if (iTCO_wdt_private.iTCO_version == 2)
 		iounmap(iTCO_wdt_private.gcs);
-out:
-	iTCO_wdt_private.ACPIBASE = 0;
-	return ret;
-}
-
-static void __devexit iTCO_wdt_cleanup(void)
-{
-	/* Stop the timer before we leave */
-	if (!nowayout)
-		iTCO_wdt_stop();
-
-	/* Deregister */
-	misc_deregister(&iTCO_wdt_miscdev);
-	release_region(TCOBASE, 0x20);
-	release_region(SMI_EN, 4);
+unreg_gcs:
 	if (iTCO_wdt_private.iTCO_version == 2)
-		iounmap(iTCO_wdt_private.gcs);
-	pci_dev_put(iTCO_wdt_private.pdev);
-	iTCO_wdt_private.ACPIBASE = 0;
-}
-
-static int __devinit iTCO_wdt_probe(struct platform_device *dev)
-{
-	int ret = -ENODEV;
-	int found = 0;
-	struct pci_dev *pdev = NULL;
-	const struct pci_device_id *ent;
-
-	spin_lock_init(&iTCO_wdt_private.io_lock);
-
-	for_each_pci_dev(pdev) {
-		ent = pci_match_id(iTCO_wdt_pci_tbl, pdev);
-		if (ent) {
-			found++;
-			ret = iTCO_wdt_init(pdev, ent, dev);
-			if (!ret)
-				break;
-		}
-	}
-
-	if (!found)
-		pr_info("No device detected\n");
+		release_mem_region(iTCO_wdt_private.gcs_res->start,
+				resource_size(iTCO_wdt_private.gcs_res));
+out:
+	iTCO_wdt_private.tco_res = NULL;
+	iTCO_wdt_private.smi_res = NULL;
+	iTCO_wdt_private.gcs_res = NULL;
+	iTCO_wdt_private.gcs = NULL;
 
 	return ret;
 }
 
 static int __devexit iTCO_wdt_remove(struct platform_device *dev)
 {
-	if (iTCO_wdt_private.ACPIBASE)
+	if (iTCO_wdt_private.tco_res || iTCO_wdt_private.smi_res)
 		iTCO_wdt_cleanup();
 
 	return 0;
@@ -977,23 +682,11 @@ static int __init iTCO_wdt_init_module(void)
 	if (err)
 		return err;
 
-	iTCO_wdt_platform_device = platform_device_register_simple(DRV_NAME,
-								-1, NULL, 0);
-	if (IS_ERR(iTCO_wdt_platform_device)) {
-		err = PTR_ERR(iTCO_wdt_platform_device);
-		goto unreg_platform_driver;
-	}
-
 	return 0;
-
-unreg_platform_driver:
-	platform_driver_unregister(&iTCO_wdt_driver);
-	return err;
 }
 
 static void __exit iTCO_wdt_cleanup_module(void)
 {
-	platform_device_unregister(iTCO_wdt_platform_device);
 	platform_driver_unregister(&iTCO_wdt_driver);
 	pr_info("Watchdog Module Unloaded\n");
 }
diff --git a/drivers/watchdog/lantiq_wdt.c b/drivers/watchdog/lantiq_wdt.c
index a9593a3a32a0..2e74c3a8ee58 100644
--- a/drivers/watchdog/lantiq_wdt.c
+++ b/drivers/watchdog/lantiq_wdt.c
@@ -13,14 +13,15 @@
 #include <linux/fs.h>
 #include <linux/miscdevice.h>
 #include <linux/watchdog.h>
-#include <linux/platform_device.h>
+#include <linux/of_platform.h>
 #include <linux/uaccess.h>
 #include <linux/clk.h>
 #include <linux/io.h>
 
-#include <lantiq.h>
+#include <lantiq_soc.h>
 
-/* Section 3.4 of the datasheet
+/*
+ * Section 3.4 of the datasheet
  * The password sequence protects the WDT control register from unintended
  * write actions, which might cause malfunction of the WDT.
  *
@@ -70,7 +71,8 @@ ltq_wdt_disable(void)
 {
 	/* write the first password magic */
 	ltq_w32(LTQ_WDT_PW1, ltq_wdt_membase + LTQ_WDT_CR);
-	/* write the second password magic with no config
+	/*
+	 * write the second password magic with no config
 	 * this turns the watchdog off
 	 */
 	ltq_w32(LTQ_WDT_PW2, ltq_wdt_membase + LTQ_WDT_CR);
@@ -184,7 +186,7 @@ static struct miscdevice ltq_wdt_miscdev = {
 	.fops	= &ltq_wdt_fops,
 };
 
-static int __init
+static int __devinit
 ltq_wdt_probe(struct platform_device *pdev)
 {
 	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
@@ -194,28 +196,27 @@ ltq_wdt_probe(struct platform_device *pdev)
 		dev_err(&pdev->dev, "cannot obtain I/O memory region");
 		return -ENOENT;
 	}
-	res = devm_request_mem_region(&pdev->dev, res->start,
-		resource_size(res), dev_name(&pdev->dev));
-	if (!res) {
-		dev_err(&pdev->dev, "cannot request I/O memory region");
-		return -EBUSY;
-	}
-	ltq_wdt_membase = devm_ioremap_nocache(&pdev->dev, res->start,
-		resource_size(res));
+
+	ltq_wdt_membase = devm_request_and_ioremap(&pdev->dev, res);
 	if (!ltq_wdt_membase) {
 		dev_err(&pdev->dev, "cannot remap I/O memory region\n");
 		return -ENOMEM;
 	}
 
 	/* we do not need to enable the clock as it is always running */
-	clk = clk_get(&pdev->dev, "io");
-	WARN_ON(!clk);
+	clk = clk_get_io();
+	if (IS_ERR(clk)) {
+		dev_err(&pdev->dev, "Failed to get clock\n");
+		return -ENOENT;
+	}
 	ltq_io_region_clk_rate = clk_get_rate(clk);
 	clk_put(clk);
 
+	/* find out if the watchdog caused the last reboot */
 	if (ltq_reset_cause() == LTQ_RST_CAUSE_WDTRST)
 		ltq_wdt_bootstatus = WDIOF_CARDRESET;
 
+	dev_info(&pdev->dev, "Init done\n");
 	return misc_register(&ltq_wdt_miscdev);
 }
 
@@ -227,33 +228,26 @@ ltq_wdt_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct of_device_id ltq_wdt_match[] = {
+	{ .compatible = "lantiq,wdt" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, ltq_wdt_match);
 
 static struct platform_driver ltq_wdt_driver = {
+	.probe = ltq_wdt_probe,
 	.remove = __devexit_p(ltq_wdt_remove),
 	.driver = {
-		.name = "ltq_wdt",
+		.name = "wdt",
 		.owner = THIS_MODULE,
+		.of_match_table = ltq_wdt_match,
 	},
 };
 
-static int __init
-init_ltq_wdt(void)
-{
-	return platform_driver_probe(&ltq_wdt_driver, ltq_wdt_probe);
-}
-
-static void __exit
-exit_ltq_wdt(void)
-{
-	return platform_driver_unregister(&ltq_wdt_driver);
-}
-
-module_init(init_ltq_wdt);
-module_exit(exit_ltq_wdt);
+module_platform_driver(ltq_wdt_driver);
 
 module_param(nowayout, bool, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started");
-
 MODULE_AUTHOR("John Crispin <[email protected]>");
 MODULE_DESCRIPTION("Lantiq SoC Watchdog");
 MODULE_LICENSE("GPL");
diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c
index 0b48579a9cd6..7ff2569e17ae 100644
--- a/drivers/xen/xen-acpi-processor.c
+++ b/drivers/xen/xen-acpi-processor.c
@@ -29,6 +29,7 @@
 #include <acpi/acpi_drivers.h>
 #include <acpi/processor.h>
 
+#include <xen/xen.h>
 #include <xen/interface/platform.h>
 #include <asm/xen/hypercall.h>
 
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 37268c5bb98b..1b35d6bd06b0 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -292,7 +292,6 @@ static const struct inode_operations bad_inode_ops =
 	.getxattr	= bad_inode_getxattr,
 	.listxattr	= bad_inode_listxattr,
 	.removexattr	= bad_inode_removexattr,
-	/* truncate_range returns void */
 };
 
 
diff --git a/fs/bio.c b/fs/bio.c
index 84da88539046..73922abba832 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -19,12 +19,14 @@
 #include <linux/swap.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/iocontext.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
+#include <linux/cgroup.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #include <trace/events/block.h>
@@ -418,6 +420,7 @@ void bio_put(struct bio *bio)
 	 * last put frees it
 	 */
 	if (atomic_dec_and_test(&bio->bi_cnt)) {
+		bio_disassociate_task(bio);
 		bio->bi_next = NULL;
 		bio->bi_destructor(bio);
 	}
@@ -1646,6 +1649,64 @@ bad:
 }
 EXPORT_SYMBOL(bioset_create);
 
+#ifdef CONFIG_BLK_CGROUP
+/**
+ * bio_associate_current - associate a bio with %current
+ * @bio: target bio
+ *
+ * Associate @bio with %current if it hasn't been associated yet.  Block
+ * layer will treat @bio as if it were issued by %current no matter which
+ * task actually issues it.
+ *
+ * This function takes an extra reference of @task's io_context and blkcg
+ * which will be put when @bio is released.  The caller must own @bio,
+ * ensure %current->io_context exists, and is responsible for synchronizing
+ * calls to this function.
+ */
+int bio_associate_current(struct bio *bio)
+{
+	struct io_context *ioc;
+	struct cgroup_subsys_state *css;
+
+	if (bio->bi_ioc)
+		return -EBUSY;
+
+	ioc = current->io_context;
+	if (!ioc)
+		return -ENOENT;
+
+	/* acquire active ref on @ioc and associate */
+	get_io_context_active(ioc);
+	bio->bi_ioc = ioc;
+
+	/* associate blkcg if exists */
+	rcu_read_lock();
+	css = task_subsys_state(current, blkio_subsys_id);
+	if (css && css_tryget(css))
+		bio->bi_css = css;
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/**
+ * bio_disassociate_task - undo bio_associate_current()
+ * @bio: target bio
+ */
+void bio_disassociate_task(struct bio *bio)
+{
+	if (bio->bi_ioc) {
+		put_io_context(bio->bi_ioc);
+		bio->bi_ioc = NULL;
+	}
+	if (bio->bi_css) {
+		css_put(bio->bi_css);
+		bio->bi_css = NULL;
+	}
+}
+
+#endif /* CONFIG_BLK_CGROUP */
+
 static void __init biovec_init_slabs(void)
 {
 	int i;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2b243af70aa3..a08306a8bec9 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -158,3 +158,23 @@ config CIFS_NFSD_EXPORT
 	  depends on CIFS && EXPERIMENTAL && BROKEN
 	  help
 	   Allows NFS server to export a CIFS mounted share (nfsd over cifs)
+
+config CIFS_SMB2
+	bool "SMB2 network file system support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && INET && BROKEN
+	select NLS
+	select KEYS
+	select FSCACHE
+	select DNS_RESOLVER
+
+	help
+	  This enables experimental support for the SMB2 (Server Message Block
+	  version 2) protocol. The SMB2 protocol is the successor to the
+	  popular CIFS and SMB network file sharing protocols. SMB2 is the
+	  native file sharing mechanism for recent versions of Windows
+	  operating systems (since Vista).  SMB2 enablement will eventually
+	  allow users better performance, security and features, than would be
+	  possible with cifs. Note that smb2 mount options also are simpler
+	  (compared to cifs) due to protocol improvements.
+
+	  Unless you are a developer or tester, say N.
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 005d524c3a4a..4b4127544349 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
 	  cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
-	  readdir.o ioctl.o sess.o export.o
+	  readdir.o ioctl.o sess.o export.o smb1ops.o
 
 cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
 
@@ -15,3 +15,5 @@ cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
 cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
 
 cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
+
+cifs-$(CONFIG_CIFS_SMB2) += smb2ops.o
diff --git a/fs/cifs/README b/fs/cifs/README
index b7d782bab797..22ab7b5b8da7 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -608,11 +608,6 @@ Stats			Lists summary resource usage information as well as per
 			in the kernel configuration.
 
 Configuration pseudo-files:
-MultiuserMount		If set to one, more than one CIFS session to 
-			the same server ip address can be established
-			if more than one uid accesses the same mount
-			point and if the uids user/password mapping
-			information is available. (default is 0)
 PacketSigningEnabled	If set to one, cifs packet signing is enabled
 			and will be used if the server requires 
 			it.  If set to two, cifs packet signing is
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 270464629416..e8140528ca5c 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -57,19 +57,21 @@ cifs_dump_mem(char *label, void *data, int length)
 	}
 }
 
-#ifdef CONFIG_CIFS_DEBUG2
 void cifs_dump_detail(void *buf)
 {
+#ifdef CONFIG_CIFS_DEBUG2
 	struct smb_hdr *smb = (struct smb_hdr *)buf;
 
 	cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
 		  smb->Command, smb->Status.CifsError,
 		  smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
 	cERROR(1, "smb buf %p len %d", smb, smbCalcSize(smb));
+#endif /* CONFIG_CIFS_DEBUG2 */
 }
 
 void cifs_dump_mids(struct TCP_Server_Info *server)
 {
+#ifdef CONFIG_CIFS_DEBUG2
 	struct list_head *tmp;
 	struct mid_q_entry *mid_entry;
 
@@ -102,8 +104,8 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 		}
 	}
 	spin_unlock(&GlobalMid_Lock);
-}
 #endif /* CONFIG_CIFS_DEBUG2 */
+}
 
 #ifdef CONFIG_PROC_FS
 static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
@@ -420,7 +422,6 @@ static struct proc_dir_entry *proc_fs_cifs;
 static const struct file_operations cifsFYI_proc_fops;
 static const struct file_operations cifs_lookup_cache_proc_fops;
 static const struct file_operations traceSMB_proc_fops;
-static const struct file_operations cifs_multiuser_mount_proc_fops;
 static const struct file_operations cifs_security_flags_proc_fops;
 static const struct file_operations cifs_linux_ext_proc_fops;
 
@@ -440,8 +441,6 @@ cifs_proc_init(void)
 	proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
 	proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
 		    &cifs_linux_ext_proc_fops);
-	proc_create("MultiuserMount", 0, proc_fs_cifs,
-		    &cifs_multiuser_mount_proc_fops);
 	proc_create("SecurityFlags", 0, proc_fs_cifs,
 		    &cifs_security_flags_proc_fops);
 	proc_create("LookupCacheEnabled", 0, proc_fs_cifs,
@@ -460,7 +459,6 @@ cifs_proc_clean(void)
 #ifdef CONFIG_CIFS_STATS
 	remove_proc_entry("Stats", proc_fs_cifs);
 #endif
-	remove_proc_entry("MultiuserMount", proc_fs_cifs);
 	remove_proc_entry("SecurityFlags", proc_fs_cifs);
 	remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
 	remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
@@ -617,52 +615,6 @@ static const struct file_operations traceSMB_proc_fops = {
 	.write		= traceSMB_proc_write,
 };
 
-static int cifs_multiuser_mount_proc_show(struct seq_file *m, void *v)
-{
-	seq_printf(m, "%d\n", multiuser_mount);
-	return 0;
-}
-
-static int cifs_multiuser_mount_proc_open(struct inode *inode, struct file *fh)
-{
-	return single_open(fh, cifs_multiuser_mount_proc_show, NULL);
-}
-
-static ssize_t cifs_multiuser_mount_proc_write(struct file *file,
-		const char __user *buffer, size_t count, loff_t *ppos)
-{
-	char c;
-	int rc;
-	static bool warned;
-
-	rc = get_user(c, buffer);
-	if (rc)
-		return rc;
-	if (c == '0' || c == 'n' || c == 'N')
-		multiuser_mount = 0;
-	else if (c == '1' || c == 'y' || c == 'Y') {
-		multiuser_mount = 1;
-		if (!warned) {
-			warned = true;
-			printk(KERN_WARNING "CIFS VFS: The legacy multiuser "
-				"mount code is scheduled to be deprecated in "
-				"3.5. Please switch to using the multiuser "
-				"mount option.");
-		}
-	}
-
-	return count;
-}
-
-static const struct file_operations cifs_multiuser_mount_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= cifs_multiuser_mount_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-	.write		= cifs_multiuser_mount_proc_write,
-};
-
 static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
 {
 	seq_printf(m, "0x%x\n", global_secflags);
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 566e0ae8dc2c..c0c68bb492d7 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -24,10 +24,10 @@
 #define _H_CIFS_DEBUG
 
 void cifs_dump_mem(char *label, void *data, int length);
-#ifdef CONFIG_CIFS_DEBUG2
-#define DBG2 2
 void cifs_dump_detail(void *);
 void cifs_dump_mids(struct TCP_Server_Info *);
+#ifdef CONFIG_CIFS_DEBUG2
+#define DBG2 2
 #else
 #define DBG2 0
 #endif
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0a0fa0250e99..8b6e344eb0ba 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -56,7 +56,6 @@ int traceSMB = 0;
 bool enable_oplocks = true;
 unsigned int linuxExtEnabled = 1;
 unsigned int lookupCacheEnabled = 1;
-unsigned int multiuser_mount = 0;
 unsigned int global_secflags = CIFSSEC_DEF;
 /* unsigned int ntlmv2_support = 0; */
 unsigned int sign_CIFS_PDUs = 1;
@@ -125,7 +124,7 @@ cifs_read_super(struct super_block *sb)
 		goto out_no_root;
 	}
 
-	/* do that *after* d_alloc_root() - we want NULL ->d_op for root here */
+	/* do that *after* d_make_root() - we want NULL ->d_op for root here */
 	if (cifs_sb_master_tcon(cifs_sb)->nocase)
 		sb->s_d_op = &cifs_ci_dentry_ops;
 	else
@@ -329,6 +328,19 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
 		seq_printf(s, "i");
 }
 
+static void
+cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
+{
+	seq_printf(s, ",cache=");
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+		seq_printf(s, "strict");
+	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
+		seq_printf(s, "none");
+	else
+		seq_printf(s, "loose");
+}
+
 /*
  * cifs_show_options() is for displaying mount options in /proc/mounts.
  * Not all settable options are displayed but most of the important
@@ -342,7 +354,9 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 	struct sockaddr *srcaddr;
 	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
+	seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
 	cifs_show_security(s, tcon->ses->server);
+	cifs_show_cache_flavor(s, cifs_sb);
 
 	seq_printf(s, ",unc=%s", tcon->treeName);
 
@@ -408,8 +422,6 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 		seq_printf(s, ",rwpidforward");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL)
 		seq_printf(s, ",forcemand");
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
-		seq_printf(s, ",directio");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 		seq_printf(s, ",nouser_xattr");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
@@ -432,8 +444,6 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 		seq_printf(s, ",nostrictsync");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
 		seq_printf(s, ",noperm");
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
-		seq_printf(s, ",strictcache");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
 		seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid);
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID)
@@ -945,7 +955,6 @@ cifs_init_once(void *inode)
 	struct cifsInodeInfo *cifsi = inode;
 
 	inode_init_once(&cifsi->vfs_inode);
-	INIT_LIST_HEAD(&cifsi->llist);
 	mutex_init(&cifsi->lock_mutex);
 }
 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4ff6313f0a91..20350a93ed99 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -43,6 +43,7 @@
 
 #define CIFS_MIN_RCV_POOL 4
 
+#define MAX_REOPEN_ATT	5 /* these many maximum attempts to reopen a file */
 /*
  * default attribute cache timeout (jiffies)
  */
@@ -150,6 +151,57 @@ struct cifs_cred {
  *****************************************************************
  */
 
+enum smb_version {
+	Smb_1 = 1,
+	Smb_21,
+};
+
+struct mid_q_entry;
+struct TCP_Server_Info;
+struct cifsFileInfo;
+struct cifs_ses;
+
+struct smb_version_operations {
+	int (*send_cancel)(struct TCP_Server_Info *, void *,
+			   struct mid_q_entry *);
+	bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *);
+	/* setup request: allocate mid, sign message */
+	int (*setup_request)(struct cifs_ses *, struct kvec *, unsigned int,
+			     struct mid_q_entry **);
+	/* check response: verify signature, map error */
+	int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *,
+			     bool);
+	void (*add_credits)(struct TCP_Server_Info *, const unsigned int);
+	void (*set_credits)(struct TCP_Server_Info *, const int);
+	int * (*get_credits_field)(struct TCP_Server_Info *);
+	/* data offset from read response message */
+	unsigned int (*read_data_offset)(char *);
+	/* data length from read response message */
+	unsigned int (*read_data_length)(char *);
+	/* map smb to linux error */
+	int (*map_error)(char *, bool);
+	/* find mid corresponding to the response message */
+	struct mid_q_entry * (*find_mid)(struct TCP_Server_Info *, char *);
+	void (*dump_detail)(void *);
+	/* verify the message */
+	int (*check_message)(char *, unsigned int);
+	bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
+};
+
+struct smb_version_values {
+	char		*version_string;
+	__u32		large_lock_type;
+	__u32		exclusive_lock_type;
+	__u32		shared_lock_type;
+	__u32		unlock_lock_type;
+	size_t		header_size;
+	size_t		max_header_size;
+	size_t		read_rsp_size;
+};
+
+#define HEADER_SIZE(server) (server->vals->header_size)
+#define MAX_HEADER_SIZE(server) (server->vals->max_header_size)
+
 struct smb_vol {
 	char *username;
 	char *password;
@@ -205,6 +257,8 @@ struct smb_vol {
 	bool sockopt_tcp_nodelay:1;
 	unsigned short int port;
 	unsigned long actimeo; /* attribute cache timeout (jiffies) */
+	struct smb_version_operations *ops;
+	struct smb_version_values *vals;
 	char *prepath;
 	struct sockaddr_storage srcaddr; /* allow binding to a local IP */
 	struct nls_table *local_nls;
@@ -242,6 +296,8 @@ struct TCP_Server_Info {
 	int srv_count; /* reference counter */
 	/* 15 character server name + 0x20 16th byte indicating type = srv */
 	char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
+	struct smb_version_operations	*ops;
+	struct smb_version_values	*vals;
 	enum statusEnum tcpStatus; /* what we think the status is */
 	char *hostname; /* hostname portion of UNC string */
 	struct socket *ssocket;
@@ -321,16 +377,6 @@ in_flight(struct TCP_Server_Info *server)
 	return num;
 }
 
-static inline int*
-get_credits_field(struct TCP_Server_Info *server)
-{
-	/*
-	 * This will change to switch statement when we reserve slots for echos
-	 * and oplock breaks.
-	 */
-	return &server->credits;
-}
-
 static inline bool
 has_credits(struct TCP_Server_Info *server, int *credits)
 {
@@ -341,16 +387,16 @@ has_credits(struct TCP_Server_Info *server, int *credits)
 	return num > 0;
 }
 
-static inline size_t
-header_size(void)
+static inline void
+add_credits(struct TCP_Server_Info *server, const unsigned int add)
 {
-	return sizeof(struct smb_hdr);
+	server->ops->add_credits(server, add);
 }
 
-static inline size_t
-max_header_size(void)
+static inline void
+set_credits(struct TCP_Server_Info *server, const int val)
 {
-	return MAX_CIFS_HDR_SIZE;
+	server->ops->set_credits(server, val);
 }
 
 /*
@@ -547,8 +593,7 @@ struct cifsLockInfo {
 	__u64 offset;
 	__u64 length;
 	__u32 pid;
-	__u8 type;
-	__u16 netfid;
+	__u32 type;
 };
 
 /*
@@ -573,6 +618,10 @@ struct cifs_search_info {
 struct cifsFileInfo {
 	struct list_head tlist;	/* pointer to next fid owned by tcon */
 	struct list_head flist;	/* next fid (file instance) for this inode */
+	struct list_head llist;	/*
+				 * brlocks held by this fid, protected by
+				 * lock_mutex from cifsInodeInfo structure
+				 */
 	unsigned int uid;	/* allows finding which FileInfo structure */
 	__u32 pid;		/* process id who opened file */
 	__u16 netfid;		/* file id from remote */
@@ -615,9 +664,12 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
  */
 
 struct cifsInodeInfo {
-	struct list_head llist;		/* brlocks for this inode */
 	bool can_cache_brlcks;
-	struct mutex lock_mutex;	/* protect two fields above */
+	struct mutex lock_mutex;	/*
+					 * protect the field above and llist
+					 * from every cifsFileInfo structure
+					 * from openFileList
+					 */
 	/* BB add in lists for dirty pages i.e. write caching info for oplock */
 	struct list_head openFileList;
 	__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
@@ -703,7 +755,6 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon,
 
 #endif
 
-struct mid_q_entry;
 
 /*
  * This is the prototype for the mid receive function. This function is for
@@ -1042,12 +1093,7 @@ GLOBAL_EXTERN atomic_t smBufAllocCount;
 GLOBAL_EXTERN atomic_t midCount;
 
 /* Misc globals */
-GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
-				to be established on existing mount if we
-				have the uid/password or Kerberos credential
-				or equivalent for current user */
-/* enable or disable oplocks */
-GLOBAL_EXTERN bool enable_oplocks;
+GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
 GLOBAL_EXTERN unsigned int lookupCacheEnabled;
 GLOBAL_EXTERN unsigned int global_secflags;	/* if on, session setup sent
 				with more secure ntlmssp2 challenge/resp */
@@ -1074,4 +1120,11 @@ void cifs_oplock_break(struct work_struct *work);
 extern const struct slow_work_ops cifs_oplock_break_ops;
 extern struct workqueue_struct *cifsiod_wq;
 
+/* Operations for different SMB versions */
+#define SMB1_VERSION_STRING	"1.0"
+extern struct smb_version_operations smb1_operations;
+extern struct smb_version_values smb1_values;
+#define SMB21_VERSION_STRING	"2.1"
+extern struct smb_version_operations smb21_operations;
+extern struct smb_version_values smb21_values;
 #endif	/* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 96192c1e380a..5ec21ecf7980 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -78,6 +78,8 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
 			int * /* bytes returned */ , const int long_op);
 extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
 			    char *in_buf, int flags);
+extern int cifs_setup_request(struct cifs_ses *, struct kvec *, unsigned int,
+			      struct mid_q_entry **);
 extern int cifs_check_receive(struct mid_q_entry *mid,
 			struct TCP_Server_Info *server, bool log_error);
 extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
@@ -88,9 +90,6 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
 			struct smb_hdr *in_buf ,
 			struct smb_hdr *out_buf,
 			int *bytes_returned);
-extern void cifs_add_credits(struct TCP_Server_Info *server,
-			     const unsigned int add);
-extern void cifs_set_credits(struct TCP_Server_Info *server, const int val);
 extern int checkSMB(char *buf, unsigned int length);
 extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *);
 extern bool backup_cred(struct cifs_sb_info *);
@@ -192,11 +191,13 @@ extern int CIFSTCon(unsigned int xid, struct cifs_ses *ses,
 
 extern int CIFSFindFirst(const int xid, struct cifs_tcon *tcon,
 		const char *searchName, const struct nls_table *nls_codepage,
-		__u16 *searchHandle, struct cifs_search_info *psrch_inf,
+		__u16 *searchHandle, __u16 search_flags,
+		struct cifs_search_info *psrch_inf,
 		int map, const char dirsep);
 
 extern int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
-		__u16 searchHandle, struct cifs_search_info *psrch_inf);
+		__u16 searchHandle, __u16 search_flags,
+		struct cifs_search_info *psrch_inf);
 
 extern int CIFSFindClose(const int, struct cifs_tcon *tcon,
 			const __u16 search_handle);
@@ -464,6 +465,9 @@ extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
 
 /* asynchronous read support */
 struct cifs_readdata {
+	struct kref			refcount;
+	struct list_head		list;
+	struct completion		done;
 	struct cifsFileInfo		*cfile;
 	struct address_space		*mapping;
 	__u64				offset;
@@ -472,12 +476,13 @@ struct cifs_readdata {
 	int				result;
 	struct list_head		pages;
 	struct work_struct		work;
+	int (*marshal_iov) (struct cifs_readdata *rdata,
+			    unsigned int remaining);
 	unsigned int			nr_iov;
 	struct kvec			iov[1];
 };
 
-struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages);
-void cifs_readdata_free(struct cifs_readdata *rdata);
+void cifs_readdata_release(struct kref *refcount);
 int cifs_async_readv(struct cifs_readdata *rdata);
 
 /* asynchronous write support */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index da2f5446fa7a..b5ad716b2642 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -87,7 +87,6 @@ static struct {
 #endif /* CIFS_POSIX */
 
 /* Forward declarations */
-static void cifs_readv_complete(struct work_struct *work);
 
 /* Mark as invalid, all open files on tree connections since they
    were closed when session to server was lost */
@@ -461,7 +460,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
 		server->maxReq = min_t(unsigned int,
 				       le16_to_cpu(rsp->MaxMpxCount),
 				       cifs_max_pending);
-		cifs_set_credits(server, server->maxReq);
+		set_credits(server, server->maxReq);
 		server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
 		server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
 		/* even though we do not use raw we might as well set this
@@ -569,7 +568,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
 	   little endian */
 	server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount),
 			       cifs_max_pending);
-	cifs_set_credits(server, server->maxReq);
+	set_credits(server, server->maxReq);
 	/* probably no need to store and check maxvcs */
 	server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
 	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
@@ -721,7 +720,7 @@ cifs_echo_callback(struct mid_q_entry *mid)
 	struct TCP_Server_Info *server = mid->callback_data;
 
 	DeleteMidQEntry(mid);
-	cifs_add_credits(server, 1);
+	add_credits(server, 1);
 }
 
 int
@@ -1385,28 +1384,6 @@ openRetry:
 	return rc;
 }
 
-struct cifs_readdata *
-cifs_readdata_alloc(unsigned int nr_pages)
-{
-	struct cifs_readdata *rdata;
-
-	/* readdata + 1 kvec for each page */
-	rdata = kzalloc(sizeof(*rdata) +
-			sizeof(struct kvec) * nr_pages, GFP_KERNEL);
-	if (rdata != NULL) {
-		INIT_WORK(&rdata->work, cifs_readv_complete);
-		INIT_LIST_HEAD(&rdata->pages);
-	}
-	return rdata;
-}
-
-void
-cifs_readdata_free(struct cifs_readdata *rdata)
-{
-	cifsFileInfo_put(rdata->cfile);
-	kfree(rdata);
-}
-
 /*
  * Discard any remaining data in the current SMB. To do this, we borrow the
  * current bigbuf.
@@ -1423,7 +1400,7 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 
 		length = cifs_read_from_socket(server, server->bigbuf,
 				min_t(unsigned int, remaining,
-					CIFSMaxBufSize + max_header_size()));
+				    CIFSMaxBufSize + MAX_HEADER_SIZE(server)));
 		if (length < 0)
 			return length;
 		server->total_read += length;
@@ -1434,38 +1411,14 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	return 0;
 }
 
-static inline size_t
-read_rsp_size(void)
-{
-	return sizeof(READ_RSP);
-}
-
-static inline unsigned int
-read_data_offset(char *buf)
-{
-	READ_RSP *rsp = (READ_RSP *)buf;
-	return le16_to_cpu(rsp->DataOffset);
-}
-
-static inline unsigned int
-read_data_length(char *buf)
-{
-	READ_RSP *rsp = (READ_RSP *)buf;
-	return (le16_to_cpu(rsp->DataLengthHigh) << 16) +
-	       le16_to_cpu(rsp->DataLength);
-}
-
 static int
 cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 {
 	int length, len;
-	unsigned int data_offset, remaining, data_len;
+	unsigned int data_offset, data_len;
 	struct cifs_readdata *rdata = mid->callback_data;
 	char *buf = server->smallbuf;
 	unsigned int buflen = get_rfc1002_length(buf) + 4;
-	u64 eof;
-	pgoff_t eof_index;
-	struct page *page, *tpage;
 
 	cFYI(1, "%s: mid=%llu offset=%llu bytes=%u", __func__,
 		mid->mid, rdata->offset, rdata->bytes);
@@ -1475,9 +1428,10 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	 * can if there's not enough data. At this point, we've read down to
 	 * the Mid.
 	 */
-	len = min_t(unsigned int, buflen, read_rsp_size()) - header_size() + 1;
+	len = min_t(unsigned int, buflen, server->vals->read_rsp_size) -
+							HEADER_SIZE(server) + 1;
 
-	rdata->iov[0].iov_base = buf + header_size() - 1;
+	rdata->iov[0].iov_base = buf + HEADER_SIZE(server) - 1;
 	rdata->iov[0].iov_len = len;
 
 	length = cifs_readv_from_socket(server, rdata->iov, 1, len);
@@ -1486,7 +1440,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	server->total_read += length;
 
 	/* Was the SMB read successful? */
-	rdata->result = map_smb_to_linux_error(buf, false);
+	rdata->result = server->ops->map_error(buf, false);
 	if (rdata->result != 0) {
 		cFYI(1, "%s: server returned error %d", __func__,
 			rdata->result);
@@ -1494,14 +1448,15 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	}
 
 	/* Is there enough to get to the rest of the READ_RSP header? */
-	if (server->total_read < read_rsp_size()) {
+	if (server->total_read < server->vals->read_rsp_size) {
 		cFYI(1, "%s: server returned short header. got=%u expected=%zu",
-			__func__, server->total_read, read_rsp_size());
+			__func__, server->total_read,
+			server->vals->read_rsp_size);
 		rdata->result = -EIO;
 		return cifs_readv_discard(server, mid);
 	}
 
-	data_offset = read_data_offset(buf) + 4;
+	data_offset = server->ops->read_data_offset(buf) + 4;
 	if (data_offset < server->total_read) {
 		/*
 		 * win2k8 sometimes sends an offset of 0 when the read
@@ -1540,7 +1495,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 		rdata->iov[0].iov_base, rdata->iov[0].iov_len);
 
 	/* how much data is in the response? */
-	data_len = read_data_length(buf);
+	data_len = server->ops->read_data_length(buf);
 	if (data_offset + data_len > buflen) {
 		/* data_len is corrupt -- discard frame */
 		rdata->result = -EIO;
@@ -1548,64 +1503,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	}
 
 	/* marshal up the page array */
-	len = 0;
-	remaining = data_len;
-	rdata->nr_iov = 1;
-
-	/* determine the eof that the server (probably) has */
-	eof = CIFS_I(rdata->mapping->host)->server_eof;
-	eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
-	cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index);
-
-	list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
-		if (remaining >= PAGE_CACHE_SIZE) {
-			/* enough data to fill the page */
-			rdata->iov[rdata->nr_iov].iov_base = kmap(page);
-			rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE;
-			cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
-				rdata->nr_iov, page->index,
-				rdata->iov[rdata->nr_iov].iov_base,
-				rdata->iov[rdata->nr_iov].iov_len);
-			++rdata->nr_iov;
-			len += PAGE_CACHE_SIZE;
-			remaining -= PAGE_CACHE_SIZE;
-		} else if (remaining > 0) {
-			/* enough for partial page, fill and zero the rest */
-			rdata->iov[rdata->nr_iov].iov_base = kmap(page);
-			rdata->iov[rdata->nr_iov].iov_len = remaining;
-			cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
-				rdata->nr_iov, page->index,
-				rdata->iov[rdata->nr_iov].iov_base,
-				rdata->iov[rdata->nr_iov].iov_len);
-			memset(rdata->iov[rdata->nr_iov].iov_base + remaining,
-				'\0', PAGE_CACHE_SIZE - remaining);
-			++rdata->nr_iov;
-			len += remaining;
-			remaining = 0;
-		} else if (page->index > eof_index) {
-			/*
-			 * The VFS will not try to do readahead past the
-			 * i_size, but it's possible that we have outstanding
-			 * writes with gaps in the middle and the i_size hasn't
-			 * caught up yet. Populate those with zeroed out pages
-			 * to prevent the VFS from repeatedly attempting to
-			 * fill them until the writes are flushed.
-			 */
-			zero_user(page, 0, PAGE_CACHE_SIZE);
-			list_del(&page->lru);
-			lru_cache_add_file(page);
-			flush_dcache_page(page);
-			SetPageUptodate(page);
-			unlock_page(page);
-			page_cache_release(page);
-		} else {
-			/* no need to hold page hostage */
-			list_del(&page->lru);
-			lru_cache_add_file(page);
-			unlock_page(page);
-			page_cache_release(page);
-		}
-	}
+	len = rdata->marshal_iov(rdata, data_len);
+	data_len -= len;
 
 	/* issue the read if we have any iovecs left to fill */
 	if (rdata->nr_iov > 1) {
@@ -1621,7 +1520,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	rdata->bytes = length;
 
 	cFYI(1, "total_read=%u buflen=%u remaining=%u", server->total_read,
-		buflen, remaining);
+		buflen, data_len);
 
 	/* discard anything left over */
 	if (server->total_read < buflen)
@@ -1632,33 +1531,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 }
 
 static void
-cifs_readv_complete(struct work_struct *work)
-{
-	struct cifs_readdata *rdata = container_of(work,
-						struct cifs_readdata, work);
-	struct page *page, *tpage;
-
-	list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
-		list_del(&page->lru);
-		lru_cache_add_file(page);
-
-		if (rdata->result == 0) {
-			kunmap(page);
-			flush_dcache_page(page);
-			SetPageUptodate(page);
-		}
-
-		unlock_page(page);
-
-		if (rdata->result == 0)
-			cifs_readpage_to_fscache(rdata->mapping->host, page);
-
-		page_cache_release(page);
-	}
-	cifs_readdata_free(rdata);
-}
-
-static void
 cifs_readv_callback(struct mid_q_entry *mid)
 {
 	struct cifs_readdata *rdata = mid->callback_data;
@@ -1691,7 +1563,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
 
 	queue_work(cifsiod_wq, &rdata->work);
 	DeleteMidQEntry(mid);
-	cifs_add_credits(server, 1);
+	add_credits(server, 1);
 }
 
 /* cifs_async_readv - send an async write, and set up mid to handle result */
@@ -1744,12 +1616,15 @@ cifs_async_readv(struct cifs_readdata *rdata)
 	rdata->iov[0].iov_base = smb;
 	rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
 
+	kref_get(&rdata->refcount);
 	rc = cifs_call_async(tcon->ses->server, rdata->iov, 1,
 			     cifs_readv_receive, cifs_readv_callback,
 			     rdata, false);
 
 	if (rc == 0)
 		cifs_stats_inc(&tcon->num_reads);
+	else
+		kref_put(&rdata->refcount, cifs_readdata_release);
 
 	cifs_small_buf_release(smb);
 	return rc;
@@ -2135,7 +2010,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
 
 	queue_work(cifsiod_wq, &wdata->work);
 	DeleteMidQEntry(mid);
-	cifs_add_credits(tcon->ses->server, 1);
+	add_credits(tcon->ses->server, 1);
 }
 
 /* cifs_async_writev - send an async write, and set up mid to handle result */
@@ -4344,7 +4219,7 @@ int
 CIFSFindFirst(const int xid, struct cifs_tcon *tcon,
 	      const char *searchName,
 	      const struct nls_table *nls_codepage,
-	      __u16 *pnetfid,
+	      __u16 *pnetfid, __u16 search_flags,
 	      struct cifs_search_info *psrch_inf, int remap, const char dirsep)
 {
 /* level 257 SMB_ */
@@ -4416,8 +4291,7 @@ findFirstRetry:
 	    cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM |
 			ATTR_DIRECTORY);
 	pSMB->SearchCount = cpu_to_le16(CIFSMaxBufSize/sizeof(FILE_UNIX_INFO));
-	pSMB->SearchFlags = cpu_to_le16(CIFS_SEARCH_CLOSE_AT_END |
-		CIFS_SEARCH_RETURN_RESUME);
+	pSMB->SearchFlags = cpu_to_le16(search_flags);
 	pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level);
 
 	/* BB what should we set StorageType to? Does it matter? BB */
@@ -4487,8 +4361,8 @@ findFirstRetry:
 	return rc;
 }
 
-int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
-		 __u16 searchHandle, struct cifs_search_info *psrch_inf)
+int CIFSFindNext(const int xid, struct cifs_tcon *tcon, __u16 searchHandle,
+		 __u16 search_flags, struct cifs_search_info *psrch_inf)
 {
 	TRANSACTION2_FNEXT_REQ *pSMB = NULL;
 	TRANSACTION2_FNEXT_RSP *pSMBr = NULL;
@@ -4531,8 +4405,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
 		cpu_to_le16(CIFSMaxBufSize / sizeof(FILE_UNIX_INFO));
 	pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level);
 	pSMB->ResumeKey = psrch_inf->resume_key;
-	pSMB->SearchFlags =
-	      cpu_to_le16(CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME);
+	pSMB->SearchFlags = cpu_to_le16(search_flags);
 
 	name_len = psrch_inf->resume_name_len;
 	params += name_len;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e0b56d7a19c5..ccafdedd0dbc 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/connect.c
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2009
+ *   Copyright (C) International Business Machines  Corp., 2002,2011
  *   Author(s): Steve French ([email protected])
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -102,7 +102,7 @@ enum {
 	Opt_srcaddr, Opt_prefixpath,
 	Opt_iocharset, Opt_sockopt,
 	Opt_netbiosname, Opt_servern,
-	Opt_ver, Opt_sec,
+	Opt_ver, Opt_vers, Opt_sec, Opt_cache,
 
 	/* Mount options to be ignored */
 	Opt_ignore,
@@ -210,9 +210,9 @@ static const match_table_t cifs_mount_option_tokens = {
 	{ Opt_netbiosname, "netbiosname=%s" },
 	{ Opt_servern, "servern=%s" },
 	{ Opt_ver, "ver=%s" },
-	{ Opt_ver, "vers=%s" },
-	{ Opt_ver, "version=%s" },
+	{ Opt_vers, "vers=%s" },
 	{ Opt_sec, "sec=%s" },
+	{ Opt_cache, "cache=%s" },
 
 	{ Opt_ignore, "cred" },
 	{ Opt_ignore, "credentials" },
@@ -261,6 +261,26 @@ static const match_table_t cifs_secflavor_tokens = {
 	{ Opt_sec_err, NULL }
 };
 
+/* cache flavors */
+enum {
+	Opt_cache_loose,
+	Opt_cache_strict,
+	Opt_cache_none,
+	Opt_cache_err
+};
+
+static const match_table_t cifs_cacheflavor_tokens = {
+	{ Opt_cache_loose, "loose" },
+	{ Opt_cache_strict, "strict" },
+	{ Opt_cache_none, "none" },
+	{ Opt_cache_err, NULL }
+};
+
+static const match_table_t cifs_smb_version_tokens = {
+	{ Smb_1, SMB1_VERSION_STRING },
+	{ Smb_21, SMB21_VERSION_STRING },
+};
+
 static int ip_connect(struct TCP_Server_Info *server);
 static int generic_ip_connect(struct TCP_Server_Info *server);
 static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
@@ -549,7 +569,7 @@ allocate_buffers(struct TCP_Server_Info *server)
 		}
 	} else if (server->large_buf) {
 		/* we are reusing a dirty large buf, clear its start */
-		memset(server->bigbuf, 0, header_size());
+		memset(server->bigbuf, 0, HEADER_SIZE(server));
 	}
 
 	if (!server->smallbuf) {
@@ -563,7 +583,7 @@ allocate_buffers(struct TCP_Server_Info *server)
 		/* beginning of smb buffer is cleared in our buf_get */
 	} else {
 		/* if existing small buf clear beginning */
-		memset(server->smallbuf, 0, header_size());
+		memset(server->smallbuf, 0, HEADER_SIZE(server));
 	}
 
 	return true;
@@ -764,25 +784,6 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
 	return false;
 }
 
-static struct mid_q_entry *
-find_mid(struct TCP_Server_Info *server, char *buffer)
-{
-	struct smb_hdr *buf = (struct smb_hdr *)buffer;
-	struct mid_q_entry *mid;
-
-	spin_lock(&GlobalMid_Lock);
-	list_for_each_entry(mid, &server->pending_mid_q, qhead) {
-		if (mid->mid == buf->Mid &&
-		    mid->mid_state == MID_REQUEST_SUBMITTED &&
-		    le16_to_cpu(mid->command) == buf->Command) {
-			spin_unlock(&GlobalMid_Lock);
-			return mid;
-		}
-	}
-	spin_unlock(&GlobalMid_Lock);
-	return NULL;
-}
-
 void
 dequeue_mid(struct mid_q_entry *mid, bool malformed)
 {
@@ -934,7 +935,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	unsigned int pdu_length = get_rfc1002_length(buf);
 
 	/* make sure this will fit in a large buffer */
-	if (pdu_length > CIFSMaxBufSize + max_header_size() - 4) {
+	if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - 4) {
 		cERROR(1, "SMB response too long (%u bytes)",
 			pdu_length);
 		cifs_reconnect(server);
@@ -950,8 +951,8 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	}
 
 	/* now read the rest */
-	length = cifs_read_from_socket(server, buf + header_size() - 1,
-				       pdu_length - header_size() + 1 + 4);
+	length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
+				pdu_length - HEADER_SIZE(server) + 1 + 4);
 	if (length < 0)
 		return length;
 	server->total_read += length;
@@ -967,7 +968,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	 * 48 bytes is enough to display the header and a little bit
 	 * into the payload for debugging purposes.
 	 */
-	length = checkSMB(buf, server->total_read);
+	length = server->ops->check_message(buf, server->total_read);
 	if (length != 0)
 		cifs_dump_mem("Bad SMB: ", buf,
 			min_t(unsigned int, server->total_read, 48));
@@ -1025,7 +1026,7 @@ cifs_demultiplex_thread(void *p)
 			continue;
 
 		/* make sure we have enough to get to the MID */
-		if (pdu_length < header_size() - 1 - 4) {
+		if (pdu_length < HEADER_SIZE(server) - 1 - 4) {
 			cERROR(1, "SMB response too short (%u bytes)",
 				pdu_length);
 			cifs_reconnect(server);
@@ -1035,12 +1036,12 @@ cifs_demultiplex_thread(void *p)
 
 		/* read down to the MID */
 		length = cifs_read_from_socket(server, buf + 4,
-					       header_size() - 1 - 4);
+					       HEADER_SIZE(server) - 1 - 4);
 		if (length < 0)
 			continue;
 		server->total_read += length;
 
-		mid_entry = find_mid(server, buf);
+		mid_entry = server->ops->find_mid(server, buf);
 
 		if (!mid_entry || !mid_entry->receive)
 			length = standard_receive3(server, mid_entry);
@@ -1057,12 +1058,13 @@ cifs_demultiplex_thread(void *p)
 		if (mid_entry != NULL) {
 			if (!mid_entry->multiRsp || mid_entry->multiEnd)
 				mid_entry->callback(mid_entry);
-		} else if (!is_valid_oplock_break(buf, server)) {
+		} else if (!server->ops->is_oplock_break(buf, server)) {
 			cERROR(1, "No task to wake, unknown frame received! "
 				   "NumMids %d", atomic_read(&midCount));
-			cifs_dump_mem("Received Data is: ", buf, header_size());
+			cifs_dump_mem("Received Data is: ", buf,
+				      HEADER_SIZE(server));
 #ifdef CONFIG_CIFS_DEBUG2
-			cifs_dump_detail(buf);
+			server->ops->dump_detail(buf);
 			cifs_dump_mids(server);
 #endif /* CIFS_DEBUG2 */
 
@@ -1186,6 +1188,54 @@ static int cifs_parse_security_flavors(char *value,
 }
 
 static int
+cifs_parse_cache_flavor(char *value, struct smb_vol *vol)
+{
+	substring_t args[MAX_OPT_ARGS];
+
+	switch (match_token(value, cifs_cacheflavor_tokens, args)) {
+	case Opt_cache_loose:
+		vol->direct_io = false;
+		vol->strict_io = false;
+		break;
+	case Opt_cache_strict:
+		vol->direct_io = false;
+		vol->strict_io = true;
+		break;
+	case Opt_cache_none:
+		vol->direct_io = true;
+		vol->strict_io = false;
+		break;
+	default:
+		cERROR(1, "bad cache= option: %s", value);
+		return 1;
+	}
+	return 0;
+}
+
+static int
+cifs_parse_smb_version(char *value, struct smb_vol *vol)
+{
+	substring_t args[MAX_OPT_ARGS];
+
+	switch (match_token(value, cifs_smb_version_tokens, args)) {
+	case Smb_1:
+		vol->ops = &smb1_operations;
+		vol->vals = &smb1_values;
+		break;
+#ifdef CONFIG_CIFS_SMB2
+	case Smb_21:
+		vol->ops = &smb21_operations;
+		vol->vals = &smb21_values;
+		break;
+#endif
+	default:
+		cERROR(1, "Unknown vers= option specified: %s", value);
+		return 1;
+	}
+	return 0;
+}
+
+static int
 cifs_parse_mount_options(const char *mountdata, const char *devname,
 			 struct smb_vol *vol)
 {
@@ -1203,6 +1253,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 	char *string = NULL;
 	char *tmp_end, *value;
 	char delim;
+	bool cache_specified = false;
+	static bool cache_warned = false;
 
 	separator[0] = ',';
 	separator[1] = 0;
@@ -1236,6 +1288,10 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 
 	vol->actimeo = CIFS_DEF_ACTIMEO;
 
+	/* FIXME: add autonegotiation -- for now, SMB1 is default */
+	vol->ops = &smb1_operations;
+	vol->vals = &smb1_values;
+
 	if (!mountdata)
 		goto cifs_parse_mount_err;
 
@@ -1414,10 +1470,20 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			vol->seal = 1;
 			break;
 		case Opt_direct:
-			vol->direct_io = 1;
+			cache_specified = true;
+			vol->direct_io = true;
+			vol->strict_io = false;
+			cERROR(1, "The \"directio\" option will be removed in "
+				  "3.7. Please switch to the \"cache=none\" "
+				  "option.");
 			break;
 		case Opt_strictcache:
-			vol->strict_io = 1;
+			cache_specified = true;
+			vol->direct_io = false;
+			vol->strict_io = true;
+			cERROR(1, "The \"strictcache\" option will be removed "
+				"in 3.7. Please switch to the \"cache=strict\" "
+				"option.");
 			break;
 		case Opt_noac:
 			printk(KERN_WARNING "CIFS: Mount option noac not "
@@ -1821,8 +1887,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (strnicmp(string, "cifs", 4) == 0 ||
-			    strnicmp(string, "1", 1) == 0) {
+			if (strnicmp(string, "1", 1) == 0) {
 				/* This is the default */
 				break;
 			}
@@ -1830,6 +1895,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			printk(KERN_WARNING "CIFS: Invalid version"
 					    " specified\n");
 			goto cifs_parse_mount_err;
+		case Opt_vers:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			if (cifs_parse_smb_version(string, vol) != 0)
+				goto cifs_parse_mount_err;
+			break;
 		case Opt_sec:
 			string = match_strdup(args);
 			if (string == NULL)
@@ -1838,6 +1911,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (cifs_parse_security_flavors(string, vol) != 0)
 				goto cifs_parse_mount_err;
 			break;
+		case Opt_cache:
+			cache_specified = true;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+
+			if (cifs_parse_cache_flavor(string, vol) != 0)
+				goto cifs_parse_mount_err;
+			break;
 		default:
 			/*
 			 * An option we don't recognize. Save it off for later
@@ -1881,6 +1963,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 		printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
 				   "specified with no gid= option.\n");
 
+	/* FIXME: remove this block in 3.7 */
+	if (!cache_specified && !cache_warned) {
+		cache_warned = true;
+		printk(KERN_NOTICE "CIFS: no cache= option specified, using "
+				   "\"cache=loose\". This default will change "
+				   "to \"cache=strict\" in 3.7.\n");
+	}
+
 	kfree(mountdata_copy);
 	return 0;
 
@@ -2041,6 +2131,9 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
 static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr,
 			 struct smb_vol *vol)
 {
+	if ((server->vals != vol->vals) || (server->ops != vol->ops))
+		return 0;
+
 	if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
 		return 0;
 
@@ -2163,6 +2256,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		goto out_err;
 	}
 
+	tcp_ses->ops = volume_info->ops;
+	tcp_ses->vals = volume_info->vals;
 	cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
 	tcp_ses->hostname = extract_hostname(volume_info->UNC);
 	if (IS_ERR(tcp_ses->hostname)) {
@@ -3569,6 +3664,7 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
 	if (cifs_parse_mount_options(mount_data, devname, volume_info))
 		return -EINVAL;
 
+
 	if (volume_info->nullauth) {
 		cFYI(1, "Anonymous login");
 		kfree(volume_info->username);
@@ -4010,11 +4106,11 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses)
 	if (server->maxBuf != 0)
 		return 0;
 
-	cifs_set_credits(server, 1);
+	set_credits(server, 1);
 	rc = CIFSSMBNegotiate(xid, ses);
 	if (rc == -EAGAIN) {
 		/* retry only once on 1st time connection */
-		cifs_set_credits(server, 1);
+		set_credits(server, 1);
 		rc = CIFSSMBNegotiate(xid, ses);
 		if (rc == -EAGAIN)
 			rc = -EHOSTDOWN;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 81725e9286e9..253170dfa716 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -264,6 +264,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
 	pCifsFile->tlink = cifs_get_tlink(tlink);
 	mutex_init(&pCifsFile->fh_mutex);
 	INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
+	INIT_LIST_HEAD(&pCifsFile->llist);
 
 	spin_lock(&cifs_file_list_lock);
 	list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
@@ -334,9 +335,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	 * is closed anyway.
 	 */
 	mutex_lock(&cifsi->lock_mutex);
-	list_for_each_entry_safe(li, tmp, &cifsi->llist, llist) {
-		if (li->netfid != cifs_file->netfid)
-			continue;
+	list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
 		list_del(&li->llist);
 		cifs_del_lock_waiters(li);
 		kfree(li);
@@ -645,7 +644,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 }
 
 static struct cifsLockInfo *
-cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid)
+cifs_lock_init(__u64 offset, __u64 length, __u8 type)
 {
 	struct cifsLockInfo *lock =
 		kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
@@ -654,7 +653,6 @@ cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid)
 	lock->offset = offset;
 	lock->length = length;
 	lock->type = type;
-	lock->netfid = netfid;
 	lock->pid = current->tgid;
 	INIT_LIST_HEAD(&lock->blist);
 	init_waitqueue_head(&lock->block_q);
@@ -672,19 +670,20 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
 }
 
 static bool
-__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
-			__u64 length, __u8 type, __u16 netfid,
-			struct cifsLockInfo **conf_lock)
+cifs_find_fid_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
+			    __u64 length, __u8 type, struct cifsFileInfo *cur,
+			    struct cifsLockInfo **conf_lock)
 {
-	struct cifsLockInfo *li, *tmp;
+	struct cifsLockInfo *li;
+	struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
 
-	list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
+	list_for_each_entry(li, &cfile->llist, llist) {
 		if (offset + length <= li->offset ||
 		    offset >= li->offset + li->length)
 			continue;
-		else if ((type & LOCKING_ANDX_SHARED_LOCK) &&
-			 ((netfid == li->netfid && current->tgid == li->pid) ||
-			  type == li->type))
+		else if ((type & server->vals->shared_lock_type) &&
+			 ((server->ops->compare_fids(cur, cfile) &&
+			   current->tgid == li->pid) || type == li->type))
 			continue;
 		else {
 			*conf_lock = li;
@@ -695,11 +694,23 @@ __cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
 }
 
 static bool
-cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
-			struct cifsLockInfo **conf_lock)
+cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
+			__u8 type, struct cifsLockInfo **conf_lock)
 {
-	return __cifs_find_lock_conflict(cinode, lock->offset, lock->length,
-					 lock->type, lock->netfid, conf_lock);
+	bool rc = false;
+	struct cifsFileInfo *fid, *tmp;
+	struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+
+	spin_lock(&cifs_file_list_lock);
+	list_for_each_entry_safe(fid, tmp, &cinode->openFileList, flist) {
+		rc = cifs_find_fid_lock_conflict(fid, offset, length, type,
+						 cfile, conf_lock);
+		if (rc)
+			break;
+	}
+	spin_unlock(&cifs_file_list_lock);
+
+	return rc;
 }
 
 /*
@@ -710,22 +721,24 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
  * the server or 1 otherwise.
  */
 static int
-cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
-	       __u8 type, __u16 netfid, struct file_lock *flock)
+cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
+	       __u8 type, struct file_lock *flock)
 {
 	int rc = 0;
 	struct cifsLockInfo *conf_lock;
+	struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+	struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
 	bool exist;
 
 	mutex_lock(&cinode->lock_mutex);
 
-	exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid,
-					  &conf_lock);
+	exist = cifs_find_lock_conflict(cfile, offset, length, type,
+					&conf_lock);
 	if (exist) {
 		flock->fl_start = conf_lock->offset;
 		flock->fl_end = conf_lock->offset + conf_lock->length - 1;
 		flock->fl_pid = conf_lock->pid;
-		if (conf_lock->type & LOCKING_ANDX_SHARED_LOCK)
+		if (conf_lock->type & server->vals->shared_lock_type)
 			flock->fl_type = F_RDLCK;
 		else
 			flock->fl_type = F_WRLCK;
@@ -739,10 +752,11 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
 }
 
 static void
-cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock)
+cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock)
 {
+	struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
 	mutex_lock(&cinode->lock_mutex);
-	list_add_tail(&lock->llist, &cinode->llist);
+	list_add_tail(&lock->llist, &cfile->llist);
 	mutex_unlock(&cinode->lock_mutex);
 }
 
@@ -753,10 +767,11 @@ cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock)
  * 3) -EACCESS, if there is a lock that prevents us and wait is false.
  */
 static int
-cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
+cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock,
 		 bool wait)
 {
 	struct cifsLockInfo *conf_lock;
+	struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
 	bool exist;
 	int rc = 0;
 
@@ -764,9 +779,10 @@ try_again:
 	exist = false;
 	mutex_lock(&cinode->lock_mutex);
 
-	exist = cifs_find_lock_conflict(cinode, lock, &conf_lock);
+	exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
+					lock->type, &conf_lock);
 	if (!exist && cinode->can_cache_brlcks) {
-		list_add_tail(&lock->llist, &cinode->llist);
+		list_add_tail(&lock->llist, &cfile->llist);
 		mutex_unlock(&cinode->lock_mutex);
 		return rc;
 	}
@@ -888,7 +904,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
 	for (i = 0; i < 2; i++) {
 		cur = buf;
 		num = 0;
-		list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
+		list_for_each_entry_safe(li, tmp, &cfile->llist, llist) {
 			if (li->type != types[i])
 				continue;
 			cur->Pid = cpu_to_le16(li->pid);
@@ -898,7 +914,8 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
 			cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
 			if (++num == max_num) {
 				stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
-						       li->type, 0, num, buf);
+						       (__u8)li->type, 0, num,
+						       buf);
 				if (stored_rc)
 					rc = stored_rc;
 				cur = buf;
@@ -909,7 +926,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
 
 		if (num) {
 			stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
-					       types[i], 0, num, buf);
+					       (__u8)types[i], 0, num, buf);
 			if (stored_rc)
 				rc = stored_rc;
 		}
@@ -1053,8 +1070,8 @@ cifs_push_locks(struct cifsFileInfo *cfile)
 }
 
 static void
-cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock,
-		bool *wait_flag)
+cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
+		bool *wait_flag, struct TCP_Server_Info *server)
 {
 	if (flock->fl_flags & FL_POSIX)
 		cFYI(1, "Posix");
@@ -1073,38 +1090,50 @@ cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock,
 	    (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
 		cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags);
 
-	*type = LOCKING_ANDX_LARGE_FILES;
+	*type = server->vals->large_lock_type;
 	if (flock->fl_type == F_WRLCK) {
 		cFYI(1, "F_WRLCK ");
+		*type |= server->vals->exclusive_lock_type;
 		*lock = 1;
 	} else if (flock->fl_type == F_UNLCK) {
 		cFYI(1, "F_UNLCK");
+		*type |= server->vals->unlock_lock_type;
 		*unlock = 1;
 		/* Check if unlock includes more than one lock range */
 	} else if (flock->fl_type == F_RDLCK) {
 		cFYI(1, "F_RDLCK");
-		*type |= LOCKING_ANDX_SHARED_LOCK;
+		*type |= server->vals->shared_lock_type;
 		*lock = 1;
 	} else if (flock->fl_type == F_EXLCK) {
 		cFYI(1, "F_EXLCK");
+		*type |= server->vals->exclusive_lock_type;
 		*lock = 1;
 	} else if (flock->fl_type == F_SHLCK) {
 		cFYI(1, "F_SHLCK");
-		*type |= LOCKING_ANDX_SHARED_LOCK;
+		*type |= server->vals->shared_lock_type;
 		*lock = 1;
 	} else
 		cFYI(1, "Unknown type of lock");
 }
 
 static int
-cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
+cifs_mandatory_lock(int xid, struct cifsFileInfo *cfile, __u64 offset,
+		    __u64 length, __u32 type, int lock, int unlock, bool wait)
+{
+	return CIFSSMBLock(xid, tlink_tcon(cfile->tlink), cfile->netfid,
+			   current->tgid, length, offset, unlock, lock,
+			   (__u8)type, wait, 0);
+}
+
+static int
+cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
 	   bool wait_flag, bool posix_lck, int xid)
 {
 	int rc = 0;
 	__u64 length = 1 + flock->fl_end - flock->fl_start;
 	struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
-	struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+	struct TCP_Server_Info *server = tcon->ses->server;
 	__u16 netfid = cfile->netfid;
 
 	if (posix_lck) {
@@ -1114,7 +1143,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
 		if (!rc)
 			return rc;
 
-		if (type & LOCKING_ANDX_SHARED_LOCK)
+		if (type & server->vals->shared_lock_type)
 			posix_lock_type = CIFS_RDLCK;
 		else
 			posix_lock_type = CIFS_WRLCK;
@@ -1124,38 +1153,35 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
 		return rc;
 	}
 
-	rc = cifs_lock_test(cinode, flock->fl_start, length, type, netfid,
-			    flock);
+	rc = cifs_lock_test(cfile, flock->fl_start, length, type, flock);
 	if (!rc)
 		return rc;
 
 	/* BB we could chain these into one lock request BB */
-	rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
-			 flock->fl_start, 0, 1, type, 0, 0);
+	rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length, type,
+				 1, 0, false);
 	if (rc == 0) {
-		rc = CIFSSMBLock(xid, tcon, netfid, current->tgid,
-				 length, flock->fl_start, 1, 0,
-				 type, 0, 0);
+		rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length,
+					 type, 0, 1, false);
 		flock->fl_type = F_UNLCK;
 		if (rc != 0)
 			cERROR(1, "Error unlocking previously locked "
-				   "range %d during test of lock", rc);
+				  "range %d during test of lock", rc);
 		return 0;
 	}
 
-	if (type & LOCKING_ANDX_SHARED_LOCK) {
+	if (type & server->vals->shared_lock_type) {
 		flock->fl_type = F_WRLCK;
 		return 0;
 	}
 
-	rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
-			 flock->fl_start, 0, 1,
-			 type | LOCKING_ANDX_SHARED_LOCK, 0, 0);
+	rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length,
+				 type | server->vals->shared_lock_type, 1, 0,
+				 false);
 	if (rc == 0) {
-		rc = CIFSSMBLock(xid, tcon, netfid, current->tgid,
-				 length, flock->fl_start, 1, 0,
-				 type | LOCKING_ANDX_SHARED_LOCK,
-				 0, 0);
+		rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length,
+					 type | server->vals->shared_lock_type,
+					 0, 1, false);
 		flock->fl_type = F_RDLCK;
 		if (rc != 0)
 			cERROR(1, "Error unlocking previously locked "
@@ -1212,15 +1238,13 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
 	for (i = 0; i < 2; i++) {
 		cur = buf;
 		num = 0;
-		list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
+		list_for_each_entry_safe(li, tmp, &cfile->llist, llist) {
 			if (flock->fl_start > li->offset ||
 			    (flock->fl_start + length) <
 			    (li->offset + li->length))
 				continue;
 			if (current->tgid != li->pid)
 				continue;
-			if (cfile->netfid != li->netfid)
-				continue;
 			if (types[i] != li->type)
 				continue;
 			if (!cinode->can_cache_brlcks) {
@@ -1233,7 +1257,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
 					cpu_to_le32((u32)(li->offset>>32));
 				/*
 				 * We need to save a lock here to let us add
-				 * it again to the inode list if the unlock
+				 * it again to the file's list if the unlock
 				 * range request fails on the server.
 				 */
 				list_move(&li->llist, &tmp_llist);
@@ -1247,10 +1271,10 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
 						 * We failed on the unlock range
 						 * request - add all locks from
 						 * the tmp list to the head of
-						 * the inode list.
+						 * the file's list.
 						 */
 						cifs_move_llist(&tmp_llist,
-								&cinode->llist);
+								&cfile->llist);
 						rc = stored_rc;
 					} else
 						/*
@@ -1265,7 +1289,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
 			} else {
 				/*
 				 * We can cache brlock requests - simply remove
-				 * a lock from the inode list.
+				 * a lock from the file's list.
 				 */
 				list_del(&li->llist);
 				cifs_del_lock_waiters(li);
@@ -1276,7 +1300,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
 			stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
 					       types[i], num, 0, buf);
 			if (stored_rc) {
-				cifs_move_llist(&tmp_llist, &cinode->llist);
+				cifs_move_llist(&tmp_llist, &cfile->llist);
 				rc = stored_rc;
 			} else
 				cifs_free_llist(&tmp_llist);
@@ -1289,14 +1313,14 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
 }
 
 static int
-cifs_setlk(struct file *file,  struct file_lock *flock, __u8 type,
+cifs_setlk(struct file *file,  struct file_lock *flock, __u32 type,
 	   bool wait_flag, bool posix_lck, int lock, int unlock, int xid)
 {
 	int rc = 0;
 	__u64 length = 1 + flock->fl_end - flock->fl_start;
 	struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
-	struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
+	struct TCP_Server_Info *server = tcon->ses->server;
 	__u16 netfid = cfile->netfid;
 
 	if (posix_lck) {
@@ -1306,7 +1330,7 @@ cifs_setlk(struct file *file,  struct file_lock *flock, __u8 type,
 		if (!rc || rc < 0)
 			return rc;
 
-		if (type & LOCKING_ANDX_SHARED_LOCK)
+		if (type & server->vals->shared_lock_type)
 			posix_lock_type = CIFS_RDLCK;
 		else
 			posix_lock_type = CIFS_WRLCK;
@@ -1323,24 +1347,24 @@ cifs_setlk(struct file *file,  struct file_lock *flock, __u8 type,
 	if (lock) {
 		struct cifsLockInfo *lock;
 
-		lock = cifs_lock_init(flock->fl_start, length, type, netfid);
+		lock = cifs_lock_init(flock->fl_start, length, type);
 		if (!lock)
 			return -ENOMEM;
 
-		rc = cifs_lock_add_if(cinode, lock, wait_flag);
+		rc = cifs_lock_add_if(cfile, lock, wait_flag);
 		if (rc < 0)
 			kfree(lock);
 		if (rc <= 0)
 			goto out;
 
-		rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
-				 flock->fl_start, 0, 1, type, wait_flag, 0);
+		rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length,
+					 type, 1, 0, wait_flag);
 		if (rc) {
 			kfree(lock);
 			goto out;
 		}
 
-		cifs_lock_add(cinode, lock);
+		cifs_lock_add(cfile, lock);
 	} else if (unlock)
 		rc = cifs_unlock_range(cfile, flock, xid);
 
@@ -1361,7 +1385,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
 	struct cifsInodeInfo *cinode;
 	struct cifsFileInfo *cfile;
 	__u16 netfid;
-	__u8 type;
+	__u32 type;
 
 	rc = -EACCES;
 	xid = GetXid();
@@ -1370,11 +1394,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
 		"end: %lld", cmd, flock->fl_flags, flock->fl_type,
 		flock->fl_start, flock->fl_end);
 
-	cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag);
-
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	cfile = (struct cifsFileInfo *)file->private_data;
 	tcon = tlink_tcon(cfile->tlink);
+
+	cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag,
+			tcon->ses->server);
+
+	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	netfid = cfile->netfid;
 	cinode = CIFS_I(file->f_path.dentry->d_inode);
 
@@ -1539,10 +1565,11 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
 struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
 					bool fsuid_only)
 {
-	struct cifsFileInfo *open_file;
+	struct cifsFileInfo *open_file, *inv_file = NULL;
 	struct cifs_sb_info *cifs_sb;
 	bool any_available = false;
 	int rc;
+	unsigned int refind = 0;
 
 	/* Having a null inode here (because mapping->host was set to zero by
 	the VFS or MM) should not happen but we had reports of on oops (due to
@@ -1562,40 +1589,25 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
 
 	spin_lock(&cifs_file_list_lock);
 refind_writable:
+	if (refind > MAX_REOPEN_ATT) {
+		spin_unlock(&cifs_file_list_lock);
+		return NULL;
+	}
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
 		if (!any_available && open_file->pid != current->tgid)
 			continue;
 		if (fsuid_only && open_file->uid != current_fsuid())
 			continue;
 		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
-			cifsFileInfo_get(open_file);
-
 			if (!open_file->invalidHandle) {
 				/* found a good writable file */
+				cifsFileInfo_get(open_file);
 				spin_unlock(&cifs_file_list_lock);
 				return open_file;
+			} else {
+				if (!inv_file)
+					inv_file = open_file;
 			}
-
-			spin_unlock(&cifs_file_list_lock);
-
-			/* Had to unlock since following call can block */
-			rc = cifs_reopen_file(open_file, false);
-			if (!rc)
-				return open_file;
-
-			/* if it fails, try another handle if possible */
-			cFYI(1, "wp failed on reopen file");
-			cifsFileInfo_put(open_file);
-
-			spin_lock(&cifs_file_list_lock);
-
-			/* else we simply continue to the next entry. Thus
-			   we do not loop on reopen errors.  If we
-			   can not reopen the file, for example if we
-			   reconnected to a server with another client
-			   racing to delete or lock the file we would not
-			   make progress if we restarted before the beginning
-			   of the loop here. */
 		}
 	}
 	/* couldn't find useable FH with same pid, try any available */
@@ -1603,7 +1615,30 @@ refind_writable:
 		any_available = true;
 		goto refind_writable;
 	}
+
+	if (inv_file) {
+		any_available = false;
+		cifsFileInfo_get(inv_file);
+	}
+
 	spin_unlock(&cifs_file_list_lock);
+
+	if (inv_file) {
+		rc = cifs_reopen_file(inv_file, false);
+		if (!rc)
+			return inv_file;
+		else {
+			spin_lock(&cifs_file_list_lock);
+			list_move_tail(&inv_file->flist,
+					&cifs_inode->openFileList);
+			spin_unlock(&cifs_file_list_lock);
+			cifsFileInfo_put(inv_file);
+			spin_lock(&cifs_file_list_lock);
+			++refind;
+			goto refind_writable;
+		}
+	}
+
 	return NULL;
 }
 
@@ -2339,24 +2374,224 @@ ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 	return cifs_user_writev(iocb, iov, nr_segs, pos);
 }
 
+static struct cifs_readdata *
+cifs_readdata_alloc(unsigned int nr_vecs, work_func_t complete)
+{
+	struct cifs_readdata *rdata;
+
+	rdata = kzalloc(sizeof(*rdata) +
+			sizeof(struct kvec) * nr_vecs, GFP_KERNEL);
+	if (rdata != NULL) {
+		kref_init(&rdata->refcount);
+		INIT_LIST_HEAD(&rdata->list);
+		init_completion(&rdata->done);
+		INIT_WORK(&rdata->work, complete);
+		INIT_LIST_HEAD(&rdata->pages);
+	}
+	return rdata;
+}
+
+void
+cifs_readdata_release(struct kref *refcount)
+{
+	struct cifs_readdata *rdata = container_of(refcount,
+					struct cifs_readdata, refcount);
+
+	if (rdata->cfile)
+		cifsFileInfo_put(rdata->cfile);
+
+	kfree(rdata);
+}
+
+static int
+cifs_read_allocate_pages(struct list_head *list, unsigned int npages)
+{
+	int rc = 0;
+	struct page *page, *tpage;
+	unsigned int i;
+
+	for (i = 0; i < npages; i++) {
+		page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+		if (!page) {
+			rc = -ENOMEM;
+			break;
+		}
+		list_add(&page->lru, list);
+	}
+
+	if (rc) {
+		list_for_each_entry_safe(page, tpage, list, lru) {
+			list_del(&page->lru);
+			put_page(page);
+		}
+	}
+	return rc;
+}
+
+static void
+cifs_uncached_readdata_release(struct kref *refcount)
+{
+	struct page *page, *tpage;
+	struct cifs_readdata *rdata = container_of(refcount,
+					struct cifs_readdata, refcount);
+
+	list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+		list_del(&page->lru);
+		put_page(page);
+	}
+	cifs_readdata_release(refcount);
+}
+
+static int
+cifs_retry_async_readv(struct cifs_readdata *rdata)
+{
+	int rc;
+
+	do {
+		if (rdata->cfile->invalidHandle) {
+			rc = cifs_reopen_file(rdata->cfile, true);
+			if (rc != 0)
+				continue;
+		}
+		rc = cifs_async_readv(rdata);
+	} while (rc == -EAGAIN);
+
+	return rc;
+}
+
+/**
+ * cifs_readdata_to_iov - copy data from pages in response to an iovec
+ * @rdata:	the readdata response with list of pages holding data
+ * @iov:	vector in which we should copy the data
+ * @nr_segs:	number of segments in vector
+ * @offset:	offset into file of the first iovec
+ * @copied:	used to return the amount of data copied to the iov
+ *
+ * This function copies data from a list of pages in a readdata response into
+ * an array of iovecs. It will first calculate where the data should go
+ * based on the info in the readdata and then copy the data into that spot.
+ */
+static ssize_t
+cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov,
+			unsigned long nr_segs, loff_t offset, ssize_t *copied)
+{
+	int rc = 0;
+	struct iov_iter ii;
+	size_t pos = rdata->offset - offset;
+	struct page *page, *tpage;
+	ssize_t remaining = rdata->bytes;
+	unsigned char *pdata;
+
+	/* set up iov_iter and advance to the correct offset */
+	iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0);
+	iov_iter_advance(&ii, pos);
+
+	*copied = 0;
+	list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+		ssize_t copy;
+
+		/* copy a whole page or whatever's left */
+		copy = min_t(ssize_t, remaining, PAGE_SIZE);
+
+		/* ...but limit it to whatever space is left in the iov */
+		copy = min_t(ssize_t, copy, iov_iter_count(&ii));
+
+		/* go while there's data to be copied and no errors */
+		if (copy && !rc) {
+			pdata = kmap(page);
+			rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset,
+						(int)copy);
+			kunmap(page);
+			if (!rc) {
+				*copied += copy;
+				remaining -= copy;
+				iov_iter_advance(&ii, copy);
+			}
+		}
+
+		list_del(&page->lru);
+		put_page(page);
+	}
+
+	return rc;
+}
+
+static void
+cifs_uncached_readv_complete(struct work_struct *work)
+{
+	struct cifs_readdata *rdata = container_of(work,
+						struct cifs_readdata, work);
+
+	/* if the result is non-zero then the pages weren't kmapped */
+	if (rdata->result == 0) {
+		struct page *page;
+
+		list_for_each_entry(page, &rdata->pages, lru)
+			kunmap(page);
+	}
+
+	complete(&rdata->done);
+	kref_put(&rdata->refcount, cifs_uncached_readdata_release);
+}
+
+static int
+cifs_uncached_read_marshal_iov(struct cifs_readdata *rdata,
+				unsigned int remaining)
+{
+	int len = 0;
+	struct page *page, *tpage;
+
+	rdata->nr_iov = 1;
+	list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+		if (remaining >= PAGE_SIZE) {
+			/* enough data to fill the page */
+			rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+			rdata->iov[rdata->nr_iov].iov_len = PAGE_SIZE;
+			cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+				rdata->nr_iov, page->index,
+				rdata->iov[rdata->nr_iov].iov_base,
+				rdata->iov[rdata->nr_iov].iov_len);
+			++rdata->nr_iov;
+			len += PAGE_SIZE;
+			remaining -= PAGE_SIZE;
+		} else if (remaining > 0) {
+			/* enough for partial page, fill and zero the rest */
+			rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+			rdata->iov[rdata->nr_iov].iov_len = remaining;
+			cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+				rdata->nr_iov, page->index,
+				rdata->iov[rdata->nr_iov].iov_base,
+				rdata->iov[rdata->nr_iov].iov_len);
+			memset(rdata->iov[rdata->nr_iov].iov_base + remaining,
+				'\0', PAGE_SIZE - remaining);
+			++rdata->nr_iov;
+			len += remaining;
+			remaining = 0;
+		} else {
+			/* no need to hold page hostage */
+			list_del(&page->lru);
+			put_page(page);
+		}
+	}
+
+	return len;
+}
+
 static ssize_t
 cifs_iovec_read(struct file *file, const struct iovec *iov,
 		 unsigned long nr_segs, loff_t *poffset)
 {
-	int rc;
-	int xid;
-	ssize_t total_read;
-	unsigned int bytes_read = 0;
+	ssize_t rc;
 	size_t len, cur_len;
-	int iov_offset = 0;
+	ssize_t total_read = 0;
+	loff_t offset = *poffset;
+	unsigned int npages;
 	struct cifs_sb_info *cifs_sb;
-	struct cifs_tcon *pTcon;
+	struct cifs_tcon *tcon;
 	struct cifsFileInfo *open_file;
-	struct smb_com_read_rsp *pSMBr;
-	struct cifs_io_parms io_parms;
-	char *read_data;
-	unsigned int rsize;
-	__u32 pid;
+	struct cifs_readdata *rdata, *tmp;
+	struct list_head rdata_list;
+	pid_t pid;
 
 	if (!nr_segs)
 		return 0;
@@ -2365,14 +2600,10 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
 	if (!len)
 		return 0;
 
-	xid = GetXid();
+	INIT_LIST_HEAD(&rdata_list);
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-
-	/* FIXME: set up handlers for larger reads and/or convert to async */
-	rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
-
 	open_file = file->private_data;
-	pTcon = tlink_tcon(open_file->tlink);
+	tcon = tlink_tcon(open_file->tlink);
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
 		pid = open_file->pid;
@@ -2382,56 +2613,78 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		cFYI(1, "attempting read on write only file instance");
 
-	for (total_read = 0; total_read < len; total_read += bytes_read) {
-		cur_len = min_t(const size_t, len - total_read, rsize);
-		rc = -EAGAIN;
-		read_data = NULL;
+	do {
+		cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
+		npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
 
-		while (rc == -EAGAIN) {
-			int buf_type = CIFS_NO_BUFFER;
-			if (open_file->invalidHandle) {
-				rc = cifs_reopen_file(open_file, true);
-				if (rc != 0)
-					break;
-			}
-			io_parms.netfid = open_file->netfid;
-			io_parms.pid = pid;
-			io_parms.tcon = pTcon;
-			io_parms.offset = *poffset;
-			io_parms.length = cur_len;
-			rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
-					 &read_data, &buf_type);
-			pSMBr = (struct smb_com_read_rsp *)read_data;
-			if (read_data) {
-				char *data_offset = read_data + 4 +
-						le16_to_cpu(pSMBr->DataOffset);
-				if (memcpy_toiovecend(iov, data_offset,
-						      iov_offset, bytes_read))
-					rc = -EFAULT;
-				if (buf_type == CIFS_SMALL_BUFFER)
-					cifs_small_buf_release(read_data);
-				else if (buf_type == CIFS_LARGE_BUFFER)
-					cifs_buf_release(read_data);
-				read_data = NULL;
-				iov_offset += bytes_read;
-			}
+		/* allocate a readdata struct */
+		rdata = cifs_readdata_alloc(npages,
+					    cifs_uncached_readv_complete);
+		if (!rdata) {
+			rc = -ENOMEM;
+			goto error;
 		}
 
-		if (rc || (bytes_read == 0)) {
-			if (total_read) {
-				break;
-			} else {
-				FreeXid(xid);
-				return rc;
+		rc = cifs_read_allocate_pages(&rdata->pages, npages);
+		if (rc)
+			goto error;
+
+		rdata->cfile = cifsFileInfo_get(open_file);
+		rdata->offset = offset;
+		rdata->bytes = cur_len;
+		rdata->pid = pid;
+		rdata->marshal_iov = cifs_uncached_read_marshal_iov;
+
+		rc = cifs_retry_async_readv(rdata);
+error:
+		if (rc) {
+			kref_put(&rdata->refcount,
+				 cifs_uncached_readdata_release);
+			break;
+		}
+
+		list_add_tail(&rdata->list, &rdata_list);
+		offset += cur_len;
+		len -= cur_len;
+	} while (len > 0);
+
+	/* if at least one read request send succeeded, then reset rc */
+	if (!list_empty(&rdata_list))
+		rc = 0;
+
+	/* the loop below should proceed in the order of increasing offsets */
+restart_loop:
+	list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
+		if (!rc) {
+			ssize_t copied;
+
+			/* FIXME: freezable sleep too? */
+			rc = wait_for_completion_killable(&rdata->done);
+			if (rc)
+				rc = -EINTR;
+			else if (rdata->result)
+				rc = rdata->result;
+			else {
+				rc = cifs_readdata_to_iov(rdata, iov,
+							nr_segs, *poffset,
+							&copied);
+				total_read += copied;
+			}
+
+			/* resend call if it's a retryable error */
+			if (rc == -EAGAIN) {
+				rc = cifs_retry_async_readv(rdata);
+				goto restart_loop;
 			}
-		} else {
-			cifs_stats_bytes_read(pTcon, bytes_read);
-			*poffset += bytes_read;
 		}
+		list_del_init(&rdata->list);
+		kref_put(&rdata->refcount, cifs_uncached_readdata_release);
 	}
 
-	FreeXid(xid);
-	return total_read;
+	cifs_stats_bytes_read(tcon, total_read);
+	*poffset += total_read;
+
+	return total_read ? total_read : rc;
 }
 
 ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
@@ -2606,6 +2859,100 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return rc;
 }
 
+static void
+cifs_readv_complete(struct work_struct *work)
+{
+	struct cifs_readdata *rdata = container_of(work,
+						struct cifs_readdata, work);
+	struct page *page, *tpage;
+
+	list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+		list_del(&page->lru);
+		lru_cache_add_file(page);
+
+		if (rdata->result == 0) {
+			kunmap(page);
+			flush_dcache_page(page);
+			SetPageUptodate(page);
+		}
+
+		unlock_page(page);
+
+		if (rdata->result == 0)
+			cifs_readpage_to_fscache(rdata->mapping->host, page);
+
+		page_cache_release(page);
+	}
+	kref_put(&rdata->refcount, cifs_readdata_release);
+}
+
+static int
+cifs_readpages_marshal_iov(struct cifs_readdata *rdata, unsigned int remaining)
+{
+	int len = 0;
+	struct page *page, *tpage;
+	u64 eof;
+	pgoff_t eof_index;
+
+	/* determine the eof that the server (probably) has */
+	eof = CIFS_I(rdata->mapping->host)->server_eof;
+	eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
+	cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index);
+
+	rdata->nr_iov = 1;
+	list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+		if (remaining >= PAGE_CACHE_SIZE) {
+			/* enough data to fill the page */
+			rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+			rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE;
+			cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+				rdata->nr_iov, page->index,
+				rdata->iov[rdata->nr_iov].iov_base,
+				rdata->iov[rdata->nr_iov].iov_len);
+			++rdata->nr_iov;
+			len += PAGE_CACHE_SIZE;
+			remaining -= PAGE_CACHE_SIZE;
+		} else if (remaining > 0) {
+			/* enough for partial page, fill and zero the rest */
+			rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+			rdata->iov[rdata->nr_iov].iov_len = remaining;
+			cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+				rdata->nr_iov, page->index,
+				rdata->iov[rdata->nr_iov].iov_base,
+				rdata->iov[rdata->nr_iov].iov_len);
+			memset(rdata->iov[rdata->nr_iov].iov_base + remaining,
+				'\0', PAGE_CACHE_SIZE - remaining);
+			++rdata->nr_iov;
+			len += remaining;
+			remaining = 0;
+		} else if (page->index > eof_index) {
+			/*
+			 * The VFS will not try to do readahead past the
+			 * i_size, but it's possible that we have outstanding
+			 * writes with gaps in the middle and the i_size hasn't
+			 * caught up yet. Populate those with zeroed out pages
+			 * to prevent the VFS from repeatedly attempting to
+			 * fill them until the writes are flushed.
+			 */
+			zero_user(page, 0, PAGE_CACHE_SIZE);
+			list_del(&page->lru);
+			lru_cache_add_file(page);
+			flush_dcache_page(page);
+			SetPageUptodate(page);
+			unlock_page(page);
+			page_cache_release(page);
+		} else {
+			/* no need to hold page hostage */
+			list_del(&page->lru);
+			lru_cache_add_file(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+
+	return len;
+}
+
 static int cifs_readpages(struct file *file, struct address_space *mapping,
 	struct list_head *page_list, unsigned num_pages)
 {
@@ -2708,7 +3055,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 			nr_pages++;
 		}
 
-		rdata = cifs_readdata_alloc(nr_pages);
+		rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete);
 		if (!rdata) {
 			/* best to give up if we're out of mem */
 			list_for_each_entry_safe(page, tpage, &tmplist, lru) {
@@ -2722,24 +3069,16 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		}
 
 		spin_lock(&cifs_file_list_lock);
-		cifsFileInfo_get(open_file);
 		spin_unlock(&cifs_file_list_lock);
-		rdata->cfile = open_file;
+		rdata->cfile = cifsFileInfo_get(open_file);
 		rdata->mapping = mapping;
 		rdata->offset = offset;
 		rdata->bytes = bytes;
 		rdata->pid = pid;
+		rdata->marshal_iov = cifs_readpages_marshal_iov;
 		list_splice_init(&tmplist, &rdata->pages);
 
-		do {
-			if (open_file->invalidHandle) {
-				rc = cifs_reopen_file(open_file, true);
-				if (rc != 0)
-					continue;
-			}
-			rc = cifs_async_readv(rdata);
-		} while (rc == -EAGAIN);
-
+		rc = cifs_retry_async_readv(rdata);
 		if (rc != 0) {
 			list_for_each_entry_safe(page, tpage, &rdata->pages,
 						 lru) {
@@ -2748,9 +3087,11 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 				unlock_page(page);
 				page_cache_release(page);
 			}
-			cifs_readdata_free(rdata);
+			kref_put(&rdata->refcount, cifs_readdata_release);
 			break;
 		}
+
+		kref_put(&rdata->refcount, cifs_readdata_release);
 	}
 
 	return rc;
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 4221b5e48a42..6d2667f0c98c 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -51,7 +51,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 	cifs_sb = CIFS_SB(inode->i_sb);
 
 	switch (command) {
+		static bool warned = false;
 		case CIFS_IOC_CHECKUMOUNT:
+			if (!warned) {
+				warned = true;
+				cERROR(1, "the CIFS_IOC_CHECKMOUNT ioctl will "
+					  "be deprecated in 3.7. Please "
+					  "migrate away from the use of "
+					  "umount.cifs");
+			}
 			cFYI(1, "User unmount attempted");
 			if (cifs_sb->mnt_uid == current_uid())
 				rc = 0;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index c29d1aa2c54f..e2552d2b2e42 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -306,8 +306,6 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 		const struct cifs_tcon *treeCon, int word_count
 		/* length of fixed section (word count) in two byte units  */)
 {
-	struct list_head *temp_item;
-	struct cifs_ses *ses;
 	char *temp = (char *) buffer;
 
 	memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */
@@ -337,51 +335,6 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 			/* Uid is not converted */
 			buffer->Uid = treeCon->ses->Suid;
 			buffer->Mid = GetNextMid(treeCon->ses->server);
-			if (multiuser_mount != 0) {
-		/* For the multiuser case, there are few obvious technically  */
-		/* possible mechanisms to match the local linux user (uid)    */
-		/* to a valid remote smb user (smb_uid):		      */
-		/* 	1) Query Winbind (or other local pam/nss daemon       */
-		/* 	  for userid/password/logon_domain or credential      */
-		/*      2) Query Winbind for uid to sid to username mapping   */
-		/* 	   and see if we have a matching password for existing*/
-		/*         session for that user perhas getting password by   */
-		/*         adding a new pam_cifs module that stores passwords */
-		/*         so that the cifs vfs can get at that for all logged*/
-		/*	   on users					      */
-		/*	3) (Which is the mechanism we have chosen)	      */
-		/*	   Search through sessions to the same server for a   */
-		/*	   a match on the uid that was passed in on mount     */
-		/*         with the current processes uid (or euid?) and use  */
-		/* 	   that smb uid.   If no existing smb session for     */
-		/* 	   that uid found, use the default smb session ie     */
-		/*         the smb session for the volume mounted which is    */
-		/* 	   the same as would be used if the multiuser mount   */
-		/* 	   flag were disabled.  */
-
-		/*  BB Add support for establishing new tCon and SMB Session  */
-		/*      with userid/password pairs found on the smb session   */
-		/*	for other target tcp/ip addresses 		BB    */
-				if (current_fsuid() != treeCon->ses->linux_uid) {
-					cFYI(1, "Multiuser mode and UID "
-						 "did not match tcon uid");
-					spin_lock(&cifs_tcp_ses_lock);
-					list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
-						ses = list_entry(temp_item, struct cifs_ses, smb_ses_list);
-						if (ses->linux_uid == current_fsuid()) {
-							if (ses->server == treeCon->ses->server) {
-								cFYI(1, "found matching uid substitute right smb_uid");
-								buffer->Uid = ses->Suid;
-								break;
-							} else {
-				/* BB eventually call cifs_setup_session here */
-								cFYI(1, "local UID found but no smb sess with this server exists");
-							}
-						}
-					}
-					spin_unlock(&cifs_tcp_ses_lock);
-				}
-			}
 		}
 		if (treeCon->Flags & SMB_SHARE_IS_IN_DFS)
 			buffer->Flags2 |= SMBFLG2_DFS;
@@ -700,22 +653,3 @@ backup_cred(struct cifs_sb_info *cifs_sb)
 
 	return false;
 }
-
-void
-cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add)
-{
-	spin_lock(&server->req_lock);
-	server->credits += add;
-	server->in_flight--;
-	spin_unlock(&server->req_lock);
-	wake_up(&server->request_q);
-}
-
-void
-cifs_set_credits(struct TCP_Server_Info *server, const int val)
-{
-	spin_lock(&server->req_lock);
-	server->credits = val;
-	server->oplocks = val > 1 ? enable_oplocks : false;
-	spin_unlock(&server->req_lock);
-}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index e2bbc683e018..0a8224d1c4c5 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -219,6 +219,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
 
 static int initiate_cifs_search(const int xid, struct file *file)
 {
+	__u16 search_flags;
 	int rc = 0;
 	char *full_path = NULL;
 	struct cifsFileInfo *cifsFile;
@@ -270,8 +271,12 @@ ffirst_retry:
 		cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO;
 	}
 
+	search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME;
+	if (backup_cred(cifs_sb))
+		search_flags |= CIFS_SEARCH_BACKUP_SEARCH;
+
 	rc = CIFSFindFirst(xid, pTcon, full_path, cifs_sb->local_nls,
-		&cifsFile->netfid, &cifsFile->srch_inf,
+		&cifsFile->netfid, search_flags, &cifsFile->srch_inf,
 		cifs_sb->mnt_cifs_flags &
 			CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb));
 	if (rc == 0)
@@ -502,11 +507,13 @@ static int cifs_save_resume_key(const char *current_entry,
 static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
 	struct file *file, char **ppCurrentEntry, int *num_to_ret)
 {
+	__u16 search_flags;
 	int rc = 0;
 	int pos_in_buf = 0;
 	loff_t first_entry_in_buffer;
 	loff_t index_to_find = file->f_pos;
 	struct cifsFileInfo *cifsFile = file->private_data;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	/* check if index in the buffer */
 
 	if ((cifsFile == NULL) || (ppCurrentEntry == NULL) ||
@@ -560,10 +567,14 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
 						cifsFile);
 	}
 
+	search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME;
+	if (backup_cred(cifs_sb))
+		search_flags |= CIFS_SEARCH_BACKUP_SEARCH;
+
 	while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
 	      (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
 		cFYI(1, "calling findnext2");
-		rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
+		rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, search_flags,
 				  &cifsFile->srch_inf);
 		/* FindFirst/Next set last_entry to NULL on malformed reply */
 		if (cifsFile->srch_inf.last_entry)
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
new file mode 100644
index 000000000000..d9d615fbed3f
--- /dev/null
+++ b/fs/cifs/smb1ops.c
@@ -0,0 +1,154 @@
+/*
+ *  SMB1 (CIFS) version specific operations
+ *
+ *  Copyright (c) 2012, Jeff Layton <[email protected]>
+ *
+ *  This library is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License v2 as published
+ *  by the Free Software Foundation.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *  the GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifspdu.h"
+
+/*
+ * An NT cancel request header looks just like the original request except:
+ *
+ * The Command is SMB_COM_NT_CANCEL
+ * The WordCount is zeroed out
+ * The ByteCount is zeroed out
+ *
+ * This function mangles an existing request buffer into a
+ * SMB_COM_NT_CANCEL request and then sends it.
+ */
+static int
+send_nt_cancel(struct TCP_Server_Info *server, void *buf,
+	       struct mid_q_entry *mid)
+{
+	int rc = 0;
+	struct smb_hdr *in_buf = (struct smb_hdr *)buf;
+
+	/* -4 for RFC1001 length and +2 for BCC field */
+	in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4  + 2);
+	in_buf->Command = SMB_COM_NT_CANCEL;
+	in_buf->WordCount = 0;
+	put_bcc(0, in_buf);
+
+	mutex_lock(&server->srv_mutex);
+	rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
+	if (rc) {
+		mutex_unlock(&server->srv_mutex);
+		return rc;
+	}
+	rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+	mutex_unlock(&server->srv_mutex);
+
+	cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
+		in_buf->Mid, rc);
+
+	return rc;
+}
+
+static bool
+cifs_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2)
+{
+	return ob1->netfid == ob2->netfid;
+}
+
+static unsigned int
+cifs_read_data_offset(char *buf)
+{
+	READ_RSP *rsp = (READ_RSP *)buf;
+	return le16_to_cpu(rsp->DataOffset);
+}
+
+static unsigned int
+cifs_read_data_length(char *buf)
+{
+	READ_RSP *rsp = (READ_RSP *)buf;
+	return (le16_to_cpu(rsp->DataLengthHigh) << 16) +
+	       le16_to_cpu(rsp->DataLength);
+}
+
+static struct mid_q_entry *
+cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
+{
+	struct smb_hdr *buf = (struct smb_hdr *)buffer;
+	struct mid_q_entry *mid;
+
+	spin_lock(&GlobalMid_Lock);
+	list_for_each_entry(mid, &server->pending_mid_q, qhead) {
+		if (mid->mid == buf->Mid &&
+		    mid->mid_state == MID_REQUEST_SUBMITTED &&
+		    le16_to_cpu(mid->command) == buf->Command) {
+			spin_unlock(&GlobalMid_Lock);
+			return mid;
+		}
+	}
+	spin_unlock(&GlobalMid_Lock);
+	return NULL;
+}
+
+static void
+cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add)
+{
+	spin_lock(&server->req_lock);
+	server->credits += add;
+	server->in_flight--;
+	spin_unlock(&server->req_lock);
+	wake_up(&server->request_q);
+}
+
+static void
+cifs_set_credits(struct TCP_Server_Info *server, const int val)
+{
+	spin_lock(&server->req_lock);
+	server->credits = val;
+	server->oplocks = val > 1 ? enable_oplocks : false;
+	spin_unlock(&server->req_lock);
+}
+
+static int *
+cifs_get_credits_field(struct TCP_Server_Info *server)
+{
+	return &server->credits;
+}
+
+struct smb_version_operations smb1_operations = {
+	.send_cancel = send_nt_cancel,
+	.compare_fids = cifs_compare_fids,
+	.setup_request = cifs_setup_request,
+	.check_receive = cifs_check_receive,
+	.add_credits = cifs_add_credits,
+	.set_credits = cifs_set_credits,
+	.get_credits_field = cifs_get_credits_field,
+	.read_data_offset = cifs_read_data_offset,
+	.read_data_length = cifs_read_data_length,
+	.map_error = map_smb_to_linux_error,
+	.find_mid = cifs_find_mid,
+	.check_message = checkSMB,
+	.dump_detail = cifs_dump_detail,
+	.is_oplock_break = is_valid_oplock_break,
+};
+
+struct smb_version_values smb1_values = {
+	.version_string = SMB1_VERSION_STRING,
+	.large_lock_type = LOCKING_ANDX_LARGE_FILES,
+	.exclusive_lock_type = 0,
+	.shared_lock_type = LOCKING_ANDX_SHARED_LOCK,
+	.unlock_lock_type = 0,
+	.header_size = sizeof(struct smb_hdr),
+	.max_header_size = MAX_CIFS_HDR_SIZE,
+	.read_rsp_size = sizeof(READ_RSP),
+};
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
new file mode 100644
index 000000000000..f065e89756a1
--- /dev/null
+++ b/fs/cifs/smb2ops.c
@@ -0,0 +1,27 @@
+/*
+ *  SMB2 version specific operations
+ *
+ *  Copyright (c) 2012, Jeff Layton <[email protected]>
+ *
+ *  This library is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License v2 as published
+ *  by the Free Software Foundation.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *  the GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "cifsglob.h"
+
+struct smb_version_operations smb21_operations = {
+};
+
+struct smb_version_values smb21_values = {
+	.version_string = SMB21_VERSION_STRING,
+};
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0961336513d5..1b36ffe6a47b 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -304,7 +304,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int optype,
 static int
 wait_for_free_request(struct TCP_Server_Info *server, const int optype)
 {
-	return wait_for_free_credits(server, optype, get_credits_field(server));
+	return wait_for_free_credits(server, optype,
+				     server->ops->get_credits_field(server));
 }
 
 static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
@@ -396,7 +397,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
 	rc = cifs_setup_async_request(server, iov, nvec, &mid);
 	if (rc) {
 		mutex_unlock(&server->srv_mutex);
-		cifs_add_credits(server, 1);
+		add_credits(server, 1);
 		wake_up(&server->request_q);
 		return rc;
 	}
@@ -418,7 +419,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
 	return rc;
 out_err:
 	delete_mid(mid);
-	cifs_add_credits(server, 1);
+	add_credits(server, 1);
 	wake_up(&server->request_q);
 	return rc;
 }
@@ -483,41 +484,11 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 	return rc;
 }
 
-/*
- * An NT cancel request header looks just like the original request except:
- *
- * The Command is SMB_COM_NT_CANCEL
- * The WordCount is zeroed out
- * The ByteCount is zeroed out
- *
- * This function mangles an existing request buffer into a
- * SMB_COM_NT_CANCEL request and then sends it.
- */
-static int
-send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
-		struct mid_q_entry *mid)
+static inline int
+send_cancel(struct TCP_Server_Info *server, void *buf, struct mid_q_entry *mid)
 {
-	int rc = 0;
-
-	/* -4 for RFC1001 length and +2 for BCC field */
-	in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4  + 2);
-	in_buf->Command = SMB_COM_NT_CANCEL;
-	in_buf->WordCount = 0;
-	put_bcc(0, in_buf);
-
-	mutex_lock(&server->srv_mutex);
-	rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
-	if (rc) {
-		mutex_unlock(&server->srv_mutex);
-		return rc;
-	}
-	rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
-	mutex_unlock(&server->srv_mutex);
-
-	cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
-		in_buf->Mid, rc);
-
-	return rc;
+	return server->ops->send_cancel ?
+				server->ops->send_cancel(server, buf, mid) : 0;
 }
 
 int
@@ -544,7 +515,7 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
 	return map_smb_to_linux_error(mid->resp_buf, log_error);
 }
 
-static int
+int
 cifs_setup_request(struct cifs_ses *ses, struct kvec *iov,
 		   unsigned int nvec, struct mid_q_entry **ret_mid)
 {
@@ -607,12 +578,12 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 
 	mutex_lock(&ses->server->srv_mutex);
 
-	rc = cifs_setup_request(ses, iov, n_vec, &midQ);
+	rc = ses->server->ops->setup_request(ses, iov, n_vec, &midQ);
 	if (rc) {
 		mutex_unlock(&ses->server->srv_mutex);
 		cifs_small_buf_release(buf);
 		/* Update # of requests on wire to server */
-		cifs_add_credits(ses->server, 1);
+		add_credits(ses->server, 1);
 		return rc;
 	}
 
@@ -636,13 +607,13 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 
 	rc = wait_for_response(ses->server, midQ);
 	if (rc != 0) {
-		send_nt_cancel(ses->server, (struct smb_hdr *)buf, midQ);
+		send_cancel(ses->server, buf, midQ);
 		spin_lock(&GlobalMid_Lock);
 		if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
 			midQ->callback = DeleteMidQEntry;
 			spin_unlock(&GlobalMid_Lock);
 			cifs_small_buf_release(buf);
-			cifs_add_credits(ses->server, 1);
+			add_credits(ses->server, 1);
 			return rc;
 		}
 		spin_unlock(&GlobalMid_Lock);
@@ -652,7 +623,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 
 	rc = cifs_sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
-		cifs_add_credits(ses->server, 1);
+		add_credits(ses->server, 1);
 		return rc;
 	}
 
@@ -670,14 +641,15 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 	else
 		*pRespBufType = CIFS_SMALL_BUFFER;
 
-	rc = cifs_check_receive(midQ, ses->server, flags & CIFS_LOG_ERROR);
+	rc = ses->server->ops->check_receive(midQ, ses->server,
+					     flags & CIFS_LOG_ERROR);
 
 	/* mark it so buf will not be freed by delete_mid */
 	if ((flags & CIFS_NO_RESP) == 0)
 		midQ->resp_buf = NULL;
 out:
 	delete_mid(midQ);
-	cifs_add_credits(ses->server, 1);
+	add_credits(ses->server, 1);
 
 	return rc;
 }
@@ -727,7 +699,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	if (rc) {
 		mutex_unlock(&ses->server->srv_mutex);
 		/* Update # of requests on wire to server */
-		cifs_add_credits(ses->server, 1);
+		add_credits(ses->server, 1);
 		return rc;
 	}
 
@@ -753,13 +725,13 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 
 	rc = wait_for_response(ses->server, midQ);
 	if (rc != 0) {
-		send_nt_cancel(ses->server, in_buf, midQ);
+		send_cancel(ses->server, in_buf, midQ);
 		spin_lock(&GlobalMid_Lock);
 		if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
 			/* no longer considered to be "in-flight" */
 			midQ->callback = DeleteMidQEntry;
 			spin_unlock(&GlobalMid_Lock);
-			cifs_add_credits(ses->server, 1);
+			add_credits(ses->server, 1);
 			return rc;
 		}
 		spin_unlock(&GlobalMid_Lock);
@@ -767,7 +739,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 
 	rc = cifs_sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
-		cifs_add_credits(ses->server, 1);
+		add_credits(ses->server, 1);
 		return rc;
 	}
 
@@ -783,7 +755,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	rc = cifs_check_receive(midQ, ses->server, 0);
 out:
 	delete_mid(midQ);
-	cifs_add_credits(ses->server, 1);
+	add_credits(ses->server, 1);
 
 	return rc;
 }
@@ -898,7 +870,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 		if (in_buf->Command == SMB_COM_TRANSACTION2) {
 			/* POSIX lock. We send a NT_CANCEL SMB to cause the
 			   blocking lock to return. */
-			rc = send_nt_cancel(ses->server, in_buf, midQ);
+			rc = send_cancel(ses->server, in_buf, midQ);
 			if (rc) {
 				delete_mid(midQ);
 				return rc;
@@ -919,7 +891,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 
 		rc = wait_for_response(ses->server, midQ);
 		if (rc) {
-			send_nt_cancel(ses->server, in_buf, midQ);
+			send_cancel(ses->server, in_buf, midQ);
 			spin_lock(&GlobalMid_Lock);
 			if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
 				/* no longer considered to be "in-flight" */
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 5e6dbe8958fc..e50170ca7c33 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -50,7 +50,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 
 	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
 	if (ioc) {
-		ioc_ioprio_changed(ioc, ioprio);
+		ioc->ioprio = ioprio;
 		put_io_context(ioc);
 	}
 
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 2a0e6c599147..f90f4f5cd421 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -29,9 +29,20 @@ config NFS_FS
 
 	  If unsure, say N.
 
+config NFS_V2
+	bool "NFS client support for NFS version 2"
+	depends on NFS_FS
+	default y
+	help
+	  This option enables support for version 2 of the NFS protocol
+	  (RFC 1094) in the kernel's NFS client.
+
+	  If unsure, say Y.
+
 config NFS_V3
 	bool "NFS client support for NFS version 3"
 	depends on NFS_FS
+	default y
 	help
 	  This option enables support for version 3 of the NFS protocol
 	  (RFC 1813) in the kernel's NFS client.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index b58613d0abb3..7ddd45d9f170 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -4,11 +4,12 @@
 
 obj-$(CONFIG_NFS_FS) += nfs.o
 
-nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
-			   direct.o pagelist.o proc.o read.o symlink.o unlink.o \
+nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o \
+			   direct.o pagelist.o read.o symlink.o unlink.o \
 			   write.o namespace.o mount_clnt.o \
 			   dns_resolve.o cache_lib.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
+nfs-$(CONFIG_NFS_V2)	+= proc.o nfs2xdr.o
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
 nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o
 nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 7f6a23f0244e..7ae8a608956f 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -187,7 +187,6 @@ static void bl_end_io_read(struct bio *bio, int err)
 	struct parallel_io *par = bio->bi_private;
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;
 
 	do {
 		struct page *page = bvec->bv_page;
@@ -198,9 +197,12 @@ static void bl_end_io_read(struct bio *bio, int err)
 			SetPageUptodate(page);
 	} while (bvec >= bio->bi_io_vec);
 	if (!uptodate) {
-		if (!rdata->pnfs_error)
-			rdata->pnfs_error = -EIO;
-		pnfs_set_lo_fail(rdata->lseg);
+		struct nfs_read_data *rdata = par->data;
+		struct nfs_pgio_header *header = rdata->header;
+
+		if (!header->pnfs_error)
+			header->pnfs_error = -EIO;
+		pnfs_set_lo_fail(header->lseg);
 	}
 	bio_put(bio);
 	put_parallel(par);
@@ -221,7 +223,7 @@ bl_end_par_io_read(void *data, int unused)
 {
 	struct nfs_read_data *rdata = data;
 
-	rdata->task.tk_status = rdata->pnfs_error;
+	rdata->task.tk_status = rdata->header->pnfs_error;
 	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
 	schedule_work(&rdata->task.u.tk_work);
 }
@@ -229,6 +231,7 @@ bl_end_par_io_read(void *data, int unused)
 static enum pnfs_try_status
 bl_read_pagelist(struct nfs_read_data *rdata)
 {
+	struct nfs_pgio_header *header = rdata->header;
 	int i, hole;
 	struct bio *bio = NULL;
 	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
@@ -239,7 +242,7 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
 
 	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
-	       rdata->npages, f_offset, (unsigned int)rdata->args.count);
+	       rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);
 
 	par = alloc_parallel(rdata);
 	if (!par)
@@ -249,17 +252,17 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 
 	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
 	/* Code assumes extents are page-aligned */
-	for (i = pg_index; i < rdata->npages; i++) {
+	for (i = pg_index; i < rdata->pages.npages; i++) {
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
 			bl_put_extent(cow_read);
 			bio = bl_submit_bio(READ, bio);
 			/* Get the next one */
-			be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
+			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
 					     isect, &cow_read);
 			if (!be) {
-				rdata->pnfs_error = -EIO;
+				header->pnfs_error = -EIO;
 				goto out;
 			}
 			extent_length = be->be_length -
@@ -282,11 +285,12 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 			struct pnfs_block_extent *be_read;
 
 			be_read = (hole && cow_read) ? cow_read : be;
-			bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
+			bio = bl_add_page_to_bio(bio, rdata->pages.npages - i,
+						 READ,
 						 isect, pages[i], be_read,
 						 bl_end_io_read, par);
 			if (IS_ERR(bio)) {
-				rdata->pnfs_error = PTR_ERR(bio);
+				header->pnfs_error = PTR_ERR(bio);
 				bio = NULL;
 				goto out;
 			}
@@ -294,9 +298,9 @@ bl_read_pagelist(struct nfs_read_data *rdata)
 		isect += PAGE_CACHE_SECTORS;
 		extent_length -= PAGE_CACHE_SECTORS;
 	}
-	if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
+	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
 		rdata->res.eof = 1;
-		rdata->res.count = rdata->inode->i_size - f_offset;
+		rdata->res.count = header->inode->i_size - f_offset;
 	} else {
 		rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
 	}
@@ -345,7 +349,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
 	struct parallel_io *par = bio->bi_private;
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
 
 	do {
 		struct page *page = bvec->bv_page;
@@ -358,9 +361,12 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
 	} while (bvec >= bio->bi_io_vec);
 
 	if (unlikely(!uptodate)) {
-		if (!wdata->pnfs_error)
-			wdata->pnfs_error = -EIO;
-		pnfs_set_lo_fail(wdata->lseg);
+		struct nfs_write_data *data = par->data;
+		struct nfs_pgio_header *header = data->header;
+
+		if (!header->pnfs_error)
+			header->pnfs_error = -EIO;
+		pnfs_set_lo_fail(header->lseg);
 	}
 	bio_put(bio);
 	put_parallel(par);
@@ -370,12 +376,13 @@ static void bl_end_io_write(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
+	struct nfs_write_data *data = par->data;
+	struct nfs_pgio_header *header = data->header;
 
 	if (!uptodate) {
-		if (!wdata->pnfs_error)
-			wdata->pnfs_error = -EIO;
-		pnfs_set_lo_fail(wdata->lseg);
+		if (!header->pnfs_error)
+			header->pnfs_error = -EIO;
+		pnfs_set_lo_fail(header->lseg);
 	}
 	bio_put(bio);
 	put_parallel(par);
@@ -391,9 +398,9 @@ static void bl_write_cleanup(struct work_struct *work)
 	dprintk("%s enter\n", __func__);
 	task = container_of(work, struct rpc_task, u.tk_work);
 	wdata = container_of(task, struct nfs_write_data, task);
-	if (likely(!wdata->pnfs_error)) {
+	if (likely(!wdata->header->pnfs_error)) {
 		/* Marks for LAYOUTCOMMIT */
-		mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+		mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg),
 				     wdata->args.offset, wdata->args.count);
 	}
 	pnfs_ld_write_done(wdata);
@@ -404,12 +411,12 @@ static void bl_end_par_io_write(void *data, int num_se)
 {
 	struct nfs_write_data *wdata = data;
 
-	if (unlikely(wdata->pnfs_error)) {
-		bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
+	if (unlikely(wdata->header->pnfs_error)) {
+		bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval,
 					num_se);
 	}
 
-	wdata->task.tk_status = wdata->pnfs_error;
+	wdata->task.tk_status = wdata->header->pnfs_error;
 	wdata->verf.committed = NFS_FILE_SYNC;
 	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
 	schedule_work(&wdata->task.u.tk_work);
@@ -540,6 +547,7 @@ check_page:
 static enum pnfs_try_status
 bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 {
+	struct nfs_pgio_header *header = wdata->header;
 	int i, ret, npg_zero, pg_index, last = 0;
 	struct bio *bio = NULL;
 	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
@@ -552,7 +560,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	pgoff_t index;
 	u64 temp;
 	int npg_per_block =
-	    NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+	    NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
 
 	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
 	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
@@ -566,7 +574,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	/* At this point, have to be more careful with error handling */
 
 	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
-	be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
+	be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
 	if (!be || !is_writable(be, isect)) {
 		dprintk("%s no matching extents!\n", __func__);
 		goto out_mds;
@@ -597,10 +605,10 @@ fill_invalid_ext:
 			dprintk("%s zero %dth page: index %lu isect %llu\n",
 				__func__, npg_zero, index,
 				(unsigned long long)isect);
-			page = bl_find_get_zeroing_page(wdata->inode, index,
+			page = bl_find_get_zeroing_page(header->inode, index,
 							cow_read);
 			if (unlikely(IS_ERR(page))) {
-				wdata->pnfs_error = PTR_ERR(page);
+				header->pnfs_error = PTR_ERR(page);
 				goto out;
 			} else if (page == NULL)
 				goto next_page;
@@ -612,7 +620,7 @@ fill_invalid_ext:
 					__func__, ret);
 				end_page_writeback(page);
 				page_cache_release(page);
-				wdata->pnfs_error = ret;
+				header->pnfs_error = ret;
 				goto out;
 			}
 			if (likely(!bl_push_one_short_extent(be->be_inval)))
@@ -620,11 +628,11 @@ fill_invalid_ext:
 			else {
 				end_page_writeback(page);
 				page_cache_release(page);
-				wdata->pnfs_error = -ENOMEM;
+				header->pnfs_error = -ENOMEM;
 				goto out;
 			}
 			/* FIXME: This should be done in bi_end_io */
-			mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+			mark_extents_written(BLK_LSEG2EXT(header->lseg),
 					     page->index << PAGE_CACHE_SHIFT,
 					     PAGE_CACHE_SIZE);
 
@@ -632,7 +640,7 @@ fill_invalid_ext:
 						 isect, page, be,
 						 bl_end_io_write_zero, par);
 			if (IS_ERR(bio)) {
-				wdata->pnfs_error = PTR_ERR(bio);
+				header->pnfs_error = PTR_ERR(bio);
 				bio = NULL;
 				goto out;
 			}
@@ -647,16 +655,16 @@ next_page:
 
 	/* Middle pages */
 	pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
-	for (i = pg_index; i < wdata->npages; i++) {
+	for (i = pg_index; i < wdata->pages.npages; i++) {
 		if (!extent_length) {
 			/* We've used up the previous extent */
 			bl_put_extent(be);
 			bio = bl_submit_bio(WRITE, bio);
 			/* Get the next one */
-			be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
+			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
 					     isect, NULL);
 			if (!be || !is_writable(be, isect)) {
-				wdata->pnfs_error = -EINVAL;
+				header->pnfs_error = -EINVAL;
 				goto out;
 			}
 			if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
@@ -664,7 +672,7 @@ next_page:
 								be->be_inval)))
 					par->bse_count++;
 				else {
-					wdata->pnfs_error = -ENOMEM;
+					header->pnfs_error = -ENOMEM;
 					goto out;
 				}
 			}
@@ -677,15 +685,15 @@ next_page:
 			if (unlikely(ret)) {
 				dprintk("%s bl_mark_sectors_init fail %d\n",
 					__func__, ret);
-				wdata->pnfs_error = ret;
+				header->pnfs_error = ret;
 				goto out;
 			}
 		}
-		bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
+		bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
 					 isect, pages[i], be,
 					 bl_end_io_write, par);
 		if (IS_ERR(bio)) {
-			wdata->pnfs_error = PTR_ERR(bio);
+			header->pnfs_error = PTR_ERR(bio);
 			bio = NULL;
 			goto out;
 		}
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index a5c88a554d92..c96554245ccf 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -123,7 +123,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
 	uint8_t *dataptr;
 	DECLARE_WAITQUEUE(wq, current);
 	int offset, len, i, rc;
-	struct net *net = server->nfs_client->net;
+	struct net *net = server->nfs_client->cl_net;
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
 	struct bl_dev_msg *reply = &nn->bl_mount_reply;
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 60f7e4ec842c..7d108753af81 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -65,7 +65,7 @@ static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
 static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
 {
 	int ret = 0;
-	struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
 
 	if (clp->rpc_ops->version != 4 || minorversion != 0)
 		return ret;
@@ -90,7 +90,9 @@ static bool nfs4_disable_idmapping = true;
  * RPC cruft for NFS
  */
 static const struct rpc_version *nfs_version[5] = {
+#ifdef CONFIG_NFS_V2
 	[2]			= &nfs_version2,
+#endif
 #ifdef CONFIG_NFS_V3
 	[3]			= &nfs_version3,
 #endif
@@ -129,6 +131,7 @@ const struct rpc_program nfsacl_program = {
 #endif  /* CONFIG_NFS_V3_ACL */
 
 struct nfs_client_initdata {
+	unsigned long init_flags;
 	const char *hostname;
 	const struct sockaddr *addr;
 	size_t addrlen;
@@ -172,7 +175,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	clp->cl_rpcclient = ERR_PTR(-EINVAL);
 
 	clp->cl_proto = cl_init->proto;
-	clp->net = get_net(cl_init->net);
+	clp->cl_net = get_net(cl_init->net);
 
 #ifdef CONFIG_NFS_V4
 	err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
@@ -182,7 +185,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	spin_lock_init(&clp->cl_lock);
 	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
 	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
-	clp->cl_boot_time = CURRENT_TIME;
 	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
 	clp->cl_minorversion = cl_init->minorversion;
 	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
@@ -207,6 +209,7 @@ static void nfs4_shutdown_session(struct nfs_client *clp)
 	if (nfs4_has_session(clp)) {
 		nfs4_deviceid_purge_client(clp);
 		nfs4_destroy_session(clp->cl_session);
+		nfs4_destroy_clientid(clp);
 	}
 
 }
@@ -235,6 +238,9 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
 		nfs_idmap_delete(clp);
 
 	rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
+	kfree(clp->cl_serverowner);
+	kfree(clp->cl_serverscope);
+	kfree(clp->cl_implid);
 }
 
 /* idr_remove_all is not needed as all id's are removed by nfs_put_client */
@@ -248,7 +254,7 @@ void nfs_cleanup_cb_ident_idr(struct net *net)
 /* nfs_client_lock held */
 static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
 {
-	struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
 
 	if (clp->cl_cb_ident)
 		idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident);
@@ -301,10 +307,8 @@ static void nfs_free_client(struct nfs_client *clp)
 	if (clp->cl_machine_cred != NULL)
 		put_rpccred(clp->cl_machine_cred);
 
-	put_net(clp->net);
+	put_net(clp->cl_net);
 	kfree(clp->cl_hostname);
-	kfree(clp->server_scope);
-	kfree(clp->impl_id);
 	kfree(clp);
 
 	dprintk("<-- nfs_free_client()\n");
@@ -321,7 +325,7 @@ void nfs_put_client(struct nfs_client *clp)
 		return;
 
 	dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count));
-	nn = net_generic(clp->net, nfs_net_id);
+	nn = net_generic(clp->cl_net, nfs_net_id);
 
 	if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {
 		list_del(&clp->cl_share_link);
@@ -456,6 +460,8 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
 	    clp->cl_cons_state == NFS_CS_SESSION_INITING))
 		return false;
 
+	smp_rmb();
+
 	/* Match the version and minorversion */
 	if (clp->rpc_ops->version != 4 ||
 	    clp->cl_minorversion != minorversion)
@@ -504,6 +510,47 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
 	return NULL;
 }
 
+static bool nfs_client_init_is_complete(const struct nfs_client *clp)
+{
+	return clp->cl_cons_state != NFS_CS_INITING;
+}
+
+int nfs_wait_client_init_complete(const struct nfs_client *clp)
+{
+	return wait_event_killable(nfs_client_active_wq,
+			nfs_client_init_is_complete(clp));
+}
+
+/*
+ * Found an existing client.  Make sure it's ready before returning.
+ */
+static struct nfs_client *
+nfs_found_client(const struct nfs_client_initdata *cl_init,
+		 struct nfs_client *clp)
+{
+	int error;
+
+	error = nfs_wait_client_init_complete(clp);
+	if (error < 0) {
+		nfs_put_client(clp);
+		return ERR_PTR(-ERESTARTSYS);
+	}
+
+	if (clp->cl_cons_state < NFS_CS_READY) {
+		error = clp->cl_cons_state;
+		nfs_put_client(clp);
+		return ERR_PTR(error);
+	}
+
+	smp_rmb();
+
+	BUG_ON(clp->cl_cons_state != NFS_CS_READY);
+
+	dprintk("<-- %s found nfs_client %p for %s\n",
+		__func__, clp, cl_init->hostname ?: "");
+	return clp;
+}
+
 /*
  * Look up a client by IP address and protocol version
  * - creates a new record if one doesn't yet exist
@@ -512,11 +559,9 @@ static struct nfs_client *
 nfs_get_client(const struct nfs_client_initdata *cl_init,
 	       const struct rpc_timeout *timeparms,
 	       const char *ip_addr,
-	       rpc_authflavor_t authflavour,
-	       int noresvport)
+	       rpc_authflavor_t authflavour)
 {
 	struct nfs_client *clp, *new = NULL;
-	int error;
 	struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
 
 	dprintk("--> nfs_get_client(%s,v%u)\n",
@@ -527,60 +572,29 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
 		spin_lock(&nn->nfs_client_lock);
 
 		clp = nfs_match_client(cl_init);
-		if (clp)
-			goto found_client;
-		if (new)
-			goto install_client;
+		if (clp) {
+			spin_unlock(&nn->nfs_client_lock);
+			if (new)
+				nfs_free_client(new);
+			return nfs_found_client(cl_init, clp);
+		}
+		if (new) {
+			list_add(&new->cl_share_link, &nn->nfs_client_list);
+			spin_unlock(&nn->nfs_client_lock);
+			new->cl_flags = cl_init->init_flags;
+			return cl_init->rpc_ops->init_client(new,
+						timeparms, ip_addr,
+						authflavour);
+		}
 
 		spin_unlock(&nn->nfs_client_lock);
 
 		new = nfs_alloc_client(cl_init);
 	} while (!IS_ERR(new));
 
-	dprintk("--> nfs_get_client() = %ld [failed]\n", PTR_ERR(new));
+	dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
+		cl_init->hostname ?: "", PTR_ERR(new));
 	return new;
-
-	/* install a new client and return with it unready */
-install_client:
-	clp = new;
-	list_add(&clp->cl_share_link, &nn->nfs_client_list);
-	spin_unlock(&nn->nfs_client_lock);
-
-	error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
-					      authflavour, noresvport);
-	if (error < 0) {
-		nfs_put_client(clp);
-		return ERR_PTR(error);
-	}
-	dprintk("--> nfs_get_client() = %p [new]\n", clp);
-	return clp;
-
-	/* found an existing client
-	 * - make sure it's ready before returning
-	 */
-found_client:
-	spin_unlock(&nn->nfs_client_lock);
-
-	if (new)
-		nfs_free_client(new);
-
-	error = wait_event_killable(nfs_client_active_wq,
-				clp->cl_cons_state < NFS_CS_INITING);
-	if (error < 0) {
-		nfs_put_client(clp);
-		return ERR_PTR(-ERESTARTSYS);
-	}
-
-	if (clp->cl_cons_state < NFS_CS_READY) {
-		error = clp->cl_cons_state;
-		nfs_put_client(clp);
-		return ERR_PTR(error);
-	}
-
-	BUG_ON(clp->cl_cons_state != NFS_CS_READY);
-
-	dprintk("--> nfs_get_client() = %p [share]\n", clp);
-	return clp;
 }
 
 /*
@@ -588,27 +602,12 @@ found_client:
  */
 void nfs_mark_client_ready(struct nfs_client *clp, int state)
 {
+	smp_wmb();
 	clp->cl_cons_state = state;
 	wake_up_all(&nfs_client_active_wq);
 }
 
 /*
- * With sessions, the client is not marked ready until after a
- * successful EXCHANGE_ID and CREATE_SESSION.
- *
- * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate
- * other versions of NFS can be tried.
- */
-int nfs4_check_client_ready(struct nfs_client *clp)
-{
-	if (!nfs4_has_session(clp))
-		return 0;
-	if (clp->cl_cons_state < NFS_CS_READY)
-		return -EPROTONOSUPPORT;
-	return 0;
-}
-
-/*
  * Initialise the timeout values for a connection
  */
 static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
@@ -654,12 +653,11 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
  */
 static int nfs_create_rpc_client(struct nfs_client *clp,
 				 const struct rpc_timeout *timeparms,
-				 rpc_authflavor_t flavor,
-				 int discrtry, int noresvport)
+				 rpc_authflavor_t flavor)
 {
 	struct rpc_clnt		*clnt = NULL;
 	struct rpc_create_args args = {
-		.net		= clp->net,
+		.net		= clp->cl_net,
 		.protocol	= clp->cl_proto,
 		.address	= (struct sockaddr *)&clp->cl_addr,
 		.addrsize	= clp->cl_addrlen,
@@ -670,9 +668,9 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
 		.authflavor	= flavor,
 	};
 
-	if (discrtry)
+	if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))
 		args.flags |= RPC_CLNT_CREATE_DISCRTRY;
-	if (noresvport)
+	if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))
 		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 
 	if (!IS_ERR(clp->cl_rpcclient))
@@ -713,7 +711,7 @@ static int nfs_start_lockd(struct nfs_server *server)
 		.nfs_version	= clp->rpc_ops->version,
 		.noresvport	= server->flags & NFS_MOUNT_NORESVPORT ?
 					1 : 0,
-		.net		= clp->net,
+		.net		= clp->cl_net,
 	};
 
 	if (nlm_init.nfs_version > 3)
@@ -805,36 +803,43 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
 	return 0;
 }
 
-/*
- * Initialise an NFS2 or NFS3 client
+/**
+ * nfs_init_client - Initialise an NFS2 or NFS3 client
+ *
+ * @clp: nfs_client to initialise
+ * @timeparms: timeout parameters for underlying RPC transport
+ * @ip_addr: IP presentation address (not used)
+ * @authflavor: authentication flavor for underlying RPC transport
+ *
+ * Returns pointer to an NFS client, or an ERR_PTR value.
  */
-int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
-		    const char *ip_addr, rpc_authflavor_t authflavour,
-		    int noresvport)
+struct nfs_client *nfs_init_client(struct nfs_client *clp,
+		    const struct rpc_timeout *timeparms,
+		    const char *ip_addr, rpc_authflavor_t authflavour)
 {
 	int error;
 
 	if (clp->cl_cons_state == NFS_CS_READY) {
 		/* the client is already initialised */
 		dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp);
-		return 0;
+		return clp;
 	}
 
 	/*
 	 * Create a client RPC handle for doing FSSTAT with UNIX auth only
 	 * - RFC 2623, sec 2.3.2
 	 */
-	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
-				      0, noresvport);
+	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
 	if (error < 0)
 		goto error;
 	nfs_mark_client_ready(clp, NFS_CS_READY);
-	return 0;
+	return clp;
 
 error:
 	nfs_mark_client_ready(clp, error);
+	nfs_put_client(clp);
 	dprintk("<-- nfs_init_client() = xerror %d\n", error);
-	return error;
+	return ERR_PTR(error);
 }
 
 /*
@@ -847,7 +852,7 @@ static int nfs_init_server(struct nfs_server *server,
 		.hostname = data->nfs_server.hostname,
 		.addr = (const struct sockaddr *)&data->nfs_server.address,
 		.addrlen = data->nfs_server.addrlen,
-		.rpc_ops = &nfs_v2_clientops,
+		.rpc_ops = NULL,
 		.proto = data->nfs_server.protocol,
 		.net = data->net,
 	};
@@ -857,17 +862,28 @@ static int nfs_init_server(struct nfs_server *server,
 
 	dprintk("--> nfs_init_server()\n");
 
+	switch (data->version) {
+#ifdef CONFIG_NFS_V2
+	case 2:
+		cl_init.rpc_ops = &nfs_v2_clientops;
+		break;
+#endif
 #ifdef CONFIG_NFS_V3
-	if (data->version == 3)
+	case 3:
 		cl_init.rpc_ops = &nfs_v3_clientops;
+		break;
 #endif
+	default:
+		return -EPROTONOSUPPORT;
+	}
 
 	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
 			data->timeo, data->retrans);
+	if (data->flags & NFS_MOUNT_NORESVPORT)
+		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
-			     data->flags & NFS_MOUNT_NORESVPORT);
+	clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
 	if (IS_ERR(clp)) {
 		dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
 		return PTR_ERR(clp);
@@ -880,7 +896,7 @@ static int nfs_init_server(struct nfs_server *server,
 	server->options = data->options;
 	server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
 		NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
-		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
+		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR;
 
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1048,7 +1064,7 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
 static void nfs_server_insert_lists(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
-	struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
 
 	spin_lock(&nn->nfs_client_lock);
 	list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
@@ -1065,7 +1081,7 @@ static void nfs_server_remove_lists(struct nfs_server *server)
 
 	if (clp == NULL)
 		return;
-	nn = net_generic(clp->net, nfs_net_id);
+	nn = net_generic(clp->cl_net, nfs_net_id);
 	spin_lock(&nn->nfs_client_lock);
 	list_del_rcu(&server->client_link);
 	if (list_empty(&clp->cl_superblocks))
@@ -1333,21 +1349,27 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
 		 * so that the client back channel can find the
 		 * nfs_client struct
 		 */
-		clp->cl_cons_state = NFS_CS_SESSION_INITING;
+		nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING);
 	}
 #endif /* CONFIG_NFS_V4_1 */
 
 	return nfs4_init_callback(clp);
 }
 
-/*
- * Initialise an NFS4 client record
+/**
+ * nfs4_init_client - Initialise an NFS4 client record
+ *
+ * @clp: nfs_client to initialise
+ * @timeparms: timeout parameters for underlying RPC transport
+ * @ip_addr: callback IP address in presentation format
+ * @authflavor: authentication flavor for underlying RPC transport
+ *
+ * Returns pointer to an NFS client, or an ERR_PTR value.
  */
-int nfs4_init_client(struct nfs_client *clp,
-		     const struct rpc_timeout *timeparms,
-		     const char *ip_addr,
-		     rpc_authflavor_t authflavour,
-		     int noresvport)
+struct nfs_client *nfs4_init_client(struct nfs_client *clp,
+				    const struct rpc_timeout *timeparms,
+				    const char *ip_addr,
+				    rpc_authflavor_t authflavour)
 {
 	char buf[INET6_ADDRSTRLEN + 1];
 	int error;
@@ -1355,14 +1377,14 @@ int nfs4_init_client(struct nfs_client *clp,
 	if (clp->cl_cons_state == NFS_CS_READY) {
 		/* the client is initialised already */
 		dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp);
-		return 0;
+		return clp;
 	}
 
 	/* Check NFS protocol revision and initialize RPC op vector */
 	clp->rpc_ops = &nfs_v4_clientops;
 
-	error = nfs_create_rpc_client(clp, timeparms, authflavour,
-				      1, noresvport);
+	__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
+	error = nfs_create_rpc_client(clp, timeparms, authflavour);
 	if (error < 0)
 		goto error;
 
@@ -1395,12 +1417,13 @@ int nfs4_init_client(struct nfs_client *clp,
 
 	if (!nfs4_has_session(clp))
 		nfs_mark_client_ready(clp, NFS_CS_READY);
-	return 0;
+	return clp;
 
 error:
 	nfs_mark_client_ready(clp, error);
+	nfs_put_client(clp);
 	dprintk("<-- nfs4_init_client() = xerror %d\n", error);
-	return error;
+	return ERR_PTR(error);
 }
 
 /*
@@ -1429,9 +1452,11 @@ static int nfs4_set_client(struct nfs_server *server,
 
 	dprintk("--> nfs4_set_client()\n");
 
+	if (server->flags & NFS_MOUNT_NORESVPORT)
+		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
-			     server->flags & NFS_MOUNT_NORESVPORT);
+	clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
 	if (IS_ERR(clp)) {
 		error = PTR_ERR(clp);
 		goto error;
@@ -1465,8 +1490,8 @@ error:
  * the MDS.
  */
 struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
-		const struct sockaddr *ds_addr,
-		int ds_addrlen, int ds_proto)
+		const struct sockaddr *ds_addr, int ds_addrlen,
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
 {
 	struct nfs_client_initdata cl_init = {
 		.addr = ds_addr,
@@ -1474,14 +1499,9 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 		.rpc_ops = &nfs_v4_clientops,
 		.proto = ds_proto,
 		.minorversion = mds_clp->cl_minorversion,
-		.net = mds_clp->net,
-	};
-	struct rpc_timeout ds_timeout = {
-		.to_initval = 15 * HZ,
-		.to_maxval = 15 * HZ,
-		.to_retries = 1,
-		.to_exponential = 1,
+		.net = mds_clp->cl_net,
 	};
+	struct rpc_timeout ds_timeout;
 	struct nfs_client *clp;
 
 	/*
@@ -1489,8 +1509,9 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 	 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
 	 * (section 13.1 RFC 5661).
 	 */
+	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
 	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
-			     mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
+			     mds_clp->cl_rpcclient->cl_auth->au_flavor);
 
 	dprintk("<-- %s %p\n", __func__, clp);
 	return clp;
@@ -1701,7 +1722,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 				rpc_protocol(parent_server->client),
 				parent_server->client->cl_timeout,
 				parent_client->cl_mvops->minor_version,
-				parent_client->net);
+				parent_client->cl_net);
 	if (error < 0)
 		goto error;
 
@@ -1805,6 +1826,7 @@ void nfs_clients_init(struct net *net)
 	idr_init(&nn->cb_ident_idr);
 #endif
 	spin_lock_init(&nn->nfs_client_lock);
+	nn->boot_time = CURRENT_TIME;
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 89af1d269274..bd3a9601d32d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -316,6 +316,10 @@ out:
  * nfs_client_return_marked_delegations - return previously marked delegations
  * @clp: nfs_client to process
  *
+ * Note that this function is designed to be called by the state
+ * manager thread. For this reason, it cannot flush the dirty data,
+ * since that could deadlock in case of a state recovery error.
+ *
  * Returns zero on success, or a negative errno value.
  */
 int nfs_client_return_marked_delegations(struct nfs_client *clp)
@@ -340,11 +344,9 @@ restart:
 								server);
 			rcu_read_unlock();
 
-			if (delegation != NULL) {
-				filemap_flush(inode->i_mapping);
+			if (delegation != NULL)
 				err = __nfs_inode_return_delegation(inode,
 								delegation, 0);
-			}
 			iput(inode);
 			if (!err)
 				goto restart;
@@ -380,6 +382,10 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
  * nfs_inode_return_delegation - synchronously return a delegation
  * @inode: inode to process
  *
+ * This routine will always flush any dirty data to disk on the
+ * assumption that if we need to return the delegation, then
+ * we should stop caching.
+ *
  * Returns zero on success, or a negative errno value.
  */
 int nfs_inode_return_delegation(struct inode *inode)
@@ -389,10 +395,10 @@ int nfs_inode_return_delegation(struct inode *inode)
 	struct nfs_delegation *delegation;
 	int err = 0;
 
+	nfs_wb_all(inode);
 	if (rcu_access_pointer(nfsi->delegation) != NULL) {
 		delegation = nfs_detach_delegation(nfsi, server);
 		if (delegation != NULL) {
-			nfs_wb_all(inode);
 			err = __nfs_inode_return_delegation(inode, delegation, 1);
 		}
 	}
@@ -538,6 +544,8 @@ int nfs_async_inode_return_delegation(struct inode *inode,
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs_delegation *delegation;
 
+	filemap_flush(inode->i_mapping);
+
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
 
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index cd6a7a8dadae..72709c4193fa 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -66,6 +66,7 @@ static inline int nfs_have_delegation(struct inode *inode, fmode_t flags)
 
 static inline int nfs_inode_return_delegation(struct inode *inode)
 {
+	nfs_wb_all(inode);
 	return 0;
 }
 #endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index eedd24d0ad2e..0989a2099688 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -475,6 +475,29 @@ different:
 }
 
 static
+bool nfs_use_readdirplus(struct inode *dir, struct file *filp)
+{
+	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
+		return false;
+	if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
+		return true;
+	if (filp->f_pos == 0)
+		return true;
+	return false;
+}
+
+/*
+ * This function is called by the lookup code to request the use of
+ * readdirplus to accelerate any future lookups in the same
+ * directory.
+ */
+static
+void nfs_advise_use_readdirplus(struct inode *dir)
+{
+	set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);
+}
+
+static
 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 {
 	struct qstr filename = QSTR_INIT(entry->name, entry->len);
@@ -871,7 +894,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	desc->file = filp;
 	desc->dir_cookie = &dir_ctx->dir_cookie;
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
-	desc->plus = NFS_USE_READDIRPLUS(inode);
+	desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0;
 
 	nfs_block_sillyrename(dentry);
 	res = nfs_revalidate_mapping(inode, filp->f_mapping);
@@ -1111,7 +1134,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
 	if (!inode) {
 		if (nfs_neg_need_reval(dir, dentry, nd))
 			goto out_bad;
-		goto out_valid;
+		goto out_valid_noent;
 	}
 
 	if (is_bad_inode(inode)) {
@@ -1140,7 +1163,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
 	if (fhandle == NULL || fattr == NULL)
 		goto out_error;
 
-	error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
+	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
 	if (error)
 		goto out_bad;
 	if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1153,6 +1176,9 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
 out_set_verifier:
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
  out_valid:
+	/* Success: notify readdir to use READDIRPLUS */
+	nfs_advise_use_readdirplus(dir);
+ out_valid_noent:
 	dput(parent);
 	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
 			__func__, dentry->d_parent->d_name.name,
@@ -1296,7 +1322,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 	parent = dentry->d_parent;
 	/* Protect against concurrent sillydeletes */
 	nfs_block_sillyrename(parent);
-	error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
+	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
 	if (error == -ENOENT)
 		goto no_entry;
 	if (error < 0) {
@@ -1308,6 +1334,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 	if (IS_ERR(res))
 		goto out_unblock_sillyrename;
 
+	/* Success: notify readdir to use READDIRPLUS */
+	nfs_advise_use_readdirplus(dir);
+
 no_entry:
 	res = d_materialise_unique(dentry, inode);
 	if (res != NULL) {
@@ -1643,7 +1672,7 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
 	if (dentry->d_inode)
 		goto out;
 	if (fhandle->size == 0) {
-		error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
+		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
 		if (error)
 			goto out_error;
 	}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 481be7f7bdd3..23d170bc44f4 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -56,6 +56,7 @@
 
 #include "internal.h"
 #include "iostat.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -81,16 +82,19 @@ struct nfs_direct_req {
 	struct completion	completion;	/* wait for i/o completion */
 
 	/* commit state */
-	struct list_head	rewrite_list;	/* saved nfs_write_data structs */
-	struct nfs_write_data *	commit_data;	/* special write_data for commits */
+	struct nfs_mds_commit_info mds_cinfo;	/* Storage for cinfo */
+	struct pnfs_ds_commit_info ds_cinfo;	/* Storage for cinfo */
+	struct work_struct	work;
 	int			flags;
 #define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
 #define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
 	struct nfs_writeverf	verf;		/* unstable write verifier */
 };
 
+static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
+static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
-static const struct rpc_call_ops nfs_write_direct_ops;
+static void nfs_direct_write_schedule_work(struct work_struct *work);
 
 static inline void get_dreq(struct nfs_direct_req *dreq)
 {
@@ -124,22 +128,6 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_
 	return -EINVAL;
 }
 
-static void nfs_direct_dirty_pages(struct page **pages, unsigned int pgbase, size_t count)
-{
-	unsigned int npages;
-	unsigned int i;
-
-	if (count == 0)
-		return;
-	pages += (pgbase >> PAGE_SHIFT);
-	npages = (count + (pgbase & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	for (i = 0; i < npages; i++) {
-		struct page *page = pages[i];
-		if (!PageCompound(page))
-			set_page_dirty(page);
-	}
-}
-
 static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 {
 	unsigned int i;
@@ -147,26 +135,30 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 		page_cache_release(pages[i]);
 }
 
+void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
+			      struct nfs_direct_req *dreq)
+{
+	cinfo->lock = &dreq->lock;
+	cinfo->mds = &dreq->mds_cinfo;
+	cinfo->ds = &dreq->ds_cinfo;
+	cinfo->dreq = dreq;
+	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
+}
+
 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 {
 	struct nfs_direct_req *dreq;
 
-	dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL);
+	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
 	if (!dreq)
 		return NULL;
 
 	kref_init(&dreq->kref);
 	kref_get(&dreq->kref);
 	init_completion(&dreq->completion);
-	INIT_LIST_HEAD(&dreq->rewrite_list);
-	dreq->iocb = NULL;
-	dreq->ctx = NULL;
-	dreq->l_ctx = NULL;
+	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
+	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
 	spin_lock_init(&dreq->lock);
-	atomic_set(&dreq->io_count, 0);
-	dreq->count = 0;
-	dreq->error = 0;
-	dreq->flags = 0;
 
 	return dreq;
 }
@@ -226,47 +218,80 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 	nfs_direct_req_release(dreq);
 }
 
-/*
- * We must hold a reference to all the pages in this direct read request
- * until the RPCs complete.  This could be long *after* we are woken up in
- * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
- */
-static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
+static void nfs_direct_readpage_release(struct nfs_page *req)
 {
-	struct nfs_read_data *data = calldata;
-
-	nfs_readpage_result(task, data);
+	dprintk("NFS: direct read done (%s/%lld %d@%lld)\n",
+		req->wb_context->dentry->d_inode->i_sb->s_id,
+		(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+		req->wb_bytes,
+		(long long)req_offset(req));
+	nfs_release_request(req);
 }
 
-static void nfs_direct_read_release(void *calldata)
+static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 {
+	unsigned long bytes = 0;
+	struct nfs_direct_req *dreq = hdr->dreq;
 
-	struct nfs_read_data *data = calldata;
-	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
-	int status = data->task.tk_status;
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+		goto out_put;
 
 	spin_lock(&dreq->lock);
-	if (unlikely(status < 0)) {
-		dreq->error = status;
-		spin_unlock(&dreq->lock);
-	} else {
-		dreq->count += data->res.count;
-		spin_unlock(&dreq->lock);
-		nfs_direct_dirty_pages(data->pagevec,
-				data->args.pgbase,
-				data->res.count);
-	}
-	nfs_direct_release_pages(data->pagevec, data->npages);
+	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
+		dreq->error = hdr->error;
+	else
+		dreq->count += hdr->good_bytes;
+	spin_unlock(&dreq->lock);
 
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+		struct page *page = req->wb_page;
+
+		if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
+			if (bytes > hdr->good_bytes)
+				zero_user(page, 0, PAGE_SIZE);
+			else if (hdr->good_bytes - bytes < PAGE_SIZE)
+				zero_user_segment(page,
+					hdr->good_bytes & ~PAGE_MASK,
+					PAGE_SIZE);
+		}
+		if (!PageCompound(page)) {
+			if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
+				if (bytes < hdr->good_bytes)
+					set_page_dirty(page);
+			} else
+				set_page_dirty(page);
+		}
+		bytes += req->wb_bytes;
+		nfs_list_remove_request(req);
+		nfs_direct_readpage_release(req);
+	}
+out_put:
 	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
-	nfs_readdata_free(data);
+	hdr->release(hdr);
+}
+
+static void nfs_read_sync_pgio_error(struct list_head *head)
+{
+	struct nfs_page *req;
+
+	while (!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_release_request(req);
+	}
 }
 
-static const struct rpc_call_ops nfs_read_direct_ops = {
-	.rpc_call_prepare = nfs_read_prepare,
-	.rpc_call_done = nfs_direct_read_result,
-	.rpc_release = nfs_direct_read_release,
+static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
+{
+	get_dreq(hdr->dreq);
+}
+
+static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
+	.error_cleanup = nfs_read_sync_pgio_error,
+	.init_hdr = nfs_direct_pgio_init,
+	.completion = nfs_direct_read_completion,
 };
 
 /*
@@ -276,107 +301,82 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
  * handled automatically by nfs_direct_read_result().  Otherwise, if
  * no requests have been sent, just return an error.
  */
-static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
+static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
 						const struct iovec *iov,
 						loff_t pos)
 {
+	struct nfs_direct_req *dreq = desc->pg_dreq;
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	unsigned long user_addr = (unsigned long)iov->iov_base;
 	size_t count = iov->iov_len;
 	size_t rsize = NFS_SERVER(inode)->rsize;
-	struct rpc_task *task;
-	struct rpc_message msg = {
-		.rpc_cred = ctx->cred,
-	};
-	struct rpc_task_setup task_setup_data = {
-		.rpc_client = NFS_CLIENT(inode),
-		.rpc_message = &msg,
-		.callback_ops = &nfs_read_direct_ops,
-		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC,
-	};
 	unsigned int pgbase;
 	int result;
 	ssize_t started = 0;
+	struct page **pagevec = NULL;
+	unsigned int npages;
 
 	do {
-		struct nfs_read_data *data;
 		size_t bytes;
+		int i;
 
 		pgbase = user_addr & ~PAGE_MASK;
-		bytes = min(rsize,count);
+		bytes = min(max_t(size_t, rsize, PAGE_SIZE), count);
 
 		result = -ENOMEM;
-		data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes));
-		if (unlikely(!data))
+		npages = nfs_page_array_len(pgbase, bytes);
+		if (!pagevec)
+			pagevec = kmalloc(npages * sizeof(struct page *),
+					  GFP_KERNEL);
+		if (!pagevec)
 			break;
-
 		down_read(&current->mm->mmap_sem);
 		result = get_user_pages(current, current->mm, user_addr,
-					data->npages, 1, 0, data->pagevec, NULL);
+					npages, 1, 0, pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
-		if (result < 0) {
-			nfs_readdata_free(data);
+		if (result < 0)
 			break;
-		}
-		if ((unsigned)result < data->npages) {
+		if ((unsigned)result < npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
-				nfs_direct_release_pages(data->pagevec, result);
-				nfs_readdata_free(data);
+				nfs_direct_release_pages(pagevec, result);
 				break;
 			}
 			bytes -= pgbase;
-			data->npages = result;
+			npages = result;
 		}
 
-		get_dreq(dreq);
-
-		data->req = (struct nfs_page *) dreq;
-		data->inode = inode;
-		data->cred = msg.rpc_cred;
-		data->args.fh = NFS_FH(inode);
-		data->args.context = ctx;
-		data->args.lock_context = dreq->l_ctx;
-		data->args.offset = pos;
-		data->args.pgbase = pgbase;
-		data->args.pages = data->pagevec;
-		data->args.count = bytes;
-		data->res.fattr = &data->fattr;
-		data->res.eof = 0;
-		data->res.count = bytes;
-		nfs_fattr_init(&data->fattr);
-		msg.rpc_argp = &data->args;
-		msg.rpc_resp = &data->res;
-
-		task_setup_data.task = &data->task;
-		task_setup_data.callback_data = data;
-		NFS_PROTO(inode)->read_setup(data, &msg);
-
-		task = rpc_run_task(&task_setup_data);
-		if (IS_ERR(task))
-			break;
-		rpc_put_task(task);
-
-		dprintk("NFS: %5u initiated direct read call "
-			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
-				data->task.tk_pid,
-				inode->i_sb->s_id,
-				(long long)NFS_FILEID(inode),
-				bytes,
-				(unsigned long long)data->args.offset);
-
-		started += bytes;
-		user_addr += bytes;
-		pos += bytes;
-		/* FIXME: Remove this unnecessary math from final patch */
-		pgbase += bytes;
-		pgbase &= ~PAGE_MASK;
-		BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
-
-		count -= bytes;
-	} while (count != 0);
+		for (i = 0; i < npages; i++) {
+			struct nfs_page *req;
+			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
+			/* XXX do we need to do the eof zeroing found in async_filler? */
+			req = nfs_create_request(dreq->ctx, dreq->inode,
+						 pagevec[i],
+						 pgbase, req_len);
+			if (IS_ERR(req)) {
+				result = PTR_ERR(req);
+				break;
+			}
+			req->wb_index = pos >> PAGE_SHIFT;
+			req->wb_offset = pos & ~PAGE_MASK;
+			if (!nfs_pageio_add_request(desc, req)) {
+				result = desc->pg_error;
+				nfs_release_request(req);
+				break;
+			}
+			pgbase = 0;
+			bytes -= req_len;
+			started += req_len;
+			user_addr += req_len;
+			pos += req_len;
+			count -= req_len;
+		}
+		/* The nfs_page now hold references to these pages */
+		nfs_direct_release_pages(pagevec, npages);
+	} while (count != 0 && result >= 0);
+
+	kfree(pagevec);
 
 	if (started)
 		return started;
@@ -388,15 +388,19 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 					      unsigned long nr_segs,
 					      loff_t pos)
 {
+	struct nfs_pageio_descriptor desc;
 	ssize_t result = -EINVAL;
 	size_t requested_bytes = 0;
 	unsigned long seg;
 
+	nfs_pageio_init_read(&desc, dreq->inode,
+			     &nfs_direct_read_completion_ops);
 	get_dreq(dreq);
+	desc.pg_dreq = dreq;
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
-		result = nfs_direct_read_schedule_segment(dreq, vec, pos);
+		result = nfs_direct_read_schedule_segment(&desc, vec, pos);
 		if (result < 0)
 			break;
 		requested_bytes += result;
@@ -405,6 +409,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 		pos += vec->iov_len;
 	}
 
+	nfs_pageio_complete(&desc);
+
 	/*
 	 * If no bytes were started, return the error, and let the
 	 * generic layer handle the completion.
@@ -441,104 +447,64 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
 		result = nfs_direct_wait(dreq);
+	NFS_I(inode)->read_io += result;
 out_release:
 	nfs_direct_req_release(dreq);
 out:
 	return result;
 }
 
-static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
-{
-	while (!list_empty(&dreq->rewrite_list)) {
-		struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
-		list_del(&data->pages);
-		nfs_direct_release_pages(data->pagevec, data->npages);
-		nfs_writedata_free(data);
-	}
-}
-
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
-	struct inode *inode = dreq->inode;
-	struct list_head *p;
-	struct nfs_write_data *data;
-	struct rpc_task *task;
-	struct rpc_message msg = {
-		.rpc_cred = dreq->ctx->cred,
-	};
-	struct rpc_task_setup task_setup_data = {
-		.rpc_client = NFS_CLIENT(inode),
-		.rpc_message = &msg,
-		.callback_ops = &nfs_write_direct_ops,
-		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC,
-	};
+	struct nfs_pageio_descriptor desc;
+	struct nfs_page *req, *tmp;
+	LIST_HEAD(reqs);
+	struct nfs_commit_info cinfo;
+	LIST_HEAD(failed);
+
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+	pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo);
+	spin_lock(cinfo.lock);
+	nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0);
+	spin_unlock(cinfo.lock);
 
 	dreq->count = 0;
 	get_dreq(dreq);
 
-	list_for_each(p, &dreq->rewrite_list) {
-		data = list_entry(p, struct nfs_write_data, pages);
-
-		get_dreq(dreq);
-
-		/* Use stable writes */
-		data->args.stable = NFS_FILE_SYNC;
-
-		/*
-		 * Reset data->res.
-		 */
-		nfs_fattr_init(&data->fattr);
-		data->res.count = data->args.count;
-		memset(&data->verf, 0, sizeof(data->verf));
-
-		/*
-		 * Reuse data->task; data->args should not have changed
-		 * since the original request was sent.
-		 */
-		task_setup_data.task = &data->task;
-		task_setup_data.callback_data = data;
-		msg.rpc_argp = &data->args;
-		msg.rpc_resp = &data->res;
-		NFS_PROTO(inode)->write_setup(data, &msg);
-
-		/*
-		 * We're called via an RPC callback, so BKL is already held.
-		 */
-		task = rpc_run_task(&task_setup_data);
-		if (!IS_ERR(task))
-			rpc_put_task(task);
-
-		dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
-				data->task.tk_pid,
-				inode->i_sb->s_id,
-				(long long)NFS_FILEID(inode),
-				data->args.count,
-				(unsigned long long)data->args.offset);
+	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE,
+			      &nfs_direct_write_completion_ops);
+	desc.pg_dreq = dreq;
+
+	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
+		if (!nfs_pageio_add_request(&desc, req)) {
+			nfs_list_add_request(req, &failed);
+			spin_lock(cinfo.lock);
+			dreq->flags = 0;
+			dreq->error = -EIO;
+			spin_unlock(cinfo.lock);
+		}
 	}
+	nfs_pageio_complete(&desc);
 
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, inode);
-}
-
-static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
-{
-	struct nfs_write_data *data = calldata;
+	while (!list_empty(&failed))
+		nfs_unlock_and_release_request(req);
 
-	/* Call the NFS version-specific code */
-	NFS_PROTO(data->inode)->commit_done(task, data);
+	if (put_dreq(dreq))
+		nfs_direct_write_complete(dreq, dreq->inode);
 }
 
-static void nfs_direct_commit_release(void *calldata)
+static void nfs_direct_commit_complete(struct nfs_commit_data *data)
 {
-	struct nfs_write_data *data = calldata;
-	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+	struct nfs_direct_req *dreq = data->dreq;
+	struct nfs_commit_info cinfo;
+	struct nfs_page *req;
 	int status = data->task.tk_status;
 
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
 	if (status < 0) {
 		dprintk("NFS: %5u commit failed with error %d.\n",
-				data->task.tk_pid, status);
+			data->task.tk_pid, status);
 		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 	} else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
 		dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
@@ -546,62 +512,47 @@ static void nfs_direct_commit_release(void *calldata)
 	}
 
 	dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
-	nfs_direct_write_complete(dreq, data->inode);
-	nfs_commit_free(data);
+	while (!list_empty(&data->pages)) {
+		req = nfs_list_entry(data->pages.next);
+		nfs_list_remove_request(req);
+		if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
+			/* Note the rewrite will go through mds */
+			kref_get(&req->wb_kref);
+			nfs_mark_request_commit(req, NULL, &cinfo);
+		}
+		nfs_unlock_and_release_request(req);
+	}
+
+	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
+		nfs_direct_write_complete(dreq, data->inode);
+}
+
+static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
+{
+	/* There is no lock to clear */
 }
 
-static const struct rpc_call_ops nfs_commit_direct_ops = {
-	.rpc_call_prepare = nfs_write_prepare,
-	.rpc_call_done = nfs_direct_commit_result,
-	.rpc_release = nfs_direct_commit_release,
+static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
+	.completion = nfs_direct_commit_complete,
+	.error_cleanup = nfs_direct_error_cleanup,
 };
 
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 {
-	struct nfs_write_data *data = dreq->commit_data;
-	struct rpc_task *task;
-	struct rpc_message msg = {
-		.rpc_argp = &data->args,
-		.rpc_resp = &data->res,
-		.rpc_cred = dreq->ctx->cred,
-	};
-	struct rpc_task_setup task_setup_data = {
-		.task = &data->task,
-		.rpc_client = NFS_CLIENT(dreq->inode),
-		.rpc_message = &msg,
-		.callback_ops = &nfs_commit_direct_ops,
-		.callback_data = data,
-		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC,
-	};
-
-	data->inode = dreq->inode;
-	data->cred = msg.rpc_cred;
-
-	data->args.fh = NFS_FH(data->inode);
-	data->args.offset = 0;
-	data->args.count = 0;
-	data->args.context = dreq->ctx;
-	data->args.lock_context = dreq->l_ctx;
-	data->res.count = 0;
-	data->res.fattr = &data->fattr;
-	data->res.verf = &data->verf;
-	nfs_fattr_init(&data->fattr);
-
-	NFS_PROTO(data->inode)->commit_setup(data, &msg);
-
-	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
-	dreq->commit_data = NULL;
-
-	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
-
-	task = rpc_run_task(&task_setup_data);
-	if (!IS_ERR(task))
-		rpc_put_task(task);
+	int res;
+	struct nfs_commit_info cinfo;
+	LIST_HEAD(mds_list);
+
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
+	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
+	if (res < 0) /* res == -ENOMEM */
+		nfs_direct_write_reschedule(dreq);
 }
 
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+static void nfs_direct_write_schedule_work(struct work_struct *work)
 {
+	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
 	int flags = dreq->flags;
 
 	dreq->flags = 0;
@@ -613,89 +564,32 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 			nfs_direct_write_reschedule(dreq);
 			break;
 		default:
-			if (dreq->commit_data != NULL)
-				nfs_commit_free(dreq->commit_data);
-			nfs_direct_free_writedata(dreq);
-			nfs_zap_mapping(inode, inode->i_mapping);
+			nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
 			nfs_direct_complete(dreq);
 	}
 }
 
-static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 {
-	dreq->commit_data = nfs_commitdata_alloc();
-	if (dreq->commit_data != NULL)
-		dreq->commit_data->req = (struct nfs_page *) dreq;
+	schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
 }
+
 #else
-static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
+static void nfs_direct_write_schedule_work(struct work_struct *work)
 {
-	dreq->commit_data = NULL;
 }
 
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 {
-	nfs_direct_free_writedata(dreq);
 	nfs_zap_mapping(inode, inode->i_mapping);
 	nfs_direct_complete(dreq);
 }
 #endif
 
-static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
-{
-	struct nfs_write_data *data = calldata;
-
-	nfs_writeback_done(task, data);
-}
-
 /*
  * NB: Return the value of the first error return code.  Subsequent
  *     errors after the first one are ignored.
  */
-static void nfs_direct_write_release(void *calldata)
-{
-	struct nfs_write_data *data = calldata;
-	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
-	int status = data->task.tk_status;
-
-	spin_lock(&dreq->lock);
-
-	if (unlikely(status < 0)) {
-		/* An error has occurred, so we should not commit */
-		dreq->flags = 0;
-		dreq->error = status;
-	}
-	if (unlikely(dreq->error != 0))
-		goto out_unlock;
-
-	dreq->count += data->res.count;
-
-	if (data->res.verf->committed != NFS_FILE_SYNC) {
-		switch (dreq->flags) {
-			case 0:
-				memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
-				dreq->flags = NFS_ODIRECT_DO_COMMIT;
-				break;
-			case NFS_ODIRECT_DO_COMMIT:
-				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
-					dprintk("NFS: %5u write verify failed\n", data->task.tk_pid);
-					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-				}
-		}
-	}
-out_unlock:
-	spin_unlock(&dreq->lock);
-
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, data->inode);
-}
-
-static const struct rpc_call_ops nfs_write_direct_ops = {
-	.rpc_call_prepare = nfs_write_prepare,
-	.rpc_call_done = nfs_direct_write_result,
-	.rpc_release = nfs_direct_write_release,
-};
-
 /*
  * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
  * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
@@ -703,132 +597,187 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
  * handled automatically by nfs_direct_write_result().  Otherwise, if
  * no requests have been sent, just return an error.
  */
-static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
+static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
 						 const struct iovec *iov,
-						 loff_t pos, int sync)
+						 loff_t pos)
 {
+	struct nfs_direct_req *dreq = desc->pg_dreq;
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	unsigned long user_addr = (unsigned long)iov->iov_base;
 	size_t count = iov->iov_len;
-	struct rpc_task *task;
-	struct rpc_message msg = {
-		.rpc_cred = ctx->cred,
-	};
-	struct rpc_task_setup task_setup_data = {
-		.rpc_client = NFS_CLIENT(inode),
-		.rpc_message = &msg,
-		.callback_ops = &nfs_write_direct_ops,
-		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC,
-	};
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	unsigned int pgbase;
 	int result;
 	ssize_t started = 0;
+	struct page **pagevec = NULL;
+	unsigned int npages;
 
 	do {
-		struct nfs_write_data *data;
 		size_t bytes;
+		int i;
 
 		pgbase = user_addr & ~PAGE_MASK;
-		bytes = min(wsize,count);
+		bytes = min(max_t(size_t, wsize, PAGE_SIZE), count);
 
 		result = -ENOMEM;
-		data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes));
-		if (unlikely(!data))
+		npages = nfs_page_array_len(pgbase, bytes);
+		if (!pagevec)
+			pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL);
+		if (!pagevec)
 			break;
 
 		down_read(&current->mm->mmap_sem);
 		result = get_user_pages(current, current->mm, user_addr,
-					data->npages, 0, 0, data->pagevec, NULL);
+					npages, 0, 0, pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
-		if (result < 0) {
-			nfs_writedata_free(data);
+		if (result < 0)
 			break;
-		}
-		if ((unsigned)result < data->npages) {
+
+		if ((unsigned)result < npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
-				nfs_direct_release_pages(data->pagevec, result);
-				nfs_writedata_free(data);
+				nfs_direct_release_pages(pagevec, result);
 				break;
 			}
 			bytes -= pgbase;
-			data->npages = result;
+			npages = result;
 		}
 
-		get_dreq(dreq);
-
-		list_move_tail(&data->pages, &dreq->rewrite_list);
-
-		data->req = (struct nfs_page *) dreq;
-		data->inode = inode;
-		data->cred = msg.rpc_cred;
-		data->args.fh = NFS_FH(inode);
-		data->args.context = ctx;
-		data->args.lock_context = dreq->l_ctx;
-		data->args.offset = pos;
-		data->args.pgbase = pgbase;
-		data->args.pages = data->pagevec;
-		data->args.count = bytes;
-		data->args.stable = sync;
-		data->res.fattr = &data->fattr;
-		data->res.count = bytes;
-		data->res.verf = &data->verf;
-		nfs_fattr_init(&data->fattr);
-
-		task_setup_data.task = &data->task;
-		task_setup_data.callback_data = data;
-		msg.rpc_argp = &data->args;
-		msg.rpc_resp = &data->res;
-		NFS_PROTO(inode)->write_setup(data, &msg);
-
-		task = rpc_run_task(&task_setup_data);
-		if (IS_ERR(task))
-			break;
-		rpc_put_task(task);
-
-		dprintk("NFS: %5u initiated direct write call "
-			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
-				data->task.tk_pid,
-				inode->i_sb->s_id,
-				(long long)NFS_FILEID(inode),
-				bytes,
-				(unsigned long long)data->args.offset);
+		for (i = 0; i < npages; i++) {
+			struct nfs_page *req;
+			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
 
-		started += bytes;
-		user_addr += bytes;
-		pos += bytes;
-
-		/* FIXME: Remove this useless math from the final patch */
-		pgbase += bytes;
-		pgbase &= ~PAGE_MASK;
-		BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
+			req = nfs_create_request(dreq->ctx, dreq->inode,
+						 pagevec[i],
+						 pgbase, req_len);
+			if (IS_ERR(req)) {
+				result = PTR_ERR(req);
+				break;
+			}
+			nfs_lock_request(req);
+			req->wb_index = pos >> PAGE_SHIFT;
+			req->wb_offset = pos & ~PAGE_MASK;
+			if (!nfs_pageio_add_request(desc, req)) {
+				result = desc->pg_error;
+				nfs_unlock_and_release_request(req);
+				break;
+			}
+			pgbase = 0;
+			bytes -= req_len;
+			started += req_len;
+			user_addr += req_len;
+			pos += req_len;
+			count -= req_len;
+		}
+		/* The nfs_page now hold references to these pages */
+		nfs_direct_release_pages(pagevec, npages);
+	} while (count != 0 && result >= 0);
 
-		count -= bytes;
-	} while (count != 0);
+	kfree(pagevec);
 
 	if (started)
 		return started;
 	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
+static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
+{
+	struct nfs_direct_req *dreq = hdr->dreq;
+	struct nfs_commit_info cinfo;
+	int bit = -1;
+	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+		goto out_put;
+
+	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+
+	spin_lock(&dreq->lock);
+
+	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
+		dreq->flags = 0;
+		dreq->error = hdr->error;
+	}
+	if (dreq->error != 0)
+		bit = NFS_IOHDR_ERROR;
+	else {
+		dreq->count += hdr->good_bytes;
+		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
+			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+			bit = NFS_IOHDR_NEED_RESCHED;
+		} else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+			if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
+				bit = NFS_IOHDR_NEED_RESCHED;
+			else if (dreq->flags == 0) {
+				memcpy(&dreq->verf, &req->wb_verf,
+				       sizeof(dreq->verf));
+				bit = NFS_IOHDR_NEED_COMMIT;
+				dreq->flags = NFS_ODIRECT_DO_COMMIT;
+			} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
+				if (memcmp(&dreq->verf, &req->wb_verf, sizeof(dreq->verf))) {
+					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+					bit = NFS_IOHDR_NEED_RESCHED;
+				} else
+					bit = NFS_IOHDR_NEED_COMMIT;
+			}
+		}
+	}
+	spin_unlock(&dreq->lock);
+
+	while (!list_empty(&hdr->pages)) {
+		req = nfs_list_entry(hdr->pages.next);
+		nfs_list_remove_request(req);
+		switch (bit) {
+		case NFS_IOHDR_NEED_RESCHED:
+		case NFS_IOHDR_NEED_COMMIT:
+			kref_get(&req->wb_kref);
+			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
+		}
+		nfs_unlock_and_release_request(req);
+	}
+
+out_put:
+	if (put_dreq(dreq))
+		nfs_direct_write_complete(dreq, hdr->inode);
+	hdr->release(hdr);
+}
+
+static void nfs_write_sync_pgio_error(struct list_head *head)
+{
+	struct nfs_page *req;
+
+	while (!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_unlock_and_release_request(req);
+	}
+}
+
+static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
+	.error_cleanup = nfs_write_sync_pgio_error,
+	.init_hdr = nfs_direct_pgio_init,
+	.completion = nfs_direct_write_completion,
+};
+
 static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 					       const struct iovec *iov,
 					       unsigned long nr_segs,
-					       loff_t pos, int sync)
+					       loff_t pos)
 {
+	struct nfs_pageio_descriptor desc;
 	ssize_t result = 0;
 	size_t requested_bytes = 0;
 	unsigned long seg;
 
+	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_COND_STABLE,
+			      &nfs_direct_write_completion_ops);
+	desc.pg_dreq = dreq;
 	get_dreq(dreq);
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
-		result = nfs_direct_write_schedule_segment(dreq, vec,
-							   pos, sync);
+		result = nfs_direct_write_schedule_segment(&desc, vec, pos);
 		if (result < 0)
 			break;
 		requested_bytes += result;
@@ -836,6 +785,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 			break;
 		pos += vec->iov_len;
 	}
+	nfs_pageio_complete(&desc);
+	NFS_I(dreq->inode)->write_io += desc.pg_bytes_written;
 
 	/*
 	 * If no bytes were started, return the error, and let the
@@ -858,16 +809,10 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t result = -ENOMEM;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct nfs_direct_req *dreq;
-	size_t wsize = NFS_SERVER(inode)->wsize;
-	int sync = NFS_UNSTABLE;
 
 	dreq = nfs_direct_req_alloc();
 	if (!dreq)
 		goto out;
-	nfs_alloc_commit_data(dreq);
-
-	if (dreq->commit_data == NULL || count <= wsize)
-		sync = NFS_FILE_SYNC;
 
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
@@ -877,7 +822,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
-	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
+	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
 		result = nfs_direct_wait(dreq);
 out_release:
@@ -997,10 +942,15 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	task_io_account_write(count);
 
 	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
+	if (retval > 0) {
+		struct inode *inode = mapping->host;
 
-	if (retval > 0)
 		iocb->ki_pos = pos + retval;
-
+		spin_lock(&inode->i_lock);
+		if (i_size_read(inode) < iocb->ki_pos)
+			i_size_write(inode, iocb->ki_pos);
+		spin_unlock(&inode->i_lock);
+	}
 out:
 	return retval;
 }
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index aa9b709fd328..56311ca5f9f8 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -174,6 +174,13 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 	if ((file->f_mode & FMODE_WRITE) == 0)
 		return 0;
 
+	/*
+	 * If we're holding a write delegation, then just start the i/o
+	 * but don't wait for completion (or send a commit).
+	 */
+	if (nfs_have_delegation(inode, FMODE_WRITE))
+		return filemap_fdatawrite(file->f_mapping);
+
 	/* Flush writes to the server and return any errors */
 	return vfs_fsync(file, 0);
 }
@@ -417,6 +424,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 
 	if (status < 0)
 		return status;
+	NFS_I(mapping->host)->write_io += copied;
 	return copied;
 }
 
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index ae65c16b3670..c817787fbdb4 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -64,23 +64,12 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp)
  * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
  * superblock across an automount point of some nature.
  */
-void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq,
-				  struct nfs_clone_mount *mntdata)
+void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen)
 {
 	struct nfs_fscache_key *key, *xkey;
 	struct nfs_server *nfss = NFS_SB(sb);
 	struct rb_node **p, *parent;
-	int diff, ulen;
-
-	if (uniq) {
-		ulen = strlen(uniq);
-	} else if (mntdata) {
-		struct nfs_server *mnt_s = NFS_SB(mntdata->sb);
-		if (mnt_s->fscache_key) {
-			uniq = mnt_s->fscache_key->key.uniquifier;
-			ulen = mnt_s->fscache_key->key.uniq_len;
-		}
-	}
+	int diff;
 
 	if (!uniq) {
 		uniq = "";
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index b9c572d0679f..c5b11b53ff33 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -73,9 +73,7 @@ extern void nfs_fscache_unregister(void);
 extern void nfs_fscache_get_client_cookie(struct nfs_client *);
 extern void nfs_fscache_release_client_cookie(struct nfs_client *);
 
-extern void nfs_fscache_get_super_cookie(struct super_block *,
-					 const char *,
-					 struct nfs_clone_mount *);
+extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int);
 extern void nfs_fscache_release_super_cookie(struct super_block *);
 
 extern void nfs_fscache_init_inode_cookie(struct inode *);
@@ -172,12 +170,6 @@ static inline void nfs_fscache_unregister(void) {}
 static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
 static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
 
-static inline void nfs_fscache_get_super_cookie(
-	struct super_block *sb,
-	const char *uniq,
-	struct nfs_clone_mount *mntdata)
-{
-}
 static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
 
 static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 4ca6f5c8038e..8abfb19bd3aa 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -150,7 +150,7 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
 		goto out;
 
 	/* Start by getting the root filehandle from the server */
-	ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
+	ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo);
 	if (ret < 0) {
 		dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
 		goto out;
@@ -178,87 +178,4 @@ out:
 	return ret;
 }
 
-/*
- * get an NFS4 root dentry from the root filehandle
- */
-struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh,
-			     const char *devname)
-{
-	struct nfs_server *server = NFS_SB(sb);
-	struct nfs_fattr *fattr = NULL;
-	struct dentry *ret;
-	struct inode *inode;
-	void *name = kstrdup(devname, GFP_KERNEL);
-	int error;
-
-	dprintk("--> nfs4_get_root()\n");
-
-	if (!name)
-		return ERR_PTR(-ENOMEM);
-
-	/* get the info about the server and filesystem */
-	error = nfs4_server_capabilities(server, mntfh);
-	if (error < 0) {
-		dprintk("nfs_get_root: getcaps error = %d\n",
-			-error);
-		kfree(name);
-		return ERR_PTR(error);
-	}
-
-	fattr = nfs_alloc_fattr();
-	if (fattr == NULL) {
-		kfree(name);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	/* get the actual root for this mount */
-	error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
-	if (error < 0) {
-		dprintk("nfs_get_root: getattr error = %d\n", -error);
-		ret = ERR_PTR(error);
-		goto out;
-	}
-
-	if (fattr->valid & NFS_ATTR_FATTR_FSID &&
-	    !nfs_fsid_equal(&server->fsid, &fattr->fsid))
-		memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
-
-	inode = nfs_fhget(sb, mntfh, fattr);
-	if (IS_ERR(inode)) {
-		dprintk("nfs_get_root: get root inode failed\n");
-		ret = ERR_CAST(inode);
-		goto out;
-	}
-
-	error = nfs_superblock_set_dummy_root(sb, inode);
-	if (error != 0) {
-		ret = ERR_PTR(error);
-		goto out;
-	}
-
-	/* root dentries normally start off anonymous and get spliced in later
-	 * if the dentry tree reaches them; however if the dentry already
-	 * exists, we'll pick it up at this point and use it as the root
-	 */
-	ret = d_obtain_alias(inode);
-	if (IS_ERR(ret)) {
-		dprintk("nfs_get_root: get root dentry failed\n");
-		goto out;
-	}
-
-	security_d_instantiate(ret, inode);
-	spin_lock(&ret->d_lock);
-	if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
-		ret->d_fsdata = name;
-		name = NULL;
-	}
-	spin_unlock(&ret->d_lock);
-out:
-	if (name)
-		kfree(name);
-	nfs_free_fattr(fattr);
-	dprintk("<-- nfs4_get_root()\n");
-	return ret;
-}
-
 #endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index ba3019f5934c..b5b86a05059c 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -415,7 +415,7 @@ static int __nfs_idmap_register(struct dentry *dir,
 static void nfs_idmap_unregister(struct nfs_client *clp,
 				      struct rpc_pipe *pipe)
 {
-	struct net *net = clp->net;
+	struct net *net = clp->cl_net;
 	struct super_block *pipefs_sb;
 
 	pipefs_sb = rpc_get_sb_net(net);
@@ -429,7 +429,7 @@ static int nfs_idmap_register(struct nfs_client *clp,
 				   struct idmap *idmap,
 				   struct rpc_pipe *pipe)
 {
-	struct net *net = clp->net;
+	struct net *net = clp->cl_net;
 	struct super_block *pipefs_sb;
 	int err = 0;
 
@@ -530,9 +530,25 @@ static struct nfs_client *nfs_get_client_for_event(struct net *net, int event)
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
 	struct dentry *cl_dentry;
 	struct nfs_client *clp;
+	int err;
 
+restart:
 	spin_lock(&nn->nfs_client_lock);
 	list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
+		/* Wait for initialisation to finish */
+		if (clp->cl_cons_state == NFS_CS_INITING) {
+			atomic_inc(&clp->cl_count);
+			spin_unlock(&nn->nfs_client_lock);
+			err = nfs_wait_client_init_complete(clp);
+			nfs_put_client(clp);
+			if (err)
+				return NULL;
+			goto restart;
+		}
+		/* Skip nfs_clients that failed to initialise */
+		if (clp->cl_cons_state < 0)
+			continue;
+		smp_rmb();
 		if (clp->rpc_ops != &nfs_v4_clientops)
 			continue;
 		cl_dentry = clp->cl_idmap->idmap_pipe->dentry;
@@ -640,20 +656,16 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
 	struct idmap_msg *im;
 	struct idmap *idmap = (struct idmap *)aux;
 	struct key *key = cons->key;
-	int ret;
+	int ret = -ENOMEM;
 
 	/* msg and im are freed in idmap_pipe_destroy_msg */
 	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
-	if (IS_ERR(msg)) {
-		ret = PTR_ERR(msg);
+	if (!msg)
 		goto out0;
-	}
 
 	im = kmalloc(sizeof(*im), GFP_KERNEL);
-	if (IS_ERR(im)) {
-		ret = PTR_ERR(im);
+	if (!im)
 		goto out1;
-	}
 
 	ret = nfs_idmap_prepare_message(key->description, im, msg);
 	if (ret < 0)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c6073139b402..2f6f78c4b42d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -285,9 +285,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		inode->i_mode = fattr->mode;
 		if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
 				&& nfs_server_capable(inode, NFS_CAP_MODE))
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_ACCESS
-				| NFS_INO_INVALID_ACL;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		/* Why so? Because we want revalidate for devices/FIFOs, and
 		 * that's precisely what we have in nfs_file_inode_operations.
 		 */
@@ -300,8 +298,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
 			inode->i_fop = &nfs_dir_operations;
 			inode->i_data.a_ops = &nfs_dir_aops;
-			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
-				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			/* Deal with crossing mountpoints */
 			if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
 					fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
@@ -327,6 +323,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		inode->i_gid = -2;
 		inode->i_blocks = 0;
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+		nfsi->write_io = 0;
+		nfsi->read_io = 0;
 
 		nfsi->read_cache_jiffies = fattr->time_start;
 		nfsi->attr_gencount = fattr->gencount;
@@ -337,24 +335,19 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		if (fattr->valid & NFS_ATTR_FATTR_MTIME)
 			inode->i_mtime = fattr->mtime;
 		else if (nfs_server_capable(inode, NFS_CAP_MTIME))
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_DATA;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_CTIME)
 			inode->i_ctime = fattr->ctime;
 		else if (nfs_server_capable(inode, NFS_CAP_CTIME))
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_ACCESS
-				| NFS_INO_INVALID_ACL;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
 			inode->i_version = fattr->change_attr;
 		else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_DATA;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_SIZE)
 			inode->i_size = nfs_size_to_loff_t(fattr->size);
 		else
 			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_DATA
 				| NFS_INO_REVAL_PAGECACHE;
 		if (fattr->valid & NFS_ATTR_FATTR_NLINK)
 			set_nlink(inode, fattr->nlink);
@@ -363,15 +356,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		if (fattr->valid & NFS_ATTR_FATTR_OWNER)
 			inode->i_uid = fattr->uid;
 		else if (nfs_server_capable(inode, NFS_CAP_OWNER))
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_ACCESS
-				| NFS_INO_INVALID_ACL;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_GROUP)
 			inode->i_gid = fattr->gid;
 		else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
-			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_ACCESS
-				| NFS_INO_INVALID_ACL;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
 			inode->i_blocks = fattr->du.nfs2.blocks;
 		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -654,6 +643,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f
 	nfs_init_lock_context(&ctx->lock_context);
 	ctx->lock_context.open_context = ctx;
 	INIT_LIST_HEAD(&ctx->list);
+	ctx->mdsthreshold = NULL;
 	return ctx;
 }
 
@@ -682,6 +672,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 		put_rpccred(ctx->cred);
 	dput(ctx->dentry);
 	nfs_sb_deactive(sb);
+	kfree(ctx->mdsthreshold);
 	kfree(ctx);
 }
 
@@ -870,6 +861,15 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 	return 0;
 }
 
+static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
+{
+	if (nfs_have_delegated_attributes(inode))
+		return false;
+	return (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE)
+		|| nfs_attribute_timeout(inode)
+		|| NFS_STALE(inode);
+}
+
 /**
  * nfs_revalidate_mapping - Revalidate the pagecache
  * @inode - pointer to host inode
@@ -880,9 +880,7 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int ret = 0;
 
-	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
-			|| nfs_attribute_cache_expired(inode)
-			|| NFS_STALE(inode)) {
+	if (nfs_mapping_need_revalidate_inode(inode)) {
 		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 		if (ret < 0)
 			goto out;
@@ -948,6 +946,8 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	unsigned long invalid = 0;
 
 
+	if (nfs_have_delegated_attributes(inode))
+		return 0;
 	/* Has the inode gone and changed behind our back? */
 	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
 		return -EIO;
@@ -960,7 +960,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 
 	/* Verify a few of the more important attributes */
 	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+		invalid |= NFS_INO_INVALID_ATTR;
 
 	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
 		cur_size = i_size_read(inode);
@@ -1279,14 +1279,26 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			nfs_display_fhandle_hash(NFS_FH(inode)),
 			atomic_read(&inode->i_count), fattr->valid);
 
-	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
-		goto out_fileid;
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) {
+		printk(KERN_ERR "NFS: server %s error: fileid changed\n"
+			"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
+			NFS_SERVER(inode)->nfs_client->cl_hostname,
+			inode->i_sb->s_id, (long long)nfsi->fileid,
+			(long long)fattr->fileid);
+		goto out_err;
+	}
 
 	/*
 	 * Make sure the inode's type hasn't changed.
 	 */
-	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
-		goto out_changed;
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+		/*
+		* Big trouble! The inode has become a different object.
+		*/
+		printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n",
+				__func__, inode->i_ino, inode->i_mode, fattr->mode);
+		goto out_err;
+	}
 
 	server = NFS_SERVER(inode);
 	/* Update the fsid? */
@@ -1314,7 +1326,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		if (inode->i_version != fattr->change_attr) {
 			dprintk("NFS: change_attr change on server for file %s/%ld\n",
 					inode->i_sb->s_id, inode->i_ino);
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			invalid |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_DATA
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_PAGECACHE;
 			if (S_ISDIR(inode->i_mode))
 				nfs_force_lookup_revalidate(inode);
 			inode->i_version = fattr->change_attr;
@@ -1323,38 +1339,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		invalid |= save_cache_validity;
 
 	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
-		/* NFSv2/v3: Check if the mtime agrees */
-		if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
-			dprintk("NFS: mtime change on server for file %s/%ld\n",
-					inode->i_sb->s_id, inode->i_ino);
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
-			if (S_ISDIR(inode->i_mode))
-				nfs_force_lookup_revalidate(inode);
-			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-		}
+		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
 	} else if (server->caps & NFS_CAP_MTIME)
 		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_DATA
-				| NFS_INO_REVAL_PAGECACHE
 				| NFS_INO_REVAL_FORCED);
 
 	if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
-		/* If ctime has changed we should definitely clear access+acl caches */
-		if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-			/* and probably clear data for a directory too as utimes can cause
-			 * havoc with our cache.
-			 */
-			if (S_ISDIR(inode->i_mode)) {
-				invalid |= NFS_INO_INVALID_DATA;
-				nfs_force_lookup_revalidate(inode);
-			}
-			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-		}
+		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
 	} else if (server->caps & NFS_CAP_CTIME)
 		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_ACCESS
-				| NFS_INO_INVALID_ACL
 				| NFS_INO_REVAL_FORCED);
 
 	/* Check if our cached file size is stale */
@@ -1466,12 +1459,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfsi->cache_validity |= invalid;
 
 	return 0;
- out_changed:
-	/*
-	 * Big trouble! The inode has become a different object.
-	 */
-	printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n",
-			__func__, inode->i_ino, inode->i_mode, fattr->mode);
  out_err:
 	/*
 	 * No need to worry about unhashing the dentry, as the
@@ -1480,13 +1467,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	 */
 	nfs_invalidate_inode(inode);
 	return -ESTALE;
-
- out_fileid:
-	printk(KERN_ERR "NFS: server %s error: fileid changed\n"
-		"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
-		NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id,
-		(long long)nfsi->fileid, (long long)fattr->fileid);
-	goto out_err;
 }
 
 
@@ -1547,7 +1527,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
 	nfsi->delegation_state = 0;
 	init_rwsem(&nfsi->rwsem);
 	nfsi->layout = NULL;
-	atomic_set(&nfsi->commits_outstanding, 0);
+	atomic_set(&nfsi->commit_info.rpcs_out, 0);
 #endif
 }
 
@@ -1559,9 +1539,9 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&nfsi->open_files);
 	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
-	INIT_LIST_HEAD(&nfsi->commit_list);
+	INIT_LIST_HEAD(&nfsi->commit_info.list);
 	nfsi->npages = 0;
-	nfsi->ncommit = 0;
+	nfsi->commit_info.ncommit = 0;
 	atomic_set(&nfsi->silly_count, 1);
 	INIT_HLIST_HEAD(&nfsi->silly_list);
 	init_waitqueue_head(&nfsi->waitqueue);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b777bdaba4c5..1848a7275592 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -103,6 +103,7 @@ struct nfs_parsed_mount_data {
 	unsigned int		version;
 	unsigned int		minorversion;
 	char			*fscache_uniq;
+	bool			need_mount;
 
 	struct {
 		struct sockaddr_storage	address;
@@ -167,11 +168,13 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
 					   struct nfs_fh *,
 					   struct nfs_fattr *,
 					   rpc_authflavor_t);
+extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
 extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
-extern int nfs4_check_client_ready(struct nfs_client *clp);
 extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 					     const struct sockaddr *ds_addr,
-					     int ds_addrlen, int ds_proto);
+					     int ds_addrlen, int ds_proto,
+					     unsigned int ds_timeo,
+					     unsigned int ds_retrans);
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
@@ -185,21 +188,11 @@ static inline void nfs_fs_proc_exit(void)
 }
 #endif
 
-/* nfs4namespace.c */
-#ifdef CONFIG_NFS_V4
-extern struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry);
-#else
-static inline
-struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry)
-{
-	return ERR_PTR(-ENOENT);
-}
-#endif
-
 /* callback_xdr.c */
 extern struct svc_version nfs4_callback_version1;
 extern struct svc_version nfs4_callback_version4;
 
+struct nfs_pageio_descriptor;
 /* pagelist.c */
 extern int __init nfs_init_nfspagecache(void);
 extern void nfs_destroy_nfspagecache(void);
@@ -210,9 +203,13 @@ extern void nfs_destroy_writepagecache(void);
 
 extern int __init nfs_init_directcache(void);
 extern void nfs_destroy_directcache(void);
+extern bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount);
+extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
+			      struct nfs_pgio_header *hdr,
+			      void (*release)(struct nfs_pgio_header *hdr));
+void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
 
 /* nfs2xdr.c */
-extern int nfs_stat_to_errno(enum nfs_stat);
 extern struct rpc_procinfo nfs_procedures[];
 extern int nfs2_decode_dirent(struct xdr_stream *,
 				struct nfs_entry *, int);
@@ -237,14 +234,13 @@ extern const u32 nfs41_maxwrite_overhead;
 extern struct rpc_procinfo nfs4_procedures[];
 #endif
 
-extern int nfs4_init_ds_session(struct nfs_client *clp);
+extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
 
 /* proc.c */
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
-extern int nfs_init_client(struct nfs_client *clp,
+extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
 			   const struct rpc_timeout *timeparms,
-			   const char *ip_addr, rpc_authflavor_t authflavour,
-			   int noresvport);
+			   const char *ip_addr, rpc_authflavor_t authflavour);
 
 /* dir.c */
 extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -280,9 +276,10 @@ extern void nfs_sb_deactive(struct super_block *sb);
 extern char *nfs_path(char **p, struct dentry *dentry,
 		      char *buffer, ssize_t buflen);
 extern struct vfsmount *nfs_d_automount(struct path *path);
-#ifdef CONFIG_NFS_V4
-rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
-#endif
+struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *,
+			      struct nfs_fh *, struct nfs_fattr *);
+struct vfsmount *nfs_do_submount(struct dentry *, struct nfs_fh *,
+				 struct nfs_fattr *, rpc_authflavor_t);
 
 /* getroot.c */
 extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
@@ -294,46 +291,73 @@ extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
 extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
 #endif
 
-struct nfs_pageio_descriptor;
+struct nfs_pgio_completion_ops;
 /* read.c */
-extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
-			     const struct rpc_call_ops *call_ops);
+extern struct nfs_read_header *nfs_readhdr_alloc(void);
+extern void nfs_readhdr_free(struct nfs_pgio_header *hdr);
+extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+			struct inode *inode,
+			const struct nfs_pgio_completion_ops *compl_ops);
+extern int nfs_initiate_read(struct rpc_clnt *clnt,
+			     struct nfs_read_data *data,
+			     const struct rpc_call_ops *call_ops, int flags);
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
-		struct list_head *head);
-
+			      struct nfs_pgio_header *hdr);
 extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
-		struct inode *inode);
+			struct inode *inode,
+			const struct nfs_pgio_completion_ops *compl_ops);
 extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_readdata_release(struct nfs_read_data *rdata);
 
 /* write.c */
+extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+			struct inode *inode, int ioflags,
+			const struct nfs_pgio_completion_ops *compl_ops);
+extern struct nfs_write_header *nfs_writehdr_alloc(void);
+extern void nfs_writehdr_free(struct nfs_pgio_header *hdr);
 extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
-		struct list_head *head);
+			     struct nfs_pgio_header *hdr);
 extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
-				  struct inode *inode, int ioflags);
+			struct inode *inode, int ioflags,
+			const struct nfs_pgio_completion_ops *compl_ops);
 extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_writedata_release(struct nfs_write_data *wdata);
-extern void nfs_commit_free(struct nfs_write_data *p);
-extern int nfs_initiate_write(struct nfs_write_data *data,
-			      struct rpc_clnt *clnt,
+extern void nfs_commit_free(struct nfs_commit_data *p);
+extern int nfs_initiate_write(struct rpc_clnt *clnt,
+			      struct nfs_write_data *data,
 			      const struct rpc_call_ops *call_ops,
-			      int how);
+			      int how, int flags);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
-extern int nfs_initiate_commit(struct nfs_write_data *data,
-			       struct rpc_clnt *clnt,
+extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
+extern int nfs_initiate_commit(struct rpc_clnt *clnt,
+			       struct nfs_commit_data *data,
 			       const struct rpc_call_ops *call_ops,
-			       int how);
-extern void nfs_init_commit(struct nfs_write_data *data,
+			       int how, int flags);
+extern void nfs_init_commit(struct nfs_commit_data *data,
 			    struct list_head *head,
-			    struct pnfs_layout_segment *lseg);
+			    struct pnfs_layout_segment *lseg,
+			    struct nfs_commit_info *cinfo);
+int nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
+			 struct nfs_commit_info *cinfo, int max);
+int nfs_scan_commit(struct inode *inode, struct list_head *dst,
+		    struct nfs_commit_info *cinfo);
+void nfs_mark_request_commit(struct nfs_page *req,
+			     struct pnfs_layout_segment *lseg,
+			     struct nfs_commit_info *cinfo);
+int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
+			    int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
-		      struct pnfs_layout_segment *lseg);
-void nfs_commit_clear_lock(struct nfs_inode *nfsi);
-void nfs_commitdata_release(void *data);
-void nfs_commit_release_pages(struct nfs_write_data *data);
-void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head);
-void nfs_request_remove_commit_list(struct nfs_page *req);
+		      struct pnfs_layout_segment *lseg,
+		      struct nfs_commit_info *cinfo);
+void nfs_commitdata_release(struct nfs_commit_data *data);
+void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
+				 struct nfs_commit_info *cinfo);
+void nfs_request_remove_commit_list(struct nfs_page *req,
+				    struct nfs_commit_info *cinfo);
+void nfs_init_cinfo(struct nfs_commit_info *cinfo,
+		    struct inode *inode,
+		    struct nfs_direct_req *dreq);
 
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
@@ -342,15 +366,16 @@ extern int nfs_migrate_page(struct address_space *,
 #define nfs_migrate_page NULL
 #endif
 
+/* direct.c */
+void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
+			      struct nfs_direct_req *dreq);
+
 /* nfs4proc.c */
 extern void __nfs4_read_done_cb(struct nfs_read_data *);
-extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
-extern int nfs4_init_client(struct nfs_client *clp,
+extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 			    const struct rpc_timeout *timeparms,
 			    const char *ip_addr,
-			    rpc_authflavor_t authflavour,
-			    int noresvport);
-extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
+			    rpc_authflavor_t authflavour);
 extern int _nfs4_call_sync(struct rpc_clnt *clnt,
 			   struct nfs_server *server,
 			   struct rpc_message *msg,
@@ -466,3 +491,15 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
 		PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
 
+/*
+ * Convert a struct timespec into a 64-bit change attribute
+ *
+ * This does approximately the same thing as timespec_to_ns(),
+ * but for calculation efficiency, we multiply the seconds by
+ * 1024*1024*1024.
+ */
+static inline
+u64 nfs_timespec_to_change_attr(const struct timespec *ts)
+{
+	return ((u64)ts->tv_sec << 30) + ts->tv_nsec;
+}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index d51868e5683c..08b9c93675da 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -26,11 +26,6 @@ static LIST_HEAD(nfs_automount_list);
 static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
 int nfs_mountpoint_expiry_timeout = 500 * HZ;
 
-static struct vfsmount *nfs_do_submount(struct dentry *dentry,
-					struct nfs_fh *fh,
-					struct nfs_fattr *fattr,
-					rpc_authflavor_t authflavor);
-
 /*
  * nfs_path - reconstruct the path given an arbitrary dentry
  * @base - used to return pointer to the end of devname part of path
@@ -118,64 +113,6 @@ Elong:
 	return ERR_PTR(-ENAMETOOLONG);
 }
 
-#ifdef CONFIG_NFS_V4
-rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
-{
-	struct gss_api_mech *mech;
-	struct xdr_netobj oid;
-	int i;
-	rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
-
-	for (i = 0; i < flavors->num_flavors; i++) {
-		struct nfs4_secinfo_flavor *flavor;
-		flavor = &flavors->flavors[i];
-
-		if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) {
-			pseudoflavor = flavor->flavor;
-			break;
-		} else if (flavor->flavor == RPC_AUTH_GSS) {
-			oid.len  = flavor->gss.sec_oid4.len;
-			oid.data = flavor->gss.sec_oid4.data;
-			mech = gss_mech_get_by_OID(&oid);
-			if (!mech)
-				continue;
-			pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service);
-			gss_mech_put(mech);
-			break;
-		}
-	}
-
-	return pseudoflavor;
-}
-
-static struct rpc_clnt *nfs_lookup_mountpoint(struct inode *dir,
-					      struct qstr *name,
-					      struct nfs_fh *fh,
-					      struct nfs_fattr *fattr)
-{
-	int err;
-
-	if (NFS_PROTO(dir)->version == 4)
-		return nfs4_proc_lookup_mountpoint(dir, name, fh, fattr);
-
-	err = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, name, fh, fattr);
-	if (err)
-		return ERR_PTR(err);
-	return rpc_clone_client(NFS_SERVER(dir)->client);
-}
-#else /* CONFIG_NFS_V4 */
-static inline struct rpc_clnt *nfs_lookup_mountpoint(struct inode *dir,
-						     struct qstr *name,
-						     struct nfs_fh *fh,
-						     struct nfs_fattr *fattr)
-{
-	int err = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, name, fh, fattr);
-	if (err)
-		return ERR_PTR(err);
-	return rpc_clone_client(NFS_SERVER(dir)->client);
-}
-#endif /* CONFIG_NFS_V4 */
-
 /*
  * nfs_d_automount - Handle crossing a mountpoint on the server
  * @path - The mountpoint
@@ -191,10 +128,9 @@ static inline struct rpc_clnt *nfs_lookup_mountpoint(struct inode *dir,
 struct vfsmount *nfs_d_automount(struct path *path)
 {
 	struct vfsmount *mnt;
-	struct dentry *parent;
+	struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
 	struct nfs_fh *fh = NULL;
 	struct nfs_fattr *fattr = NULL;
-	struct rpc_clnt *client;
 
 	dprintk("--> nfs_d_automount()\n");
 
@@ -210,21 +146,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
 
 	dprintk("%s: enter\n", __func__);
 
-	/* Look it up again to get its attributes */
-	parent = dget_parent(path->dentry);
-	client = nfs_lookup_mountpoint(parent->d_inode, &path->dentry->d_name, fh, fattr);
-	dput(parent);
-	if (IS_ERR(client)) {
-		mnt = ERR_CAST(client);
-		goto out;
-	}
-
-	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
-		mnt = nfs_do_refmount(client, path->dentry);
-	else
-		mnt = nfs_do_submount(path->dentry, fh, fattr, client->cl_auth->au_flavor);
-	rpc_shutdown_client(client);
-
+	mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr);
 	if (IS_ERR(mnt))
 		goto out;
 
@@ -297,10 +219,8 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
  * @authflavor - security flavor to use when performing the mount
  *
  */
-static struct vfsmount *nfs_do_submount(struct dentry *dentry,
-					struct nfs_fh *fh,
-					struct nfs_fattr *fattr,
-					rpc_authflavor_t authflavor)
+struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh,
+				 struct nfs_fattr *fattr, rpc_authflavor_t authflavor)
 {
 	struct nfs_clone_mount mountdata = {
 		.sb = dentry->d_sb,
@@ -333,3 +253,18 @@ out:
 	dprintk("<-- nfs_do_submount() = %p\n", mnt);
 	return mnt;
 }
+
+struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
+			      struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+	int err;
+	struct dentry *parent = dget_parent(dentry);
+
+	/* Look it up again to get its attributes */
+	err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr);
+	dput(parent);
+	if (err != 0)
+		return ERR_PTR(err);
+
+	return nfs_do_submount(dentry, fh, fattr, server->client->cl_auth->au_flavor);
+}
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index aa14ec303e94..8a6394edb8b0 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -1,3 +1,7 @@
+/*
+ * NFS-private data for each "struct net".  Accessed with net_generic().
+ */
+
 #ifndef __NFS_NETNS_H__
 #define __NFS_NETNS_H__
 
@@ -20,6 +24,7 @@ struct nfs_net {
 	struct idr cb_ident_idr; /* Protected by nfs_client_lock */
 #endif
 	spinlock_t nfs_client_lock;
+	struct timespec boot_time;
 };
 
 extern int nfs_net_id;
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 1f56000fabbd..baf759bccd05 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -61,6 +61,7 @@
 #define NFS_readdirres_sz	(1)
 #define NFS_statfsres_sz	(1+NFS_info_sz)
 
+static int nfs_stat_to_errno(enum nfs_stat);
 
 /*
  * While encoding arguments, set up the reply buffer in advance to
@@ -313,6 +314,8 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 	p = xdr_decode_time(p, &fattr->atime);
 	p = xdr_decode_time(p, &fattr->mtime);
 	xdr_decode_time(p, &fattr->ctime);
+	fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
+
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -1109,7 +1112,7 @@ static const struct {
  * Returns a local errno value, or -EIO if the NFS status code is
  * not recognized.  This function is used jointly by NFSv2 and NFSv3.
  */
-int nfs_stat_to_errno(enum nfs_stat status)
+static int nfs_stat_to_errno(enum nfs_stat status)
 {
 	int i;
 
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 75c68299358e..2292a0fd2bff 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -142,7 +142,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 }
 
 static int
-nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
+nfs3_proc_lookup(struct inode *dir, struct qstr *name,
 		 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	struct nfs3_diropargs	arg = {
@@ -810,11 +810,13 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
-	if (nfs3_async_handle_jukebox(task, data->inode))
+	struct inode *inode = data->header->inode;
+
+	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 
-	nfs_invalidate_atime(data->inode);
-	nfs_refresh_inode(data->inode, &data->fattr);
+	nfs_invalidate_atime(inode);
+	nfs_refresh_inode(inode, &data->fattr);
 	return 0;
 }
 
@@ -830,10 +832,12 @@ static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_da
 
 static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
-	if (nfs3_async_handle_jukebox(task, data->inode))
+	struct inode *inode = data->header->inode;
+
+	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
 	return 0;
 }
 
@@ -847,7 +851,12 @@ static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_
 	rpc_call_start(task);
 }
 
-static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
+static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	rpc_call_start(task);
+}
+
+static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
 {
 	if (nfs3_async_handle_jukebox(task, data->inode))
 		return -EAGAIN;
@@ -855,7 +864,7 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 	return 0;
 }
 
-static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
+static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)
 {
 	msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT];
 }
@@ -875,6 +884,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.file_inode_ops	= &nfs3_file_inode_operations,
 	.file_ops	= &nfs_file_operations,
 	.getroot	= nfs3_proc_get_root,
+	.submount	= nfs_submount,
 	.getattr	= nfs3_proc_getattr,
 	.setattr	= nfs3_proc_setattr,
 	.lookup		= nfs3_proc_lookup,
@@ -906,6 +916,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.write_rpc_prepare = nfs3_proc_write_rpc_prepare,
 	.write_done	= nfs3_write_done,
 	.commit_setup	= nfs3_proc_commit_setup,
+	.commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,
 	.commit_done	= nfs3_commit_done,
 	.lock		= nfs3_proc_lock,
 	.clear_acl_cache = nfs3_forget_cached_acls,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index a77cc9a3ce55..902de489ec9b 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -86,6 +86,8 @@
 				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
 #define ACL3_setaclres_sz	(1+NFS3_post_op_attr_sz)
 
+static int nfs3_stat_to_errno(enum nfs_stat);
+
 /*
  * Map file type to S_IFMT bits
  */
@@ -675,6 +677,7 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 	p = xdr_decode_nfstime3(p, &fattr->atime);
 	p = xdr_decode_nfstime3(p, &fattr->mtime);
 	xdr_decode_nfstime3(p, &fattr->ctime);
+	fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
 
 	fattr->valid |= NFS_ATTR_FATTR_V3;
 	return 0;
@@ -725,12 +728,14 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 		goto out_overflow;
 
 	fattr->valid |= NFS_ATTR_FATTR_PRESIZE
+		| NFS_ATTR_FATTR_PRECHANGE
 		| NFS_ATTR_FATTR_PREMTIME
 		| NFS_ATTR_FATTR_PRECTIME;
 
 	p = xdr_decode_size3(p, &fattr->pre_size);
 	p = xdr_decode_nfstime3(p, &fattr->pre_mtime);
 	xdr_decode_nfstime3(p, &fattr->pre_ctime);
+	fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime);
 
 	return 0;
 out_overflow:
@@ -1287,7 +1292,7 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
  *	};
  */
 static void encode_commit3args(struct xdr_stream *xdr,
-			       const struct nfs_writeargs *args)
+			       const struct nfs_commitargs *args)
 {
 	__be32 *p;
 
@@ -1300,7 +1305,7 @@ static void encode_commit3args(struct xdr_stream *xdr,
 
 static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req,
 				     struct xdr_stream *xdr,
-				     const struct nfs_writeargs *args)
+				     const struct nfs_commitargs *args)
 {
 	encode_commit3args(xdr, args);
 }
@@ -1385,7 +1390,7 @@ static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
 out:
 	return error;
 out_default:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1424,7 +1429,7 @@ static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1472,7 +1477,7 @@ out_default:
 	error = decode_post_op_attr(xdr, result->dir_attr);
 	if (unlikely(error))
 		goto out;
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1513,7 +1518,7 @@ static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
 out:
 	return error;
 out_default:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1554,7 +1559,7 @@ static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
 out:
 	return error;
 out_default:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1636,7 +1641,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1706,7 +1711,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1770,7 +1775,7 @@ out_default:
 	error = decode_wcc_data(xdr, result->dir_attr);
 	if (unlikely(error))
 		goto out;
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1809,7 +1814,7 @@ static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1853,7 +1858,7 @@ static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -1896,7 +1901,7 @@ static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /**
@@ -2088,7 +2093,7 @@ out_default:
 	error = decode_post_op_attr(xdr, result->dir_attr);
 	if (unlikely(error))
 		goto out;
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -2156,7 +2161,7 @@ static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -2232,7 +2237,7 @@ static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -2295,7 +2300,7 @@ static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 /*
@@ -2319,7 +2324,7 @@ out_status:
  */
 static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
 				   struct xdr_stream *xdr,
-				   struct nfs_writeres *result)
+				   struct nfs_commitres *result)
 {
 	enum nfs_stat status;
 	int error;
@@ -2336,7 +2341,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
 out:
 	return error;
 out_status:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 #ifdef CONFIG_NFS_V3_ACL
@@ -2401,7 +2406,7 @@ static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
 out:
 	return error;
 out_default:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
@@ -2420,11 +2425,76 @@ static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
 out:
 	return error;
 out_default:
-	return nfs_stat_to_errno(status);
+	return nfs3_stat_to_errno(status);
 }
 
 #endif  /* CONFIG_NFS_V3_ACL */
 
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ */
+static const struct {
+	int stat;
+	int errno;
+} nfs_errtbl[] = {
+	{ NFS_OK,		0		},
+	{ NFSERR_PERM,		-EPERM		},
+	{ NFSERR_NOENT,		-ENOENT		},
+	{ NFSERR_IO,		-errno_NFSERR_IO},
+	{ NFSERR_NXIO,		-ENXIO		},
+/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */
+	{ NFSERR_ACCES,		-EACCES		},
+	{ NFSERR_EXIST,		-EEXIST		},
+	{ NFSERR_XDEV,		-EXDEV		},
+	{ NFSERR_NODEV,		-ENODEV		},
+	{ NFSERR_NOTDIR,	-ENOTDIR	},
+	{ NFSERR_ISDIR,		-EISDIR		},
+	{ NFSERR_INVAL,		-EINVAL		},
+	{ NFSERR_FBIG,		-EFBIG		},
+	{ NFSERR_NOSPC,		-ENOSPC		},
+	{ NFSERR_ROFS,		-EROFS		},
+	{ NFSERR_MLINK,		-EMLINK		},
+	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	},
+	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	},
+	{ NFSERR_DQUOT,		-EDQUOT		},
+	{ NFSERR_STALE,		-ESTALE		},
+	{ NFSERR_REMOTE,	-EREMOTE	},
+#ifdef EWFLUSH
+	{ NFSERR_WFLUSH,	-EWFLUSH	},
+#endif
+	{ NFSERR_BADHANDLE,	-EBADHANDLE	},
+	{ NFSERR_NOT_SYNC,	-ENOTSYNC	},
+	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
+	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
+	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
+	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
+	{ NFSERR_BADTYPE,	-EBADTYPE	},
+	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
+	{ -1,			-EIO		}
+};
+
+/**
+ * nfs3_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized.  This function is used jointly by NFSv2 and NFSv3.
+ */
+static int nfs3_stat_to_errno(enum nfs_stat status)
+{
+	int i;
+
+	for (i = 0; nfs_errtbl[i].stat != -1; i++) {
+		if (nfs_errtbl[i].stat == (int)status)
+			return nfs_errtbl[i].errno;
+	}
+	dprintk("NFS: Unrecognized nfs status value: %u\n", status);
+	return nfs_errtbl[i].errno;
+}
+
+
 #define PROC(proc, argtype, restype, timer)				\
 [NFS3PROC_##proc] = {							\
 	.p_proc      = NFS3PROC_##proc,					\
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 8d75021020b3..c6827f93ab57 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -24,6 +24,8 @@ enum nfs4_client_state {
 	NFS4CLNT_RECALL_SLOT,
 	NFS4CLNT_LEASE_CONFIRM,
 	NFS4CLNT_SERVER_SCOPE_MISMATCH,
+	NFS4CLNT_PURGE_STATE,
+	NFS4CLNT_BIND_CONN_TO_SESSION,
 };
 
 enum nfs4_session_state {
@@ -52,11 +54,6 @@ struct nfs4_minor_version_ops {
 	const struct nfs4_state_maintenance_ops *state_renewal_ops;
 };
 
-struct nfs_unique_id {
-	struct rb_node rb_node;
-	__u64 id;
-};
-
 #define NFS_SEQID_CONFIRMED 1
 struct nfs_seqid_counter {
 	ktime_t create_time;
@@ -206,12 +203,18 @@ extern const struct dentry_operations nfs4_dentry_operations;
 extern const struct inode_operations nfs4_dir_inode_operations;
 
 /* nfs4namespace.c */
+rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *);
+struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
+			       struct nfs_fh *, struct nfs_fattr *);
 
 /* nfs4proc.c */
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
+extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
+extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred);
 extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
+extern int nfs4_destroy_clientid(struct nfs_client *clp);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
@@ -239,8 +242,8 @@ extern int nfs41_setup_sequence(struct nfs4_session *session,
 		struct rpc_task *task);
 extern void nfs4_destroy_session(struct nfs4_session *session);
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
-extern int nfs4_proc_create_session(struct nfs_client *);
-extern int nfs4_proc_destroy_session(struct nfs4_session *);
+extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);
 extern int nfs4_init_session(struct nfs_server *server);
 extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
 		struct nfs_fsinfo *fsinfo);
@@ -310,9 +313,9 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
 #if defined(CONFIG_NFS_V4_1)
 struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
 struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
-extern void nfs4_schedule_session_recovery(struct nfs4_session *);
+extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
 #else
-static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
+static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 {
 }
 #endif /* CONFIG_NFS_V4_1 */
@@ -334,7 +337,7 @@ extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_recall_slot(struct nfs_client *clp);
 extern void nfs41_handle_server_scope(struct nfs_client *,
-				      struct server_scope **);
+				      struct nfs41_server_scope **);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 5acfd9ea8a31..e1340293872c 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -82,29 +82,76 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
 	BUG();
 }
 
+static void filelayout_reset_write(struct nfs_write_data *data)
+{
+	struct nfs_pgio_header *hdr = data->header;
+	struct rpc_task *task = &data->task;
+
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		dprintk("%s Reset task %5u for i/o through MDS "
+			"(req %s/%lld, %u bytes @ offset %llu)\n", __func__,
+			data->task.tk_pid,
+			hdr->inode->i_sb->s_id,
+			(long long)NFS_FILEID(hdr->inode),
+			data->args.count,
+			(unsigned long long)data->args.offset);
+
+		task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
+							&hdr->pages,
+							hdr->completion_ops);
+	}
+}
+
+static void filelayout_reset_read(struct nfs_read_data *data)
+{
+	struct nfs_pgio_header *hdr = data->header;
+	struct rpc_task *task = &data->task;
+
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		dprintk("%s Reset task %5u for i/o through MDS "
+			"(req %s/%lld, %u bytes @ offset %llu)\n", __func__,
+			data->task.tk_pid,
+			hdr->inode->i_sb->s_id,
+			(long long)NFS_FILEID(hdr->inode),
+			data->args.count,
+			(unsigned long long)data->args.offset);
+
+		task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
+							&hdr->pages,
+							hdr->completion_ops);
+	}
+}
+
 static int filelayout_async_handle_error(struct rpc_task *task,
 					 struct nfs4_state *state,
 					 struct nfs_client *clp,
-					 int *reset)
+					 struct pnfs_layout_segment *lseg)
 {
-	struct nfs_server *mds_server = NFS_SERVER(state->inode);
+	struct inode *inode = lseg->pls_layout->plh_inode;
+	struct nfs_server *mds_server = NFS_SERVER(inode);
+	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
 	struct nfs_client *mds_client = mds_server->nfs_client;
+	struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
 
 	if (task->tk_status >= 0)
 		return 0;
-	*reset = 0;
 
 	switch (task->tk_status) {
 	/* MDS state errors */
 	case -NFS4ERR_DELEG_REVOKED:
 	case -NFS4ERR_ADMIN_REVOKED:
 	case -NFS4ERR_BAD_STATEID:
+		if (state == NULL)
+			break;
 		nfs_remove_bad_delegation(state->inode);
 	case -NFS4ERR_OPENMODE:
+		if (state == NULL)
+			break;
 		nfs4_schedule_stateid_recovery(mds_server, state);
 		goto wait_on_recovery;
 	case -NFS4ERR_EXPIRED:
-		nfs4_schedule_stateid_recovery(mds_server, state);
+		if (state != NULL)
+			nfs4_schedule_stateid_recovery(mds_server, state);
 		nfs4_schedule_lease_recovery(mds_client);
 		goto wait_on_recovery;
 	/* DS session errors */
@@ -118,7 +165,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 		dprintk("%s ERROR %d, Reset session. Exchangeid "
 			"flags 0x%x\n", __func__, task->tk_status,
 			clp->cl_exchange_flags);
-		nfs4_schedule_session_recovery(clp->cl_session);
+		nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
 		break;
 	case -NFS4ERR_DELAY:
 	case -NFS4ERR_GRACE:
@@ -127,11 +174,48 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 		break;
 	case -NFS4ERR_RETRY_UNCACHED_REP:
 		break;
+	/* Invalidate Layout errors */
+	case -NFS4ERR_PNFS_NO_LAYOUT:
+	case -ESTALE:           /* mapped NFS4ERR_STALE */
+	case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
+	case -EISDIR:           /* mapped NFS4ERR_ISDIR */
+	case -NFS4ERR_FHEXPIRED:
+	case -NFS4ERR_WRONG_TYPE:
+		dprintk("%s Invalid layout error %d\n", __func__,
+			task->tk_status);
+		/*
+		 * Destroy layout so new i/o will get a new layout.
+		 * Layout will not be destroyed until all current lseg
+		 * references are put. Mark layout as invalid to resend failed
+		 * i/o and all i/o waiting on the slot table to the MDS until
+		 * layout is destroyed and a new valid layout is obtained.
+		 */
+		set_bit(NFS_LAYOUT_INVALID,
+				&NFS_I(inode)->layout->plh_flags);
+		pnfs_destroy_layout(NFS_I(inode));
+		rpc_wake_up(&tbl->slot_tbl_waitq);
+		goto reset;
+	/* RPC connection errors */
+	case -ECONNREFUSED:
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+	case -EIO:
+	case -ETIMEDOUT:
+	case -EPIPE:
+		dprintk("%s DS connection error %d\n", __func__,
+			task->tk_status);
+		if (!filelayout_test_devid_invalid(devid))
+			_pnfs_return_layout(inode);
+		filelayout_mark_devid_invalid(devid);
+		rpc_wake_up(&tbl->slot_tbl_waitq);
+		nfs4_ds_disconnect(clp);
+		/* fall through */
 	default:
-		dprintk("%s DS error. Retry through MDS %d\n", __func__,
+reset:
+		dprintk("%s Retry through MDS. Error %d\n", __func__,
 			task->tk_status);
-		*reset = 1;
-		break;
+		return -NFS4ERR_RESET_TO_MDS;
 	}
 out:
 	task->tk_status = 0;
@@ -148,18 +232,17 @@ wait_on_recovery:
 static int filelayout_read_done_cb(struct rpc_task *task,
 				struct nfs_read_data *data)
 {
-	int reset = 0;
+	struct nfs_pgio_header *hdr = data->header;
+	int err;
 
-	dprintk("%s DS read\n", __func__);
+	err = filelayout_async_handle_error(task, data->args.context->state,
+					    data->ds_clp, hdr->lseg);
 
-	if (filelayout_async_handle_error(task, data->args.context->state,
-					  data->ds_clp, &reset) == -EAGAIN) {
-		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
-			__func__, data->ds_clp, data->ds_clp->cl_session);
-		if (reset) {
-			pnfs_set_lo_fail(data->lseg);
-			nfs4_reset_read(task, data);
-		}
+	switch (err) {
+	case -NFS4ERR_RESET_TO_MDS:
+		filelayout_reset_read(data);
+		return task->tk_status;
+	case -EAGAIN:
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -175,13 +258,15 @@ static int filelayout_read_done_cb(struct rpc_task *task,
 static void
 filelayout_set_layoutcommit(struct nfs_write_data *wdata)
 {
-	if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds ||
+	struct nfs_pgio_header *hdr = wdata->header;
+
+	if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
 	    wdata->res.verf->committed == NFS_FILE_SYNC)
 		return;
 
 	pnfs_set_layoutcommit(wdata);
-	dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
-		(unsigned long) NFS_I(wdata->inode)->layout->plh_lwb);
+	dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
+		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
 
 /*
@@ -191,8 +276,14 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
  */
 static void filelayout_read_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+	struct nfs_read_data *rdata = data;
 
+	if (filelayout_reset_to_mds(rdata->header->lseg)) {
+		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+		filelayout_reset_read(rdata);
+		rpc_exit(task, 0);
+		return;
+	}
 	rdata->read_done_cb = filelayout_read_done_cb;
 
 	if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
@@ -205,42 +296,47 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
 
 static void filelayout_read_call_done(struct rpc_task *task, void *data)
 {
-	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+	struct nfs_read_data *rdata = data;
 
 	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
+	if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) &&
+	    task->tk_status == 0)
+		return;
+
 	/* Note this may cause RPC to be resent */
-	rdata->mds_ops->rpc_call_done(task, data);
+	rdata->header->mds_ops->rpc_call_done(task, data);
 }
 
 static void filelayout_read_count_stats(struct rpc_task *task, void *data)
 {
-	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+	struct nfs_read_data *rdata = data;
 
-	rpc_count_iostats(task, NFS_SERVER(rdata->inode)->client->cl_metrics);
+	rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics);
 }
 
 static void filelayout_read_release(void *data)
 {
-	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+	struct nfs_read_data *rdata = data;
 
-	put_lseg(rdata->lseg);
-	rdata->mds_ops->rpc_release(data);
+	nfs_put_client(rdata->ds_clp);
+	rdata->header->mds_ops->rpc_release(data);
 }
 
 static int filelayout_write_done_cb(struct rpc_task *task,
 				struct nfs_write_data *data)
 {
-	int reset = 0;
-
-	if (filelayout_async_handle_error(task, data->args.context->state,
-					  data->ds_clp, &reset) == -EAGAIN) {
-		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
-			__func__, data->ds_clp, data->ds_clp->cl_session);
-		if (reset) {
-			pnfs_set_lo_fail(data->lseg);
-			nfs4_reset_write(task, data);
-		}
+	struct nfs_pgio_header *hdr = data->header;
+	int err;
+
+	err = filelayout_async_handle_error(task, data->args.context->state,
+					    data->ds_clp, hdr->lseg);
+
+	switch (err) {
+	case -NFS4ERR_RESET_TO_MDS:
+		filelayout_reset_write(data);
+		return task->tk_status;
+	case -EAGAIN:
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -250,7 +346,7 @@ static int filelayout_write_done_cb(struct rpc_task *task,
 }
 
 /* Fake up some data that will cause nfs_commit_release to retry the writes. */
-static void prepare_to_resend_writes(struct nfs_write_data *data)
+static void prepare_to_resend_writes(struct nfs_commit_data *data)
 {
 	struct nfs_page *first = nfs_list_entry(data->pages.next);
 
@@ -261,19 +357,19 @@ static void prepare_to_resend_writes(struct nfs_write_data *data)
 }
 
 static int filelayout_commit_done_cb(struct rpc_task *task,
-				     struct nfs_write_data *data)
+				     struct nfs_commit_data *data)
 {
-	int reset = 0;
-
-	if (filelayout_async_handle_error(task, data->args.context->state,
-					  data->ds_clp, &reset) == -EAGAIN) {
-		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
-			__func__, data->ds_clp, data->ds_clp->cl_session);
-		if (reset) {
-			prepare_to_resend_writes(data);
-			pnfs_set_lo_fail(data->lseg);
-		} else
-			rpc_restart_call_prepare(task);
+	int err;
+
+	err = filelayout_async_handle_error(task, NULL, data->ds_clp,
+					    data->lseg);
+
+	switch (err) {
+	case -NFS4ERR_RESET_TO_MDS:
+		prepare_to_resend_writes(data);
+		return -EAGAIN;
+	case -EAGAIN:
+		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 
@@ -282,8 +378,14 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
 
 static void filelayout_write_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+	struct nfs_write_data *wdata = data;
 
+	if (filelayout_reset_to_mds(wdata->header->lseg)) {
+		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+		filelayout_reset_write(wdata);
+		rpc_exit(task, 0);
+		return;
+	}
 	if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
 				&wdata->args.seq_args, &wdata->res.seq_res,
 				task))
@@ -294,36 +396,66 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
 
 static void filelayout_write_call_done(struct rpc_task *task, void *data)
 {
-	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+	struct nfs_write_data *wdata = data;
+
+	if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
+	    task->tk_status == 0)
+		return;
 
 	/* Note this may cause RPC to be resent */
-	wdata->mds_ops->rpc_call_done(task, data);
+	wdata->header->mds_ops->rpc_call_done(task, data);
 }
 
 static void filelayout_write_count_stats(struct rpc_task *task, void *data)
 {
-	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+	struct nfs_write_data *wdata = data;
 
-	rpc_count_iostats(task, NFS_SERVER(wdata->inode)->client->cl_metrics);
+	rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics);
 }
 
 static void filelayout_write_release(void *data)
 {
-	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+	struct nfs_write_data *wdata = data;
+
+	nfs_put_client(wdata->ds_clp);
+	wdata->header->mds_ops->rpc_release(data);
+}
+
+static void filelayout_commit_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *wdata = data;
 
-	put_lseg(wdata->lseg);
-	wdata->mds_ops->rpc_release(data);
+	if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
+				&wdata->args.seq_args, &wdata->res.seq_res,
+				task))
+		return;
+
+	rpc_call_start(task);
+}
+
+static void filelayout_write_commit_done(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *wdata = data;
+
+	/* Note this may cause RPC to be resent */
+	wdata->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
+{
+	struct nfs_commit_data *cdata = data;
+
+	rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
 }
 
-static void filelayout_commit_release(void *data)
+static void filelayout_commit_release(void *calldata)
 {
-	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+	struct nfs_commit_data *data = calldata;
 
-	nfs_commit_release_pages(wdata);
-	if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding))
-		nfs_commit_clear_lock(NFS_I(wdata->inode));
-	put_lseg(wdata->lseg);
-	nfs_commitdata_release(wdata);
+	data->completion_ops->completion(data);
+	put_lseg(data->lseg);
+	nfs_put_client(data->ds_clp);
+	nfs_commitdata_release(data);
 }
 
 static const struct rpc_call_ops filelayout_read_call_ops = {
@@ -341,16 +473,17 @@ static const struct rpc_call_ops filelayout_write_call_ops = {
 };
 
 static const struct rpc_call_ops filelayout_commit_call_ops = {
-	.rpc_call_prepare = filelayout_write_prepare,
-	.rpc_call_done = filelayout_write_call_done,
-	.rpc_count_stats = filelayout_write_count_stats,
+	.rpc_call_prepare = filelayout_commit_prepare,
+	.rpc_call_done = filelayout_write_commit_done,
+	.rpc_count_stats = filelayout_commit_count_stats,
 	.rpc_release = filelayout_commit_release,
 };
 
 static enum pnfs_try_status
 filelayout_read_pagelist(struct nfs_read_data *data)
 {
-	struct pnfs_layout_segment *lseg = data->lseg;
+	struct nfs_pgio_header *hdr = data->header;
+	struct pnfs_layout_segment *lseg = hdr->lseg;
 	struct nfs4_pnfs_ds *ds;
 	loff_t offset = data->args.offset;
 	u32 j, idx;
@@ -358,25 +491,20 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 	int status;
 
 	dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
-		__func__, data->inode->i_ino,
+		__func__, hdr->inode->i_ino,
 		data->args.pgbase, (size_t)data->args.count, offset);
 
-	if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags))
-		return PNFS_NOT_ATTEMPTED;
-
 	/* Retrieve the correct rpc_client for the byte range */
 	j = nfs4_fl_calc_j_index(lseg, offset);
 	idx = nfs4_fl_calc_ds_index(lseg, j);
 	ds = nfs4_fl_prepare_ds(lseg, idx);
-	if (!ds) {
-		/* Either layout fh index faulty, or ds connect failed */
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+	if (!ds)
 		return PNFS_NOT_ATTEMPTED;
-	}
-	dprintk("%s USE DS: %s\n", __func__, ds->ds_remotestr);
+	dprintk("%s USE DS: %s cl_count %d\n", __func__,
+		ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
 
 	/* No multipath support. Use first DS */
+	atomic_inc(&ds->ds_clp->cl_count);
 	data->ds_clp = ds->ds_clp;
 	fh = nfs4_fl_select_ds_fh(lseg, j);
 	if (fh)
@@ -386,8 +514,8 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 	data->mds_offset = offset;
 
 	/* Perform an asynchronous read to ds */
-	status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
-				   &filelayout_read_call_ops);
+	status = nfs_initiate_read(ds->ds_clp->cl_rpcclient, data,
+				  &filelayout_read_call_ops, RPC_TASK_SOFTCONN);
 	BUG_ON(status != 0);
 	return PNFS_ATTEMPTED;
 }
@@ -396,32 +524,26 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 static enum pnfs_try_status
 filelayout_write_pagelist(struct nfs_write_data *data, int sync)
 {
-	struct pnfs_layout_segment *lseg = data->lseg;
+	struct nfs_pgio_header *hdr = data->header;
+	struct pnfs_layout_segment *lseg = hdr->lseg;
 	struct nfs4_pnfs_ds *ds;
 	loff_t offset = data->args.offset;
 	u32 j, idx;
 	struct nfs_fh *fh;
 	int status;
 
-	if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags))
-		return PNFS_NOT_ATTEMPTED;
-
 	/* Retrieve the correct rpc_client for the byte range */
 	j = nfs4_fl_calc_j_index(lseg, offset);
 	idx = nfs4_fl_calc_ds_index(lseg, j);
 	ds = nfs4_fl_prepare_ds(lseg, idx);
-	if (!ds) {
-		printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n",
-			__func__);
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+	if (!ds)
 		return PNFS_NOT_ATTEMPTED;
-	}
-	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__,
-		data->inode->i_ino, sync, (size_t) data->args.count, offset,
-		ds->ds_remotestr);
+	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
+		__func__, hdr->inode->i_ino, sync, (size_t) data->args.count,
+		offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
 
 	data->write_done_cb = filelayout_write_done_cb;
+	atomic_inc(&ds->ds_clp->cl_count);
 	data->ds_clp = ds->ds_clp;
 	fh = nfs4_fl_select_ds_fh(lseg, j);
 	if (fh)
@@ -433,8 +555,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
 	data->args.offset = filelayout_get_dserver_offset(lseg, offset);
 
 	/* Perform an asynchronous write */
-	status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
-				    &filelayout_write_call_ops, sync);
+	status = nfs_initiate_write(ds->ds_clp->cl_rpcclient, data,
+				    &filelayout_write_call_ops, sync,
+				    RPC_TASK_SOFTCONN);
 	BUG_ON(status != 0);
 	return PNFS_ATTEMPTED;
 }
@@ -650,10 +773,65 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
 
 	dprintk("--> %s\n", __func__);
 	nfs4_fl_put_deviceid(fl->dsaddr);
-	kfree(fl->commit_buckets);
+	/* This assumes a single RW lseg */
+	if (lseg->pls_range.iomode == IOMODE_RW) {
+		struct nfs4_filelayout *flo;
+
+		flo = FILELAYOUT_FROM_HDR(lseg->pls_layout);
+		flo->commit_info.nbuckets = 0;
+		kfree(flo->commit_info.buckets);
+		flo->commit_info.buckets = NULL;
+	}
 	_filelayout_free_lseg(fl);
 }
 
+static int
+filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
+			     struct nfs_commit_info *cinfo,
+			     gfp_t gfp_flags)
+{
+	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+	struct pnfs_commit_bucket *buckets;
+	int size;
+
+	if (fl->commit_through_mds)
+		return 0;
+	if (cinfo->ds->nbuckets != 0) {
+		/* This assumes there is only one IOMODE_RW lseg.  What
+		 * we really want to do is have a layout_hdr level
+		 * dictionary of <multipath_list4, fh> keys, each
+		 * associated with a struct list_head, populated by calls
+		 * to filelayout_write_pagelist().
+		 * */
+		return 0;
+	}
+
+	size = (fl->stripe_type == STRIPE_SPARSE) ?
+		fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
+
+	buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
+			  gfp_flags);
+	if (!buckets)
+		return -ENOMEM;
+	else {
+		int i;
+
+		spin_lock(cinfo->lock);
+		if (cinfo->ds->nbuckets != 0)
+			kfree(buckets);
+		else {
+			cinfo->ds->buckets = buckets;
+			cinfo->ds->nbuckets = size;
+			for (i = 0; i < size; i++) {
+				INIT_LIST_HEAD(&buckets[i].written);
+				INIT_LIST_HEAD(&buckets[i].committing);
+			}
+		}
+		spin_unlock(cinfo->lock);
+		return 0;
+	}
+}
+
 static struct pnfs_layout_segment *
 filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 		      struct nfs4_layoutget_res *lgr,
@@ -673,29 +851,6 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 		_filelayout_free_lseg(fl);
 		return NULL;
 	}
-
-	/* This assumes there is only one IOMODE_RW lseg.  What
-	 * we really want to do is have a layout_hdr level
-	 * dictionary of <multipath_list4, fh> keys, each
-	 * associated with a struct list_head, populated by calls
-	 * to filelayout_write_pagelist().
-	 * */
-	if ((!fl->commit_through_mds) && (lgr->range.iomode == IOMODE_RW)) {
-		int i;
-		int size = (fl->stripe_type == STRIPE_SPARSE) ?
-			fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
-
-		fl->commit_buckets = kcalloc(size, sizeof(struct nfs4_fl_commit_bucket), gfp_flags);
-		if (!fl->commit_buckets) {
-			filelayout_free_lseg(&fl->generic_hdr);
-			return NULL;
-		}
-		fl->number_of_buckets = size;
-		for (i = 0; i < size; i++) {
-			INIT_LIST_HEAD(&fl->commit_buckets[i].written);
-			INIT_LIST_HEAD(&fl->commit_buckets[i].committing);
-		}
-	}
 	return &fl->generic_hdr;
 }
 
@@ -716,8 +871,8 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 	    !nfs_generic_pg_test(pgio, prev, req))
 		return false;
 
-	p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
-	r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
+	p_stripe = (u64)req_offset(prev);
+	r_stripe = (u64)req_offset(req);
 	stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
 
 	do_div(p_stripe, stripe_unit);
@@ -732,6 +887,16 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 {
 	BUG_ON(pgio->pg_lseg != NULL);
 
+	if (req->wb_offset != req->wb_pgbase) {
+		/*
+		 * Handling unaligned pages is difficult, because have to
+		 * somehow split a req in two in certain cases in the
+		 * pg.test code.  Avoid this by just not using pnfs
+		 * in this case.
+		 */
+		nfs_pageio_reset_read_mds(pgio);
+		return;
+	}
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   0,
@@ -747,8 +912,13 @@ static void
 filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 			 struct nfs_page *req)
 {
+	struct nfs_commit_info cinfo;
+	int status;
+
 	BUG_ON(pgio->pg_lseg != NULL);
 
+	if (req->wb_offset != req->wb_pgbase)
+		goto out_mds;
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   0,
@@ -757,7 +927,17 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 					   GFP_NOFS);
 	/* If no lseg, fall back to write through mds */
 	if (pgio->pg_lseg == NULL)
-		nfs_pageio_reset_write_mds(pgio);
+		goto out_mds;
+	nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
+	status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
+	if (status < 0) {
+		put_lseg(pgio->pg_lseg);
+		pgio->pg_lseg = NULL;
+		goto out_mds;
+	}
+	return;
+out_mds:
+	nfs_pageio_reset_write_mds(pgio);
 }
 
 static const struct nfs_pageio_ops filelayout_pg_read_ops = {
@@ -784,43 +964,42 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
  * If this will make the bucket empty, it will need to put the lseg reference.
  */
 static void
-filelayout_clear_request_commit(struct nfs_page *req)
+filelayout_clear_request_commit(struct nfs_page *req,
+				struct nfs_commit_info *cinfo)
 {
 	struct pnfs_layout_segment *freeme = NULL;
-	struct inode *inode = req->wb_context->dentry->d_inode;
 
-	spin_lock(&inode->i_lock);
+	spin_lock(cinfo->lock);
 	if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
 		goto out;
+	cinfo->ds->nwritten--;
 	if (list_is_singular(&req->wb_list)) {
-		struct pnfs_layout_segment *lseg;
+		struct pnfs_commit_bucket *bucket;
 
-		/* From here we can find the bucket, but for the moment,
-		 * since there is only one relevant lseg...
-		 */
-		list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
-			if (lseg->pls_range.iomode == IOMODE_RW) {
-				freeme = lseg;
-				break;
-			}
-		}
+		bucket = list_first_entry(&req->wb_list,
+					  struct pnfs_commit_bucket,
+					  written);
+		freeme = bucket->wlseg;
+		bucket->wlseg = NULL;
 	}
 out:
-	nfs_request_remove_commit_list(req);
-	spin_unlock(&inode->i_lock);
+	nfs_request_remove_commit_list(req, cinfo);
+	spin_unlock(cinfo->lock);
 	put_lseg(freeme);
 }
 
 static struct list_head *
 filelayout_choose_commit_list(struct nfs_page *req,
-			      struct pnfs_layout_segment *lseg)
+			      struct pnfs_layout_segment *lseg,
+			      struct nfs_commit_info *cinfo)
 {
 	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
 	u32 i, j;
 	struct list_head *list;
+	struct pnfs_commit_bucket *buckets;
 
 	if (fl->commit_through_mds)
-		return &NFS_I(req->wb_context->dentry->d_inode)->commit_list;
+		return &cinfo->mds->list;
 
 	/* Note that we are calling nfs4_fl_calc_j_index on each page
 	 * that ends up being committed to a data server.  An attractive
@@ -828,31 +1007,33 @@ filelayout_choose_commit_list(struct nfs_page *req,
 	 * to store the value calculated in filelayout_write_pagelist
 	 * and just use that here.
 	 */
-	j = nfs4_fl_calc_j_index(lseg,
-				 (loff_t)req->wb_index << PAGE_CACHE_SHIFT);
+	j = nfs4_fl_calc_j_index(lseg, req_offset(req));
 	i = select_bucket_index(fl, j);
-	list = &fl->commit_buckets[i].written;
+	buckets = cinfo->ds->buckets;
+	list = &buckets[i].written;
 	if (list_empty(list)) {
 		/* Non-empty buckets hold a reference on the lseg.  That ref
 		 * is normally transferred to the COMMIT call and released
 		 * there.  It could also be released if the last req is pulled
 		 * off due to a rewrite, in which case it will be done in
-		 * filelayout_remove_commit_req
+		 * filelayout_clear_request_commit
 		 */
-		get_lseg(lseg);
+		buckets[i].wlseg = get_lseg(lseg);
 	}
 	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+	cinfo->ds->nwritten++;
 	return list;
 }
 
 static void
 filelayout_mark_request_commit(struct nfs_page *req,
-		struct pnfs_layout_segment *lseg)
+			       struct pnfs_layout_segment *lseg,
+			       struct nfs_commit_info *cinfo)
 {
 	struct list_head *list;
 
-	list = filelayout_choose_commit_list(req, lseg);
-	nfs_request_add_commit_list(req, list);
+	list = filelayout_choose_commit_list(req, lseg, cinfo);
+	nfs_request_add_commit_list(req, list, cinfo);
 }
 
 static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
@@ -880,7 +1061,7 @@ select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
 	return flseg->fh_array[i];
 }
 
-static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
+static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
 {
 	struct pnfs_layout_segment *lseg = data->lseg;
 	struct nfs4_pnfs_ds *ds;
@@ -890,135 +1071,138 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
 	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
 	ds = nfs4_fl_prepare_ds(lseg, idx);
 	if (!ds) {
-		printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n",
-			__func__);
-		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
 		prepare_to_resend_writes(data);
 		filelayout_commit_release(data);
 		return -EAGAIN;
 	}
-	dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how);
-	data->write_done_cb = filelayout_commit_done_cb;
+	dprintk("%s ino %lu, how %d cl_count %d\n", __func__,
+		data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count));
+	data->commit_done_cb = filelayout_commit_done_cb;
+	atomic_inc(&ds->ds_clp->cl_count);
 	data->ds_clp = ds->ds_clp;
 	fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
 	if (fh)
 		data->args.fh = fh;
-	return nfs_initiate_commit(data, ds->ds_clp->cl_rpcclient,
-				   &filelayout_commit_call_ops, how);
-}
-
-/*
- * This is only useful while we are using whole file layouts.
- */
-static struct pnfs_layout_segment *
-find_only_write_lseg_locked(struct inode *inode)
-{
-	struct pnfs_layout_segment *lseg;
-
-	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
-		if (lseg->pls_range.iomode == IOMODE_RW)
-			return lseg;
-	return NULL;
-}
-
-static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode)
-{
-	struct pnfs_layout_segment *rv;
-
-	spin_lock(&inode->i_lock);
-	rv = find_only_write_lseg_locked(inode);
-	if (rv)
-		get_lseg(rv);
-	spin_unlock(&inode->i_lock);
-	return rv;
+	return nfs_initiate_commit(ds->ds_clp->cl_rpcclient, data,
+				   &filelayout_commit_call_ops, how,
+				   RPC_TASK_SOFTCONN);
 }
 
 static int
-filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max,
-		spinlock_t *lock)
+transfer_commit_list(struct list_head *src, struct list_head *dst,
+		     struct nfs_commit_info *cinfo, int max)
 {
-	struct list_head *src = &bucket->written;
-	struct list_head *dst = &bucket->committing;
 	struct nfs_page *req, *tmp;
 	int ret = 0;
 
 	list_for_each_entry_safe(req, tmp, src, wb_list) {
 		if (!nfs_lock_request(req))
 			continue;
-		if (cond_resched_lock(lock))
+		kref_get(&req->wb_kref);
+		if (cond_resched_lock(cinfo->lock))
 			list_safe_reset_next(req, tmp, wb_list);
-		nfs_request_remove_commit_list(req);
+		nfs_request_remove_commit_list(req, cinfo);
 		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 		nfs_list_add_request(req, dst);
 		ret++;
-		if (ret == max)
+		if ((ret == max) && !cinfo->dreq)
 			break;
 	}
 	return ret;
 }
 
+static int
+filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
+			       struct nfs_commit_info *cinfo,
+			       int max)
+{
+	struct list_head *src = &bucket->written;
+	struct list_head *dst = &bucket->committing;
+	int ret;
+
+	ret = transfer_commit_list(src, dst, cinfo, max);
+	if (ret) {
+		cinfo->ds->nwritten -= ret;
+		cinfo->ds->ncommitting += ret;
+		bucket->clseg = bucket->wlseg;
+		if (list_empty(src))
+			bucket->wlseg = NULL;
+		else
+			get_lseg(bucket->clseg);
+	}
+	return ret;
+}
+
 /* Move reqs from written to committing lists, returning count of number moved.
- * Note called with i_lock held.
+ * Note called with cinfo->lock held.
  */
-static int filelayout_scan_commit_lists(struct inode *inode, int max,
-		spinlock_t *lock)
+static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo,
+					int max)
 {
-	struct pnfs_layout_segment *lseg;
-	struct nfs4_filelayout_segment *fl;
 	int i, rv = 0, cnt;
 
-	lseg = find_only_write_lseg_locked(inode);
-	if (!lseg)
-		goto out_done;
-	fl = FILELAYOUT_LSEG(lseg);
-	if (fl->commit_through_mds)
-		goto out_done;
-	for (i = 0; i < fl->number_of_buckets && max != 0; i++) {
-		cnt = filelayout_scan_ds_commit_list(&fl->commit_buckets[i],
-				max, lock);
+	for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
+		cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i],
+						     cinfo, max);
 		max -= cnt;
 		rv += cnt;
 	}
-out_done:
 	return rv;
 }
 
+/* Pull everything off the committing lists and dump into @dst */
+static void filelayout_recover_commit_reqs(struct list_head *dst,
+					   struct nfs_commit_info *cinfo)
+{
+	struct pnfs_commit_bucket *b;
+	int i;
+
+	/* NOTE cinfo->lock is NOT held, relying on fact that this is
+	 * only called on single thread per dreq.
+	 * Can't take the lock because need to do put_lseg
+	 */
+	for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+		if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
+			BUG_ON(!list_empty(&b->written));
+			put_lseg(b->wlseg);
+			b->wlseg = NULL;
+		}
+	}
+	cinfo->ds->nwritten = 0;
+}
+
 static unsigned int
-alloc_ds_commits(struct inode *inode, struct list_head *list)
+alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
 {
-	struct pnfs_layout_segment *lseg;
-	struct nfs4_filelayout_segment *fl;
-	struct nfs_write_data *data;
+	struct pnfs_ds_commit_info *fl_cinfo;
+	struct pnfs_commit_bucket *bucket;
+	struct nfs_commit_data *data;
 	int i, j;
 	unsigned int nreq = 0;
 
-	/* Won't need this when non-whole file layout segments are supported
-	 * instead we will use a pnfs_layout_hdr structure */
-	lseg = find_only_write_lseg(inode);
-	if (!lseg)
-		return 0;
-	fl = FILELAYOUT_LSEG(lseg);
-	for (i = 0; i < fl->number_of_buckets; i++) {
-		if (list_empty(&fl->commit_buckets[i].committing))
+	fl_cinfo = cinfo->ds;
+	bucket = fl_cinfo->buckets;
+	for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
+		if (list_empty(&bucket->committing))
 			continue;
 		data = nfs_commitdata_alloc();
 		if (!data)
 			break;
 		data->ds_commit_index = i;
-		data->lseg = lseg;
+		data->lseg = bucket->clseg;
+		bucket->clseg = NULL;
 		list_add(&data->pages, list);
 		nreq++;
 	}
 
 	/* Clean up on error */
-	for (j = i; j < fl->number_of_buckets; j++) {
-		if (list_empty(&fl->commit_buckets[i].committing))
+	for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) {
+		if (list_empty(&bucket->committing))
 			continue;
-		nfs_retry_commit(&fl->commit_buckets[i].committing, lseg);
-		put_lseg(lseg);  /* associated with emptying bucket */
+		nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
+		put_lseg(bucket->clseg);
+		bucket->clseg = NULL;
 	}
-	put_lseg(lseg);
 	/* Caller will clean up entries put on list */
 	return nreq;
 }
@@ -1026,9 +1210,9 @@ alloc_ds_commits(struct inode *inode, struct list_head *list)
 /* This follows nfs_commit_list pretty closely */
 static int
 filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
-			   int how)
+			   int how, struct nfs_commit_info *cinfo)
 {
-	struct nfs_write_data	*data, *tmp;
+	struct nfs_commit_data *data, *tmp;
 	LIST_HEAD(list);
 	unsigned int nreq = 0;
 
@@ -1039,30 +1223,34 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 			list_add(&data->pages, &list);
 			nreq++;
 		} else
-			nfs_retry_commit(mds_pages, NULL);
+			nfs_retry_commit(mds_pages, NULL, cinfo);
 	}
 
-	nreq += alloc_ds_commits(inode, &list);
+	nreq += alloc_ds_commits(cinfo, &list);
 
 	if (nreq == 0) {
-		nfs_commit_clear_lock(NFS_I(inode));
+		cinfo->completion_ops->error_cleanup(NFS_I(inode));
 		goto out;
 	}
 
-	atomic_add(nreq, &NFS_I(inode)->commits_outstanding);
+	atomic_add(nreq, &cinfo->mds->rpcs_out);
 
 	list_for_each_entry_safe(data, tmp, &list, pages) {
 		list_del_init(&data->pages);
 		if (!data->lseg) {
-			nfs_init_commit(data, mds_pages, NULL);
-			nfs_initiate_commit(data, NFS_CLIENT(inode),
-					    data->mds_ops, how);
+			nfs_init_commit(data, mds_pages, NULL, cinfo);
+			nfs_initiate_commit(NFS_CLIENT(inode), data,
+					    data->mds_ops, how, 0);
 		} else {
-			nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index].committing, data->lseg);
+			struct pnfs_commit_bucket *buckets;
+
+			buckets = cinfo->ds->buckets;
+			nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo);
 			filelayout_initiate_commit(data, how);
 		}
 	}
 out:
+	cinfo->ds->ncommitting = 0;
 	return PNFS_ATTEMPTED;
 }
 
@@ -1072,17 +1260,47 @@ filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
 	nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
 }
 
+static struct pnfs_layout_hdr *
+filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+	struct nfs4_filelayout *flo;
+
+	flo = kzalloc(sizeof(*flo), gfp_flags);
+	return &flo->generic_hdr;
+}
+
+static void
+filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	kfree(FILELAYOUT_FROM_HDR(lo));
+}
+
+static struct pnfs_ds_commit_info *
+filelayout_get_ds_info(struct inode *inode)
+{
+	struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
+
+	if (layout == NULL)
+		return NULL;
+	else
+		return &FILELAYOUT_FROM_HDR(layout)->commit_info;
+}
+
 static struct pnfs_layoutdriver_type filelayout_type = {
 	.id			= LAYOUT_NFSV4_1_FILES,
 	.name			= "LAYOUT_NFSV4_1_FILES",
 	.owner			= THIS_MODULE,
+	.alloc_layout_hdr	= filelayout_alloc_layout_hdr,
+	.free_layout_hdr	= filelayout_free_layout_hdr,
 	.alloc_lseg		= filelayout_alloc_lseg,
 	.free_lseg		= filelayout_free_lseg,
 	.pg_read_ops		= &filelayout_pg_read_ops,
 	.pg_write_ops		= &filelayout_pg_write_ops,
+	.get_ds_info		= &filelayout_get_ds_info,
 	.mark_request_commit	= filelayout_mark_request_commit,
 	.clear_request_commit	= filelayout_clear_request_commit,
 	.scan_commit_lists	= filelayout_scan_commit_lists,
+	.recover_commit_reqs	= filelayout_recover_commit_reqs,
 	.commit_pagelist	= filelayout_commit_pagelist,
 	.read_pagelist		= filelayout_read_pagelist,
 	.write_pagelist		= filelayout_write_pagelist,
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 21190bb1f5e3..43fe802dd678 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -33,6 +33,13 @@
 #include "pnfs.h"
 
 /*
+ * Default data server connection timeout and retrans vaules.
+ * Set by module paramters dataserver_timeo and dataserver_retrans.
+ */
+#define NFS4_DEF_DS_TIMEO   60
+#define NFS4_DEF_DS_RETRANS 5
+
+/*
  * Field testing shows we need to support up to 4096 stripe indices.
  * We store each index as a u8 (u32 on the wire) to keep the memory footprint
  * reasonable. This in turn means we support a maximum of 256
@@ -41,6 +48,9 @@
 #define NFS4_PNFS_MAX_STRIPE_CNT 4096
 #define NFS4_PNFS_MAX_MULTI_CNT  256 /* 256 fit into a u8 stripe_index */
 
+/* error codes for internal use */
+#define NFS4ERR_RESET_TO_MDS   12001
+
 enum stripetype4 {
 	STRIPE_SPARSE = 1,
 	STRIPE_DENSE = 2
@@ -62,23 +72,14 @@ struct nfs4_pnfs_ds {
 	atomic_t		ds_count;
 };
 
-/* nfs4_file_layout_dsaddr flags */
-#define NFS4_DEVICE_ID_NEG_ENTRY	0x00000001
-
 struct nfs4_file_layout_dsaddr {
 	struct nfs4_deviceid_node	id_node;
-	unsigned long			flags;
 	u32				stripe_count;
 	u8				*stripe_indices;
 	u32				ds_num;
 	struct nfs4_pnfs_ds		*ds_list[1];
 };
 
-struct nfs4_fl_commit_bucket {
-	struct list_head written;
-	struct list_head committing;
-};
-
 struct nfs4_filelayout_segment {
 	struct pnfs_layout_segment generic_hdr;
 	u32 stripe_type;
@@ -89,10 +90,19 @@ struct nfs4_filelayout_segment {
 	struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
 	unsigned int num_fh;
 	struct nfs_fh **fh_array;
-	struct nfs4_fl_commit_bucket *commit_buckets; /* Sort commits to ds */
-	int number_of_buckets;
 };
 
+struct nfs4_filelayout {
+	struct pnfs_layout_hdr generic_hdr;
+	struct pnfs_ds_commit_info commit_info;
+};
+
+static inline struct nfs4_filelayout *
+FILELAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
+{
+	return container_of(lo, struct nfs4_filelayout, generic_hdr);
+}
+
 static inline struct nfs4_filelayout_segment *
 FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
 {
@@ -107,6 +117,36 @@ FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
 	return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
 }
 
+static inline void
+filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
+{
+	u32 *p = (u32 *)&node->deviceid;
+
+	printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n",
+		p[0], p[1], p[2], p[3]);
+
+	set_bit(NFS_DEVICEID_INVALID, &node->flags);
+}
+
+static inline bool
+filelayout_test_layout_invalid(struct pnfs_layout_hdr *lo)
+{
+	return test_bit(NFS_LAYOUT_INVALID, &lo->plh_flags);
+}
+
+static inline bool
+filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
+{
+	return test_bit(NFS_DEVICEID_INVALID, &node->flags);
+}
+
+static inline bool
+filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
+{
+	return filelayout_test_devid_invalid(FILELAYOUT_DEVID_NODE(lseg)) ||
+		filelayout_test_layout_invalid(lseg->pls_layout);
+}
+
 extern struct nfs_fh *
 nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
 
@@ -119,5 +159,6 @@ extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 struct nfs4_file_layout_dsaddr *
 get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
+void nfs4_ds_disconnect(struct nfs_client *clp);
 
 #endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index c9cff9adb2d3..a1fab8da7f03 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -30,12 +30,16 @@
 
 #include <linux/nfs_fs.h>
 #include <linux/vmalloc.h>
+#include <linux/module.h>
 
 #include "internal.h"
 #include "nfs4filelayout.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
 
+static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
+static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
+
 /*
  * Data server cache
  *
@@ -145,6 +149,28 @@ _data_server_lookup_locked(const struct list_head *dsaddrs)
 }
 
 /*
+ * Lookup DS by nfs_client pointer. Zero data server client pointer
+ */
+void nfs4_ds_disconnect(struct nfs_client *clp)
+{
+	struct nfs4_pnfs_ds *ds;
+	struct nfs_client *found = NULL;
+
+	dprintk("%s clp %p\n", __func__, clp);
+	spin_lock(&nfs4_ds_cache_lock);
+	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
+		if (ds->ds_clp && ds->ds_clp == clp) {
+			found = ds->ds_clp;
+			ds->ds_clp = NULL;
+		}
+	spin_unlock(&nfs4_ds_cache_lock);
+	if (found) {
+		set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
+		nfs_put_client(clp);
+	}
+}
+
+/*
  * Create an rpc connection to the nfs4_pnfs_ds data server
  * Currently only supports IPv4 and IPv6 addresses
  */
@@ -165,8 +191,9 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
 			__func__, ds->ds_remotestr, da->da_remotestr);
 
 		clp = nfs4_set_ds_client(mds_srv->nfs_client,
-				 (struct sockaddr *)&da->da_addr,
-				 da->da_addrlen, IPPROTO_TCP);
+					(struct sockaddr *)&da->da_addr,
+					da->da_addrlen, IPPROTO_TCP,
+					dataserver_timeo, dataserver_retrans);
 		if (!IS_ERR(clp))
 			break;
 	}
@@ -176,28 +203,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
 		goto out;
 	}
 
-	if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
-		if (!is_ds_client(clp)) {
-			status = -ENODEV;
-			goto out_put;
-		}
-		ds->ds_clp = clp;
-		dprintk("%s [existing] server=%s\n", __func__,
-			ds->ds_remotestr);
-		goto out;
-	}
-
-	/*
-	 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to
-	 * be equal to the MDS lease. Renewal is scheduled in create_session.
-	 */
-	spin_lock(&mds_srv->nfs_client->cl_lock);
-	clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
-	spin_unlock(&mds_srv->nfs_client->cl_lock);
-	clp->cl_last_renewal = jiffies;
-
-	/* New nfs_client */
-	status = nfs4_init_ds_session(clp);
+	status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
 	if (status)
 		goto out_put;
 
@@ -602,7 +608,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 
 		mp_count = be32_to_cpup(p); /* multipath count */
 		for (j = 0; j < mp_count; j++) {
-			da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->net,
+			da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
 					    &stream, gfp_flags);
 			if (da)
 				list_add_tail(&da->da_node, &dsaddrs);
@@ -791,48 +797,42 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
 	return flseg->fh_array[i];
 }
 
-static void
-filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
-			       int err, const char *ds_remotestr)
-{
-	u32 *p = (u32 *)&dsaddr->id_node.deviceid;
-
-	printk(KERN_ERR "NFS: data server %s connection error %d."
-		" Deviceid [%x%x%x%x] marked out of use.\n",
-		ds_remotestr, err, p[0], p[1], p[2], p[3]);
-
-	spin_lock(&nfs4_ds_cache_lock);
-	dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
-	spin_unlock(&nfs4_ds_cache_lock);
-}
-
 struct nfs4_pnfs_ds *
 nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 {
 	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
 	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
+	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
+
+	if (filelayout_test_devid_invalid(devid))
+		return NULL;
 
 	if (ds == NULL) {
 		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
 			__func__, ds_idx);
-		return NULL;
+		goto mark_dev_invalid;
 	}
 
 	if (!ds->ds_clp) {
 		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
 		int err;
 
-		if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
-			/* Already tried to connect, don't try again */
-			dprintk("%s Deviceid marked out of use\n", __func__);
-			return NULL;
-		}
 		err = nfs4_ds_connect(s, ds);
-		if (err) {
-			filelayout_mark_devid_negative(dsaddr, err,
-						       ds->ds_remotestr);
-			return NULL;
-		}
+		if (err)
+			goto mark_dev_invalid;
 	}
 	return ds;
+
+mark_dev_invalid:
+	filelayout_mark_devid_invalid(devid);
+	return NULL;
 }
+
+module_param(dataserver_retrans, uint, 0644);
+MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
+			"retries a request before it attempts further "
+			" recovery  action.");
+module_param(dataserver_timeo, uint, 0644);
+MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
+			"NFSv4.1  client  waits for a response from a "
+			" data server before it retries an NFS request.");
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index a7f3dedc4ec7..017b4b01a69c 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -132,6 +132,35 @@ static size_t nfs_parse_server_name(char *string, size_t len,
 	return ret;
 }
 
+rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
+{
+	struct gss_api_mech *mech;
+	struct xdr_netobj oid;
+	int i;
+	rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
+
+	for (i = 0; i < flavors->num_flavors; i++) {
+		struct nfs4_secinfo_flavor *flavor;
+		flavor = &flavors->flavors[i];
+
+		if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) {
+			pseudoflavor = flavor->flavor;
+			break;
+		} else if (flavor->flavor == RPC_AUTH_GSS) {
+			oid.len  = flavor->gss.sec_oid4.len;
+			oid.data = flavor->gss.sec_oid4.data;
+			mech = gss_mech_get_by_OID(&oid);
+			if (!mech)
+				continue;
+			pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service);
+			gss_mech_put(mech);
+			break;
+		}
+	}
+
+	return pseudoflavor;
+}
+
 static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name)
 {
 	struct page *page;
@@ -168,7 +197,7 @@ struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *ino
 	rpc_authflavor_t flavor;
 
 	flavor = nfs4_negotiate_security(inode, name);
-	if (flavor < 0)
+	if ((int)flavor < 0)
 		return ERR_PTR(flavor);
 
 	clone = rpc_clone_client(clnt);
@@ -300,7 +329,7 @@ out:
  * @dentry - dentry of referral
  *
  */
-struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry)
+static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry)
 {
 	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
 	struct dentry *parent;
@@ -341,3 +370,25 @@ out:
 	dprintk("%s: done\n", __func__);
 	return mnt;
 }
+
+struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
+			       struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct rpc_clnt *client;
+	struct vfsmount *mnt;
+
+	/* Look it up again to get its attributes and sec flavor */
+	client = nfs4_proc_lookup_mountpoint(parent->d_inode, &dentry->d_name, fh, fattr);
+	dput(parent);
+	if (IS_ERR(client))
+		return ERR_CAST(client);
+
+	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+		mnt = nfs_do_refmount(client, dentry);
+	else
+		mnt = nfs_do_submount(dentry, fh, fattr, client->cl_auth->au_flavor);
+
+	rpc_shutdown_client(client);
+	return mnt;
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ab985f6f0da8..d48dbefa0e71 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -64,6 +64,7 @@
 #include "iostat.h"
 #include "callback.h"
 #include "pnfs.h"
+#include "netns.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
@@ -80,6 +81,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
 static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
+static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			    struct nfs_fattr *fattr, struct iattr *sattr,
@@ -101,6 +103,8 @@ static int nfs4_map_errors(int err)
 	case -NFS4ERR_BADOWNER:
 	case -NFS4ERR_BADNAME:
 		return -EINVAL;
+	case -NFS4ERR_SHARE_DENIED:
+		return -EACCES;
 	default:
 		dprintk("%s could not handle NFSv4 error %d\n",
 				__func__, -err);
@@ -304,7 +308,7 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 		case -NFS4ERR_SEQ_MISORDERED:
 			dprintk("%s ERROR: %d Reset session\n", __func__,
 				errorcode);
-			nfs4_schedule_session_recovery(clp->cl_session);
+			nfs4_schedule_session_recovery(clp->cl_session, errorcode);
 			exception->retry = 1;
 			break;
 #endif /* defined(CONFIG_NFS_V4_1) */
@@ -772,7 +776,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 	struct nfs_inode *nfsi = NFS_I(dir);
 
 	spin_lock(&dir->i_lock);
-	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 	if (!cinfo->atomic || cinfo->before != dir->i_version)
 		nfs_force_lookup_revalidate(dir);
 	dir->i_version = cinfo->after;
@@ -788,7 +792,6 @@ struct nfs4_opendata {
 	struct nfs4_string owner_name;
 	struct nfs4_string group_name;
 	struct nfs_fattr f_attr;
-	struct nfs_fattr dir_attr;
 	struct dentry *dir;
 	struct dentry *dentry;
 	struct nfs4_state_owner *owner;
@@ -804,12 +807,10 @@ struct nfs4_opendata {
 static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 {
 	p->o_res.f_attr = &p->f_attr;
-	p->o_res.dir_attr = &p->dir_attr;
 	p->o_res.seqid = p->o_arg.seqid;
 	p->c_res.seqid = p->c_arg.seqid;
 	p->o_res.server = p->o_arg.server;
 	nfs_fattr_init(&p->f_attr);
-	nfs_fattr_init(&p->dir_attr);
 	nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
 }
 
@@ -843,7 +844,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p->o_arg.name = &dentry->d_name;
 	p->o_arg.server = server;
 	p->o_arg.bitmask = server->attr_bitmask;
-	p->o_arg.dir_bitmask = server->cache_consistency_bitmask;
 	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
 	if (attrs != NULL && attrs->ia_valid != 0) {
 		__be32 verf[2];
@@ -1332,7 +1332,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 			case -NFS4ERR_BAD_HIGH_SLOT:
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 			case -NFS4ERR_DEADSESSION:
-				nfs4_schedule_session_recovery(server->nfs_client->cl_session);
+				nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
 				goto out;
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_STALE_STATEID:
@@ -1611,8 +1611,6 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
 
 	nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr);
 
-	nfs_refresh_inode(dir, o_res->dir_attr);
-
 	if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
 		status = _nfs4_proc_open_confirm(data);
 		if (status != 0)
@@ -1645,11 +1643,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 
 	nfs_fattr_map_and_free_names(server, &data->f_attr);
 
-	if (o_arg->open_flags & O_CREAT) {
+	if (o_arg->open_flags & O_CREAT)
 		update_changeattr(dir, &o_res->cinfo);
-		nfs_post_op_update_inode(dir, o_res->dir_attr);
-	} else
-		nfs_refresh_inode(dir, o_res->dir_attr);
 	if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
 		server->caps &= ~NFS_CAP_POSIX_LOCK;
 	if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
@@ -1789,7 +1784,14 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
 /*
  * Returns a referenced nfs4_state
  */
-static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
+static int _nfs4_do_open(struct inode *dir,
+			struct dentry *dentry,
+			fmode_t fmode,
+			int flags,
+			struct iattr *sattr,
+			struct rpc_cred *cred,
+			struct nfs4_state **res,
+			struct nfs4_threshold **ctx_th)
 {
 	struct nfs4_state_owner  *sp;
 	struct nfs4_state     *state = NULL;
@@ -1814,6 +1816,11 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode
 	if (opendata == NULL)
 		goto err_put_state_owner;
 
+	if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
+		opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
+		if (!opendata->f_attr.mdsthreshold)
+			goto err_opendata_put;
+	}
 	if (dentry->d_inode != NULL)
 		opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
 
@@ -1839,11 +1846,19 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode
 			nfs_setattr_update_inode(state->inode, sattr);
 		nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
 	}
+
+	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
+		*ctx_th = opendata->f_attr.mdsthreshold;
+	else
+		kfree(opendata->f_attr.mdsthreshold);
+	opendata->f_attr.mdsthreshold = NULL;
+
 	nfs4_opendata_put(opendata);
 	nfs4_put_state_owner(sp);
 	*res = state;
 	return 0;
 err_opendata_put:
+	kfree(opendata->f_attr.mdsthreshold);
 	nfs4_opendata_put(opendata);
 err_put_state_owner:
 	nfs4_put_state_owner(sp);
@@ -1853,14 +1868,21 @@ out_err:
 }
 
 
-static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred)
+static struct nfs4_state *nfs4_do_open(struct inode *dir,
+					struct dentry *dentry,
+					fmode_t fmode,
+					int flags,
+					struct iattr *sattr,
+					struct rpc_cred *cred,
+					struct nfs4_threshold **ctx_th)
 {
 	struct nfs4_exception exception = { };
 	struct nfs4_state *res;
 	int status;
 
 	do {
-		status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, &res);
+		status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
+				       &res, ctx_th);
 		if (status == 0)
 			break;
 		/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2184,7 +2206,8 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags
 	struct nfs4_state *state;
 
 	/* Protect against concurrent sillydeletes */
-	state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr, ctx->cred);
+	state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr,
+			     ctx->cred, &ctx->mdsthreshold);
 	if (IS_ERR(state))
 		return ERR_CAST(state);
 	ctx->state = state;
@@ -2354,8 +2377,8 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 /*
  * get the file handle for the "/" directory on the server
  */
-static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
-			      struct nfs_fsinfo *info)
+int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
+			 struct nfs_fsinfo *info)
 {
 	int minor_version = server->nfs_client->cl_minorversion;
 	int status = nfs4_lookup_root(server, fhandle, info);
@@ -2372,6 +2395,31 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 	return nfs4_map_errors(status);
 }
 
+static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
+			      struct nfs_fsinfo *info)
+{
+	int error;
+	struct nfs_fattr *fattr = info->fattr;
+
+	error = nfs4_server_capabilities(server, mntfh);
+	if (error < 0) {
+		dprintk("nfs4_get_root: getcaps error = %d\n", -error);
+		return error;
+	}
+
+	error = nfs4_proc_getattr(server, mntfh, fattr);
+	if (error < 0) {
+		dprintk("nfs4_get_root: getattr error = %d\n", -error);
+		return error;
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_FSID &&
+	    !nfs_fsid_equal(&server->fsid, &fattr->fsid))
+		memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+
+	return error;
+}
+
 /*
  * Get locations and (maybe) other attributes of a referral.
  * Note that we'll actually follow the referral later when
@@ -2578,7 +2626,7 @@ out:
 	return err;
 }
 
-static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
+static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
 			    struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	int status;
@@ -2761,7 +2809,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		fmode = ctx->mode;
 	}
 	sattr->ia_mode &= ~current_umask();
-	state = nfs4_do_open(dir, de, fmode, flags, sattr, cred);
+	state = nfs4_do_open(dir, de, fmode, flags, sattr, cred, NULL);
 	d_drop(dentry);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
@@ -2783,7 +2831,6 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
 	struct nfs_removeargs args = {
 		.fh = NFS_FH(dir),
 		.name = *name,
-		.bitmask = server->attr_bitmask,
 	};
 	struct nfs_removeres res = {
 		.server = server,
@@ -2793,19 +2840,11 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
-	int status = -ENOMEM;
-
-	res.dir_attr = nfs_alloc_fattr();
-	if (res.dir_attr == NULL)
-		goto out;
+	int status;
 
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
-	if (status == 0) {
+	if (status == 0)
 		update_changeattr(dir, &res.cinfo);
-		nfs_post_op_update_inode(dir, res.dir_attr);
-	}
-	nfs_free_fattr(res.dir_attr);
-out:
 	return status;
 }
 
@@ -2827,7 +2866,6 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 	struct nfs_removeargs *args = msg->rpc_argp;
 	struct nfs_removeres *res = msg->rpc_resp;
 
-	args->bitmask = server->cache_consistency_bitmask;
 	res->server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 	nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
@@ -2852,7 +2890,6 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
 		return 0;
 	update_changeattr(dir, &res->cinfo);
-	nfs_post_op_update_inode(dir, res->dir_attr);
 	return 1;
 }
 
@@ -2863,7 +2900,6 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
 	struct nfs_renameres *res = msg->rpc_resp;
 
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
-	arg->bitmask = server->attr_bitmask;
 	res->server = server;
 	nfs41_init_sequence(&arg->seq_args, &res->seq_res, 1);
 }
@@ -2889,9 +2925,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
 		return 0;
 
 	update_changeattr(old_dir, &res->old_cinfo);
-	nfs_post_op_update_inode(old_dir, res->old_fattr);
 	update_changeattr(new_dir, &res->new_cinfo);
-	nfs_post_op_update_inode(new_dir, res->new_fattr);
 	return 1;
 }
 
@@ -2904,7 +2938,6 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.new_dir = NFS_FH(new_dir),
 		.old_name = old_name,
 		.new_name = new_name,
-		.bitmask = server->attr_bitmask,
 	};
 	struct nfs_renameres res = {
 		.server = server,
@@ -2916,21 +2949,11 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 	};
 	int status = -ENOMEM;
 	
-	res.old_fattr = nfs_alloc_fattr();
-	res.new_fattr = nfs_alloc_fattr();
-	if (res.old_fattr == NULL || res.new_fattr == NULL)
-		goto out;
-
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (!status) {
 		update_changeattr(old_dir, &res.old_cinfo);
-		nfs_post_op_update_inode(old_dir, res.old_fattr);
 		update_changeattr(new_dir, &res.new_cinfo);
-		nfs_post_op_update_inode(new_dir, res.new_fattr);
 	}
-out:
-	nfs_free_fattr(res.new_fattr);
-	nfs_free_fattr(res.old_fattr);
 	return status;
 }
 
@@ -2968,18 +2991,15 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 	int status = -ENOMEM;
 
 	res.fattr = nfs_alloc_fattr();
-	res.dir_attr = nfs_alloc_fattr();
-	if (res.fattr == NULL || res.dir_attr == NULL)
+	if (res.fattr == NULL)
 		goto out;
 
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (!status) {
 		update_changeattr(dir, &res.cinfo);
-		nfs_post_op_update_inode(dir, res.dir_attr);
 		nfs_post_op_update_inode(inode, res.fattr);
 	}
 out:
-	nfs_free_fattr(res.dir_attr);
 	nfs_free_fattr(res.fattr);
 	return status;
 }
@@ -3002,7 +3022,6 @@ struct nfs4_createdata {
 	struct nfs4_create_res res;
 	struct nfs_fh fh;
 	struct nfs_fattr fattr;
-	struct nfs_fattr dir_fattr;
 };
 
 static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -3026,9 +3045,7 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
 		data->res.server = server;
 		data->res.fh = &data->fh;
 		data->res.fattr = &data->fattr;
-		data->res.dir_fattr = &data->dir_fattr;
 		nfs_fattr_init(data->res.fattr);
-		nfs_fattr_init(data->res.dir_fattr);
 	}
 	return data;
 }
@@ -3039,7 +3056,6 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
 				    &data->arg.seq_args, &data->res.seq_res, 1);
 	if (status == 0) {
 		update_changeattr(dir, &data->res.dir_cinfo);
-		nfs_post_op_update_inode(dir, data->res.dir_fattr);
 		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
 	}
 	return status;
@@ -3335,12 +3351,12 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 void __nfs4_read_done_cb(struct nfs_read_data *data)
 {
-	nfs_invalidate_atime(data->inode);
+	nfs_invalidate_atime(data->header->inode);
 }
 
 static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
 {
-	struct nfs_server *server = NFS_SERVER(data->inode);
+	struct nfs_server *server = NFS_SERVER(data->header->inode);
 
 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
@@ -3375,7 +3391,7 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
 
 static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
 {
-	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+	if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
 				&data->args.seq_args,
 				&data->res.seq_res,
 				task))
@@ -3383,25 +3399,9 @@ static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_da
 	rpc_call_start(task);
 }
 
-/* Reset the the nfs_read_data to send the read to the MDS. */
-void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
-{
-	dprintk("%s Reset task for i/o through\n", __func__);
-	put_lseg(data->lseg);
-	data->lseg = NULL;
-	/* offsets will differ in the dense stripe case */
-	data->args.offset = data->mds_offset;
-	data->ds_clp = NULL;
-	data->args.fh     = NFS_FH(data->inode);
-	data->read_done_cb = nfs4_read_done_cb;
-	task->tk_ops = data->mds_ops;
-	rpc_task_reset_client(task, NFS_CLIENT(data->inode));
-}
-EXPORT_SYMBOL_GPL(nfs4_reset_read);
-
 static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
 {
-	struct inode *inode = data->inode;
+	struct inode *inode = data->header->inode;
 	
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
@@ -3409,7 +3409,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
 	}
 	if (task->tk_status >= 0) {
 		renew_lease(NFS_SERVER(inode), data->timestamp);
-		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
 	}
 	return 0;
 }
@@ -3422,32 +3422,30 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 		nfs4_write_done_cb(task, data);
 }
 
-/* Reset the the nfs_write_data to send the write to the MDS. */
-void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
+static
+bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data)
 {
-	dprintk("%s Reset task for i/o through\n", __func__);
-	put_lseg(data->lseg);
-	data->lseg          = NULL;
-	data->ds_clp        = NULL;
-	data->write_done_cb = nfs4_write_done_cb;
-	data->args.fh       = NFS_FH(data->inode);
-	data->args.bitmask  = data->res.server->cache_consistency_bitmask;
-	data->args.offset   = data->mds_offset;
-	data->res.fattr     = &data->fattr;
-	task->tk_ops        = data->mds_ops;
-	rpc_task_reset_client(task, NFS_CLIENT(data->inode));
+	const struct nfs_pgio_header *hdr = data->header;
+
+	/* Don't request attributes for pNFS or O_DIRECT writes */
+	if (data->ds_clp != NULL || hdr->dreq != NULL)
+		return false;
+	/* Otherwise, request attributes if and only if we don't hold
+	 * a delegation
+	 */
+	return nfs_have_delegation(hdr->inode, FMODE_READ) == 0;
 }
-EXPORT_SYMBOL_GPL(nfs4_reset_write);
 
 static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-	struct nfs_server *server = NFS_SERVER(data->inode);
+	struct nfs_server *server = NFS_SERVER(data->header->inode);
 
-	if (data->lseg) {
+	if (!nfs4_write_need_cache_consistency_data(data)) {
 		data->args.bitmask = NULL;
 		data->res.fattr = NULL;
 	} else
 		data->args.bitmask = server->cache_consistency_bitmask;
+
 	if (!data->write_done_cb)
 		data->write_done_cb = nfs4_write_done_cb;
 	data->res.server = server;
@@ -3459,6 +3457,16 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 
 static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
 {
+	if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
+				&data->args.seq_args,
+				&data->res.seq_res,
+				task))
+		return;
+	rpc_call_start(task);
+}
+
+static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
 	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
 				&data->args.seq_args,
 				&data->res.seq_res,
@@ -3467,7 +3475,7 @@ static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_
 	rpc_call_start(task);
 }
 
-static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data)
 {
 	struct inode *inode = data->inode;
 
@@ -3475,28 +3483,22 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
-	nfs_refresh_inode(inode, data->res.fattr);
 	return 0;
 }
 
-static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs4_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
 {
 	if (!nfs4_sequence_done(task, &data->res.seq_res))
 		return -EAGAIN;
-	return data->write_done_cb(task, data);
+	return data->commit_done_cb(task, data);
 }
 
-static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
+static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
-	if (data->lseg) {
-		data->args.bitmask = NULL;
-		data->res.fattr = NULL;
-	} else
-		data->args.bitmask = server->cache_consistency_bitmask;
-	if (!data->write_done_cb)
-		data->write_done_cb = nfs4_commit_done_cb;
+	if (data->commit_done_cb == NULL)
+		data->commit_done_cb = nfs4_commit_done_cb;
 	data->res.server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
 	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
@@ -3905,7 +3907,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_SEQ_MISORDERED:
 			dprintk("%s ERROR %d, Reset session\n", __func__,
 				task->tk_status);
-			nfs4_schedule_session_recovery(clp->cl_session);
+			nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
 			task->tk_status = 0;
 			return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
@@ -3931,13 +3933,21 @@ wait_on_recovery:
 	return -EAGAIN;
 }
 
-static void nfs4_construct_boot_verifier(struct nfs_client *clp,
-					 nfs4_verifier *bootverf)
+static void nfs4_init_boot_verifier(const struct nfs_client *clp,
+				    nfs4_verifier *bootverf)
 {
 	__be32 verf[2];
 
-	verf[0] = htonl((u32)clp->cl_boot_time.tv_sec);
-	verf[1] = htonl((u32)clp->cl_boot_time.tv_nsec);
+	if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
+		/* An impossible timestamp guarantees this value
+		 * will never match a generated boot time. */
+		verf[0] = 0;
+		verf[1] = (__be32)(NSEC_PER_SEC + 1);
+	} else {
+		struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
+		verf[0] = (__be32)nn->boot_time.tv_sec;
+		verf[1] = (__be32)nn->boot_time.tv_nsec;
+	}
 	memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
 
@@ -3960,7 +3970,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 	int loop = 0;
 	int status;
 
-	nfs4_construct_boot_verifier(clp, &sc_verifier);
+	nfs4_init_boot_verifier(clp, &sc_verifier);
 
 	for(;;) {
 		rcu_read_lock();
@@ -4104,7 +4114,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
 	data->args.fhandle = &data->fh;
 	data->args.stateid = &data->stateid;
-	data->args.bitmask = server->attr_bitmask;
+	data->args.bitmask = server->cache_consistency_bitmask;
 	nfs_copy_fh(&data->fh, NFS_FH(inode));
 	nfs4_stateid_copy(&data->stateid, stateid);
 	data->res.fattr = &data->fattr;
@@ -4125,9 +4135,10 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	if (status != 0)
 		goto out;
 	status = data->rpc_status;
-	if (status != 0)
-		goto out;
-	nfs_refresh_inode(inode, &data->fattr);
+	if (status == 0)
+		nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+	else
+		nfs_refresh_inode(inode, &data->fattr);
 out:
 	rpc_put_task(task);
 	return status;
@@ -4837,7 +4848,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 			case -NFS4ERR_BAD_HIGH_SLOT:
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 			case -NFS4ERR_DEADSESSION:
-				nfs4_schedule_session_recovery(server->nfs_client->cl_session);
+				nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
 				goto out;
 			case -ERESTARTSYS:
 				/*
@@ -5079,7 +5090,8 @@ out_inval:
 }
 
 static bool
-nfs41_same_server_scope(struct server_scope *a, struct server_scope *b)
+nfs41_same_server_scope(struct nfs41_server_scope *a,
+			struct nfs41_server_scope *b)
 {
 	if (a->server_scope_sz == b->server_scope_sz &&
 	    memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0)
@@ -5089,6 +5101,61 @@ nfs41_same_server_scope(struct server_scope *a, struct server_scope *b)
 }
 
 /*
+ * nfs4_proc_bind_conn_to_session()
+ *
+ * The 4.1 client currently uses the same TCP connection for the
+ * fore and backchannel.
+ */
+int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	int status;
+	struct nfs41_bind_conn_to_session_res res;
+	struct rpc_message msg = {
+		.rpc_proc =
+			&nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION],
+		.rpc_argp = clp,
+		.rpc_resp = &res,
+		.rpc_cred = cred,
+	};
+
+	dprintk("--> %s\n", __func__);
+	BUG_ON(clp == NULL);
+
+	res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
+	if (unlikely(res.session == NULL)) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	if (status == 0) {
+		if (memcmp(res.session->sess_id.data,
+		    clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
+			dprintk("NFS: %s: Session ID mismatch\n", __func__);
+			status = -EIO;
+			goto out_session;
+		}
+		if (res.dir != NFS4_CDFS4_BOTH) {
+			dprintk("NFS: %s: Unexpected direction from server\n",
+				__func__);
+			status = -EIO;
+			goto out_session;
+		}
+		if (res.use_conn_in_rdma_mode) {
+			dprintk("NFS: %s: Server returned RDMA mode = true\n",
+				__func__);
+			status = -EIO;
+			goto out_session;
+		}
+	}
+out_session:
+	kfree(res.session);
+out:
+	dprintk("<-- %s status= %d\n", __func__, status);
+	return status;
+}
+
+/*
  * nfs4_proc_exchange_id()
  *
  * Since the clientid has expired, all compounds using sessions
@@ -5105,7 +5172,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
 	};
 	struct nfs41_exchange_id_res res = {
-		.client = clp,
+		0
 	};
 	int status;
 	struct rpc_message msg = {
@@ -5118,7 +5185,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	dprintk("--> %s\n", __func__);
 	BUG_ON(clp == NULL);
 
-	nfs4_construct_boot_verifier(clp, &verifier);
+	nfs4_init_boot_verifier(clp, &verifier);
 
 	args.id_len = scnprintf(args.id, sizeof(args.id),
 				"%s/%s/%u",
@@ -5126,59 +5193,135 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 				clp->cl_rpcclient->cl_nodename,
 				clp->cl_rpcclient->cl_auth->au_flavor);
 
-	res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL);
-	if (unlikely(!res.server_scope)) {
+	res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
+					GFP_NOFS);
+	if (unlikely(res.server_owner == NULL)) {
 		status = -ENOMEM;
 		goto out;
 	}
 
-	res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_KERNEL);
-	if (unlikely(!res.impl_id)) {
+	res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
+					GFP_NOFS);
+	if (unlikely(res.server_scope == NULL)) {
+		status = -ENOMEM;
+		goto out_server_owner;
+	}
+
+	res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
+	if (unlikely(res.impl_id == NULL)) {
 		status = -ENOMEM;
 		goto out_server_scope;
 	}
 
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
-	if (!status)
-		status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
+	if (status == 0)
+		status = nfs4_check_cl_exchange_flags(res.flags);
+
+	if (status == 0) {
+		clp->cl_clientid = res.clientid;
+		clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R);
+		if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R))
+			clp->cl_seqid = res.seqid;
+
+		kfree(clp->cl_serverowner);
+		clp->cl_serverowner = res.server_owner;
+		res.server_owner = NULL;
 
-	if (!status) {
 		/* use the most recent implementation id */
-		kfree(clp->impl_id);
-		clp->impl_id = res.impl_id;
-	} else
-		kfree(res.impl_id);
+		kfree(clp->cl_implid);
+		clp->cl_implid = res.impl_id;
 
-	if (!status) {
-		if (clp->server_scope &&
-		    !nfs41_same_server_scope(clp->server_scope,
+		if (clp->cl_serverscope != NULL &&
+		    !nfs41_same_server_scope(clp->cl_serverscope,
 					     res.server_scope)) {
 			dprintk("%s: server_scope mismatch detected\n",
 				__func__);
 			set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
-			kfree(clp->server_scope);
-			clp->server_scope = NULL;
+			kfree(clp->cl_serverscope);
+			clp->cl_serverscope = NULL;
 		}
 
-		if (!clp->server_scope) {
-			clp->server_scope = res.server_scope;
+		if (clp->cl_serverscope == NULL) {
+			clp->cl_serverscope = res.server_scope;
 			goto out;
 		}
-	}
+	} else
+		kfree(res.impl_id);
 
+out_server_owner:
+	kfree(res.server_owner);
 out_server_scope:
 	kfree(res.server_scope);
 out:
-	if (clp->impl_id)
+	if (clp->cl_implid != NULL)
 		dprintk("%s: Server Implementation ID: "
 			"domain: %s, name: %s, date: %llu,%u\n",
-			__func__, clp->impl_id->domain, clp->impl_id->name,
-			clp->impl_id->date.seconds,
-			clp->impl_id->date.nseconds);
+			__func__, clp->cl_implid->domain, clp->cl_implid->name,
+			clp->cl_implid->date.seconds,
+			clp->cl_implid->date.nseconds);
 	dprintk("<-- %s status= %d\n", __func__, status);
 	return status;
 }
 
+static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
+		struct rpc_cred *cred)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_CLIENTID],
+		.rpc_argp = clp,
+		.rpc_cred = cred,
+	};
+	int status;
+
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+	if (status)
+		pr_warn("NFS: Got error %d from the server %s on "
+			"DESTROY_CLIENTID.", status, clp->cl_hostname);
+	return status;
+}
+
+static int nfs4_proc_destroy_clientid(struct nfs_client *clp,
+		struct rpc_cred *cred)
+{
+	unsigned int loop;
+	int ret;
+
+	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
+		ret = _nfs4_proc_destroy_clientid(clp, cred);
+		switch (ret) {
+		case -NFS4ERR_DELAY:
+		case -NFS4ERR_CLIENTID_BUSY:
+			ssleep(1);
+			break;
+		default:
+			return ret;
+		}
+	}
+	return 0;
+}
+
+int nfs4_destroy_clientid(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+	int ret = 0;
+
+	if (clp->cl_mvops->minor_version < 1)
+		goto out;
+	if (clp->cl_exchange_flags == 0)
+		goto out;
+	cred = nfs4_get_exchange_id_cred(clp);
+	ret = nfs4_proc_destroy_clientid(clp, cred);
+	if (cred)
+		put_rpccred(cred);
+	switch (ret) {
+	case 0:
+	case -NFS4ERR_STALE_CLIENTID:
+		clp->cl_exchange_flags = 0;
+	}
+out:
+	return ret;
+}
+
 struct nfs4_get_lease_time_data {
 	struct nfs4_get_lease_time_args *args;
 	struct nfs4_get_lease_time_res *res;
@@ -5399,8 +5542,12 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 void nfs4_destroy_session(struct nfs4_session *session)
 {
 	struct rpc_xprt *xprt;
+	struct rpc_cred *cred;
 
-	nfs4_proc_destroy_session(session);
+	cred = nfs4_get_exchange_id_cred(session->clp);
+	nfs4_proc_destroy_session(session, cred);
+	if (cred)
+		put_rpccred(cred);
 
 	rcu_read_lock();
 	xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
@@ -5510,7 +5657,8 @@ static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
 	return nfs4_verify_back_channel_attrs(args, session);
 }
 
-static int _nfs4_proc_create_session(struct nfs_client *clp)
+static int _nfs4_proc_create_session(struct nfs_client *clp,
+		struct rpc_cred *cred)
 {
 	struct nfs4_session *session = clp->cl_session;
 	struct nfs41_create_session_args args = {
@@ -5524,6 +5672,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp)
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
 		.rpc_argp = &args,
 		.rpc_resp = &res,
+		.rpc_cred = cred,
 	};
 	int status;
 
@@ -5548,7 +5697,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp)
  * It is the responsibility of the caller to verify the session is
  * expired before calling this routine.
  */
-int nfs4_proc_create_session(struct nfs_client *clp)
+int nfs4_proc_create_session(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	int status;
 	unsigned *ptr;
@@ -5556,7 +5705,7 @@ int nfs4_proc_create_session(struct nfs_client *clp)
 
 	dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
 
-	status = _nfs4_proc_create_session(clp);
+	status = _nfs4_proc_create_session(clp, cred);
 	if (status)
 		goto out;
 
@@ -5578,10 +5727,15 @@ out:
  * Issue the over-the-wire RPC DESTROY_SESSION.
  * The caller must serialize access to this routine.
  */
-int nfs4_proc_destroy_session(struct nfs4_session *session)
+int nfs4_proc_destroy_session(struct nfs4_session *session,
+		struct rpc_cred *cred)
 {
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION],
+		.rpc_argp = session,
+		.rpc_cred = cred,
+	};
 	int status = 0;
-	struct rpc_message msg;
 
 	dprintk("--> nfs4_proc_destroy_session\n");
 
@@ -5589,10 +5743,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
 	if (session->clp->cl_cons_state != NFS_CS_READY)
 		return status;
 
-	msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION];
-	msg.rpc_argp = session;
-	msg.rpc_resp = NULL;
-	msg.rpc_cred = NULL;
 	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 
 	if (status)
@@ -5604,53 +5754,79 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
 	return status;
 }
 
+/*
+ * With sessions, the client is not marked ready until after a
+ * successful EXCHANGE_ID and CREATE_SESSION.
+ *
+ * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate
+ * other versions of NFS can be tried.
+ */
+static int nfs41_check_session_ready(struct nfs_client *clp)
+{
+	int ret;
+	
+	if (clp->cl_cons_state == NFS_CS_SESSION_INITING) {
+		ret = nfs4_client_recover_expired_lease(clp);
+		if (ret)
+			return ret;
+	}
+	if (clp->cl_cons_state < NFS_CS_READY)
+		return -EPROTONOSUPPORT;
+	smp_rmb();
+	return 0;
+}
+
 int nfs4_init_session(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_session *session;
 	unsigned int rsize, wsize;
-	int ret;
 
 	if (!nfs4_has_session(clp))
 		return 0;
 
 	session = clp->cl_session;
-	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
-		return 0;
+	spin_lock(&clp->cl_lock);
+	if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
 
-	rsize = server->rsize;
-	if (rsize == 0)
-		rsize = NFS_MAX_FILE_IO_SIZE;
-	wsize = server->wsize;
-	if (wsize == 0)
-		wsize = NFS_MAX_FILE_IO_SIZE;
+		rsize = server->rsize;
+		if (rsize == 0)
+			rsize = NFS_MAX_FILE_IO_SIZE;
+		wsize = server->wsize;
+		if (wsize == 0)
+			wsize = NFS_MAX_FILE_IO_SIZE;
 
-	session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
-	session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
+		session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
+		session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
+	}
+	spin_unlock(&clp->cl_lock);
 
-	ret = nfs4_recover_expired_lease(server);
-	if (!ret)
-		ret = nfs4_check_client_ready(clp);
-	return ret;
+	return nfs41_check_session_ready(clp);
 }
 
-int nfs4_init_ds_session(struct nfs_client *clp)
+int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
 {
 	struct nfs4_session *session = clp->cl_session;
 	int ret;
 
-	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
-		return 0;
-
-	ret = nfs4_client_recover_expired_lease(clp);
-	if (!ret)
-		/* Test for the DS role */
-		if (!is_ds_client(clp))
-			ret = -ENODEV;
-	if (!ret)
-		ret = nfs4_check_client_ready(clp);
-	return ret;
+	spin_lock(&clp->cl_lock);
+	if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
+		/*
+		 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the
+		 * DS lease to be equal to the MDS lease.
+		 */
+		clp->cl_lease_time = lease_time;
+		clp->cl_last_renewal = jiffies;
+	}
+	spin_unlock(&clp->cl_lock);
 
+	ret = nfs41_check_session_ready(clp);
+	if (ret)
+		return ret;
+	/* Test for the DS role */
+	if (!is_ds_client(clp))
+		return -ENODEV;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
 
@@ -6557,6 +6733,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.file_inode_ops	= &nfs4_file_inode_operations,
 	.file_ops	= &nfs4_file_operations,
 	.getroot	= nfs4_proc_get_root,
+	.submount	= nfs4_submount,
 	.getattr	= nfs4_proc_getattr,
 	.setattr	= nfs4_proc_setattr,
 	.lookup		= nfs4_proc_lookup,
@@ -6589,13 +6766,13 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.write_rpc_prepare = nfs4_proc_write_rpc_prepare,
 	.write_done	= nfs4_write_done,
 	.commit_setup	= nfs4_proc_commit_setup,
+	.commit_rpc_prepare = nfs4_proc_commit_rpc_prepare,
 	.commit_done	= nfs4_commit_done,
 	.lock		= nfs4_proc_lock,
 	.clear_acl_cache = nfs4_zap_acl_attr,
 	.close_context  = nfs4_close_context,
 	.open_context	= nfs4_atomic_open,
 	.init_client	= nfs4_init_client,
-	.secinfo	= nfs4_proc_secinfo,
 };
 
 static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index dc484c0eae7f..6930bec91bca 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -49,7 +49,7 @@
 #include "nfs4_fs.h"
 #include "delegation.h"
 
-#define NFSDBG_FACILITY	NFSDBG_PROC
+#define NFSDBG_FACILITY		NFSDBG_STATE
 
 void
 nfs4_renew_state(struct work_struct *work)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 7f0fcfc1fe9d..c679b9ecef63 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -57,6 +57,8 @@
 #include "internal.h"
 #include "pnfs.h"
 
+#define NFSDBG_FACILITY		NFSDBG_STATE
+
 #define OPENOWNER_POOL_SIZE	8
 
 const nfs4_stateid zero_stateid;
@@ -254,7 +256,7 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 		goto out;
 	set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
 do_confirm:
-	status = nfs4_proc_create_session(clp);
+	status = nfs4_proc_create_session(clp, cred);
 	if (status != 0)
 		goto out;
 	clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
@@ -1106,6 +1108,8 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
 		return;
 	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
 		set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+	dprintk("%s: scheduling lease recovery for server %s\n", __func__,
+			clp->cl_hostname);
 	nfs4_schedule_state_manager(clp);
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
@@ -1122,6 +1126,8 @@ static void nfs40_handle_cb_pathdown(struct nfs_client *clp)
 {
 	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 	nfs_expire_all_delegations(clp);
+	dprintk("%s: handling CB_PATHDOWN recovery for server %s\n", __func__,
+			clp->cl_hostname);
 }
 
 void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
@@ -1158,6 +1164,8 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4
 	struct nfs_client *clp = server->nfs_client;
 
 	nfs4_state_mark_reclaim_nograce(clp, state);
+	dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
+			clp->cl_hostname);
 	nfs4_schedule_state_manager(clp);
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
@@ -1491,19 +1499,25 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 		case -NFS4ERR_BADSLOT:
 		case -NFS4ERR_BAD_HIGH_SLOT:
 		case -NFS4ERR_DEADSESSION:
-		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 		case -NFS4ERR_SEQ_FALSE_RETRY:
 		case -NFS4ERR_SEQ_MISORDERED:
 			set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
 			/* Zero session reset errors */
 			break;
+		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+			set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+			break;
 		case -EKEYEXPIRED:
 			/* Nothing we can do */
 			nfs4_warn_keyexpired(clp->cl_hostname);
 			break;
 		default:
+			dprintk("%s: failed to handle error %d for server %s\n",
+					__func__, error, clp->cl_hostname);
 			return error;
 	}
+	dprintk("%s: handled error %d for server %s\n", __func__, error,
+			clp->cl_hostname);
 	return 0;
 }
 
@@ -1572,34 +1586,82 @@ out:
 	return nfs4_recovery_handle_error(clp, status);
 }
 
+/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
+ * on EXCHANGE_ID for v4.1
+ */
+static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
+{
+	switch (status) {
+	case -NFS4ERR_SEQ_MISORDERED:
+		if (test_and_set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state))
+			return -ESERVERFAULT;
+		/* Lease confirmation error: retry after purging the lease */
+		ssleep(1);
+	case -NFS4ERR_CLID_INUSE:
+	case -NFS4ERR_STALE_CLIENTID:
+		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+		break;
+	case -EACCES:
+		if (clp->cl_machine_cred == NULL)
+			return -EACCES;
+		/* Handle case where the user hasn't set up machine creds */
+		nfs4_clear_machine_cred(clp);
+	case -NFS4ERR_DELAY:
+	case -ETIMEDOUT:
+	case -EAGAIN:
+		ssleep(1);
+		break;
+
+	case -NFS4ERR_MINOR_VERS_MISMATCH:
+		if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
+			nfs_mark_client_ready(clp, -EPROTONOSUPPORT);
+		dprintk("%s: exit with error %d for server %s\n",
+				__func__, -EPROTONOSUPPORT, clp->cl_hostname);
+		return -EPROTONOSUPPORT;
+	case -EKEYEXPIRED:
+		nfs4_warn_keyexpired(clp->cl_hostname);
+	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
+				 * in nfs4_exchange_id */
+	default:
+		dprintk("%s: exit with error %d for server %s\n", __func__,
+				status, clp->cl_hostname);
+		return status;
+	}
+	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+	dprintk("%s: handled error %d for server %s\n", __func__, status,
+			clp->cl_hostname);
+	return 0;
+}
+
 static int nfs4_reclaim_lease(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
 	const struct nfs4_state_recovery_ops *ops =
 		clp->cl_mvops->reboot_recovery_ops;
-	int status = -ENOENT;
+	int status;
 
 	cred = ops->get_clid_cred(clp);
-	if (cred != NULL) {
-		status = ops->establish_clid(clp, cred);
-		put_rpccred(cred);
-		/* Handle case where the user hasn't set up machine creds */
-		if (status == -EACCES && cred == clp->cl_machine_cred) {
-			nfs4_clear_machine_cred(clp);
-			status = -EAGAIN;
-		}
-		if (status == -NFS4ERR_MINOR_VERS_MISMATCH)
-			status = -EPROTONOSUPPORT;
-	}
-	return status;
+	if (cred == NULL)
+		return -ENOENT;
+	status = ops->establish_clid(clp, cred);
+	put_rpccred(cred);
+	if (status != 0)
+		return nfs4_handle_reclaim_lease_error(clp, status);
+	return 0;
 }
 
 #ifdef CONFIG_NFS_V4_1
-void nfs4_schedule_session_recovery(struct nfs4_session *session)
+void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 {
 	struct nfs_client *clp = session->clp;
 
-	set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+	switch (err) {
+	default:
+		set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+		break;
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+		set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+	}
 	nfs4_schedule_lease_recovery(clp);
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
@@ -1607,14 +1669,19 @@ EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
 void nfs41_handle_recall_slot(struct nfs_client *clp)
 {
 	set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
+	dprintk("%s: scheduling slot recall for server %s\n", __func__,
+			clp->cl_hostname);
 	nfs4_schedule_state_manager(clp);
 }
 
 static void nfs4_reset_all_state(struct nfs_client *clp)
 {
 	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
-		clp->cl_boot_time = CURRENT_TIME;
+		set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
 		nfs4_state_start_reclaim_nograce(clp);
+		dprintk("%s: scheduling reset of all state for server %s!\n",
+				__func__, clp->cl_hostname);
 		nfs4_schedule_state_manager(clp);
 	}
 }
@@ -1623,33 +1690,50 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
 {
 	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
 		nfs4_state_start_reclaim_reboot(clp);
+		dprintk("%s: server %s rebooted!\n", __func__,
+				clp->cl_hostname);
 		nfs4_schedule_state_manager(clp);
 	}
 }
 
 static void nfs41_handle_state_revoked(struct nfs_client *clp)
 {
-	/* Temporary */
 	nfs4_reset_all_state(clp);
+	dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
 }
 
 static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
 {
 	/* This will need to handle layouts too */
 	nfs_expire_all_delegations(clp);
+	dprintk("%s: Recallable state revoked on server %s!\n", __func__,
+			clp->cl_hostname);
 }
 
-static void nfs41_handle_cb_path_down(struct nfs_client *clp)
+static void nfs41_handle_backchannel_fault(struct nfs_client *clp)
 {
 	nfs_expire_all_delegations(clp);
 	if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
 		nfs4_schedule_state_manager(clp);
+	dprintk("%s: server %s declared a backchannel fault\n", __func__,
+			clp->cl_hostname);
+}
+
+static void nfs41_handle_cb_path_down(struct nfs_client *clp)
+{
+	if (test_and_set_bit(NFS4CLNT_BIND_CONN_TO_SESSION,
+		&clp->cl_state) == 0)
+		nfs4_schedule_state_manager(clp);
 }
 
 void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 {
 	if (!flags)
 		return;
+
+	dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n",
+		__func__, clp->cl_hostname, clp->cl_clientid, flags);
+
 	if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
 		nfs41_handle_server_reboot(clp);
 	if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
@@ -1659,18 +1743,21 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 		nfs41_handle_state_revoked(clp);
 	if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
 		nfs41_handle_recallable_state_revoked(clp);
-	if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
-			    SEQ4_STATUS_BACKCHANNEL_FAULT |
-			    SEQ4_STATUS_CB_PATH_DOWN_SESSION))
+	if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT)
+		nfs41_handle_backchannel_fault(clp);
+	else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
+				SEQ4_STATUS_CB_PATH_DOWN_SESSION))
 		nfs41_handle_cb_path_down(clp);
 }
 
 static int nfs4_reset_session(struct nfs_client *clp)
 {
+	struct rpc_cred *cred;
 	int status;
 
 	nfs4_begin_drain_session(clp);
-	status = nfs4_proc_destroy_session(clp->cl_session);
+	cred = nfs4_get_exchange_id_cred(clp);
+	status = nfs4_proc_destroy_session(clp->cl_session, cred);
 	if (status && status != -NFS4ERR_BADSESSION &&
 	    status != -NFS4ERR_DEADSESSION) {
 		status = nfs4_recovery_handle_error(clp, status);
@@ -1678,19 +1765,26 @@ static int nfs4_reset_session(struct nfs_client *clp)
 	}
 
 	memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
-	status = nfs4_proc_create_session(clp);
+	status = nfs4_proc_create_session(clp, cred);
 	if (status) {
-		status = nfs4_recovery_handle_error(clp, status);
+		dprintk("%s: session reset failed with status %d for server %s!\n",
+			__func__, status, clp->cl_hostname);
+		status = nfs4_handle_reclaim_lease_error(clp, status);
 		goto out;
 	}
 	clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
 	/* create_session negotiated new slot table */
 	clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
+	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+	dprintk("%s: session reset was successful for server %s!\n",
+			__func__, clp->cl_hostname);
 
 	 /* Let the state manager reestablish state */
 	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
 		nfs41_setup_state_renewal(clp);
 out:
+	if (cred)
+		put_rpccred(cred);
 	return status;
 }
 
@@ -1722,37 +1816,41 @@ static int nfs4_recall_slot(struct nfs_client *clp)
 	return 0;
 }
 
-#else /* CONFIG_NFS_V4_1 */
-static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
-static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
-static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
-#endif /* CONFIG_NFS_V4_1 */
-
-/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
- * on EXCHANGE_ID for v4.1
- */
-static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
+static int nfs4_bind_conn_to_session(struct nfs_client *clp)
 {
-	switch (status) {
-	case -NFS4ERR_CLID_INUSE:
-	case -NFS4ERR_STALE_CLIENTID:
-		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+	struct rpc_cred *cred;
+	int ret;
+
+	nfs4_begin_drain_session(clp);
+	cred = nfs4_get_exchange_id_cred(clp);
+	ret = nfs4_proc_bind_conn_to_session(clp, cred);
+	if (cred)
+		put_rpccred(cred);
+	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
+	switch (ret) {
+	case 0:
+		dprintk("%s: bind_conn_to_session was successful for server %s!\n",
+			__func__, clp->cl_hostname);
 		break;
 	case -NFS4ERR_DELAY:
-	case -ETIMEDOUT:
-	case -EAGAIN:
 		ssleep(1);
+		set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
 		break;
-
-	case -EKEYEXPIRED:
-		nfs4_warn_keyexpired(clp->cl_hostname);
-	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
-				 * in nfs4_exchange_id */
 	default:
-		return;
+		return nfs4_recovery_handle_error(clp, ret);
 	}
-	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+	return 0;
 }
+#else /* CONFIG_NFS_V4_1 */
+static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
+static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
+static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
+
+static int nfs4_bind_conn_to_session(struct nfs_client *clp)
+{
+	return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
 
 static void nfs4_state_manager(struct nfs_client *clp)
 {
@@ -1760,19 +1858,21 @@ static void nfs4_state_manager(struct nfs_client *clp)
 
 	/* Ensure exclusive access to NFSv4 state */
 	do {
+		if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
+			status = nfs4_reclaim_lease(clp);
+			if (status < 0)
+				goto out_error;
+			clear_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+		}
+
 		if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
 			/* We're going to have to re-establish a clientid */
 			status = nfs4_reclaim_lease(clp);
-			if (status) {
-				nfs4_set_lease_expired(clp, status);
-				if (test_bit(NFS4CLNT_LEASE_EXPIRED,
-							&clp->cl_state))
-					continue;
-				if (clp->cl_cons_state ==
-							NFS_CS_SESSION_INITING)
-					nfs_mark_client_ready(clp, status);
+			if (status < 0)
 				goto out_error;
-			}
+			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+				continue;
 			clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
 
 			if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH,
@@ -1803,6 +1903,15 @@ static void nfs4_state_manager(struct nfs_client *clp)
 				goto out_error;
 		}
 
+		/* Send BIND_CONN_TO_SESSION */
+		if (test_and_clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION,
+				&clp->cl_state) && nfs4_has_session(clp)) {
+			status = nfs4_bind_conn_to_session(clp);
+			if (status < 0)
+				goto out_error;
+			continue;
+		}
+
 		/* First recover reboot state... */
 		if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
 			status = nfs4_do_reclaim(clp,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c54aae364bee..ee4a74db95d0 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -53,9 +53,11 @@
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_idmap.h>
+
 #include "nfs4_fs.h"
 #include "internal.h"
 #include "pnfs.h"
+#include "netns.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
 
@@ -99,9 +101,12 @@ static int nfs4_stat_to_errno(int);
 #define nfs4_path_maxsz		(1 + ((3 + NFS4_MAXPATHLEN) >> 2))
 #define nfs4_owner_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
 #define nfs4_group_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
+/* We support only one layout type per file system */
+#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
 /* This is based on getfattr, which uses the most attributes: */
 #define nfs4_fattr_value_maxsz	(1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
-				3 + 3 + 3 + nfs4_owner_maxsz + nfs4_group_maxsz))
+				3 + 3 + 3 + nfs4_owner_maxsz + \
+				nfs4_group_maxsz + decode_mdsthreshold_maxsz))
 #define nfs4_fattr_maxsz	(nfs4_fattr_bitmap_maxsz + \
 				nfs4_fattr_value_maxsz)
 #define decode_getattr_maxsz    (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -321,8 +326,20 @@ static int nfs4_stat_to_errno(int);
 				     1 /* csr_flags */ + \
 				     decode_channel_attrs_maxsz + \
 				     decode_channel_attrs_maxsz)
+#define encode_bind_conn_to_session_maxsz  (op_encode_hdr_maxsz + \
+				     /* bctsa_sessid */ \
+				     XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+				     1 /* bctsa_dir */ + \
+				     1 /* bctsa_use_conn_in_rdma_mode */)
+#define decode_bind_conn_to_session_maxsz  (op_decode_hdr_maxsz +	\
+				     /* bctsr_sessid */ \
+				     XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \
+				     1 /* bctsr_dir */ + \
+				     1 /* bctsr_use_conn_in_rdma_mode */)
 #define encode_destroy_session_maxsz    (op_encode_hdr_maxsz + 4)
 #define decode_destroy_session_maxsz    (op_decode_hdr_maxsz)
+#define encode_destroy_clientid_maxsz   (op_encode_hdr_maxsz + 2)
+#define decode_destroy_clientid_maxsz   (op_decode_hdr_maxsz)
 #define encode_sequence_maxsz	(op_encode_hdr_maxsz + \
 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4)
 #define decode_sequence_maxsz	(op_decode_hdr_maxsz + \
@@ -421,30 +438,22 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_enc_commit_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
-				encode_commit_maxsz + \
-				encode_getattr_maxsz)
+				encode_commit_maxsz)
 #define NFS4_dec_commit_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
-				decode_commit_maxsz + \
-				decode_getattr_maxsz)
+				decode_commit_maxsz)
 #define NFS4_enc_open_sz        (compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
-				encode_savefh_maxsz + \
 				encode_open_maxsz + \
 				encode_getfh_maxsz + \
-				encode_getattr_maxsz + \
-				encode_restorefh_maxsz + \
 				encode_getattr_maxsz)
 #define NFS4_dec_open_sz        (compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
-				decode_savefh_maxsz + \
 				decode_open_maxsz + \
 				decode_getfh_maxsz + \
-				decode_getattr_maxsz + \
-				decode_restorefh_maxsz + \
 				decode_getattr_maxsz)
 #define NFS4_enc_open_confirm_sz \
 				(compound_encode_hdr_maxsz + \
@@ -595,47 +604,37 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_enc_remove_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
-				encode_remove_maxsz + \
-				encode_getattr_maxsz)
+				encode_remove_maxsz)
 #define NFS4_dec_remove_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
-				decode_remove_maxsz + \
-				decode_getattr_maxsz)
+				decode_remove_maxsz)
 #define NFS4_enc_rename_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
 				encode_savefh_maxsz + \
 				encode_putfh_maxsz + \
-				encode_rename_maxsz + \
-				encode_getattr_maxsz + \
-				encode_restorefh_maxsz + \
-				encode_getattr_maxsz)
+				encode_rename_maxsz)
 #define NFS4_dec_rename_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_savefh_maxsz + \
 				decode_putfh_maxsz + \
-				decode_rename_maxsz + \
-				decode_getattr_maxsz + \
-				decode_restorefh_maxsz + \
-				decode_getattr_maxsz)
+				decode_rename_maxsz)
 #define NFS4_enc_link_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
 				encode_savefh_maxsz + \
 				encode_putfh_maxsz + \
 				encode_link_maxsz + \
-				decode_getattr_maxsz + \
 				encode_restorefh_maxsz + \
-				decode_getattr_maxsz)
+				encode_getattr_maxsz)
 #define NFS4_dec_link_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
 				decode_savefh_maxsz + \
 				decode_putfh_maxsz + \
 				decode_link_maxsz + \
-				decode_getattr_maxsz + \
 				decode_restorefh_maxsz + \
 				decode_getattr_maxsz)
 #define NFS4_enc_symlink_sz	(compound_encode_hdr_maxsz + \
@@ -653,20 +652,14 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_enc_create_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
-				encode_savefh_maxsz + \
 				encode_create_maxsz + \
 				encode_getfh_maxsz + \
-				encode_getattr_maxsz + \
-				encode_restorefh_maxsz + \
 				encode_getattr_maxsz)
 #define NFS4_dec_create_sz	(compound_decode_hdr_maxsz + \
 				decode_sequence_maxsz + \
 				decode_putfh_maxsz + \
-				decode_savefh_maxsz + \
 				decode_create_maxsz + \
 				decode_getfh_maxsz + \
-				decode_getattr_maxsz + \
-				decode_restorefh_maxsz + \
 				decode_getattr_maxsz)
 #define NFS4_enc_pathconf_sz	(compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
@@ -738,6 +731,12 @@ static int nfs4_stat_to_errno(int);
 				decode_putfh_maxsz + \
 				decode_secinfo_maxsz)
 #if defined(CONFIG_NFS_V4_1)
+#define NFS4_enc_bind_conn_to_session_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_bind_conn_to_session_maxsz)
+#define NFS4_dec_bind_conn_to_session_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_bind_conn_to_session_maxsz)
 #define NFS4_enc_exchange_id_sz \
 				(compound_encode_hdr_maxsz + \
 				 encode_exchange_id_maxsz)
@@ -754,6 +753,10 @@ static int nfs4_stat_to_errno(int);
 					 encode_destroy_session_maxsz)
 #define NFS4_dec_destroy_session_sz	(compound_decode_hdr_maxsz + \
 					 decode_destroy_session_maxsz)
+#define NFS4_enc_destroy_clientid_sz	(compound_encode_hdr_maxsz + \
+					 encode_destroy_clientid_maxsz)
+#define NFS4_dec_destroy_clientid_sz	(compound_decode_hdr_maxsz + \
+					 decode_destroy_clientid_maxsz)
 #define NFS4_enc_sequence_sz \
 				(compound_decode_hdr_maxsz + \
 				 encode_sequence_maxsz)
@@ -1103,7 +1106,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 	encode_nfs4_stateid(xdr, arg->stateid);
 }
 
-static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
+static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
@@ -1194,6 +1197,16 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
 			   bitmask[1] & nfs4_fattr_bitmap[1], hdr);
 }
 
+static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
+				 struct compound_hdr *hdr)
+{
+	encode_getattr_three(xdr,
+			     bitmask[0] & nfs4_fattr_bitmap[0],
+			     bitmask[1] & nfs4_fattr_bitmap[1],
+			     bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD,
+			     hdr);
+}
+
 static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
 {
 	encode_getattr_three(xdr,
@@ -1678,6 +1691,20 @@ static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, stru
 
 #if defined(CONFIG_NFS_V4_1)
 /* NFSv4.1 operations */
+static void encode_bind_conn_to_session(struct xdr_stream *xdr,
+				   struct nfs4_session *session,
+				   struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION,
+		decode_bind_conn_to_session_maxsz, hdr);
+	encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	p = xdr_reserve_space(xdr, 8);
+	*p++ = cpu_to_be32(NFS4_CDFC4_BACK_OR_BOTH);
+	*p = 0;	/* use_conn_in_rdma_mode = False */
+}
+
 static void encode_exchange_id(struct xdr_stream *xdr,
 			       struct nfs41_exchange_id_args *args,
 			       struct compound_hdr *hdr)
@@ -1726,6 +1753,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
 	uint32_t len;
 	struct nfs_client *clp = args->client;
+	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
 	u32 max_resp_sz_cached;
 
 	/*
@@ -1767,7 +1795,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(RPC_AUTH_UNIX);			/* auth_sys */
 
 	/* authsys_parms rfc1831 */
-	*p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
+	*p++ = (__be32)nn->boot_time.tv_nsec;		/* stamp */
 	p = xdr_encode_opaque(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
@@ -1782,6 +1810,14 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 	encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 }
 
+static void encode_destroy_clientid(struct xdr_stream *xdr,
+				   uint64_t clientid,
+				   struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_DESTROY_CLIENTID, decode_destroy_clientid_maxsz, hdr);
+	encode_uint64(xdr, clientid);
+}
+
 static void encode_reclaim_complete(struct xdr_stream *xdr,
 				    struct nfs41_reclaim_complete_args *args,
 				    struct compound_hdr *hdr)
@@ -2064,7 +2100,6 @@ static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_remove(xdr, &args->name, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -2084,9 +2119,6 @@ static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_savefh(xdr, &hdr);
 	encode_putfh(xdr, args->new_dir, &hdr);
 	encode_rename(xdr, args->old_name, args->new_name, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
-	encode_restorefh(xdr, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -2106,7 +2138,6 @@ static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_savefh(xdr, &hdr);
 	encode_putfh(xdr, args->dir_fh, &hdr);
 	encode_link(xdr, args->name, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_restorefh(xdr, &hdr);
 	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
@@ -2125,12 +2156,9 @@ static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->dir_fh, &hdr);
-	encode_savefh(xdr, &hdr);
 	encode_create(xdr, args, &hdr);
 	encode_getfh(xdr, &hdr);
 	encode_getfattr(xdr, args->bitmask, &hdr);
-	encode_restorefh(xdr, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -2191,12 +2219,9 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
-	encode_savefh(xdr, &hdr);
 	encode_open(xdr, args, &hdr);
 	encode_getfh(xdr, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
-	encode_restorefh(xdr, &hdr);
-	encode_getfattr(xdr, args->dir_bitmask, &hdr);
+	encode_getfattr_open(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -2448,7 +2473,7 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
  *  a COMMIT request
  */
 static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
-				struct nfs_writeargs *args)
+				struct nfs_commitargs *args)
 {
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
@@ -2458,8 +2483,6 @@ static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_commit(xdr, args, &hdr);
-	if (args->bitmask)
-		encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -2602,8 +2625,8 @@ static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fhandle, &hdr);
-	encode_delegreturn(xdr, args->stateid, &hdr);
 	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_delegreturn(xdr, args->stateid, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -2651,6 +2674,22 @@ static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,
 
 #if defined(CONFIG_NFS_V4_1)
 /*
+ * BIND_CONN_TO_SESSION request
+ */
+static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req,
+				struct xdr_stream *xdr,
+				struct nfs_client *clp)
+{
+	struct compound_hdr hdr = {
+		.minorversion = clp->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_bind_conn_to_session(xdr, clp->cl_session, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
  * EXCHANGE_ID request
  */
 static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req,
@@ -2699,6 +2738,22 @@ static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req,
 }
 
 /*
+ * a DESTROY_CLIENTID request
+ */
+static void nfs4_xdr_enc_destroy_clientid(struct rpc_rqst *req,
+					 struct xdr_stream *xdr,
+					 struct nfs_client *clp)
+{
+	struct compound_hdr hdr = {
+		.minorversion = clp->cl_mvops->minor_version,
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_destroy_clientid(xdr, clp->cl_clientid, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
  * a SEQUENCE request
  */
 static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -4102,7 +4157,7 @@ static int decode_verifier(struct xdr_stream *xdr, void *verifier)
 	return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE);
 }
 
-static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
+static int decode_commit(struct xdr_stream *xdr, struct nfs_commitres *res)
 {
 	int status;
 
@@ -4220,6 +4275,110 @@ xdr_error:
 	return status;
 }
 
+static int decode_threshold_hint(struct xdr_stream *xdr,
+				  uint32_t *bitmap,
+				  uint64_t *res,
+				  uint32_t hint_bit)
+{
+	__be32 *p;
+
+	*res = 0;
+	if (likely(bitmap[0] & hint_bit)) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, res);
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_first_threshold_item4(struct xdr_stream *xdr,
+					struct nfs4_threshold *res)
+{
+	__be32 *p, *savep;
+	uint32_t bitmap[3] = {0,}, attrlen;
+	int status;
+
+	/* layout type */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p)) {
+		print_overflow_msg(__func__, xdr);
+		return -EIO;
+	}
+	res->l_type = be32_to_cpup(p);
+
+	/* thi_hintset bitmap */
+	status = decode_attr_bitmap(xdr, bitmap);
+	if (status < 0)
+		goto xdr_error;
+
+	/* thi_hintlist length */
+	status = decode_attr_length(xdr, &attrlen, &savep);
+	if (status < 0)
+		goto xdr_error;
+	/* thi_hintlist */
+	status = decode_threshold_hint(xdr, bitmap, &res->rd_sz, THRESHOLD_RD);
+	if (status < 0)
+		goto xdr_error;
+	status = decode_threshold_hint(xdr, bitmap, &res->wr_sz, THRESHOLD_WR);
+	if (status < 0)
+		goto xdr_error;
+	status = decode_threshold_hint(xdr, bitmap, &res->rd_io_sz,
+				       THRESHOLD_RD_IO);
+	if (status < 0)
+		goto xdr_error;
+	status = decode_threshold_hint(xdr, bitmap, &res->wr_io_sz,
+				       THRESHOLD_WR_IO);
+	if (status < 0)
+		goto xdr_error;
+
+	status = verify_attr_len(xdr, savep, attrlen);
+	res->bm = bitmap[0];
+
+	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
+		 __func__, res->bm, res->rd_sz, res->wr_sz, res->rd_io_sz,
+		res->wr_io_sz);
+xdr_error:
+	dprintk("%s ret=%d!\n", __func__, status);
+	return status;
+}
+
+/*
+ * Thresholds on pNFS direct I/O vrs MDS I/O
+ */
+static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
+				    uint32_t *bitmap,
+				    struct nfs4_threshold *res)
+{
+	__be32 *p;
+	int status = 0;
+	uint32_t num;
+
+	if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U)))
+		return -EIO;
+	if (likely(bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		num = be32_to_cpup(p);
+		if (num == 0)
+			return 0;
+		if (num > 1)
+			printk(KERN_INFO "%s: Warning: Multiple pNFS layout "
+				"drivers per filesystem not supported\n",
+				__func__);
+
+		status = decode_first_threshold_item4(xdr, res);
+	}
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 		struct nfs_fattr *fattr, struct nfs_fh *fh,
 		struct nfs4_fs_locations *fs_loc,
@@ -4326,6 +4485,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 		goto xdr_error;
 	fattr->valid |= status;
 
+	status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold);
+	if (status < 0)
+		goto xdr_error;
+
 xdr_error:
 	dprintk("%s: xdr returned %d\n", __func__, -status);
 	return status;
@@ -5156,7 +5319,6 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	uint32_t dummy;
 	char *dummy_str;
 	int status;
-	struct nfs_client *clp = res->client;
 	uint32_t impl_id_count;
 
 	status = decode_op_hdr(xdr, OP_EXCHANGE_ID);
@@ -5166,36 +5328,39 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
-	xdr_decode_hyper(p, &clp->cl_clientid);
+	xdr_decode_hyper(p, &res->clientid);
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
 		goto out_overflow;
-	clp->cl_seqid = be32_to_cpup(p++);
-	clp->cl_exchange_flags = be32_to_cpup(p++);
+	res->seqid = be32_to_cpup(p++);
+	res->flags = be32_to_cpup(p++);
 
 	/* We ask for SP4_NONE */
 	dummy = be32_to_cpup(p);
 	if (dummy != SP4_NONE)
 		return -EIO;
 
-	/* Throw away minor_id */
+	/* server_owner4.so_minor_id */
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
+	p = xdr_decode_hyper(p, &res->server_owner->minor_id);
 
-	/* Throw away Major id */
+	/* server_owner4.so_major_id */
 	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
 	if (unlikely(status))
 		return status;
+	if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
+		return -EIO;
+	memcpy(res->server_owner->major_id, dummy_str, dummy);
+	res->server_owner->major_id_sz = dummy;
 
-	/* Save server_scope */
+	/* server_scope4 */
 	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
 	if (unlikely(status))
 		return status;
-
 	if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
 		return -EIO;
-
 	memcpy(res->server_scope->server_scope, dummy_str, dummy);
 	res->server_scope->server_scope_sz = dummy;
 
@@ -5276,6 +5441,37 @@ static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
 	return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);
 }
 
+static int decode_bind_conn_to_session(struct xdr_stream *xdr,
+				struct nfs41_bind_conn_to_session_res *res)
+{
+	__be32 *p;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION);
+	if (!status)
+		status = decode_sessionid(xdr, &res->session->sess_id);
+	if (unlikely(status))
+		return status;
+
+	/* dir flags, rdma mode bool */
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
+
+	res->dir = be32_to_cpup(p++);
+	if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH)
+		return -EIO;
+	if (be32_to_cpup(p) == 0)
+		res->use_conn_in_rdma_mode = false;
+	else
+		res->use_conn_in_rdma_mode = true;
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 static int decode_create_session(struct xdr_stream *xdr,
 				 struct nfs41_create_session_res *res)
 {
@@ -5312,6 +5508,11 @@ static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
 	return decode_op_hdr(xdr, OP_DESTROY_SESSION);
 }
 
+static int decode_destroy_clientid(struct xdr_stream *xdr, void *dummy)
+{
+	return decode_op_hdr(xdr, OP_DESTROY_CLIENTID);
+}
+
 static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy)
 {
 	return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE);
@@ -5800,9 +6001,6 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	if (status)
 		goto out;
 	status = decode_remove(xdr, &res->cinfo);
-	if (status)
-		goto out;
-	decode_getfattr(xdr, res->dir_attr, res->server);
 out:
 	return status;
 }
@@ -5832,15 +6030,6 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	if (status)
 		goto out;
 	status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);
-	if (status)
-		goto out;
-	/* Current FH is target directory */
-	if (decode_getfattr(xdr, res->new_fattr, res->server))
-		goto out;
-	status = decode_restorefh(xdr);
-	if (status)
-		goto out;
-	decode_getfattr(xdr, res->old_fattr, res->server);
 out:
 	return status;
 }
@@ -5876,8 +6065,6 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	 * Note order: OP_LINK leaves the directory as the current
 	 *             filehandle.
 	 */
-	if (decode_getfattr(xdr, res->dir_attr, res->server))
-		goto out;
 	status = decode_restorefh(xdr);
 	if (status)
 		goto out;
@@ -5904,21 +6091,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_savefh(xdr);
-	if (status)
-		goto out;
 	status = decode_create(xdr, &res->dir_cinfo);
 	if (status)
 		goto out;
 	status = decode_getfh(xdr, res->fh);
 	if (status)
 		goto out;
-	if (decode_getfattr(xdr, res->fattr, res->server))
-		goto out;
-	status = decode_restorefh(xdr);
-	if (status)
-		goto out;
-	decode_getfattr(xdr, res->dir_fattr, res->server);
+	decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -6075,19 +6254,12 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_savefh(xdr);
-	if (status)
-		goto out;
 	status = decode_open(xdr, res);
 	if (status)
 		goto out;
 	if (decode_getfh(xdr, &res->fh) != 0)
 		goto out;
-	if (decode_getfattr(xdr, res->f_attr, res->server) != 0)
-		goto out;
-	if (decode_restorefh(xdr) != 0)
-		goto out;
-	decode_getfattr(xdr, res->dir_attr, res->server);
+	decode_getfattr(xdr, res->f_attr, res->server);
 out:
 	return status;
 }
@@ -6353,7 +6525,7 @@ out:
  * Decode COMMIT response
  */
 static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
-			       struct nfs_writeres *res)
+			       struct nfs_commitres *res)
 {
 	struct compound_hdr hdr;
 	int status;
@@ -6368,10 +6540,6 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	if (status)
 		goto out;
 	status = decode_commit(xdr, res);
-	if (status)
-		goto out;
-	if (res->fattr)
-		decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -6527,10 +6695,10 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
 	status = decode_putfh(xdr);
 	if (status != 0)
 		goto out;
-	status = decode_delegreturn(xdr);
+	status = decode_getfattr(xdr, res->fattr, res->server);
 	if (status != 0)
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server);
+	status = decode_delegreturn(xdr);
 out:
 	return status;
 }
@@ -6591,6 +6759,22 @@ out:
 
 #if defined(CONFIG_NFS_V4_1)
 /*
+ * Decode BIND_CONN_TO_SESSION response
+ */
+static int nfs4_xdr_dec_bind_conn_to_session(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_bind_conn_to_session(xdr, res);
+	return status;
+}
+
+/*
  * Decode EXCHANGE_ID response
  */
 static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp,
@@ -6639,6 +6823,22 @@ static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp,
 }
 
 /*
+ * Decode DESTROY_CLIENTID response
+ */
+static int nfs4_xdr_dec_destroy_clientid(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					void *res)
+{
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (!status)
+		status = decode_destroy_clientid(xdr, res);
+	return status;
+}
+
+/*
  * Decode SEQUENCE response
  */
 static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp,
@@ -7085,6 +7285,9 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(TEST_STATEID,	enc_test_stateid,	dec_test_stateid),
 	PROC(FREE_STATEID,	enc_free_stateid,	dec_free_stateid),
 	PROC(GETDEVICELIST,	enc_getdevicelist,	dec_getdevicelist),
+	PROC(BIND_CONN_TO_SESSION,
+			enc_bind_conn_to_session, dec_bind_conn_to_session),
+	PROC(DESTROY_CLIENTID,	enc_destroy_clientid,	dec_destroy_clientid),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 4bff4a3dab46..b47277baebab 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -211,7 +211,7 @@ static void copy_single_comp(struct ore_components *oc, unsigned c,
 	memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
 }
 
-int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
+static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
 		       struct objio_segment **pseg)
 {
 /*	This is the in memory structure of the objio_segment
@@ -440,11 +440,12 @@ static void _read_done(struct ore_io_state *ios, void *private)
 
 int objio_read_pagelist(struct nfs_read_data *rdata)
 {
+	struct nfs_pgio_header *hdr = rdata->header;
 	struct objio_state *objios;
 	int ret;
 
-	ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
-			rdata->lseg, rdata->args.pages, rdata->args.pgbase,
+	ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
+			hdr->lseg, rdata->args.pages, rdata->args.pgbase,
 			rdata->args.offset, rdata->args.count, rdata,
 			GFP_KERNEL, &objios);
 	if (unlikely(ret))
@@ -483,12 +484,12 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
 {
 	struct objio_state *objios = priv;
 	struct nfs_write_data *wdata = objios->oir.rpcdata;
+	struct address_space *mapping = wdata->header->inode->i_mapping;
 	pgoff_t index = offset / PAGE_SIZE;
-	struct page *page = find_get_page(wdata->inode->i_mapping, index);
+	struct page *page = find_get_page(mapping, index);
 
 	if (!page) {
-		page = find_or_create_page(wdata->inode->i_mapping,
-						index, GFP_NOFS);
+		page = find_or_create_page(mapping, index, GFP_NOFS);
 		if (unlikely(!page)) {
 			dprintk("%s: grab_cache_page Failed index=0x%lx\n",
 				__func__, index);
@@ -518,11 +519,12 @@ static const struct _ore_r4w_op _r4w_op = {
 
 int objio_write_pagelist(struct nfs_write_data *wdata, int how)
 {
+	struct nfs_pgio_header *hdr = wdata->header;
 	struct objio_state *objios;
 	int ret;
 
-	ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false,
-			wdata->lseg, wdata->args.pages, wdata->args.pgbase,
+	ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
+			hdr->lseg, wdata->args.pages, wdata->args.pgbase,
 			wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
 			&objios);
 	if (unlikely(ret))
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 595c5fc21a19..874613545301 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -258,7 +258,7 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 	if (status >= 0)
 		rdata->res.count = status;
 	else
-		rdata->pnfs_error = status;
+		rdata->header->pnfs_error = status;
 	objlayout_iodone(oir);
 	/* must not use oir after this point */
 
@@ -279,12 +279,14 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 enum pnfs_try_status
 objlayout_read_pagelist(struct nfs_read_data *rdata)
 {
+	struct nfs_pgio_header *hdr = rdata->header;
+	struct inode *inode = hdr->inode;
 	loff_t offset = rdata->args.offset;
 	size_t count = rdata->args.count;
 	int err;
 	loff_t eof;
 
-	eof = i_size_read(rdata->inode);
+	eof = i_size_read(inode);
 	if (unlikely(offset + count > eof)) {
 		if (offset >= eof) {
 			err = 0;
@@ -297,17 +299,17 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
 	}
 
 	rdata->res.eof = (offset + count) >= eof;
-	_fix_verify_io_params(rdata->lseg, &rdata->args.pages,
+	_fix_verify_io_params(hdr->lseg, &rdata->args.pages,
 			      &rdata->args.pgbase,
 			      rdata->args.offset, rdata->args.count);
 
 	dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
-		__func__, rdata->inode->i_ino, offset, count, rdata->res.eof);
+		__func__, inode->i_ino, offset, count, rdata->res.eof);
 
 	err = objio_read_pagelist(rdata);
  out:
 	if (unlikely(err)) {
-		rdata->pnfs_error = err;
+		hdr->pnfs_error = err;
 		dprintk("%s: Returned Error %d\n", __func__, err);
 		return PNFS_NOT_ATTEMPTED;
 	}
@@ -340,7 +342,7 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 		wdata->res.count = status;
 		wdata->verf.committed = oir->committed;
 	} else {
-		wdata->pnfs_error = status;
+		wdata->header->pnfs_error = status;
 	}
 	objlayout_iodone(oir);
 	/* must not use oir after this point */
@@ -363,15 +365,16 @@ enum pnfs_try_status
 objlayout_write_pagelist(struct nfs_write_data *wdata,
 			 int how)
 {
+	struct nfs_pgio_header *hdr = wdata->header;
 	int err;
 
-	_fix_verify_io_params(wdata->lseg, &wdata->args.pages,
+	_fix_verify_io_params(hdr->lseg, &wdata->args.pages,
 			      &wdata->args.pgbase,
 			      wdata->args.offset, wdata->args.count);
 
 	err = objio_write_pagelist(wdata, how);
 	if (unlikely(err)) {
-		wdata->pnfs_error = err;
+		hdr->pnfs_error = err;
 		dprintk("%s: Returned Error %d\n", __func__, err);
 		return PNFS_NOT_ATTEMPTED;
 	}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index d21fceaa9f62..aed913c833f4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,6 +26,47 @@
 
 static struct kmem_cache *nfs_page_cachep;
 
+bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
+{
+	p->npages = pagecount;
+	if (pagecount <= ARRAY_SIZE(p->page_array))
+		p->pagevec = p->page_array;
+	else {
+		p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+		if (!p->pagevec)
+			p->npages = 0;
+	}
+	return p->pagevec != NULL;
+}
+
+void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
+		       struct nfs_pgio_header *hdr,
+		       void (*release)(struct nfs_pgio_header *hdr))
+{
+	hdr->req = nfs_list_entry(desc->pg_list.next);
+	hdr->inode = desc->pg_inode;
+	hdr->cred = hdr->req->wb_context->cred;
+	hdr->io_start = req_offset(hdr->req);
+	hdr->good_bytes = desc->pg_count;
+	hdr->dreq = desc->pg_dreq;
+	hdr->release = release;
+	hdr->completion_ops = desc->pg_completion_ops;
+	if (hdr->completion_ops->init_hdr)
+		hdr->completion_ops->init_hdr(hdr);
+}
+
+void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
+{
+	spin_lock(&hdr->lock);
+	if (pos < hdr->io_start + hdr->good_bytes) {
+		set_bit(NFS_IOHDR_ERROR, &hdr->flags);
+		clear_bit(NFS_IOHDR_EOF, &hdr->flags);
+		hdr->good_bytes = pos - hdr->io_start;
+		hdr->error = error;
+	}
+	spin_unlock(&hdr->lock);
+}
+
 static inline struct nfs_page *
 nfs_page_alloc(void)
 {
@@ -76,12 +117,8 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 	 * long write-back delay. This will be adjusted in
 	 * update_nfs_request below if the region is not locked. */
 	req->wb_page    = page;
-	atomic_set(&req->wb_complete, 0);
 	req->wb_index	= page->index;
 	page_cache_get(page);
-	BUG_ON(PagePrivate(page));
-	BUG_ON(!PageLocked(page));
-	BUG_ON(page->mapping->host != inode);
 	req->wb_offset  = offset;
 	req->wb_pgbase	= offset;
 	req->wb_bytes   = count;
@@ -104,6 +141,15 @@ void nfs_unlock_request(struct nfs_page *req)
 	clear_bit(PG_BUSY, &req->wb_flags);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&req->wb_flags, PG_BUSY);
+}
+
+/**
+ * nfs_unlock_and_release_request - Unlock request and release the nfs_page
+ * @req:
+ */
+void nfs_unlock_and_release_request(struct nfs_page *req)
+{
+	nfs_unlock_request(req);
 	nfs_release_request(req);
 }
 
@@ -203,6 +249,7 @@ EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
 void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 		     struct inode *inode,
 		     const struct nfs_pageio_ops *pg_ops,
+		     const struct nfs_pgio_completion_ops *compl_ops,
 		     size_t bsize,
 		     int io_flags)
 {
@@ -215,9 +262,11 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_recoalesce = 0;
 	desc->pg_inode = inode;
 	desc->pg_ops = pg_ops;
+	desc->pg_completion_ops = compl_ops;
 	desc->pg_ioflags = io_flags;
 	desc->pg_error = 0;
 	desc->pg_lseg = NULL;
+	desc->pg_dreq = NULL;
 }
 
 /**
@@ -241,12 +290,12 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
 		return false;
 	if (req->wb_context->state != prev->wb_context->state)
 		return false;
-	if (req->wb_index != (prev->wb_index + 1))
-		return false;
 	if (req->wb_pgbase != 0)
 		return false;
 	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
 		return false;
+	if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
+		return false;
 	return pgio->pg_ops->pg_test(pgio, prev, req);
 }
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 38512bcd2e98..b8323aa7b543 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -395,6 +395,9 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 	dprintk("%s:Begin lo %p\n", __func__, lo);
 
 	if (list_empty(&lo->plh_segs)) {
+		/* Reset MDS Threshold I/O counters */
+		NFS_I(lo->plh_inode)->write_io = 0;
+		NFS_I(lo->plh_inode)->read_io = 0;
 		if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
 			put_layout_hdr_locked(lo);
 		return 0;
@@ -455,6 +458,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
 	spin_unlock(&nfsi->vfs_inode.i_lock);
 	pnfs_free_lseg_list(&tmp_list);
 }
+EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
 
 /*
  * Called by the state manger to remove all layouts established under an
@@ -692,6 +696,7 @@ out:
 	dprintk("<-- %s status: %d\n", __func__, status);
 	return status;
 }
+EXPORT_SYMBOL_GPL(_pnfs_return_layout);
 
 bool pnfs_roc(struct inode *ino)
 {
@@ -931,6 +936,81 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
 }
 
 /*
+ * Use mdsthreshold hints set at each OPEN to determine if I/O should go
+ * to the MDS or over pNFS
+ *
+ * The nfs_inode read_io and write_io fields are cumulative counters reset
+ * when there are no layout segments. Note that in pnfs_update_layout iomode
+ * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
+ * WRITE request.
+ *
+ * A return of true means use MDS I/O.
+ *
+ * From rfc 5661:
+ * If a file's size is smaller than the file size threshold, data accesses
+ * SHOULD be sent to the metadata server.  If an I/O request has a length that
+ * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
+ * server.  If both file size and I/O size are provided, the client SHOULD
+ * reach or exceed  both thresholds before sending its read or write
+ * requests to the data server.
+ */
+static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
+				     struct inode *ino, int iomode)
+{
+	struct nfs4_threshold *t = ctx->mdsthreshold;
+	struct nfs_inode *nfsi = NFS_I(ino);
+	loff_t fsize = i_size_read(ino);
+	bool size = false, size_set = false, io = false, io_set = false, ret = false;
+
+	if (t == NULL)
+		return ret;
+
+	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
+		__func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
+
+	switch (iomode) {
+	case IOMODE_READ:
+		if (t->bm & THRESHOLD_RD) {
+			dprintk("%s fsize %llu\n", __func__, fsize);
+			size_set = true;
+			if (fsize < t->rd_sz)
+				size = true;
+		}
+		if (t->bm & THRESHOLD_RD_IO) {
+			dprintk("%s nfsi->read_io %llu\n", __func__,
+				nfsi->read_io);
+			io_set = true;
+			if (nfsi->read_io < t->rd_io_sz)
+				io = true;
+		}
+		break;
+	case IOMODE_RW:
+		if (t->bm & THRESHOLD_WR) {
+			dprintk("%s fsize %llu\n", __func__, fsize);
+			size_set = true;
+			if (fsize < t->wr_sz)
+				size = true;
+		}
+		if (t->bm & THRESHOLD_WR_IO) {
+			dprintk("%s nfsi->write_io %llu\n", __func__,
+				nfsi->write_io);
+			io_set = true;
+			if (nfsi->write_io < t->wr_io_sz)
+				io = true;
+		}
+		break;
+	}
+	if (size_set && io_set) {
+		if (size && io)
+			ret = true;
+	} else if (size || io)
+		ret = true;
+
+	dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
+	return ret;
+}
+
+/*
  * Layout segment is retreived from the server if not cached.
  * The appropriate layout segment is referenced and returned to the caller.
  */
@@ -957,6 +1037,10 @@ pnfs_update_layout(struct inode *ino,
 
 	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
 		return NULL;
+
+	if (pnfs_within_mdsthreshold(ctx, ino, iomode))
+		return NULL;
+
 	spin_lock(&ino->i_lock);
 	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
 	if (lo == NULL) {
@@ -1082,6 +1166,10 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
 {
 	BUG_ON(pgio->pg_lseg != NULL);
 
+	if (req->wb_offset != req->wb_pgbase) {
+		nfs_pageio_reset_read_mds(pgio);
+		return;
+	}
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   req_offset(req),
@@ -1100,6 +1188,10 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *
 {
 	BUG_ON(pgio->pg_lseg != NULL);
 
+	if (req->wb_offset != req->wb_pgbase) {
+		nfs_pageio_reset_write_mds(pgio);
+		return;
+	}
 	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   req_offset(req),
@@ -1113,26 +1205,31 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
 
 bool
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode,
+		      const struct nfs_pgio_completion_ops *compl_ops)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
 
 	if (ld == NULL)
 		return false;
-	nfs_pageio_init(pgio, inode, ld->pg_read_ops, server->rsize, 0);
+	nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops,
+			server->rsize, 0);
 	return true;
 }
 
 bool
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags)
+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
+		       int ioflags,
+		       const struct nfs_pgio_completion_ops *compl_ops)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
 
 	if (ld == NULL)
 		return false;
-	nfs_pageio_init(pgio, inode, ld->pg_write_ops, server->wsize, ioflags);
+	nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops,
+			server->wsize, ioflags);
 	return true;
 }
 
@@ -1162,13 +1259,15 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
 
-static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head)
+int pnfs_write_done_resend_to_mds(struct inode *inode,
+				struct list_head *head,
+				const struct nfs_pgio_completion_ops *compl_ops)
 {
 	struct nfs_pageio_descriptor pgio;
 	LIST_HEAD(failed);
 
 	/* Resend all requests through the MDS */
-	nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE);
+	nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE, compl_ops);
 	while (!list_empty(head)) {
 		struct nfs_page *req = nfs_list_entry(head->next);
 
@@ -1188,30 +1287,37 @@ static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
+
+static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
+{
+	struct nfs_pgio_header *hdr = data->header;
+
+	dprintk("pnfs write error = %d\n", hdr->pnfs_error);
+	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
+	    PNFS_LAYOUTRET_ON_ERROR) {
+		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
+		pnfs_return_layout(hdr->inode);
+	}
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+		data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
+							&hdr->pages,
+							hdr->completion_ops);
+}
 
 /*
  * Called by non rpc-based layout drivers
  */
 void pnfs_ld_write_done(struct nfs_write_data *data)
 {
-	if (likely(!data->pnfs_error)) {
+	struct nfs_pgio_header *hdr = data->header;
+
+	if (!hdr->pnfs_error) {
 		pnfs_set_layoutcommit(data);
-		data->mds_ops->rpc_call_done(&data->task, data);
-	} else {
-		dprintk("pnfs write error = %d\n", data->pnfs_error);
-		if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
-						PNFS_LAYOUTRET_ON_ERROR) {
-			/* Don't lo_commit on error, Server will needs to
-			 * preform a file recovery.
-			 */
-			clear_bit(NFS_INO_LAYOUTCOMMIT,
-				  &NFS_I(data->inode)->flags);
-			pnfs_return_layout(data->inode);
-		}
-		data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);
-	}
-	put_lseg(data->lseg);
-	data->mds_ops->rpc_release(data);
+		hdr->mds_ops->rpc_call_done(&data->task, data);
+	} else
+		pnfs_ld_handle_write_error(data);
+	hdr->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
 
@@ -1219,12 +1325,13 @@ static void
 pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
 		struct nfs_write_data *data)
 {
-	list_splice_tail_init(&data->pages, &desc->pg_list);
-	if (data->req && list_empty(&data->req->wb_list))
-		nfs_list_add_request(data->req, &desc->pg_list);
-	nfs_pageio_reset_write_mds(desc);
-	desc->pg_recoalesce = 1;
-	put_lseg(data->lseg);
+	struct nfs_pgio_header *hdr = data->header;
+
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		list_splice_tail_init(&hdr->pages, &desc->pg_list);
+		nfs_pageio_reset_write_mds(desc);
+		desc->pg_recoalesce = 1;
+	}
 	nfs_writedata_release(data);
 }
 
@@ -1234,23 +1341,18 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
 			struct pnfs_layout_segment *lseg,
 			int how)
 {
-	struct inode *inode = wdata->inode;
+	struct nfs_pgio_header *hdr = wdata->header;
+	struct inode *inode = hdr->inode;
 	enum pnfs_try_status trypnfs;
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
-	wdata->mds_ops = call_ops;
-	wdata->lseg = get_lseg(lseg);
+	hdr->mds_ops = call_ops;
 
 	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
 		inode->i_ino, wdata->args.count, wdata->args.offset, how);
-
 	trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
-	if (trypnfs == PNFS_NOT_ATTEMPTED) {
-		put_lseg(wdata->lseg);
-		wdata->lseg = NULL;
-	} else
+	if (trypnfs != PNFS_NOT_ATTEMPTED)
 		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
-
 	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
 	return trypnfs;
 }
@@ -1266,7 +1368,7 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he
 	while (!list_empty(head)) {
 		enum pnfs_try_status trypnfs;
 
-		data = list_entry(head->next, struct nfs_write_data, list);
+		data = list_first_entry(head, struct nfs_write_data, list);
 		list_del_init(&data->list);
 
 		trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
@@ -1276,43 +1378,82 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he
 	put_lseg(lseg);
 }
 
+static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
+{
+	put_lseg(hdr->lseg);
+	nfs_writehdr_free(hdr);
+}
+
 int
 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 {
-	LIST_HEAD(head);
+	struct nfs_write_header *whdr;
+	struct nfs_pgio_header *hdr;
 	int ret;
 
-	ret = nfs_generic_flush(desc, &head);
-	if (ret != 0) {
+	whdr = nfs_writehdr_alloc();
+	if (!whdr) {
+		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 		put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
-		return ret;
+		return -ENOMEM;
 	}
-	pnfs_do_multiple_writes(desc, &head, desc->pg_ioflags);
-	return 0;
+	hdr = &whdr->header;
+	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
+	hdr->lseg = get_lseg(desc->pg_lseg);
+	atomic_inc(&hdr->refcnt);
+	ret = nfs_generic_flush(desc, hdr);
+	if (ret != 0) {
+		put_lseg(desc->pg_lseg);
+		desc->pg_lseg = NULL;
+	} else
+		pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
+	if (atomic_dec_and_test(&hdr->refcnt))
+		hdr->completion_ops->completion(hdr);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
 
-static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
+int pnfs_read_done_resend_to_mds(struct inode *inode,
+				struct list_head *head,
+				const struct nfs_pgio_completion_ops *compl_ops)
 {
 	struct nfs_pageio_descriptor pgio;
+	LIST_HEAD(failed);
 
-	put_lseg(data->lseg);
-	data->lseg = NULL;
-	dprintk("pnfs write error = %d\n", data->pnfs_error);
-	if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
-						PNFS_LAYOUTRET_ON_ERROR)
-		pnfs_return_layout(data->inode);
-
-	nfs_pageio_init_read_mds(&pgio, data->inode);
-
-	while (!list_empty(&data->pages)) {
-		struct nfs_page *req = nfs_list_entry(data->pages.next);
+	/* Resend all requests through the MDS */
+	nfs_pageio_init_read_mds(&pgio, inode, compl_ops);
+	while (!list_empty(head)) {
+		struct nfs_page *req = nfs_list_entry(head->next);
 
 		nfs_list_remove_request(req);
-		nfs_pageio_add_request(&pgio, req);
+		if (!nfs_pageio_add_request(&pgio, req))
+			nfs_list_add_request(req, &failed);
 	}
 	nfs_pageio_complete(&pgio);
+
+	if (!list_empty(&failed)) {
+		list_move(&failed, head);
+		return -EIO;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
+
+static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
+{
+	struct nfs_pgio_header *hdr = data->header;
+
+	dprintk("pnfs read error = %d\n", hdr->pnfs_error);
+	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
+	    PNFS_LAYOUTRET_ON_ERROR) {
+		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
+		pnfs_return_layout(hdr->inode);
+	}
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
+		data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
+							&hdr->pages,
+							hdr->completion_ops);
 }
 
 /*
@@ -1320,13 +1461,14 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
  */
 void pnfs_ld_read_done(struct nfs_read_data *data)
 {
-	if (likely(!data->pnfs_error)) {
+	struct nfs_pgio_header *hdr = data->header;
+
+	if (likely(!hdr->pnfs_error)) {
 		__nfs4_read_done_cb(data);
-		data->mds_ops->rpc_call_done(&data->task, data);
+		hdr->mds_ops->rpc_call_done(&data->task, data);
 	} else
 		pnfs_ld_handle_read_error(data);
-	put_lseg(data->lseg);
-	data->mds_ops->rpc_release(data);
+	hdr->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
 
@@ -1334,11 +1476,13 @@ static void
 pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
 		struct nfs_read_data *data)
 {
-	list_splice_tail_init(&data->pages, &desc->pg_list);
-	if (data->req && list_empty(&data->req->wb_list))
-		nfs_list_add_request(data->req, &desc->pg_list);
-	nfs_pageio_reset_read_mds(desc);
-	desc->pg_recoalesce = 1;
+	struct nfs_pgio_header *hdr = data->header;
+
+	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+		list_splice_tail_init(&hdr->pages, &desc->pg_list);
+		nfs_pageio_reset_read_mds(desc);
+		desc->pg_recoalesce = 1;
+	}
 	nfs_readdata_release(data);
 }
 
@@ -1350,23 +1494,19 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
 		       const struct rpc_call_ops *call_ops,
 		       struct pnfs_layout_segment *lseg)
 {
-	struct inode *inode = rdata->inode;
+	struct nfs_pgio_header *hdr = rdata->header;
+	struct inode *inode = hdr->inode;
 	struct nfs_server *nfss = NFS_SERVER(inode);
 	enum pnfs_try_status trypnfs;
 
-	rdata->mds_ops = call_ops;
-	rdata->lseg = get_lseg(lseg);
+	hdr->mds_ops = call_ops;
 
 	dprintk("%s: Reading ino:%lu %u@%llu\n",
 		__func__, inode->i_ino, rdata->args.count, rdata->args.offset);
 
 	trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
-	if (trypnfs == PNFS_NOT_ATTEMPTED) {
-		put_lseg(rdata->lseg);
-		rdata->lseg = NULL;
-	} else {
+	if (trypnfs != PNFS_NOT_ATTEMPTED)
 		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
-	}
 	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
 	return trypnfs;
 }
@@ -1382,7 +1522,7 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea
 	while (!list_empty(head)) {
 		enum pnfs_try_status trypnfs;
 
-		data = list_entry(head->next, struct nfs_read_data, list);
+		data = list_first_entry(head, struct nfs_read_data, list);
 		list_del_init(&data->list);
 
 		trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
@@ -1392,20 +1532,40 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea
 	put_lseg(lseg);
 }
 
+static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
+{
+	put_lseg(hdr->lseg);
+	nfs_readhdr_free(hdr);
+}
+
 int
 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 {
-	LIST_HEAD(head);
+	struct nfs_read_header *rhdr;
+	struct nfs_pgio_header *hdr;
 	int ret;
 
-	ret = nfs_generic_pagein(desc, &head);
-	if (ret != 0) {
+	rhdr = nfs_readhdr_alloc();
+	if (!rhdr) {
+		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
+		ret = -ENOMEM;
 		put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 		return ret;
 	}
-	pnfs_do_multiple_reads(desc, &head);
-	return 0;
+	hdr = &rhdr->header;
+	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
+	hdr->lseg = get_lseg(desc->pg_lseg);
+	atomic_inc(&hdr->refcnt);
+	ret = nfs_generic_pagein(desc, hdr);
+	if (ret != 0) {
+		put_lseg(desc->pg_lseg);
+		desc->pg_lseg = NULL;
+	} else
+		pnfs_do_multiple_reads(desc, &hdr->rpc_list);
+	if (atomic_dec_and_test(&hdr->refcnt))
+		hdr->completion_ops->completion(hdr);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
 
@@ -1438,30 +1598,32 @@ EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
 void
 pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 {
-	struct nfs_inode *nfsi = NFS_I(wdata->inode);
+	struct nfs_pgio_header *hdr = wdata->header;
+	struct inode *inode = hdr->inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t end_pos = wdata->mds_offset + wdata->res.count;
 	bool mark_as_dirty = false;
 
-	spin_lock(&nfsi->vfs_inode.i_lock);
+	spin_lock(&inode->i_lock);
 	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
 		mark_as_dirty = true;
 		dprintk("%s: Set layoutcommit for inode %lu ",
-			__func__, wdata->inode->i_ino);
+			__func__, inode->i_ino);
 	}
-	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) {
+	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
 		/* references matched in nfs4_layoutcommit_release */
-		get_lseg(wdata->lseg);
+		get_lseg(hdr->lseg);
 	}
 	if (end_pos > nfsi->layout->plh_lwb)
 		nfsi->layout->plh_lwb = end_pos;
-	spin_unlock(&nfsi->vfs_inode.i_lock);
+	spin_unlock(&inode->i_lock);
 	dprintk("%s: lseg %p end_pos %llu\n",
-		__func__, wdata->lseg, nfsi->layout->plh_lwb);
+		__func__, hdr->lseg, nfsi->layout->plh_lwb);
 
 	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
 	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
 	if (mark_as_dirty)
-		mark_inode_dirty_sync(wdata->inode);
+		mark_inode_dirty_sync(inode);
 }
 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
 
@@ -1550,3 +1712,15 @@ out_free:
 	kfree(data);
 	goto out;
 }
+
+struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
+{
+	struct nfs4_threshold *thp;
+
+	thp = kzalloc(sizeof(*thp), GFP_NOFS);
+	if (!thp) {
+		dprintk("%s mdsthreshold allocation failed\n", __func__);
+		return NULL;
+	}
+	return thp;
+}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 442ebf68eeec..29fd23c0efdc 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -63,6 +63,7 @@ enum {
 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
 	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */
 	NFS_LAYOUT_DESTROYED,		/* no new use of layout allowed */
+	NFS_LAYOUT_INVALID,		/* layout is being destroyed */
 };
 
 enum layoutdriver_policy_flags {
@@ -94,11 +95,20 @@ struct pnfs_layoutdriver_type {
 	const struct nfs_pageio_ops *pg_read_ops;
 	const struct nfs_pageio_ops *pg_write_ops;
 
+	struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
 	void (*mark_request_commit) (struct nfs_page *req,
-					struct pnfs_layout_segment *lseg);
-	void (*clear_request_commit) (struct nfs_page *req);
-	int (*scan_commit_lists) (struct inode *inode, int max, spinlock_t *lock);
-	int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how);
+				     struct pnfs_layout_segment *lseg,
+				     struct nfs_commit_info *cinfo);
+	void (*clear_request_commit) (struct nfs_page *req,
+				      struct nfs_commit_info *cinfo);
+	int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
+				  int max);
+	void (*recover_commit_reqs) (struct list_head *list,
+				     struct nfs_commit_info *cinfo);
+	int (*commit_pagelist)(struct inode *inode,
+			       struct list_head *mds_pages,
+			       int how,
+			       struct nfs_commit_info *cinfo);
 
 	/*
 	 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
@@ -168,8 +178,10 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 void get_layout_hdr(struct pnfs_layout_hdr *lo);
 void put_lseg(struct pnfs_layout_segment *lseg);
 
-bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
-bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int);
+bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
+			   const struct nfs_pgio_completion_ops *);
+bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *,
+			    int, const struct nfs_pgio_completion_ops *);
 
 void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
 void unset_pnfs_layoutdriver(struct nfs_server *);
@@ -211,6 +223,11 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
 					       gfp_t gfp_flags);
 
 void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
+int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
+			const struct nfs_pgio_completion_ops *compl_ops);
+int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
+			const struct nfs_pgio_completion_ops *compl_ops);
+struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 
 /* nfs4_deviceid_flags */
 enum {
@@ -261,49 +278,66 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss)
 }
 
 static inline int
-pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
+pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
+		 struct nfs_commit_info *cinfo)
 {
-	if (!test_and_clear_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags))
+	if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0)
 		return PNFS_NOT_ATTEMPTED;
-	return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how);
+	return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo);
+}
+
+static inline struct pnfs_ds_commit_info *
+pnfs_get_ds_info(struct inode *inode)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (ld == NULL || ld->get_ds_info == NULL)
+		return NULL;
+	return ld->get_ds_info(inode);
 }
 
 static inline bool
-pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			 struct nfs_commit_info *cinfo)
 {
 	struct inode *inode = req->wb_context->dentry->d_inode;
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 
 	if (lseg == NULL || ld->mark_request_commit == NULL)
 		return false;
-	ld->mark_request_commit(req, lseg);
+	ld->mark_request_commit(req, lseg, cinfo);
 	return true;
 }
 
 static inline bool
-pnfs_clear_request_commit(struct nfs_page *req)
+pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
 	struct inode *inode = req->wb_context->dentry->d_inode;
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 
 	if (ld == NULL || ld->clear_request_commit == NULL)
 		return false;
-	ld->clear_request_commit(req);
+	ld->clear_request_commit(req, cinfo);
 	return true;
 }
 
 static inline int
-pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock)
+pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
+		       int max)
 {
-	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
-	int ret;
-
-	if (ld == NULL || ld->scan_commit_lists == NULL)
+	if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
 		return 0;
-	ret = ld->scan_commit_lists(inode, max, lock);
-	if (ret != 0)
-		set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags);
-	return ret;
+	else
+		return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
+}
+
+static inline void
+pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
+			 struct nfs_commit_info *cinfo)
+{
+	if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
+		return;
+	NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
 }
 
 /* Should the pNFS client commit and return the layout upon a setattr */
@@ -327,6 +361,14 @@ static inline int pnfs_return_layout(struct inode *ino)
 	return 0;
 }
 
+static inline bool
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
+		   struct nfs_server *nfss)
+{
+	return (dst && src && src->bm != 0 &&
+					nfss->pnfs_curr_ld->id == src->l_type);
+}
+
 #ifdef NFS_DEBUG
 void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
 #else
@@ -396,45 +438,74 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
 {
 }
 
-static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
+static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode,
+					 const struct nfs_pgio_completion_ops *compl_ops)
 {
 	return false;
 }
 
-static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags)
+static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags,
+					  const struct nfs_pgio_completion_ops *compl_ops)
 {
 	return false;
 }
 
 static inline int
-pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
+pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
+		 struct nfs_commit_info *cinfo)
 {
 	return PNFS_NOT_ATTEMPTED;
 }
 
+static inline struct pnfs_ds_commit_info *
+pnfs_get_ds_info(struct inode *inode)
+{
+	return NULL;
+}
+
 static inline bool
-pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			 struct nfs_commit_info *cinfo)
 {
 	return false;
 }
 
 static inline bool
-pnfs_clear_request_commit(struct nfs_page *req)
+pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
 	return false;
 }
 
 static inline int
-pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock)
+pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
+		       int max)
 {
 	return 0;
 }
 
+static inline void
+pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
+			 struct nfs_commit_info *cinfo)
+{
+}
+
 static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
 	return 0;
 }
 
+static inline bool
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
+		   struct nfs_server *nfss)
+{
+	return false;
+}
+
+static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
+{
+	return NULL;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index d6408b6437de..a706b6bcc286 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -178,7 +178,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 }
 
 static int
-nfs_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
+nfs_proc_lookup(struct inode *dir, struct qstr *name,
 		struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	struct nfs_diropargs	arg = {
@@ -640,12 +640,14 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
+	struct inode *inode = data->header->inode;
+
 	if (nfs_async_handle_expired_key(task))
 		return -EAGAIN;
 
-	nfs_invalidate_atime(data->inode);
+	nfs_invalidate_atime(inode);
 	if (task->tk_status >= 0) {
-		nfs_refresh_inode(data->inode, data->res.fattr);
+		nfs_refresh_inode(inode, data->res.fattr);
 		/* Emulate the eof flag, which isn't normally needed in NFSv2
 		 * as it is guaranteed to always return the file attributes
 		 */
@@ -667,11 +669,13 @@ static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_dat
 
 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
+	struct inode *inode = data->header->inode;
+
 	if (nfs_async_handle_expired_key(task))
 		return -EAGAIN;
 
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
 	return 0;
 }
 
@@ -687,8 +691,13 @@ static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_d
 	rpc_call_start(task);
 }
 
+static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
+{
+	BUG();
+}
+
 static void
-nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
+nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)
 {
 	BUG();
 }
@@ -732,6 +741,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.file_inode_ops	= &nfs_file_inode_operations,
 	.file_ops	= &nfs_file_operations,
 	.getroot	= nfs_proc_get_root,
+	.submount	= nfs_submount,
 	.getattr	= nfs_proc_getattr,
 	.setattr	= nfs_proc_setattr,
 	.lookup		= nfs_proc_lookup,
@@ -763,6 +773,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.write_rpc_prepare = nfs_proc_write_rpc_prepare,
 	.write_done	= nfs_write_done,
 	.commit_setup	= nfs_proc_commit_setup,
+	.commit_rpc_prepare = nfs_proc_commit_rpc_prepare,
 	.lock		= nfs_proc_lock,
 	.lock_check_bounds = nfs_lock_check_bounds,
 	.close_context	= nfs_close_context,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 0a4be28c2ea3..86ced7836214 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -30,43 +30,73 @@
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
 static const struct nfs_pageio_ops nfs_pageio_read_ops;
-static const struct rpc_call_ops nfs_read_partial_ops;
-static const struct rpc_call_ops nfs_read_full_ops;
+static const struct rpc_call_ops nfs_read_common_ops;
+static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
 
-struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
+struct nfs_read_header *nfs_readhdr_alloc(void)
 {
-	struct nfs_read_data *p;
-
-	p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
-	if (p) {
-		INIT_LIST_HEAD(&p->pages);
-		p->npages = pagecount;
-		if (pagecount <= ARRAY_SIZE(p->page_array))
-			p->pagevec = p->page_array;
-		else {
-			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
-			if (!p->pagevec) {
-				kmem_cache_free(nfs_rdata_cachep, p);
-				p = NULL;
-			}
-		}
+	struct nfs_read_header *rhdr;
+
+	rhdr = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
+	if (rhdr) {
+		struct nfs_pgio_header *hdr = &rhdr->header;
+
+		INIT_LIST_HEAD(&hdr->pages);
+		INIT_LIST_HEAD(&hdr->rpc_list);
+		spin_lock_init(&hdr->lock);
+		atomic_set(&hdr->refcnt, 0);
+	}
+	return rhdr;
+}
+
+static struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr,
+						unsigned int pagecount)
+{
+	struct nfs_read_data *data, *prealloc;
+
+	prealloc = &container_of(hdr, struct nfs_read_header, header)->rpc_data;
+	if (prealloc->header == NULL)
+		data = prealloc;
+	else
+		data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto out;
+
+	if (nfs_pgarray_set(&data->pages, pagecount)) {
+		data->header = hdr;
+		atomic_inc(&hdr->refcnt);
+	} else {
+		if (data != prealloc)
+			kfree(data);
+		data = NULL;
 	}
-	return p;
+out:
+	return data;
 }
 
-void nfs_readdata_free(struct nfs_read_data *p)
+void nfs_readhdr_free(struct nfs_pgio_header *hdr)
 {
-	if (p && (p->pagevec != &p->page_array[0]))
-		kfree(p->pagevec);
-	kmem_cache_free(nfs_rdata_cachep, p);
+	struct nfs_read_header *rhdr = container_of(hdr, struct nfs_read_header, header);
+
+	kmem_cache_free(nfs_rdata_cachep, rhdr);
 }
 
 void nfs_readdata_release(struct nfs_read_data *rdata)
 {
+	struct nfs_pgio_header *hdr = rdata->header;
+	struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header);
+
 	put_nfs_open_context(rdata->args.context);
-	nfs_readdata_free(rdata);
+	if (rdata->pages.pagevec != rdata->pages.page_array)
+		kfree(rdata->pages.pagevec);
+	if (rdata != &read_header->rpc_data)
+		kfree(rdata);
+	else
+		rdata->header = NULL;
+	if (atomic_dec_and_test(&hdr->refcnt))
+		hdr->completion_ops->completion(hdr);
 }
 
 static
@@ -78,39 +108,11 @@ int nfs_return_empty_page(struct page *page)
 	return 0;
 }
 
-static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
-{
-	unsigned int remainder = data->args.count - data->res.count;
-	unsigned int base = data->args.pgbase + data->res.count;
-	unsigned int pglen;
-	struct page **pages;
-
-	if (data->res.eof == 0 || remainder == 0)
-		return;
-	/*
-	 * Note: "remainder" can never be negative, since we check for
-	 * 	this in the XDR code.
-	 */
-	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
-	base &= ~PAGE_CACHE_MASK;
-	pglen = PAGE_CACHE_SIZE - base;
-	for (;;) {
-		if (remainder <= pglen) {
-			zero_user(*pages, base, remainder);
-			break;
-		}
-		zero_user(*pages, base, pglen);
-		pages++;
-		remainder -= pglen;
-		pglen = PAGE_CACHE_SIZE;
-		base = 0;
-	}
-}
-
 void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
-		struct inode *inode)
+			      struct inode *inode,
+			      const struct nfs_pgio_completion_ops *compl_ops)
 {
-	nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
+	nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, compl_ops,
 			NFS_SERVER(inode)->rsize, 0);
 }
 
@@ -121,11 +123,12 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
 
-static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
-		struct inode *inode)
+void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+			  struct inode *inode,
+			  const struct nfs_pgio_completion_ops *compl_ops)
 {
-	if (!pnfs_pageio_init_read(pgio, inode))
-		nfs_pageio_init_read_mds(pgio, inode);
+	if (!pnfs_pageio_init_read(pgio, inode, compl_ops))
+		nfs_pageio_init_read_mds(pgio, inode, compl_ops);
 }
 
 int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
@@ -146,9 +149,10 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	if (len < PAGE_CACHE_SIZE)
 		zero_user_segment(page, len, PAGE_CACHE_SIZE);
 
-	nfs_pageio_init_read(&pgio, inode);
+	nfs_pageio_init_read(&pgio, inode, &nfs_async_read_completion_ops);
 	nfs_pageio_add_request(&pgio, new);
 	nfs_pageio_complete(&pgio);
+	NFS_I(inode)->read_io += pgio.pg_bytes_written;
 	return 0;
 }
 
@@ -169,16 +173,49 @@ static void nfs_readpage_release(struct nfs_page *req)
 	nfs_release_request(req);
 }
 
-int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
-		      const struct rpc_call_ops *call_ops)
+/* Note io was page aligned */
+static void nfs_read_completion(struct nfs_pgio_header *hdr)
+{
+	unsigned long bytes = 0;
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+		goto out;
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+		struct page *page = req->wb_page;
+
+		if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
+			if (bytes > hdr->good_bytes)
+				zero_user(page, 0, PAGE_SIZE);
+			else if (hdr->good_bytes - bytes < PAGE_SIZE)
+				zero_user_segment(page,
+					hdr->good_bytes & ~PAGE_MASK,
+					PAGE_SIZE);
+		}
+		bytes += req->wb_bytes;
+		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
+			if (bytes <= hdr->good_bytes)
+				SetPageUptodate(page);
+		} else
+			SetPageUptodate(page);
+		nfs_list_remove_request(req);
+		nfs_readpage_release(req);
+	}
+out:
+	hdr->release(hdr);
+}
+
+int nfs_initiate_read(struct rpc_clnt *clnt,
+		      struct nfs_read_data *data,
+		      const struct rpc_call_ops *call_ops, int flags)
 {
-	struct inode *inode = data->inode;
+	struct inode *inode = data->header->inode;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
-		.rpc_cred = data->cred,
+		.rpc_cred = data->header->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
 		.task = &data->task,
@@ -187,7 +224,7 @@ int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC | swap_flags,
+		.flags = RPC_TASK_ASYNC | swap_flags | flags,
 	};
 
 	/* Set up the initial task struct. */
@@ -212,19 +249,15 @@ EXPORT_SYMBOL_GPL(nfs_initiate_read);
 /*
  * Set up the NFS read request struct
  */
-static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+static void nfs_read_rpcsetup(struct nfs_read_data *data,
 		unsigned int count, unsigned int offset)
 {
-	struct inode *inode = req->wb_context->dentry->d_inode;
-
-	data->req	  = req;
-	data->inode	  = inode;
-	data->cred	  = req->wb_context->cred;
+	struct nfs_page *req = data->header->req;
 
-	data->args.fh     = NFS_FH(inode);
+	data->args.fh     = NFS_FH(data->header->inode);
 	data->args.offset = req_offset(req) + offset;
 	data->args.pgbase = req->wb_pgbase + offset;
-	data->args.pages  = data->pagevec;
+	data->args.pages  = data->pages.pagevec;
 	data->args.count  = count;
 	data->args.context = get_nfs_open_context(req->wb_context);
 	data->args.lock_context = req->wb_lock_context;
@@ -238,9 +271,9 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 static int nfs_do_read(struct nfs_read_data *data,
 		const struct rpc_call_ops *call_ops)
 {
-	struct inode *inode = data->args.context->dentry->d_inode;
+	struct inode *inode = data->header->inode;
 
-	return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
+	return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0);
 }
 
 static int
@@ -253,7 +286,7 @@ nfs_do_multiple_reads(struct list_head *head,
 	while (!list_empty(head)) {
 		int ret2;
 
-		data = list_entry(head->next, struct nfs_read_data, list);
+		data = list_first_entry(head, struct nfs_read_data, list);
 		list_del_init(&data->list);
 
 		ret2 = nfs_do_read(data, call_ops);
@@ -275,6 +308,24 @@ nfs_async_read_error(struct list_head *head)
 	}
 }
 
+static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
+	.error_cleanup = nfs_async_read_error,
+	.completion = nfs_read_completion,
+};
+
+static void nfs_pagein_error(struct nfs_pageio_descriptor *desc,
+		struct nfs_pgio_header *hdr)
+{
+	set_bit(NFS_IOHDR_REDO, &hdr->flags);
+	while (!list_empty(&hdr->rpc_list)) {
+		struct nfs_read_data *data = list_first_entry(&hdr->rpc_list,
+				struct nfs_read_data, list);
+		list_del(&data->list);
+		nfs_readdata_release(data);
+	}
+	desc->pg_completion_ops->error_cleanup(&desc->pg_list);
+}
+
 /*
  * Generate multiple requests to fill a single page.
  *
@@ -288,93 +339,95 @@ nfs_async_read_error(struct list_head *head)
  * won't see the new data until our attribute cache is updated.  This is more
  * or less conventional NFS client behavior.
  */
-static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head *res)
+static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc,
+			    struct nfs_pgio_header *hdr)
 {
-	struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
+	struct nfs_page *req = hdr->req;
 	struct page *page = req->wb_page;
 	struct nfs_read_data *data;
 	size_t rsize = desc->pg_bsize, nbytes;
 	unsigned int offset;
-	int requests = 0;
-	int ret = 0;
-
-	nfs_list_remove_request(req);
 
 	offset = 0;
 	nbytes = desc->pg_count;
 	do {
 		size_t len = min(nbytes,rsize);
 
-		data = nfs_readdata_alloc(1);
-		if (!data)
-			goto out_bad;
-		data->pagevec[0] = page;
-		nfs_read_rpcsetup(req, data, len, offset);
-		list_add(&data->list, res);
-		requests++;
+		data = nfs_readdata_alloc(hdr, 1);
+		if (!data) {
+			nfs_pagein_error(desc, hdr);
+			return -ENOMEM;
+		}
+		data->pages.pagevec[0] = page;
+		nfs_read_rpcsetup(data, len, offset);
+		list_add(&data->list, &hdr->rpc_list);
 		nbytes -= len;
 		offset += len;
-	} while(nbytes != 0);
-	atomic_set(&req->wb_complete, requests);
-	desc->pg_rpc_callops = &nfs_read_partial_ops;
-	return ret;
-out_bad:
-	while (!list_empty(res)) {
-		data = list_entry(res->next, struct nfs_read_data, list);
-		list_del(&data->list);
-		nfs_readdata_release(data);
-	}
-	nfs_readpage_release(req);
-	return -ENOMEM;
+	} while (nbytes != 0);
+
+	nfs_list_remove_request(req);
+	nfs_list_add_request(req, &hdr->pages);
+	desc->pg_rpc_callops = &nfs_read_common_ops;
+	return 0;
 }
 
-static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *res)
+static int nfs_pagein_one(struct nfs_pageio_descriptor *desc,
+			  struct nfs_pgio_header *hdr)
 {
 	struct nfs_page		*req;
 	struct page		**pages;
-	struct nfs_read_data	*data;
+	struct nfs_read_data    *data;
 	struct list_head *head = &desc->pg_list;
-	int ret = 0;
 
-	data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
-						     desc->pg_count));
+	data = nfs_readdata_alloc(hdr, nfs_page_array_len(desc->pg_base,
+							  desc->pg_count));
 	if (!data) {
-		nfs_async_read_error(head);
-		ret = -ENOMEM;
-		goto out;
+		nfs_pagein_error(desc, hdr);
+		return -ENOMEM;
 	}
 
-	pages = data->pagevec;
+	pages = data->pages.pagevec;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
-		nfs_list_add_request(req, &data->pages);
+		nfs_list_add_request(req, &hdr->pages);
 		*pages++ = req->wb_page;
 	}
-	req = nfs_list_entry(data->pages.next);
 
-	nfs_read_rpcsetup(req, data, desc->pg_count, 0);
-	list_add(&data->list, res);
-	desc->pg_rpc_callops = &nfs_read_full_ops;
-out:
-	return ret;
+	nfs_read_rpcsetup(data, desc->pg_count, 0);
+	list_add(&data->list, &hdr->rpc_list);
+	desc->pg_rpc_callops = &nfs_read_common_ops;
+	return 0;
 }
 
-int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head)
+int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
+		       struct nfs_pgio_header *hdr)
 {
 	if (desc->pg_bsize < PAGE_CACHE_SIZE)
-		return nfs_pagein_multi(desc, head);
-	return nfs_pagein_one(desc, head);
+		return nfs_pagein_multi(desc, hdr);
+	return nfs_pagein_one(desc, hdr);
 }
 
 static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 {
-	LIST_HEAD(head);
+	struct nfs_read_header *rhdr;
+	struct nfs_pgio_header *hdr;
 	int ret;
 
-	ret = nfs_generic_pagein(desc, &head);
+	rhdr = nfs_readhdr_alloc();
+	if (!rhdr) {
+		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
+		return -ENOMEM;
+	}
+	hdr = &rhdr->header;
+	nfs_pgheader_init(desc, hdr, nfs_readhdr_free);
+	atomic_inc(&hdr->refcnt);
+	ret = nfs_generic_pagein(desc, hdr);
 	if (ret == 0)
-		ret = nfs_do_multiple_reads(&head, desc->pg_rpc_callops);
+		ret = nfs_do_multiple_reads(&hdr->rpc_list,
+					    desc->pg_rpc_callops);
+	if (atomic_dec_and_test(&hdr->refcnt))
+		hdr->completion_ops->completion(hdr);
 	return ret;
 }
 
@@ -389,20 +442,21 @@ static const struct nfs_pageio_ops nfs_pageio_read_ops = {
  */
 int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
 {
+	struct inode *inode = data->header->inode;
 	int status;
 
 	dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid,
 			task->tk_status);
 
-	status = NFS_PROTO(data->inode)->read_done(task, data);
+	status = NFS_PROTO(inode)->read_done(task, data);
 	if (status != 0)
 		return status;
 
-	nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count);
+	nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count);
 
 	if (task->tk_status == -ESTALE) {
-		set_bit(NFS_INO_STALE, &NFS_I(data->inode)->flags);
-		nfs_mark_for_revalidate(data->inode);
+		set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+		nfs_mark_for_revalidate(inode);
 	}
 	return 0;
 }
@@ -412,15 +466,13 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
 	struct nfs_readargs *argp = &data->args;
 	struct nfs_readres *resp = &data->res;
 
-	if (resp->eof || resp->count == argp->count)
-		return;
-
 	/* This is a short read! */
-	nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
+	nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD);
 	/* Has the server at least made some progress? */
-	if (resp->count == 0)
+	if (resp->count == 0) {
+		nfs_set_pgio_error(data->header, -EIO, argp->offset);
 		return;
-
+	}
 	/* Yes, so retry the read at the end of the data */
 	data->mds_offset += resp->count;
 	argp->offset += resp->count;
@@ -429,114 +481,46 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
 	rpc_restart_call_prepare(task);
 }
 
-/*
- * Handle a read reply that fills part of a page.
- */
-static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata)
+static void nfs_readpage_result_common(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
- 
+	struct nfs_pgio_header *hdr = data->header;
+
+	/* Note the only returns of nfs_readpage_result are 0 and -EAGAIN */
 	if (nfs_readpage_result(task, data) != 0)
 		return;
 	if (task->tk_status < 0)
-		return;
-
-	nfs_readpage_truncate_uninitialised_page(data);
-	nfs_readpage_retry(task, data);
+		nfs_set_pgio_error(hdr, task->tk_status, data->args.offset);
+	else if (data->res.eof) {
+		loff_t bound;
+
+		bound = data->args.offset + data->res.count;
+		spin_lock(&hdr->lock);
+		if (bound < hdr->io_start + hdr->good_bytes) {
+			set_bit(NFS_IOHDR_EOF, &hdr->flags);
+			clear_bit(NFS_IOHDR_ERROR, &hdr->flags);
+			hdr->good_bytes = bound - hdr->io_start;
+		}
+		spin_unlock(&hdr->lock);
+	} else if (data->res.count != data->args.count)
+		nfs_readpage_retry(task, data);
 }
 
-static void nfs_readpage_release_partial(void *calldata)
+static void nfs_readpage_release_common(void *calldata)
 {
-	struct nfs_read_data *data = calldata;
-	struct nfs_page *req = data->req;
-	struct page *page = req->wb_page;
-	int status = data->task.tk_status;
-
-	if (status < 0)
-		set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags);
-
-	if (atomic_dec_and_test(&req->wb_complete)) {
-		if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags))
-			SetPageUptodate(page);
-		nfs_readpage_release(req);
-	}
 	nfs_readdata_release(calldata);
 }
 
 void nfs_read_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
-	NFS_PROTO(data->inode)->read_rpc_prepare(task, data);
-}
-
-static const struct rpc_call_ops nfs_read_partial_ops = {
-	.rpc_call_prepare = nfs_read_prepare,
-	.rpc_call_done = nfs_readpage_result_partial,
-	.rpc_release = nfs_readpage_release_partial,
-};
-
-static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data)
-{
-	unsigned int count = data->res.count;
-	unsigned int base = data->args.pgbase;
-	struct page **pages;
-
-	if (data->res.eof)
-		count = data->args.count;
-	if (unlikely(count == 0))
-		return;
-	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
-	base &= ~PAGE_CACHE_MASK;
-	count += base;
-	for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
-		SetPageUptodate(*pages);
-	if (count == 0)
-		return;
-	/* Was this a short read? */
-	if (data->res.eof || data->res.count == data->args.count)
-		SetPageUptodate(*pages);
-}
-
-/*
- * This is the callback from RPC telling us whether a reply was
- * received or some error occurred (timeout or socket shutdown).
- */
-static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
-{
-	struct nfs_read_data *data = calldata;
-
-	if (nfs_readpage_result(task, data) != 0)
-		return;
-	if (task->tk_status < 0)
-		return;
-	/*
-	 * Note: nfs_readpage_retry may change the values of
-	 * data->args. In the multi-page case, we therefore need
-	 * to ensure that we call nfs_readpage_set_pages_uptodate()
-	 * first.
-	 */
-	nfs_readpage_truncate_uninitialised_page(data);
-	nfs_readpage_set_pages_uptodate(data);
-	nfs_readpage_retry(task, data);
-}
-
-static void nfs_readpage_release_full(void *calldata)
-{
-	struct nfs_read_data *data = calldata;
-
-	while (!list_empty(&data->pages)) {
-		struct nfs_page *req = nfs_list_entry(data->pages.next);
-
-		nfs_list_remove_request(req);
-		nfs_readpage_release(req);
-	}
-	nfs_readdata_release(calldata);
+	NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
 }
 
-static const struct rpc_call_ops nfs_read_full_ops = {
+static const struct rpc_call_ops nfs_read_common_ops = {
 	.rpc_call_prepare = nfs_read_prepare,
-	.rpc_call_done = nfs_readpage_result_full,
-	.rpc_release = nfs_readpage_release_full,
+	.rpc_call_done = nfs_readpage_result_common,
+	.rpc_release = nfs_readpage_release_common,
 };
 
 /*
@@ -668,11 +652,12 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	if (ret == 0)
 		goto read_complete; /* all pages were read */
 
-	nfs_pageio_init_read(&pgio, inode);
+	nfs_pageio_init_read(&pgio, inode, &nfs_async_read_completion_ops);
 
 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
 
 	nfs_pageio_complete(&pgio);
+	NFS_I(inode)->read_io += pgio.pg_bytes_written;
 	npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
 read_complete:
@@ -684,7 +669,7 @@ out:
 int __init nfs_init_readpagecache(void)
 {
 	nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
-					     sizeof(struct nfs_read_data),
+					     sizeof(struct nfs_read_header),
 					     0, SLAB_HWCACHE_ALIGN,
 					     NULL);
 	if (nfs_rdata_cachep == NULL)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 4ac7fca7e4bf..ff656c022684 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -66,6 +66,7 @@
 #include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
+#define NFS_TEXT_DATA		1
 
 #ifdef CONFIG_NFS_V3
 #define NFS_DEFAULT_VERSION 3
@@ -277,12 +278,22 @@ static match_table_t nfs_vers_tokens = {
 	{ Opt_vers_err, NULL }
 };
 
+struct nfs_mount_info {
+	void (*fill_super)(struct super_block *, struct nfs_mount_info *);
+	int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *);
+	struct nfs_parsed_mount_data *parsed;
+	struct nfs_clone_mount *cloned;
+	struct nfs_fh *mntfh;
+};
+
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct dentry *);
 static int  nfs_show_devname(struct seq_file *, struct dentry *);
 static int  nfs_show_path(struct seq_file *, struct dentry *);
 static int  nfs_show_stats(struct seq_file *, struct dentry *);
+static struct dentry *nfs_fs_mount_common(struct file_system_type *,
+		struct nfs_server *, int, const char *, struct nfs_mount_info *);
 static struct dentry *nfs_fs_mount(struct file_system_type *,
 		int, const char *, void *);
 static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
@@ -323,12 +334,11 @@ static const struct super_operations nfs_sops = {
 };
 
 #ifdef CONFIG_NFS_V4
-static int nfs4_validate_text_mount_data(void *options,
+static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *);
+static int nfs4_validate_mount_data(void *options,
 	struct nfs_parsed_mount_data *args, const char *dev_name);
 static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
-	struct nfs_parsed_mount_data *data);
-static struct dentry *nfs4_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data);
+	struct nfs_mount_info *mount_info);
 static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data);
 static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
@@ -342,7 +352,7 @@ static void nfs4_kill_super(struct super_block *sb);
 static struct file_system_type nfs4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.mount		= nfs4_mount,
+	.mount		= nfs_fs_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -786,8 +796,8 @@ static void show_pnfs(struct seq_file *m, struct nfs_server *server)
 
 static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
 {
-	if (nfss->nfs_client && nfss->nfs_client->impl_id) {
-		struct nfs41_impl_id *impl_id = nfss->nfs_client->impl_id;
+	if (nfss->nfs_client && nfss->nfs_client->cl_implid) {
+		struct nfs41_impl_id *impl_id = nfss->nfs_client->cl_implid;
 		seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s',"
 			   "date='%llu,%u'",
 			   impl_id->name, impl_id->domain,
@@ -938,7 +948,7 @@ static void nfs_umount_begin(struct super_block *sb)
 		rpc_killall_tasks(rpc);
 }
 
-static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version)
+static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
 {
 	struct nfs_parsed_mount_data *data;
 
@@ -953,8 +963,8 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
 		data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 		data->auth_flavors[0]	= RPC_AUTH_UNIX;
 		data->auth_flavor_len	= 1;
-		data->version		= version;
 		data->minorversion	= 0;
+		data->need_mount	= true;
 		data->net		= current->nsproxy->net_ns;
 		security_init_mnt_opts(&data->lsm_opts);
 	}
@@ -1674,8 +1684,8 @@ static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
  * Use the remote server's MOUNT service to request the NFS file handle
  * corresponding to the provided path.
  */
-static int nfs_try_mount(struct nfs_parsed_mount_data *args,
-			 struct nfs_fh *root_fh)
+static int nfs_request_mount(struct nfs_parsed_mount_data *args,
+			     struct nfs_fh *root_fh)
 {
 	rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
 	unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
@@ -1738,6 +1748,26 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	return nfs_walk_authlist(args, &request);
 }
 
+static struct dentry *nfs_try_mount(int flags, const char *dev_name,
+				    struct nfs_mount_info *mount_info)
+{
+	int status;
+	struct nfs_server *server;
+
+	if (mount_info->parsed->need_mount) {
+		status = nfs_request_mount(mount_info->parsed, mount_info->mntfh);
+		if (status)
+			return ERR_PTR(status);
+	}
+
+	/* Get a volume representation */
+	server = nfs_create_server(mount_info->parsed, mount_info->mntfh);
+	if (IS_ERR(server))
+		return ERR_CAST(server);
+
+	return nfs_fs_mount_common(&nfs_fs_type, server, flags, dev_name, mount_info);
+}
+
 /*
  * Split "dev_name" into "hostname:export_path".
  *
@@ -1826,10 +1856,10 @@ out_path:
  * + breaking back: trying proto=udp after proto=tcp, v2 after v3,
  *   mountproto=tcp after mountproto=udp, and so on
  */
-static int nfs_validate_mount_data(void *options,
-				   struct nfs_parsed_mount_data *args,
-				   struct nfs_fh *mntfh,
-				   const char *dev_name)
+static int nfs23_validate_mount_data(void *options,
+				     struct nfs_parsed_mount_data *args,
+				     struct nfs_fh *mntfh,
+				     const char *dev_name)
 {
 	struct nfs_mount_data *data = (struct nfs_mount_data *)options;
 	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
@@ -1883,6 +1913,7 @@ static int nfs_validate_mount_data(void *options,
 		args->acregmax		= data->acregmax;
 		args->acdirmin		= data->acdirmin;
 		args->acdirmax		= data->acdirmax;
+		args->need_mount	= false;
 
 		memcpy(sap, &data->addr, sizeof(data->addr));
 		args->nfs_server.addrlen = sizeof(data->addr);
@@ -1934,43 +1965,8 @@ static int nfs_validate_mount_data(void *options,
 		}
 
 		break;
-	default: {
-		int status;
-
-		if (nfs_parse_mount_options((char *)options, args) == 0)
-			return -EINVAL;
-
-		if (!nfs_verify_server_address(sap))
-			goto out_no_address;
-
-		if (args->version == 4)
-#ifdef CONFIG_NFS_V4
-			return nfs4_validate_text_mount_data(options,
-							     args, dev_name);
-#else
-			goto out_v4_not_compiled;
-#endif
-
-		nfs_set_port(sap, &args->nfs_server.port, 0);
-
-		nfs_set_mount_transport_protocol(args);
-
-		status = nfs_parse_devname(dev_name,
-					   &args->nfs_server.hostname,
-					   PAGE_SIZE,
-					   &args->nfs_server.export_path,
-					   NFS_MAXPATHLEN);
-		if (!status)
-			status = nfs_try_mount(args, mntfh);
-
-		kfree(args->nfs_server.export_path);
-		args->nfs_server.export_path = NULL;
-
-		if (status)
-			return status;
-
-		break;
-		}
+	default:
+		return NFS_TEXT_DATA;
 	}
 
 #ifndef CONFIG_NFS_V3
@@ -1999,12 +1995,6 @@ out_v3_not_compiled:
 	return -EPROTONOSUPPORT;
 #endif /* !CONFIG_NFS_V3 */
 
-#ifndef CONFIG_NFS_V4
-out_v4_not_compiled:
-	dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
-	return -EPROTONOSUPPORT;
-#endif /* !CONFIG_NFS_V4 */
-
 out_nomem:
 	dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
 	return -ENOMEM;
@@ -2018,6 +2008,82 @@ out_invalid_fh:
 	return -EINVAL;
 }
 
+#ifdef CONFIG_NFS_V4
+static int nfs_validate_mount_data(struct file_system_type *fs_type,
+				   void *options,
+				   struct nfs_parsed_mount_data *args,
+				   struct nfs_fh *mntfh,
+				   const char *dev_name)
+{
+	if (fs_type == &nfs_fs_type)
+		return nfs23_validate_mount_data(options, args, mntfh, dev_name);
+	return nfs4_validate_mount_data(options, args, dev_name);
+}
+#else
+static int nfs_validate_mount_data(struct file_system_type *fs_type,
+				   void *options,
+				   struct nfs_parsed_mount_data *args,
+				   struct nfs_fh *mntfh,
+				   const char *dev_name)
+{
+	return nfs23_validate_mount_data(options, args, mntfh, dev_name);
+}
+#endif
+
+static int nfs_validate_text_mount_data(void *options,
+					struct nfs_parsed_mount_data *args,
+					const char *dev_name)
+{
+	int port = 0;
+	int max_namelen = PAGE_SIZE;
+	int max_pathlen = NFS_MAXPATHLEN;
+	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
+
+	if (nfs_parse_mount_options((char *)options, args) == 0)
+		return -EINVAL;
+
+	if (!nfs_verify_server_address(sap))
+		goto out_no_address;
+
+	if (args->version == 4) {
+#ifdef CONFIG_NFS_V4
+		port = NFS_PORT;
+		max_namelen = NFS4_MAXNAMLEN;
+		max_pathlen = NFS4_MAXPATHLEN;
+		nfs_validate_transport_protocol(args);
+		nfs4_validate_mount_flags(args);
+#else
+		goto out_v4_not_compiled;
+#endif /* CONFIG_NFS_V4 */
+	} else
+		nfs_set_mount_transport_protocol(args);
+
+	nfs_set_port(sap, &args->nfs_server.port, port);
+
+	if (args->auth_flavor_len > 1)
+		goto out_bad_auth;
+
+	return nfs_parse_devname(dev_name,
+				   &args->nfs_server.hostname,
+				   max_namelen,
+				   &args->nfs_server.export_path,
+				   max_pathlen);
+
+#ifndef CONFIG_NFS_V4
+out_v4_not_compiled:
+	dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
+	return -EPROTONOSUPPORT;
+#endif /* !CONFIG_NFS_V4 */
+
+out_no_address:
+	dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n");
+	return -EINVAL;
+
+out_bad_auth:
+	dfprintk(MOUNT, "NFS: Too many RPC auth flavours specified\n");
+	return -EINVAL;
+}
+
 static int
 nfs_compare_remount_data(struct nfs_server *nfss,
 			 struct nfs_parsed_mount_data *data)
@@ -2129,8 +2195,9 @@ static inline void nfs_initialise_sb(struct super_block *sb)
  * Finish setting up an NFS2/3 superblock
  */
 static void nfs_fill_super(struct super_block *sb,
-			   struct nfs_parsed_mount_data *data)
+			   struct nfs_mount_info *mount_info)
 {
+	struct nfs_parsed_mount_data *data = mount_info->parsed;
 	struct nfs_server *server = NFS_SB(sb);
 
 	sb->s_blocksize_bits = 0;
@@ -2154,8 +2221,9 @@ static void nfs_fill_super(struct super_block *sb,
  * Finish setting up a cloned NFS2/3 superblock
  */
 static void nfs_clone_super(struct super_block *sb,
-			    const struct super_block *old_sb)
+			    struct nfs_mount_info *mount_info)
 {
+	const struct super_block *old_sb = mount_info->cloned->sb;
 	struct nfs_server *server = NFS_SB(sb);
 
 	sb->s_blocksize_bits = old_sb->s_blocksize_bits;
@@ -2278,52 +2346,70 @@ static int nfs_compare_super(struct super_block *sb, void *data)
 	return nfs_compare_mount_options(sb, server, mntflags);
 }
 
+#ifdef CONFIG_NFS_FSCACHE
+static void nfs_get_cache_cookie(struct super_block *sb,
+				 struct nfs_parsed_mount_data *parsed,
+				 struct nfs_clone_mount *cloned)
+{
+	char *uniq = NULL;
+	int ulen = 0;
+
+	if (parsed && parsed->fscache_uniq) {
+		uniq = parsed->fscache_uniq;
+		ulen = strlen(parsed->fscache_uniq);
+	} else if (cloned) {
+		struct nfs_server *mnt_s = NFS_SB(cloned->sb);
+		if (mnt_s->fscache_key) {
+			uniq = mnt_s->fscache_key->key.uniquifier;
+			ulen = mnt_s->fscache_key->key.uniq_len;
+		};
+	}
+
+	nfs_fscache_get_super_cookie(sb, uniq, ulen);
+}
+#else
+static void nfs_get_cache_cookie(struct super_block *sb,
+				 struct nfs_parsed_mount_data *parsed,
+				 struct nfs_clone_mount *cloned)
+{
+}
+#endif
+
 static int nfs_bdi_register(struct nfs_server *server)
 {
 	return bdi_register_dev(&server->backing_dev_info, server->s_dev);
 }
 
-static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
+static int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
+			       struct nfs_mount_info *mount_info)
+{
+	return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts);
+}
+
+static int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot,
+				 struct nfs_mount_info *mount_info)
+{
+	/* clone any lsm security options from the parent to the new sb */
+	security_sb_clone_mnt_opts(mount_info->cloned->sb, s);
+	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops)
+		return -ESTALE;
+	return 0;
+}
+
+static struct dentry *nfs_fs_mount_common(struct file_system_type *fs_type,
+					  struct nfs_server *server,
+					  int flags, const char *dev_name,
+					  struct nfs_mount_info *mount_info)
 {
-	struct nfs_server *server = NULL;
 	struct super_block *s;
-	struct nfs_parsed_mount_data *data;
-	struct nfs_fh *mntfh;
 	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	struct nfs_sb_mountdata sb_mntdata = {
 		.mntflags = flags,
+		.server = server,
 	};
 	int error;
 
-	data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
-	mntfh = nfs_alloc_fhandle();
-	if (data == NULL || mntfh == NULL)
-		goto out;
-
-	/* Validate the mount data */
-	error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
-	if (error < 0) {
-		mntroot = ERR_PTR(error);
-		goto out;
-	}
-
-#ifdef CONFIG_NFS_V4
-	if (data->version == 4) {
-		mntroot = nfs4_try_mount(flags, dev_name, data);
-		goto out;
-	}
-#endif	/* CONFIG_NFS_V4 */
-
-	/* Get a volume representation */
-	server = nfs_create_server(data, mntfh);
-	if (IS_ERR(server)) {
-		mntroot = ERR_CAST(server);
-		goto out;
-	}
-	sb_mntdata.server = server;
-
 	if (server->flags & NFS_MOUNT_UNSHARED)
 		compare_super = NULL;
 
@@ -2351,23 +2437,21 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 
 	if (!s->s_root) {
 		/* initial superblock/root creation */
-		nfs_fill_super(s, data);
-		nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL);
+		mount_info->fill_super(s, mount_info);
+		nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
 	}
 
-	mntroot = nfs_get_root(s, mntfh, dev_name);
+	mntroot = nfs_get_root(s, mount_info->mntfh, dev_name);
 	if (IS_ERR(mntroot))
 		goto error_splat_super;
 
-	error = security_sb_set_mnt_opts(s, &data->lsm_opts);
+	error = mount_info->set_security(s, mntroot, mount_info);
 	if (error)
 		goto error_splat_root;
 
 	s->s_flags |= MS_ACTIVE;
 
 out:
-	nfs_free_parsed_mount_data(data);
-	nfs_free_fhandle(mntfh);
 	return mntroot;
 
 out_err_nosb:
@@ -2385,6 +2469,43 @@ error_splat_bdi:
 	goto out;
 }
 
+static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_mount_info mount_info = {
+		.fill_super = nfs_fill_super,
+		.set_security = nfs_set_sb_security,
+	};
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
+	int error;
+
+	mount_info.parsed = nfs_alloc_parsed_mount_data();
+	mount_info.mntfh = nfs_alloc_fhandle();
+	if (mount_info.parsed == NULL || mount_info.mntfh == NULL)
+		goto out;
+
+	/* Validate the mount data */
+	error = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed, mount_info.mntfh, dev_name);
+	if (error == NFS_TEXT_DATA)
+		error = nfs_validate_text_mount_data(raw_data, mount_info.parsed, dev_name);
+	if (error < 0) {
+		mntroot = ERR_PTR(error);
+		goto out;
+	}
+
+#ifdef CONFIG_NFS_V4
+	if (mount_info.parsed->version == 4)
+		mntroot = nfs4_try_mount(flags, dev_name, &mount_info);
+	else
+#endif	/* CONFIG_NFS_V4 */
+		mntroot = nfs_try_mount(flags, dev_name, &mount_info);
+
+out:
+	nfs_free_parsed_mount_data(mount_info.parsed);
+	nfs_free_fhandle(mount_info.mntfh);
+	return mntroot;
+}
+
 /*
  * Ensure that we unregister the bdi before kill_anon_super
  * releases the device name
@@ -2409,93 +2530,51 @@ static void nfs_kill_super(struct super_block *s)
 }
 
 /*
- * Clone an NFS2/3 server record on xdev traversal (FSID-change)
+ * Clone an NFS2/3/4 server record on xdev traversal (FSID-change)
  */
 static struct dentry *
-nfs_xdev_mount(struct file_system_type *fs_type, int flags,
-		const char *dev_name, void *raw_data)
+nfs_xdev_mount_common(struct file_system_type *fs_type, int flags,
+		const char *dev_name, struct nfs_mount_info *mount_info)
 {
-	struct nfs_clone_mount *data = raw_data;
-	struct super_block *s;
+	struct nfs_clone_mount *data = mount_info->cloned;
 	struct nfs_server *server;
-	struct dentry *mntroot;
-	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
-	struct nfs_sb_mountdata sb_mntdata = {
-		.mntflags = flags,
-	};
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 	int error;
 
-	dprintk("--> nfs_xdev_mount()\n");
+	dprintk("--> nfs_xdev_mount_common()\n");
+
+	mount_info->mntfh = data->fh;
 
 	/* create a new volume representation */
 	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
-		goto out_err_noserver;
-	}
-	sb_mntdata.server = server;
-
-	if (server->flags & NFS_MOUNT_UNSHARED)
-		compare_super = NULL;
-
-	/* -o noac implies -o sync */
-	if (server->flags & NFS_MOUNT_NOAC)
-		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
-
-	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
-	if (IS_ERR(s)) {
-		error = PTR_ERR(s);
-		goto out_err_nosb;
-	}
-
-	if (s->s_fs_info != server) {
-		nfs_free_server(server);
-		server = NULL;
-	} else {
-		error = nfs_bdi_register(server);
-		if (error)
-			goto error_splat_bdi;
-	}
-
-	if (!s->s_root) {
-		/* initial superblock/root creation */
-		nfs_clone_super(s, data->sb);
-		nfs_fscache_get_super_cookie(s, NULL, data);
-	}
-
-	mntroot = nfs_get_root(s, data->fh, dev_name);
-	if (IS_ERR(mntroot)) {
-		error = PTR_ERR(mntroot);
-		goto error_splat_super;
-	}
-	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
-		dput(mntroot);
-		error = -ESTALE;
-		goto error_splat_super;
+		goto out_err;
 	}
 
-	s->s_flags |= MS_ACTIVE;
-
-	/* clone any lsm security options from the parent to the new sb */
-	security_sb_clone_mnt_opts(data->sb, s);
-
-	dprintk("<-- nfs_xdev_mount() = 0\n");
+	mntroot = nfs_fs_mount_common(fs_type, server, flags, dev_name, mount_info);
+	dprintk("<-- nfs_xdev_mount_common() = 0\n");
+out:
 	return mntroot;
 
-out_err_nosb:
-	nfs_free_server(server);
-out_err_noserver:
-	dprintk("<-- nfs_xdev_mount() = %d [error]\n", error);
-	return ERR_PTR(error);
+out_err:
+	dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error);
+	goto out;
+}
 
-error_splat_super:
-	if (server && !s->s_root)
-		bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
-	deactivate_locked_super(s);
-	dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error);
-	return ERR_PTR(error);
+/*
+ * Clone an NFS2/3 server record on xdev traversal (FSID-change)
+ */
+static struct dentry *
+nfs_xdev_mount(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *raw_data)
+{
+	struct nfs_mount_info mount_info = {
+		.fill_super = nfs_clone_super,
+		.set_security = nfs_clone_sb_security,
+		.cloned   = raw_data,
+	};
+	return nfs_xdev_mount_common(&nfs_fs_type, flags, dev_name, &mount_info);
 }
 
 #ifdef CONFIG_NFS_V4
@@ -2504,8 +2583,9 @@ error_splat_bdi:
  * Finish setting up a cloned NFS4 superblock
  */
 static void nfs4_clone_super(struct super_block *sb,
-			    const struct super_block *old_sb)
+			     struct nfs_mount_info *mount_info)
 {
+	const struct super_block *old_sb = mount_info->cloned->sb;
 	sb->s_blocksize_bits = old_sb->s_blocksize_bits;
 	sb->s_blocksize = old_sb->s_blocksize;
 	sb->s_maxbytes = old_sb->s_maxbytes;
@@ -2523,7 +2603,8 @@ static void nfs4_clone_super(struct super_block *sb,
 /*
  * Set up an NFS4 superblock
  */
-static void nfs4_fill_super(struct super_block *sb)
+static void nfs4_fill_super(struct super_block *sb,
+			    struct nfs_mount_info *mount_info)
 {
 	sb->s_time_gran = 1;
 	sb->s_op = &nfs4_sops;
@@ -2542,37 +2623,6 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
 			 NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);
 }
 
-static int nfs4_validate_text_mount_data(void *options,
-					 struct nfs_parsed_mount_data *args,
-					 const char *dev_name)
-{
-	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
-
-	nfs_set_port(sap, &args->nfs_server.port, NFS_PORT);
-
-	nfs_validate_transport_protocol(args);
-
-	nfs4_validate_mount_flags(args);
-
-	if (args->version != 4) {
-		dfprintk(MOUNT,
-			 "NFS4: Illegal mount version\n");
-		return -EINVAL;
-	}
-
-	if (args->auth_flavor_len > 1) {
-		dfprintk(MOUNT,
-			 "NFS4: Too many RPC auth flavours specified\n");
-		return -EINVAL;
-	}
-
-	return nfs_parse_devname(dev_name,
-				   &args->nfs_server.hostname,
-				   NFS4_MAXNAMLEN,
-				   &args->nfs_server.export_path,
-				   NFS4_MAXPATHLEN);
-}
-
 /*
  * Validate NFSv4 mount options
  */
@@ -2643,13 +2693,7 @@ static int nfs4_validate_mount_data(void *options,
 
 		break;
 	default:
-		if (nfs_parse_mount_options((char *)options, args) == 0)
-			return -EINVAL;
-
-		if (!nfs_verify_server_address(sap))
-			return -EINVAL;
-
-		return nfs4_validate_text_mount_data(options, args, dev_name);
+		return NFS_TEXT_DATA;
 	}
 
 	return 0;
@@ -2673,91 +2717,26 @@ out_no_address:
  */
 static struct dentry *
 nfs4_remote_mount(struct file_system_type *fs_type, int flags,
-		  const char *dev_name, void *raw_data)
+		  const char *dev_name, void *info)
 {
-	struct nfs_parsed_mount_data *data = raw_data;
-	struct super_block *s;
+	struct nfs_mount_info *mount_info = info;
 	struct nfs_server *server;
-	struct nfs_fh *mntfh;
-	struct dentry *mntroot;
-	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
-	struct nfs_sb_mountdata sb_mntdata = {
-		.mntflags = flags,
-	};
-	int error = -ENOMEM;
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 
-	mntfh = nfs_alloc_fhandle();
-	if (data == NULL || mntfh == NULL)
-		goto out;
+	mount_info->fill_super = nfs4_fill_super;
+	mount_info->set_security = nfs_set_sb_security;
 
 	/* Get a volume representation */
-	server = nfs4_create_server(data, mntfh);
+	server = nfs4_create_server(mount_info->parsed, mount_info->mntfh);
 	if (IS_ERR(server)) {
-		error = PTR_ERR(server);
+		mntroot = ERR_CAST(server);
 		goto out;
 	}
-	sb_mntdata.server = server;
 
-	if (server->flags & NFS4_MOUNT_UNSHARED)
-		compare_super = NULL;
-
-	/* -o noac implies -o sync */
-	if (server->flags & NFS_MOUNT_NOAC)
-		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
-
-	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
-	if (IS_ERR(s)) {
-		error = PTR_ERR(s);
-		goto out_free;
-	}
-
-	if (s->s_fs_info != server) {
-		nfs_free_server(server);
-		server = NULL;
-	} else {
-		error = nfs_bdi_register(server);
-		if (error)
-			goto error_splat_bdi;
-	}
-
-	if (!s->s_root) {
-		/* initial superblock/root creation */
-		nfs4_fill_super(s);
-		nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL);
-	}
-
-	mntroot = nfs4_get_root(s, mntfh, dev_name);
-	if (IS_ERR(mntroot)) {
-		error = PTR_ERR(mntroot);
-		goto error_splat_super;
-	}
-
-	error = security_sb_set_mnt_opts(s, &data->lsm_opts);
-	if (error)
-		goto error_splat_root;
-
-	s->s_flags |= MS_ACTIVE;
-
-	nfs_free_fhandle(mntfh);
-	return mntroot;
+	mntroot = nfs_fs_mount_common(fs_type, server, flags, dev_name, mount_info);
 
 out:
-	nfs_free_fhandle(mntfh);
-	return ERR_PTR(error);
-
-out_free:
-	nfs_free_server(server);
-	goto out;
-
-error_splat_root:
-	dput(mntroot);
-error_splat_super:
-	if (server && !s->s_root)
-		bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
-	deactivate_locked_super(s);
-	goto out;
+	return mntroot;
 }
 
 static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
@@ -2869,17 +2848,18 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
 }
 
 static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
-			 struct nfs_parsed_mount_data *data)
+			 struct nfs_mount_info *mount_info)
 {
 	char *export_path;
 	struct vfsmount *root_mnt;
 	struct dentry *res;
+	struct nfs_parsed_mount_data *data = mount_info->parsed;
 
 	dfprintk(MOUNT, "--> nfs4_try_mount()\n");
 
 	export_path = data->nfs_server.export_path;
 	data->nfs_server.export_path = "/";
-	root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
+	root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info,
 			data->nfs_server.hostname);
 	data->nfs_server.export_path = export_path;
 
@@ -2891,38 +2871,6 @@ static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
 	return res;
 }
 
-/*
- * Get the superblock for an NFS4 mountpoint
- */
-static struct dentry *nfs4_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
-{
-	struct nfs_parsed_mount_data *data;
-	int error = -ENOMEM;
-	struct dentry *res = ERR_PTR(-ENOMEM);
-
-	data = nfs_alloc_parsed_mount_data(4);
-	if (data == NULL)
-		goto out;
-
-	/* Validate the mount data */
-	error = nfs4_validate_mount_data(raw_data, data, dev_name);
-	if (error < 0) {
-		res = ERR_PTR(error);
-		goto out;
-	}
-
-	res = nfs4_try_mount(flags, dev_name, data);
-	if (IS_ERR(res))
-		error = PTR_ERR(res);
-
-out:
-	nfs_free_parsed_mount_data(data);
-	dprintk("<-- nfs4_mount() = %d%s\n", error,
-			error != 0 ? " [error]" : "");
-	return res;
-}
-
 static void nfs4_kill_super(struct super_block *sb)
 {
 	struct nfs_server *server = NFS_SB(sb);
@@ -2942,181 +2890,43 @@ static struct dentry *
 nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
 		 const char *dev_name, void *raw_data)
 {
-	struct nfs_clone_mount *data = raw_data;
-	struct super_block *s;
-	struct nfs_server *server;
-	struct dentry *mntroot;
-	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
-	struct nfs_sb_mountdata sb_mntdata = {
-		.mntflags = flags,
+	struct nfs_mount_info mount_info = {
+		.fill_super = nfs4_clone_super,
+		.set_security = nfs_clone_sb_security,
+		.cloned = raw_data,
 	};
-	int error;
-
-	dprintk("--> nfs4_xdev_mount()\n");
-
-	/* create a new volume representation */
-	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor);
-	if (IS_ERR(server)) {
-		error = PTR_ERR(server);
-		goto out_err_noserver;
-	}
-	sb_mntdata.server = server;
-
-	if (server->flags & NFS4_MOUNT_UNSHARED)
-		compare_super = NULL;
-
-	/* -o noac implies -o sync */
-	if (server->flags & NFS_MOUNT_NOAC)
-		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
-
-	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
-	if (IS_ERR(s)) {
-		error = PTR_ERR(s);
-		goto out_err_nosb;
-	}
-
-	if (s->s_fs_info != server) {
-		nfs_free_server(server);
-		server = NULL;
-	} else {
-		error = nfs_bdi_register(server);
-		if (error)
-			goto error_splat_bdi;
-	}
-
-	if (!s->s_root) {
-		/* initial superblock/root creation */
-		nfs4_clone_super(s, data->sb);
-		nfs_fscache_get_super_cookie(s, NULL, data);
-	}
-
-	mntroot = nfs4_get_root(s, data->fh, dev_name);
-	if (IS_ERR(mntroot)) {
-		error = PTR_ERR(mntroot);
-		goto error_splat_super;
-	}
-	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
-		dput(mntroot);
-		error = -ESTALE;
-		goto error_splat_super;
-	}
-
-	s->s_flags |= MS_ACTIVE;
-
-	security_sb_clone_mnt_opts(data->sb, s);
-
-	dprintk("<-- nfs4_xdev_mount() = 0\n");
-	return mntroot;
-
-out_err_nosb:
-	nfs_free_server(server);
-out_err_noserver:
-	dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error);
-	return ERR_PTR(error);
-
-error_splat_super:
-	if (server && !s->s_root)
-		bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
-	deactivate_locked_super(s);
-	dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error);
-	return ERR_PTR(error);
+	return nfs_xdev_mount_common(&nfs4_fs_type, flags, dev_name, &mount_info);
 }
 
 static struct dentry *
 nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
 			   const char *dev_name, void *raw_data)
 {
-	struct nfs_clone_mount *data = raw_data;
-	struct super_block *s;
-	struct nfs_server *server;
-	struct dentry *mntroot;
-	struct nfs_fh *mntfh;
-	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
-	struct nfs_sb_mountdata sb_mntdata = {
-		.mntflags = flags,
+	struct nfs_mount_info mount_info = {
+		.fill_super = nfs4_fill_super,
+		.set_security = nfs_clone_sb_security,
+		.cloned = raw_data,
 	};
-	int error = -ENOMEM;
+	struct nfs_server *server;
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 
 	dprintk("--> nfs4_referral_get_sb()\n");
 
-	mntfh = nfs_alloc_fhandle();
-	if (mntfh == NULL)
-		goto out_err_nofh;
+	mount_info.mntfh = nfs_alloc_fhandle();
+	if (mount_info.cloned == NULL || mount_info.mntfh == NULL)
+		goto out;
 
 	/* create a new volume representation */
-	server = nfs4_create_referral_server(data, mntfh);
+	server = nfs4_create_referral_server(mount_info.cloned, mount_info.mntfh);
 	if (IS_ERR(server)) {
-		error = PTR_ERR(server);
-		goto out_err_noserver;
-	}
-	sb_mntdata.server = server;
-
-	if (server->flags & NFS4_MOUNT_UNSHARED)
-		compare_super = NULL;
-
-	/* -o noac implies -o sync */
-	if (server->flags & NFS_MOUNT_NOAC)
-		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
-
-	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
-	if (IS_ERR(s)) {
-		error = PTR_ERR(s);
-		goto out_err_nosb;
-	}
-
-	if (s->s_fs_info != server) {
-		nfs_free_server(server);
-		server = NULL;
-	} else {
-		error = nfs_bdi_register(server);
-		if (error)
-			goto error_splat_bdi;
-	}
-
-	if (!s->s_root) {
-		/* initial superblock/root creation */
-		nfs4_fill_super(s);
-		nfs_fscache_get_super_cookie(s, NULL, data);
-	}
-
-	mntroot = nfs4_get_root(s, mntfh, dev_name);
-	if (IS_ERR(mntroot)) {
-		error = PTR_ERR(mntroot);
-		goto error_splat_super;
-	}
-	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
-		dput(mntroot);
-		error = -ESTALE;
-		goto error_splat_super;
+		mntroot = ERR_CAST(server);
+		goto out;
 	}
 
-	s->s_flags |= MS_ACTIVE;
-
-	security_sb_clone_mnt_opts(data->sb, s);
-
-	nfs_free_fhandle(mntfh);
-	dprintk("<-- nfs4_referral_get_sb() = 0\n");
+	mntroot = nfs_fs_mount_common(&nfs4_fs_type, server, flags, dev_name, &mount_info);
+out:
+	nfs_free_fhandle(mount_info.mntfh);
 	return mntroot;
-
-out_err_nosb:
-	nfs_free_server(server);
-out_err_noserver:
-	nfs_free_fhandle(mntfh);
-out_err_nofh:
-	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
-	return ERR_PTR(error);
-
-error_splat_super:
-	if (server && !s->s_root)
-		bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
-	deactivate_locked_super(s);
-	nfs_free_fhandle(mntfh);
-	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
-	return ERR_PTR(error);
 }
 
 /*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c07462320f6b..e6fe3d69d14c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -39,20 +39,20 @@
 /*
  * Local function declarations
  */
-static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
-				  struct inode *inode, int ioflags);
 static void nfs_redirty_request(struct nfs_page *req);
-static const struct rpc_call_ops nfs_write_partial_ops;
-static const struct rpc_call_ops nfs_write_full_ops;
+static const struct rpc_call_ops nfs_write_common_ops;
 static const struct rpc_call_ops nfs_commit_ops;
+static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
+static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
 
 static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
+static struct kmem_cache *nfs_cdata_cachep;
 static mempool_t *nfs_commit_mempool;
 
-struct nfs_write_data *nfs_commitdata_alloc(void)
+struct nfs_commit_data *nfs_commitdata_alloc(void)
 {
-	struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
+	struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
@@ -62,46 +62,73 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
 }
 EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
 
-void nfs_commit_free(struct nfs_write_data *p)
+void nfs_commit_free(struct nfs_commit_data *p)
 {
-	if (p && (p->pagevec != &p->page_array[0]))
-		kfree(p->pagevec);
 	mempool_free(p, nfs_commit_mempool);
 }
 EXPORT_SYMBOL_GPL(nfs_commit_free);
 
-struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
+struct nfs_write_header *nfs_writehdr_alloc(void)
 {
-	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
+	struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
 
 	if (p) {
+		struct nfs_pgio_header *hdr = &p->header;
+
 		memset(p, 0, sizeof(*p));
-		INIT_LIST_HEAD(&p->pages);
-		p->npages = pagecount;
-		if (pagecount <= ARRAY_SIZE(p->page_array))
-			p->pagevec = p->page_array;
-		else {
-			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
-			if (!p->pagevec) {
-				mempool_free(p, nfs_wdata_mempool);
-				p = NULL;
-			}
-		}
+		INIT_LIST_HEAD(&hdr->pages);
+		INIT_LIST_HEAD(&hdr->rpc_list);
+		spin_lock_init(&hdr->lock);
+		atomic_set(&hdr->refcnt, 0);
 	}
 	return p;
 }
 
-void nfs_writedata_free(struct nfs_write_data *p)
+static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr,
+						  unsigned int pagecount)
+{
+	struct nfs_write_data *data, *prealloc;
+
+	prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data;
+	if (prealloc->header == NULL)
+		data = prealloc;
+	else
+		data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto out;
+
+	if (nfs_pgarray_set(&data->pages, pagecount)) {
+		data->header = hdr;
+		atomic_inc(&hdr->refcnt);
+	} else {
+		if (data != prealloc)
+			kfree(data);
+		data = NULL;
+	}
+out:
+	return data;
+}
+
+void nfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
-	if (p && (p->pagevec != &p->page_array[0]))
-		kfree(p->pagevec);
-	mempool_free(p, nfs_wdata_mempool);
+	struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);
+	mempool_free(whdr, nfs_wdata_mempool);
 }
 
 void nfs_writedata_release(struct nfs_write_data *wdata)
 {
+	struct nfs_pgio_header *hdr = wdata->header;
+	struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header);
+
 	put_nfs_open_context(wdata->args.context);
-	nfs_writedata_free(wdata);
+	if (wdata->pages.pagevec != wdata->pages.page_array)
+		kfree(wdata->pages.pagevec);
+	if (wdata != &write_header->rpc_data)
+		kfree(wdata);
+	else
+		wdata->header = NULL;
+	if (atomic_dec_and_test(&hdr->refcnt))
+		hdr->completion_ops->completion(hdr);
 }
 
 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -203,7 +230,6 @@ static int nfs_set_page_writeback(struct page *page)
 		struct inode *inode = page->mapping->host;
 		struct nfs_server *nfss = NFS_SERVER(inode);
 
-		page_cache_get(page);
 		if (atomic_long_inc_return(&nfss->writeback) >
 				NFS_CONGESTION_ON_THRESH) {
 			set_bdi_congested(&nfss->backing_dev_info,
@@ -219,7 +245,6 @@ static void nfs_end_page_writeback(struct page *page)
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
-	page_cache_release(page);
 	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
 		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
@@ -235,10 +260,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblo
 		req = nfs_page_find_request_locked(page);
 		if (req == NULL)
 			break;
-		if (nfs_lock_request_dontget(req))
+		if (nfs_lock_request(req))
 			break;
 		/* Note: If we hold the page lock, as is the case in nfs_writepage,
-		 *	 then the call to nfs_lock_request_dontget() will always
+		 *	 then the call to nfs_lock_request() will always
 		 *	 succeed provided that someone hasn't already marked the
 		 *	 request as dirty (in which case we don't care).
 		 */
@@ -310,7 +335,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
 	struct nfs_pageio_descriptor pgio;
 	int err;
 
-	nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc));
+	nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc),
+			      &nfs_async_write_completion_ops);
 	err = nfs_do_writepage(page, wbc, &pgio);
 	nfs_pageio_complete(&pgio);
 	if (err < 0)
@@ -353,7 +379,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
-	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
+	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
+			      &nfs_async_write_completion_ops);
 	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
 	nfs_pageio_complete(&pgio);
 
@@ -379,7 +406,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	/* Lock the request! */
-	nfs_lock_request_dontget(req);
+	nfs_lock_request(req);
 
 	spin_lock(&inode->i_lock);
 	if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
@@ -421,65 +448,88 @@ nfs_mark_request_dirty(struct nfs_page *req)
 /**
  * nfs_request_add_commit_list - add request to a commit list
  * @req: pointer to a struct nfs_page
- * @head: commit list head
+ * @dst: commit list head
+ * @cinfo: holds list lock and accounting info
  *
- * This sets the PG_CLEAN bit, updates the inode global count of
+ * This sets the PG_CLEAN bit, updates the cinfo count of
  * number of outstanding requests requiring a commit as well as
  * the MM page stats.
  *
- * The caller must _not_ hold the inode->i_lock, but must be
+ * The caller must _not_ hold the cinfo->lock, but must be
  * holding the nfs_page lock.
  */
 void
-nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head)
+nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
+			    struct nfs_commit_info *cinfo)
 {
-	struct inode *inode = req->wb_context->dentry->d_inode;
-
 	set_bit(PG_CLEAN, &(req)->wb_flags);
-	spin_lock(&inode->i_lock);
-	nfs_list_add_request(req, head);
-	NFS_I(inode)->ncommit++;
-	spin_unlock(&inode->i_lock);
-	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	spin_lock(cinfo->lock);
+	nfs_list_add_request(req, dst);
+	cinfo->mds->ncommit++;
+	spin_unlock(cinfo->lock);
+	if (!cinfo->dreq) {
+		inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		inc_bdi_stat(req->wb_page->mapping->backing_dev_info,
+			     BDI_RECLAIMABLE);
+		__mark_inode_dirty(req->wb_context->dentry->d_inode,
+				   I_DIRTY_DATASYNC);
+	}
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
 
 /**
  * nfs_request_remove_commit_list - Remove request from a commit list
  * @req: pointer to a nfs_page
+ * @cinfo: holds list lock and accounting info
  *
- * This clears the PG_CLEAN bit, and updates the inode global count of
+ * This clears the PG_CLEAN bit, and updates the cinfo's count of
  * number of outstanding requests requiring a commit
  * It does not update the MM page stats.
  *
- * The caller _must_ hold the inode->i_lock and the nfs_page lock.
+ * The caller _must_ hold the cinfo->lock and the nfs_page lock.
  */
 void
-nfs_request_remove_commit_list(struct nfs_page *req)
+nfs_request_remove_commit_list(struct nfs_page *req,
+			       struct nfs_commit_info *cinfo)
 {
-	struct inode *inode = req->wb_context->dentry->d_inode;
-
 	if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags))
 		return;
 	nfs_list_remove_request(req);
-	NFS_I(inode)->ncommit--;
+	cinfo->mds->ncommit--;
 }
 EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
 
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+				      struct inode *inode)
+{
+	cinfo->lock = &inode->i_lock;
+	cinfo->mds = &NFS_I(inode)->commit_info;
+	cinfo->ds = pnfs_get_ds_info(inode);
+	cinfo->dreq = NULL;
+	cinfo->completion_ops = &nfs_commit_completion_ops;
+}
+
+void nfs_init_cinfo(struct nfs_commit_info *cinfo,
+		    struct inode *inode,
+		    struct nfs_direct_req *dreq)
+{
+	if (dreq)
+		nfs_init_cinfo_from_dreq(cinfo, dreq);
+	else
+		nfs_init_cinfo_from_inode(cinfo, inode);
+}
+EXPORT_SYMBOL_GPL(nfs_init_cinfo);
 
 /*
  * Add a request to the inode's commit list.
  */
-static void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+void
+nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			struct nfs_commit_info *cinfo)
 {
-	struct inode *inode = req->wb_context->dentry->d_inode;
-
-	if (pnfs_mark_request_commit(req, lseg))
+	if (pnfs_mark_request_commit(req, lseg, cinfo))
 		return;
-	nfs_request_add_commit_list(req, &NFS_I(inode)->commit_list);
+	nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
 }
 
 static void
@@ -494,11 +544,13 @@ nfs_clear_request_commit(struct nfs_page *req)
 {
 	if (test_bit(PG_CLEAN, &req->wb_flags)) {
 		struct inode *inode = req->wb_context->dentry->d_inode;
+		struct nfs_commit_info cinfo;
 
-		if (!pnfs_clear_request_commit(req)) {
-			spin_lock(&inode->i_lock);
-			nfs_request_remove_commit_list(req);
-			spin_unlock(&inode->i_lock);
+		nfs_init_cinfo_from_inode(&cinfo, inode);
+		if (!pnfs_clear_request_commit(req, &cinfo)) {
+			spin_lock(cinfo.lock);
+			nfs_request_remove_commit_list(req, &cinfo);
+			spin_unlock(cinfo.lock);
 		}
 		nfs_clear_page_commit(req->wb_page);
 	}
@@ -508,28 +560,25 @@ static inline
 int nfs_write_need_commit(struct nfs_write_data *data)
 {
 	if (data->verf.committed == NFS_DATA_SYNC)
-		return data->lseg == NULL;
-	else
-		return data->verf.committed != NFS_FILE_SYNC;
+		return data->header->lseg == NULL;
+	return data->verf.committed != NFS_FILE_SYNC;
 }
 
-static inline
-int nfs_reschedule_unstable_write(struct nfs_page *req,
-				  struct nfs_write_data *data)
+#else
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+				      struct inode *inode)
 {
-	if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
-		nfs_mark_request_commit(req, data->lseg);
-		return 1;
-	}
-	if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) {
-		nfs_mark_request_dirty(req);
-		return 1;
-	}
-	return 0;
 }
-#else
-static void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+
+void nfs_init_cinfo(struct nfs_commit_info *cinfo,
+		    struct inode *inode,
+		    struct nfs_direct_req *dreq)
+{
+}
+
+void
+nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
+			struct nfs_commit_info *cinfo)
 {
 }
 
@@ -544,25 +593,57 @@ int nfs_write_need_commit(struct nfs_write_data *data)
 	return 0;
 }
 
-static inline
-int nfs_reschedule_unstable_write(struct nfs_page *req,
-				  struct nfs_write_data *data)
+#endif
+
+static void nfs_write_completion(struct nfs_pgio_header *hdr)
 {
-	return 0;
+	struct nfs_commit_info cinfo;
+	unsigned long bytes = 0;
+
+	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
+		goto out;
+	nfs_init_cinfo_from_inode(&cinfo, hdr->inode);
+	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+
+		bytes += req->wb_bytes;
+		nfs_list_remove_request(req);
+		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
+		    (hdr->good_bytes < bytes)) {
+			nfs_set_pageerror(req->wb_page);
+			nfs_context_set_write_error(req->wb_context, hdr->error);
+			goto remove_req;
+		}
+		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
+			nfs_mark_request_dirty(req);
+			goto next;
+		}
+		if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
+			goto next;
+		}
+remove_req:
+		nfs_inode_remove_request(req);
+next:
+		nfs_unlock_request(req);
+		nfs_end_page_writeback(req->wb_page);
+		nfs_release_request(req);
+	}
+out:
+	hdr->release(hdr);
 }
-#endif
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-static int
-nfs_need_commit(struct nfs_inode *nfsi)
+static unsigned long
+nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 {
-	return nfsi->ncommit > 0;
+	return cinfo->mds->ncommit;
 }
 
-/* i_lock held by caller */
-static int
-nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max,
-		spinlock_t *lock)
+/* cinfo->lock held by caller */
+int
+nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
+		     struct nfs_commit_info *cinfo, int max)
 {
 	struct nfs_page *req, *tmp;
 	int ret = 0;
@@ -570,12 +651,13 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max,
 	list_for_each_entry_safe(req, tmp, src, wb_list) {
 		if (!nfs_lock_request(req))
 			continue;
-		if (cond_resched_lock(lock))
+		kref_get(&req->wb_kref);
+		if (cond_resched_lock(cinfo->lock))
 			list_safe_reset_next(req, tmp, wb_list);
-		nfs_request_remove_commit_list(req);
+		nfs_request_remove_commit_list(req, cinfo);
 		nfs_list_add_request(req, dst);
 		ret++;
-		if (ret == max)
+		if ((ret == max) && !cinfo->dreq)
 			break;
 	}
 	return ret;
@@ -584,37 +666,38 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max,
 /*
  * nfs_scan_commit - Scan an inode for commit requests
  * @inode: NFS inode to scan
- * @dst: destination list
+ * @dst: mds destination list
+ * @cinfo: mds and ds lists of reqs ready to commit
  *
  * Moves requests from the inode's 'commit' request list.
  * The requests are *not* checked to ensure that they form a contiguous set.
  */
-static int
-nfs_scan_commit(struct inode *inode, struct list_head *dst)
+int
+nfs_scan_commit(struct inode *inode, struct list_head *dst,
+		struct nfs_commit_info *cinfo)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
 	int ret = 0;
 
-	spin_lock(&inode->i_lock);
-	if (nfsi->ncommit > 0) {
+	spin_lock(cinfo->lock);
+	if (cinfo->mds->ncommit > 0) {
 		const int max = INT_MAX;
 
-		ret = nfs_scan_commit_list(&nfsi->commit_list, dst, max,
-				&inode->i_lock);
-		ret += pnfs_scan_commit_lists(inode, max - ret,
-				&inode->i_lock);
+		ret = nfs_scan_commit_list(&cinfo->mds->list, dst,
+					   cinfo, max);
+		ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
 	}
-	spin_unlock(&inode->i_lock);
+	spin_unlock(cinfo->lock);
 	return ret;
 }
 
 #else
-static inline int nfs_need_commit(struct nfs_inode *nfsi)
+static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 {
 	return 0;
 }
 
-static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst)
+int nfs_scan_commit(struct inode *inode, struct list_head *dst,
+		    struct nfs_commit_info *cinfo)
 {
 	return 0;
 }
@@ -659,7 +742,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		    || end < req->wb_offset)
 			goto out_flushme;
 
-		if (nfs_lock_request_dontget(req))
+		if (nfs_lock_request(req))
 			break;
 
 		/* The request is locked, so wait and then retry */
@@ -729,7 +812,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	nfs_grow_file(page, offset, count);
 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
 	nfs_mark_request_dirty(req);
-	nfs_unlock_request(req);
+	nfs_unlock_and_release_request(req);
 	return 0;
 }
 
@@ -766,10 +849,14 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
  * the PageUptodate() flag. In this case, we will need to turn off
  * write optimisations that depend on the page contents being correct.
  */
-static int nfs_write_pageuptodate(struct page *page, struct inode *inode)
+static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
 {
-	return PageUptodate(page) &&
-		!(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA));
+	if (nfs_have_delegated_attributes(inode))
+		goto out;
+	if (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE)
+		return false;
+out:
+	return PageUptodate(page) != 0;
 }
 
 /*
@@ -815,17 +902,6 @@ int nfs_updatepage(struct file *file, struct page *page,
 	return status;
 }
 
-static void nfs_writepage_release(struct nfs_page *req,
-				  struct nfs_write_data *data)
-{
-	struct page *page = req->wb_page;
-
-	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data))
-		nfs_inode_remove_request(req);
-	nfs_unlock_request(req);
-	nfs_end_page_writeback(page);
-}
-
 static int flush_task_priority(int how)
 {
 	switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) {
@@ -837,18 +913,18 @@ static int flush_task_priority(int how)
 	return RPC_PRIORITY_NORMAL;
 }
 
-int nfs_initiate_write(struct nfs_write_data *data,
-		       struct rpc_clnt *clnt,
+int nfs_initiate_write(struct rpc_clnt *clnt,
+		       struct nfs_write_data *data,
 		       const struct rpc_call_ops *call_ops,
-		       int how)
+		       int how, int flags)
 {
-	struct inode *inode = data->inode;
+	struct inode *inode = data->header->inode;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
-		.rpc_cred = data->cred,
+		.rpc_cred = data->header->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = clnt,
@@ -857,7 +933,7 @@ int nfs_initiate_write(struct nfs_write_data *data,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC,
+		.flags = RPC_TASK_ASYNC | flags,
 		.priority = priority,
 	};
 	int ret = 0;
@@ -892,26 +968,21 @@ EXPORT_SYMBOL_GPL(nfs_initiate_write);
 /*
  * Set up the argument/result storage required for the RPC call.
  */
-static void nfs_write_rpcsetup(struct nfs_page *req,
-		struct nfs_write_data *data,
+static void nfs_write_rpcsetup(struct nfs_write_data *data,
 		unsigned int count, unsigned int offset,
-		int how)
+		int how, struct nfs_commit_info *cinfo)
 {
-	struct inode *inode = req->wb_context->dentry->d_inode;
+	struct nfs_page *req = data->header->req;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
 
-	data->req = req;
-	data->inode = inode = req->wb_context->dentry->d_inode;
-	data->cred = req->wb_context->cred;
-
-	data->args.fh     = NFS_FH(inode);
+	data->args.fh     = NFS_FH(data->header->inode);
 	data->args.offset = req_offset(req) + offset;
 	/* pnfs_set_layoutcommit needs this */
 	data->mds_offset = data->args.offset;
 	data->args.pgbase = req->wb_pgbase + offset;
-	data->args.pages  = data->pagevec;
+	data->args.pages  = data->pages.pagevec;
 	data->args.count  = count;
 	data->args.context = get_nfs_open_context(req->wb_context);
 	data->args.lock_context = req->wb_lock_context;
@@ -920,7 +991,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	case 0:
 		break;
 	case FLUSH_COND_STABLE:
-		if (nfs_need_commit(NFS_I(inode)))
+		if (nfs_reqs_to_commit(cinfo))
 			break;
 	default:
 		data->args.stable = NFS_FILE_SYNC;
@@ -936,9 +1007,9 @@ static int nfs_do_write(struct nfs_write_data *data,
 		const struct rpc_call_ops *call_ops,
 		int how)
 {
-	struct inode *inode = data->args.context->dentry->d_inode;
+	struct inode *inode = data->header->inode;
 
-	return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
+	return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0);
 }
 
 static int nfs_do_multiple_writes(struct list_head *head,
@@ -951,7 +1022,7 @@ static int nfs_do_multiple_writes(struct list_head *head,
 	while (!list_empty(head)) {
 		int ret2;
 
-		data = list_entry(head->next, struct nfs_write_data, list);
+		data = list_first_entry(head, struct nfs_write_data, list);
 		list_del_init(&data->list);
 		
 		ret2 = nfs_do_write(data, call_ops, how);
@@ -967,31 +1038,60 @@ static int nfs_do_multiple_writes(struct list_head *head,
  */
 static void nfs_redirty_request(struct nfs_page *req)
 {
-	struct page *page = req->wb_page;
-
 	nfs_mark_request_dirty(req);
 	nfs_unlock_request(req);
-	nfs_end_page_writeback(page);
+	nfs_end_page_writeback(req->wb_page);
+	nfs_release_request(req);
+}
+
+static void nfs_async_write_error(struct list_head *head)
+{
+	struct nfs_page	*req;
+
+	while (!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_redirty_request(req);
+	}
+}
+
+static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
+	.error_cleanup = nfs_async_write_error,
+	.completion = nfs_write_completion,
+};
+
+static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
+		struct nfs_pgio_header *hdr)
+{
+	set_bit(NFS_IOHDR_REDO, &hdr->flags);
+	while (!list_empty(&hdr->rpc_list)) {
+		struct nfs_write_data *data = list_first_entry(&hdr->rpc_list,
+				struct nfs_write_data, list);
+		list_del(&data->list);
+		nfs_writedata_release(data);
+	}
+	desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 }
 
 /*
  * Generate multiple small requests to write out a single
  * contiguous dirty area on one page.
  */
-static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head *res)
+static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
+			   struct nfs_pgio_header *hdr)
 {
-	struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
+	struct nfs_page *req = hdr->req;
 	struct page *page = req->wb_page;
 	struct nfs_write_data *data;
 	size_t wsize = desc->pg_bsize, nbytes;
 	unsigned int offset;
 	int requests = 0;
-	int ret = 0;
+	struct nfs_commit_info cinfo;
 
-	nfs_list_remove_request(req);
+	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
 
 	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
-	    (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit ||
+	    (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) ||
 	     desc->pg_count > wsize))
 		desc->pg_ioflags &= ~FLUSH_COND_STABLE;
 
@@ -1001,28 +1101,22 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head
 	do {
 		size_t len = min(nbytes, wsize);
 
-		data = nfs_writedata_alloc(1);
-		if (!data)
-			goto out_bad;
-		data->pagevec[0] = page;
-		nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags);
-		list_add(&data->list, res);
+		data = nfs_writedata_alloc(hdr, 1);
+		if (!data) {
+			nfs_flush_error(desc, hdr);
+			return -ENOMEM;
+		}
+		data->pages.pagevec[0] = page;
+		nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo);
+		list_add(&data->list, &hdr->rpc_list);
 		requests++;
 		nbytes -= len;
 		offset += len;
 	} while (nbytes != 0);
-	atomic_set(&req->wb_complete, requests);
-	desc->pg_rpc_callops = &nfs_write_partial_ops;
-	return ret;
-
-out_bad:
-	while (!list_empty(res)) {
-		data = list_entry(res->next, struct nfs_write_data, list);
-		list_del(&data->list);
-		nfs_writedata_release(data);
-	}
-	nfs_redirty_request(req);
-	return -ENOMEM;
+	nfs_list_remove_request(req);
+	nfs_list_add_request(req, &hdr->pages);
+	desc->pg_rpc_callops = &nfs_write_common_ops;
+	return 0;
 }
 
 /*
@@ -1033,62 +1127,71 @@ out_bad:
  * This is the case if nfs_updatepage detects a conflicting request
  * that has been written but not committed.
  */
-static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *res)
+static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
+			 struct nfs_pgio_header *hdr)
 {
 	struct nfs_page		*req;
 	struct page		**pages;
 	struct nfs_write_data	*data;
 	struct list_head *head = &desc->pg_list;
-	int ret = 0;
+	struct nfs_commit_info cinfo;
 
-	data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
-						      desc->pg_count));
+	data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base,
+							   desc->pg_count));
 	if (!data) {
-		while (!list_empty(head)) {
-			req = nfs_list_entry(head->next);
-			nfs_list_remove_request(req);
-			nfs_redirty_request(req);
-		}
-		ret = -ENOMEM;
-		goto out;
+		nfs_flush_error(desc, hdr);
+		return -ENOMEM;
 	}
-	pages = data->pagevec;
+
+	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
+	pages = data->pages.pagevec;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
-		nfs_list_add_request(req, &data->pages);
+		nfs_list_add_request(req, &hdr->pages);
 		*pages++ = req->wb_page;
 	}
-	req = nfs_list_entry(data->pages.next);
 
 	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
-	    (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
+	    (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
 		desc->pg_ioflags &= ~FLUSH_COND_STABLE;
 
 	/* Set up the argument struct */
-	nfs_write_rpcsetup(req, data, desc->pg_count, 0, desc->pg_ioflags);
-	list_add(&data->list, res);
-	desc->pg_rpc_callops = &nfs_write_full_ops;
-out:
-	return ret;
+	nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
+	list_add(&data->list, &hdr->rpc_list);
+	desc->pg_rpc_callops = &nfs_write_common_ops;
+	return 0;
 }
 
-int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head)
+int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
+		      struct nfs_pgio_header *hdr)
 {
 	if (desc->pg_bsize < PAGE_CACHE_SIZE)
-		return nfs_flush_multi(desc, head);
-	return nfs_flush_one(desc, head);
+		return nfs_flush_multi(desc, hdr);
+	return nfs_flush_one(desc, hdr);
 }
 
 static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 {
-	LIST_HEAD(head);
+	struct nfs_write_header *whdr;
+	struct nfs_pgio_header *hdr;
 	int ret;
 
-	ret = nfs_generic_flush(desc, &head);
+	whdr = nfs_writehdr_alloc();
+	if (!whdr) {
+		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
+		return -ENOMEM;
+	}
+	hdr = &whdr->header;
+	nfs_pgheader_init(desc, hdr, nfs_writehdr_free);
+	atomic_inc(&hdr->refcnt);
+	ret = nfs_generic_flush(desc, hdr);
 	if (ret == 0)
-		ret = nfs_do_multiple_writes(&head, desc->pg_rpc_callops,
-				desc->pg_ioflags);
+		ret = nfs_do_multiple_writes(&hdr->rpc_list,
+					     desc->pg_rpc_callops,
+					     desc->pg_ioflags);
+	if (atomic_dec_and_test(&hdr->refcnt))
+		hdr->completion_ops->completion(hdr);
 	return ret;
 }
 
@@ -1098,9 +1201,10 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = {
 };
 
 void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
-				  struct inode *inode, int ioflags)
+			       struct inode *inode, int ioflags,
+			       const struct nfs_pgio_completion_ops *compl_ops)
 {
-	nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
+	nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops,
 				NFS_SERVER(inode)->wsize, ioflags);
 }
 
@@ -1111,80 +1215,27 @@ void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
 
-static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
-				  struct inode *inode, int ioflags)
+void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+			   struct inode *inode, int ioflags,
+			   const struct nfs_pgio_completion_ops *compl_ops)
 {
-	if (!pnfs_pageio_init_write(pgio, inode, ioflags))
-		nfs_pageio_init_write_mds(pgio, inode, ioflags);
+	if (!pnfs_pageio_init_write(pgio, inode, ioflags, compl_ops))
+		nfs_pageio_init_write_mds(pgio, inode, ioflags, compl_ops);
 }
 
-/*
- * Handle a write reply that flushed part of a page.
- */
-static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
+void nfs_write_prepare(struct rpc_task *task, void *calldata)
 {
-	struct nfs_write_data	*data = calldata;
-
-	dprintk("NFS: %5u write(%s/%lld %d@%lld)",
-		task->tk_pid,
-		data->req->wb_context->dentry->d_inode->i_sb->s_id,
-		(long long)
-		  NFS_FILEID(data->req->wb_context->dentry->d_inode),
-		data->req->wb_bytes, (long long)req_offset(data->req));
-
-	nfs_writeback_done(task, data);
+	struct nfs_write_data *data = calldata;
+	NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
 }
 
-static void nfs_writeback_release_partial(void *calldata)
+void nfs_commit_prepare(struct rpc_task *task, void *calldata)
 {
-	struct nfs_write_data	*data = calldata;
-	struct nfs_page		*req = data->req;
-	struct page		*page = req->wb_page;
-	int status = data->task.tk_status;
+	struct nfs_commit_data *data = calldata;
 
-	if (status < 0) {
-		nfs_set_pageerror(page);
-		nfs_context_set_write_error(req->wb_context, status);
-		dprintk(", error = %d\n", status);
-		goto out;
-	}
-
-	if (nfs_write_need_commit(data)) {
-		struct inode *inode = page->mapping->host;
-
-		spin_lock(&inode->i_lock);
-		if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) {
-			/* Do nothing we need to resend the writes */
-		} else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) {
-			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
-			dprintk(" defer commit\n");
-		} else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) {
-			set_bit(PG_NEED_RESCHED, &req->wb_flags);
-			clear_bit(PG_NEED_COMMIT, &req->wb_flags);
-			dprintk(" server reboot detected\n");
-		}
-		spin_unlock(&inode->i_lock);
-	} else
-		dprintk(" OK\n");
-
-out:
-	if (atomic_dec_and_test(&req->wb_complete))
-		nfs_writepage_release(req, data);
-	nfs_writedata_release(calldata);
+	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-void nfs_write_prepare(struct rpc_task *task, void *calldata)
-{
-	struct nfs_write_data *data = calldata;
-	NFS_PROTO(data->inode)->write_rpc_prepare(task, data);
-}
-
-static const struct rpc_call_ops nfs_write_partial_ops = {
-	.rpc_call_prepare = nfs_write_prepare,
-	.rpc_call_done = nfs_writeback_done_partial,
-	.rpc_release = nfs_writeback_release_partial,
-};
-
 /*
  * Handle a write reply that flushes a whole page.
  *
@@ -1192,59 +1243,37 @@ static const struct rpc_call_ops nfs_write_partial_ops = {
  *	  writebacks since the page->count is kept > 1 for as long
  *	  as the page has a write request pending.
  */
-static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
+static void nfs_writeback_done_common(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
 
 	nfs_writeback_done(task, data);
 }
 
-static void nfs_writeback_release_full(void *calldata)
+static void nfs_writeback_release_common(void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
+	struct nfs_pgio_header *hdr = data->header;
 	int status = data->task.tk_status;
+	struct nfs_page *req = hdr->req;
 
-	/* Update attributes as result of writeback. */
-	while (!list_empty(&data->pages)) {
-		struct nfs_page *req = nfs_list_entry(data->pages.next);
-		struct page *page = req->wb_page;
-
-		nfs_list_remove_request(req);
-
-		dprintk("NFS: %5u write (%s/%lld %d@%lld)",
-			data->task.tk_pid,
-			req->wb_context->dentry->d_inode->i_sb->s_id,
-			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
-			req->wb_bytes,
-			(long long)req_offset(req));
-
-		if (status < 0) {
-			nfs_set_pageerror(page);
-			nfs_context_set_write_error(req->wb_context, status);
-			dprintk(", error = %d\n", status);
-			goto remove_request;
-		}
-
-		if (nfs_write_need_commit(data)) {
+	if ((status >= 0) && nfs_write_need_commit(data)) {
+		spin_lock(&hdr->lock);
+		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
+			; /* Do nothing */
+		else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
 			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
-			nfs_mark_request_commit(req, data->lseg);
-			dprintk(" marked for commit\n");
-			goto next;
-		}
-		dprintk(" OK\n");
-remove_request:
-		nfs_inode_remove_request(req);
-	next:
-		nfs_unlock_request(req);
-		nfs_end_page_writeback(page);
+		else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf)))
+			set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
+		spin_unlock(&hdr->lock);
 	}
-	nfs_writedata_release(calldata);
+	nfs_writedata_release(data);
 }
 
-static const struct rpc_call_ops nfs_write_full_ops = {
+static const struct rpc_call_ops nfs_write_common_ops = {
 	.rpc_call_prepare = nfs_write_prepare,
-	.rpc_call_done = nfs_writeback_done_full,
-	.rpc_release = nfs_writeback_release_full,
+	.rpc_call_done = nfs_writeback_done_common,
+	.rpc_release = nfs_writeback_release_common,
 };
 
 
@@ -1255,6 +1284,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct nfs_writeargs	*argp = &data->args;
 	struct nfs_writeres	*resp = &data->res;
+	struct inode		*inode = data->header->inode;
 	int status;
 
 	dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
@@ -1267,10 +1297,10 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 	 * another writer had changed the file, but some applications
 	 * depend on tighter cache coherency when writing.
 	 */
-	status = NFS_PROTO(data->inode)->write_done(task, data);
+	status = NFS_PROTO(inode)->write_done(task, data);
 	if (status != 0)
 		return;
-	nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
+	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 	if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
@@ -1288,46 +1318,47 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 		if (time_before(complain, jiffies)) {
 			dprintk("NFS:       faulty NFS server %s:"
 				" (committed = %d) != (stable = %d)\n",
-				NFS_SERVER(data->inode)->nfs_client->cl_hostname,
+				NFS_SERVER(inode)->nfs_client->cl_hostname,
 				resp->verf->committed, argp->stable);
 			complain = jiffies + 300 * HZ;
 		}
 	}
 #endif
-	/* Is this a short write? */
-	if (task->tk_status >= 0 && resp->count < argp->count) {
+	if (task->tk_status < 0)
+		nfs_set_pgio_error(data->header, task->tk_status, argp->offset);
+	else if (resp->count < argp->count) {
 		static unsigned long    complain;
 
-		nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE);
+		/* This a short write! */
+		nfs_inc_stats(inode, NFSIOS_SHORTWRITE);
 
 		/* Has the server at least made some progress? */
-		if (resp->count != 0) {
-			/* Was this an NFSv2 write or an NFSv3 stable write? */
-			if (resp->verf->committed != NFS_UNSTABLE) {
-				/* Resend from where the server left off */
-				data->mds_offset += resp->count;
-				argp->offset += resp->count;
-				argp->pgbase += resp->count;
-				argp->count -= resp->count;
-			} else {
-				/* Resend as a stable write in order to avoid
-				 * headaches in the case of a server crash.
-				 */
-				argp->stable = NFS_FILE_SYNC;
+		if (resp->count == 0) {
+			if (time_before(complain, jiffies)) {
+				printk(KERN_WARNING
+				       "NFS: Server wrote zero bytes, expected %u.\n",
+				       argp->count);
+				complain = jiffies + 300 * HZ;
 			}
-			rpc_restart_call_prepare(task);
+			nfs_set_pgio_error(data->header, -EIO, argp->offset);
+			task->tk_status = -EIO;
 			return;
 		}
-		if (time_before(complain, jiffies)) {
-			printk(KERN_WARNING
-			       "NFS: Server wrote zero bytes, expected %u.\n",
-					argp->count);
-			complain = jiffies + 300 * HZ;
+		/* Was this an NFSv2 write or an NFSv3 stable write? */
+		if (resp->verf->committed != NFS_UNSTABLE) {
+			/* Resend from where the server left off */
+			data->mds_offset += resp->count;
+			argp->offset += resp->count;
+			argp->pgbase += resp->count;
+			argp->count -= resp->count;
+		} else {
+			/* Resend as a stable write in order to avoid
+			 * headaches in the case of a server crash.
+			 */
+			argp->stable = NFS_FILE_SYNC;
 		}
-		/* Can't do anything about it except throw an error. */
-		task->tk_status = -EIO;
+		rpc_restart_call_prepare(task);
 	}
-	return;
 }
 
 
@@ -1347,26 +1378,23 @@ static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
 	return (ret < 0) ? ret : 1;
 }
 
-void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
 {
 	clear_bit(NFS_INO_COMMIT, &nfsi->flags);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
 }
-EXPORT_SYMBOL_GPL(nfs_commit_clear_lock);
 
-void nfs_commitdata_release(void *data)
+void nfs_commitdata_release(struct nfs_commit_data *data)
 {
-	struct nfs_write_data *wdata = data;
-
-	put_nfs_open_context(wdata->args.context);
-	nfs_commit_free(wdata);
+	put_nfs_open_context(data->context);
+	nfs_commit_free(data);
 }
 EXPORT_SYMBOL_GPL(nfs_commitdata_release);
 
-int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt,
+int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
 			const struct rpc_call_ops *call_ops,
-			int how)
+			int how, int flags)
 {
 	struct rpc_task *task;
 	int priority = flush_task_priority(how);
@@ -1382,7 +1410,7 @@ int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = RPC_TASK_ASYNC,
+		.flags = RPC_TASK_ASYNC | flags,
 		.priority = priority,
 	};
 	/* Set up the initial task struct.  */
@@ -1403,9 +1431,10 @@ EXPORT_SYMBOL_GPL(nfs_initiate_commit);
 /*
  * Set up the argument/result storage required for the RPC call.
  */
-void nfs_init_commit(struct nfs_write_data *data,
-			    struct list_head *head,
-			    struct pnfs_layout_segment *lseg)
+void nfs_init_commit(struct nfs_commit_data *data,
+		     struct list_head *head,
+		     struct pnfs_layout_segment *lseg,
+		     struct nfs_commit_info *cinfo)
 {
 	struct nfs_page *first = nfs_list_entry(head->next);
 	struct inode *inode = first->wb_context->dentry->d_inode;
@@ -1419,13 +1448,14 @@ void nfs_init_commit(struct nfs_write_data *data,
 	data->cred	  = first->wb_context->cred;
 	data->lseg	  = lseg; /* reference transferred */
 	data->mds_ops     = &nfs_commit_ops;
+	data->completion_ops = cinfo->completion_ops;
+	data->dreq	  = cinfo->dreq;
 
 	data->args.fh     = NFS_FH(data->inode);
 	/* Note: we always request a commit of the entire inode */
 	data->args.offset = 0;
 	data->args.count  = 0;
-	data->args.context = get_nfs_open_context(first->wb_context);
-	data->res.count   = 0;
+	data->context     = get_nfs_open_context(first->wb_context);
 	data->res.fattr   = &data->fattr;
 	data->res.verf    = &data->verf;
 	nfs_fattr_init(&data->fattr);
@@ -1433,18 +1463,21 @@ void nfs_init_commit(struct nfs_write_data *data,
 EXPORT_SYMBOL_GPL(nfs_init_commit);
 
 void nfs_retry_commit(struct list_head *page_list,
-		      struct pnfs_layout_segment *lseg)
+		      struct pnfs_layout_segment *lseg,
+		      struct nfs_commit_info *cinfo)
 {
 	struct nfs_page *req;
 
 	while (!list_empty(page_list)) {
 		req = nfs_list_entry(page_list->next);
 		nfs_list_remove_request(req);
-		nfs_mark_request_commit(req, lseg);
-		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
-			     BDI_RECLAIMABLE);
-		nfs_unlock_request(req);
+		nfs_mark_request_commit(req, lseg, cinfo);
+		if (!cinfo->dreq) {
+			dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+			dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+				     BDI_RECLAIMABLE);
+		}
+		nfs_unlock_and_release_request(req);
 	}
 }
 EXPORT_SYMBOL_GPL(nfs_retry_commit);
@@ -1453,9 +1486,10 @@ EXPORT_SYMBOL_GPL(nfs_retry_commit);
  * Commit dirty pages
  */
 static int
-nfs_commit_list(struct inode *inode, struct list_head *head, int how)
+nfs_commit_list(struct inode *inode, struct list_head *head, int how,
+		struct nfs_commit_info *cinfo)
 {
-	struct nfs_write_data	*data;
+	struct nfs_commit_data	*data;
 
 	data = nfs_commitdata_alloc();
 
@@ -1463,11 +1497,13 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 		goto out_bad;
 
 	/* Set up the argument struct */
-	nfs_init_commit(data, head, NULL);
-	return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how);
+	nfs_init_commit(data, head, NULL, cinfo);
+	atomic_inc(&cinfo->mds->rpcs_out);
+	return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops,
+				   how, 0);
  out_bad:
-	nfs_retry_commit(head, NULL);
-	nfs_commit_clear_lock(NFS_I(inode));
+	nfs_retry_commit(head, NULL, cinfo);
+	cinfo->completion_ops->error_cleanup(NFS_I(inode));
 	return -ENOMEM;
 }
 
@@ -1476,7 +1512,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
  */
 static void nfs_commit_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_write_data	*data = calldata;
+	struct nfs_commit_data	*data = calldata;
 
         dprintk("NFS: %5u nfs_commit_done (status %d)\n",
                                 task->tk_pid, task->tk_status);
@@ -1485,10 +1521,11 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 	NFS_PROTO(data->inode)->commit_done(task, data);
 }
 
-void nfs_commit_release_pages(struct nfs_write_data *data)
+static void nfs_commit_release_pages(struct nfs_commit_data *data)
 {
 	struct nfs_page	*req;
 	int status = data->task.tk_status;
+	struct nfs_commit_info cinfo;
 
 	while (!list_empty(&data->pages)) {
 		req = nfs_list_entry(data->pages.next);
@@ -1519,42 +1556,59 @@ void nfs_commit_release_pages(struct nfs_write_data *data)
 		dprintk(" mismatch\n");
 		nfs_mark_request_dirty(req);
 	next:
-		nfs_unlock_request(req);
+		nfs_unlock_and_release_request(req);
 	}
+	nfs_init_cinfo(&cinfo, data->inode, data->dreq);
+	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
+		nfs_commit_clear_lock(NFS_I(data->inode));
 }
-EXPORT_SYMBOL_GPL(nfs_commit_release_pages);
 
 static void nfs_commit_release(void *calldata)
 {
-	struct nfs_write_data *data = calldata;
+	struct nfs_commit_data *data = calldata;
 
-	nfs_commit_release_pages(data);
-	nfs_commit_clear_lock(NFS_I(data->inode));
+	data->completion_ops->completion(data);
 	nfs_commitdata_release(calldata);
 }
 
 static const struct rpc_call_ops nfs_commit_ops = {
-	.rpc_call_prepare = nfs_write_prepare,
+	.rpc_call_prepare = nfs_commit_prepare,
 	.rpc_call_done = nfs_commit_done,
 	.rpc_release = nfs_commit_release,
 };
 
+static const struct nfs_commit_completion_ops nfs_commit_completion_ops = {
+	.completion = nfs_commit_release_pages,
+	.error_cleanup = nfs_commit_clear_lock,
+};
+
+int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
+			    int how, struct nfs_commit_info *cinfo)
+{
+	int status;
+
+	status = pnfs_commit_list(inode, head, how, cinfo);
+	if (status == PNFS_NOT_ATTEMPTED)
+		status = nfs_commit_list(inode, head, how, cinfo);
+	return status;
+}
+
 int nfs_commit_inode(struct inode *inode, int how)
 {
 	LIST_HEAD(head);
+	struct nfs_commit_info cinfo;
 	int may_wait = how & FLUSH_SYNC;
 	int res;
 
 	res = nfs_commit_set_lock(NFS_I(inode), may_wait);
 	if (res <= 0)
 		goto out_mark_dirty;
-	res = nfs_scan_commit(inode, &head);
+	nfs_init_cinfo_from_inode(&cinfo, inode);
+	res = nfs_scan_commit(inode, &head, &cinfo);
 	if (res) {
 		int error;
 
-		error = pnfs_commit_list(inode, &head, how);
-		if (error == PNFS_NOT_ATTEMPTED)
-			error = nfs_commit_list(inode, &head, how);
+		error = nfs_generic_commit_list(inode, &head, how, &cinfo);
 		if (error < 0)
 			return error;
 		if (!may_wait)
@@ -1585,14 +1639,14 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
 	int ret = 0;
 
 	/* no commits means nothing needs to be done */
-	if (!nfsi->ncommit)
+	if (!nfsi->commit_info.ncommit)
 		return ret;
 
 	if (wbc->sync_mode == WB_SYNC_NONE) {
 		/* Don't commit yet if this is a non-blocking flush and there
 		 * are a lot of outstanding writes for this mapping.
 		 */
-		if (nfsi->ncommit <= (nfsi->npages >> 1))
+		if (nfsi->commit_info.ncommit <= (nfsi->npages >> 1))
 			goto out_mark_dirty;
 
 		/* don't wait for the COMMIT response */
@@ -1665,7 +1719,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 		req = nfs_page_find_request(page);
 		if (req == NULL)
 			break;
-		if (nfs_lock_request_dontget(req)) {
+		if (nfs_lock_request(req)) {
 			nfs_clear_request_commit(req);
 			nfs_inode_remove_request(req);
 			/*
@@ -1673,7 +1727,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 			 * page as being dirty
 			 */
 			cancel_dirty_page(page, PAGE_CACHE_SIZE);
-			nfs_unlock_request(req);
+			nfs_unlock_and_release_request(req);
 			break;
 		}
 		ret = nfs_wait_on_request(req);
@@ -1742,7 +1796,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 int __init nfs_init_writepagecache(void)
 {
 	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
-					     sizeof(struct nfs_write_data),
+					     sizeof(struct nfs_write_header),
 					     0, SLAB_HWCACHE_ALIGN,
 					     NULL);
 	if (nfs_wdata_cachep == NULL)
@@ -1753,6 +1807,13 @@ int __init nfs_init_writepagecache(void)
 	if (nfs_wdata_mempool == NULL)
 		return -ENOMEM;
 
+	nfs_cdata_cachep = kmem_cache_create("nfs_commit_data",
+					     sizeof(struct nfs_commit_data),
+					     0, SLAB_HWCACHE_ALIGN,
+					     NULL);
+	if (nfs_cdata_cachep == NULL)
+		return -ENOMEM;
+
 	nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
 						      nfs_wdata_cachep);
 	if (nfs_commit_mempool == NULL)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d2d3108a611c..d7d711876b6a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -411,12 +411,13 @@ static const struct file_operations proc_lstats_operations = {
 
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
+	unsigned long totalpages = totalram_pages + total_swap_pages;
 	unsigned long points = 0;
 
 	read_lock(&tasklist_lock);
 	if (pid_alive(task))
-		points = oom_badness(task, NULL, NULL,
-					totalram_pages + total_swap_pages);
+		points = oom_badness(task, NULL, NULL, totalpages) *
+						1000 / totalpages;
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 1030a716d155..7faaf2acc570 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -784,7 +784,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 	/* find the first VMA at or above 'addr' */
 	vma = find_vma(walk->mm, addr);
-	if (pmd_trans_huge_lock(pmd, vma) == 1) {
+	if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
 		for (; addr != end; addr += PAGE_SIZE) {
 			unsigned long offset;
 
diff --git a/fs/splice.c b/fs/splice.c
index f8476841eb04..406ef2b792c2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1388,7 +1388,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
  */
 static int get_iovec_page_array(const struct iovec __user *iov,
 				unsigned int nr_vecs, struct page **pages,
-				struct partial_page *partial, int aligned,
+				struct partial_page *partial, bool aligned,
 				unsigned int pipe_buffers)
 {
 	int buffers = 0, error = 0;
@@ -1626,7 +1626,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 		return -ENOMEM;
 
 	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
-					    spd.partial, flags & SPLICE_F_GIFT,
+					    spd.partial, false,
 					    pipe->buffers);
 	if (spd.nr_pages <= 0)
 		ret = spd.nr_pages;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index e2768f188f55..6f2b45a9b6bc 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -445,6 +445,18 @@ static inline int pmd_write(pmd_t pmd)
 #endif /* __HAVE_ARCH_PMD_WRITE */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#ifndef pmd_read_atomic
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+	/*
+	 * Depend on compiler for an atomic pmd read. NOTE: this is
+	 * only going to work, if the pmdval_t isn't larger than
+	 * an unsigned long.
+	 */
+	return *pmdp;
+}
+#endif
+
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
@@ -458,11 +470,17 @@ static inline int pmd_write(pmd_t pmd)
  * undefined so behaving like if the pmd was none is safe (because it
  * can return none anyway). The compiler level barrier() is critically
  * important to compute the two checks atomically on the same pmdval.
+ *
+ * For 32bit kernels with a 64bit large pmd_t this automatically takes
+ * care of reading the pmd atomically to avoid SMP race conditions
+ * against pmd_populate() when the mmap_sem is hold for reading by the
+ * caller (a special atomic read not done by "gcc" as in the generic
+ * version above, is also needed when THP is disabled because the page
+ * fault can populate the pmd from under us).
  */
 static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
 {
-	/* depend on compiler for an atomic pmd read */
-	pmd_t pmdval = *pmd;
+	pmd_t pmdval = pmd_read_atomic(pmd);
 	/*
 	 * The barrier will stabilize the pmdval in a register or on
 	 * the stack so that it will stop changing under the code.
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 4cd59b95858f..7185b8f15ced 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -225,6 +225,7 @@ header-y += kd.h
 header-y += kdev_t.h
 header-y += kernel.h
 header-y += kernelcapi.h
+header-y += kernel-page-flags.h
 header-y += keyboard.h
 header-y += keyctl.h
 header-y += l2tp.h
diff --git a/include/linux/apple_bl.h b/include/linux/apple_bl.h
index 47bedc0eee69..0a95e730fcea 100644
--- a/include/linux/apple_bl.h
+++ b/include/linux/apple_bl.h
@@ -5,7 +5,7 @@
 #ifndef _LINUX_APPLE_BL_H
 #define _LINUX_APPLE_BL_H
 
-#ifdef CONFIG_BACKLIGHT_APPLE
+#if defined(CONFIG_BACKLIGHT_APPLE) || defined(CONFIG_BACKLIGHT_APPLE_MODULE)
 
 extern int apple_bl_register(void);
 extern void apple_bl_unregister(void);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4d94eb8bcbcc..26435890dc87 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -269,6 +269,14 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set
 extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
+#ifdef CONFIG_BLK_CGROUP
+int bio_associate_current(struct bio *bio);
+void bio_disassociate_task(struct bio *bio);
+#else	/* CONFIG_BLK_CGROUP */
+static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
+static inline void bio_disassociate_task(struct bio *bio) { }
+#endif	/* CONFIG_BLK_CGROUP */
+
 /*
  * bio_set is used to allow other portions of the IO system to
  * allocate their own private memory pools for bio and iovec structures.
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4053cbd4490e..0edb65dd8edd 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -14,6 +14,8 @@ struct bio;
 struct bio_integrity_payload;
 struct page;
 struct block_device;
+struct io_context;
+struct cgroup_subsys_state;
 typedef void (bio_end_io_t) (struct bio *, int);
 typedef void (bio_destructor_t) (struct bio *);
 
@@ -66,6 +68,14 @@ struct bio {
 	bio_end_io_t		*bi_end_io;
 
 	void			*bi_private;
+#ifdef CONFIG_BLK_CGROUP
+	/*
+	 * Optional ioc and css associated with this bio.  Put on bio
+	 * release.  Read comment on top of bio_associate_current().
+	 */
+	struct io_context	*bi_ioc;
+	struct cgroup_subsys_state *bi_css;
+#endif
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	struct bio_integrity_payload *bi_integrity;  /* data integrity */
 #endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4d4ac24a263e..ba43f408baa3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -32,10 +32,17 @@ struct blk_trace;
 struct request;
 struct sg_io_hdr;
 struct bsg_job;
+struct blkcg_gq;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
 
+/*
+ * Maximum number of blkcg policies allowed to be registered concurrently.
+ * Defined here to simplify include dependency.
+ */
+#define BLKCG_MAX_POLS		2
+
 struct request;
 typedef void (rq_end_io_fn)(struct request *, int);
 
@@ -363,6 +370,11 @@ struct request_queue {
 	struct list_head	timeout_list;
 
 	struct list_head	icq_list;
+#ifdef CONFIG_BLK_CGROUP
+	DECLARE_BITMAP		(blkcg_pols, BLKCG_MAX_POLS);
+	struct blkcg_gq		*root_blkg;
+	struct list_head	blkg_list;
+#endif
 
 	struct queue_limits	limits;
 
@@ -390,12 +402,17 @@ struct request_queue {
 
 	struct mutex		sysfs_lock;
 
+	int			bypass_depth;
+
 #if defined(CONFIG_BLK_DEV_BSG)
 	bsg_job_fn		*bsg_job_fn;
 	int			bsg_job_size;
 	struct bsg_class_device bsg_dev;
 #endif
 
+#ifdef CONFIG_BLK_CGROUP
+	struct list_head	all_q_node;
+#endif
 #ifdef CONFIG_BLK_DEV_THROTTLING
 	/* Throttle data */
 	struct throtl_data *td;
@@ -407,7 +424,7 @@ struct request_queue {
 #define	QUEUE_FLAG_SYNCFULL	3	/* read queue has been filled */
 #define QUEUE_FLAG_ASYNCFULL	4	/* write queue has been filled */
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
-#define QUEUE_FLAG_ELVSWITCH	6	/* don't use elevator, just do FIFO */
+#define QUEUE_FLAG_BYPASS	6	/* act as dumb FIFO queue */
 #define QUEUE_FLAG_BIDI		7	/* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES     8	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP	9	/* complete on same CPU-group */
@@ -491,6 +508,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
+#define blk_queue_bypass(q)	test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_noxmerges(q)	\
 	test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 1a0cd270bb7a..324fe08ea3b1 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -135,9 +135,6 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
 				   int flags);
 
-extern void *alloc_bootmem_section(unsigned long size,
-				   unsigned long section_nr);
-
 #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
 extern void *alloc_remap(int nid, unsigned long size);
 #else
diff --git a/include/linux/bug.h b/include/linux/bug.h
index 72961c39576a..aaac4bba6f5c 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -30,6 +30,13 @@ struct pt_regs;
 #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
 #define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); }))
 
+/*
+ * BUILD_BUG_ON_INVALID() permits the compiler to check the validity of the
+ * expression but avoids the generation of any code, even if that expression
+ * has side-effects.
+ */
+#define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
+
 /**
  * BUILD_BUG_ON - break compile if a condition is true.
  * @condition: the condition which the compiler should know is false.
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 51a90b7f2d60..e988037abd2a 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_COMPACTION_H
 #define _LINUX_COMPACTION_H
 
+#include <linux/node.h>
+
 /* Return values for compact_zone() and try_to_compact_pages() */
 /* compaction didn't start as it was not possible or direct reclaim was more suitable */
 #define COMPACT_SKIPPED		0
@@ -11,6 +13,23 @@
 /* The full zone was compacted */
 #define COMPACT_COMPLETE	3
 
+/*
+ * compaction supports three modes
+ *
+ * COMPACT_ASYNC_MOVABLE uses asynchronous migration and only scans
+ *    MIGRATE_MOVABLE pageblocks as migration sources and targets.
+ * COMPACT_ASYNC_UNMOVABLE uses asynchronous migration and only scans
+ *    MIGRATE_MOVABLE pageblocks as migration sources.
+ *    MIGRATE_UNMOVABLE pageblocks are scanned as potential migration
+ *    targets and convers them to MIGRATE_MOVABLE if possible
+ * COMPACT_SYNC uses synchronous migration and scans all pageblocks
+ */
+enum compact_mode {
+	COMPACT_ASYNC_MOVABLE,
+	COMPACT_ASYNC_UNMOVABLE,
+	COMPACT_SYNC,
+};
+
 #ifdef CONFIG_COMPACTION
 extern int sysctl_compact_memory;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 9e5f5607eba3..47e3d4850584 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -53,7 +53,7 @@
 
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.11"
+#define REL_VERSION "8.3.13"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 96
@@ -112,8 +112,8 @@ enum drbd_ret_code {
 	ERR_OPEN_MD_DISK	= 105,
 	ERR_DISK_NOT_BDEV	= 107,
 	ERR_MD_NOT_BDEV		= 108,
-	ERR_DISK_TO_SMALL	= 111,
-	ERR_MD_DISK_TO_SMALL	= 112,
+	ERR_DISK_TOO_SMALL	= 111,
+	ERR_MD_DISK_TOO_SMALL	= 112,
 	ERR_BDCLAIM_DISK	= 114,
 	ERR_BDCLAIM_MD_DISK	= 115,
 	ERR_MD_IDX_INVALID	= 116,
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 447c36752385..fb670bf603f7 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -48,6 +48,11 @@
 #define DRBD_TIMEOUT_MAX 600
 #define DRBD_TIMEOUT_DEF 60       /* 6 seconds */
 
+ /* If backing disk takes longer than disk_timeout, mark the disk as failed */
+#define DRBD_DISK_TIMEOUT_MIN 0    /* 0 = disabled */
+#define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */
+#define DRBD_DISK_TIMEOUT_DEF 0    /* disabled */
+
   /* active connection retries when C_WF_CONNECTION */
 #define DRBD_CONNECT_INT_MIN 1
 #define DRBD_CONNECT_INT_MAX 120
@@ -60,7 +65,7 @@
 
  /* timeout for the ping packets.*/
 #define DRBD_PING_TIMEO_MIN  1
-#define DRBD_PING_TIMEO_MAX  100
+#define DRBD_PING_TIMEO_MAX  300
 #define DRBD_PING_TIMEO_DEF  5
 
   /* max number of write requests between write barriers */
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
index ab6159e4fcf0..a8706f08ab36 100644
--- a/include/linux/drbd_nl.h
+++ b/include/linux/drbd_nl.h
@@ -31,9 +31,12 @@ NL_PACKET(disk_conf, 3,
 	NL_INTEGER(	56,	T_MAY_IGNORE,	max_bio_bvecs)
 	NL_BIT(		57,	T_MAY_IGNORE,	no_disk_barrier)
 	NL_BIT(		58,	T_MAY_IGNORE,	no_disk_drain)
+	NL_INTEGER(	89,	T_MAY_IGNORE,	disk_timeout)
 )
 
-NL_PACKET(detach, 4, )
+NL_PACKET(detach, 4,
+	NL_BIT(		88,	T_MANDATORY,	detach_force)
+)
 
 NL_PACKET(net_conf, 5,
 	NL_STRING(	8,	T_MANDATORY,	my_addr,	128)
diff --git a/include/linux/edac.h b/include/linux/edac.h
index c621d762bb2c..91ba3bae42ee 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -71,6 +71,25 @@ enum dev_type {
 #define DEV_FLAG_X64		BIT(DEV_X64)
 
 /**
+ * enum hw_event_mc_err_type - type of the detected error
+ *
+ * @HW_EVENT_ERR_CORRECTED:	Corrected Error - Indicates that an ECC
+ *				corrected error was detected
+ * @HW_EVENT_ERR_UNCORRECTED:	Uncorrected Error - Indicates an error that
+ *				can't be corrected by ECC, but it is not
+ *				fatal (maybe it is on an unused memory area,
+ *				or the memory controller could recover from
+ *				it for example, by re-trying the operation).
+ * @HW_EVENT_ERR_FATAL:		Fatal Error - Uncorrected error that could not
+ *				be recovered.
+ */
+enum hw_event_mc_err_type {
+	HW_EVENT_ERR_CORRECTED,
+	HW_EVENT_ERR_UNCORRECTED,
+	HW_EVENT_ERR_FATAL,
+};
+
+/**
  * enum mem_type - memory types. For a more detailed reference, please see
  *			http://en.wikipedia.org/wiki/DRAM
  *
@@ -313,38 +332,141 @@ enum scrub_type {
  */
 
 /**
+ * enum edac_mc_layer - memory controller hierarchy layer
+ *
+ * @EDAC_MC_LAYER_BRANCH:	memory layer is named "branch"
+ * @EDAC_MC_LAYER_CHANNEL:	memory layer is named "channel"
+ * @EDAC_MC_LAYER_SLOT:		memory layer is named "slot"
+ * @EDAC_MC_LAYER_CHIP_SELECT:	memory layer is named "chip select"
+ *
+ * This enum is used by the drivers to tell edac_mc_sysfs what name should
+ * be used when describing a memory stick location.
+ */
+enum edac_mc_layer_type {
+	EDAC_MC_LAYER_BRANCH,
+	EDAC_MC_LAYER_CHANNEL,
+	EDAC_MC_LAYER_SLOT,
+	EDAC_MC_LAYER_CHIP_SELECT,
+};
+
+/**
+ * struct edac_mc_layer - describes the memory controller hierarchy
+ * @layer:		layer type
+ * @size:		number of components per layer. For example,
+ *			if the channel layer has two channels, size = 2
+ * @is_virt_csrow:	This layer is part of the "csrow" when old API
+ *			compatibility mode is enabled. Otherwise, it is
+ *			a channel
+ */
+struct edac_mc_layer {
+	enum edac_mc_layer_type	type;
+	unsigned		size;
+	bool			is_virt_csrow;
+};
+
+/*
+ * Maximum number of layers used by the memory controller to uniquely
+ * identify a single memory stick.
+ * NOTE: Changing this constant requires not only to change the constant
+ * below, but also to change the existing code at the core, as there are
+ * some code there that are optimized for 3 layers.
+ */
+#define EDAC_MAX_LAYERS		3
+
+/**
+ * EDAC_DIMM_PTR - Macro responsible to find a pointer inside a pointer array
+ *		   for the element given by [layer0,layer1,layer2] position
+ *
+ * @layers:	a struct edac_mc_layer array, describing how many elements
+ *		were allocated for each layer
+ * @var:	name of the var where we want to get the pointer
+ *		(like mci->dimms)
+ * @n_layers:	Number of layers at the @layers array
+ * @layer0:	layer0 position
+ * @layer1:	layer1 position. Unused if n_layers < 2
+ * @layer2:	layer2 position. Unused if n_layers < 3
+ *
+ * For 1 layer, this macro returns &var[layer0]
+ * For 2 layers, this macro is similar to allocate a bi-dimensional array
+ *		and to return "&var[layer0][layer1]"
+ * For 3 layers, this macro is similar to allocate a tri-dimensional array
+ *		and to return "&var[layer0][layer1][layer2]"
+ *
+ * A loop could be used here to make it more generic, but, as we only have
+ * 3 layers, this is a little faster.
+ * By design, layers can never be 0 or more than 3. If that ever happens,
+ * a NULL is returned, causing an OOPS during the memory allocation routine,
+ * with would point to the developer that he's doing something wrong.
+ */
+#define EDAC_DIMM_PTR(layers, var, nlayers, layer0, layer1, layer2) ({	\
+	typeof(var) __p;						\
+	if ((nlayers) == 1)						\
+		__p = &var[layer0];					\
+	else if ((nlayers) == 2)					\
+		__p = &var[(layer1) + ((layers[1]).size * (layer0))];	\
+	else if ((nlayers) == 3)					\
+		__p = &var[(layer2) + ((layers[2]).size * ((layer1) +	\
+			    ((layers[1]).size * (layer0))))];		\
+	else								\
+		__p = NULL;						\
+	__p;								\
+})
+
+
+/* FIXME: add the proper per-location error counts */
+struct dimm_info {
+	char label[EDAC_MC_LABEL_LEN + 1];	/* DIMM label on motherboard */
+
+	/* Memory location data */
+	unsigned location[EDAC_MAX_LAYERS];
+
+	struct mem_ctl_info *mci;	/* the parent */
+
+	u32 grain;		/* granularity of reported error in bytes */
+	enum dev_type dtype;	/* memory device type */
+	enum mem_type mtype;	/* memory dimm type */
+	enum edac_type edac_mode;	/* EDAC mode for this dimm */
+
+	u32 nr_pages;			/* number of pages on this dimm */
+
+	unsigned csrow, cschannel;	/* Points to the old API data */
+};
+
+/**
  * struct rank_info - contains the information for one DIMM rank
  *
  * @chan_idx:	channel number where the rank is (typically, 0 or 1)
  * @ce_count:	number of correctable errors for this rank
- * @label:	DIMM label. Different ranks for the same DIMM should be
- *		filled, on userspace, with the same label.
- *		FIXME: The core currently won't enforce it.
  * @csrow:	A pointer to the chip select row structure (the parent
  *		structure). The location of the rank is given by
  *		the (csrow->csrow_idx, chan_idx) vector.
+ * @dimm:	A pointer to the DIMM structure, where the DIMM label
+ *		information is stored.
+ *
+ * FIXME: Currently, the EDAC core model will assume one DIMM per rank.
+ *	  This is a bad assumption, but it makes this patch easier. Later
+ *	  patches in this series will fix this issue.
  */
 struct rank_info {
 	int chan_idx;
-	u32 ce_count;
-	char label[EDAC_MC_LABEL_LEN + 1];
-	struct csrow_info *csrow;	/* the parent */
+	struct csrow_info *csrow;
+	struct dimm_info *dimm;
+
+	u32 ce_count;		/* Correctable Errors for this csrow */
 };
 
 struct csrow_info {
-	unsigned long first_page;	/* first page number in dimm */
-	unsigned long last_page;	/* last page number in dimm */
+	/* Used only by edac_mc_find_csrow_by_page() */
+	unsigned long first_page;	/* first page number in csrow */
+	unsigned long last_page;	/* last page number in csrow */
 	unsigned long page_mask;	/* used for interleaving -
-					 * 0UL for non intlv
-					 */
-	u32 nr_pages;		/* number of pages in csrow */
-	u32 grain;		/* granularity of reported error in bytes */
-	int csrow_idx;		/* the chip-select row */
-	enum dev_type dtype;	/* memory device type */
+					 * 0UL for non intlv */
+
+	int csrow_idx;			/* the chip-select row */
+
 	u32 ue_count;		/* Uncorrectable Errors for this csrow */
 	u32 ce_count;		/* Correctable Errors for this csrow */
-	enum mem_type mtype;	/* memory csrow type */
-	enum edac_type edac_mode;	/* EDAC mode for this csrow */
+
 	struct mem_ctl_info *mci;	/* the parent */
 
 	struct kobject kobj;	/* sysfs kobject for this csrow */
@@ -426,8 +548,20 @@ struct mem_ctl_info {
 	unsigned long (*ctl_page_to_phys) (struct mem_ctl_info * mci,
 					   unsigned long page);
 	int mc_idx;
-	int nr_csrows;
 	struct csrow_info *csrows;
+	unsigned nr_csrows, num_cschannel;
+
+	/* Memory Controller hierarchy */
+	unsigned n_layers;
+	struct edac_mc_layer *layers;
+	bool mem_is_per_rank;
+
+	/*
+	 * DIMM info. Will eventually remove the entire csrows_info some day
+	 */
+	unsigned tot_dimms;
+	struct dimm_info *dimms;
+
 	/*
 	 * FIXME - what about controllers on other busses? - IDs must be
 	 * unique.  dev pointer should be sufficiently unique, but
@@ -440,12 +574,16 @@ struct mem_ctl_info {
 	const char *dev_name;
 	char proc_name[MC_PROC_NAME_MAX_LEN + 1];
 	void *pvt_info;
-	u32 ue_noinfo_count;	/* Uncorrectable Errors w/o info */
-	u32 ce_noinfo_count;	/* Correctable Errors w/o info */
-	u32 ue_count;		/* Total Uncorrectable Errors for this MC */
-	u32 ce_count;		/* Total Correctable Errors for this MC */
 	unsigned long start_time;	/* mci load start time (in jiffies) */
 
+	/*
+	 * drivers shouldn't access those fields directly, as the core
+	 * already handles that.
+	 */
+	u32 ce_noinfo_count, ue_noinfo_count;
+	u32 ue_mc, ce_mc;
+	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
+
 	struct completion complete;
 
 	/* edac sysfs device control */
@@ -458,7 +596,7 @@ struct mem_ctl_info {
 	 * by the low level driver.
 	 *
 	 * Set by the low level driver to provide attributes at the
-	 * controller level, same level as 'ue_count' and 'ce_count' above.
+	 * controller level.
 	 * An array of structures, NULL terminated
 	 *
 	 * If attributes are desired, then set to array of attributes
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 7d4e0356f329..c03af7687bb4 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -28,12 +28,13 @@ typedef int (elevator_may_queue_fn) (struct request_queue *, int);
 
 typedef void (elevator_init_icq_fn) (struct io_cq *);
 typedef void (elevator_exit_icq_fn) (struct io_cq *);
-typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t);
+typedef int (elevator_set_req_fn) (struct request_queue *, struct request *,
+				   struct bio *, gfp_t);
 typedef void (elevator_put_req_fn) (struct request *);
 typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
 typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
 
-typedef void *(elevator_init_fn) (struct request_queue *);
+typedef int (elevator_init_fn) (struct request_queue *);
 typedef void (elevator_exit_fn) (struct elevator_queue *);
 
 struct elevator_ops
@@ -129,7 +130,8 @@ extern void elv_unregister_queue(struct request_queue *q);
 extern int elv_may_queue(struct request_queue *, int);
 extern void elv_abort_queue(struct request_queue *);
 extern void elv_completed_request(struct request_queue *, struct request *);
-extern int elv_set_request(struct request_queue *, struct request *, gfp_t);
+extern int elv_set_request(struct request_queue *q, struct request *rq,
+			   struct bio *bio, gfp_t gfp_mask);
 extern void elv_put_request(struct request_queue *, struct request *);
 extern void elv_drain_elevator(struct request_queue *);
 
diff --git a/include/linux/fb.h b/include/linux/fb.h
index d31cb682e173..a3229d7ab9f2 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -554,6 +554,10 @@ struct fb_cursor_user {
 #define FB_EVENT_FB_UNBIND              0x0E
 /*      CONSOLE-SPECIFIC: remap all consoles to new fb - for vga switcheroo */
 #define FB_EVENT_REMAP_ALL_CONSOLE      0x0F
+/*      A hardware display blank early change occured */
+#define FB_EARLY_EVENT_BLANK		0x10
+/*      A hardware display blank revert early change occured */
+#define FB_R_EARLY_EVENT_BLANK		0x11
 
 struct fb_event {
 	struct fb_info *info;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cdc1a9630948..038076b27ea4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1681,7 +1681,6 @@ struct inode_operations {
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
-	void (*truncate_range)(struct inode *, loff_t, loff_t);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
 		      u64 len);
 } ____cacheline_aligned;
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c8af7a2efb52..4c59b1131187 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -59,6 +59,8 @@ extern pmd_t *page_check_address_pmd(struct page *page,
 #define HPAGE_PMD_MASK HPAGE_MASK
 #define HPAGE_PMD_SIZE HPAGE_SIZE
 
+extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
+
 #define transparent_hugepage_enabled(__vma)				\
 	((transparent_hugepage_flags &					\
 	  (1<<TRANSPARENT_HUGEPAGE_FLAG) ||				\
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 1a3018063034..df38db2ef45b 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -6,11 +6,7 @@
 #include <linux/workqueue.h>
 
 enum {
-	ICQ_IOPRIO_CHANGED	= 1 << 0,
-	ICQ_CGROUP_CHANGED	= 1 << 1,
 	ICQ_EXITED		= 1 << 2,
-
-	ICQ_CHANGED_MASK	= ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED,
 };
 
 /*
@@ -100,6 +96,7 @@ struct io_cq {
  */
 struct io_context {
 	atomic_long_t refcount;
+	atomic_t active_ref;
 	atomic_t nr_tasks;
 
 	/* all the fields below are protected by this lock */
@@ -120,29 +117,37 @@ struct io_context {
 	struct work_struct release_work;
 };
 
-static inline struct io_context *ioc_task_link(struct io_context *ioc)
+/**
+ * get_io_context_active - get active reference on ioc
+ * @ioc: ioc of interest
+ *
+ * Only iocs with active reference can issue new IOs.  This function
+ * acquires an active reference on @ioc.  The caller must already have an
+ * active reference on @ioc.
+ */
+static inline void get_io_context_active(struct io_context *ioc)
 {
-	/*
-	 * if ref count is zero, don't allow sharing (ioc is going away, it's
-	 * a race).
-	 */
-	if (ioc && atomic_long_inc_not_zero(&ioc->refcount)) {
-		atomic_inc(&ioc->nr_tasks);
-		return ioc;
-	}
+	WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0);
+	WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0);
+	atomic_long_inc(&ioc->refcount);
+	atomic_inc(&ioc->active_ref);
+}
+
+static inline void ioc_task_link(struct io_context *ioc)
+{
+	get_io_context_active(ioc);
 
-	return NULL;
+	WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0);
+	atomic_inc(&ioc->nr_tasks);
 }
 
 struct task_struct;
 #ifdef CONFIG_BLOCK
 void put_io_context(struct io_context *ioc);
+void put_io_context_active(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
 struct io_context *get_task_io_context(struct task_struct *task,
 				       gfp_t gfp_flags, int node);
-void ioc_ioprio_changed(struct io_context *ioc, int ioprio);
-void ioc_cgroup_changed(struct io_context *ioc);
-unsigned int icq_get_changed(struct io_cq *icq);
 #else
 struct io_context;
 static inline void put_io_context(struct io_context *ioc) { }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d937580417ba..450293f6d68b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -35,12 +35,13 @@ struct iommu_domain;
 #define IOMMU_FAULT_WRITE	0x1
 
 typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
-				struct device *, unsigned long, int);
+			struct device *, unsigned long, int, void *);
 
 struct iommu_domain {
 	struct iommu_ops *ops;
 	void *priv;
 	iommu_fault_handler_t handler;
+	void *handler_token;
 };
 
 #define IOMMU_CAP_CACHE_COHERENCY	0x1
@@ -95,7 +96,7 @@ extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
 extern int iommu_domain_has_cap(struct iommu_domain *domain,
 				unsigned long cap);
 extern void iommu_set_fault_handler(struct iommu_domain *domain,
-					iommu_fault_handler_t handler);
+			iommu_fault_handler_t handler, void *token);
 extern int iommu_device_group(struct device *dev, unsigned int *groupid);
 
 /**
@@ -132,7 +133,8 @@ static inline int report_iommu_fault(struct iommu_domain *domain,
 	 * invoke it.
 	 */
 	if (domain->handler)
-		ret = domain->handler(domain, dev, iova, flags);
+		ret = domain->handler(domain, dev, iova, flags,
+						domain->handler_token);
 
 	return ret;
 }
@@ -191,7 +193,7 @@ static inline int domain_has_cap(struct iommu_domain *domain,
 }
 
 static inline void iommu_set_fault_handler(struct iommu_domain *domain,
-					iommu_fault_handler_t handler)
+				iommu_fault_handler_t handler, void *token)
 {
 }
 
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 76dad4808847..beb9ce1c2c23 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -42,26 +42,14 @@ enum {
 };
 
 /*
- * if process has set io priority explicitly, use that. if not, convert
- * the cpu scheduler nice value to an io priority
+ * Fallback BE priority
  */
 #define IOPRIO_NORM	(4)
-static inline int task_ioprio(struct io_context *ioc)
-{
-	if (ioprio_valid(ioc->ioprio))
-		return IOPRIO_PRIO_DATA(ioc->ioprio);
-
-	return IOPRIO_NORM;
-}
-
-static inline int task_ioprio_class(struct io_context *ioc)
-{
-	if (ioprio_valid(ioc->ioprio))
-		return IOPRIO_PRIO_CLASS(ioc->ioprio);
-
-	return IOPRIO_CLASS_BE;
-}
 
+/*
+ * if process has set io priority explicitly, use that. if not, convert
+ * the cpu scheduler nice value to an io priority
+ */
 static inline int task_nice_ioprio(struct task_struct *task)
 {
 	return (task_nice(task) + 20) / 5;
diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index 387571959dd9..6883e197acb9 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -36,6 +36,7 @@ const char *kallsyms_lookup(unsigned long addr,
 
 /* Look up a kernel symbol and return it in a text buffer. */
 extern int sprint_symbol(char *buffer, unsigned long address);
+extern int sprint_symbol_no_offset(char *buffer, unsigned long address);
 extern int sprint_backtrace(char *buffer, unsigned long address);
 
 /* Look up a kernel symbol and print it to the kernel messages. */
@@ -80,6 +81,12 @@ static inline int sprint_symbol(char *buffer, unsigned long addr)
 	return 0;
 }
 
+static inline int sprint_symbol_no_offset(char *buffer, unsigned long addr)
+{
+	*buffer = '\0';
+	return 0;
+}
+
 static inline int sprint_backtrace(char *buffer, unsigned long addr)
 {
 	*buffer = '\0';
diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h
index 26a65711676f..a1bdf6966357 100644
--- a/include/linux/kernel-page-flags.h
+++ b/include/linux/kernel-page-flags.h
@@ -32,6 +32,8 @@
 #define KPF_KSM			21
 #define KPF_THP			22
 
+#ifdef __KERNEL__
+
 /* kernel hacking assistances
  * WARNING: subject to change, never rely on them!
  */
@@ -44,4 +46,6 @@
 #define KPF_ARCH		38
 #define KPF_UNCACHED		39
 
+#endif /* __KERNEL__ */
+
 #endif /* LINUX_KERNEL_PAGE_FLAGS_H */
diff --git a/include/linux/lcd.h b/include/linux/lcd.h
index 8877123f2d6e..e00c3b0ebc6b 100644
--- a/include/linux/lcd.h
+++ b/include/linux/lcd.h
@@ -40,6 +40,16 @@ struct lcd_ops {
 	/* Get the LCD panel power status (0: full on, 1..3: controller
 	   power on, flat panel power off, 4: full off), see FB_BLANK_XXX */
 	int (*get_power)(struct lcd_device *);
+	/*
+	 * Enable or disable power to the LCD(0: on; 4: off, see FB_BLANK_XXX)
+	 * and this callback would be called proir to fb driver's callback.
+	 *
+	 * P.S. note that if early_set_power is not NULL then early fb notifier
+	 *	would be registered.
+	 */
+	int (*early_set_power)(struct lcd_device *, int power);
+	/* revert the effects of the early blank event. */
+	int (*r_early_set_power)(struct lcd_device *, int power);
 	/* Enable or disable power to the LCD (0: on; 4: off, see FB_BLANK_XXX) */
 	int (*set_power)(struct lcd_device *, int power);
 	/* Get the current contrast setting (0-max_contrast) */
diff --git a/include/linux/led-lm3530.h b/include/linux/led-lm3530.h
index eeae6e742471..4b133479d6ea 100644
--- a/include/linux/led-lm3530.h
+++ b/include/linux/led-lm3530.h
@@ -92,7 +92,7 @@ struct lm3530_pwm_data {
  * @als2_resistor_sel: internal resistance from ALS2 input to ground
  * @als_vmin: als input voltage calibrated for max brightness in mV
  * @als_vmax: als input voltage calibrated for min brightness in mV
- * @brt_val: brightness value (0-255)
+ * @brt_val: brightness value (0-127)
  * @pwm_data: PWM control functions (only valid when the mode is PWM)
  */
 struct lm3530_platform_data {
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 5884def15a24..39eee41d8c6f 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -73,6 +73,8 @@ struct led_classdev {
 	struct led_trigger	*trigger;
 	struct list_head	 trig_list;
 	void			*trigger_data;
+	/* true if activated - deactivate routine uses it to do cleanup */
+	bool			activated;
 #endif
 };
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f94efd2f6c27..83e7ba90d6e5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -63,12 +63,7 @@ extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 					gfp_t gfp_mask);
 
 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
-struct lruvec *mem_cgroup_lru_add_list(struct zone *, struct page *,
-				       enum lru_list);
-void mem_cgroup_lru_del_list(struct page *, enum lru_list);
-void mem_cgroup_lru_del(struct page *);
-struct lruvec *mem_cgroup_lru_move_lists(struct zone *, struct page *,
-					 enum lru_list, enum lru_list);
+struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
 /* For coalescing uncharge for reducing memcg' overhead*/
 extern void mem_cgroup_uncharge_start(void);
@@ -79,6 +74,8 @@ extern void mem_cgroup_uncharge_cache_page(struct page *page);
 
 extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				     int order);
+bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+				  struct mem_cgroup *memcg);
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
@@ -92,10 +89,13 @@ static inline
 int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
 {
 	struct mem_cgroup *memcg;
+	int match;
+
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(rcu_dereference((mm)->owner));
+	match = __mem_cgroup_same_or_subtree(cgroup, memcg);
 	rcu_read_unlock();
-	return cgroup == memcg;
+	return match;
 }
 
 extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
@@ -114,17 +114,11 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 /*
  * For memory reclaim.
  */
-int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg,
-				    struct zone *zone);
-int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg,
-				    struct zone *zone);
+int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
+int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
-unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
-					int nid, int zid, unsigned int lrumask);
-struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
-						      struct zone *zone);
-struct zone_reclaim_stat*
-mem_cgroup_get_reclaim_stat_from_page(struct page *page);
+unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
+void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 					struct task_struct *p);
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
@@ -251,25 +245,8 @@ static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 	return &zone->lruvec;
 }
 
-static inline struct lruvec *mem_cgroup_lru_add_list(struct zone *zone,
-						     struct page *page,
-						     enum lru_list lru)
-{
-	return &zone->lruvec;
-}
-
-static inline void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
-{
-}
-
-static inline void mem_cgroup_lru_del(struct page *page)
-{
-}
-
-static inline struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
-						       struct page *page,
-						       enum lru_list from,
-						       enum lru_list to)
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
+						    struct zone *zone)
 {
 	return &zone->lruvec;
 }
@@ -333,35 +310,27 @@ static inline bool mem_cgroup_disabled(void)
 }
 
 static inline int
-mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
+mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 {
 	return 1;
 }
 
 static inline int
-mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
+mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
 {
 	return 1;
 }
 
 static inline unsigned long
-mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
-				unsigned int lru_mask)
+mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
 	return 0;
 }
 
-
-static inline struct zone_reclaim_stat*
-mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone)
-{
-	return NULL;
-}
-
-static inline struct zone_reclaim_stat*
-mem_cgroup_get_reclaim_stat_from_page(struct page *page)
+static inline void
+mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+			      int increment)
 {
-	return NULL;
 }
 
 static inline void
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 7c727a90d70d..4aa42732e47f 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -225,8 +225,8 @@ static inline void check_highest_zone(enum zone_type k)
 		policy_zone = k;
 }
 
-int do_migrate_pages(struct mm_struct *mm,
-	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+		     const nodemask_t *to, int flags);
 
 
 #ifdef CONFIG_TMPFS
@@ -354,9 +354,8 @@ static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk,
 	return false;
 }
 
-static inline int do_migrate_pages(struct mm_struct *mm,
-			const nodemask_t *from_nodes,
-			const nodemask_t *to_nodes, int flags)
+static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+				   const nodemask_t *to, int flags)
 {
 	return 0;
 }
diff --git a/include/linux/mfd/abx500/ab8500.h b/include/linux/mfd/abx500/ab8500.h
index fccc3002f271..91dd3ef63e99 100644
--- a/include/linux/mfd/abx500/ab8500.h
+++ b/include/linux/mfd/abx500/ab8500.h
@@ -7,6 +7,7 @@
 #ifndef MFD_AB8500_H
 #define MFD_AB8500_H
 
+#include <linux/atomic.h>
 #include <linux/mutex.h>
 
 struct device;
@@ -194,6 +195,14 @@ enum ab8500_version {
 #define AB9540_INT_GPIO52F		123
 #define AB9540_INT_GPIO53F		124
 #define AB9540_INT_GPIO54F		125 /* not 8505 */
+/* ab8500_irq_regoffset[16] -> IT[Source|Latch|Mask]25 */
+#define AB8505_INT_KEYSTUCK		128
+#define AB8505_INT_IKR			129
+#define AB8505_INT_IKP			130
+#define AB8505_INT_KP			131
+#define AB8505_INT_KEYDEGLITCH		132
+#define AB8505_INT_MODPWRSTATUSF	134
+#define AB8505_INT_MODPWRSTATUSR	135
 
 /*
  * AB8500_AB9540_NR_IRQS is used when configuring the IRQ numbers for the
@@ -203,8 +212,8 @@ enum ab8500_version {
  * which is larger.
  */
 #define AB8500_NR_IRQS			112
-#define AB8505_NR_IRQS			128
-#define AB9540_NR_IRQS			128
+#define AB8505_NR_IRQS			136
+#define AB9540_NR_IRQS			136
 /* This is set to the roof of any AB8500 chip variant IRQ counts */
 #define AB8500_MAX_NR_IRQS		AB9540_NR_IRQS
 
@@ -216,6 +225,7 @@ enum ab8500_version {
  * @dev: parent device
  * @lock: read/write operations lock
  * @irq_lock: genirq bus lock
+ * @transfer_ongoing: 0 if no transfer ongoing
  * @irq: irq line
  * @version: chip version id (e.g. ab8500 or ab9540)
  * @chip_id: chip revision id
@@ -234,7 +244,7 @@ struct ab8500 {
 	struct device	*dev;
 	struct mutex	lock;
 	struct mutex	irq_lock;
-
+	atomic_t	transfer_ongoing;
 	int		irq_base;
 	int		irq;
 	enum ab8500_version version;
@@ -280,6 +290,8 @@ extern int __devinit ab8500_init(struct ab8500 *ab8500,
 				 enum ab8500_version version);
 extern int __devexit ab8500_exit(struct ab8500 *ab8500);
 
+extern int ab8500_suspend(struct ab8500 *ab8500);
+
 static inline int is_ab8500(struct ab8500 *ab)
 {
 	return ab->version == AB8500_VERSION_AB8500;
diff --git a/include/linux/mfd/anatop.h b/include/linux/mfd/anatop.h
index 22c1007d3ec5..7f92acf03d9e 100644
--- a/include/linux/mfd/anatop.h
+++ b/include/linux/mfd/anatop.h
@@ -34,7 +34,7 @@ struct anatop {
 	spinlock_t reglock;
 };
 
-extern u32 anatop_get_bits(struct anatop *, u32, int, int);
-extern void anatop_set_bits(struct anatop *, u32, int, int, u32);
+extern u32 anatop_read_reg(struct anatop *, u32);
+extern void anatop_write_reg(struct anatop *, u32, u32, u32);
 
 #endif /*  __LINUX_MFD_ANATOP_H */
diff --git a/include/linux/mfd/asic3.h b/include/linux/mfd/asic3.h
index ef6faa5cee46..e1148d037e7b 100644
--- a/include/linux/mfd/asic3.h
+++ b/include/linux/mfd/asic3.h
@@ -31,6 +31,8 @@ struct asic3_platform_data {
 
 	unsigned int gpio_base;
 
+	unsigned int clock_rate;
+
 	struct asic3_led *leds;
 };
 
diff --git a/include/linux/mfd/da9052/da9052.h b/include/linux/mfd/da9052/da9052.h
index 8313cd9658e3..0507c4c21a7d 100644
--- a/include/linux/mfd/da9052/da9052.h
+++ b/include/linux/mfd/da9052/da9052.h
@@ -33,6 +33,18 @@
 
 #include <linux/mfd/da9052/reg.h>
 
+/* Common - HWMON Channel Definations */
+#define DA9052_ADC_VDDOUT	0
+#define DA9052_ADC_ICH		1
+#define DA9052_ADC_TBAT	2
+#define DA9052_ADC_VBAT	3
+#define DA9052_ADC_IN4		4
+#define DA9052_ADC_IN5		5
+#define DA9052_ADC_IN6		6
+#define DA9052_ADC_TSI		7
+#define DA9052_ADC_TJUNC	8
+#define DA9052_ADC_VBBAT	9
+
 #define DA9052_IRQ_DCIN	0
 #define DA9052_IRQ_VBUS	1
 #define DA9052_IRQ_DCINREM	2
@@ -79,6 +91,9 @@ struct da9052 {
 	struct device *dev;
 	struct regmap *regmap;
 
+	struct mutex auxadc_lock;
+	struct completion done;
+
 	int irq_base;
 	struct regmap_irq_chip_data *irq_data;
 	u8 chip_id;
@@ -86,6 +101,10 @@ struct da9052 {
 	int chip_irq;
 };
 
+/* ADC API */
+int da9052_adc_manual_read(struct da9052 *da9052, unsigned char channel);
+int da9052_adc_read_temp(struct da9052 *da9052);
+
 /* Device I/O API */
 static inline int da9052_reg_read(struct da9052 *da9052, unsigned char reg)
 {
diff --git a/include/linux/mfd/lm3533.h b/include/linux/mfd/lm3533.h
new file mode 100644
index 000000000000..594bc591f256
--- /dev/null
+++ b/include/linux/mfd/lm3533.h
@@ -0,0 +1,104 @@
+/*
+ * lm3533.h -- LM3533 interface
+ *
+ * Copyright (C) 2011-2012 Texas Instruments
+ *
+ * Author: Johan Hovold <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under  the terms of the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#ifndef __LINUX_MFD_LM3533_H
+#define __LINUX_MFD_LM3533_H
+
+#define LM3533_ATTR_RO(_name) \
+	DEVICE_ATTR(_name, S_IRUGO, show_##_name, NULL)
+#define LM3533_ATTR_RW(_name) \
+	DEVICE_ATTR(_name, S_IRUGO | S_IWUSR , show_##_name, store_##_name)
+
+struct device;
+struct regmap;
+
+struct lm3533 {
+	struct device *dev;
+
+	struct regmap *regmap;
+
+	int gpio_hwen;
+	int irq;
+
+	unsigned have_als:1;
+	unsigned have_backlights:1;
+	unsigned have_leds:1;
+};
+
+struct lm3533_ctrlbank {
+	struct lm3533 *lm3533;
+	struct device *dev;
+	int id;
+};
+
+struct lm3533_als_platform_data {
+	unsigned pwm_mode:1;		/* PWM input mode (default analog) */
+	u8 r_select;			/* 1 - 127 (ignored in PWM-mode) */
+};
+
+struct lm3533_bl_platform_data {
+	char *name;
+	u16 max_current;		/* 5000 - 29800 uA (800 uA step) */
+	u8 default_brightness;		/* 0 - 255 */
+	u8 pwm;				/* 0 - 0x3f */
+};
+
+struct lm3533_led_platform_data {
+	char *name;
+	const char *default_trigger;
+	u16 max_current;		/* 5000 - 29800 uA (800 uA step) */
+	u8 pwm;				/* 0 - 0x3f */
+};
+
+enum lm3533_boost_freq {
+	LM3533_BOOST_FREQ_500KHZ,
+	LM3533_BOOST_FREQ_1000KHZ,
+};
+
+enum lm3533_boost_ovp {
+	LM3533_BOOST_OVP_16V,
+	LM3533_BOOST_OVP_24V,
+	LM3533_BOOST_OVP_32V,
+	LM3533_BOOST_OVP_40V,
+};
+
+struct lm3533_platform_data {
+	int gpio_hwen;
+
+	enum lm3533_boost_ovp boost_ovp;
+	enum lm3533_boost_freq boost_freq;
+
+	struct lm3533_als_platform_data *als;
+
+	struct lm3533_bl_platform_data *backlights;
+	int num_backlights;
+
+	struct lm3533_led_platform_data *leds;
+	int num_leds;
+};
+
+extern int lm3533_ctrlbank_enable(struct lm3533_ctrlbank *cb);
+extern int lm3533_ctrlbank_disable(struct lm3533_ctrlbank *cb);
+
+extern int lm3533_ctrlbank_set_brightness(struct lm3533_ctrlbank *cb, u8 val);
+extern int lm3533_ctrlbank_get_brightness(struct lm3533_ctrlbank *cb, u8 *val);
+extern int lm3533_ctrlbank_set_max_current(struct lm3533_ctrlbank *cb,
+								u16 imax);
+extern int lm3533_ctrlbank_set_pwm(struct lm3533_ctrlbank *cb, u8 val);
+extern int lm3533_ctrlbank_get_pwm(struct lm3533_ctrlbank *cb, u8 *val);
+
+extern int lm3533_read(struct lm3533 *lm3533, u8 reg, u8 *val);
+extern int lm3533_write(struct lm3533 *lm3533, u8 reg, u8 val);
+extern int lm3533_update(struct lm3533 *lm3533, u8 reg, u8 val, u8 mask);
+
+#endif	/* __LINUX_MFD_LM3533_H */
diff --git a/include/linux/mfd/lpc_ich.h b/include/linux/mfd/lpc_ich.h
new file mode 100644
index 000000000000..fec5256c3f5d
--- /dev/null
+++ b/include/linux/mfd/lpc_ich.h
@@ -0,0 +1,48 @@
+/*
+ *  linux/drivers/mfd/lpc_ich.h
+ *
+ *  Copyright (c) 2012 Extreme Engineering Solution, Inc.
+ *  Author: Aaron Sierra <[email protected]>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License 2 as published
+ *  by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef LPC_ICH_H
+#define LPC_ICH_H
+
+/* Watchdog resources */
+#define ICH_RES_IO_TCO	0
+#define ICH_RES_IO_SMI	1
+#define ICH_RES_MEM_OFF	2
+#define ICH_RES_MEM_GCS	0
+
+/* GPIO resources */
+#define ICH_RES_GPIO	0
+#define ICH_RES_GPE0	1
+
+/* GPIO compatibility */
+#define ICH_I3100_GPIO		0x401
+#define ICH_V5_GPIO		0x501
+#define ICH_V6_GPIO		0x601
+#define ICH_V7_GPIO		0x701
+#define ICH_V9_GPIO		0x801
+#define ICH_V10CORP_GPIO	0xa01
+#define ICH_V10CONS_GPIO	0xa11
+
+struct lpc_ich_info {
+	char name[32];
+	unsigned int iTCO_version;
+	unsigned int gpio_version;
+};
+
+#endif
diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h
new file mode 100644
index 000000000000..68263c5fa53c
--- /dev/null
+++ b/include/linux/mfd/max77693-private.h
@@ -0,0 +1,227 @@
+/*
+ * max77693-private.h - Voltage regulator driver for the Maxim 77693
+ *
+ *  Copyright (C) 2012 Samsung Electrnoics
+ *  SangYoung Son <[email protected]>
+ *
+ * This program is not provided / owned by Maxim Integrated Products.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __LINUX_MFD_MAX77693_PRIV_H
+#define __LINUX_MFD_MAX77693_PRIV_H
+
+#include <linux/i2c.h>
+
+#define MAX77693_NUM_IRQ_MUIC_REGS	3
+#define MAX77693_REG_INVALID		(0xff)
+
+/* Slave addr = 0xCC: PMIC, Charger, Flash LED */
+enum max77693_pmic_reg {
+	MAX77693_LED_REG_IFLASH1			= 0x00,
+	MAX77693_LED_REG_IFLASH2			= 0x01,
+	MAX77693_LED_REG_ITORCH				= 0x02,
+	MAX77693_LED_REG_ITORCHTIMER			= 0x03,
+	MAX77693_LED_REG_FLASH_TIMER			= 0x04,
+	MAX77693_LED_REG_FLASH_EN			= 0x05,
+	MAX77693_LED_REG_MAX_FLASH1			= 0x06,
+	MAX77693_LED_REG_MAX_FLASH2			= 0x07,
+	MAX77693_LED_REG_MAX_FLASH3			= 0x08,
+	MAX77693_LED_REG_MAX_FLASH4			= 0x09,
+	MAX77693_LED_REG_VOUT_CNTL			= 0x0A,
+	MAX77693_LED_REG_VOUT_FLASH1			= 0x0B,
+	MAX77693_LED_REG_VOUT_FLASH2			= 0x0C,
+	MAX77693_LED_REG_FLASH_INT			= 0x0E,
+	MAX77693_LED_REG_FLASH_INT_MASK			= 0x0F,
+	MAX77693_LED_REG_FLASH_INT_STATUS		= 0x10,
+
+	MAX77693_PMIC_REG_PMIC_ID1			= 0x20,
+	MAX77693_PMIC_REG_PMIC_ID2			= 0x21,
+	MAX77693_PMIC_REG_INTSRC			= 0x22,
+	MAX77693_PMIC_REG_INTSRC_MASK			= 0x23,
+	MAX77693_PMIC_REG_TOPSYS_INT			= 0x24,
+	MAX77693_PMIC_REG_TOPSYS_INT_MASK		= 0x26,
+	MAX77693_PMIC_REG_TOPSYS_STAT			= 0x28,
+	MAX77693_PMIC_REG_MAINCTRL1			= 0x2A,
+	MAX77693_PMIC_REG_LSCNFG			= 0x2B,
+
+	MAX77693_CHG_REG_CHG_INT			= 0xB0,
+	MAX77693_CHG_REG_CHG_INT_MASK			= 0xB1,
+	MAX77693_CHG_REG_CHG_INT_OK			= 0xB2,
+	MAX77693_CHG_REG_CHG_DETAILS_00			= 0xB3,
+	MAX77693_CHG_REG_CHG_DETAILS_01			= 0xB4,
+	MAX77693_CHG_REG_CHG_DETAILS_02			= 0xB5,
+	MAX77693_CHG_REG_CHG_DETAILS_03			= 0xB6,
+	MAX77693_CHG_REG_CHG_CNFG_00			= 0xB7,
+	MAX77693_CHG_REG_CHG_CNFG_01			= 0xB8,
+	MAX77693_CHG_REG_CHG_CNFG_02			= 0xB9,
+	MAX77693_CHG_REG_CHG_CNFG_03			= 0xBA,
+	MAX77693_CHG_REG_CHG_CNFG_04			= 0xBB,
+	MAX77693_CHG_REG_CHG_CNFG_05			= 0xBC,
+	MAX77693_CHG_REG_CHG_CNFG_06			= 0xBD,
+	MAX77693_CHG_REG_CHG_CNFG_07			= 0xBE,
+	MAX77693_CHG_REG_CHG_CNFG_08			= 0xBF,
+	MAX77693_CHG_REG_CHG_CNFG_09			= 0xC0,
+	MAX77693_CHG_REG_CHG_CNFG_10			= 0xC1,
+	MAX77693_CHG_REG_CHG_CNFG_11			= 0xC2,
+	MAX77693_CHG_REG_CHG_CNFG_12			= 0xC3,
+	MAX77693_CHG_REG_CHG_CNFG_13			= 0xC4,
+	MAX77693_CHG_REG_CHG_CNFG_14			= 0xC5,
+	MAX77693_CHG_REG_SAFEOUT_CTRL			= 0xC6,
+
+	MAX77693_PMIC_REG_END,
+};
+
+/* Slave addr = 0x4A: MUIC */
+enum max77693_muic_reg {
+	MAX77693_MUIC_REG_ID		= 0x00,
+	MAX77693_MUIC_REG_INT1		= 0x01,
+	MAX77693_MUIC_REG_INT2		= 0x02,
+	MAX77693_MUIC_REG_INT3		= 0x03,
+	MAX77693_MUIC_REG_STATUS1	= 0x04,
+	MAX77693_MUIC_REG_STATUS2	= 0x05,
+	MAX77693_MUIC_REG_STATUS3	= 0x06,
+	MAX77693_MUIC_REG_INTMASK1	= 0x07,
+	MAX77693_MUIC_REG_INTMASK2	= 0x08,
+	MAX77693_MUIC_REG_INTMASK3	= 0x09,
+	MAX77693_MUIC_REG_CDETCTRL1	= 0x0A,
+	MAX77693_MUIC_REG_CDETCTRL2	= 0x0B,
+	MAX77693_MUIC_REG_CTRL1		= 0x0C,
+	MAX77693_MUIC_REG_CTRL2		= 0x0D,
+	MAX77693_MUIC_REG_CTRL3		= 0x0E,
+
+	MAX77693_MUIC_REG_END,
+};
+
+/* Slave addr = 0x90: Haptic */
+enum max77693_haptic_reg {
+	MAX77693_HAPTIC_REG_STATUS		= 0x00,
+	MAX77693_HAPTIC_REG_CONFIG1		= 0x01,
+	MAX77693_HAPTIC_REG_CONFIG2		= 0x02,
+	MAX77693_HAPTIC_REG_CONFIG_CHNL		= 0x03,
+	MAX77693_HAPTIC_REG_CONFG_CYC1		= 0x04,
+	MAX77693_HAPTIC_REG_CONFG_CYC2		= 0x05,
+	MAX77693_HAPTIC_REG_CONFIG_PER1		= 0x06,
+	MAX77693_HAPTIC_REG_CONFIG_PER2		= 0x07,
+	MAX77693_HAPTIC_REG_CONFIG_PER3		= 0x08,
+	MAX77693_HAPTIC_REG_CONFIG_PER4		= 0x09,
+	MAX77693_HAPTIC_REG_CONFIG_DUTY1	= 0x0A,
+	MAX77693_HAPTIC_REG_CONFIG_DUTY2	= 0x0B,
+	MAX77693_HAPTIC_REG_CONFIG_PWM1		= 0x0C,
+	MAX77693_HAPTIC_REG_CONFIG_PWM2		= 0x0D,
+	MAX77693_HAPTIC_REG_CONFIG_PWM3		= 0x0E,
+	MAX77693_HAPTIC_REG_CONFIG_PWM4		= 0x0F,
+	MAX77693_HAPTIC_REG_REV			= 0x10,
+
+	MAX77693_HAPTIC_REG_END,
+};
+
+enum max77693_irq_source {
+	LED_INT = 0,
+	TOPSYS_INT,
+	CHG_INT,
+	MUIC_INT1,
+	MUIC_INT2,
+	MUIC_INT3,
+
+	MAX77693_IRQ_GROUP_NR,
+};
+
+enum max77693_irq {
+	/* PMIC - FLASH */
+	MAX77693_LED_IRQ_FLED2_OPEN,
+	MAX77693_LED_IRQ_FLED2_SHORT,
+	MAX77693_LED_IRQ_FLED1_OPEN,
+	MAX77693_LED_IRQ_FLED1_SHORT,
+	MAX77693_LED_IRQ_MAX_FLASH,
+
+	/* PMIC - TOPSYS */
+	MAX77693_TOPSYS_IRQ_T120C_INT,
+	MAX77693_TOPSYS_IRQ_T140C_INT,
+	MAX77693_TOPSYS_IRQ_LOWSYS_INT,
+
+	/* PMIC - Charger */
+	MAX77693_CHG_IRQ_BYP_I,
+	MAX77693_CHG_IRQ_THM_I,
+	MAX77693_CHG_IRQ_BAT_I,
+	MAX77693_CHG_IRQ_CHG_I,
+	MAX77693_CHG_IRQ_CHGIN_I,
+
+	/* MUIC INT1 */
+	MAX77693_MUIC_IRQ_INT1_ADC,
+	MAX77693_MUIC_IRQ_INT1_ADC_LOW,
+	MAX77693_MUIC_IRQ_INT1_ADC_ERR,
+	MAX77693_MUIC_IRQ_INT1_ADC1K,
+
+	/* MUIC INT2 */
+	MAX77693_MUIC_IRQ_INT2_CHGTYP,
+	MAX77693_MUIC_IRQ_INT2_CHGDETREUN,
+	MAX77693_MUIC_IRQ_INT2_DCDTMR,
+	MAX77693_MUIC_IRQ_INT2_DXOVP,
+	MAX77693_MUIC_IRQ_INT2_VBVOLT,
+	MAX77693_MUIC_IRQ_INT2_VIDRM,
+
+	/* MUIC INT3 */
+	MAX77693_MUIC_IRQ_INT3_EOC,
+	MAX77693_MUIC_IRQ_INT3_CGMBC,
+	MAX77693_MUIC_IRQ_INT3_OVP,
+	MAX77693_MUIC_IRQ_INT3_MBCCHG_ERR,
+	MAX77693_MUIC_IRQ_INT3_CHG_ENABLED,
+	MAX77693_MUIC_IRQ_INT3_BAT_DET,
+
+	MAX77693_IRQ_NR,
+};
+
+struct max77693_dev {
+	struct device *dev;
+	struct i2c_client *i2c;		/* 0xCC , PMIC, Charger, Flash LED */
+	struct i2c_client *muic;	/* 0x4A , MUIC */
+	struct i2c_client *haptic;	/* 0x90 , Haptic */
+	struct mutex iolock;
+
+	int type;
+
+	struct regmap *regmap;
+	struct regmap *regmap_muic;
+	struct regmap *regmap_haptic;
+
+	struct irq_domain *irq_domain;
+
+	int irq;
+	int irq_gpio;
+	bool wakeup;
+	struct mutex irqlock;
+	int irq_masks_cur[MAX77693_IRQ_GROUP_NR];
+	int irq_masks_cache[MAX77693_IRQ_GROUP_NR];
+};
+
+enum max77693_types {
+	TYPE_MAX77693,
+};
+
+extern int max77693_read_reg(struct regmap *map, u8 reg, u8 *dest);
+extern int max77693_bulk_read(struct regmap *map, u8 reg, int count,
+				u8 *buf);
+extern int max77693_write_reg(struct regmap *map, u8 reg, u8 value);
+extern int max77693_bulk_write(struct regmap *map, u8 reg, int count,
+				u8 *buf);
+extern int max77693_update_reg(struct regmap *map, u8 reg, u8 val, u8 mask);
+
+extern int max77693_irq_init(struct max77693_dev *max77686);
+extern void max77693_irq_exit(struct max77693_dev *max77686);
+extern int max77693_irq_resume(struct max77693_dev *max77686);
+
+#endif /*  __LINUX_MFD_MAX77693_PRIV_H */
diff --git a/include/linux/mfd/max77693.h b/include/linux/mfd/max77693.h
new file mode 100644
index 000000000000..1d28ae90384e
--- /dev/null
+++ b/include/linux/mfd/max77693.h
@@ -0,0 +1,36 @@
+/*
+ * max77693.h - Driver for the Maxim 77693
+ *
+ *  Copyright (C) 2012 Samsung Electrnoics
+ *  SangYoung Son <[email protected]>
+ *
+ * This program is not provided / owned by Maxim Integrated Products.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * This driver is based on max8997.h
+ *
+ * MAX77693 has PMIC, Charger, Flash LED, Haptic, MUIC devices.
+ * The devices share the same I2C bus and included in
+ * this mfd driver.
+ */
+
+#ifndef __LINUX_MFD_MAX77693_H
+#define __LINUX_MFD_MAX77693_H
+
+struct max77693_platform_data {
+	int wakeup;
+};
+#endif	/* __LINUX_MFD_MAX77693_H */
diff --git a/include/linux/mfd/sta2x11-mfd.h b/include/linux/mfd/sta2x11-mfd.h
new file mode 100644
index 000000000000..d179227e866f
--- /dev/null
+++ b/include/linux/mfd/sta2x11-mfd.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2009-2011 Wind River Systems, Inc.
+ * Copyright (c) 2011 ST Microelectronics (Alessandro Rubini)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * The STMicroelectronics ConneXt (STA2X11) chip has several unrelated
+ * functions in one PCI endpoint functions. This driver simply
+ * registers the platform devices in this iomemregion and exports a few
+ * functions to access common registers
+ */
+
+#ifndef __STA2X11_MFD_H
+#define __STA2X11_MFD_H
+#include <linux/types.h>
+#include <linux/pci.h>
+
+/*
+ * The MFD PCI block includes the GPIO peripherals and other register blocks.
+ * For GPIO, we have 32*4 bits (I use "gsta" for "gpio sta2x11".)
+ */
+#define GSTA_GPIO_PER_BLOCK	32
+#define GSTA_NR_BLOCKS		4
+#define GSTA_NR_GPIO		(GSTA_GPIO_PER_BLOCK * GSTA_NR_BLOCKS)
+
+/* Pinconfig is set by the board definition: altfunc, pull-up, pull-down */
+struct sta2x11_gpio_pdata {
+	unsigned pinconfig[GSTA_NR_GPIO];
+};
+
+/* Macros below lifted from sh_pfc.h, with minor differences */
+#define PINMUX_TYPE_NONE		0
+#define PINMUX_TYPE_FUNCTION		1
+#define PINMUX_TYPE_OUTPUT_LOW		2
+#define PINMUX_TYPE_OUTPUT_HIGH		3
+#define PINMUX_TYPE_INPUT		4
+#define PINMUX_TYPE_INPUT_PULLUP	5
+#define PINMUX_TYPE_INPUT_PULLDOWN	6
+
+/* Give names to GPIO pins, like PXA does, taken from the manual */
+#define STA2X11_GPIO0			0
+#define STA2X11_GPIO1			1
+#define STA2X11_GPIO2			2
+#define STA2X11_GPIO3			3
+#define STA2X11_GPIO4			4
+#define STA2X11_GPIO5			5
+#define STA2X11_GPIO6			6
+#define STA2X11_GPIO7			7
+#define STA2X11_GPIO8_RGBOUT_RED7	8
+#define STA2X11_GPIO9_RGBOUT_RED6	9
+#define STA2X11_GPIO10_RGBOUT_RED5	10
+#define STA2X11_GPIO11_RGBOUT_RED4	11
+#define STA2X11_GPIO12_RGBOUT_RED3	12
+#define STA2X11_GPIO13_RGBOUT_RED2	13
+#define STA2X11_GPIO14_RGBOUT_RED1	14
+#define STA2X11_GPIO15_RGBOUT_RED0	15
+#define STA2X11_GPIO16_RGBOUT_GREEN7	16
+#define STA2X11_GPIO17_RGBOUT_GREEN6	17
+#define STA2X11_GPIO18_RGBOUT_GREEN5	18
+#define STA2X11_GPIO19_RGBOUT_GREEN4	19
+#define STA2X11_GPIO20_RGBOUT_GREEN3	20
+#define STA2X11_GPIO21_RGBOUT_GREEN2	21
+#define STA2X11_GPIO22_RGBOUT_GREEN1	22
+#define STA2X11_GPIO23_RGBOUT_GREEN0	23
+#define STA2X11_GPIO24_RGBOUT_BLUE7	24
+#define STA2X11_GPIO25_RGBOUT_BLUE6	25
+#define STA2X11_GPIO26_RGBOUT_BLUE5	26
+#define STA2X11_GPIO27_RGBOUT_BLUE4	27
+#define STA2X11_GPIO28_RGBOUT_BLUE3	28
+#define STA2X11_GPIO29_RGBOUT_BLUE2	29
+#define STA2X11_GPIO30_RGBOUT_BLUE1	30
+#define STA2X11_GPIO31_RGBOUT_BLUE0	31
+#define STA2X11_GPIO32_RGBOUT_VSYNCH	32
+#define STA2X11_GPIO33_RGBOUT_HSYNCH	33
+#define STA2X11_GPIO34_RGBOUT_DEN	34
+#define STA2X11_GPIO35_ETH_CRS_DV	35
+#define STA2X11_GPIO36_ETH_TXD1		36
+#define STA2X11_GPIO37_ETH_TXD0		37
+#define STA2X11_GPIO38_ETH_TX_EN	38
+#define STA2X11_GPIO39_MDIO		39
+#define STA2X11_GPIO40_ETH_REF_CLK	40
+#define STA2X11_GPIO41_ETH_RXD1		41
+#define STA2X11_GPIO42_ETH_RXD0		42
+#define STA2X11_GPIO43_MDC		43
+#define STA2X11_GPIO44_CAN_TX		44
+#define STA2X11_GPIO45_CAN_RX		45
+#define STA2X11_GPIO46_MLB_DAT		46
+#define STA2X11_GPIO47_MLB_SIG		47
+#define STA2X11_GPIO48_SPI0_CLK		48
+#define STA2X11_GPIO49_SPI0_TXD		49
+#define STA2X11_GPIO50_SPI0_RXD		50
+#define STA2X11_GPIO51_SPI0_FRM		51
+#define STA2X11_GPIO52_SPI1_CLK		52
+#define STA2X11_GPIO53_SPI1_TXD		53
+#define STA2X11_GPIO54_SPI1_RXD		54
+#define STA2X11_GPIO55_SPI1_FRM		55
+#define STA2X11_GPIO56_SPI2_CLK		56
+#define STA2X11_GPIO57_SPI2_TXD		57
+#define STA2X11_GPIO58_SPI2_RXD		58
+#define STA2X11_GPIO59_SPI2_FRM		59
+#define STA2X11_GPIO60_I2C0_SCL		60
+#define STA2X11_GPIO61_I2C0_SDA		61
+#define STA2X11_GPIO62_I2C1_SCL		62
+#define STA2X11_GPIO63_I2C1_SDA		63
+#define STA2X11_GPIO64_I2C2_SCL		64
+#define STA2X11_GPIO65_I2C2_SDA		65
+#define STA2X11_GPIO66_I2C3_SCL		66
+#define STA2X11_GPIO67_I2C3_SDA		67
+#define STA2X11_GPIO68_MSP0_RCK		68
+#define STA2X11_GPIO69_MSP0_RXD		69
+#define STA2X11_GPIO70_MSP0_RFS		70
+#define STA2X11_GPIO71_MSP0_TCK		71
+#define STA2X11_GPIO72_MSP0_TXD		72
+#define STA2X11_GPIO73_MSP0_TFS		73
+#define STA2X11_GPIO74_MSP0_SCK		74
+#define STA2X11_GPIO75_MSP1_CK		75
+#define STA2X11_GPIO76_MSP1_RXD		76
+#define STA2X11_GPIO77_MSP1_FS		77
+#define STA2X11_GPIO78_MSP1_TXD		78
+#define STA2X11_GPIO79_MSP2_CK		79
+#define STA2X11_GPIO80_MSP2_RXD		80
+#define STA2X11_GPIO81_MSP2_FS		81
+#define STA2X11_GPIO82_MSP2_TXD		82
+#define STA2X11_GPIO83_MSP3_CK		83
+#define STA2X11_GPIO84_MSP3_RXD		84
+#define STA2X11_GPIO85_MSP3_FS		85
+#define STA2X11_GPIO86_MSP3_TXD		86
+#define STA2X11_GPIO87_MSP4_CK		87
+#define STA2X11_GPIO88_MSP4_RXD		88
+#define STA2X11_GPIO89_MSP4_FS		89
+#define STA2X11_GPIO90_MSP4_TXD		90
+#define STA2X11_GPIO91_MSP5_CK		91
+#define STA2X11_GPIO92_MSP5_RXD		92
+#define STA2X11_GPIO93_MSP5_FS		93
+#define STA2X11_GPIO94_MSP5_TXD		94
+#define STA2X11_GPIO95_SDIO3_DAT3	95
+#define STA2X11_GPIO96_SDIO3_DAT2	96
+#define STA2X11_GPIO97_SDIO3_DAT1	97
+#define STA2X11_GPIO98_SDIO3_DAT0	98
+#define STA2X11_GPIO99_SDIO3_CLK	99
+#define STA2X11_GPIO100_SDIO3_CMD	100
+#define STA2X11_GPIO101			101
+#define STA2X11_GPIO102			102
+#define STA2X11_GPIO103			103
+#define STA2X11_GPIO104			104
+#define STA2X11_GPIO105_SDIO2_DAT3	105
+#define STA2X11_GPIO106_SDIO2_DAT2	106
+#define STA2X11_GPIO107_SDIO2_DAT1	107
+#define STA2X11_GPIO108_SDIO2_DAT0	108
+#define STA2X11_GPIO109_SDIO2_CLK	109
+#define STA2X11_GPIO110_SDIO2_CMD	110
+#define STA2X11_GPIO111			111
+#define STA2X11_GPIO112			112
+#define STA2X11_GPIO113			113
+#define STA2X11_GPIO114			114
+#define STA2X11_GPIO115_SDIO1_DAT3	115
+#define STA2X11_GPIO116_SDIO1_DAT2	116
+#define STA2X11_GPIO117_SDIO1_DAT1	117
+#define STA2X11_GPIO118_SDIO1_DAT0	118
+#define STA2X11_GPIO119_SDIO1_CLK	119
+#define STA2X11_GPIO120_SDIO1_CMD	120
+#define STA2X11_GPIO121			121
+#define STA2X11_GPIO122			122
+#define STA2X11_GPIO123			123
+#define STA2X11_GPIO124			124
+#define STA2X11_GPIO125_UART2_TXD	125
+#define STA2X11_GPIO126_UART2_RXD	126
+#define STA2X11_GPIO127_UART3_TXD	127
+
+/*
+ * The APB bridge has its own registers, needed by our users as well.
+ * They are accessed with the following read/mask/write function.
+ */
+u32 sta2x11_apbreg_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val);
+
+/* CAN and MLB */
+#define APBREG_BSR	0x00	/* Bridge Status Reg */
+#define APBREG_PAER	0x08	/* Peripherals Address Error Reg */
+#define APBREG_PWAC	0x20	/* Peripheral Write Access Control reg */
+#define APBREG_PRAC	0x40	/* Peripheral Read Access Control reg */
+#define APBREG_PCG	0x60	/* Peripheral Clock Gating Reg */
+#define APBREG_PUR	0x80	/* Peripheral Under Reset Reg */
+#define APBREG_EMU_PCG	0xA0	/* Emulator Peripheral Clock Gating Reg */
+
+#define APBREG_CAN	(1 << 1)
+#define APBREG_MLB	(1 << 3)
+
+/* SARAC */
+#define APBREG_BSR_SARAC     0x100 /* Bridge Status Reg */
+#define APBREG_PAER_SARAC    0x108 /* Peripherals Address Error Reg */
+#define APBREG_PWAC_SARAC    0x120 /* Peripheral Write Access Control reg */
+#define APBREG_PRAC_SARAC    0x140 /* Peripheral Read Access Control reg */
+#define APBREG_PCG_SARAC     0x160 /* Peripheral Clock Gating Reg */
+#define APBREG_PUR_SARAC     0x180 /* Peripheral Under Reset Reg */
+#define APBREG_EMU_PCG_SARAC 0x1A0 /* Emulator Peripheral Clock Gating Reg */
+
+#define APBREG_SARAC	(1 << 2)
+
+/*
+ * The system controller has its own registers. Some of these are accessed
+ * by out users as well, using the following read/mask/write/function
+ */
+u32 sta2x11_sctl_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val);
+
+#define SCTL_SCCTL		0x00	/* System controller control register */
+#define SCTL_ARMCFG		0x04	/* ARM configuration register */
+#define SCTL_SCPLLCTL		0x08	/* PLL control status register */
+#define SCTL_SCPLLFCTRL		0x0c	/* PLL frequency control register */
+#define SCTL_SCRESFRACT		0x10	/* PLL fractional input register */
+#define SCTL_SCRESCTRL1		0x14	/* Peripheral reset control 1 */
+#define SCTL_SCRESXTRL2		0x18	/* Peripheral reset control 2 */
+#define SCTL_SCPEREN0		0x1c	/* Peripheral clock enable register 0 */
+#define SCTL_SCPEREN1		0x20	/* Peripheral clock enable register 1 */
+#define SCTL_SCPEREN2		0x24	/* Peripheral clock enable register 2 */
+#define SCTL_SCGRST		0x28	/* Peripheral global reset */
+#define SCTL_SCPCIPMCR1		0x30	/* PCI power management control 1 */
+#define SCTL_SCPCIPMCR2		0x34	/* PCI power management control 2 */
+#define SCTL_SCPCIPMSR1		0x38	/* PCI power management status 1 */
+#define SCTL_SCPCIPMSR2		0x3c	/* PCI power management status 2 */
+#define SCTL_SCPCIPMSR3		0x40	/* PCI power management status 3 */
+#define SCTL_SCINTREN		0x44	/* Interrupt enable */
+#define SCTL_SCRISR		0x48	/* RAW interrupt status */
+#define SCTL_SCCLKSTAT0		0x4c	/* Peripheral clocks status 0 */
+#define SCTL_SCCLKSTAT1		0x50	/* Peripheral clocks status 1 */
+#define SCTL_SCCLKSTAT2		0x54	/* Peripheral clocks status 2 */
+#define SCTL_SCRSTSTA		0x58	/* Reset status register */
+
+#define SCTL_SCRESCTRL1_USB_PHY_POR	(1 << 0)
+#define SCTL_SCRESCTRL1_USB_OTG	(1 << 1)
+#define SCTL_SCRESCTRL1_USB_HRST	(1 << 2)
+#define SCTL_SCRESCTRL1_USB_PHY_HOST	(1 << 3)
+#define SCTL_SCRESCTRL1_SATAII	(1 << 4)
+#define SCTL_SCRESCTRL1_VIP		(1 << 5)
+#define SCTL_SCRESCTRL1_PER_MMC0	(1 << 6)
+#define SCTL_SCRESCTRL1_PER_MMC1	(1 << 7)
+#define SCTL_SCRESCTRL1_PER_GPIO0	(1 << 8)
+#define SCTL_SCRESCTRL1_PER_GPIO1	(1 << 9)
+#define SCTL_SCRESCTRL1_PER_GPIO2	(1 << 10)
+#define SCTL_SCRESCTRL1_PER_GPIO3	(1 << 11)
+#define SCTL_SCRESCTRL1_PER_MTU0	(1 << 12)
+#define SCTL_SCRESCTRL1_KER_SPI0	(1 << 13)
+#define SCTL_SCRESCTRL1_KER_SPI1	(1 << 14)
+#define SCTL_SCRESCTRL1_KER_SPI2	(1 << 15)
+#define SCTL_SCRESCTRL1_KER_MCI0	(1 << 16)
+#define SCTL_SCRESCTRL1_KER_MCI1	(1 << 17)
+#define SCTL_SCRESCTRL1_PRE_HSI2C0	(1 << 18)
+#define SCTL_SCRESCTRL1_PER_HSI2C1	(1 << 19)
+#define SCTL_SCRESCTRL1_PER_HSI2C2	(1 << 20)
+#define SCTL_SCRESCTRL1_PER_HSI2C3	(1 << 21)
+#define SCTL_SCRESCTRL1_PER_MSP0	(1 << 22)
+#define SCTL_SCRESCTRL1_PER_MSP1	(1 << 23)
+#define SCTL_SCRESCTRL1_PER_MSP2	(1 << 24)
+#define SCTL_SCRESCTRL1_PER_MSP3	(1 << 25)
+#define SCTL_SCRESCTRL1_PER_MSP4	(1 << 26)
+#define SCTL_SCRESCTRL1_PER_MSP5	(1 << 27)
+#define SCTL_SCRESCTRL1_PER_MMC	(1 << 28)
+#define SCTL_SCRESCTRL1_KER_MSP0	(1 << 29)
+#define SCTL_SCRESCTRL1_KER_MSP1	(1 << 30)
+#define SCTL_SCRESCTRL1_KER_MSP2	(1 << 31)
+
+#define SCTL_SCPEREN0_UART0		(1 << 0)
+#define SCTL_SCPEREN0_UART1		(1 << 1)
+#define SCTL_SCPEREN0_UART2		(1 << 2)
+#define SCTL_SCPEREN0_UART3		(1 << 3)
+#define SCTL_SCPEREN0_MSP0		(1 << 4)
+#define SCTL_SCPEREN0_MSP1		(1 << 5)
+#define SCTL_SCPEREN0_MSP2		(1 << 6)
+#define SCTL_SCPEREN0_MSP3		(1 << 7)
+#define SCTL_SCPEREN0_MSP4		(1 << 8)
+#define SCTL_SCPEREN0_MSP5		(1 << 9)
+#define SCTL_SCPEREN0_SPI0		(1 << 10)
+#define SCTL_SCPEREN0_SPI1		(1 << 11)
+#define SCTL_SCPEREN0_SPI2		(1 << 12)
+#define SCTL_SCPEREN0_I2C0		(1 << 13)
+#define SCTL_SCPEREN0_I2C1		(1 << 14)
+#define SCTL_SCPEREN0_I2C2		(1 << 15)
+#define SCTL_SCPEREN0_I2C3		(1 << 16)
+#define SCTL_SCPEREN0_SVDO_LVDS		(1 << 17)
+#define SCTL_SCPEREN0_USB_HOST		(1 << 18)
+#define SCTL_SCPEREN0_USB_OTG		(1 << 19)
+#define SCTL_SCPEREN0_MCI0		(1 << 20)
+#define SCTL_SCPEREN0_MCI1		(1 << 21)
+#define SCTL_SCPEREN0_MCI2		(1 << 22)
+#define SCTL_SCPEREN0_MCI3		(1 << 23)
+#define SCTL_SCPEREN0_SATA		(1 << 24)
+#define SCTL_SCPEREN0_ETHERNET		(1 << 25)
+#define SCTL_SCPEREN0_VIC		(1 << 26)
+#define SCTL_SCPEREN0_DMA_AUDIO		(1 << 27)
+#define SCTL_SCPEREN0_DMA_SOC		(1 << 28)
+#define SCTL_SCPEREN0_RAM		(1 << 29)
+#define SCTL_SCPEREN0_VIP		(1 << 30)
+#define SCTL_SCPEREN0_ARM		(1 << 31)
+
+#define SCTL_SCPEREN1_UART0		(1 << 0)
+#define SCTL_SCPEREN1_UART1		(1 << 1)
+#define SCTL_SCPEREN1_UART2		(1 << 2)
+#define SCTL_SCPEREN1_UART3		(1 << 3)
+#define SCTL_SCPEREN1_MSP0		(1 << 4)
+#define SCTL_SCPEREN1_MSP1		(1 << 5)
+#define SCTL_SCPEREN1_MSP2		(1 << 6)
+#define SCTL_SCPEREN1_MSP3		(1 << 7)
+#define SCTL_SCPEREN1_MSP4		(1 << 8)
+#define SCTL_SCPEREN1_MSP5		(1 << 9)
+#define SCTL_SCPEREN1_SPI0		(1 << 10)
+#define SCTL_SCPEREN1_SPI1		(1 << 11)
+#define SCTL_SCPEREN1_SPI2		(1 << 12)
+#define SCTL_SCPEREN1_I2C0		(1 << 13)
+#define SCTL_SCPEREN1_I2C1		(1 << 14)
+#define SCTL_SCPEREN1_I2C2		(1 << 15)
+#define SCTL_SCPEREN1_I2C3		(1 << 16)
+#define SCTL_SCPEREN1_USB_PHY		(1 << 17)
+
+#endif /* __STA2X11_MFD_H */
diff --git a/include/linux/mfd/stmpe.h b/include/linux/mfd/stmpe.h
index 8516fd1eaabc..f8d5b4d5843f 100644
--- a/include/linux/mfd/stmpe.h
+++ b/include/linux/mfd/stmpe.h
@@ -117,7 +117,7 @@ struct matrix_keymap_data;
  * @no_autorepeat: disable key autorepeat
  */
 struct stmpe_keypad_platform_data {
-	struct matrix_keymap_data *keymap_data;
+	const struct matrix_keymap_data *keymap_data;
 	unsigned int debounce_ms;
 	unsigned int scan_count;
 	bool no_autorepeat;
diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h
index 1c6c2860d1a6..dd8dc0a6c462 100644
--- a/include/linux/mfd/tps65910.h
+++ b/include/linux/mfd/tps65910.h
@@ -18,6 +18,7 @@
 #define __LINUX_MFD_TPS65910_H
 
 #include <linux/gpio.h>
+#include <linux/regmap.h>
 
 /* TPS chip id list */
 #define TPS65910			0
@@ -783,6 +784,18 @@
 #define TPS65910_SLEEP_CONTROL_EXT_INPUT_EN3		0x4
 #define TPS65911_SLEEP_CONTROL_EXT_INPUT_SLEEP		0x8
 
+/*
+ * Sleep keepon data: Maintains the state in sleep mode
+ * @therm_keepon: Keep on the thermal monitoring in sleep state.
+ * @clkout32k_keepon: Keep on the 32KHz clock output in sleep state.
+ * @i2chs_keepon: Keep on high speed internal clock in sleep state.
+ */
+struct tps65910_sleep_keepon_data {
+	unsigned therm_keepon:1;
+	unsigned clkout32k_keepon:1;
+	unsigned i2chs_keepon:1;
+};
+
 /**
  * struct tps65910_board
  * Board platform data may be used to initialize regulators.
@@ -794,6 +807,8 @@ struct tps65910_board {
 	int irq_base;
 	int vmbch_threshold;
 	int vmbch2_threshold;
+	bool en_dev_slp;
+	struct tps65910_sleep_keepon_data *slp_keepon;
 	bool en_gpio_sleep[TPS6591X_MAX_NUM_GPIO];
 	unsigned long regulator_ext_sleep_control[TPS65910_NUM_REGS];
 	struct regulator_init_data *tps65910_pmic_init_data[TPS65910_NUM_REGS];
@@ -809,16 +824,14 @@ struct tps65910 {
 	struct regmap *regmap;
 	struct mutex io_mutex;
 	unsigned int id;
-	int (*read)(struct tps65910 *tps65910, u8 reg, int size, void *dest);
-	int (*write)(struct tps65910 *tps65910, u8 reg, int size, void *src);
 
 	/* Client devices */
 	struct tps65910_pmic *pmic;
 	struct tps65910_rtc *rtc;
 	struct tps65910_power *power;
 
-	/* GPIO Handling */
-	struct gpio_chip gpio;
+	/* Device node parsed board data */
+	struct tps65910_board *of_plat_data;
 
 	/* IRQ Handling */
 	struct mutex irq_lock;
@@ -826,6 +839,7 @@ struct tps65910 {
 	int irq_base;
 	int irq_num;
 	u32 irq_mask;
+	struct irq_domain *domain;
 };
 
 struct tps65910_platform_data {
@@ -833,9 +847,6 @@ struct tps65910_platform_data {
 	int irq_base;
 };
 
-int tps65910_set_bits(struct tps65910 *tps65910, u8 reg, u8 mask);
-int tps65910_clear_bits(struct tps65910 *tps65910, u8 reg, u8 mask);
-void tps65910_gpio_init(struct tps65910 *tps65910, int gpio_base);
 int tps65910_irq_init(struct tps65910 *tps65910, int irq,
 		struct tps65910_platform_data *pdata);
 int tps65910_irq_exit(struct tps65910 *tps65910);
@@ -845,4 +856,28 @@ static inline int tps65910_chip_id(struct tps65910 *tps65910)
 	return tps65910->id;
 }
 
+static inline int tps65910_reg_read(struct tps65910 *tps65910, u8 reg,
+		unsigned int *val)
+{
+	return regmap_read(tps65910->regmap, reg, val);
+}
+
+static inline int tps65910_reg_write(struct tps65910 *tps65910, u8 reg,
+		unsigned int val)
+{
+	return regmap_write(tps65910->regmap, reg, val);
+}
+
+static inline int tps65910_reg_set_bits(struct tps65910 *tps65910, u8 reg,
+		u8 mask)
+{
+	return regmap_update_bits(tps65910->regmap, reg, mask, mask);
+}
+
+static inline int tps65910_reg_clear_bits(struct tps65910 *tps65910, u8 reg,
+		u8 mask)
+{
+	return regmap_update_bits(tps65910->regmap, reg, mask, 0);
+}
+
 #endif /*  __LINUX_MFD_TPS65910_H */
diff --git a/include/linux/mfd/twl6040.h b/include/linux/mfd/twl6040.h
index b15b5f03f5c4..6659487c31e7 100644
--- a/include/linux/mfd/twl6040.h
+++ b/include/linux/mfd/twl6040.h
@@ -27,6 +27,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/mfd/core.h>
+#include <linux/regulator/consumer.h>
 
 #define TWL6040_REG_ASICID		0x01
 #define TWL6040_REG_ASICREV		0x02
@@ -203,6 +204,7 @@ struct regmap;
 struct twl6040 {
 	struct device *dev;
 	struct regmap *regmap;
+	struct regulator_bulk_data supplies[2]; /* supplies for vio, v2v1 */
 	struct mutex mutex;
 	struct mutex io_mutex;
 	struct mutex irq_mutex;
diff --git a/include/linux/mfd/wm831x/core.h b/include/linux/mfd/wm831x/core.h
index 4b1211859f74..4a3b83a77614 100644
--- a/include/linux/mfd/wm831x/core.h
+++ b/include/linux/mfd/wm831x/core.h
@@ -17,6 +17,7 @@
 
 #include <linux/completion.h>
 #include <linux/interrupt.h>
+#include <linux/irqdomain.h>
 #include <linux/list.h>
 #include <linux/regmap.h>
 
@@ -338,6 +339,7 @@
 #define WM831X_FLL_CLK_SRC_WIDTH                     2  /* FLL_CLK_SRC - [1:0] */
 
 struct regulator_dev;
+struct irq_domain;
 
 #define WM831X_NUM_IRQ_REGS 5
 #define WM831X_NUM_GPIO_REGS 16
@@ -367,7 +369,7 @@ struct wm831x {
 
 	int irq;  /* Our chip IRQ */
 	struct mutex irq_lock;
-	int irq_base;
+	struct irq_domain *irq_domain;
 	int irq_masks_cur[WM831X_NUM_IRQ_REGS];   /* Currently active value */
 	int irq_masks_cache[WM831X_NUM_IRQ_REGS]; /* Cached hardware value */
 
@@ -382,7 +384,8 @@ struct wm831x {
 
 	/* Used by the interrupt controller code to post writes */
 	int gpio_update[WM831X_NUM_GPIO_REGS];
-	bool gpio_level[WM831X_NUM_GPIO_REGS];
+	bool gpio_level_high[WM831X_NUM_GPIO_REGS];
+	bool gpio_level_low[WM831X_NUM_GPIO_REGS];
 
 	struct mutex auxadc_lock;
 	struct list_head auxadc_pending;
@@ -417,6 +420,11 @@ int wm831x_irq_init(struct wm831x *wm831x, int irq);
 void wm831x_irq_exit(struct wm831x *wm831x);
 void wm831x_auxadc_init(struct wm831x *wm831x);
 
+static inline int wm831x_irq(struct wm831x *wm831x, int irq)
+{
+	return irq_create_mapping(wm831x->irq_domain, irq);
+}
+
 extern struct regmap_config wm831x_regmap_config;
 
 #endif
diff --git a/include/linux/mfd/wm8350/core.h b/include/linux/mfd/wm8350/core.h
index 98fcc977e82b..9192b6404a73 100644
--- a/include/linux/mfd/wm8350/core.h
+++ b/include/linux/mfd/wm8350/core.h
@@ -602,6 +602,7 @@ extern const u16 wm8352_mode2_defaults[];
 extern const u16 wm8352_mode3_defaults[];
 
 struct wm8350;
+struct regmap;
 
 struct wm8350_hwmon {
 	struct platform_device *pdev;
@@ -612,13 +613,7 @@ struct wm8350 {
 	struct device *dev;
 
 	/* device IO */
-	union {
-		struct i2c_client *i2c_client;
-		struct spi_device *spi_device;
-	};
-	int (*read_dev)(struct wm8350 *wm8350, char reg, int size, void *dest);
-	int (*write_dev)(struct wm8350 *wm8350, char reg, int size,
-			 void *src);
+	struct regmap *regmap;
 	u16 *reg_cache;
 
 	struct mutex auxadc_mutex;
diff --git a/include/linux/mfd/wm8400-private.h b/include/linux/mfd/wm8400-private.h
index 0147b6968510..2de565b94d0c 100644
--- a/include/linux/mfd/wm8400-private.h
+++ b/include/linux/mfd/wm8400-private.h
@@ -24,19 +24,14 @@
 #include <linux/mfd/wm8400.h>
 #include <linux/mutex.h>
 #include <linux/platform_device.h>
-
-struct regmap;
+#include <linux/regmap.h>
 
 #define WM8400_REGISTER_COUNT 0x55
 
 struct wm8400 {
 	struct device *dev;
-
-	struct mutex io_lock;
 	struct regmap *regmap;
 
-	u16 reg_cache[WM8400_REGISTER_COUNT];
-
 	struct platform_device regulators[6];
 };
 
@@ -930,6 +925,11 @@ struct wm8400 {
 
 u16 wm8400_reg_read(struct wm8400 *wm8400, u8 reg);
 int wm8400_block_read(struct wm8400 *wm8400, u8 reg, int count, u16 *data);
-int wm8400_set_bits(struct wm8400 *wm8400, u8 reg, u16 mask, u16 val);
+
+static inline int wm8400_set_bits(struct wm8400 *wm8400, u8 reg,
+				  u16 mask, u16 val)
+{
+	return regmap_update_bits(wm8400->regmap, reg, mask, val);
+}
 
 #endif
diff --git a/include/linux/mfd/wm8994/core.h b/include/linux/mfd/wm8994/core.h
index 6695c3ec4518..1f173306bf05 100644
--- a/include/linux/mfd/wm8994/core.h
+++ b/include/linux/mfd/wm8994/core.h
@@ -57,6 +57,7 @@ struct wm8994 {
 
 	enum wm8994_type type;
 	int revision;
+	int cust_id;
 
 	struct device *dev;
 	struct regmap *regmap;
diff --git a/include/linux/mfd/wm8994/registers.h b/include/linux/mfd/wm8994/registers.h
index 86e6a032a078..053548961c15 100644
--- a/include/linux/mfd/wm8994/registers.h
+++ b/include/linux/mfd/wm8994/registers.h
@@ -2212,6 +2212,9 @@
 /*
  * R256 (0x100) - Chip Revision
  */
+#define WM8994_CUST_ID_MASK                     0xFF00  /* CUST_ID - [15:8] */
+#define WM8994_CUST_ID_SHIFT                         8  /* CUST_ID - [15:8] */
+#define WM8994_CUST_ID_WIDTH                         8  /* CUST_ID - [15:8] */
 #define WM8994_CHIP_REV_MASK                    0x000F  /* CHIP_REV - [3:0] */
 #define WM8994_CHIP_REV_SHIFT                        0  /* CHIP_REV - [3:0] */
 #define WM8994_CHIP_REV_WIDTH                        4  /* CHIP_REV - [3:0] */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7d5c37f24c63..ce26716238c3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -321,6 +321,7 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 static inline void compound_lock(struct page *page)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	VM_BUG_ON(PageSlab(page));
 	bit_spin_lock(PG_compound_lock, &page->flags);
 #endif
 }
@@ -328,6 +329,7 @@ static inline void compound_lock(struct page *page)
 static inline void compound_unlock(struct page *page)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	VM_BUG_ON(PageSlab(page));
 	bit_spin_unlock(PG_compound_lock, &page->flags);
 #endif
 }
@@ -871,8 +873,6 @@ extern void pagefault_out_of_memory(void);
 extern void show_free_areas(unsigned int flags);
 extern bool skip_free_areas_node(unsigned int flags, int nid);
 
-int shmem_lock(struct file *file, int lock, struct user_struct *user);
-struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags);
 int shmem_zero_setup(struct vm_area_struct *);
 
 extern int can_do_mlock(void);
@@ -951,11 +951,9 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 extern int vmtruncate(struct inode *inode, loff_t offset);
-extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end);
 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
 int truncate_inode_page(struct address_space *mapping, struct page *page);
 int generic_error_remove_page(struct address_space *mapping, struct page *page);
-
 int invalidate_inode_page(struct page *page);
 
 #ifdef CONFIG_MMU
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 227fd3e9a9c9..1397ccf81e91 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -21,22 +21,22 @@ static inline int page_is_file_cache(struct page *page)
 	return !PageSwapBacked(page);
 }
 
-static inline void
-add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list lru)
+static __always_inline void add_page_to_lru_list(struct page *page,
+				struct lruvec *lruvec, enum lru_list lru)
 {
-	struct lruvec *lruvec;
-
-	lruvec = mem_cgroup_lru_add_list(zone, page, lru);
+	int nr_pages = hpage_nr_pages(page);
+	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
 	list_add(&page->lru, &lruvec->lists[lru]);
-	__mod_zone_page_state(zone, NR_LRU_BASE + lru, hpage_nr_pages(page));
+	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
 }
 
-static inline void
-del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list lru)
+static __always_inline void del_page_from_lru_list(struct page *page,
+				struct lruvec *lruvec, enum lru_list lru)
 {
-	mem_cgroup_lru_del_list(page, lru);
+	int nr_pages = hpage_nr_pages(page);
+	mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
 	list_del(&page->lru);
-	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -hpage_nr_pages(page));
+	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages);
 }
 
 /**
@@ -61,7 +61,7 @@ static inline enum lru_list page_lru_base_type(struct page *page)
  * Returns the LRU list a page was on, as an index into the array of LRU
  * lists; and clears its Unevictable or Active flags, ready for freeing.
  */
-static inline enum lru_list page_off_lru(struct page *page)
+static __always_inline enum lru_list page_off_lru(struct page *page)
 {
 	enum lru_list lru;
 
@@ -85,7 +85,7 @@ static inline enum lru_list page_off_lru(struct page *page)
  * Returns the LRU list a page should be on, as an index
  * into the array of LRU lists.
  */
-static inline enum lru_list page_lru(struct page *page)
+static __always_inline enum lru_list page_lru(struct page *page)
 {
 	enum lru_list lru;
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 26574c726121..dad95bdd06d7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -345,17 +345,6 @@ struct mm_struct {
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
-	/* Swap token stuff */
-	/*
-	 * Last value of global fault stamp as seen by this process.
-	 * In other words, this value gives an indication of how long
-	 * it has been since this task got the token.
-	 * Look at mm/thrash.c
-	 */
-	unsigned int faultstamp;
-	unsigned int token_priority;
-	unsigned int last_interval;
-
 	unsigned long flags; /* Must use atomic bitops to access the bits */
 
 	struct core_state *core_state; /* coredumping support */
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index c04ecfe03f7f..580bd587d916 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -4,7 +4,7 @@
 #ifdef CONFIG_DEBUG_VM
 #define VM_BUG_ON(cond) BUG_ON(cond)
 #else
-#define VM_BUG_ON(cond) do { (void)(cond); } while (0)
+#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #endif
 
 #ifdef CONFIG_DEBUG_VIRTUAL
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4871e31ae277..2427706f78b4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -185,8 +185,25 @@ static inline int is_unevictable_lru(enum lru_list lru)
 	return (lru == LRU_UNEVICTABLE);
 }
 
+struct zone_reclaim_stat {
+	/*
+	 * The pageout code in vmscan.c keeps track of how many of the
+	 * mem/swap backed and file backed pages are refeferenced.
+	 * The higher the rotated/scanned ratio, the more valuable
+	 * that cache is.
+	 *
+	 * The anon LRU stats live in [0], file LRU stats in [1]
+	 */
+	unsigned long		recent_rotated[2];
+	unsigned long		recent_scanned[2];
+};
+
 struct lruvec {
 	struct list_head lists[NR_LRU_LISTS];
+	struct zone_reclaim_stat reclaim_stat;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	struct zone *zone;
+#endif
 };
 
 /* Mask used at gathering information at once (see memcontrol.c) */
@@ -195,16 +212,12 @@ struct lruvec {
 #define LRU_ALL_EVICTABLE (LRU_ALL_FILE | LRU_ALL_ANON)
 #define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)
 
-/* Isolate inactive pages */
-#define ISOLATE_INACTIVE	((__force isolate_mode_t)0x1)
-/* Isolate active pages */
-#define ISOLATE_ACTIVE		((__force isolate_mode_t)0x2)
 /* Isolate clean file */
-#define ISOLATE_CLEAN		((__force isolate_mode_t)0x4)
+#define ISOLATE_CLEAN		((__force isolate_mode_t)0x1)
 /* Isolate unmapped file */
-#define ISOLATE_UNMAPPED	((__force isolate_mode_t)0x8)
+#define ISOLATE_UNMAPPED	((__force isolate_mode_t)0x2)
 /* Isolate for asynchronous migration */
-#define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x10)
+#define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)
 
 /* LRU Isolation modes. */
 typedef unsigned __bitwise__ isolate_mode_t;
@@ -313,19 +326,6 @@ enum zone_type {
 #error ZONES_SHIFT -- too many zones configured adjust calculation
 #endif
 
-struct zone_reclaim_stat {
-	/*
-	 * The pageout code in vmscan.c keeps track of how many of the
-	 * mem/swap backed and file backed pages are refeferenced.
-	 * The higher the rotated/scanned ratio, the more valuable
-	 * that cache is.
-	 *
-	 * The anon LRU stats live in [0], file LRU stats in [1]
-	 */
-	unsigned long		recent_rotated[2];
-	unsigned long		recent_scanned[2];
-};
-
 struct zone {
 	/* Fields commonly accessed by the page allocator */
 
@@ -407,8 +407,6 @@ struct zone {
 	spinlock_t		lru_lock;
 	struct lruvec		lruvec;
 
-	struct zone_reclaim_stat reclaim_stat;
-
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	unsigned long		flags;		   /* zone flags, see below */
 
@@ -734,6 +732,17 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
 				     unsigned long size,
 				     enum memmap_context context);
 
+extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
+
+static inline struct zone *lruvec_zone(struct lruvec *lruvec)
+{
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	return lruvec->zone;
+#else
+	return container_of(lruvec, struct zone, lruvec);
+#endif
+}
+
 #ifdef CONFIG_HAVE_MEMORY_PRESENT
 void memory_present(int nid, unsigned long start, unsigned long end);
 #else
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 0987146b0637..af2d2fa30eee 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -69,6 +69,10 @@
 #define NFS4_CDFC4_FORE_OR_BOTH 0x3
 #define NFS4_CDFC4_BACK_OR_BOTH 0x7
 
+#define NFS4_CDFS4_FORE 0x1
+#define NFS4_CDFS4_BACK 0x2
+#define NFS4_CDFS4_BOTH 0x3
+
 #define NFS4_SET_TO_SERVER_TIME	0
 #define NFS4_SET_TO_CLIENT_TIME	1
 
@@ -526,6 +530,13 @@ enum lock_type4 {
 #define FATTR4_WORD1_MOUNTED_ON_FILEID  (1UL << 23)
 #define FATTR4_WORD1_FS_LAYOUT_TYPES    (1UL << 30)
 #define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
+#define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
+
+/* MDS threshold bitmap bits */
+#define THRESHOLD_RD                    (1UL << 0)
+#define THRESHOLD_WR                    (1UL << 1)
+#define THRESHOLD_RD_IO                 (1UL << 2)
+#define THRESHOLD_WR_IO                 (1UL << 3)
 
 #define NFSPROC4_NULL 0
 #define NFSPROC4_COMPOUND 1
@@ -596,6 +607,8 @@ enum {
 	NFSPROC4_CLNT_TEST_STATEID,
 	NFSPROC4_CLNT_FREE_STATEID,
 	NFSPROC4_CLNT_GETDEVICELIST,
+	NFSPROC4_CLNT_BIND_CONN_TO_SESSION,
+	NFSPROC4_CLNT_DESTROY_CLIENTID,
 };
 
 /* nfs41 types */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 52a1bdb4ee2b..b23cfc120edb 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -102,6 +102,7 @@ struct nfs_open_context {
 	int error;
 
 	struct list_head list;
+	struct nfs4_threshold	*mdsthreshold;
 };
 
 struct nfs_open_dir_context {
@@ -179,8 +180,7 @@ struct nfs_inode {
 	__be32			cookieverf[2];
 
 	unsigned long		npages;
-	unsigned long		ncommit;
-	struct list_head	commit_list;
+	struct nfs_mds_commit_info commit_info;
 
 	/* Open contexts for shared mmap writes */
 	struct list_head	open_files;
@@ -201,8 +201,10 @@ struct nfs_inode {
 
 	/* pNFS layout information */
 	struct pnfs_layout_hdr *layout;
-	atomic_t		commits_outstanding;
 #endif /* CONFIG_NFS_V4*/
+	/* how many bytes have been written/read and how many bytes queued up */
+	__u64 write_io;
+	__u64 read_io;
 #ifdef CONFIG_NFS_FSCACHE
 	struct fscache_cookie	*fscache;
 #endif
@@ -230,7 +232,6 @@ struct nfs_inode {
 #define NFS_INO_FSCACHE		(5)		/* inode can be cached by FS-Cache */
 #define NFS_INO_FSCACHE_LOCK	(6)		/* FS-Cache cookie management lock */
 #define NFS_INO_COMMIT		(7)		/* inode is committing unstable writes */
-#define NFS_INO_PNFS_COMMIT	(8)		/* use pnfs code for commit */
 #define NFS_INO_LAYOUTCOMMIT	(9)		/* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)		/* layoutcommit inflight */
 
@@ -317,11 +318,6 @@ static inline int nfs_server_capable(struct inode *inode, int cap)
 	return NFS_SERVER(inode)->caps & cap;
 }
 
-static inline int NFS_USE_READDIRPLUS(struct inode *inode)
-{
-	return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
-}
-
 static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
 {
 	dentry->d_time = verf;
@@ -552,8 +548,8 @@ extern int nfs_wb_page(struct inode *inode, struct page* page);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 extern int  nfs_commit_inode(struct inode *, int);
-extern struct nfs_write_data *nfs_commitdata_alloc(void);
-extern void nfs_commit_free(struct nfs_write_data *wdata);
+extern struct nfs_commit_data *nfs_commitdata_alloc(void);
+extern void nfs_commit_free(struct nfs_commit_data *data);
 #else
 static inline int
 nfs_commit_inode(struct inode *inode, int how)
@@ -569,12 +565,6 @@ nfs_have_writebacks(struct inode *inode)
 }
 
 /*
- * Allocate nfs_write_data structures
- */
-extern struct nfs_write_data *nfs_writedata_alloc(unsigned int npages);
-extern void nfs_writedata_free(struct nfs_write_data *);
-
-/*
  * linux/fs/nfs/read.c
  */
 extern int  nfs_readpage(struct file *, struct page *);
@@ -585,12 +575,6 @@ extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
 			       struct page *);
 
 /*
- * Allocate nfs_read_data structures
- */
-extern struct nfs_read_data *nfs_readdata_alloc(unsigned int npages);
-extern void nfs_readdata_free(struct nfs_read_data *);
-
-/*
  * linux/fs/nfs3proc.c
  */
 #ifdef CONFIG_NFS_V3_ACL
@@ -654,6 +638,7 @@ nfs_fileid_to_ino_t(u64 fileid)
 #define NFSDBG_FSCACHE		0x0800
 #define NFSDBG_PNFS		0x1000
 #define NFSDBG_PNFS_LD		0x2000
+#define NFSDBG_STATE		0x4000
 #define NFSDBG_ALL		0xFFFF
 
 #ifdef __KERNEL__
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 7073fc74481c..fbb78fb09bd2 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -17,7 +17,7 @@ struct nfs4_sequence_args;
 struct nfs4_sequence_res;
 struct nfs_server;
 struct nfs4_minor_version_ops;
-struct server_scope;
+struct nfs41_server_scope;
 struct nfs41_impl_id;
 
 /*
@@ -35,6 +35,9 @@ struct nfs_client {
 #define NFS_CS_RENEWD		3		/* - renewd started */
 #define NFS_CS_STOP_RENEW	4		/* no more state to renew */
 #define NFS_CS_CHECK_LEASE_TIME	5		/* need to check lease time */
+	unsigned long		cl_flags;	/* behavior switches */
+#define NFS_CS_NORESVPORT	0		/* - use ephemeral src port */
+#define NFS_CS_DISCRTRY		1		/* - disconnect on RPC retry */
 	struct sockaddr_storage	cl_addr;	/* server identifier */
 	size_t			cl_addrlen;
 	char *			cl_hostname;	/* hostname of server */
@@ -61,9 +64,6 @@ struct nfs_client {
 
 	struct rpc_wait_queue	cl_rpcwaitq;
 
-	/* used for the setclientid verifier */
-	struct timespec		cl_boot_time;
-
 	/* idmapper */
 	struct idmap *		cl_idmap;
 
@@ -79,16 +79,17 @@ struct nfs_client {
 	u32			cl_seqid;
 	/* The flags used for obtaining the clientid during EXCHANGE_ID */
 	u32			cl_exchange_flags;
-	struct nfs4_session	*cl_session; 	/* sharred session */
+	struct nfs4_session	*cl_session;	/* shared session */
+	struct nfs41_server_owner *cl_serverowner;
+	struct nfs41_server_scope *cl_serverscope;
+	struct nfs41_impl_id	*cl_implid;
 #endif /* CONFIG_NFS_V4 */
 
 #ifdef CONFIG_NFS_FSCACHE
 	struct fscache_cookie	*fscache;	/* client index cache cookie */
 #endif
 
-	struct server_scope	*server_scope;	/* from exchange_id */
-	struct nfs41_impl_id	*impl_id;	/* from exchange_id */
-	struct net		*net;
+	struct net		*cl_net;
 };
 
 /*
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index eac30d6bec17..88d166b555e8 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -27,7 +27,6 @@ enum {
 	PG_CLEAN,
 	PG_NEED_COMMIT,
 	PG_NEED_RESCHED,
-	PG_PARTIAL_READ_FAILED,
 	PG_COMMIT_TO_DS,
 };
 
@@ -37,7 +36,6 @@ struct nfs_page {
 	struct page		*wb_page;	/* page to read in/write out */
 	struct nfs_open_context	*wb_context;	/* File state context info */
 	struct nfs_lock_context	*wb_lock_context;	/* lock context info */
-	atomic_t		wb_complete;	/* i/os we're waiting for */
 	pgoff_t			wb_index;	/* Offset >> PAGE_CACHE_SHIFT */
 	unsigned int		wb_offset,	/* Offset & ~PAGE_CACHE_MASK */
 				wb_pgbase,	/* Start of page data */
@@ -68,7 +66,9 @@ struct nfs_pageio_descriptor {
 	int 			pg_ioflags;
 	int			pg_error;
 	const struct rpc_call_ops *pg_rpc_callops;
+	const struct nfs_pgio_completion_ops *pg_completion_ops;
 	struct pnfs_layout_segment *pg_lseg;
+	struct nfs_direct_req	*pg_dreq;
 };
 
 #define NFS_WBACK_BUSY(req)	(test_bit(PG_BUSY,&(req)->wb_flags))
@@ -84,6 +84,7 @@ extern	void nfs_release_request(struct nfs_page *req);
 extern	void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 			     struct inode *inode,
 			     const struct nfs_pageio_ops *pg_ops,
+			     const struct nfs_pgio_completion_ops *compl_ops,
 			     size_t bsize,
 			     int how);
 extern	int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
@@ -95,26 +96,17 @@ extern bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 				struct nfs_page *req);
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern	void nfs_unlock_request(struct nfs_page *req);
+extern	void nfs_unlock_and_release_request(struct nfs_page *req);
 
 /*
- * Lock the page of an asynchronous request without getting a new reference
+ * Lock the page of an asynchronous request
  */
 static inline int
-nfs_lock_request_dontget(struct nfs_page *req)
-{
-	return !test_and_set_bit(PG_BUSY, &req->wb_flags);
-}
-
-static inline int
 nfs_lock_request(struct nfs_page *req)
 {
-	if (test_and_set_bit(PG_BUSY, &req->wb_flags))
-		return 0;
-	kref_get(&req->wb_kref);
-	return 1;
+	return !test_and_set_bit(PG_BUSY, &req->wb_flags);
 }
 
-
 /**
  * nfs_list_add_request - Insert a request into a list
  * @req: request
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7ba3551a0414..d1a7bf51c326 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -35,6 +35,15 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid
 	return a->major == b->major && a->minor == b->minor;
 }
 
+struct nfs4_threshold {
+	__u32	bm;
+	__u32	l_type;
+	__u64	rd_sz;
+	__u64	wr_sz;
+	__u64	rd_io_sz;
+	__u64	wr_io_sz;
+};
+
 struct nfs_fattr {
 	unsigned int		valid;		/* which fields are valid */
 	umode_t			mode;
@@ -67,6 +76,7 @@ struct nfs_fattr {
 	unsigned long		gencount;
 	struct nfs4_string	*owner_name;
 	struct nfs4_string	*group_name;
+	struct nfs4_threshold	*mdsthreshold;	/* pNFS threshold hints */
 };
 
 #define NFS_ATTR_FATTR_TYPE		(1U << 0)
@@ -106,14 +116,14 @@ struct nfs_fattr {
 		| NFS_ATTR_FATTR_FILEID \
 		| NFS_ATTR_FATTR_ATIME \
 		| NFS_ATTR_FATTR_MTIME \
-		| NFS_ATTR_FATTR_CTIME)
+		| NFS_ATTR_FATTR_CTIME \
+		| NFS_ATTR_FATTR_CHANGE)
 #define NFS_ATTR_FATTR_V2 (NFS_ATTR_FATTR \
 		| NFS_ATTR_FATTR_BLOCKS_USED)
 #define NFS_ATTR_FATTR_V3 (NFS_ATTR_FATTR \
 		| NFS_ATTR_FATTR_SPACE_USED)
 #define NFS_ATTR_FATTR_V4 (NFS_ATTR_FATTR \
-		| NFS_ATTR_FATTR_SPACE_USED \
-		| NFS_ATTR_FATTR_CHANGE)
+		| NFS_ATTR_FATTR_SPACE_USED)
 
 /*
  * Info on the file system
@@ -338,7 +348,6 @@ struct nfs_openargs {
 	const struct qstr *	name;
 	const struct nfs_server *server;	 /* Needed for ID mapping */
 	const u32 *		bitmask;
-	const u32 *		dir_bitmask;
 	__u32			claim;
 	struct nfs4_sequence_args	seq_args;
 };
@@ -349,7 +358,6 @@ struct nfs_openres {
 	struct nfs4_change_info	cinfo;
 	__u32                   rflags;
 	struct nfs_fattr *      f_attr;
-	struct nfs_fattr *      dir_attr;
 	struct nfs_seqid *	seqid;
 	const struct nfs_server *server;
 	fmode_t			delegation_type;
@@ -519,12 +527,29 @@ struct nfs_writeres {
 };
 
 /*
+ * Arguments to the commit call.
+ */
+struct nfs_commitargs {
+	struct nfs_fh		*fh;
+	__u64			offset;
+	__u32			count;
+	const u32		*bitmask;
+	struct nfs4_sequence_args	seq_args;
+};
+
+struct nfs_commitres {
+	struct nfs_fattr	*fattr;
+	struct nfs_writeverf	*verf;
+	const struct nfs_server *server;
+	struct nfs4_sequence_res	seq_res;
+};
+
+/*
  * Common arguments to the unlink call
  */
 struct nfs_removeargs {
 	const struct nfs_fh	*fh;
 	struct qstr		name;
-	const u32 *		bitmask;
 	struct nfs4_sequence_args	seq_args;
 };
 
@@ -543,7 +568,6 @@ struct nfs_renameargs {
 	const struct nfs_fh		*new_dir;
 	const struct qstr		*old_name;
 	const struct qstr		*new_name;
-	const u32			*bitmask;
 	struct nfs4_sequence_args	seq_args;
 };
 
@@ -839,7 +863,6 @@ struct nfs4_create_res {
 	struct nfs_fh *			fh;
 	struct nfs_fattr *		fattr;
 	struct nfs4_change_info		dir_cinfo;
-	struct nfs_fattr *		dir_fattr;
 	struct nfs4_sequence_res	seq_res;
 };
 
@@ -1061,6 +1084,21 @@ struct nfstime4 {
 };
 
 #ifdef CONFIG_NFS_V4_1
+
+struct pnfs_commit_bucket {
+	struct list_head written;
+	struct list_head committing;
+	struct pnfs_layout_segment *wlseg;
+	struct pnfs_layout_segment *clseg;
+};
+
+struct pnfs_ds_commit_info {
+	int nwritten;
+	int ncommitting;
+	int nbuckets;
+	struct pnfs_commit_bucket *buckets;
+};
+
 #define NFS4_EXCHANGE_ID_LEN	(48)
 struct nfs41_exchange_id_args {
 	struct nfs_client		*client;
@@ -1070,13 +1108,13 @@ struct nfs41_exchange_id_args {
 	u32				flags;
 };
 
-struct server_owner {
+struct nfs41_server_owner {
 	uint64_t			minor_id;
 	uint32_t			major_id_sz;
 	char				major_id[NFS4_OPAQUE_LIMIT];
 };
 
-struct server_scope {
+struct nfs41_server_scope {
 	uint32_t			server_scope_sz;
 	char 				server_scope[NFS4_OPAQUE_LIMIT];
 };
@@ -1087,10 +1125,18 @@ struct nfs41_impl_id {
 	struct nfstime4			date;
 };
 
+struct nfs41_bind_conn_to_session_res {
+	struct nfs4_session		*session;
+	u32				dir;
+	bool				use_conn_in_rdma_mode;
+};
+
 struct nfs41_exchange_id_res {
-	struct nfs_client		*client;
+	u64				clientid;
+	u32				seqid;
 	u32				flags;
-	struct server_scope		*server_scope;
+	struct nfs41_server_owner	*server_owner;
+	struct nfs41_server_scope	*server_scope;
 	struct nfs41_impl_id		*impl_id;
 };
 
@@ -1143,35 +1189,114 @@ struct nfs41_free_stateid_res {
 	struct nfs4_sequence_res	seq_res;
 };
 
+#else
+
+struct pnfs_ds_commit_info {
+};
+
 #endif /* CONFIG_NFS_V4_1 */
 
 struct nfs_page;
 
 #define NFS_PAGEVEC_SIZE	(8U)
 
+struct nfs_page_array {
+	struct page		**pagevec;
+	unsigned int		npages;		/* Max length of pagevec */
+	struct page		*page_array[NFS_PAGEVEC_SIZE];
+};
+
 struct nfs_read_data {
+	struct nfs_pgio_header	*header;
+	struct list_head	list;
 	struct rpc_task		task;
-	struct inode		*inode;
-	struct rpc_cred		*cred;
 	struct nfs_fattr	fattr;	/* fattr storage */
-	struct list_head	pages;	/* Coalesced read requests */
-	struct list_head	list;	/* lists of struct nfs_read_data */
-	struct nfs_page		*req;	/* multi ops per nfs_page */
-	struct page		**pagevec;
-	unsigned int		npages;	/* Max length of pagevec */
 	struct nfs_readargs args;
 	struct nfs_readres  res;
 	unsigned long		timestamp;	/* For lease renewal */
-	struct pnfs_layout_segment *lseg;
-	struct nfs_client	*ds_clp;	/* pNFS data server */
-	const struct rpc_call_ops *mds_ops;
 	int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
 	__u64			mds_offset;
+	struct nfs_page_array	pages;
+	struct nfs_client	*ds_clp;	/* pNFS data server */
+};
+
+/* used as flag bits in nfs_pgio_header */
+enum {
+	NFS_IOHDR_ERROR = 0,
+	NFS_IOHDR_EOF,
+	NFS_IOHDR_REDO,
+	NFS_IOHDR_NEED_COMMIT,
+	NFS_IOHDR_NEED_RESCHED,
+};
+
+struct nfs_pgio_header {
+	struct inode		*inode;
+	struct rpc_cred		*cred;
+	struct list_head	pages;
+	struct list_head	rpc_list;
+	atomic_t		refcnt;
+	struct nfs_page		*req;
+	struct pnfs_layout_segment *lseg;
+	loff_t			io_start;
+	const struct rpc_call_ops *mds_ops;
+	void (*release) (struct nfs_pgio_header *hdr);
+	const struct nfs_pgio_completion_ops *completion_ops;
+	struct nfs_direct_req	*dreq;
+	spinlock_t		lock;
+	/* fields protected by lock */
 	int			pnfs_error;
-	struct page		*page_array[NFS_PAGEVEC_SIZE];
+	int			error;		/* merge with pnfs_error */
+	unsigned long		good_bytes;	/* boundary of good data */
+	unsigned long		flags;
+};
+
+struct nfs_read_header {
+	struct nfs_pgio_header	header;
+	struct nfs_read_data	rpc_data;
 };
 
 struct nfs_write_data {
+	struct nfs_pgio_header	*header;
+	struct list_head	list;
+	struct rpc_task		task;
+	struct nfs_fattr	fattr;
+	struct nfs_writeverf	verf;
+	struct nfs_writeargs	args;		/* argument struct */
+	struct nfs_writeres	res;		/* result struct */
+	unsigned long		timestamp;	/* For lease renewal */
+	int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
+	__u64			mds_offset;	/* Filelayout dense stripe */
+	struct nfs_page_array	pages;
+	struct nfs_client	*ds_clp;	/* pNFS data server */
+};
+
+struct nfs_write_header {
+	struct nfs_pgio_header	header;
+	struct nfs_write_data	rpc_data;
+};
+
+struct nfs_mds_commit_info {
+	atomic_t rpcs_out;
+	unsigned long		ncommit;
+	struct list_head	list;
+};
+
+struct nfs_commit_data;
+struct nfs_inode;
+struct nfs_commit_completion_ops {
+	void (*error_cleanup) (struct nfs_inode *nfsi);
+	void (*completion) (struct nfs_commit_data *data);
+};
+
+struct nfs_commit_info {
+	spinlock_t			*lock;
+	struct nfs_mds_commit_info	*mds;
+	struct pnfs_ds_commit_info	*ds;
+	struct nfs_direct_req		*dreq;	/* O_DIRECT request */
+	const struct nfs_commit_completion_ops *completion_ops;
+};
+
+struct nfs_commit_data {
 	struct rpc_task		task;
 	struct inode		*inode;
 	struct rpc_cred		*cred;
@@ -1179,22 +1304,22 @@ struct nfs_write_data {
 	struct nfs_writeverf	verf;
 	struct list_head	pages;		/* Coalesced requests we wish to flush */
 	struct list_head	list;		/* lists of struct nfs_write_data */
-	struct nfs_page		*req;		/* multi ops per nfs_page */
-	struct page		**pagevec;
-	unsigned int		npages;		/* Max length of pagevec */
-	struct nfs_writeargs	args;		/* argument struct */
-	struct nfs_writeres	res;		/* result struct */
+	struct nfs_direct_req	*dreq;		/* O_DIRECT request */
+	struct nfs_commitargs	args;		/* argument struct */
+	struct nfs_commitres	res;		/* result struct */
+	struct nfs_open_context *context;
 	struct pnfs_layout_segment *lseg;
 	struct nfs_client	*ds_clp;	/* pNFS data server */
 	int			ds_commit_index;
 	const struct rpc_call_ops *mds_ops;
-	int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
-#ifdef CONFIG_NFS_V4
-	unsigned long		timestamp;	/* For lease renewal */
-#endif
-	__u64			mds_offset;	/* Filelayout dense stripe */
-	int			pnfs_error;
-	struct page		*page_array[NFS_PAGEVEC_SIZE];
+	const struct nfs_commit_completion_ops *completion_ops;
+	int (*commit_done_cb) (struct rpc_task *task, struct nfs_commit_data *data);
+};
+
+struct nfs_pgio_completion_ops {
+	void	(*error_cleanup)(struct list_head *head);
+	void	(*init_hdr)(struct nfs_pgio_header *hdr);
+	void	(*completion)(struct nfs_pgio_header *hdr);
 };
 
 struct nfs_unlinkdata {
@@ -1234,11 +1359,13 @@ struct nfs_rpc_ops {
 
 	int	(*getroot) (struct nfs_server *, struct nfs_fh *,
 			    struct nfs_fsinfo *);
+	struct vfsmount *(*submount) (struct nfs_server *, struct dentry *,
+				      struct nfs_fh *, struct nfs_fattr *);
 	int	(*getattr) (struct nfs_server *, struct nfs_fh *,
 			    struct nfs_fattr *);
 	int	(*setattr) (struct dentry *, struct nfs_fattr *,
 			    struct iattr *);
-	int	(*lookup)  (struct rpc_clnt *clnt, struct inode *, struct qstr *,
+	int	(*lookup)  (struct inode *, struct qstr *,
 			    struct nfs_fh *, struct nfs_fattr *);
 	int	(*access)  (struct inode *, struct nfs_access_entry *);
 	int	(*readlink)(struct inode *, struct page *, unsigned int,
@@ -1277,8 +1404,9 @@ struct nfs_rpc_ops {
 	void	(*write_setup)  (struct nfs_write_data *, struct rpc_message *);
 	void	(*write_rpc_prepare)(struct rpc_task *, struct nfs_write_data *);
 	int	(*write_done)  (struct rpc_task *, struct nfs_write_data *);
-	void	(*commit_setup) (struct nfs_write_data *, struct rpc_message *);
-	int	(*commit_done) (struct rpc_task *, struct nfs_write_data *);
+	void	(*commit_setup) (struct nfs_commit_data *, struct rpc_message *);
+	void	(*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *);
+	int	(*commit_done) (struct rpc_task *, struct nfs_commit_data *);
 	int	(*lock)(struct file *, int, struct file_lock *);
 	int	(*lock_check_bounds)(const struct file_lock *);
 	void	(*clear_acl_cache)(struct inode *);
@@ -1287,9 +1415,9 @@ struct nfs_rpc_ops {
 				struct nfs_open_context *ctx,
 				int open_flags,
 				struct iattr *iattr);
-	int	(*init_client) (struct nfs_client *, const struct rpc_timeout *,
-				const char *, rpc_authflavor_t, int);
-	int	(*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
+	struct nfs_client *
+		(*init_client) (struct nfs_client *, const struct rpc_timeout *,
+				const char *, rpc_authflavor_t);
 };
 
 /*
diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h
index f93e21700d3e..bb115deb7612 100644
--- a/include/linux/of_pci.h
+++ b/include/linux/of_pci.h
@@ -5,7 +5,7 @@
 
 struct pci_dev;
 struct of_irq;
-int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
+int of_irq_map_pci(const struct pci_dev *pdev, struct of_irq *out_irq);
 
 struct device_node;
 struct device_node *of_pci_find_child_device(struct device_node *parent,
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 3d7647536b03..e4c29bc72e70 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -43,8 +43,9 @@ enum oom_constraint {
 extern void compare_swap_oom_score_adj(int old_val, int new_val);
 extern int test_set_oom_score_adj(int new_val);
 
-extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-			const nodemask_t *nodemask, unsigned long totalpages);
+extern unsigned long oom_badness(struct task_struct *p,
+		struct mem_cgroup *memcg, const nodemask_t *nodemask,
+		unsigned long totalpages);
 extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index efa26b4da8d2..7cfad3bbb0cc 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -460,11 +460,11 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size)
  */
 static inline int fault_in_multipages_writeable(char __user *uaddr, int size)
 {
-	int ret;
+	int ret = 0;
 	char __user *end = uaddr + size - 1;
 
 	if (unlikely(size == 0))
-		return 0;
+		return ret;
 
 	/*
 	 * Writing zeroes into userspace here is OK, because we know that if
@@ -489,11 +489,11 @@ static inline int fault_in_multipages_readable(const char __user *uaddr,
 					       int size)
 {
 	volatile char c;
-	int ret;
+	int ret = 0;
 	const char __user *end = uaddr + size - 1;
 
 	if (unlikely(size == 0))
-		return 0;
+		return ret;
 
 	while (uaddr <= end) {
 		ret = __get_user(c, uaddr);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 17b7b5b01b4a..d8c379dba6ad 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -687,7 +687,7 @@ int __must_check pci_bus_add_device(struct pci_dev *dev);
 void pci_read_bridge_bases(struct pci_bus *child);
 struct resource *pci_find_parent_resource(const struct pci_dev *dev,
 					  struct resource *res);
-u8 pci_swizzle_interrupt_pin(struct pci_dev *dev, u8 pin);
+u8 pci_swizzle_interrupt_pin(const struct pci_dev *dev, u8 pin);
 int pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge);
 u8 pci_common_swizzle(struct pci_dev *dev, u8 *pinp);
 extern struct pci_dev *pci_dev_get(struct pci_dev *dev);
@@ -1692,7 +1692,8 @@ extern void pci_release_bus_of_node(struct pci_bus *bus);
 /* Arch may override this (weak) */
 extern struct device_node * __weak pcibios_get_phb_of_node(struct pci_bus *bus);
 
-static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
+static inline struct device_node *
+pci_device_to_OF_node(const struct pci_dev *pdev)
 {
 	return pdev ? pdev->dev.of_node : NULL;
 }
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 3329965ed63f..ab741b0d0074 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2506,6 +2506,7 @@
 #define PCI_DEVICE_ID_INTEL_MRST_SD2	0x084F
 #define PCI_DEVICE_ID_INTEL_I960	0x0960
 #define PCI_DEVICE_ID_INTEL_I960RM	0x0962
+#define PCI_DEVICE_ID_INTEL_CENTERTON_ILB	0x0c60
 #define PCI_DEVICE_ID_INTEL_8257X_SOL	0x1062
 #define PCI_DEVICE_ID_INTEL_82573E_SOL	0x1085
 #define PCI_DEVICE_ID_INTEL_82573L_SOL	0x108F
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index fb201896a8b0..7d7fbe2ef782 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -119,7 +119,7 @@ int __must_check res_counter_charge_locked(struct res_counter *counter,
 					   unsigned long val, bool force);
 int __must_check res_counter_charge(struct res_counter *counter,
 		unsigned long val, struct res_counter **limit_fail_at);
-int __must_check res_counter_charge_nofail(struct res_counter *counter,
+int res_counter_charge_nofail(struct res_counter *counter,
 		unsigned long val, struct res_counter **limit_fail_at);
 
 /*
@@ -135,6 +135,9 @@ int __must_check res_counter_charge_nofail(struct res_counter *counter,
 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
 void res_counter_uncharge(struct res_counter *counter, unsigned long val);
 
+void res_counter_uncharge_until(struct res_counter *counter,
+				struct res_counter *top,
+				unsigned long val);
 /**
  * res_counter_margin - calculate chargeable space of a counter
  * @cnt: the counter
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fd07c4542cee..3fce545df394 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -173,8 +173,6 @@ enum ttu_flags {
 };
 #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
 
-bool is_vma_temporary_stack(struct vm_area_struct *vma);
-
 int try_to_unmap(struct page *, enum ttu_flags flags);
 int try_to_unmap_one(struct page *, struct vm_area_struct *,
 			unsigned long address, enum ttu_flags flags);
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index fcabfb4873c8..f071b3922c67 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -91,6 +91,9 @@ struct rtc_pll_info {
 #define RTC_PLL_GET	_IOR('p', 0x11, struct rtc_pll_info)  /* Get PLL correction */
 #define RTC_PLL_SET	_IOW('p', 0x12, struct rtc_pll_info)  /* Set PLL correction */
 
+#define RTC_VL_READ	_IOR('p', 0x13, int)	/* Voltage low detector */
+#define RTC_VL_CLR	_IO('p', 0x14)		/* Clear voltage low information */
+
 /* interrupt flags */
 #define RTC_IRQF 0x80	/* Any of the following is active */
 #define RTC_PF 0x40	/* Periodic interrupt */
diff --git a/include/linux/rtc/ds1307.h b/include/linux/rtc/ds1307.h
new file mode 100644
index 000000000000..291b1c490367
--- /dev/null
+++ b/include/linux/rtc/ds1307.h
@@ -0,0 +1,22 @@
+/*
+ * ds1307.h - platform_data for the ds1307 (and variants) rtc driver
+ * (C) Copyright 2012 by Wolfram Sang, Pengutronix e.K.
+ * same license as the driver
+ */
+
+#ifndef _LINUX_DS1307_H
+#define _LINUX_DS1307_H
+
+#include <linux/types.h>
+
+#define DS1307_TRICKLE_CHARGER_250_OHM	0x01
+#define DS1307_TRICKLE_CHARGER_2K_OHM	0x02
+#define DS1307_TRICKLE_CHARGER_4K_OHM	0x03
+#define DS1307_TRICKLE_CHARGER_NO_DIODE	0x04
+#define DS1307_TRICKLE_CHARGER_DIODE	0x08
+
+struct ds1307_platform_data {
+	u8 trickle_charger_setup;
+};
+
+#endif /* _LINUX_DS1307_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b1fd5c7925fe..b6661933e252 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -221,8 +221,8 @@ extern unsigned int nr_free_pagecache_pages(void);
 /* linux/mm/swap.c */
 extern void __lru_cache_add(struct page *, enum lru_list lru);
 extern void lru_cache_add_lru(struct page *, enum lru_list lru);
-extern void lru_add_page_tail(struct zone* zone,
-			      struct page *page, struct page *page_tail);
+extern void lru_add_page_tail(struct page *page, struct page *page_tail,
+			      struct lruvec *lruvec);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
@@ -251,7 +251,7 @@ static inline void lru_cache_add_file(struct page *page)
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
-extern int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file);
+extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 						  gfp_t gfp_mask, bool noswap);
 extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
@@ -351,31 +351,14 @@ extern int swap_type_of(dev_t, sector_t, struct block_device **);
 extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
+extern int page_swapcount(struct page *);
 extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
 
-/* linux/mm/thrash.c */
-extern struct mm_struct *swap_token_mm;
-extern void grab_swap_token(struct mm_struct *);
-extern void __put_swap_token(struct mm_struct *);
-extern void disable_swap_token(struct mem_cgroup *memcg);
-
-static inline int has_swap_token(struct mm_struct *mm)
-{
-	return (mm == swap_token_mm);
-}
-
-static inline void put_swap_token(struct mm_struct *mm)
-{
-	if (has_swap_token(mm))
-		__put_swap_token(mm);
-}
-
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 extern void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
-extern int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep);
 #else
 static inline void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
@@ -462,6 +445,11 @@ static inline void delete_from_swap_cache(struct page *page)
 {
 }
 
+static inline int page_swapcount(struct page *page)
+{
+	return 0;
+}
+
 #define reuse_swap_page(page)	(page_mapcount(page) == 1)
 
 static inline int try_to_free_swap(struct page *page)
@@ -476,37 +464,11 @@ static inline swp_entry_t get_swap_page(void)
 	return entry;
 }
 
-/* linux/mm/thrash.c */
-static inline void put_swap_token(struct mm_struct *mm)
-{
-}
-
-static inline void grab_swap_token(struct mm_struct *mm)
-{
-}
-
-static inline int has_swap_token(struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void disable_swap_token(struct mem_cgroup *memcg)
-{
-}
-
 static inline void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
 {
 }
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-static inline int
-mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
-{
-	return 0;
-}
-#endif
-
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index d89f0582b6b6..4a4521699563 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -46,6 +46,7 @@
 #include <linux/list_nulls.h>
 #include <linux/timer.h>
 #include <linux/cache.h>
+#include <linux/bitops.h>
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>	/* struct sk_buff */
@@ -921,12 +922,23 @@ struct proto {
 #endif
 };
 
+/*
+ * Bits in struct cg_proto.flags
+ */
+enum cg_proto_flags {
+	/* Currently active and new sockets should be assigned to cgroups */
+	MEMCG_SOCK_ACTIVE,
+	/* It was ever activated; we must disarm static keys on destruction */
+	MEMCG_SOCK_ACTIVATED,
+};
+
 struct cg_proto {
 	void			(*enter_memory_pressure)(struct sock *sk);
 	struct res_counter	*memory_allocated;	/* Current allocated memory. */
 	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
 	int			*memory_pressure;
 	long			*sysctl_mem;
+	unsigned long		flags;
 	/*
 	 * memcg field is used to find which memcg we belong directly
 	 * Each memcg struct can hold more than one cg_proto, so container_of
@@ -942,6 +954,16 @@ struct cg_proto {
 extern int proto_register(struct proto *prot, int alloc_slab);
 extern void proto_unregister(struct proto *prot);
 
+static inline bool memcg_proto_active(struct cg_proto *cg_proto)
+{
+	return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+}
+
+static inline bool memcg_proto_activated(struct cg_proto *cg_proto)
+{
+	return test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags);
+}
+
 #ifdef SOCK_REFCNT_DEBUG
 static inline void sk_refcnt_debug_inc(struct sock *sk)
 {
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index f64560e204bc..bab3b87e4064 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -13,7 +13,7 @@
 #define RECLAIM_WB_ANON		0x0001u
 #define RECLAIM_WB_FILE		0x0002u
 #define RECLAIM_WB_MIXED	0x0010u
-#define RECLAIM_WB_SYNC		0x0004u
+#define RECLAIM_WB_SYNC		0x0004u /* Unused, all reclaim async */
 #define RECLAIM_WB_ASYNC	0x0008u
 
 #define show_reclaim_flags(flags)				\
@@ -25,15 +25,15 @@
 		{RECLAIM_WB_ASYNC,	"RECLAIM_WB_ASYNC"}	\
 		) : "RECLAIM_WB_NONE"
 
-#define trace_reclaim_flags(page, sync) ( \
+#define trace_reclaim_flags(page) ( \
 	(page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
-	(sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC)   \
+	(RECLAIM_WB_ASYNC) \
 	)
 
-#define trace_shrink_flags(file, sync) ( \
-	(sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \
-			(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) |  \
-	(sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
+#define trace_shrink_flags(file) \
+	( \
+		(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
+		(RECLAIM_WB_ASYNC) \
 	)
 
 TRACE_EVENT(mm_vmscan_kswapd_sleep,
@@ -263,22 +263,16 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
 		unsigned long nr_requested,
 		unsigned long nr_scanned,
 		unsigned long nr_taken,
-		unsigned long nr_lumpy_taken,
-		unsigned long nr_lumpy_dirty,
-		unsigned long nr_lumpy_failed,
 		isolate_mode_t isolate_mode,
 		int file),
 
-	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file),
+	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
 
 	TP_STRUCT__entry(
 		__field(int, order)
 		__field(unsigned long, nr_requested)
 		__field(unsigned long, nr_scanned)
 		__field(unsigned long, nr_taken)
-		__field(unsigned long, nr_lumpy_taken)
-		__field(unsigned long, nr_lumpy_dirty)
-		__field(unsigned long, nr_lumpy_failed)
 		__field(isolate_mode_t, isolate_mode)
 		__field(int, file)
 	),
@@ -288,22 +282,16 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
 		__entry->nr_requested = nr_requested;
 		__entry->nr_scanned = nr_scanned;
 		__entry->nr_taken = nr_taken;
-		__entry->nr_lumpy_taken = nr_lumpy_taken;
-		__entry->nr_lumpy_dirty = nr_lumpy_dirty;
-		__entry->nr_lumpy_failed = nr_lumpy_failed;
 		__entry->isolate_mode = isolate_mode;
 		__entry->file = file;
 	),
 
-	TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu file=%d",
+	TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
 		__entry->isolate_mode,
 		__entry->order,
 		__entry->nr_requested,
 		__entry->nr_scanned,
 		__entry->nr_taken,
-		__entry->nr_lumpy_taken,
-		__entry->nr_lumpy_dirty,
-		__entry->nr_lumpy_failed,
 		__entry->file)
 );
 
@@ -313,13 +301,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
 		unsigned long nr_requested,
 		unsigned long nr_scanned,
 		unsigned long nr_taken,
-		unsigned long nr_lumpy_taken,
-		unsigned long nr_lumpy_dirty,
-		unsigned long nr_lumpy_failed,
 		isolate_mode_t isolate_mode,
 		int file),
 
-	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file)
+	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
 
 );
 
@@ -329,13 +314,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
 		unsigned long nr_requested,
 		unsigned long nr_scanned,
 		unsigned long nr_taken,
-		unsigned long nr_lumpy_taken,
-		unsigned long nr_lumpy_dirty,
-		unsigned long nr_lumpy_failed,
 		isolate_mode_t isolate_mode,
 		int file),
 
-	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file)
+	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
 
 );
 
@@ -395,88 +377,6 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
 		show_reclaim_flags(__entry->reclaim_flags))
 );
 
-TRACE_EVENT(replace_swap_token,
-	TP_PROTO(struct mm_struct *old_mm,
-		 struct mm_struct *new_mm),
-
-	TP_ARGS(old_mm, new_mm),
-
-	TP_STRUCT__entry(
-		__field(struct mm_struct*,	old_mm)
-		__field(unsigned int,		old_prio)
-		__field(struct mm_struct*,	new_mm)
-		__field(unsigned int,		new_prio)
-	),
-
-	TP_fast_assign(
-		__entry->old_mm   = old_mm;
-		__entry->old_prio = old_mm ? old_mm->token_priority : 0;
-		__entry->new_mm   = new_mm;
-		__entry->new_prio = new_mm->token_priority;
-	),
-
-	TP_printk("old_token_mm=%p old_prio=%u new_token_mm=%p new_prio=%u",
-		  __entry->old_mm, __entry->old_prio,
-		  __entry->new_mm, __entry->new_prio)
-);
-
-DECLARE_EVENT_CLASS(put_swap_token_template,
-	TP_PROTO(struct mm_struct *swap_token_mm),
-
-	TP_ARGS(swap_token_mm),
-
-	TP_STRUCT__entry(
-		__field(struct mm_struct*, swap_token_mm)
-	),
-
-	TP_fast_assign(
-		__entry->swap_token_mm = swap_token_mm;
-	),
-
-	TP_printk("token_mm=%p", __entry->swap_token_mm)
-);
-
-DEFINE_EVENT(put_swap_token_template, put_swap_token,
-	TP_PROTO(struct mm_struct *swap_token_mm),
-	TP_ARGS(swap_token_mm)
-);
-
-DEFINE_EVENT_CONDITION(put_swap_token_template, disable_swap_token,
-	TP_PROTO(struct mm_struct *swap_token_mm),
-	TP_ARGS(swap_token_mm),
-	TP_CONDITION(swap_token_mm != NULL)
-);
-
-TRACE_EVENT_CONDITION(update_swap_token_priority,
-	TP_PROTO(struct mm_struct *mm,
-		 unsigned int old_prio,
-		 struct mm_struct *swap_token_mm),
-
-	TP_ARGS(mm, old_prio, swap_token_mm),
-
-	TP_CONDITION(mm->token_priority != old_prio),
-
-	TP_STRUCT__entry(
-		__field(struct mm_struct*, mm)
-		__field(unsigned int, old_prio)
-		__field(unsigned int, new_prio)
-		__field(struct mm_struct*, swap_token_mm)
-		__field(unsigned int, swap_token_prio)
-	),
-
-	TP_fast_assign(
-		__entry->mm		= mm;
-		__entry->old_prio	= old_prio;
-		__entry->new_prio	= mm->token_priority;
-		__entry->swap_token_mm	= swap_token_mm;
-		__entry->swap_token_prio = swap_token_mm ? swap_token_mm->token_priority : 0;
-	),
-
-	TP_printk("mm=%p old_prio=%u new_prio=%u swap_token_mm=%p token_prio=%u",
-		  __entry->mm, __entry->old_prio, __entry->new_prio,
-		  __entry->swap_token_mm, __entry->swap_token_prio)
-);
-
 #endif /* _TRACE_VMSCAN_H */
 
 /* This part must be outside protection */
diff --git a/init/Kconfig b/init/Kconfig
index 81816b82860b..1e004d057468 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -803,7 +803,7 @@ config RT_GROUP_SCHED
 endif #CGROUP_SCHED
 
 config BLK_CGROUP
-	tristate "Block IO controller"
+	bool "Block IO controller"
 	depends on BLOCK
 	default n
 	---help---
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a0c6af34d500..0f3527d6184a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5132,7 +5132,7 @@ EXPORT_SYMBOL_GPL(css_depth);
  * @root: the css supporsed to be an ancestor of the child.
  *
  * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
- * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
+ * this function reads css->id, the caller must hold rcu_read_lock().
  * But, considering usual usage, the csses should be valid objects after test.
  * Assuming that the caller will do some action to the child if this returns
  * returns true, the caller must take "child";s reference count.
@@ -5144,18 +5144,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
 {
 	struct css_id *child_id;
 	struct css_id *root_id;
-	bool ret = true;
 
-	rcu_read_lock();
 	child_id  = rcu_dereference(child->id);
+	if (!child_id)
+		return false;
 	root_id = rcu_dereference(root->id);
-	if (!child_id
-	    || !root_id
-	    || (child_id->depth < root_id->depth)
-	    || (child_id->stack[root_id->depth] != root_id->id))
-		ret = false;
-	rcu_read_unlock();
-	return ret;
+	if (!root_id)
+		return false;
+	if (child_id->depth < root_id->depth)
+		return false;
+	if (child_id->stack[root_id->depth] != root_id->id)
+		return false;
+	return true;
 }
 
 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
diff --git a/kernel/fork.c b/kernel/fork.c
index 47b4e4f379f9..31a32c7dd169 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -386,7 +386,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		}
 		charge = 0;
 		if (mpnt->vm_flags & VM_ACCOUNT) {
-			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+			unsigned long len;
+			len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
 			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
 				goto fail_nomem;
 			charge = len;
@@ -614,7 +615,6 @@ void mmput(struct mm_struct *mm)
 			list_del(&mm->mmlist);
 			spin_unlock(&mmlist_lock);
 		}
-		put_swap_token(mm);
 		if (mm->binfmt)
 			module_put(mm->binfmt->module);
 		mmdrop(mm);
@@ -831,10 +831,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	memcpy(mm, oldmm, sizeof(*mm));
 	mm_init_cpumask(mm);
 
-	/* Initializing for Swap token stuff */
-	mm->token_priority = 0;
-	mm->last_interval = 0;
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	mm->pmd_huge_pte = NULL;
 #endif
@@ -913,10 +909,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 		goto fail_nomem;
 
 good_mm:
-	/* Initializing for Swap token stuff */
-	mm->token_priority = 0;
-	mm->last_interval = 0;
-
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	return 0;
@@ -984,9 +976,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 	 * Share io context with parent, if CLONE_IO is set
 	 */
 	if (clone_flags & CLONE_IO) {
-		tsk->io_context = ioc_task_link(ioc);
-		if (unlikely(!tsk->io_context))
-			return -ENOMEM;
+		ioc_task_link(ioc);
+		tsk->io_context = ioc;
 	} else if (ioprio_valid(ioc->ioprio)) {
 		new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
 		if (unlikely(!new_ioc))
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 079f1d39a8b8..2169feeba529 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -343,7 +343,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
 
 /* Look up a kernel symbol and return it in a text buffer. */
 static int __sprint_symbol(char *buffer, unsigned long address,
-			   int symbol_offset)
+			   int symbol_offset, int add_offset)
 {
 	char *modname;
 	const char *name;
@@ -358,13 +358,13 @@ static int __sprint_symbol(char *buffer, unsigned long address,
 	if (name != buffer)
 		strcpy(buffer, name);
 	len = strlen(buffer);
-	buffer += len;
 	offset -= symbol_offset;
 
+	if (add_offset)
+		len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
+
 	if (modname)
-		len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
-	else
-		len += sprintf(buffer, "+%#lx/%#lx", offset, size);
+		len += sprintf(buffer + len, " [%s]", modname);
 
 	return len;
 }
@@ -382,12 +382,28 @@ static int __sprint_symbol(char *buffer, unsigned long address,
  */
 int sprint_symbol(char *buffer, unsigned long address)
 {
-	return __sprint_symbol(buffer, address, 0);
+	return __sprint_symbol(buffer, address, 0, 1);
 }
-
 EXPORT_SYMBOL_GPL(sprint_symbol);
 
 /**
+ * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name
+ * and module name to @buffer if possible. If no symbol was found, just saves
+ * its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol_no_offset(char *buffer, unsigned long address)
+{
+	return __sprint_symbol(buffer, address, 0, 0);
+}
+EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
+
+/**
  * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
  * @buffer: buffer to be stored
  * @address: address to lookup
@@ -403,7 +419,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol);
  */
 int sprint_backtrace(char *buffer, unsigned long address)
 {
-	return __sprint_symbol(buffer, address, -1);
+	return __sprint_symbol(buffer, address, -1, 1);
 }
 
 /* Look up a kernel symbol and print it to the kernel messages. */
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bebe2b170d49..ad581aa2369a 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -94,13 +94,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
 	counter->usage -= val;
 }
 
-void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+void res_counter_uncharge_until(struct res_counter *counter,
+				struct res_counter *top,
+				unsigned long val)
 {
 	unsigned long flags;
 	struct res_counter *c;
 
 	local_irq_save(flags);
-	for (c = counter; c != NULL; c = c->parent) {
+	for (c = counter; c != top; c = c->parent) {
 		spin_lock(&c->lock);
 		res_counter_uncharge_locked(c, val);
 		spin_unlock(&c->lock);
@@ -108,6 +110,10 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
 	local_irq_restore(flags);
 }
 
+void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+{
+	res_counter_uncharge_until(counter, NULL, val);
+}
 
 static inline unsigned long long *
 res_counter_member(struct res_counter *counter, int member)
diff --git a/lib/bitmap.c b/lib/bitmap.c
index b5a8b6ad2454..06fdfa1aeba7 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -369,7 +369,8 @@ EXPORT_SYMBOL(bitmap_find_next_zero_area);
  * @nmaskbits: size of bitmap, in bits
  *
  * Exactly @nmaskbits bits are displayed.  Hex digits are grouped into
- * comma-separated sets of eight digits per set.
+ * comma-separated sets of eight digits per set.  Returns the number of
+ * characters which were written to *buf, excluding the trailing \0.
  */
 int bitmap_scnprintf(char *buf, unsigned int buflen,
 	const unsigned long *maskp, int nmaskbits)
@@ -517,8 +518,8 @@ EXPORT_SYMBOL(bitmap_parse_user);
  *
  * Helper routine for bitmap_scnlistprintf().  Write decimal number
  * or range to buf, suppressing output past buf+buflen, with optional
- * comma-prefix.  Return len of what would be written to buf, if it
- * all fit.
+ * comma-prefix.  Return len of what was written to *buf, excluding the
+ * trailing \0.
  */
 static inline int bscnl_emit(char *buf, int buflen, int rbot, int rtop, int len)
 {
@@ -544,9 +545,8 @@ static inline int bscnl_emit(char *buf, int buflen, int rbot, int rtop, int len)
  * the range.  Output format is compatible with the format
  * accepted as input by bitmap_parselist().
  *
- * The return value is the number of characters which would be
- * generated for the given input, excluding the trailing '\0', as
- * per ISO C99.
+ * The return value is the number of characters which were written to *buf
+ * excluding the trailing '\0', as per ISO C99's scnprintf.
  */
 int bitmap_scnlistprintf(char *buf, unsigned int buflen,
 	const unsigned long *maskp, int nmaskbits)
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 13ef2338be41..518aea714d21 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -430,7 +430,7 @@ static struct dma_debug_entry *__dma_entry_alloc(void)
  */
 static struct dma_debug_entry *dma_entry_alloc(void)
 {
-	struct dma_debug_entry *entry = NULL;
+	struct dma_debug_entry *entry;
 	unsigned long flags;
 
 	spin_lock_irqsave(&free_entries_lock, flags);
@@ -438,11 +438,14 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 	if (list_empty(&free_entries)) {
 		pr_err("DMA-API: debugging out of memory - disabling\n");
 		global_disable = true;
-		goto out;
+		spin_unlock_irqrestore(&free_entries_lock, flags);
+		return NULL;
 	}
 
 	entry = __dma_entry_alloc();
 
+	spin_unlock_irqrestore(&free_entries_lock, flags);
+
 #ifdef CONFIG_STACKTRACE
 	entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
 	entry->stacktrace.entries = entry->st_entries;
@@ -450,9 +453,6 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 	save_stack_trace(&entry->stacktrace);
 #endif
 
-out:
-	spin_unlock_irqrestore(&free_entries_lock, flags);
-
 	return entry;
 }
 
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 3810b481f940..23a5e031cd8b 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -31,6 +31,9 @@ void __list_add(struct list_head *new,
 		"list_add corruption. prev->next should be "
 		"next (%p), but was %p. (prev=%p).\n",
 		next, prev->next, prev);
+	WARN(new == prev || new == next,
+	     "list_add double add: new=%p, prev=%p, next=%p.\n",
+	     new, prev, next);
 	next->prev = new;
 	new->next = next;
 	new->prev = prev;
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 86516f5588e3..d7c878cc006c 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -73,11 +73,24 @@ static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
 static struct kmem_cache *radix_tree_node_cachep;
 
 /*
+ * The radix tree is variable-height, so an insert operation not only has
+ * to build the branch to its corresponding item, it also has to build the
+ * branch to existing items if the size has to be increased (by
+ * radix_tree_extend).
+ *
+ * The worst case is a zero height tree with just a single item at index 0,
+ * and then inserting an item at index ULONG_MAX. This requires 2 new branches
+ * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared.
+ * Hence:
+ */
+#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)
+
+/*
  * Per-cpu pool of preloaded nodes
  */
 struct radix_tree_preload {
 	int nr;
-	struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
+	struct radix_tree_node *nodes[RADIX_TREE_PRELOAD_SIZE];
 };
 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
 
diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c
index 525d160d44f0..d0ec4f3d1593 100644
--- a/lib/spinlock_debug.c
+++ b/lib/spinlock_debug.c
@@ -58,7 +58,7 @@ static void spin_dump(raw_spinlock_t *lock, const char *msg)
 	printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
 		msg, raw_smp_processor_id(),
 		current->comm, task_pid_nr(current));
-	printk(KERN_EMERG " lock: %p, .magic: %08x, .owner: %s/%d, "
+	printk(KERN_EMERG " lock: %ps, .magic: %08x, .owner: %s/%d, "
 			".owner_cpu: %d\n",
 		lock, lock->magic,
 		owner ? owner->comm : "<none>",
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index dd4ece372699..1cffc223bff5 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -23,15 +23,15 @@
 int string_get_size(u64 size, const enum string_size_units units,
 		    char *buf, int len)
 {
-	const char *units_10[] = { "B", "kB", "MB", "GB", "TB", "PB",
+	static const char *units_10[] = { "B", "kB", "MB", "GB", "TB", "PB",
 				   "EB", "ZB", "YB", NULL};
-	const char *units_2[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB",
+	static const char *units_2[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB",
 				 "EiB", "ZiB", "YiB", NULL };
-	const char **units_str[] = {
+	static const char **units_str[] = {
 		[STRING_UNITS_10] =  units_10,
 		[STRING_UNITS_2] = units_2,
 	};
-	const unsigned int divisor[] = {
+	static const unsigned int divisor[] = {
 		[STRING_UNITS_10] = 1000,
 		[STRING_UNITS_2] = 1024,
 	};
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 414f46ed1dcd..45bc1f83a5ad 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -130,11 +130,9 @@ void swiotlb_print_info(void)
 	pstart = virt_to_phys(io_tlb_start);
 	pend = virt_to_phys(io_tlb_end);
 
-	printk(KERN_INFO "Placing %luMB software IO TLB between %p - %p\n",
-	       bytes >> 20, io_tlb_start, io_tlb_end);
-	printk(KERN_INFO "software IO TLB at phys %#llx - %#llx\n",
-	       (unsigned long long)pstart,
-	       (unsigned long long)pend);
+	printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n",
+	       (unsigned long long)pstart, (unsigned long long)pend - 1,
+	       bytes >> 20, io_tlb_start, io_tlb_end - 1);
 }
 
 void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
diff --git a/lib/test-kstrtox.c b/lib/test-kstrtox.c
index d55769d63cb8..bea3f3fa3f02 100644
--- a/lib/test-kstrtox.c
+++ b/lib/test-kstrtox.c
@@ -11,7 +11,7 @@ struct test_fail {
 };
 
 #define DEFINE_TEST_FAIL(test)	\
-	const struct test_fail test[] __initdata
+	const struct test_fail test[] __initconst
 
 #define DECLARE_TEST_OK(type, test_type)	\
 	test_type {				\
@@ -21,7 +21,7 @@ struct test_fail {
 	}
 
 #define DEFINE_TEST_OK(type, test)	\
-	const type test[] __initdata
+	const type test[] __initconst
 
 #define TEST_FAIL(fn, type, fmt, test)					\
 {									\
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index abbabec9720a..5391299c1e78 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -284,6 +284,7 @@ char *number(char *buf, char *end, unsigned long long num,
 	char locase;
 	int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
 	int i;
+	bool is_zero = num == 0LL;
 
 	/* locase = 0 or 0x20. ORing digits or letters with 'locase'
 	 * produces same digits or (maybe lowercased) letters */
@@ -305,8 +306,9 @@ char *number(char *buf, char *end, unsigned long long num,
 		}
 	}
 	if (need_pfx) {
-		spec.field_width--;
 		if (spec.base == 16)
+			spec.field_width -= 2;
+		else if (!is_zero)
 			spec.field_width--;
 	}
 
@@ -353,9 +355,11 @@ char *number(char *buf, char *end, unsigned long long num,
 	}
 	/* "0x" / "0" prefix */
 	if (need_pfx) {
-		if (buf < end)
-			*buf = '0';
-		++buf;
+		if (spec.base == 16 || !is_zero) {
+			if (buf < end)
+				*buf = '0';
+			++buf;
+		}
 		if (spec.base == 16) {
 			if (buf < end)
 				*buf = ('X' | locase);
@@ -436,7 +440,7 @@ char *symbol_string(char *buf, char *end, void *ptr,
 	else if (ext != 'f' && ext != 's')
 		sprint_symbol(sym, value);
 	else
-		kallsyms_lookup(value, NULL, NULL, NULL, sym);
+		sprint_symbol_no_offset(sym, value);
 
 	return string(buf, end, sym, spec);
 #else
diff --git a/mm/Kconfig b/mm/Kconfig
index 39220026c797..b2176374b98e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -349,6 +349,16 @@ choice
 	  benefit.
 endchoice
 
+config CROSS_MEMORY_ATTACH
+	bool "Cross Memory Support"
+	depends on MMU
+	default y
+	help
+	  Enabling this option adds the system calls process_vm_readv and
+	  process_vm_writev which allow a process with the correct privileges
+	  to directly read from or write to to another process's address space.
+	  See the man page for more details.
+
 #
 # UP and nommu archs use km based percpu allocator
 #
diff --git a/mm/Makefile b/mm/Makefile
index 8aada89efbbb..a156285ce88d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,8 +5,11 @@
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
-			   vmalloc.o pagewalk.o pgtable-generic.o \
-			   process_vm_access.o
+			   vmalloc.o pagewalk.o pgtable-generic.o
+
+ifdef CONFIG_CROSS_MEMORY_ATTACH
+mmu-$(CONFIG_MMU)	+= process_vm_access.o
+endif
 
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
@@ -25,7 +28,7 @@ endif
 obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
-obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0131170c9d54..ec4fcb7a56c8 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages)
  */
 static void __init link_bootmem(bootmem_data_t *bdata)
 {
-	struct list_head *iter;
+	bootmem_data_t *ent;
 
-	list_for_each(iter, &bdata_list) {
-		bootmem_data_t *ent;
-
-		ent = list_entry(iter, bootmem_data_t, list);
-		if (bdata->node_min_pfn < ent->node_min_pfn)
-			break;
+	list_for_each_entry(ent, &bdata_list, list) {
+		if (bdata->node_min_pfn < ent->node_min_pfn) {
+			list_add_tail(&bdata->list, &ent->list);
+			return;
+		}
 	}
-	list_add_tail(&bdata->list, iter);
+
+	list_add_tail(&bdata->list, &bdata_list);
 }
 
 /*
@@ -203,7 +203,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 		} else {
 			unsigned long off = 0;
 
-			while (vec && off < BITS_PER_LONG) {
+			vec >>= start & (BITS_PER_LONG - 1);
+			while (vec) {
 				if (vec & 1) {
 					page = pfn_to_page(start + off);
 					__free_pages_bootmem(page, 0);
@@ -467,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
 	return ALIGN(base + off, align) - base;
 }
 
-static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
+static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
 					unsigned long size, unsigned long align,
 					unsigned long goal, unsigned long limit)
 {
@@ -588,14 +589,14 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
 		p_bdata = bootmem_arch_preferred_node(bdata, size, align,
 							goal, limit);
 		if (p_bdata)
-			return alloc_bootmem_core(p_bdata, size, align,
+			return alloc_bootmem_bdata(p_bdata, size, align,
 							goal, limit);
 	}
 #endif
 	return NULL;
 }
 
-static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+static void * __init alloc_bootmem_core(unsigned long size,
 					unsigned long align,
 					unsigned long goal,
 					unsigned long limit)
@@ -603,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
 	bootmem_data_t *bdata;
 	void *region;
 
-restart:
 	region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
 	if (region)
 		return region;
@@ -614,11 +614,25 @@ restart:
 		if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
 			break;
 
-		region = alloc_bootmem_core(bdata, size, align, goal, limit);
+		region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
 		if (region)
 			return region;
 	}
 
+	return NULL;
+}
+
+static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+					      unsigned long align,
+					      unsigned long goal,
+					      unsigned long limit)
+{
+	void *ptr;
+
+restart:
+	ptr = alloc_bootmem_core(size, align, goal, limit);
+	if (ptr)
+		return ptr;
 	if (goal) {
 		goal = 0;
 		goto restart;
@@ -684,21 +698,56 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 	return ___alloc_bootmem(size, align, goal, limit);
 }
 
-static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
+static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				unsigned long size, unsigned long align,
 				unsigned long goal, unsigned long limit)
 {
 	void *ptr;
 
-	ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit);
+again:
+	ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
+					   align, goal, limit);
 	if (ptr)
 		return ptr;
 
-	ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
+	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
 	if (ptr)
 		return ptr;
 
-	return ___alloc_bootmem(size, align, goal, limit);
+	ptr = alloc_bootmem_core(size, align, goal, limit);
+	if (ptr)
+		return ptr;
+
+	if (goal) {
+		goal = 0;
+		goto again;
+	}
+
+	return NULL;
+}
+
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+}
+
+void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+				    unsigned long align, unsigned long goal,
+				    unsigned long limit)
+{
+	void *ptr;
+
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	if (ptr)
+		return ptr;
+
+	printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+	panic("Out of memory");
+	return NULL;
 }
 
 /**
@@ -722,7 +771,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return  ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+	return  ___alloc_bootmem_node(pgdat, size, align, goal, 0);
 }
 
 void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -743,7 +792,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 		unsigned long new_goal;
 
 		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
-		ptr = alloc_bootmem_core(pgdat->bdata, size, align,
+		ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
 						 new_goal, 0);
 		if (ptr)
 			return ptr;
@@ -754,47 +803,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 
 }
 
-#ifdef CONFIG_SPARSEMEM
-/**
- * alloc_bootmem_section - allocate boot memory from a specific section
- * @size: size of the request in bytes
- * @section_nr: sparse map section to allocate from
- *
- * Return NULL on failure.
- */
-void * __init alloc_bootmem_section(unsigned long size,
-				    unsigned long section_nr)
-{
-	bootmem_data_t *bdata;
-	unsigned long pfn, goal;
-
-	pfn = section_nr_to_pfn(section_nr);
-	goal = pfn << PAGE_SHIFT;
-	bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
-
-	return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
-}
-#endif
-
-void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
-				   unsigned long align, unsigned long goal)
-{
-	void *ptr;
-
-	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
-	ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
-	if (ptr)
-		return ptr;
-
-	ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
-	if (ptr)
-		return ptr;
-
-	return __alloc_bootmem_nopanic(size, align, goal);
-}
-
 #ifndef ARCH_LOW_ADDRESS_LIMIT
 #define ARCH_LOW_ADDRESS_LIMIT	0xffffffffUL
 #endif
@@ -839,6 +847,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node(pgdat->bdata, size, align,
-				goal, ARCH_LOW_ADDRESS_LIMIT);
+	return ___alloc_bootmem_node(pgdat, size, align,
+				     goal, ARCH_LOW_ADDRESS_LIMIT);
 }
diff --git a/mm/compaction.c b/mm/compaction.c
index da7d35ea5103..4ac338af5120 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -226,7 +226,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	unsigned long last_pageblock_nr = 0, pageblock_nr;
 	unsigned long nr_scanned = 0, nr_isolated = 0;
 	struct list_head *migratelist = &cc->migratepages;
-	isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
+	isolate_mode_t mode = 0;
+	struct lruvec *lruvec;
 
 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
@@ -235,7 +236,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	 */
 	while (unlikely(too_many_isolated(zone))) {
 		/* async migration should just abort */
-		if (!cc->sync)
+		if (cc->mode != COMPACT_SYNC)
 			return 0;
 
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -303,7 +304,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		 * satisfies the allocation
 		 */
 		pageblock_nr = low_pfn >> pageblock_order;
-		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+		if (cc->mode != COMPACT_SYNC &&
+		    last_pageblock_nr != pageblock_nr &&
 		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
 			low_pfn += pageblock_nr_pages;
 			low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
@@ -324,17 +326,19 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			continue;
 		}
 
-		if (!cc->sync)
+		if (cc->mode != COMPACT_SYNC)
 			mode |= ISOLATE_ASYNC_MIGRATE;
 
+		lruvec = mem_cgroup_page_lruvec(page, zone);
+
 		/* Try isolate the page */
-		if (__isolate_lru_page(page, mode, 0) != 0)
+		if (__isolate_lru_page(page, mode) != 0)
 			continue;
 
 		VM_BUG_ON(PageTransCompound(page));
 
 		/* Successfully isolated */
-		del_page_from_lru_list(zone, page, page_lru(page));
+		del_page_from_lru_list(page, lruvec, page_lru(page));
 		list_add(&page->lru, migratelist);
 		cc->nr_migratepages++;
 		nr_isolated++;
@@ -357,27 +361,90 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
 #ifdef CONFIG_COMPACTION
+/*
+ * Returns true if MIGRATE_UNMOVABLE pageblock was successfully
+ * converted to MIGRATE_MOVABLE type, false otherwise.
+ */
+static bool rescue_unmovable_pageblock(struct page *page)
+{
+	unsigned long pfn, start_pfn, end_pfn;
+	struct page *start_page, *end_page;
+
+	pfn = page_to_pfn(page);
+	start_pfn = pfn & ~(pageblock_nr_pages - 1);
+	end_pfn = start_pfn + pageblock_nr_pages;
+
+	start_page = pfn_to_page(start_pfn);
+	end_page = pfn_to_page(end_pfn);
+
+	/* Do not deal with pageblocks that overlap zones */
+	if (page_zone(start_page) != page_zone(end_page))
+		return false;
+
+	for (page = start_page, pfn = start_pfn; page < end_page; pfn++,
+								  page++) {
+		if (!pfn_valid_within(pfn))
+			continue;
+
+		if (PageBuddy(page)) {
+			int order = page_order(page);
+
+			pfn += (1 << order) - 1;
+			page += (1 << order) - 1;
+
+			continue;
+		} else if (page_count(page) == 0 || PageLRU(page))
+			continue;
+
+		return false;
+	}
+
+	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+	move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE);
+	return true;
+}
+
+enum smt_result {
+	GOOD_AS_MIGRATION_TARGET,
+	FAIL_UNMOVABLE_TARGET,
+	FAIL_BAD_TARGET,
+};
 
-/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
+/*
+ * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block
+ * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page
+ * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise.
+ */
+static enum smt_result suitable_migration_target(struct page *page,
+				      struct compact_control *cc)
 {
 
 	int migratetype = get_pageblock_migratetype(page);
 
 	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
 	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
-		return false;
+		return FAIL_BAD_TARGET;
 
 	/* If the page is a large free page, then allow migration */
 	if (PageBuddy(page) && page_order(page) >= pageblock_order)
-		return true;
+		return GOOD_AS_MIGRATION_TARGET;
 
 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
-	if (migrate_async_suitable(migratetype))
-		return true;
+	if (cc->mode != COMPACT_ASYNC_UNMOVABLE &&
+	    migrate_async_suitable(migratetype))
+		return GOOD_AS_MIGRATION_TARGET;
+
+	if (cc->mode == COMPACT_ASYNC_MOVABLE &&
+	    migratetype == MIGRATE_UNMOVABLE)
+		return FAIL_UNMOVABLE_TARGET;
+
+	if (cc->mode != COMPACT_ASYNC_MOVABLE &&
+	    migratetype == MIGRATE_UNMOVABLE &&
+	    rescue_unmovable_pageblock(page))
+		return GOOD_AS_MIGRATION_TARGET;
 
 	/* Otherwise skip the block */
-	return false;
+	return FAIL_BAD_TARGET;
 }
 
 /*
@@ -411,6 +478,13 @@ static void isolate_freepages(struct zone *zone,
 	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
 
 	/*
+	 * isolate_freepages() may be called more than once during
+	 * compact_zone_order() run and we want only the most recent
+	 * count.
+	 */
+	cc->nr_pageblocks_skipped = 0;
+
+	/*
 	 * Isolate free pages until enough are available to migrate the
 	 * pages on cc->migratepages. We stop searching if the migrate
 	 * and free page scanners meet or enough free pages are isolated.
@@ -418,6 +492,7 @@ static void isolate_freepages(struct zone *zone,
 	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
 					pfn -= pageblock_nr_pages) {
 		unsigned long isolated;
+		enum smt_result ret;
 
 		if (!pfn_valid(pfn))
 			continue;
@@ -434,9 +509,12 @@ static void isolate_freepages(struct zone *zone,
 			continue;
 
 		/* Check the block is suitable for migration */
-		if (!suitable_migration_target(page))
+		ret = suitable_migration_target(page, cc);
+		if (ret != GOOD_AS_MIGRATION_TARGET) {
+			if (ret == FAIL_UNMOVABLE_TARGET)
+				cc->nr_pageblocks_skipped++;
 			continue;
-
+		}
 		/*
 		 * Found a block suitable for isolating free pages from. Now
 		 * we disabled interrupts, double check things are ok and
@@ -445,12 +523,14 @@ static void isolate_freepages(struct zone *zone,
 		 */
 		isolated = 0;
 		spin_lock_irqsave(&zone->lock, flags);
-		if (suitable_migration_target(page)) {
+		ret = suitable_migration_target(page, cc);
+		if (ret == GOOD_AS_MIGRATION_TARGET) {
 			end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
 			isolated = isolate_freepages_block(pfn, end_pfn,
 							   freelist, false);
 			nr_freepages += isolated;
-		}
+		} else if (ret == FAIL_UNMOVABLE_TARGET)
+			cc->nr_pageblocks_skipped++;
 		spin_unlock_irqrestore(&zone->lock, flags);
 
 		/*
@@ -682,8 +762,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
-				(unsigned long)cc, false,
-				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
+			(unsigned long)&cc->freepages, false,
+			(cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT
+						      : MIGRATE_ASYNC);
 		update_nr_listpages(cc);
 		nr_remaining = cc->nr_migratepages;
 
@@ -712,7 +793,8 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
-				 bool sync)
+				 enum compact_mode mode,
+				 unsigned long *nr_pageblocks_skipped)
 {
 	struct compact_control cc = {
 		.nr_freepages = 0,
@@ -720,12 +802,17 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.order = order,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
-		.sync = sync,
+		.mode = mode,
 	};
+	unsigned long rc;
+
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
 
-	return compact_zone(zone, &cc);
+	rc = compact_zone(zone, &cc);
+	*nr_pageblocks_skipped = cc.nr_pageblocks_skipped;
+
+	return rc;
 }
 
 int sysctl_extfrag_threshold = 500;
@@ -750,6 +837,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	struct zoneref *z;
 	struct zone *zone;
 	int rc = COMPACT_SKIPPED;
+	unsigned long nr_pageblocks_skipped;
+	enum compact_mode mode;
 
 	/*
 	 * Check whether it is worth even starting compaction. The order check is
@@ -766,12 +855,22 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 								nodemask) {
 		int status;
 
-		status = compact_zone_order(zone, order, gfp_mask, sync);
+		mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE;
+retry:
+		status = compact_zone_order(zone, order, gfp_mask, mode,
+						&nr_pageblocks_skipped);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
 		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
 			break;
+
+		if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) {
+			if (nr_pageblocks_skipped) {
+				mode = COMPACT_ASYNC_UNMOVABLE;
+				goto retry;
+			}
+		}
 	}
 
 	return rc;
@@ -805,7 +904,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 			if (ok && cc->order > zone->compact_order_failed)
 				zone->compact_order_failed = cc->order + 1;
 			/* Currently async compaction is never deferred. */
-			else if (!ok && cc->sync)
+			else if (!ok && cc->mode == COMPACT_SYNC)
 				defer_compaction(zone, cc->order);
 		}
 
@@ -820,7 +919,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
 {
 	struct compact_control cc = {
 		.order = order,
-		.sync = false,
+		.mode = COMPACT_ASYNC_MOVABLE,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
@@ -830,7 +929,7 @@ static int compact_node(int nid)
 {
 	struct compact_control cc = {
 		.order = -1,
-		.sync = true,
+		.mode = COMPACT_SYNC,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/filemap.c b/mm/filemap.c
index 79c4b2b0b14e..64b48f934b89 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,7 +29,6 @@
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
 #include <linux/security.h>
-#include <linux/syscalls.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
@@ -1478,44 +1477,6 @@ out:
 }
 EXPORT_SYMBOL(generic_file_aio_read);
 
-static ssize_t
-do_readahead(struct address_space *mapping, struct file *filp,
-	     pgoff_t index, unsigned long nr)
-{
-	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
-		return -EINVAL;
-
-	force_page_cache_readahead(mapping, filp, index, nr);
-	return 0;
-}
-
-SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
-{
-	ssize_t ret;
-	struct file *file;
-
-	ret = -EBADF;
-	file = fget(fd);
-	if (file) {
-		if (file->f_mode & FMODE_READ) {
-			struct address_space *mapping = file->f_mapping;
-			pgoff_t start = offset >> PAGE_CACHE_SHIFT;
-			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
-			unsigned long len = end - start + 1;
-			ret = do_readahead(mapping, file, start, len);
-		}
-		fput(file);
-	}
-	return ret;
-}
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
-{
-	return SYSC_readahead((int) fd, offset, (size_t) count);
-}
-SYSCALL_ALIAS(sys_readahead, SyS_readahead);
-#endif
-
 #ifdef CONFIG_MMU
 /**
  * page_cache_read - adds requested page to the page cache if not already there
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f0e5306eeb55..57c4b9309015 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -636,16 +636,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 					unsigned long haddr, pmd_t *pmd,
 					struct page *page)
 {
-	int ret = 0;
 	pgtable_t pgtable;
 
 	VM_BUG_ON(!PageCompound(page));
 	pgtable = pte_alloc_one(mm, haddr);
-	if (unlikely(!pgtable)) {
-		mem_cgroup_uncharge_page(page);
-		put_page(page);
+	if (unlikely(!pgtable))
 		return VM_FAULT_OOM;
-	}
 
 	clear_huge_page(page, haddr, HPAGE_PMD_NR);
 	__SetPageUptodate(page);
@@ -675,7 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		spin_unlock(&mm->page_table_lock);
 	}
 
-	return ret;
+	return 0;
 }
 
 static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
@@ -724,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			put_page(page);
 			goto out;
 		}
+		if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
+							  page))) {
+			mem_cgroup_uncharge_page(page);
+			put_page(page);
+			goto out;
+		}
 
-		return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
+		return 0;
 	}
 out:
 	/*
@@ -950,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
 						   pmd, orig_pmd, page, haddr);
+		if (ret & VM_FAULT_OOM)
+			split_huge_page(page);
 		put_page(page);
 		goto out;
 	}
@@ -957,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
+		split_huge_page(page);
 		put_page(page);
 		ret |= VM_FAULT_OOM;
 		goto out;
@@ -968,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(&mm->page_table_lock);
 	put_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
+		spin_unlock(&mm->page_table_lock);
 		mem_cgroup_uncharge_page(new_page);
 		put_page(new_page);
+		goto out;
 	} else {
 		pmd_t entry;
 		VM_BUG_ON(!PageHead(page));
@@ -1224,10 +1231,13 @@ static void __split_huge_page_refcount(struct page *page)
 {
 	int i;
 	struct zone *zone = page_zone(page);
+	struct lruvec *lruvec;
 	int tail_count = 0;
 
 	/* prevent PageLRU to go away from under us, and freeze lru stats */
 	spin_lock_irq(&zone->lru_lock);
+	lruvec = mem_cgroup_page_lruvec(page, zone);
+
 	compound_lock(page);
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(page);
@@ -1302,13 +1312,12 @@ static void __split_huge_page_refcount(struct page *page)
 		BUG_ON(!PageDirty(page_tail));
 		BUG_ON(!PageSwapBacked(page_tail));
 
-
-		lru_add_page_tail(zone, page, page_tail);
+		lru_add_page_tail(page, page_tail, lruvec);
 	}
 	atomic_sub(tail_count, &page->_count);
 	BUG_ON(atomic_read(&page->_count) <= 0);
 
-	__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+	__mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
 	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
 
 	ClearPageCompound(page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4e28416c47fb..e198831276a3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t)
 
 	/* Locate each segment we overlap with, and count that overlap. */
 	list_for_each_entry(rg, head, link) {
-		int seg_from;
-		int seg_to;
+		long seg_from;
+		long seg_to;
 
 		if (rg->to <= f)
 			continue;
@@ -2157,6 +2157,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
 		kref_get(&reservations->refs);
 }
 
+static void resv_map_put(struct vm_area_struct *vma)
+{
+	struct resv_map *reservations = vma_resv_map(vma);
+
+	if (!reservations)
+		return;
+	kref_put(&reservations->refs, resv_map_release);
+}
+
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
 	struct hstate *h = hstate_vma(vma);
@@ -2173,7 +2182,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 		reserve = (end - start) -
 			region_count(&reservations->regions, start, end);
 
-		kref_put(&reservations->refs, resv_map_release);
+		resv_map_put(vma);
 
 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
@@ -2991,12 +3000,16 @@ int hugetlb_reserve_pages(struct inode *inode,
 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
 
-	if (chg < 0)
-		return chg;
+	if (chg < 0) {
+		ret = chg;
+		goto out_err;
+	}
 
 	/* There must be enough pages in the subpool for the mapping */
-	if (hugepage_subpool_get_pages(spool, chg))
-		return -ENOSPC;
+	if (hugepage_subpool_get_pages(spool, chg)) {
+		ret = -ENOSPC;
+		goto out_err;
+	}
 
 	/*
 	 * Check enough hugepages are available for the reservation.
@@ -3005,7 +3018,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
 		hugepage_subpool_put_pages(spool, chg);
-		return ret;
+		goto out_err;
 	}
 
 	/*
@@ -3022,6 +3035,10 @@ int hugetlb_reserve_pages(struct inode *inode,
 	if (!vma || vma->vm_flags & VM_MAYSHARE)
 		region_add(&inode->i_mapping->private_list, from, to);
 	return 0;
+out_err:
+	if (vma)
+		resv_map_put(vma);
+	return ret;
 }
 
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
diff --git a/mm/internal.h b/mm/internal.h
index aee4761cf9a9..4194ab9dc19b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -94,6 +94,9 @@ extern void putback_lru_page(struct page *page);
 /*
  * in mm/page_alloc.c
  */
+extern void set_pageblock_migratetype(struct page *page, int migratetype);
+extern int move_freepages_block(struct zone *zone, struct page *page,
+				int migratetype);
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 extern void prep_compound_page(struct page *page, unsigned long order);
 #ifdef CONFIG_MEMORY_FAILURE
@@ -101,6 +104,7 @@ extern bool is_free_buddy_page(struct page *page);
 #endif
 
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
+#include <linux/compaction.h>
 
 /*
  * in mm/compaction.c
@@ -119,11 +123,14 @@ struct compact_control {
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
 	unsigned long free_pfn;		/* isolate_freepages search base */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
-	bool sync;			/* Synchronous migration */
+	enum compact_mode mode;		/* Compaction mode */
 
 	int order;			/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
+
+	/* Number of UNMOVABLE destination pageblocks skipped during scan */
+	unsigned long nr_pageblocks_skipped;
 };
 
 unsigned long
@@ -164,7 +171,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
  * to determine if it's being mapped into a LOCKED vma.
  * If so, mark page as mlocked.
  */
-static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
+static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
+				    struct page *page)
 {
 	VM_BUG_ON(PageLRU(page));
 
@@ -222,7 +230,7 @@ extern unsigned long vma_address(struct page *page,
 				 struct vm_area_struct *vma);
 #endif
 #else /* !CONFIG_MMU */
-static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
+static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
 {
 	return 0;
 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 1ccbba5b6674..deff1b64a08c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,8 +11,10 @@
 #include <linux/mempolicy.h>
 #include <linux/page-isolation.h>
 #include <linux/hugetlb.h>
+#include <linux/falloc.h>
 #include <linux/sched.h>
 #include <linux/ksm.h>
+#include <linux/fs.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -200,8 +202,7 @@ static long madvise_remove(struct vm_area_struct *vma,
 				struct vm_area_struct **prev,
 				unsigned long start, unsigned long end)
 {
-	struct address_space *mapping;
-	loff_t offset, endoff;
+	loff_t offset;
 	int error;
 
 	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
@@ -217,16 +218,14 @@ static long madvise_remove(struct vm_area_struct *vma,
 	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
 		return -EACCES;
 
-	mapping = vma->vm_file->f_mapping;
-
 	offset = (loff_t)(start - vma->vm_start)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
-	endoff = (loff_t)(end - vma->vm_start - 1)
-			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	/* vmtruncate_range needs to take i_mutex */
+	/* filesystem's fallocate may need to take i_mutex */
 	up_read(&current->mm->mmap_sem);
-	error = vmtruncate_range(mapping->host, offset, endoff);
+	error = do_fallocate(vma->vm_file,
+				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+				offset, end - start);
 	down_read(&current->mm->mmap_sem);
 	return error;
 }
diff --git a/mm/memblock.c b/mm/memblock.c
index a44eab3157f8..952123eba433 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = {
 
 int memblock_debug __initdata_memblock;
 static int memblock_can_resize __initdata_memblock;
+static int memblock_memory_in_slab __initdata_memblock = 0;
+static int memblock_reserved_in_slab __initdata_memblock = 0;
 
 /* inline so we don't get a warning when pr_debug is compiled out */
 static inline const char *memblock_type_name(struct memblock_type *type)
@@ -187,6 +189,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 	struct memblock_region *new_array, *old_array;
 	phys_addr_t old_size, new_size, addr;
 	int use_slab = slab_is_available();
+	int *in_slab;
 
 	/* We don't allow resizing until we know about the reserved regions
 	 * of memory that aren't suitable for allocation
@@ -198,6 +201,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 	old_size = type->max * sizeof(struct memblock_region);
 	new_size = old_size << 1;
 
+	/* Retrieve the slab flag */
+	if (type == &memblock.memory)
+		in_slab = &memblock_memory_in_slab;
+	else
+		in_slab = &memblock_reserved_in_slab;
+
 	/* Try to find some space for it.
 	 *
 	 * WARNING: We assume that either slab_is_available() and we use it or
@@ -212,14 +221,15 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 	if (use_slab) {
 		new_array = kmalloc(new_size, GFP_KERNEL);
 		addr = new_array ? __pa(new_array) : 0;
-	} else
+	} else {
 		addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
+		new_array = addr ? __va(addr) : 0;
+	}
 	if (!addr) {
 		pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
 		       memblock_type_name(type), type->max, type->max * 2);
 		return -1;
 	}
-	new_array = __va(addr);
 
 	memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
 		 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
@@ -234,22 +244,24 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 	type->regions = new_array;
 	type->max <<= 1;
 
-	/* If we use SLAB that's it, we are done */
-	if (use_slab)
-		return 0;
-
-	/* Add the new reserved region now. Should not fail ! */
-	BUG_ON(memblock_reserve(addr, new_size));
-
-	/* If the array wasn't our static init one, then free it. We only do
-	 * that before SLAB is available as later on, we don't know whether
-	 * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
-	 * anyways
+	/* Free old array. We needn't free it if the array is the
+	 * static one
 	 */
-	if (old_array != memblock_memory_init_regions &&
-	    old_array != memblock_reserved_init_regions)
+	if (*in_slab)
+		kfree(old_array);
+	else if (old_array != memblock_memory_init_regions &&
+		 old_array != memblock_reserved_init_regions)
 		memblock_free(__pa(old_array), old_size);
 
+	/* Reserve the new array if that comes from the memblock.
+	 * Otherwise, we needn't do it
+	 */
+	if (!use_slab)
+		BUG_ON(memblock_reserve(addr, new_size));
+
+	/* Update slab flag */
+	*in_slab = use_slab;
+
 	return 0;
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f342778a0c0a..ac35bccadb7b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,7 +59,7 @@
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
-struct mem_cgroup *root_mem_cgroup __read_mostly;
+static struct mem_cgroup *root_mem_cgroup __read_mostly;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -73,7 +73,7 @@ static int really_do_swap_account __initdata = 0;
 #endif
 
 #else
-#define do_swap_account		(0)
+#define do_swap_account		0
 #endif
 
 
@@ -88,18 +88,31 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
-	MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
 	MEM_CGROUP_STAT_NSTATS,
 };
 
+static const char * const mem_cgroup_stat_names[] = {
+	"cache",
+	"rss",
+	"mapped_file",
+	"swap",
+};
+
 enum mem_cgroup_events_index {
 	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
 	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
-	MEM_CGROUP_EVENTS_COUNT,	/* # of pages paged in/out */
 	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
 	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
 	MEM_CGROUP_EVENTS_NSTATS,
 };
+
+static const char * const mem_cgroup_events_names[] = {
+	"pgpgin",
+	"pgpgout",
+	"pgfault",
+	"pgmajfault",
+};
+
 /*
  * Per memcg event counter is incremented at every pagein/pageout. With THP,
  * it will be incremated by the number of pages. This counter is used for
@@ -112,13 +125,14 @@ enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
-#define THRESHOLDS_EVENTS_TARGET (128)
-#define SOFTLIMIT_EVENTS_TARGET (1024)
-#define NUMAINFO_EVENTS_TARGET	(1024)
+#define THRESHOLDS_EVENTS_TARGET 128
+#define SOFTLIMIT_EVENTS_TARGET 1024
+#define NUMAINFO_EVENTS_TARGET	1024
 
 struct mem_cgroup_stat_cpu {
 	long count[MEM_CGROUP_STAT_NSTATS];
 	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
+	unsigned long nr_page_events;
 	unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
@@ -138,7 +152,6 @@ struct mem_cgroup_per_zone {
 
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
-	struct zone_reclaim_stat reclaim_stat;
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long long	usage_in_excess;/* Set to the value by which */
 						/* the soft limit is exceeded*/
@@ -182,7 +195,7 @@ struct mem_cgroup_threshold {
 
 /* For threshold */
 struct mem_cgroup_threshold_ary {
-	/* An array index points to threshold just below usage. */
+	/* An array index points to threshold just below or equal to usage. */
 	int current_threshold;
 	/* Size of entries[] */
 	unsigned int size;
@@ -245,8 +258,8 @@ struct mem_cgroup {
 		 */
 		struct rcu_head rcu_freeing;
 		/*
-		 * But when using vfree(), that cannot be done at
-		 * interrupt time, so we must then queue the work.
+		 * We also need some space for a worker in deferred freeing.
+		 * By the time we call it, rcu_freeing is no longer in use.
 		 */
 		struct work_struct work_freeing;
 	};
@@ -305,7 +318,7 @@ struct mem_cgroup {
 	/*
 	 * percpu counter.
 	 */
-	struct mem_cgroup_stat_cpu *stat;
+	struct mem_cgroup_stat_cpu __percpu *stat;
 	/*
 	 * used when a cpu is offlined or other synchronizations
 	 * See mem_cgroup_read_stat().
@@ -360,8 +373,8 @@ static bool move_file(void)
  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
-#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
-#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
+#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
+#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -377,8 +390,8 @@ enum charge_type {
 #define _MEM			(0)
 #define _MEMSWAP		(1)
 #define _OOM_TYPE		(2)
-#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
-#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
+#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
+#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
 /* Used for OOM nofiier */
 #define OOM_CONTROL		(0)
@@ -404,6 +417,7 @@ void sock_update_memcg(struct sock *sk)
 {
 	if (mem_cgroup_sockets_enabled) {
 		struct mem_cgroup *memcg;
+		struct cg_proto *cg_proto;
 
 		BUG_ON(!sk->sk_prot->proto_cgroup);
 
@@ -423,9 +437,10 @@ void sock_update_memcg(struct sock *sk)
 
 		rcu_read_lock();
 		memcg = mem_cgroup_from_task(current);
-		if (!mem_cgroup_is_root(memcg)) {
+		cg_proto = sk->sk_prot->proto_cgroup(memcg);
+		if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
 			mem_cgroup_get(memcg);
-			sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
+			sk->sk_cgrp = cg_proto;
 		}
 		rcu_read_unlock();
 	}
@@ -454,6 +469,19 @@ EXPORT_SYMBOL(tcp_proto_cgroup);
 #endif /* CONFIG_INET */
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
+#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
+static void disarm_sock_keys(struct mem_cgroup *memcg)
+{
+	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+		return;
+	static_key_slow_dec(&memcg_socket_limit_enabled);
+}
+#else
+static void disarm_sock_keys(struct mem_cgroup *memcg)
+{
+}
+#endif
+
 static void drain_all_stock_async(struct mem_cgroup *memcg);
 
 static struct mem_cgroup_per_zone *
@@ -718,12 +746,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 		nr_pages = -nr_pages; /* for event */
 	}
 
-	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
+	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 
 	preempt_enable();
 }
 
 unsigned long
+mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+	return mz->lru_size[lru];
+}
+
+static unsigned long
 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
 			unsigned int lru_mask)
 {
@@ -770,7 +807,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 {
 	unsigned long val, next;
 
-	val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+	val = __this_cpu_read(memcg->stat->nr_page_events);
 	next = __this_cpu_read(memcg->stat->targets[target]);
 	/* from time_after() in jiffies.h */
 	if ((long)next - (long)val < 0) {
@@ -1013,7 +1050,7 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event);
 /**
  * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
  * @zone: zone of the wanted lruvec
- * @mem: memcg of the wanted lruvec
+ * @memcg: memcg of the wanted lruvec
  *
  * Returns the lru list vector holding pages for the given @zone and
  * @mem.  This can be the global zone lruvec, if the memory controller
@@ -1046,19 +1083,11 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
  */
 
 /**
- * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec
- * @zone: zone of the page
+ * mem_cgroup_page_lruvec - return lruvec for adding an lru page
  * @page: the page
- * @lru: current lru
- *
- * This function accounts for @page being added to @lru, and returns
- * the lruvec for the given @zone and the memcg @page is charged to.
- *
- * The callsite is then responsible for physically linking the page to
- * the returned lruvec->lists[@lru].
+ * @zone: zone of the page
  */
-struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
-				       enum lru_list lru)
+struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
 {
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup *memcg;
@@ -1071,7 +1100,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
 	memcg = pc->mem_cgroup;
 
 	/*
-	 * Surreptitiously switch any uncharged page to root:
+	 * Surreptitiously switch any uncharged offlist page to root:
 	 * an uncharged page off lru does nothing to secure
 	 * its former mem_cgroup from sudden removal.
 	 *
@@ -1079,85 +1108,60 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
 	 * under page_cgroup lock: between them, they make all uses
 	 * of pc->mem_cgroup safe.
 	 */
-	if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
+	if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
 		pc->mem_cgroup = memcg = root_mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(memcg, page);
-	/* compound_order() is stabilized through lru_lock */
-	mz->lru_size[lru] += 1 << compound_order(page);
 	return &mz->lruvec;
 }
 
 /**
- * mem_cgroup_lru_del_list - account for removing an lru page
- * @page: the page
- * @lru: target lru
- *
- * This function accounts for @page being removed from @lru.
+ * mem_cgroup_update_lru_size - account for adding or removing an lru page
+ * @lruvec: mem_cgroup per zone lru vector
+ * @lru: index of lru list the page is sitting on
+ * @nr_pages: positive when adding or negative when removing
  *
- * The callsite is then responsible for physically unlinking
- * @page->lru.
+ * This function must be called when a page is added to or removed from an
+ * lru list.
  */
-void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+				int nr_pages)
 {
 	struct mem_cgroup_per_zone *mz;
-	struct mem_cgroup *memcg;
-	struct page_cgroup *pc;
+	unsigned long *lru_size;
 
 	if (mem_cgroup_disabled())
 		return;
 
-	pc = lookup_page_cgroup(page);
-	memcg = pc->mem_cgroup;
-	VM_BUG_ON(!memcg);
-	mz = page_cgroup_zoneinfo(memcg, page);
-	/* huge page split is done under lru_lock. so, we have no races. */
-	VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
-	mz->lru_size[lru] -= 1 << compound_order(page);
-}
-
-void mem_cgroup_lru_del(struct page *page)
-{
-	mem_cgroup_lru_del_list(page, page_lru(page));
-}
-
-/**
- * mem_cgroup_lru_move_lists - account for moving a page between lrus
- * @zone: zone of the page
- * @page: the page
- * @from: current lru
- * @to: target lru
- *
- * This function accounts for @page being moved between the lrus @from
- * and @to, and returns the lruvec for the given @zone and the memcg
- * @page is charged to.
- *
- * The callsite is then responsible for physically relinking
- * @page->lru to the returned lruvec->lists[@to].
- */
-struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
-					 struct page *page,
-					 enum lru_list from,
-					 enum lru_list to)
-{
-	/* XXX: Optimize this, especially for @from == @to */
-	mem_cgroup_lru_del_list(page, from);
-	return mem_cgroup_lru_add_list(zone, page, to);
+	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+	lru_size = mz->lru_size + lru;
+	*lru_size += nr_pages;
+	VM_BUG_ON((long)(*lru_size) < 0);
 }
 
 /*
  * Checks whether given mem is same or in the root_mem_cgroup's
  * hierarchy subtree
  */
+bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+				  struct mem_cgroup *memcg)
+{
+	if (root_memcg == memcg)
+		return true;
+	if (!root_memcg->use_hierarchy)
+		return false;
+	return css_is_ancestor(&memcg->css, &root_memcg->css);
+}
+
 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
-		struct mem_cgroup *memcg)
+				       struct mem_cgroup *memcg)
 {
-	if (root_memcg != memcg) {
-		return (root_memcg->use_hierarchy &&
-			css_is_ancestor(&memcg->css, &root_memcg->css));
-	}
+	bool ret;
 
-	return true;
+	rcu_read_lock();
+	ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
+	rcu_read_unlock();
+	return ret;
 }
 
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
@@ -1195,19 +1199,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 	return ret;
 }
 
-int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
+int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 {
 	unsigned long inactive_ratio;
-	int nid = zone_to_nid(zone);
-	int zid = zone_idx(zone);
 	unsigned long inactive;
 	unsigned long active;
 	unsigned long gb;
 
-	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
-						BIT(LRU_INACTIVE_ANON));
-	active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
-					      BIT(LRU_ACTIVE_ANON));
+	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
+	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
 
 	gb = (inactive + active) >> (30 - PAGE_SHIFT);
 	if (gb)
@@ -1218,49 +1218,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
 	return inactive * inactive_ratio < active;
 }
 
-int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
+int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
 {
 	unsigned long active;
 	unsigned long inactive;
-	int zid = zone_idx(zone);
-	int nid = zone_to_nid(zone);
 
-	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
-						BIT(LRU_INACTIVE_FILE));
-	active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
-					      BIT(LRU_ACTIVE_FILE));
+	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
+	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
 
 	return (active > inactive);
 }
 
-struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
-						      struct zone *zone)
-{
-	int nid = zone_to_nid(zone);
-	int zid = zone_idx(zone);
-	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-
-	return &mz->reclaim_stat;
-}
-
-struct zone_reclaim_stat *
-mem_cgroup_get_reclaim_stat_from_page(struct page *page)
-{
-	struct page_cgroup *pc;
-	struct mem_cgroup_per_zone *mz;
-
-	if (mem_cgroup_disabled())
-		return NULL;
-
-	pc = lookup_page_cgroup(page);
-	if (!PageCgroupUsed(pc))
-		return NULL;
-	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
-	smp_rmb();
-	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
-	return &mz->reclaim_stat;
-}
-
 #define mem_cgroup_from_res_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 
@@ -1634,7 +1602,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
  * unused nodes. But scan_nodes is lazily updated and may not cotain
  * enough new information. We need to do double check.
  */
-bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
 	int nid;
 
@@ -1669,7 +1637,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return 0;
 }
 
-bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
 	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
 }
@@ -1843,7 +1811,8 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 /*
  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  */
-bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
+				  int order)
 {
 	struct oom_wait_info owait;
 	bool locked, need_to_kill;
@@ -1992,7 +1961,7 @@ struct memcg_stock_pcp {
 	unsigned int nr_pages;
 	struct work_struct work;
 	unsigned long flags;
-#define FLUSHING_CACHED_CHARGE	(0)
+#define FLUSHING_CACHED_CHARGE	0
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
 static DEFINE_MUTEX(percpu_charge_mutex);
@@ -2139,7 +2108,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
 	int i;
 
 	spin_lock(&memcg->pcp_counter_lock);
-	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
+	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 		long x = per_cpu(memcg->stat->count[i], cpu);
 
 		per_cpu(memcg->stat->count[i], cpu) = 0;
@@ -2427,6 +2396,24 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
 }
 
 /*
+ * Cancel chrages in this cgroup....doesn't propagate to parent cgroup.
+ * This is useful when moving usage to parent cgroup.
+ */
+static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
+					unsigned int nr_pages)
+{
+	unsigned long bytes = nr_pages * PAGE_SIZE;
+
+	if (mem_cgroup_is_root(memcg))
+		return;
+
+	res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
+	if (do_swap_account)
+		res_counter_uncharge_until(&memcg->memsw,
+						memcg->memsw.parent, bytes);
+}
+
+/*
  * A helper function to get mem_cgroup from ID. must be called under
  * rcu_read_lock(). The caller must check css_is_removed() or some if
  * it's concern. (dropping refcnt from swap can be called against removed
@@ -2481,6 +2468,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 {
 	struct page_cgroup *pc = lookup_page_cgroup(page);
 	struct zone *uninitialized_var(zone);
+	struct lruvec *lruvec;
 	bool was_on_lru = false;
 	bool anon;
 
@@ -2503,8 +2491,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 		zone = page_zone(page);
 		spin_lock_irq(&zone->lru_lock);
 		if (PageLRU(page)) {
+			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
 			ClearPageLRU(page);
-			del_page_from_lru_list(zone, page, page_lru(page));
+			del_page_from_lru_list(page, lruvec, page_lru(page));
 			was_on_lru = true;
 		}
 	}
@@ -2522,9 +2511,10 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 
 	if (lrucare) {
 		if (was_on_lru) {
+			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
 			VM_BUG_ON(PageLRU(page));
 			SetPageLRU(page);
-			add_page_to_lru_list(zone, page, page_lru(page));
+			add_page_to_lru_list(page, lruvec, page_lru(page));
 		}
 		spin_unlock_irq(&zone->lru_lock);
 	}
@@ -2547,7 +2537,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
-#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
+#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
 /*
  * Because tail pages are not marked as "used", set it. We're under
  * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2578,23 +2568,19 @@ void mem_cgroup_split_huge_fixup(struct page *head)
  * @pc:	page_cgroup of the page.
  * @from: mem_cgroup which the page is moved from.
  * @to:	mem_cgroup which the page is moved to. @from != @to.
- * @uncharge: whether we should call uncharge and css_put against @from.
  *
  * The caller must confirm following.
  * - page is not on LRU (isolate_page() is useful.)
  * - compound_lock is held when nr_pages > 1
  *
- * This function doesn't do "charge" nor css_get to new cgroup. It should be
- * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is
- * true, this function does "uncharge" from old cgroup, but it doesn't if
- * @uncharge is false, so a caller should do "uncharge".
+ * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
+ * from old cgroup.
  */
 static int mem_cgroup_move_account(struct page *page,
 				   unsigned int nr_pages,
 				   struct page_cgroup *pc,
 				   struct mem_cgroup *from,
-				   struct mem_cgroup *to,
-				   bool uncharge)
+				   struct mem_cgroup *to)
 {
 	unsigned long flags;
 	int ret;
@@ -2628,9 +2614,6 @@ static int mem_cgroup_move_account(struct page *page,
 		preempt_enable();
 	}
 	mem_cgroup_charge_statistics(from, anon, -nr_pages);
-	if (uncharge)
-		/* This is not "cancel", but cancel_charge does all we need. */
-		__mem_cgroup_cancel_charge(from, nr_pages);
 
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
@@ -2664,15 +2647,13 @@ static int mem_cgroup_move_parent(struct page *page,
 				  struct mem_cgroup *child,
 				  gfp_t gfp_mask)
 {
-	struct cgroup *cg = child->css.cgroup;
-	struct cgroup *pcg = cg->parent;
 	struct mem_cgroup *parent;
 	unsigned int nr_pages;
 	unsigned long uninitialized_var(flags);
 	int ret;
 
 	/* Is ROOT ? */
-	if (!pcg)
+	if (mem_cgroup_is_root(child))
 		return -EINVAL;
 
 	ret = -EBUSY;
@@ -2683,21 +2664,23 @@ static int mem_cgroup_move_parent(struct page *page,
 
 	nr_pages = hpage_nr_pages(page);
 
-	parent = mem_cgroup_from_cont(pcg);
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
-	if (ret)
-		goto put_back;
+	parent = parent_mem_cgroup(child);
+	/*
+	 * If no parent, move charges to root cgroup.
+	 */
+	if (!parent)
+		parent = root_mem_cgroup;
 
 	if (nr_pages > 1)
 		flags = compound_lock_irqsave(page);
 
-	ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
-	if (ret)
-		__mem_cgroup_cancel_charge(parent, nr_pages);
+	ret = mem_cgroup_move_account(page, nr_pages,
+				pc, child, parent);
+	if (!ret)
+		__mem_cgroup_cancel_local_charge(child, nr_pages);
 
 	if (nr_pages > 1)
 		compound_unlock_irqrestore(page, flags);
-put_back:
 	putback_lru_page(page);
 put:
 	put_page(page);
@@ -2845,24 +2828,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 	 */
 	if (do_swap_account && PageSwapCache(page)) {
 		swp_entry_t ent = {.val = page_private(page)};
-		struct mem_cgroup *swap_memcg;
-		unsigned short id;
-
-		id = swap_cgroup_record(ent, 0);
-		rcu_read_lock();
-		swap_memcg = mem_cgroup_lookup(id);
-		if (swap_memcg) {
-			/*
-			 * This recorded memcg can be obsolete one. So, avoid
-			 * calling css_tryget
-			 */
-			if (!mem_cgroup_is_root(swap_memcg))
-				res_counter_uncharge(&swap_memcg->memsw,
-						     PAGE_SIZE);
-			mem_cgroup_swap_statistics(swap_memcg, false);
-			mem_cgroup_put(swap_memcg);
-		}
-		rcu_read_unlock();
+		mem_cgroup_uncharge_swap(ent);
 	}
 	/*
 	 * At swapin, we may charge account against cgroup which has no tasks.
@@ -3155,7 +3121,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
  * @entry: swap entry to be moved
  * @from:  mem_cgroup which the entry is moved from
  * @to:  mem_cgroup which the entry is moved to
- * @need_fixup: whether we should fixup res_counters and refcounts.
  *
  * It succeeds only when the swap_cgroup's record for this entry is the same
  * as the mem_cgroup's id of @from.
@@ -3166,7 +3131,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
  * both res and memsw, and called css_get().
  */
 static int mem_cgroup_move_swap_account(swp_entry_t entry,
-		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
+				struct mem_cgroup *from, struct mem_cgroup *to)
 {
 	unsigned short old_id, new_id;
 
@@ -3185,24 +3150,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 		 * swap-in, the refcount of @to might be decreased to 0.
 		 */
 		mem_cgroup_get(to);
-		if (need_fixup) {
-			if (!mem_cgroup_is_root(from))
-				res_counter_uncharge(&from->memsw, PAGE_SIZE);
-			mem_cgroup_put(from);
-			/*
-			 * we charged both to->res and to->memsw, so we should
-			 * uncharge to->res.
-			 */
-			if (!mem_cgroup_is_root(to))
-				res_counter_uncharge(&to->res, PAGE_SIZE);
-		}
 		return 0;
 	}
 	return -EINVAL;
 }
 #else
 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
-		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
+				struct mem_cgroup *from, struct mem_cgroup *to)
 {
 	return -EINVAL;
 }
@@ -3363,7 +3317,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 void mem_cgroup_replace_page_cache(struct page *oldpage,
 				  struct page *newpage)
 {
-	struct mem_cgroup *memcg;
+	struct mem_cgroup *memcg = NULL;
 	struct page_cgroup *pc;
 	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
 
@@ -3373,11 +3327,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
 	pc = lookup_page_cgroup(oldpage);
 	/* fix accounting on old pages */
 	lock_page_cgroup(pc);
-	memcg = pc->mem_cgroup;
-	mem_cgroup_charge_statistics(memcg, false, -1);
-	ClearPageCgroupUsed(pc);
+	if (PageCgroupUsed(pc)) {
+		memcg = pc->mem_cgroup;
+		mem_cgroup_charge_statistics(memcg, false, -1);
+		ClearPageCgroupUsed(pc);
+	}
 	unlock_page_cgroup(pc);
 
+	/*
+	 * When called from shmem_replace_page(), in some cases the
+	 * oldpage has already been charged, and in some cases not.
+	 */
+	if (!memcg)
+		return;
+
 	if (PageSwapBacked(oldpage))
 		type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
 
@@ -3793,7 +3756,7 @@ try_to_free:
 	goto move_account;
 }
 
-int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
+static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
 {
 	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
 }
@@ -4051,103 +4014,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 }
 #endif
 
-
-/* For read statistics */
-enum {
-	MCS_CACHE,
-	MCS_RSS,
-	MCS_FILE_MAPPED,
-	MCS_PGPGIN,
-	MCS_PGPGOUT,
-	MCS_SWAP,
-	MCS_PGFAULT,
-	MCS_PGMAJFAULT,
-	MCS_INACTIVE_ANON,
-	MCS_ACTIVE_ANON,
-	MCS_INACTIVE_FILE,
-	MCS_ACTIVE_FILE,
-	MCS_UNEVICTABLE,
-	NR_MCS_STAT,
-};
-
-struct mcs_total_stat {
-	s64 stat[NR_MCS_STAT];
-};
-
-struct {
-	char *local_name;
-	char *total_name;
-} memcg_stat_strings[NR_MCS_STAT] = {
-	{"cache", "total_cache"},
-	{"rss", "total_rss"},
-	{"mapped_file", "total_mapped_file"},
-	{"pgpgin", "total_pgpgin"},
-	{"pgpgout", "total_pgpgout"},
-	{"swap", "total_swap"},
-	{"pgfault", "total_pgfault"},
-	{"pgmajfault", "total_pgmajfault"},
-	{"inactive_anon", "total_inactive_anon"},
-	{"active_anon", "total_active_anon"},
-	{"inactive_file", "total_inactive_file"},
-	{"active_file", "total_active_file"},
-	{"unevictable", "total_unevictable"}
-};
-
-
-static void
-mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
-{
-	s64 val;
-
-	/* per cpu stat */
-	val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
-	s->stat[MCS_CACHE] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
-	s->stat[MCS_RSS] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
-	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
-	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
-	s->stat[MCS_PGPGIN] += val;
-	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
-	s->stat[MCS_PGPGOUT] += val;
-	if (do_swap_account) {
-		val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
-		s->stat[MCS_SWAP] += val * PAGE_SIZE;
-	}
-	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
-	s->stat[MCS_PGFAULT] += val;
-	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
-	s->stat[MCS_PGMAJFAULT] += val;
-
-	/* per zone stat */
-	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
-	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
-	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
-	s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
-	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
-	s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
-	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
-	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
-	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
-	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
-}
-
-static void
-mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
-{
-	struct mem_cgroup *iter;
-
-	for_each_mem_cgroup_tree(iter, memcg)
-		mem_cgroup_get_local_stat(iter, s);
-}
-
 #ifdef CONFIG_NUMA
-static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
+static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft,
+				      struct seq_file *m)
 {
 	int nid;
 	unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
 	unsigned long node_nr;
-	struct cgroup *cont = m->private;
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
 	total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
@@ -4188,64 +4061,100 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
 }
 #endif /* CONFIG_NUMA */
 
+static const char * const mem_cgroup_lru_names[] = {
+	"inactive_anon",
+	"active_anon",
+	"inactive_file",
+	"active_file",
+	"unevictable",
+};
+
+static inline void mem_cgroup_lru_names_not_uptodate(void)
+{
+	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
+}
+
 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
-				 struct cgroup_map_cb *cb)
+				 struct seq_file *m)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
-	struct mcs_total_stat mystat;
-	int i;
-
-	memset(&mystat, 0, sizeof(mystat));
-	mem_cgroup_get_local_stat(memcg, &mystat);
+	struct mem_cgroup *mi;
+	unsigned int i;
 
-
-	for (i = 0; i < NR_MCS_STAT; i++) {
-		if (i == MCS_SWAP && !do_swap_account)
+	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+		if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
 			continue;
-		cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
+		seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
+			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
 	}
 
+	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
+		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
+			   mem_cgroup_read_events(memcg, i));
+
+	for (i = 0; i < NR_LRU_LISTS; i++)
+		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
+			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+
 	/* Hierarchical information */
 	{
 		unsigned long long limit, memsw_limit;
 		memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
-		cb->fill(cb, "hierarchical_memory_limit", limit);
+		seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
 		if (do_swap_account)
-			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
+			seq_printf(m, "hierarchical_memsw_limit %llu\n",
+				   memsw_limit);
 	}
 
-	memset(&mystat, 0, sizeof(mystat));
-	mem_cgroup_get_total_stat(memcg, &mystat);
-	for (i = 0; i < NR_MCS_STAT; i++) {
-		if (i == MCS_SWAP && !do_swap_account)
+	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+		long long val = 0;
+
+		if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
 			continue;
-		cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
+		for_each_mem_cgroup_tree(mi, memcg)
+			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
+		seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
+	}
+
+	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+		unsigned long long val = 0;
+
+		for_each_mem_cgroup_tree(mi, memcg)
+			val += mem_cgroup_read_events(mi, i);
+		seq_printf(m, "total_%s %llu\n",
+			   mem_cgroup_events_names[i], val);
+	}
+
+	for (i = 0; i < NR_LRU_LISTS; i++) {
+		unsigned long long val = 0;
+
+		for_each_mem_cgroup_tree(mi, memcg)
+			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
+		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
 	}
 
 #ifdef CONFIG_DEBUG_VM
 	{
 		int nid, zid;
 		struct mem_cgroup_per_zone *mz;
+		struct zone_reclaim_stat *rstat;
 		unsigned long recent_rotated[2] = {0, 0};
 		unsigned long recent_scanned[2] = {0, 0};
 
 		for_each_online_node(nid)
 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 				mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+				rstat = &mz->lruvec.reclaim_stat;
 
-				recent_rotated[0] +=
-					mz->reclaim_stat.recent_rotated[0];
-				recent_rotated[1] +=
-					mz->reclaim_stat.recent_rotated[1];
-				recent_scanned[0] +=
-					mz->reclaim_stat.recent_scanned[0];
-				recent_scanned[1] +=
-					mz->reclaim_stat.recent_scanned[1];
+				recent_rotated[0] += rstat->recent_rotated[0];
+				recent_rotated[1] += rstat->recent_rotated[1];
+				recent_scanned[0] += rstat->recent_scanned[0];
+				recent_scanned[1] += rstat->recent_scanned[1];
 			}
-		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
-		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
-		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
-		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
+		seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
+		seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
+		seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
+		seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
 	}
 #endif
 
@@ -4307,7 +4216,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 	usage = mem_cgroup_usage(memcg, swap);
 
 	/*
-	 * current_threshold points to threshold just below usage.
+	 * current_threshold points to threshold just below or equal to usage.
 	 * If it's not true, a threshold was crossed after last
 	 * call of __mem_cgroup_threshold().
 	 */
@@ -4433,14 +4342,15 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
 	/* Find current threshold */
 	new->current_threshold = -1;
 	for (i = 0; i < size; i++) {
-		if (new->entries[i].threshold < usage) {
+		if (new->entries[i].threshold <= usage) {
 			/*
 			 * new->current_threshold will not be used until
 			 * rcu_assign_pointer(), so it's safe to increment
 			 * it here.
 			 */
 			++new->current_threshold;
-		}
+		} else
+			break;
 	}
 
 	/* Free old spare buffer and save old primary buffer as spare */
@@ -4509,7 +4419,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
 			continue;
 
 		new->entries[j] = thresholds->primary->entries[i];
-		if (new->entries[j].threshold < usage) {
+		if (new->entries[j].threshold <= usage) {
 			/*
 			 * new->current_threshold will not be used
 			 * until rcu_assign_pointer(), so it's safe to increment
@@ -4623,22 +4533,6 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 	return 0;
 }
 
-#ifdef CONFIG_NUMA
-static const struct file_operations mem_control_numa_stat_file_operations = {
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
-{
-	struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
-
-	file->f_op = &mem_control_numa_stat_file_operations;
-	return single_open(file, mem_control_numa_stat_show, cont);
-}
-#endif /* CONFIG_NUMA */
-
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
@@ -4694,7 +4588,7 @@ static struct cftype mem_cgroup_files[] = {
 	},
 	{
 		.name = "stat",
-		.read_map = mem_control_stat_show,
+		.read_seq_string = mem_control_stat_show,
 	},
 	{
 		.name = "force_empty",
@@ -4726,8 +4620,7 @@ static struct cftype mem_cgroup_files[] = {
 #ifdef CONFIG_NUMA
 	{
 		.name = "numa_stat",
-		.open = mem_control_numa_stat_open,
-		.mode = S_IRUGO,
+		.read_seq_string = mem_control_numa_stat_show,
 	},
 #endif
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4764,7 +4657,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn;
 	struct mem_cgroup_per_zone *mz;
-	enum lru_list lru;
 	int zone, tmp = node;
 	/*
 	 * This routine is called against possible nodes.
@@ -4782,8 +4674,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
-		for_each_lru(lru)
-			INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
+		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
 		mz->memcg = memcg;
@@ -4826,23 +4717,40 @@ out_free:
 }
 
 /*
- * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
+ * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
  * but in process context.  The work_freeing structure is overlaid
  * on the rcu_freeing structure, which itself is overlaid on memsw.
  */
-static void vfree_work(struct work_struct *work)
+static void free_work(struct work_struct *work)
 {
 	struct mem_cgroup *memcg;
+	int size = sizeof(struct mem_cgroup);
 
 	memcg = container_of(work, struct mem_cgroup, work_freeing);
-	vfree(memcg);
+	/*
+	 * We need to make sure that (at least for now), the jump label
+	 * destruction code runs outside of the cgroup lock. This is because
+	 * get_online_cpus(), which is called from the static_branch update,
+	 * can't be called inside the cgroup_lock. cpusets are the ones
+	 * enforcing this dependency, so if they ever change, we might as well.
+	 *
+	 * schedule_work() will guarantee this happens. Be careful if you need
+	 * to move this code around, and make sure it is outside
+	 * the cgroup_lock.
+	 */
+	disarm_sock_keys(memcg);
+	if (size < PAGE_SIZE)
+		kfree(memcg);
+	else
+		vfree(memcg);
 }
-static void vfree_rcu(struct rcu_head *rcu_head)
+
+static void free_rcu(struct rcu_head *rcu_head)
 {
 	struct mem_cgroup *memcg;
 
 	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
-	INIT_WORK(&memcg->work_freeing, vfree_work);
+	INIT_WORK(&memcg->work_freeing, free_work);
 	schedule_work(&memcg->work_freeing);
 }
 
@@ -4868,10 +4776,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 		free_mem_cgroup_per_zone_info(memcg, node);
 
 	free_percpu(memcg->stat);
-	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
-		kfree_rcu(memcg, rcu_freeing);
-	else
-		call_rcu(&memcg->rcu_freeing, vfree_rcu);
+	call_rcu(&memcg->rcu_freeing, free_rcu);
 }
 
 static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -5135,7 +5040,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 		return NULL;
 	if (PageAnon(page)) {
 		/* we don't move shared anon */
-		if (!move_anon() || page_mapcount(page) > 2)
+		if (!move_anon())
 			return NULL;
 	} else if (!move_file())
 		/* we ignore mapcount for file pages */
@@ -5146,32 +5051,37 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 	return page;
 }
 
+#ifdef CONFIG_SWAP
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
 {
-	int usage_count;
 	struct page *page = NULL;
 	swp_entry_t ent = pte_to_swp_entry(ptent);
 
 	if (!move_anon() || non_swap_entry(ent))
 		return NULL;
-	usage_count = mem_cgroup_count_swap_user(ent, &page);
-	if (usage_count > 1) { /* we don't move shared anon */
-		if (page)
-			put_page(page);
-		return NULL;
-	}
+	/*
+	 * Because lookup_swap_cache() updates some statistics counter,
+	 * we call find_get_page() with swapper_space directly.
+	 */
+	page = find_get_page(&swapper_space, ent.val);
 	if (do_swap_account)
 		entry->val = ent.val;
 
 	return page;
 }
+#else
+static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
+			unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{
+	return NULL;
+}
+#endif
 
 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
 {
 	struct page *page = NULL;
-	struct inode *inode;
 	struct address_space *mapping;
 	pgoff_t pgoff;
 
@@ -5180,7 +5090,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 	if (!move_file())
 		return NULL;
 
-	inode = vma->vm_file->f_path.dentry->d_inode;
 	mapping = vma->vm_file->f_mapping;
 	if (pte_none(ptent))
 		pgoff = linear_page_index(vma, addr);
@@ -5479,8 +5388,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 			if (!isolate_lru_page(page)) {
 				pc = lookup_page_cgroup(page);
 				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
-							     pc, mc.from, mc.to,
-							     false)) {
+							pc, mc.from, mc.to)) {
 					mc.precharge -= HPAGE_PMD_NR;
 					mc.moved_charge += HPAGE_PMD_NR;
 				}
@@ -5510,7 +5418,7 @@ retry:
 				goto put;
 			pc = lookup_page_cgroup(page);
 			if (!mem_cgroup_move_account(page, 1, pc,
-						     mc.from, mc.to, false)) {
+						     mc.from, mc.to)) {
 				mc.precharge--;
 				/* we uncharge from mc.from later. */
 				mc.moved_charge++;
@@ -5521,8 +5429,7 @@ put:			/* get_mctgt_type() gets the page */
 			break;
 		case MC_TARGET_SWAP:
 			ent = target.ent;
-			if (!mem_cgroup_move_swap_account(ent,
-						mc.from, mc.to, false)) {
+			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
 				mc.precharge--;
 				/* we fixup refcnts and charges later. */
 				mc.moved_swap++;
@@ -5598,7 +5505,6 @@ static void mem_cgroup_move_task(struct cgroup *cont,
 	if (mm) {
 		if (mc.to)
 			mem_cgroup_move_charge(mm);
-		put_swap_token(mm);
 		mmput(mm);
 	}
 	if (mc.to)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c99ad4e6b88c..ab1e7145e290 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1388,16 +1388,16 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
 	 */
 	if (!get_page_unless_zero(compound_head(p))) {
 		if (PageHuge(p)) {
-			pr_info("get_any_page: %#lx free huge page\n", pfn);
+			pr_info("%s: %#lx free huge page\n", __func__, pfn);
 			ret = dequeue_hwpoisoned_huge_page(compound_head(p));
 		} else if (is_free_buddy_page(p)) {
-			pr_info("get_any_page: %#lx free buddy page\n", pfn);
+			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
 			/* Set hwpoison bit while page is still isolated */
 			SetPageHWPoison(p);
 			ret = 0;
 		} else {
-			pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
-				pfn, p->flags);
+			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
+				__func__, pfn, p->flags);
 			ret = -EIO;
 		}
 	} else {
diff --git a/mm/memory.c b/mm/memory.c
index e40f6759ba98..1b7dc662bf9f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2908,7 +2908,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry);
 	if (!page) {
-		grab_swap_token(mm); /* Contend for token _before_ read-in */
 		page = swapin_readahead(entry,
 					GFP_HIGHUSER_MOVABLE, vma, address);
 		if (!page) {
@@ -2938,6 +2937,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	locked = lock_page_or_retry(page, mm, flags);
+
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 	if (!locked) {
 		ret |= VM_FAULT_RETRY;
@@ -3486,6 +3486,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
 
+retry:
 	pgd = pgd_offset(mm, address);
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
@@ -3499,13 +3500,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 							  pmd, flags);
 	} else {
 		pmd_t orig_pmd = *pmd;
+		int ret;
+
 		barrier();
 		if (pmd_trans_huge(orig_pmd)) {
 			if (flags & FAULT_FLAG_WRITE &&
 			    !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd))
-				return do_huge_pmd_wp_page(mm, vma, address,
-							   pmd, orig_pmd);
+			    !pmd_trans_splitting(orig_pmd)) {
+				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
+							  orig_pmd);
+				/*
+				 * If COW results in an oom, the huge pmd will
+				 * have been split, so retry the fault on the
+				 * pte for a smaller charge.
+				 */
+				if (unlikely(ret & VM_FAULT_OOM))
+					goto retry;
+				return ret;
+			}
 			return 0;
 		}
 	}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fc898cb4fe8f..0d7e3ec8e0f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
 	res->end = start + size - 1;
 	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 	if (request_resource(&iomem_resource, res) < 0) {
-		printk("System RAM resource %llx - %llx cannot be added\n",
-		(unsigned long long)res->start, (unsigned long long)res->end);
+		printk("System RAM resource %pR cannot be added\n", res);
 		kfree(res);
 		res = NULL;
 	}
@@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
 		online_pages_range);
 	if (ret) {
 		mutex_unlock(&zonelists_mutex);
-		printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
-			nr_pages, pfn);
+		printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
+		       (unsigned long long) pfn << PAGE_SHIFT,
+		       (((unsigned long long) pfn + nr_pages)
+			    << PAGE_SHIFT) - 1);
 		memory_notify(MEM_CANCEL_ONLINE, &arg);
 		unlock_memory_hotplug();
 		return ret;
@@ -977,8 +978,9 @@ repeat:
 	return 0;
 
 failed_removal:
-	printk(KERN_INFO "memory offlining %lx to %lx failed\n",
-		start_pfn, end_pfn);
+	printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
+	       (unsigned long long) start_pfn << PAGE_SHIFT,
+	       ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
 	memory_notify(MEM_CANCEL_OFFLINE, &arg);
 	/* pushback to free area */
 	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 88f9422b92e7..f15c1b24ca18 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 {
 	if (!pol)
 		return;
-	if (!mpol_store_user_nodemask(pol) && step == 0 &&
+	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 		return;
 
@@ -950,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
  *
  * Returns the number of page that could not be moved.
  */
-int do_migrate_pages(struct mm_struct *mm,
-	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+		     const nodemask_t *to, int flags)
 {
 	int busy = 0;
 	int err;
@@ -963,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm,
 
 	down_read(&mm->mmap_sem);
 
-	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+	err = migrate_vmas(mm, from, to, flags);
 	if (err)
 		goto out;
 
@@ -998,14 +998,34 @@ int do_migrate_pages(struct mm_struct *mm,
 	 * moved to an empty node, then there is nothing left worth migrating.
 	 */
 
-	tmp = *from_nodes;
+	tmp = *from;
 	while (!nodes_empty(tmp)) {
 		int s,d;
 		int source = -1;
 		int dest = 0;
 
 		for_each_node_mask(s, tmp) {
-			d = node_remap(s, *from_nodes, *to_nodes);
+
+			/*
+			 * do_migrate_pages() tries to maintain the relative
+			 * node relationship of the pages established between
+			 * threads and memory areas.
+                         *
+			 * However if the number of source nodes is not equal to
+			 * the number of destination nodes we can not preserve
+			 * this node relative relationship.  In that case, skip
+			 * copying memory from a node that is in the destination
+			 * mask.
+			 *
+			 * Example: [2,3,4] -> [3,4,5] moves everything.
+			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
+			 */
+
+			if ((nodes_weight(*from) != nodes_weight(*to)) &&
+						(node_isset(s, *to)))
+				continue;
+
+			d = node_remap(s, *from, *to);
 			if (s == d)
 				continue;
 
@@ -1065,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 {
 }
 
-int do_migrate_pages(struct mm_struct *mm,
-	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+		     const nodemask_t *to, int flags)
 {
 	return -ENOSYS;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index e8dcfc7de866..4a9c2a391e28 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1639,33 +1639,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 {
 	struct vm_area_struct *vma = NULL;
 
-	if (mm) {
-		/* Check the cache first. */
-		/* (Cache hit rate is typically around 35%.) */
-		vma = mm->mmap_cache;
-		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
-			struct rb_node * rb_node;
-
-			rb_node = mm->mm_rb.rb_node;
-			vma = NULL;
-
-			while (rb_node) {
-				struct vm_area_struct * vma_tmp;
-
-				vma_tmp = rb_entry(rb_node,
-						struct vm_area_struct, vm_rb);
-
-				if (vma_tmp->vm_end > addr) {
-					vma = vma_tmp;
-					if (vma_tmp->vm_start <= addr)
-						break;
-					rb_node = rb_node->rb_left;
-				} else
-					rb_node = rb_node->rb_right;
-			}
-			if (vma)
-				mm->mmap_cache = vma;
+	if (WARN_ON_ONCE(!mm))		/* Remove this in linux-3.6 */
+		return NULL;
+
+	/* Check the cache first. */
+	/* (Cache hit rate is typically around 35%.) */
+	vma = mm->mmap_cache;
+	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+		struct rb_node *rb_node;
+
+		rb_node = mm->mm_rb.rb_node;
+		vma = NULL;
+
+		while (rb_node) {
+			struct vm_area_struct *vma_tmp;
+
+			vma_tmp = rb_entry(rb_node,
+					   struct vm_area_struct, vm_rb);
+
+			if (vma_tmp->vm_end > addr) {
+				vma = vma_tmp;
+				if (vma_tmp->vm_start <= addr)
+					break;
+				rb_node = rb_node->rb_left;
+			} else
+				rb_node = rb_node->rb_right;
 		}
+		if (vma)
+			mm->mmap_cache = vma;
 	}
 	return vma;
 }
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 7cf7b7ddc7c5..6830eab5bf09 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -86,3 +86,17 @@ int memmap_valid_within(unsigned long pfn,
 	return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
+void lruvec_init(struct lruvec *lruvec, struct zone *zone)
+{
+	enum lru_list lru;
+
+	memset(lruvec, 0, sizeof(struct lruvec));
+
+	for_each_lru(lru)
+		INIT_LIST_HEAD(&lruvec->lists[lru]);
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	lruvec->zone = zone;
+#endif
+}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 1983fb1c7026..d23415c001bc 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -274,86 +274,85 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 	return ___alloc_bootmem(size, align, goal, limit);
 }
 
-/**
- * __alloc_bootmem_node - allocate boot memory from a specific node
- * @pgdat: node to allocate from
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may fall back to any node in the system if the specified node
- * can not hold the requested memory.
- *
- * The function panics if the request can not be satisfied.
- */
-void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
-				   unsigned long align, unsigned long goal)
+static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+						   unsigned long size,
+						   unsigned long align,
+						   unsigned long goal,
+						   unsigned long limit)
 {
 	void *ptr;
 
-	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
 again:
 	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
-					 goal, -1ULL);
+					goal, limit);
 	if (ptr)
 		return ptr;
 
 	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
-					goal, -1ULL);
-	if (!ptr && goal) {
+					goal, limit);
+	if (ptr)
+		return ptr;
+
+	if (goal) {
 		goal = 0;
 		goto again;
 	}
-	return ptr;
+
+	return NULL;
 }
 
-void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
-	return __alloc_bootmem_node(pgdat, size, align, goal);
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
 }
 
-#ifdef CONFIG_SPARSEMEM
-/**
- * alloc_bootmem_section - allocate boot memory from a specific section
- * @size: size of the request in bytes
- * @section_nr: sparse map section to allocate from
- *
- * Return NULL on failure.
- */
-void * __init alloc_bootmem_section(unsigned long size,
-				    unsigned long section_nr)
+void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+				    unsigned long align, unsigned long goal,
+				    unsigned long limit)
 {
-	unsigned long pfn, goal, limit;
+	void *ptr;
 
-	pfn = section_nr_to_pfn(section_nr);
-	goal = pfn << PAGE_SHIFT;
-	limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+	if (ptr)
+		return ptr;
 
-	return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
-					 SMP_CACHE_BYTES, goal, limit);
+	printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+	panic("Out of memory");
+	return NULL;
 }
-#endif
 
-void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+/**
+ * __alloc_bootmem_node - allocate boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
-	void *ptr;
-
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	ptr =  __alloc_memory_core_early(pgdat->node_id, size, align,
-						 goal, -1ULL);
-	if (ptr)
-		return ptr;
+	return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
+}
 
-	return __alloc_bootmem_nopanic(size, align, goal);
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	return __alloc_bootmem_node(pgdat, size, align, goal);
 }
 
 #ifndef ARCH_LOW_ADDRESS_LIMIT
@@ -397,16 +396,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 				       unsigned long align, unsigned long goal)
 {
-	void *ptr;
-
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
-				goal, ARCH_LOW_ADDRESS_LIMIT);
-	if (ptr)
-		return ptr;
-
-	return  __alloc_memory_core_early(MAX_NUMNODES, size, align,
-				goal, ARCH_LOW_ADDRESS_LIMIT);
+	return ___alloc_bootmem_node(pgdat, size, align, goal,
+				     ARCH_LOW_ADDRESS_LIMIT);
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9f09a1fde9f9..ed0e19677360 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -180,10 +180,10 @@ static bool oom_unkillable_task(struct task_struct *p,
  * predictable as possible.  The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-		      const nodemask_t *nodemask, unsigned long totalpages)
+unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
+			  const nodemask_t *nodemask, unsigned long totalpages)
 {
-	long points;
+	unsigned long points;
 
 	if (oom_unkillable_task(p, memcg, nodemask))
 		return 0;
@@ -198,21 +198,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	}
 
 	/*
-	 * The memory controller may have a limit of 0 bytes, so avoid a divide
-	 * by zero, if necessary.
-	 */
-	if (!totalpages)
-		totalpages = 1;
-
-	/*
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + p->mm->nr_ptes;
-	points += get_mm_counter(p->mm, MM_SWAPENTS);
-
-	points *= 1000;
-	points /= totalpages;
+	points = get_mm_rss(p->mm) + p->mm->nr_ptes +
+		 get_mm_counter(p->mm, MM_SWAPENTS);
 	task_unlock(p);
 
 	/*
@@ -220,23 +210,20 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * implementation used by LSMs.
 	 */
 	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
-		points -= 30;
+		points -= 30 * totalpages / 1000;
 
 	/*
 	 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
 	 * either completely disable oom killing or always prefer a certain
 	 * task.
 	 */
-	points += p->signal->oom_score_adj;
+	points += p->signal->oom_score_adj * totalpages / 1000;
 
 	/*
-	 * Never return 0 for an eligible task that may be killed since it's
-	 * possible that no single user task uses more than 0.1% of memory and
-	 * no single admin tasks uses more than 3.0%.
+	 * Never return 0 for an eligible task regardless of the root bonus and
+	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
 	 */
-	if (points <= 0)
-		return 1;
-	return (points < 1000) ? points : 1000;
+	return points ? points : 1;
 }
 
 /*
@@ -314,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 {
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
-	*ppoints = 0;
+	unsigned long chosen_points = 0;
 
 	do_each_thread(g, p) {
 		unsigned int points;
@@ -354,7 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 			 */
 			if (p == current) {
 				chosen = p;
-				*ppoints = 1000;
+				chosen_points = ULONG_MAX;
 			} else if (!force_kill) {
 				/*
 				 * If this task is not being ptraced on exit,
@@ -367,12 +354,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 		}
 
 		points = oom_badness(p, memcg, nodemask, totalpages);
-		if (points > *ppoints) {
+		if (points > chosen_points) {
 			chosen = p;
-			*ppoints = points;
+			chosen_points = points;
 		}
 	} while_each_thread(g, p);
 
+	*ppoints = chosen_points * 1000 / totalpages;
 	return chosen;
 }
 
@@ -572,7 +560,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	}
 
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
-	limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
+	limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
 	read_lock(&tasklist_lock);
 	p = select_bad_process(&points, limit, memcg, NULL, false);
 	if (p && PTR_ERR(p) != -1UL)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bab8e3bc4202..6092f331b32e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -219,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
-static void set_pageblock_migratetype(struct page *page, int migratetype)
+void set_pageblock_migratetype(struct page *page, int migratetype)
 {
 
 	if (unlikely(page_group_by_mobility_disabled))
@@ -954,8 +954,8 @@ static int move_freepages(struct zone *zone,
 	return pages_moved;
 }
 
-static int move_freepages_block(struct zone *zone, struct page *page,
-				int migratetype)
+int move_freepages_block(struct zone *zone, struct page *page,
+			 int migratetype)
 {
 	unsigned long start_pfn, end_pfn;
 	struct page *start_page, *end_page;
@@ -4300,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat,
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
-/* Return a sensible default order for the pageblock size. */
-static inline int pageblock_default_order(void)
-{
-	if (HPAGE_SHIFT > PAGE_SHIFT)
-		return HUGETLB_PAGE_ORDER;
-
-	return MAX_ORDER-1;
-}
-
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(unsigned int order)
+static inline void __init set_pageblock_order(void)
 {
+	unsigned int order;
+
 	/* Check that pageblock_nr_pages has not already been setup */
 	if (pageblock_order)
 		return;
 
+	if (HPAGE_SHIFT > PAGE_SHIFT)
+		order = HUGETLB_PAGE_ORDER;
+	else
+		order = MAX_ORDER - 1;
+
 	/*
 	 * Assume the largest contiguous order of interest is a huge page.
-	 * This value may be variable depending on boot parameters on IA64
+	 * This value may be variable depending on boot parameters on IA64 and
+	 * powerpc.
 	 */
 	pageblock_order = order;
 }
@@ -4326,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order)
 
 /*
  * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
- * and pageblock_default_order() are unused as pageblock_order is set
- * at compile-time. See include/linux/pageblock-flags.h for the values of
- * pageblock_order based on the kernel config
+ * is unused as pageblock_order is set at compile-time. See
+ * include/linux/pageblock-flags.h for the values of pageblock_order based on
+ * the kernel config
  */
-static inline int pageblock_default_order(unsigned int order)
+static inline void set_pageblock_order(void)
 {
-	return MAX_ORDER-1;
 }
-#define set_pageblock_order(x)	do {} while (0)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
@@ -4361,7 +4358,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize, memmap_pages;
-		enum lru_list lru;
 
 		size = zone_spanned_pages_in_node(nid, j, zones_size);
 		realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4411,18 +4407,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		zone->zone_pgdat = pgdat;
 
 		zone_pcp_init(zone);
-		for_each_lru(lru)
-			INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
-		zone->reclaim_stat.recent_rotated[0] = 0;
-		zone->reclaim_stat.recent_rotated[1] = 0;
-		zone->reclaim_stat.recent_scanned[0] = 0;
-		zone->reclaim_stat.recent_scanned[1] = 0;
+		lruvec_init(&zone->lruvec, zone);
 		zap_zone_vm_stats(zone);
 		zone->flags = 0;
 		if (!size)
 			continue;
 
-		set_pageblock_order(pageblock_default_order());
+		set_pageblock_order();
 		setup_usemap(pgdat, zone, size);
 		ret = init_currently_empty_zone(zone, zone_start_pfn,
 						size, MEMMAP_EARLY);
@@ -4815,7 +4806,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	find_zone_movable_pfns_for_nodes();
 
 	/* Print out the zone ranges */
-	printk("Zone PFN ranges:\n");
+	printk("Zone ranges:\n");
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		if (i == ZONE_MOVABLE)
 			continue;
@@ -4824,22 +4815,25 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 				arch_zone_highest_possible_pfn[i])
 			printk(KERN_CONT "empty\n");
 		else
-			printk(KERN_CONT "%0#10lx -> %0#10lx\n",
-				arch_zone_lowest_possible_pfn[i],
-				arch_zone_highest_possible_pfn[i]);
+			printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
+				arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
+				(arch_zone_highest_possible_pfn[i]
+					<< PAGE_SHIFT) - 1);
 	}
 
 	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
-	printk("Movable zone start PFN for each node\n");
+	printk("Movable zone start for each node\n");
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		if (zone_movable_pfn[i])
-			printk("  Node %d: %lu\n", i, zone_movable_pfn[i]);
+			printk("  Node %d: %#010lx\n", i,
+			       zone_movable_pfn[i] << PAGE_SHIFT);
 	}
 
 	/* Print out the early_node_map[] */
-	printk("Early memory PFN ranges\n");
+	printk("Early memory node ranges\n");
 	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
-		printk("  %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn);
+		printk("  node %3d: [mem %#010lx-%#010lx]\n", nid,
+		       start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
 
 	/* Initialise every node */
 	mminit_verify_pageflags_layout();
@@ -5657,7 +5651,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
 		.nr_migratepages = 0,
 		.order = -1,
 		.zone = page_zone(pfn_to_page(start)),
-		.sync = true,
+		.mode = COMPACT_SYNC,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);
 
@@ -5938,7 +5932,7 @@ bool is_free_buddy_page(struct page *page)
 }
 #endif
 
-static struct trace_print_flags pageflag_names[] = {
+static const struct trace_print_flags pageflag_names[] = {
 	{1UL << PG_locked,		"locked"	},
 	{1UL << PG_error,		"error"		},
 	{1UL << PG_referenced,		"referenced"	},
@@ -5973,7 +5967,9 @@ static struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_MEMORY_FAILURE
 	{1UL << PG_hwpoison,		"hwpoison"	},
 #endif
-	{-1UL,				NULL		},
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	{1UL << PG_compound_lock,	"compound_lock"	},
+#endif
 };
 
 static void dump_page_flags(unsigned long flags)
@@ -5982,12 +5978,14 @@ static void dump_page_flags(unsigned long flags)
 	unsigned long mask;
 	int i;
 
+	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
+
 	printk(KERN_ALERT "page flags: %#lx(", flags);
 
 	/* remove zone id */
 	flags &= (1UL << NR_PAGEFLAGS) - 1;
 
-	for (i = 0; pageflag_names[i].name && flags; i++) {
+	for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
 
 		mask = pageflag_names[i].mask;
 		if ((flags & mask) != mask)
diff --git a/mm/readahead.c b/mm/readahead.c
index cbcbb02f3e28..ea8f8fa21649 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,8 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/pagevec.h>
 #include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/file.h>
 
 /*
  * Initialise a struct file's readahead state.  Assumes that the caller has
@@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping,
 	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
 }
 EXPORT_SYMBOL_GPL(page_cache_async_readahead);
+
+static ssize_t
+do_readahead(struct address_space *mapping, struct file *filp,
+	     pgoff_t index, unsigned long nr)
+{
+	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
+		return -EINVAL;
+
+	force_page_cache_readahead(mapping, filp, index, nr);
+	return 0;
+}
+
+SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
+{
+	ssize_t ret;
+	struct file *file;
+
+	ret = -EBADF;
+	file = fget(fd);
+	if (file) {
+		if (file->f_mode & FMODE_READ) {
+			struct address_space *mapping = file->f_mapping;
+			pgoff_t start = offset >> PAGE_CACHE_SHIFT;
+			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+			unsigned long len = end - start + 1;
+			ret = do_readahead(mapping, file, start, len);
+		}
+		fput(file);
+	}
+	return ret;
+}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
+{
+	return SYSC_readahead((int) fd, offset, (size_t) count);
+}
+SYSCALL_ALIAS(sys_readahead, SyS_readahead);
+#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 5b5ad584ffb7..0f3b7cda2a24 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 		pte_unmap_unlock(pte, ptl);
 	}
 
-	/* Pretend the page is referenced if the task has the
-	   swap token and is in the middle of a page fault. */
-	if (mm != current->mm && has_swap_token(mm) &&
-			rwsem_is_locked(&mm->mmap_sem))
-		referenced++;
-
 	(*mapcount)--;
 
 	if (referenced)
diff --git a/mm/shmem.c b/mm/shmem.c
index be5af34a070d..d576b84d913c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt;
 #include <linux/blkdev.h>
 #include <linux/pagevec.h>
 #include <linux/percpu_counter.h>
+#include <linux/falloc.h>
 #include <linux/splice.h>
 #include <linux/security.h>
 #include <linux/swapops.h>
@@ -83,12 +84,25 @@ struct shmem_xattr {
 	char value[0];
 };
 
+/*
+ * shmem_fallocate and shmem_writepage communicate via inode->i_private
+ * (with i_mutex making sure that it has only one user at a time):
+ * we would prefer not to enlarge the shmem inode just for that.
+ */
+struct shmem_falloc {
+	pgoff_t start;		/* start of range currently being fallocated */
+	pgoff_t next;		/* the next page offset to be fallocated */
+	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
+	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
+};
+
 /* Flag allocation requirements to shmem_getpage */
 enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
 	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
-	SGP_WRITE,	/* may exceed i_size, may allocate page */
+	SGP_WRITE,	/* may exceed i_size, may allocate !Uptodate page */
+	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
 };
 
 #ifdef CONFIG_TMPFS
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void)
 }
 #endif
 
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+				struct shmem_inode_info *info, pgoff_t index);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
 
@@ -423,27 +440,31 @@ void shmem_unlock_mapping(struct address_space *mapping)
 
 /*
  * Remove range of pages and swap entries from radix tree, and free them.
+ * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
  */
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+								 bool unfalloc)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
-	pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
+	pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
+	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
 	struct pagevec pvec;
 	pgoff_t indices[PAGEVEC_SIZE];
 	long nr_swaps_freed = 0;
 	pgoff_t index;
 	int i;
 
-	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+	if (lend == -1)
+		end = -1;	/* unsigned, so actually very big */
 
 	pagevec_init(&pvec, 0);
 	index = start;
-	while (index <= end) {
+	while (index < end) {
 		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 							pvec.pages, indices);
 		if (!pvec.nr)
 			break;
@@ -452,10 +473,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 			struct page *page = pvec.pages[i];
 
 			index = indices[i];
-			if (index > end)
+			if (index >= end)
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
+				if (unfalloc)
+					continue;
 				nr_swaps_freed += !shmem_free_swap(mapping,
 								index, page);
 				continue;
@@ -463,9 +486,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 
 			if (!trylock_page(page))
 				continue;
-			if (page->mapping == mapping) {
-				VM_BUG_ON(PageWriteback(page));
-				truncate_inode_page(mapping, page);
+			if (!unfalloc || !PageUptodate(page)) {
+				if (page->mapping == mapping) {
+					VM_BUG_ON(PageWriteback(page));
+					truncate_inode_page(mapping, page);
+				}
 			}
 			unlock_page(page);
 		}
@@ -476,30 +501,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 		index++;
 	}
 
-	if (partial) {
+	if (partial_start) {
 		struct page *page = NULL;
 		shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
 		if (page) {
-			zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+			unsigned int top = PAGE_CACHE_SIZE;
+			if (start > end) {
+				top = partial_end;
+				partial_end = 0;
+			}
+			zero_user_segment(page, partial_start, top);
+			set_page_dirty(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+	if (partial_end) {
+		struct page *page = NULL;
+		shmem_getpage(inode, end, &page, SGP_READ, NULL);
+		if (page) {
+			zero_user_segment(page, 0, partial_end);
 			set_page_dirty(page);
 			unlock_page(page);
 			page_cache_release(page);
 		}
 	}
+	if (start >= end)
+		return;
 
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
 		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 							pvec.pages, indices);
 		if (!pvec.nr) {
-			if (index == start)
+			if (index == start || unfalloc)
 				break;
 			index = start;
 			continue;
 		}
-		if (index == start && indices[0] > end) {
+		if ((index == start || unfalloc) && indices[0] >= end) {
 			shmem_deswap_pagevec(&pvec);
 			pagevec_release(&pvec);
 			break;
@@ -509,19 +551,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 			struct page *page = pvec.pages[i];
 
 			index = indices[i];
-			if (index > end)
+			if (index >= end)
 				break;
 
 			if (radix_tree_exceptional_entry(page)) {
+				if (unfalloc)
+					continue;
 				nr_swaps_freed += !shmem_free_swap(mapping,
 								index, page);
 				continue;
 			}
 
 			lock_page(page);
-			if (page->mapping == mapping) {
-				VM_BUG_ON(PageWriteback(page));
-				truncate_inode_page(mapping, page);
+			if (!unfalloc || !PageUptodate(page)) {
+				if (page->mapping == mapping) {
+					VM_BUG_ON(PageWriteback(page));
+					truncate_inode_page(mapping, page);
+				}
 			}
 			unlock_page(page);
 		}
@@ -535,7 +581,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 	info->swapped -= nr_swaps_freed;
 	shmem_recalc_inode(inode);
 	spin_unlock(&info->lock);
+}
 
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+	shmem_undo_range(inode, lstart, lend, false);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -604,12 +654,13 @@ static void shmem_evict_inode(struct inode *inode)
  * If swap found in inode, free it and move page from swapcache to filecache.
  */
 static int shmem_unuse_inode(struct shmem_inode_info *info,
-			     swp_entry_t swap, struct page *page)
+			     swp_entry_t swap, struct page **pagep)
 {
 	struct address_space *mapping = info->vfs_inode.i_mapping;
 	void *radswap;
 	pgoff_t index;
-	int error;
+	gfp_t gfp;
+	int error = 0;
 
 	radswap = swp_to_radix_entry(swap);
 	index = radix_tree_locate_item(&mapping->page_tree, radswap);
@@ -625,22 +676,37 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	if (shmem_swaplist.next != &info->swaplist)
 		list_move_tail(&shmem_swaplist, &info->swaplist);
 
+	gfp = mapping_gfp_mask(mapping);
+	if (shmem_should_replace_page(*pagep, gfp)) {
+		mutex_unlock(&shmem_swaplist_mutex);
+		error = shmem_replace_page(pagep, gfp, info, index);
+		mutex_lock(&shmem_swaplist_mutex);
+		/*
+		 * We needed to drop mutex to make that restrictive page
+		 * allocation; but the inode might already be freed by now,
+		 * and we cannot refer to inode or mapping or info to check.
+		 * However, we do hold page lock on the PageSwapCache page,
+		 * so can check if that still has our reference remaining.
+		 */
+		if (!page_swapcount(*pagep))
+			error = -ENOENT;
+	}
+
 	/*
 	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
 	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
 	 * beneath us (pagelock doesn't help until the page is in pagecache).
 	 */
-	error = shmem_add_to_page_cache(page, mapping, index,
+	if (!error)
+		error = shmem_add_to_page_cache(*pagep, mapping, index,
 						GFP_NOWAIT, radswap);
-	/* which does mem_cgroup_uncharge_cache_page on error */
-
 	if (error != -ENOMEM) {
 		/*
 		 * Truncation and eviction use free_swap_and_cache(), which
 		 * only does trylock page: if we raced, best clean up here.
 		 */
-		delete_from_swap_cache(page);
-		set_page_dirty(page);
+		delete_from_swap_cache(*pagep);
+		set_page_dirty(*pagep);
 		if (!error) {
 			spin_lock(&info->lock);
 			info->swapped--;
@@ -660,7 +726,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	struct list_head *this, *next;
 	struct shmem_inode_info *info;
 	int found = 0;
-	int error;
+	int error = 0;
+
+	/*
+	 * There's a faint possibility that swap page was replaced before
+	 * caller locked it: it will come back later with the right page.
+	 */
+	if (unlikely(!PageSwapCache(page)))
+		goto out;
 
 	/*
 	 * Charge page using GFP_KERNEL while we can wait, before taking
@@ -676,7 +749,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	list_for_each_safe(this, next, &shmem_swaplist) {
 		info = list_entry(this, struct shmem_inode_info, swaplist);
 		if (info->swapped)
-			found = shmem_unuse_inode(info, swap, page);
+			found = shmem_unuse_inode(info, swap, &page);
 		else
 			list_del_init(&info->swaplist);
 		cond_resched();
@@ -685,8 +758,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (!found)
-		mem_cgroup_uncharge_cache_page(page);
 	if (found < 0)
 		error = found;
 out:
@@ -727,6 +798,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
 		goto redirty;
 	}
+
+	/*
+	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
+	 * value into swapfile.c, the only way we can correctly account for a
+	 * fallocated page arriving here is now to initialize it and write it.
+	 *
+	 * That's okay for a page already fallocated earlier, but if we have
+	 * not yet completed the fallocation, then (a) we want to keep track
+	 * of this page in case we have to undo it, and (b) it may not be a
+	 * good idea to continue anyway, once we're pushing into swap.  So
+	 * reactivate the page, and let shmem_fallocate() quit when too many.
+	 */
+	if (!PageUptodate(page)) {
+		if (inode->i_private) {
+			struct shmem_falloc *shmem_falloc;
+			spin_lock(&inode->i_lock);
+			shmem_falloc = inode->i_private;
+			if (shmem_falloc &&
+			    index >= shmem_falloc->start &&
+			    index < shmem_falloc->next)
+				shmem_falloc->nr_unswapped++;
+			else
+				shmem_falloc = NULL;
+			spin_unlock(&inode->i_lock);
+			if (shmem_falloc)
+				goto redirty;
+		}
+		clear_highpage(page);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+	}
+
 	swap = get_swap_page();
 	if (!swap.val)
 		goto redirty;
@@ -856,6 +959,84 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 #endif
 
 /*
+ * When a page is moved from swapcache to shmem filecache (either by the
+ * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * shmem_unuse_inode()), it may have been read in earlier from swap, in
+ * ignorance of the mapping it belongs to.  If that mapping has special
+ * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
+ * we may need to copy to a suitable page before moving to filecache.
+ *
+ * In a future release, this may well be extended to respect cpuset and
+ * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
+ * but for now it is a simple matter of zone.
+ */
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+{
+	return page_zonenum(page) > gfp_zone(gfp);
+}
+
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+				struct shmem_inode_info *info, pgoff_t index)
+{
+	struct page *oldpage, *newpage;
+	struct address_space *swap_mapping;
+	pgoff_t swap_index;
+	int error;
+
+	oldpage = *pagep;
+	swap_index = page_private(oldpage);
+	swap_mapping = page_mapping(oldpage);
+
+	/*
+	 * We have arrived here because our zones are constrained, so don't
+	 * limit chance of success by further cpuset and node constraints.
+	 */
+	gfp &= ~GFP_CONSTRAINT_MASK;
+	newpage = shmem_alloc_page(gfp, info, index);
+	if (!newpage)
+		return -ENOMEM;
+	VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
+
+	*pagep = newpage;
+	page_cache_get(newpage);
+	copy_highpage(newpage, oldpage);
+
+	VM_BUG_ON(!PageLocked(oldpage));
+	__set_page_locked(newpage);
+	VM_BUG_ON(!PageUptodate(oldpage));
+	SetPageUptodate(newpage);
+	VM_BUG_ON(!PageSwapBacked(oldpage));
+	SetPageSwapBacked(newpage);
+	VM_BUG_ON(!swap_index);
+	set_page_private(newpage, swap_index);
+	VM_BUG_ON(!PageSwapCache(oldpage));
+	SetPageSwapCache(newpage);
+
+	/*
+	 * Our caller will very soon move newpage out of swapcache, but it's
+	 * a nice clean interface for us to replace oldpage by newpage there.
+	 */
+	spin_lock_irq(&swap_mapping->tree_lock);
+	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
+								   newpage);
+	__inc_zone_page_state(newpage, NR_FILE_PAGES);
+	__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	spin_unlock_irq(&swap_mapping->tree_lock);
+	BUG_ON(error);
+
+	mem_cgroup_replace_page_cache(oldpage, newpage);
+	lru_cache_add_anon(newpage);
+
+	ClearPageSwapCache(oldpage);
+	set_page_private(oldpage, 0);
+
+	unlock_page(oldpage);
+	page_cache_release(oldpage);
+	page_cache_release(oldpage);
+	return 0;
+}
+
+/*
  * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
  *
  * If we allocate a new one we do not mark it dirty. That's up to the
@@ -872,6 +1053,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	swp_entry_t swap;
 	int error;
 	int once = 0;
+	int alloced = 0;
 
 	if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
 		return -EFBIG;
@@ -883,19 +1065,21 @@ repeat:
 		page = NULL;
 	}
 
-	if (sgp != SGP_WRITE &&
+	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
 	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 		error = -EINVAL;
 		goto failed;
 	}
 
+	/* fallocated page? */
+	if (page && !PageUptodate(page)) {
+		if (sgp != SGP_READ)
+			goto clear;
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+	}
 	if (page || (sgp == SGP_READ && !swap.val)) {
-		/*
-		 * Once we can get the page lock, it must be uptodate:
-		 * if there were an error in reading back from swap,
-		 * the page would not be inserted into the filecache.
-		 */
-		BUG_ON(page && !PageUptodate(page));
 		*pagep = page;
 		return 0;
 	}
@@ -923,19 +1107,20 @@ repeat:
 
 		/* We have to do this with page locked to prevent races */
 		lock_page(page);
+		if (!PageSwapCache(page) || page->mapping) {
+			error = -EEXIST;	/* try again */
+			goto failed;
+		}
 		if (!PageUptodate(page)) {
 			error = -EIO;
 			goto failed;
 		}
 		wait_on_page_writeback(page);
 
-		/* Someone may have already done it for us */
-		if (page->mapping) {
-			if (page->mapping == mapping &&
-			    page->index == index)
-				goto done;
-			error = -EEXIST;
-			goto failed;
+		if (shmem_should_replace_page(page, gfp)) {
+			error = shmem_replace_page(&page, gfp, info, index);
+			if (error)
+				goto failed;
 		}
 
 		error = mem_cgroup_cache_charge(page, current->mm,
@@ -991,19 +1176,36 @@ repeat:
 		inode->i_blocks += BLOCKS_PER_PAGE;
 		shmem_recalc_inode(inode);
 		spin_unlock(&info->lock);
+		alloced = true;
 
-		clear_highpage(page);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
+		/*
+		 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+		 */
+		if (sgp == SGP_FALLOC)
+			sgp = SGP_WRITE;
+clear:
+		/*
+		 * Let SGP_WRITE caller clear ends if write does not fill page;
+		 * but SGP_FALLOC on a page fallocated earlier must initialize
+		 * it now, lest undo on failure cancel our earlier guarantee.
+		 */
+		if (sgp != SGP_WRITE) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			SetPageUptodate(page);
+		}
 		if (sgp == SGP_DIRTY)
 			set_page_dirty(page);
 	}
-done:
+
 	/* Perhaps the file has been truncated since we checked */
-	if (sgp != SGP_WRITE &&
+	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
 	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 		error = -EINVAL;
-		goto trunc;
+		if (alloced)
+			goto trunc;
+		else
+			goto failed;
 	}
 	*pagep = page;
 	return 0;
@@ -1012,6 +1214,7 @@ done:
 	 * Error recovery.
 	 */
 trunc:
+	info = SHMEM_I(inode);
 	ClearPageDirty(page);
 	delete_from_page_cache(page);
 	spin_lock(&info->lock);
@@ -1019,6 +1222,7 @@ trunc:
 	inode->i_blocks -= BLOCKS_PER_PAGE;
 	spin_unlock(&info->lock);
 decused:
+	sbinfo = SHMEM_SB(inode->i_sb);
 	if (sbinfo->max_blocks)
 		percpu_counter_add(&sbinfo->used_blocks, -1);
 unacct:
@@ -1204,6 +1408,14 @@ shmem_write_end(struct file *file, struct address_space *mapping,
 	if (pos + copied > inode->i_size)
 		i_size_write(inode, pos + copied);
 
+	if (!PageUptodate(page)) {
+		if (copied < PAGE_CACHE_SIZE) {
+			unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+			zero_user_segments(page, 0, from,
+					from + copied, PAGE_CACHE_SIZE);
+		}
+		SetPageUptodate(page);
+	}
 	set_page_dirty(page);
 	unlock_page(page);
 	page_cache_release(page);
@@ -1462,6 +1674,199 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	return error;
 }
 
+/*
+ * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
+ */
+static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
+				    pgoff_t index, pgoff_t end, int origin)
+{
+	struct page *page;
+	struct pagevec pvec;
+	pgoff_t indices[PAGEVEC_SIZE];
+	bool done = false;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	pvec.nr = 1;		/* start small: we may be there already */
+	while (!done) {
+		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+					pvec.nr, pvec.pages, indices);
+		if (!pvec.nr) {
+			if (origin == SEEK_DATA)
+				index = end;
+			break;
+		}
+		for (i = 0; i < pvec.nr; i++, index++) {
+			if (index < indices[i]) {
+				if (origin == SEEK_HOLE) {
+					done = true;
+					break;
+				}
+				index = indices[i];
+			}
+			page = pvec.pages[i];
+			if (page && !radix_tree_exceptional_entry(page)) {
+				if (!PageUptodate(page))
+					page = NULL;
+			}
+			if (index >= end ||
+			    (page && origin == SEEK_DATA) ||
+			    (!page && origin == SEEK_HOLE)) {
+				done = true;
+				break;
+			}
+		}
+		shmem_deswap_pagevec(&pvec);
+		pagevec_release(&pvec);
+		pvec.nr = PAGEVEC_SIZE;
+		cond_resched();
+	}
+	return index;
+}
+
+static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct address_space *mapping;
+	struct inode *inode;
+	pgoff_t start, end;
+	loff_t new_offset;
+
+	if (origin != SEEK_DATA && origin != SEEK_HOLE)
+		return generic_file_llseek_size(file, offset, origin,
+							MAX_LFS_FILESIZE);
+	mapping = file->f_mapping;
+	inode = mapping->host;
+	mutex_lock(&inode->i_mutex);
+	/* We're holding i_mutex so we can access i_size directly */
+
+	if (offset < 0)
+		offset = -EINVAL;
+	else if (offset >= inode->i_size)
+		offset = -ENXIO;
+	else {
+		start = offset >> PAGE_CACHE_SHIFT;
+		end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		new_offset = shmem_seek_hole_data(mapping, start, end, origin);
+		new_offset <<= PAGE_CACHE_SHIFT;
+		if (new_offset > offset) {
+			if (new_offset < inode->i_size)
+				offset = new_offset;
+			else if (origin == SEEK_DATA)
+				offset = -ENXIO;
+			else
+				offset = inode->i_size;
+		}
+	}
+
+	if (offset >= 0 && offset != file->f_pos) {
+		file->f_pos = offset;
+		file->f_version = 0;
+	}
+	mutex_unlock(&inode->i_mutex);
+	return offset;
+}
+
+static long shmem_fallocate(struct file *file, int mode, loff_t offset,
+							 loff_t len)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct shmem_falloc shmem_falloc;
+	pgoff_t start, index, end;
+	int error;
+
+	mutex_lock(&inode->i_mutex);
+
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		struct address_space *mapping = file->f_mapping;
+		loff_t unmap_start = round_up(offset, PAGE_SIZE);
+		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+
+		if ((u64)unmap_end > (u64)unmap_start)
+			unmap_mapping_range(mapping, unmap_start,
+					    1 + unmap_end - unmap_start, 0);
+		shmem_truncate_range(inode, offset, offset + len - 1);
+		/* No need to unmap again: hole-punching leaves COWed pages */
+		error = 0;
+		goto out;
+	}
+
+	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+	error = inode_newsize_ok(inode, offset + len);
+	if (error)
+		goto out;
+
+	start = offset >> PAGE_CACHE_SHIFT;
+	end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	/* Try to avoid a swapstorm if len is impossible to satisfy */
+	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
+		error = -ENOSPC;
+		goto out;
+	}
+
+	shmem_falloc.start = start;
+	shmem_falloc.next  = start;
+	shmem_falloc.nr_falloced = 0;
+	shmem_falloc.nr_unswapped = 0;
+	spin_lock(&inode->i_lock);
+	inode->i_private = &shmem_falloc;
+	spin_unlock(&inode->i_lock);
+
+	for (index = start; index < end; index++) {
+		struct page *page;
+
+		/*
+		 * Good, the fallocate(2) manpage permits EINTR: we may have
+		 * been interrupted because we are using up too much memory.
+		 */
+		if (signal_pending(current))
+			error = -EINTR;
+		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
+			error = -ENOMEM;
+		else
+			error = shmem_getpage(inode, index, &page, SGP_FALLOC,
+									NULL);
+		if (error) {
+			/* Remove the !PageUptodate pages we added */
+			shmem_undo_range(inode,
+				(loff_t)start << PAGE_CACHE_SHIFT,
+				(loff_t)index << PAGE_CACHE_SHIFT, true);
+			goto undone;
+		}
+
+		/*
+		 * Inform shmem_writepage() how far we have reached.
+		 * No need for lock or barrier: we have the page lock.
+		 */
+		shmem_falloc.next++;
+		if (!PageUptodate(page))
+			shmem_falloc.nr_falloced++;
+
+		/*
+		 * If !PageUptodate, leave it that way so that freeable pages
+		 * can be recognized if we need to rollback on error later.
+		 * But set_page_dirty so that memory pressure will swap rather
+		 * than free the pages we are allocating (and SGP_CACHE pages
+		 * might still be clean: we now need to mark those dirty too).
+		 */
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+		cond_resched();
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+		i_size_write(inode, offset + len);
+	inode->i_ctime = CURRENT_TIME;
+undone:
+	spin_lock(&inode->i_lock);
+	inode->i_private = NULL;
+	spin_unlock(&inode->i_lock);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return error;
+}
+
 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -1665,6 +2070,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 		kaddr = kmap_atomic(page);
 		memcpy(kaddr, symname, len);
 		kunmap_atomic(kaddr);
+		SetPageUptodate(page);
 		set_page_dirty(page);
 		unlock_page(page);
 		page_cache_release(page);
@@ -2270,6 +2676,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	}
 	sb->s_export_op = &shmem_export_ops;
+	sb->s_flags |= MS_NOSEC;
 #else
 	sb->s_flags |= MS_NOUSER;
 #endif
@@ -2364,7 +2771,7 @@ static const struct address_space_operations shmem_aops = {
 static const struct file_operations shmem_file_operations = {
 	.mmap		= shmem_mmap,
 #ifdef CONFIG_TMPFS
-	.llseek		= generic_file_llseek,
+	.llseek		= shmem_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= shmem_file_aio_read,
@@ -2372,12 +2779,12 @@ static const struct file_operations shmem_file_operations = {
 	.fsync		= noop_fsync,
 	.splice_read	= shmem_file_splice_read,
 	.splice_write	= generic_file_splice_write,
+	.fallocate	= shmem_fallocate,
 #endif
 };
 
 static const struct inode_operations shmem_inode_operations = {
 	.setattr	= shmem_setattr,
-	.truncate_range	= shmem_truncate_range,
 #ifdef CONFIG_TMPFS_XATTR
 	.setxattr	= shmem_setxattr,
 	.getxattr	= shmem_getxattr,
diff --git a/mm/sparse.c b/mm/sparse.c
index a8bc7d364deb..6a4bf9160e85 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -273,10 +273,10 @@ static unsigned long *__kmalloc_section_usemap(void)
 #ifdef CONFIG_MEMORY_HOTREMOVE
 static unsigned long * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
-					 unsigned long count)
+					 unsigned long size)
 {
-	unsigned long section_nr;
-
+	pg_data_t *host_pgdat;
+	unsigned long goal;
 	/*
 	 * A page may contain usemaps for other sections preventing the
 	 * page being freed and making a section unremovable while
@@ -287,8 +287,10 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 	 * from the same section as the pgdat where possible to avoid
 	 * this problem.
 	 */
-	section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
-	return alloc_bootmem_section(usemap_size() * count, section_nr);
+	goal = __pa(pgdat) & PAGE_SECTION_MASK;
+	host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT));
+	return __alloc_bootmem_node_nopanic(host_pgdat, size,
+					    SMP_CACHE_BYTES, goal);
 }
 
 static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -332,9 +334,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
 #else
 static unsigned long * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
-					 unsigned long count)
+					 unsigned long size)
 {
-	return NULL;
+	return alloc_bootmem_node_nopanic(pgdat, size);
 }
 
 static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -352,13 +354,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
 	int size = usemap_size();
 
 	usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
-								 usemap_count);
+							  size * usemap_count);
 	if (!usemap) {
-		usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
-		if (!usemap) {
-			printk(KERN_WARNING "%s: allocation failed\n", __func__);
-			return;
-		}
+		printk(KERN_WARNING "%s: allocation failed\n", __func__);
+		return;
 	}
 
 	for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
diff --git a/mm/swap.c b/mm/swap.c
index 5c13f1338972..4e7e2ec67078 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,13 +47,15 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
 static void __page_cache_release(struct page *page)
 {
 	if (PageLRU(page)) {
-		unsigned long flags;
 		struct zone *zone = page_zone(page);
+		struct lruvec *lruvec;
+		unsigned long flags;
 
 		spin_lock_irqsave(&zone->lru_lock, flags);
+		lruvec = mem_cgroup_page_lruvec(page, zone);
 		VM_BUG_ON(!PageLRU(page));
 		__ClearPageLRU(page);
-		del_page_from_lru_list(zone, page, page_off_lru(page));
+		del_page_from_lru_list(page, lruvec, page_off_lru(page));
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
 }
@@ -82,6 +84,25 @@ static void put_compound_page(struct page *page)
 		if (likely(page != page_head &&
 			   get_page_unless_zero(page_head))) {
 			unsigned long flags;
+
+			/*
+			 * THP can not break up slab pages so avoid taking
+			 * compound_lock().  Slab performs non-atomic bit ops
+			 * on page->flags for better performance.  In particular
+			 * slab_unlock() in slub used to be a hot path.  It is
+			 * still hot on arches that do not support
+			 * this_cpu_cmpxchg_double().
+			 */
+			if (PageSlab(page_head)) {
+				if (PageTail(page)) {
+					if (put_page_testzero(page_head))
+						VM_BUG_ON(1);
+
+					atomic_dec(&page->_mapcount);
+					goto skip_lock_tail;
+				} else
+					goto skip_lock;
+			}
 			/*
 			 * page_head wasn't a dangling pointer but it
 			 * may not be a head page anymore by the time
@@ -92,10 +113,10 @@ static void put_compound_page(struct page *page)
 			if (unlikely(!PageTail(page))) {
 				/* __split_huge_page_refcount run before us */
 				compound_unlock_irqrestore(page_head, flags);
-				VM_BUG_ON(PageHead(page_head));
+skip_lock:
 				if (put_page_testzero(page_head))
 					__put_single_page(page_head);
-			out_put_single:
+out_put_single:
 				if (put_page_testzero(page))
 					__put_single_page(page);
 				return;
@@ -115,6 +136,8 @@ static void put_compound_page(struct page *page)
 			VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
 			VM_BUG_ON(atomic_read(&page->_count) != 0);
 			compound_unlock_irqrestore(page_head, flags);
+
+skip_lock_tail:
 			if (put_page_testzero(page_head)) {
 				if (PageHead(page_head))
 					__put_compound_page(page_head);
@@ -162,6 +185,18 @@ bool __get_page_tail(struct page *page)
 	struct page *page_head = compound_trans_head(page);
 
 	if (likely(page != page_head && get_page_unless_zero(page_head))) {
+
+		/* Ref to put_compound_page() comment. */
+		if (PageSlab(page_head)) {
+			if (likely(PageTail(page))) {
+				__get_page_tail_foll(page, false);
+				return true;
+			} else {
+				put_page(page_head);
+				return false;
+			}
+		}
+
 		/*
 		 * page_head wasn't a dangling pointer but it
 		 * may not be a head page anymore by the time
@@ -202,11 +237,12 @@ void put_pages_list(struct list_head *pages)
 EXPORT_SYMBOL(put_pages_list);
 
 static void pagevec_lru_move_fn(struct pagevec *pvec,
-				void (*move_fn)(struct page *page, void *arg),
-				void *arg)
+	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
+	void *arg)
 {
 	int i;
 	struct zone *zone = NULL;
+	struct lruvec *lruvec;
 	unsigned long flags = 0;
 
 	for (i = 0; i < pagevec_count(pvec); i++) {
@@ -220,7 +256,8 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
 			spin_lock_irqsave(&zone->lru_lock, flags);
 		}
 
-		(*move_fn)(page, arg);
+		lruvec = mem_cgroup_page_lruvec(page, zone);
+		(*move_fn)(page, lruvec, arg);
 	}
 	if (zone)
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -228,16 +265,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
 	pagevec_reinit(pvec);
 }
 
-static void pagevec_move_tail_fn(struct page *page, void *arg)
+static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
+				 void *arg)
 {
 	int *pgmoved = arg;
 
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 		enum lru_list lru = page_lru_base_type(page);
-		struct lruvec *lruvec;
-
-		lruvec = mem_cgroup_lru_move_lists(page_zone(page),
-						   page, lru, lru);
 		list_move_tail(&page->lru, &lruvec->lists[lru]);
 		(*pgmoved)++;
 	}
@@ -276,41 +310,30 @@ void rotate_reclaimable_page(struct page *page)
 	}
 }
 
-static void update_page_reclaim_stat(struct zone *zone, struct page *page,
+static void update_page_reclaim_stat(struct lruvec *lruvec,
 				     int file, int rotated)
 {
-	struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat;
-	struct zone_reclaim_stat *memcg_reclaim_stat;
-
-	memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
+	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 
 	reclaim_stat->recent_scanned[file]++;
 	if (rotated)
 		reclaim_stat->recent_rotated[file]++;
-
-	if (!memcg_reclaim_stat)
-		return;
-
-	memcg_reclaim_stat->recent_scanned[file]++;
-	if (rotated)
-		memcg_reclaim_stat->recent_rotated[file]++;
 }
 
-static void __activate_page(struct page *page, void *arg)
+static void __activate_page(struct page *page, struct lruvec *lruvec,
+			    void *arg)
 {
-	struct zone *zone = page_zone(page);
-
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 		int file = page_is_file_cache(page);
 		int lru = page_lru_base_type(page);
-		del_page_from_lru_list(zone, page, lru);
 
+		del_page_from_lru_list(page, lruvec, lru);
 		SetPageActive(page);
 		lru += LRU_ACTIVE;
-		add_page_to_lru_list(zone, page, lru);
-		__count_vm_event(PGACTIVATE);
+		add_page_to_lru_list(page, lruvec, lru);
 
-		update_page_reclaim_stat(zone, page, file, 1);
+		__count_vm_event(PGACTIVATE);
+		update_page_reclaim_stat(lruvec, file, 1);
 	}
 }
 
@@ -347,7 +370,7 @@ void activate_page(struct page *page)
 	struct zone *zone = page_zone(page);
 
 	spin_lock_irq(&zone->lru_lock);
-	__activate_page(page, NULL);
+	__activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
 	spin_unlock_irq(&zone->lru_lock);
 }
 #endif
@@ -414,11 +437,13 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru)
 void add_page_to_unevictable_list(struct page *page)
 {
 	struct zone *zone = page_zone(page);
+	struct lruvec *lruvec;
 
 	spin_lock_irq(&zone->lru_lock);
+	lruvec = mem_cgroup_page_lruvec(page, zone);
 	SetPageUnevictable(page);
 	SetPageLRU(page);
-	add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
+	add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
 	spin_unlock_irq(&zone->lru_lock);
 }
 
@@ -443,11 +468,11 @@ void add_page_to_unevictable_list(struct page *page)
  * be write it out by flusher threads as this is much more effective
  * than the single-page writeout from reclaim.
  */
-static void lru_deactivate_fn(struct page *page, void *arg)
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+			      void *arg)
 {
 	int lru, file;
 	bool active;
-	struct zone *zone = page_zone(page);
 
 	if (!PageLRU(page))
 		return;
@@ -460,13 +485,13 @@ static void lru_deactivate_fn(struct page *page, void *arg)
 		return;
 
 	active = PageActive(page);
-
 	file = page_is_file_cache(page);
 	lru = page_lru_base_type(page);
-	del_page_from_lru_list(zone, page, lru + active);
+
+	del_page_from_lru_list(page, lruvec, lru + active);
 	ClearPageActive(page);
 	ClearPageReferenced(page);
-	add_page_to_lru_list(zone, page, lru);
+	add_page_to_lru_list(page, lruvec, lru);
 
 	if (PageWriteback(page) || PageDirty(page)) {
 		/*
@@ -476,19 +501,17 @@ static void lru_deactivate_fn(struct page *page, void *arg)
 		 */
 		SetPageReclaim(page);
 	} else {
-		struct lruvec *lruvec;
 		/*
 		 * The page's writeback ends up during pagevec
 		 * We moves tha page into tail of inactive.
 		 */
-		lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru);
 		list_move_tail(&page->lru, &lruvec->lists[lru]);
 		__count_vm_event(PGROTATED);
 	}
 
 	if (active)
 		__count_vm_event(PGDEACTIVATE);
-	update_page_reclaim_stat(zone, page, file, 0);
+	update_page_reclaim_stat(lruvec, file, 0);
 }
 
 /*
@@ -588,6 +611,7 @@ void release_pages(struct page **pages, int nr, int cold)
 	int i;
 	LIST_HEAD(pages_to_free);
 	struct zone *zone = NULL;
+	struct lruvec *lruvec;
 	unsigned long uninitialized_var(flags);
 
 	for (i = 0; i < nr; i++) {
@@ -615,9 +639,11 @@ void release_pages(struct page **pages, int nr, int cold)
 				zone = pagezone;
 				spin_lock_irqsave(&zone->lru_lock, flags);
 			}
+
+			lruvec = mem_cgroup_page_lruvec(page, zone);
 			VM_BUG_ON(!PageLRU(page));
 			__ClearPageLRU(page);
-			del_page_from_lru_list(zone, page, page_off_lru(page));
+			del_page_from_lru_list(page, lruvec, page_off_lru(page));
 		}
 
 		list_add(&page->lru, &pages_to_free);
@@ -649,8 +675,8 @@ EXPORT_SYMBOL(__pagevec_release);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /* used by __split_huge_page_refcount() */
-void lru_add_page_tail(struct zone* zone,
-		       struct page *page, struct page *page_tail)
+void lru_add_page_tail(struct page *page, struct page *page_tail,
+		       struct lruvec *lruvec)
 {
 	int uninitialized_var(active);
 	enum lru_list lru;
@@ -659,7 +685,8 @@ void lru_add_page_tail(struct zone* zone,
 	VM_BUG_ON(!PageHead(page));
 	VM_BUG_ON(PageCompound(page_tail));
 	VM_BUG_ON(PageLRU(page_tail));
-	VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock));
+	VM_BUG_ON(NR_CPUS != 1 &&
+		  !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
 
 	SetPageLRU(page_tail);
 
@@ -688,20 +715,20 @@ void lru_add_page_tail(struct zone* zone,
 		 * Use the standard add function to put page_tail on the list,
 		 * but then correct its position so they all end up in order.
 		 */
-		add_page_to_lru_list(zone, page_tail, lru);
+		add_page_to_lru_list(page_tail, lruvec, lru);
 		list_head = page_tail->lru.prev;
 		list_move_tail(&page_tail->lru, list_head);
 	}
 
 	if (!PageUnevictable(page))
-		update_page_reclaim_stat(zone, page_tail, file, active);
+		update_page_reclaim_stat(lruvec, file, active);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-static void __pagevec_lru_add_fn(struct page *page, void *arg)
+static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
+				 void *arg)
 {
 	enum lru_list lru = (enum lru_list)arg;
-	struct zone *zone = page_zone(page);
 	int file = is_file_lru(lru);
 	int active = is_active_lru(lru);
 
@@ -712,8 +739,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg)
 	SetPageLRU(page);
 	if (active)
 		SetPageActive(page);
-	add_page_to_lru_list(zone, page, lru);
-	update_page_reclaim_stat(zone, page, file, active);
+	add_page_to_lru_list(page, lruvec, lru);
+	update_page_reclaim_stat(lruvec, file, active);
 }
 
 /*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fafc26d1b1dc..457b10baef59 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -601,7 +601,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
  * This does not give an exact answer when swap count is continued,
  * but does include the high COUNT_CONTINUED flag to allow for that.
  */
-static inline int page_swapcount(struct page *page)
+int page_swapcount(struct page *page)
 {
 	int count = 0;
 	struct swap_info_struct *p;
@@ -717,37 +717,6 @@ int free_swap_and_cache(swp_entry_t entry)
 	return p != NULL;
 }
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-/**
- * mem_cgroup_count_swap_user - count the user of a swap entry
- * @ent: the swap entry to be checked
- * @pagep: the pointer for the swap cache page of the entry to be stored
- *
- * Returns the number of the user of the swap entry. The number is valid only
- * for swaps of anonymous pages.
- * If the entry is found on swap cache, the page is stored to pagep with
- * refcount of it being incremented.
- */
-int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
-{
-	struct page *page;
-	struct swap_info_struct *p;
-	int count = 0;
-
-	page = find_get_page(&swapper_space, ent.val);
-	if (page)
-		count += page_mapcount(page);
-	p = swap_info_get(ent);
-	if (p) {
-		count += swap_count(p->swap_map[swp_offset(ent)]);
-		spin_unlock(&swap_lock);
-	}
-
-	*pagep = page;
-	return count;
-}
-#endif
-
 #ifdef CONFIG_HIBERNATION
 /*
  * Find the swap type that corresponds to given device (if any).
diff --git a/mm/thrash.c b/mm/thrash.c
deleted file mode 100644
index 57ad495dbd54..000000000000
--- a/mm/thrash.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * mm/thrash.c
- *
- * Copyright (C) 2004, Red Hat, Inc.
- * Copyright (C) 2004, Rik van Riel <[email protected]>
- * Released under the GPL, see the file COPYING for details.
- *
- * Simple token based thrashing protection, using the algorithm
- * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
- *
- * Sep 2006, Ashwin Chaugule <[email protected]>
- * Improved algorithm to pass token:
- * Each task has a priority which is incremented if it contended
- * for the token in an interval less than its previous attempt.
- * If the token is acquired, that task's priority is boosted to prevent
- * the token from bouncing around too often and to let the task make
- * some progress in its execution.
- */
-
-#include <linux/jiffies.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/swap.h>
-#include <linux/memcontrol.h>
-
-#include <trace/events/vmscan.h>
-
-#define TOKEN_AGING_INTERVAL	(0xFF)
-
-static DEFINE_SPINLOCK(swap_token_lock);
-struct mm_struct *swap_token_mm;
-static struct mem_cgroup *swap_token_memcg;
-
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = try_get_mem_cgroup_from_mm(mm);
-	if (memcg)
-		css_put(mem_cgroup_css(memcg));
-
-	return memcg;
-}
-#else
-static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
-{
-	return NULL;
-}
-#endif
-
-void grab_swap_token(struct mm_struct *mm)
-{
-	int current_interval;
-	unsigned int old_prio = mm->token_priority;
-	static unsigned int global_faults;
-	static unsigned int last_aging;
-
-	global_faults++;
-
-	current_interval = global_faults - mm->faultstamp;
-
-	if (!spin_trylock(&swap_token_lock))
-		return;
-
-	/* First come first served */
-	if (!swap_token_mm)
-		goto replace_token;
-
-	/*
-	 * Usually, we don't need priority aging because long interval faults
-	 * makes priority decrease quickly. But there is one exception. If the
-	 * token owner task is sleeping, it never make long interval faults.
-	 * Thus, we need a priority aging mechanism instead. The requirements
-	 * of priority aging are
-	 *  1) An aging interval is reasonable enough long. Too short aging
-	 *     interval makes quick swap token lost and decrease performance.
-	 *  2) The swap token owner task have to get priority aging even if
-	 *     it's under sleep.
-	 */
-	if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
-		swap_token_mm->token_priority /= 2;
-		last_aging = global_faults;
-	}
-
-	if (mm == swap_token_mm) {
-		mm->token_priority += 2;
-		goto update_priority;
-	}
-
-	if (current_interval < mm->last_interval)
-		mm->token_priority++;
-	else {
-		if (likely(mm->token_priority > 0))
-			mm->token_priority--;
-	}
-
-	/* Check if we deserve the token */
-	if (mm->token_priority > swap_token_mm->token_priority)
-		goto replace_token;
-
-update_priority:
-	trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
-
-out:
-	mm->faultstamp = global_faults;
-	mm->last_interval = current_interval;
-	spin_unlock(&swap_token_lock);
-	return;
-
-replace_token:
-	mm->token_priority += 2;
-	trace_replace_swap_token(swap_token_mm, mm);
-	swap_token_mm = mm;
-	swap_token_memcg = swap_token_memcg_from_mm(mm);
-	last_aging = global_faults;
-	goto out;
-}
-
-/* Called on process exit. */
-void __put_swap_token(struct mm_struct *mm)
-{
-	spin_lock(&swap_token_lock);
-	if (likely(mm == swap_token_mm)) {
-		trace_put_swap_token(swap_token_mm);
-		swap_token_mm = NULL;
-		swap_token_memcg = NULL;
-	}
-	spin_unlock(&swap_token_lock);
-}
-
-static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
-{
-	if (!a)
-		return true;
-	if (!b)
-		return true;
-	if (a == b)
-		return true;
-	return false;
-}
-
-void disable_swap_token(struct mem_cgroup *memcg)
-{
-	/* memcg reclaim don't disable unrelated mm token. */
-	if (match_memcg(memcg, swap_token_memcg)) {
-		spin_lock(&swap_token_lock);
-		if (match_memcg(memcg, swap_token_memcg)) {
-			trace_disable_swap_token(swap_token_mm);
-			swap_token_mm = NULL;
-			swap_token_memcg = NULL;
-		}
-		spin_unlock(&swap_token_lock);
-	}
-}
diff --git a/mm/truncate.c b/mm/truncate.c
index 61a183b89df6..75801acdaac7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize)
 }
 EXPORT_SYMBOL(vmtruncate);
 
-int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
-{
-	struct address_space *mapping = inode->i_mapping;
-	loff_t holebegin = round_up(lstart, PAGE_SIZE);
-	loff_t holelen = 1 + lend - holebegin;
-
-	/*
-	 * If the underlying filesystem is not going to provide
-	 * a way to truncate a range of blocks (punch a hole) -
-	 * we should return failure right now.
-	 */
-	if (!inode->i_op->truncate_range)
-		return -ENOSYS;
-
-	mutex_lock(&inode->i_mutex);
-	inode_dio_wait(inode);
-	unmap_mapping_range(mapping, holebegin, holelen, 1);
-	inode->i_op->truncate_range(inode, lstart, lend);
-	/* unmap again to remove racily COWed private pages */
-	unmap_mapping_range(mapping, holebegin, holelen, 1);
-	mutex_unlock(&inode->i_mutex);
-
-	return 0;
-}
-
 /**
  * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
  * @inode: inode
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 94dff883b449..2aad49981b57 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1185,9 +1185,10 @@ void __init vmalloc_init(void)
 	/* Import existing vmlist entries. */
 	for (tmp = vmlist; tmp; tmp = tmp->next) {
 		va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
-		va->flags = tmp->flags | VM_VM_AREA;
+		va->flags = VM_VM_AREA;
 		va->va_start = (unsigned long)tmp->addr;
 		va->va_end = va->va_start + tmp->size;
+		va->vm = tmp;
 		__insert_vmap_area(va);
 	}
 
@@ -2375,8 +2376,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 		return NULL;
 	}
 
-	vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
-	vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
+	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
+	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
 	if (!vas || !vms)
 		goto err_free2;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33dc256033b5..eeb3bc9d1d36 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,24 +53,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/vmscan.h>
 
-/*
- * reclaim_mode determines how the inactive list is shrunk
- * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
- * RECLAIM_MODE_ASYNC:  Do not block
- * RECLAIM_MODE_SYNC:   Allow blocking e.g. call wait_on_page_writeback
- * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
- *			page from the LRU and reclaim all pages within a
- *			naturally aligned range
- * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
- *			order-0 pages and then compact the zone
- */
-typedef unsigned __bitwise__ reclaim_mode_t;
-#define RECLAIM_MODE_SINGLE		((__force reclaim_mode_t)0x01u)
-#define RECLAIM_MODE_ASYNC		((__force reclaim_mode_t)0x02u)
-#define RECLAIM_MODE_SYNC		((__force reclaim_mode_t)0x04u)
-#define RECLAIM_MODE_LUMPYRECLAIM	((__force reclaim_mode_t)0x08u)
-#define RECLAIM_MODE_COMPACTION		((__force reclaim_mode_t)0x10u)
-
 struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
@@ -96,11 +78,8 @@ struct scan_control {
 
 	int order;
 
-	/*
-	 * Intend to reclaim enough continuous memory rather than reclaim
-	 * enough amount of memory. i.e, mode for high order allocation.
-	 */
-	reclaim_mode_t reclaim_mode;
+	/* Scan (total_size >> priority) pages at once */
+	int priority;
 
 	/*
 	 * The memory cgroup that hit its limit and as a result is the
@@ -115,11 +94,6 @@ struct scan_control {
 	nodemask_t	*nodemask;
 };
 
-struct mem_cgroup_zone {
-	struct mem_cgroup *mem_cgroup;
-	struct zone *zone;
-};
-
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 
 #ifdef ARCH_HAS_PREFETCH
@@ -164,44 +138,21 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
-
-static bool scanning_global_lru(struct mem_cgroup_zone *mz)
-{
-	return !mz->mem_cgroup;
-}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
-
-static bool scanning_global_lru(struct mem_cgroup_zone *mz)
-{
-	return true;
-}
 #endif
 
-static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
-{
-	if (!scanning_global_lru(mz))
-		return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
-
-	return &mz->zone->reclaim_stat;
-}
-
-static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
-				       enum lru_list lru)
+static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
-	if (!scanning_global_lru(mz))
-		return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
-						    zone_to_nid(mz->zone),
-						    zone_idx(mz->zone),
-						    BIT(lru));
+	if (!mem_cgroup_disabled())
+		return mem_cgroup_get_lru_size(lruvec, lru);
 
-	return zone_page_state(mz->zone, NR_LRU_BASE + lru);
+	return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
 }
 
-
 /*
  * Add a shrinker callback to be called from the vm
  */
@@ -364,39 +315,6 @@ out:
 	return ret;
 }
 
-static void set_reclaim_mode(int priority, struct scan_control *sc,
-				   bool sync)
-{
-	reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
-
-	/*
-	 * Initially assume we are entering either lumpy reclaim or
-	 * reclaim/compaction.Depending on the order, we will either set the
-	 * sync mode or just reclaim order-0 pages later.
-	 */
-	if (COMPACTION_BUILD)
-		sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
-	else
-		sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
-
-	/*
-	 * Avoid using lumpy reclaim or reclaim/compaction if possible by
-	 * restricting when its set to either costly allocations or when
-	 * under memory pressure
-	 */
-	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-		sc->reclaim_mode |= syncmode;
-	else if (sc->order && priority < DEF_PRIORITY - 2)
-		sc->reclaim_mode |= syncmode;
-	else
-		sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
-}
-
-static void reset_reclaim_mode(struct scan_control *sc)
-{
-	sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
-}
-
 static inline int is_page_cache_freeable(struct page *page)
 {
 	/*
@@ -416,10 +334,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
 		return 1;
 	if (bdi == current->backing_dev_info)
 		return 1;
-
-	/* lumpy reclaim for hugepage often need a lot of write */
-	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-		return 1;
 	return 0;
 }
 
@@ -523,8 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 			/* synchronous write or broken a_ops? */
 			ClearPageReclaim(page);
 		}
-		trace_mm_vmscan_writepage(page,
-			trace_reclaim_flags(page, sc->reclaim_mode));
+		trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
 		inc_zone_page_state(page, NR_VMSCAN_WRITE);
 		return PAGE_SUCCESS;
 	}
@@ -701,19 +614,15 @@ enum page_references {
 };
 
 static enum page_references page_check_references(struct page *page,
-						  struct mem_cgroup_zone *mz,
 						  struct scan_control *sc)
 {
 	int referenced_ptes, referenced_page;
 	unsigned long vm_flags;
 
-	referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
+	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
+					  &vm_flags);
 	referenced_page = TestClearPageReferenced(page);
 
-	/* Lumpy reclaim - ignore references */
-	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
-		return PAGEREF_RECLAIM;
-
 	/*
 	 * Mlock lost the isolation race with us.  Let try_to_unmap()
 	 * move the page to the unevictable list.
@@ -722,7 +631,7 @@ static enum page_references page_check_references(struct page *page,
 		return PAGEREF_RECLAIM;
 
 	if (referenced_ptes) {
-		if (PageAnon(page))
+		if (PageSwapBacked(page))
 			return PAGEREF_ACTIVATE;
 		/*
 		 * All mapped pages start out with page table
@@ -763,9 +672,8 @@ static enum page_references page_check_references(struct page *page,
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
-				      struct mem_cgroup_zone *mz,
+				      struct zone *zone,
 				      struct scan_control *sc,
-				      int priority,
 				      unsigned long *ret_nr_dirty,
 				      unsigned long *ret_nr_writeback)
 {
@@ -794,7 +702,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			goto keep;
 
 		VM_BUG_ON(PageActive(page));
-		VM_BUG_ON(page_zone(page) != mz->zone);
+		VM_BUG_ON(page_zone(page) != zone);
 
 		sc->nr_scanned++;
 
@@ -813,22 +721,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		if (PageWriteback(page)) {
 			nr_writeback++;
-			/*
-			 * Synchronous reclaim cannot queue pages for
-			 * writeback due to the possibility of stack overflow
-			 * but if it encounters a page under writeback, wait
-			 * for the IO to complete.
-			 */
-			if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
-			    may_enter_fs)
-				wait_on_page_writeback(page);
-			else {
-				unlock_page(page);
-				goto keep_lumpy;
-			}
+			unlock_page(page);
+			goto keep;
 		}
 
-		references = page_check_references(page, mz, sc);
+		references = page_check_references(page, sc);
 		switch (references) {
 		case PAGEREF_ACTIVATE:
 			goto activate_locked;
@@ -879,7 +776,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			 * unless under significant pressure.
 			 */
 			if (page_is_file_cache(page) &&
-					(!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+					(!current_is_kswapd() ||
+					 sc->priority >= DEF_PRIORITY - 2)) {
 				/*
 				 * Immediately reclaim when written back.
 				 * Similar in principal to deactivate_page()
@@ -908,7 +806,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto activate_locked;
 			case PAGE_SUCCESS:
 				if (PageWriteback(page))
-					goto keep_lumpy;
+					goto keep;
 				if (PageDirty(page))
 					goto keep;
 
@@ -994,7 +892,6 @@ cull_mlocked:
 			try_to_free_swap(page);
 		unlock_page(page);
 		putback_lru_page(page);
-		reset_reclaim_mode(sc);
 		continue;
 
 activate_locked:
@@ -1007,8 +904,6 @@ activate_locked:
 keep_locked:
 		unlock_page(page);
 keep:
-		reset_reclaim_mode(sc);
-keep_lumpy:
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
@@ -1020,7 +915,7 @@ keep_lumpy:
 	 * will encounter the same problem
 	 */
 	if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
-		zone_set_flag(mz->zone, ZONE_CONGESTED);
+		zone_set_flag(zone, ZONE_CONGESTED);
 
 	free_hot_cold_page_list(&free_pages, 1);
 
@@ -1041,34 +936,15 @@ keep_lumpy:
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 {
-	bool all_lru_mode;
 	int ret = -EINVAL;
 
 	/* Only take pages on the LRU. */
 	if (!PageLRU(page))
 		return ret;
 
-	all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
-		(ISOLATE_ACTIVE|ISOLATE_INACTIVE);
-
-	/*
-	 * When checking the active state, we need to be sure we are
-	 * dealing with comparible boolean values.  Take the logical not
-	 * of each.
-	 */
-	if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
-		return ret;
-
-	if (!all_lru_mode && !!page_is_file_cache(page) != file)
-		return ret;
-
-	/*
-	 * When this function is being called for lumpy reclaim, we
-	 * initially look into all LRU pages, active, inactive and
-	 * unevictable; only give shrink_page_list evictable pages.
-	 */
+	/* Do not give back unevictable pages for compaction */
 	if (PageUnevictable(page))
 		return ret;
 
@@ -1135,54 +1011,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
  * Appropriate locks must be held before calling this function.
  *
  * @nr_to_scan:	The number of pages to look through on the list.
- * @mz:		The mem_cgroup_zone to pull pages from.
+ * @lruvec:	The LRU vector to pull pages from.
  * @dst:	The temp list to put pages on to.
  * @nr_scanned:	The number of pages that were scanned.
  * @sc:		The scan_control struct for this reclaim session
  * @mode:	One of the LRU isolation modes
- * @active:	True [1] if isolating active pages
- * @file:	True [1] if isolating file [!anon] pages
+ * @lru:	LRU list id for isolating
  *
  * returns how many pages were moved onto *@dst.
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
-		struct mem_cgroup_zone *mz, struct list_head *dst,
+		struct lruvec *lruvec, struct list_head *dst,
 		unsigned long *nr_scanned, struct scan_control *sc,
-		isolate_mode_t mode, int active, int file)
+		isolate_mode_t mode, enum lru_list lru)
 {
-	struct lruvec *lruvec;
-	struct list_head *src;
+	struct list_head *src = &lruvec->lists[lru];
 	unsigned long nr_taken = 0;
-	unsigned long nr_lumpy_taken = 0;
-	unsigned long nr_lumpy_dirty = 0;
-	unsigned long nr_lumpy_failed = 0;
 	unsigned long scan;
-	int lru = LRU_BASE;
-
-	lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
-	if (active)
-		lru += LRU_ACTIVE;
-	if (file)
-		lru += LRU_FILE;
-	src = &lruvec->lists[lru];
 
 	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 		struct page *page;
-		unsigned long pfn;
-		unsigned long end_pfn;
-		unsigned long page_pfn;
-		int zone_id;
+		int nr_pages;
 
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
 		VM_BUG_ON(!PageLRU(page));
 
-		switch (__isolate_lru_page(page, mode, file)) {
+		switch (__isolate_lru_page(page, mode)) {
 		case 0:
-			mem_cgroup_lru_del(page);
+			nr_pages = hpage_nr_pages(page);
+			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
 			list_move(&page->lru, dst);
-			nr_taken += hpage_nr_pages(page);
+			nr_taken += nr_pages;
 			break;
 
 		case -EBUSY:
@@ -1193,93 +1054,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		default:
 			BUG();
 		}
-
-		if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
-			continue;
-
-		/*
-		 * Attempt to take all pages in the order aligned region
-		 * surrounding the tag page.  Only take those pages of
-		 * the same active state as that tag page.  We may safely
-		 * round the target page pfn down to the requested order
-		 * as the mem_map is guaranteed valid out to MAX_ORDER,
-		 * where that page is in a different zone we will detect
-		 * it from its zone id and abort this block scan.
-		 */
-		zone_id = page_zone_id(page);
-		page_pfn = page_to_pfn(page);
-		pfn = page_pfn & ~((1 << sc->order) - 1);
-		end_pfn = pfn + (1 << sc->order);
-		for (; pfn < end_pfn; pfn++) {
-			struct page *cursor_page;
-
-			/* The target page is in the block, ignore it. */
-			if (unlikely(pfn == page_pfn))
-				continue;
-
-			/* Avoid holes within the zone. */
-			if (unlikely(!pfn_valid_within(pfn)))
-				break;
-
-			cursor_page = pfn_to_page(pfn);
-
-			/* Check that we have not crossed a zone boundary. */
-			if (unlikely(page_zone_id(cursor_page) != zone_id))
-				break;
-
-			/*
-			 * If we don't have enough swap space, reclaiming of
-			 * anon page which don't already have a swap slot is
-			 * pointless.
-			 */
-			if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
-			    !PageSwapCache(cursor_page))
-				break;
-
-			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
-				unsigned int isolated_pages;
-
-				mem_cgroup_lru_del(cursor_page);
-				list_move(&cursor_page->lru, dst);
-				isolated_pages = hpage_nr_pages(cursor_page);
-				nr_taken += isolated_pages;
-				nr_lumpy_taken += isolated_pages;
-				if (PageDirty(cursor_page))
-					nr_lumpy_dirty += isolated_pages;
-				scan++;
-				pfn += isolated_pages - 1;
-			} else {
-				/*
-				 * Check if the page is freed already.
-				 *
-				 * We can't use page_count() as that
-				 * requires compound_head and we don't
-				 * have a pin on the page here. If a
-				 * page is tail, we may or may not
-				 * have isolated the head, so assume
-				 * it's not free, it'd be tricky to
-				 * track the head status without a
-				 * page pin.
-				 */
-				if (!PageTail(cursor_page) &&
-				    !atomic_read(&cursor_page->_count))
-					continue;
-				break;
-			}
-		}
-
-		/* If we break out of the loop above, lumpy reclaim failed */
-		if (pfn < end_pfn)
-			nr_lumpy_failed++;
 	}
 
 	*nr_scanned = scan;
-
-	trace_mm_vmscan_lru_isolate(sc->order,
-			nr_to_scan, scan,
-			nr_taken,
-			nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
-			mode, file);
+	trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
+				    nr_taken, mode, is_file_lru(lru));
 	return nr_taken;
 }
 
@@ -1316,15 +1095,16 @@ int isolate_lru_page(struct page *page)
 
 	if (PageLRU(page)) {
 		struct zone *zone = page_zone(page);
+		struct lruvec *lruvec;
 
 		spin_lock_irq(&zone->lru_lock);
+		lruvec = mem_cgroup_page_lruvec(page, zone);
 		if (PageLRU(page)) {
 			int lru = page_lru(page);
-			ret = 0;
 			get_page(page);
 			ClearPageLRU(page);
-
-			del_page_from_lru_list(zone, page, lru);
+			del_page_from_lru_list(page, lruvec, lru);
+			ret = 0;
 		}
 		spin_unlock_irq(&zone->lru_lock);
 	}
@@ -1357,11 +1137,10 @@ static int too_many_isolated(struct zone *zone, int file,
 }
 
 static noinline_for_stack void
-putback_inactive_pages(struct mem_cgroup_zone *mz,
-		       struct list_head *page_list)
+putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 {
-	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
-	struct zone *zone = mz->zone;
+	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+	struct zone *zone = lruvec_zone(lruvec);
 	LIST_HEAD(pages_to_free);
 
 	/*
@@ -1379,9 +1158,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
 			spin_lock_irq(&zone->lru_lock);
 			continue;
 		}
+
+		lruvec = mem_cgroup_page_lruvec(page, zone);
+
 		SetPageLRU(page);
 		lru = page_lru(page);
-		add_page_to_lru_list(zone, page, lru);
+		add_page_to_lru_list(page, lruvec, lru);
+
 		if (is_active_lru(lru)) {
 			int file = is_file_lru(lru);
 			int numpages = hpage_nr_pages(page);
@@ -1390,7 +1173,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
 		if (put_page_testzero(page)) {
 			__ClearPageLRU(page);
 			__ClearPageActive(page);
-			del_page_from_lru_list(zone, page, lru);
+			del_page_from_lru_list(page, lruvec, lru);
 
 			if (unlikely(PageCompound(page))) {
 				spin_unlock_irq(&zone->lru_lock);
@@ -1407,112 +1190,24 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
 	list_splice(&pages_to_free, page_list);
 }
 
-static noinline_for_stack void
-update_isolated_counts(struct mem_cgroup_zone *mz,
-		       struct list_head *page_list,
-		       unsigned long *nr_anon,
-		       unsigned long *nr_file)
-{
-	struct zone *zone = mz->zone;
-	unsigned int count[NR_LRU_LISTS] = { 0, };
-	unsigned long nr_active = 0;
-	struct page *page;
-	int lru;
-
-	/*
-	 * Count pages and clear active flags
-	 */
-	list_for_each_entry(page, page_list, lru) {
-		int numpages = hpage_nr_pages(page);
-		lru = page_lru_base_type(page);
-		if (PageActive(page)) {
-			lru += LRU_ACTIVE;
-			ClearPageActive(page);
-			nr_active += numpages;
-		}
-		count[lru] += numpages;
-	}
-
-	preempt_disable();
-	__count_vm_events(PGDEACTIVATE, nr_active);
-
-	__mod_zone_page_state(zone, NR_ACTIVE_FILE,
-			      -count[LRU_ACTIVE_FILE]);
-	__mod_zone_page_state(zone, NR_INACTIVE_FILE,
-			      -count[LRU_INACTIVE_FILE]);
-	__mod_zone_page_state(zone, NR_ACTIVE_ANON,
-			      -count[LRU_ACTIVE_ANON]);
-	__mod_zone_page_state(zone, NR_INACTIVE_ANON,
-			      -count[LRU_INACTIVE_ANON]);
-
-	*nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
-	*nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-
-	__mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
-	__mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
-	preempt_enable();
-}
-
-/*
- * Returns true if a direct reclaim should wait on pages under writeback.
- *
- * If we are direct reclaiming for contiguous pages and we do not reclaim
- * everything in the list, try again and wait for writeback IO to complete.
- * This will stall high-order allocations noticeably. Only do that when really
- * need to free the pages under high memory pressure.
- */
-static inline bool should_reclaim_stall(unsigned long nr_taken,
-					unsigned long nr_freed,
-					int priority,
-					struct scan_control *sc)
-{
-	int lumpy_stall_priority;
-
-	/* kswapd should not stall on sync IO */
-	if (current_is_kswapd())
-		return false;
-
-	/* Only stall on lumpy reclaim */
-	if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
-		return false;
-
-	/* If we have reclaimed everything on the isolated list, no stall */
-	if (nr_freed == nr_taken)
-		return false;
-
-	/*
-	 * For high-order allocations, there are two stall thresholds.
-	 * High-cost allocations stall immediately where as lower
-	 * order allocations such as stacks require the scanning
-	 * priority to be much higher before stalling.
-	 */
-	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-		lumpy_stall_priority = DEF_PRIORITY;
-	else
-		lumpy_stall_priority = DEF_PRIORITY / 3;
-
-	return priority <= lumpy_stall_priority;
-}
-
 /*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
  */
 static noinline_for_stack unsigned long
-shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
-		     struct scan_control *sc, int priority, int file)
+shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
+		     struct scan_control *sc, enum lru_list lru)
 {
 	LIST_HEAD(page_list);
 	unsigned long nr_scanned;
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_taken;
-	unsigned long nr_anon;
-	unsigned long nr_file;
 	unsigned long nr_dirty = 0;
 	unsigned long nr_writeback = 0;
-	isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
-	struct zone *zone = mz->zone;
-	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+	isolate_mode_t isolate_mode = 0;
+	int file = is_file_lru(lru);
+	struct zone *zone = lruvec_zone(lruvec);
+	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 
 	while (unlikely(too_many_isolated(zone, file, sc))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,10 +1217,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 			return SWAP_CLUSTER_MAX;
 	}
 
-	set_reclaim_mode(priority, sc, false);
-	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
-		isolate_mode |= ISOLATE_ACTIVE;
-
 	lru_add_drain();
 
 	if (!sc->may_unmap)
@@ -1535,38 +1226,30 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 
 	spin_lock_irq(&zone->lru_lock);
 
-	nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
-				     sc, isolate_mode, 0, file);
+	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
+				     &nr_scanned, sc, isolate_mode, lru);
+
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
 	if (global_reclaim(sc)) {
 		zone->pages_scanned += nr_scanned;
 		if (current_is_kswapd())
-			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
-					       nr_scanned);
+			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
 		else
-			__count_zone_vm_events(PGSCAN_DIRECT, zone,
-					       nr_scanned);
+			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
 	}
 	spin_unlock_irq(&zone->lru_lock);
 
 	if (nr_taken == 0)
 		return 0;
 
-	update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
-
-	nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
+	nr_reclaimed = shrink_page_list(&page_list, zone, sc,
 						&nr_dirty, &nr_writeback);
 
-	/* Check if we should syncronously wait for writeback */
-	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
-		set_reclaim_mode(priority, sc, true);
-		nr_reclaimed += shrink_page_list(&page_list, mz, sc,
-					priority, &nr_dirty, &nr_writeback);
-	}
-
 	spin_lock_irq(&zone->lru_lock);
 
-	reclaim_stat->recent_scanned[0] += nr_anon;
-	reclaim_stat->recent_scanned[1] += nr_file;
+	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	if (global_reclaim(sc)) {
 		if (current_is_kswapd())
@@ -1577,10 +1260,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 					       nr_reclaimed);
 	}
 
-	putback_inactive_pages(mz, &page_list);
+	putback_inactive_pages(lruvec, &page_list);
 
-	__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
-	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
 
 	spin_unlock_irq(&zone->lru_lock);
 
@@ -1609,14 +1291,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
 	 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
 	 *                     isolated page is PageWriteback
 	 */
-	if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+	if (nr_writeback && nr_writeback >=
+			(nr_taken >> (DEF_PRIORITY - sc->priority)))
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
 
 	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
 		zone_idx(zone),
 		nr_scanned, nr_reclaimed,
-		priority,
-		trace_shrink_flags(file, sc->reclaim_mode));
+		sc->priority,
+		trace_shrink_flags(file));
 	return nr_reclaimed;
 }
 
@@ -1638,30 +1321,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
  * But we had to alter page->flags anyway.
  */
 
-static void move_active_pages_to_lru(struct zone *zone,
+static void move_active_pages_to_lru(struct lruvec *lruvec,
 				     struct list_head *list,
 				     struct list_head *pages_to_free,
 				     enum lru_list lru)
 {
+	struct zone *zone = lruvec_zone(lruvec);
 	unsigned long pgmoved = 0;
 	struct page *page;
+	int nr_pages;
 
 	while (!list_empty(list)) {
-		struct lruvec *lruvec;
-
 		page = lru_to_page(list);
+		lruvec = mem_cgroup_page_lruvec(page, zone);
 
 		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
 
-		lruvec = mem_cgroup_lru_add_list(zone, page, lru);
+		nr_pages = hpage_nr_pages(page);
+		mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
 		list_move(&page->lru, &lruvec->lists[lru]);
-		pgmoved += hpage_nr_pages(page);
+		pgmoved += nr_pages;
 
 		if (put_page_testzero(page)) {
 			__ClearPageLRU(page);
 			__ClearPageActive(page);
-			del_page_from_lru_list(zone, page, lru);
+			del_page_from_lru_list(page, lruvec, lru);
 
 			if (unlikely(PageCompound(page))) {
 				spin_unlock_irq(&zone->lru_lock);
@@ -1677,9 +1362,9 @@ static void move_active_pages_to_lru(struct zone *zone,
 }
 
 static void shrink_active_list(unsigned long nr_to_scan,
-			       struct mem_cgroup_zone *mz,
+			       struct lruvec *lruvec,
 			       struct scan_control *sc,
-			       int priority, int file)
+			       enum lru_list lru)
 {
 	unsigned long nr_taken;
 	unsigned long nr_scanned;
@@ -1688,15 +1373,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	LIST_HEAD(l_active);
 	LIST_HEAD(l_inactive);
 	struct page *page;
-	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 	unsigned long nr_rotated = 0;
-	isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
-	struct zone *zone = mz->zone;
+	isolate_mode_t isolate_mode = 0;
+	int file = is_file_lru(lru);
+	struct zone *zone = lruvec_zone(lruvec);
 
 	lru_add_drain();
 
-	reset_reclaim_mode(sc);
-
 	if (!sc->may_unmap)
 		isolate_mode |= ISOLATE_UNMAPPED;
 	if (!sc->may_writepage)
@@ -1704,18 +1388,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
 
 	spin_lock_irq(&zone->lru_lock);
 
-	nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
-				     isolate_mode, 1, file);
+	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+				     &nr_scanned, sc, isolate_mode, lru);
 	if (global_reclaim(sc))
 		zone->pages_scanned += nr_scanned;
 
 	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	__count_zone_vm_events(PGREFILL, zone, nr_scanned);
-	if (file)
-		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
-	else
-		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
 	spin_unlock_irq(&zone->lru_lock);
 
@@ -1737,7 +1418,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 			}
 		}
 
-		if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
+		if (page_referenced(page, 0, sc->target_mem_cgroup,
+				    &vm_flags)) {
 			nr_rotated += hpage_nr_pages(page);
 			/*
 			 * Identify referenced, file-backed active pages and
@@ -1770,10 +1452,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	 */
 	reclaim_stat->recent_rotated[file] += nr_rotated;
 
-	move_active_pages_to_lru(zone, &l_active, &l_hold,
-						LRU_ACTIVE + file * LRU_FILE);
-	move_active_pages_to_lru(zone, &l_inactive, &l_hold,
-						LRU_BASE   + file * LRU_FILE);
+	move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
+	move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&zone->lru_lock);
 
@@ -1796,13 +1476,12 @@ static int inactive_anon_is_low_global(struct zone *zone)
 
 /**
  * inactive_anon_is_low - check if anonymous pages need to be deactivated
- * @zone: zone to check
- * @sc:   scan control of this context
+ * @lruvec: LRU vector to check
  *
  * Returns true if the zone does not have enough inactive anon pages,
  * meaning some active anon pages need to be deactivated.
  */
-static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
+static int inactive_anon_is_low(struct lruvec *lruvec)
 {
 	/*
 	 * If we don't have swap space, anonymous page deactivation
@@ -1811,14 +1490,13 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
 	if (!total_swap_pages)
 		return 0;
 
-	if (!scanning_global_lru(mz))
-		return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
-						       mz->zone);
+	if (!mem_cgroup_disabled())
+		return mem_cgroup_inactive_anon_is_low(lruvec);
 
-	return inactive_anon_is_low_global(mz->zone);
+	return inactive_anon_is_low_global(lruvec_zone(lruvec));
 }
 #else
-static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
+static inline int inactive_anon_is_low(struct lruvec *lruvec)
 {
 	return 0;
 }
@@ -1836,7 +1514,7 @@ static int inactive_file_is_low_global(struct zone *zone)
 
 /**
  * inactive_file_is_low - check if file pages need to be deactivated
- * @mz: memory cgroup and zone to check
+ * @lruvec: LRU vector to check
  *
  * When the system is doing streaming IO, memory pressure here
  * ensures that active file pages get deactivated, until more
@@ -1848,44 +1526,39 @@ static int inactive_file_is_low_global(struct zone *zone)
  * This uses a different ratio than the anonymous pages, because
  * the page cache uses a use-once replacement algorithm.
  */
-static int inactive_file_is_low(struct mem_cgroup_zone *mz)
+static int inactive_file_is_low(struct lruvec *lruvec)
 {
-	if (!scanning_global_lru(mz))
-		return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
-						       mz->zone);
+	if (!mem_cgroup_disabled())
+		return mem_cgroup_inactive_file_is_low(lruvec);
 
-	return inactive_file_is_low_global(mz->zone);
+	return inactive_file_is_low_global(lruvec_zone(lruvec));
 }
 
-static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
+static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
 {
-	if (file)
-		return inactive_file_is_low(mz);
+	if (is_file_lru(lru))
+		return inactive_file_is_low(lruvec);
 	else
-		return inactive_anon_is_low(mz);
+		return inactive_anon_is_low(lruvec);
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-				 struct mem_cgroup_zone *mz,
-				 struct scan_control *sc, int priority)
+				 struct lruvec *lruvec, struct scan_control *sc)
 {
-	int file = is_file_lru(lru);
-
 	if (is_active_lru(lru)) {
-		if (inactive_list_is_low(mz, file))
-			shrink_active_list(nr_to_scan, mz, sc, priority, file);
+		if (inactive_list_is_low(lruvec, lru))
+			shrink_active_list(nr_to_scan, lruvec, sc, lru);
 		return 0;
 	}
 
-	return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
+	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
 }
 
-static int vmscan_swappiness(struct mem_cgroup_zone *mz,
-			     struct scan_control *sc)
+static int vmscan_swappiness(struct scan_control *sc)
 {
 	if (global_reclaim(sc))
 		return vm_swappiness;
-	return mem_cgroup_swappiness(mz->mem_cgroup);
+	return mem_cgroup_swappiness(sc->target_mem_cgroup);
 }
 
 /*
@@ -1896,17 +1569,18 @@ static int vmscan_swappiness(struct mem_cgroup_zone *mz,
  *
  * nr[0] = anon pages to scan; nr[1] = file pages to scan
  */
-static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
-			   unsigned long *nr, int priority)
+static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+			   unsigned long *nr)
 {
 	unsigned long anon, file, free;
 	unsigned long anon_prio, file_prio;
 	unsigned long ap, fp;
-	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 	u64 fraction[2], denominator;
 	enum lru_list lru;
 	int noswap = 0;
 	bool force_scan = false;
+	struct zone *zone = lruvec_zone(lruvec);
 
 	/*
 	 * If the zone or memcg is small, nr[l] can be 0.  This
@@ -1918,7 +1592,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
 	 * latencies, so it's better to scan a minimum amount there as
 	 * well.
 	 */
-	if (current_is_kswapd() && mz->zone->all_unreclaimable)
+	if (current_is_kswapd() && zone->all_unreclaimable)
 		force_scan = true;
 	if (!global_reclaim(sc))
 		force_scan = true;
@@ -1932,16 +1606,16 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
 		goto out;
 	}
 
-	anon  = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
-		zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
-	file  = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
-		zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
+	anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+		get_lru_size(lruvec, LRU_INACTIVE_ANON);
+	file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+		get_lru_size(lruvec, LRU_INACTIVE_FILE);
 
 	if (global_reclaim(sc)) {
-		free  = zone_page_state(mz->zone, NR_FREE_PAGES);
+		free  = zone_page_state(zone, NR_FREE_PAGES);
 		/* If we have very few page cache pages,
 		   force-scan anon pages. */
-		if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
+		if (unlikely(file + free <= high_wmark_pages(zone))) {
 			fraction[0] = 1;
 			fraction[1] = 0;
 			denominator = 1;
@@ -1953,8 +1627,8 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
 	 * With swappiness at 100, anonymous and file have the same priority.
 	 * This scanning priority is essentially the inverse of IO cost.
 	 */
-	anon_prio = vmscan_swappiness(mz, sc);
-	file_prio = 200 - vmscan_swappiness(mz, sc);
+	anon_prio = vmscan_swappiness(sc);
+	file_prio = 200 - anon_prio;
 
 	/*
 	 * OK, so we have swap space and a fair amount of page cache
@@ -1967,7 +1641,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
 	 *
 	 * anon in [0], file in [1]
 	 */
-	spin_lock_irq(&mz->zone->lru_lock);
+	spin_lock_irq(&zone->lru_lock);
 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
 		reclaim_stat->recent_scanned[0] /= 2;
 		reclaim_stat->recent_rotated[0] /= 2;
@@ -1983,12 +1657,12 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
 	 * proportional to the fraction of recently scanned pages on
 	 * each list that were recently referenced and in active use.
 	 */
-	ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
+	ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
 	ap /= reclaim_stat->recent_rotated[0] + 1;
 
-	fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
+	fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
 	fp /= reclaim_stat->recent_rotated[1] + 1;
-	spin_unlock_irq(&mz->zone->lru_lock);
+	spin_unlock_irq(&zone->lru_lock);
 
 	fraction[0] = ap;
 	fraction[1] = fp;
@@ -1998,9 +1672,9 @@ out:
 		int file = is_file_lru(lru);
 		unsigned long scan;
 
-		scan = zone_nr_lru_pages(mz, lru);
-		if (priority || noswap) {
-			scan >>= priority;
+		scan = get_lru_size(lruvec, lru);
+		if (sc->priority || noswap || !vmscan_swappiness(sc)) {
+			scan >>= sc->priority;
 			if (!scan && force_scan)
 				scan = SWAP_CLUSTER_MAX;
 			scan = div64_u64(scan * fraction[file], denominator);
@@ -2009,14 +1683,25 @@ out:
 	}
 }
 
+/* Use reclaim/compaction for costly allocs or under memory pressure */
+static bool in_reclaim_compaction(struct scan_control *sc)
+{
+	if (COMPACTION_BUILD && sc->order &&
+			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
+			 sc->priority < DEF_PRIORITY - 2))
+		return true;
+
+	return false;
+}
+
 /*
- * Reclaim/compaction depends on a number of pages being freed. To avoid
- * disruption to the system, a small number of order-0 pages continue to be
- * rotated and reclaimed in the normal fashion. However, by the time we get
- * back to the allocator and call try_to_compact_zone(), we ensure that
- * there are enough free pages for it to be likely successful
+ * Reclaim/compaction is used for high-order allocation requests. It reclaims
+ * order-0 pages before compacting the zone. should_continue_reclaim() returns
+ * true if more pages should be reclaimed such that when the page allocator
+ * calls try_to_compact_zone() that it will have enough free pages to succeed.
+ * It will give up earlier than that if there is difficulty reclaiming pages.
  */
-static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
+static inline bool should_continue_reclaim(struct lruvec *lruvec,
 					unsigned long nr_reclaimed,
 					unsigned long nr_scanned,
 					struct scan_control *sc)
@@ -2025,7 +1710,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
 	unsigned long inactive_lru_pages;
 
 	/* If not in reclaim/compaction mode, stop */
-	if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
+	if (!in_reclaim_compaction(sc))
 		return false;
 
 	/* Consider stopping depending on scan and reclaim activity */
@@ -2056,15 +1741,15 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
 	 * inactive lists are large enough, continue reclaiming
 	 */
 	pages_for_compaction = (2UL << sc->order);
-	inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
+	inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
 	if (nr_swap_pages > 0)
-		inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
+		inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
 	if (sc->nr_reclaimed < pages_for_compaction &&
 			inactive_lru_pages > pages_for_compaction)
 		return true;
 
 	/* If compaction would go ahead or the allocation would succeed, stop */
-	switch (compaction_suitable(mz->zone, sc->order)) {
+	switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
 	case COMPACT_PARTIAL:
 	case COMPACT_CONTINUE:
 		return false;
@@ -2076,8 +1761,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
-				   struct scan_control *sc)
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
@@ -2089,7 +1773,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
 restart:
 	nr_reclaimed = 0;
 	nr_scanned = sc->nr_scanned;
-	get_scan_count(mz, sc, nr, priority);
+	get_scan_count(lruvec, sc, nr);
 
 	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2101,7 +1785,7 @@ restart:
 				nr[lru] -= nr_to_scan;
 
 				nr_reclaimed += shrink_list(lru, nr_to_scan,
-							    mz, sc, priority);
+							    lruvec, sc);
 			}
 		}
 		/*
@@ -2112,7 +1796,8 @@ restart:
 		 * with multiple processes reclaiming pages, the total
 		 * freeing target can get unreasonably large.
 		 */
-		if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
+		if (nr_reclaimed >= nr_to_reclaim &&
+		    sc->priority < DEF_PRIORITY)
 			break;
 	}
 	blk_finish_plug(&plug);
@@ -2122,35 +1807,33 @@ restart:
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (inactive_anon_is_low(mz))
-		shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
+	if (inactive_anon_is_low(lruvec))
+		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+				   sc, LRU_ACTIVE_ANON);
 
 	/* reclaim/compaction might need reclaim to continue */
-	if (should_continue_reclaim(mz, nr_reclaimed,
-					sc->nr_scanned - nr_scanned, sc))
+	if (should_continue_reclaim(lruvec, nr_reclaimed,
+				    sc->nr_scanned - nr_scanned, sc))
 		goto restart;
 
 	throttle_vm_writeout(sc->gfp_mask);
 }
 
-static void shrink_zone(int priority, struct zone *zone,
-			struct scan_control *sc)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	struct mem_cgroup *root = sc->target_mem_cgroup;
 	struct mem_cgroup_reclaim_cookie reclaim = {
 		.zone = zone,
-		.priority = priority,
+		.priority = sc->priority,
 	};
 	struct mem_cgroup *memcg;
 
 	memcg = mem_cgroup_iter(root, NULL, &reclaim);
 	do {
-		struct mem_cgroup_zone mz = {
-			.mem_cgroup = memcg,
-			.zone = zone,
-		};
+		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+
+		shrink_lruvec(lruvec, sc);
 
-		shrink_mem_cgroup_zone(priority, &mz, sc);
 		/*
 		 * Limit reclaim has historically picked one memcg and
 		 * scanned it with decreasing priority levels until
@@ -2226,8 +1909,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
  * the caller that it should consider retrying the allocation instead of
  * further reclaim.
  */
-static bool shrink_zones(int priority, struct zonelist *zonelist,
-					struct scan_control *sc)
+static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
@@ -2254,7 +1936,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 		if (global_reclaim(sc)) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
-			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			if (zone->all_unreclaimable &&
+					sc->priority != DEF_PRIORITY)
 				continue;	/* Let kswapd poll it */
 			if (COMPACTION_BUILD) {
 				/*
@@ -2286,7 +1969,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 			/* need some check for avoid more shrink_zone() */
 		}
 
-		shrink_zone(priority, zone, sc);
+		shrink_zone(zone, sc);
 	}
 
 	return aborted_reclaim;
@@ -2337,7 +2020,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 					struct scan_control *sc,
 					struct shrink_control *shrink)
 {
-	int priority;
 	unsigned long total_scanned = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct zoneref *z;
@@ -2350,11 +2032,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	if (global_reclaim(sc))
 		count_vm_event(ALLOCSTALL);
 
-	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+	do {
 		sc->nr_scanned = 0;
-		if (!priority)
-			disable_swap_token(sc->target_mem_cgroup);
-		aborted_reclaim = shrink_zones(priority, zonelist, sc);
+		aborted_reclaim = shrink_zones(zonelist, sc);
 
 		/*
 		 * Don't shrink slabs when reclaiming memory from
@@ -2396,7 +2076,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 		/* Take a nap, wait for some writeback to complete */
 		if (!sc->hibernation_mode && sc->nr_scanned &&
-		    priority < DEF_PRIORITY - 2) {
+		    sc->priority < DEF_PRIORITY - 2) {
 			struct zone *preferred_zone;
 
 			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
@@ -2404,7 +2084,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 						&preferred_zone);
 			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
 		}
-	}
+	} while (--sc->priority >= 0);
 
 out:
 	delayacct_freepages_end();
@@ -2442,6 +2122,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.may_unmap = 1,
 		.may_swap = 1,
 		.order = order,
+		.priority = DEF_PRIORITY,
 		.target_mem_cgroup = NULL,
 		.nodemask = nodemask,
 	};
@@ -2474,17 +2155,15 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.may_swap = !noswap,
 		.order = 0,
+		.priority = 0,
 		.target_mem_cgroup = memcg,
 	};
-	struct mem_cgroup_zone mz = {
-		.mem_cgroup = memcg,
-		.zone = zone,
-	};
+	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 
-	trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
+	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
 						      sc.may_writepage,
 						      sc.gfp_mask);
 
@@ -2495,7 +2174,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 	 * will pick up pages from other mem cgroup's as well. We hack
 	 * the priority and make it zero.
 	 */
-	shrink_mem_cgroup_zone(0, &mz, &sc);
+	shrink_lruvec(lruvec, &sc);
 
 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
@@ -2516,6 +2195,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_swap = !noswap,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.order = 0,
+		.priority = DEF_PRIORITY,
 		.target_mem_cgroup = memcg,
 		.nodemask = NULL, /* we don't care the placement */
 		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2546,8 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 }
 #endif
 
-static void age_active_anon(struct zone *zone, struct scan_control *sc,
-			    int priority)
+static void age_active_anon(struct zone *zone, struct scan_control *sc)
 {
 	struct mem_cgroup *memcg;
 
@@ -2556,14 +2235,11 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc,
 
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
-		struct mem_cgroup_zone mz = {
-			.mem_cgroup = memcg,
-			.zone = zone,
-		};
+		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
-		if (inactive_anon_is_low(&mz))
-			shrink_active_list(SWAP_CLUSTER_MAX, &mz,
-					   sc, priority, 0);
+		if (inactive_anon_is_low(lruvec))
+			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+					   sc, LRU_ACTIVE_ANON);
 
 		memcg = mem_cgroup_iter(NULL, memcg, NULL);
 	} while (memcg);
@@ -2672,7 +2348,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 {
 	int all_zones_ok;
 	unsigned long balanced;
-	int priority;
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 	unsigned long total_scanned;
@@ -2696,18 +2371,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 	};
 loop_again:
 	total_scanned = 0;
+	sc.priority = DEF_PRIORITY;
 	sc.nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
-	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+	do {
 		unsigned long lru_pages = 0;
 		int has_under_min_watermark_zone = 0;
 
-		/* The swap token gets in the way of swapout... */
-		if (!priority)
-			disable_swap_token(NULL);
-
 		all_zones_ok = 1;
 		balanced = 0;
 
@@ -2721,14 +2393,15 @@ loop_again:
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			if (zone->all_unreclaimable &&
+			    sc.priority != DEF_PRIORITY)
 				continue;
 
 			/*
 			 * Do some background aging of the anon list, to give
 			 * pages a chance to be referenced before reclaiming.
 			 */
-			age_active_anon(zone, &sc, priority);
+			age_active_anon(zone, &sc);
 
 			/*
 			 * If the number of buffer_heads in the machine
@@ -2776,7 +2449,8 @@ loop_again:
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			if (zone->all_unreclaimable &&
+			    sc.priority != DEF_PRIORITY)
 				continue;
 
 			sc.nr_scanned = 0;
@@ -2820,7 +2494,7 @@ loop_again:
 				    !zone_watermark_ok_safe(zone, testorder,
 					high_wmark_pages(zone) + balance_gap,
 					end_zone, 0)) {
-				shrink_zone(priority, zone, &sc);
+				shrink_zone(zone, &sc);
 
 				reclaim_state->reclaimed_slab = 0;
 				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
@@ -2877,7 +2551,7 @@ loop_again:
 		 * OK, kswapd is getting into trouble.  Take a nap, then take
 		 * another pass across the zones.
 		 */
-		if (total_scanned && (priority < DEF_PRIORITY - 2)) {
+		if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
 			if (has_under_min_watermark_zone)
 				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
 			else
@@ -2892,7 +2566,7 @@ loop_again:
 		 */
 		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
 			break;
-	}
+	} while (--sc.priority >= 0);
 out:
 
 	/*
@@ -2942,7 +2616,8 @@ out:
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			if (zone->all_unreclaimable &&
+			    sc.priority != DEF_PRIORITY)
 				continue;
 
 			/* Would compaction fail due to lack of free memory? */
@@ -3209,6 +2884,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.nr_to_reclaim = nr_to_reclaim,
 		.hibernation_mode = 1,
 		.order = 0,
+		.priority = DEF_PRIORITY,
 	};
 	struct shrink_control shrink = {
 		.gfp_mask = sc.gfp_mask,
@@ -3386,7 +3062,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	int priority;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -3395,6 +3070,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 				       SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 		.order = order,
+		.priority = ZONE_RECLAIM_PRIORITY,
 	};
 	struct shrink_control shrink = {
 		.gfp_mask = sc.gfp_mask,
@@ -3417,11 +3093,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * Free memory by calling shrink zone with increasing
 		 * priorities until we have enough memory freed.
 		 */
-		priority = ZONE_RECLAIM_PRIORITY;
 		do {
-			shrink_zone(priority, zone, &sc);
-			priority--;
-		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
+			shrink_zone(zone, &sc);
+		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 	}
 
 	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -3536,7 +3210,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
 	if (mapping_unevictable(page_mapping(page)))
 		return 0;
 
-	if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
+	if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
 		return 0;
 
 	return 1;
@@ -3572,6 +3246,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
+		lruvec = mem_cgroup_page_lruvec(page, zone);
 
 		if (!PageLRU(page) || !PageUnevictable(page))
 			continue;
@@ -3581,11 +3256,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 
 			VM_BUG_ON(PageActive(page));
 			ClearPageUnevictable(page);
-			__dec_zone_state(zone, NR_UNEVICTABLE);
-			lruvec = mem_cgroup_lru_move_lists(zone, page,
-						LRU_UNEVICTABLE, lru);
-			list_move(&page->lru, &lruvec->lists[lru]);
-			__inc_zone_state(zone, NR_INACTIVE_ANON + lru);
+			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
+			add_page_to_lru_list(page, lruvec, lru);
 			pgrescued++;
 		}
 	}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0dad31dc1618..1bbbbd9776ad 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1223,7 +1223,6 @@ module_init(setup_vmstat)
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
 #include <linux/debugfs.h>
 
-static struct dentry *extfrag_debug_root;
 
 /*
  * Return an index indicating how much of the available free memory is
@@ -1361,19 +1360,24 @@ static const struct file_operations extfrag_file_ops = {
 
 static int __init extfrag_debug_init(void)
 {
+	struct dentry *extfrag_debug_root;
+
 	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
 	if (!extfrag_debug_root)
 		return -ENOMEM;
 
 	if (!debugfs_create_file("unusable_index", 0444,
 			extfrag_debug_root, NULL, &unusable_file_ops))
-		return -ENOMEM;
+		goto fail;
 
 	if (!debugfs_create_file("extfrag_index", 0444,
 			extfrag_debug_root, NULL, &extfrag_file_ops))
-		return -ENOMEM;
+		goto fail;
 
 	return 0;
+fail:
+	debugfs_remove_recursive(extfrag_debug_root);
+	return -ENOMEM;
 }
 
 module_init(extfrag_debug_init);
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 151703791bb0..b6f3583ddfe8 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -74,9 +74,6 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
 	percpu_counter_destroy(&tcp->tcp_sockets_allocated);
 
 	val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
-
-	if (val != RESOURCE_MAX)
-		static_key_slow_dec(&memcg_socket_limit_enabled);
 }
 EXPORT_SYMBOL(tcp_destroy_cgroup);
 
@@ -107,10 +104,33 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 		tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
 					     net->ipv4.sysctl_tcp_mem[i]);
 
-	if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX)
-		static_key_slow_dec(&memcg_socket_limit_enabled);
-	else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX)
-		static_key_slow_inc(&memcg_socket_limit_enabled);
+	if (val == RESOURCE_MAX)
+		clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+	else if (val != RESOURCE_MAX) {
+		/*
+		 * The active bit needs to be written after the static_key
+		 * update. This is what guarantees that the socket activation
+		 * function is the last one to run. See sock_update_memcg() for
+		 * details, and note that we don't mark any socket as belonging
+		 * to this memcg until that flag is up.
+		 *
+		 * We need to do this, because static_keys will span multiple
+		 * sites, but we can't control their order. If we mark a socket
+		 * as accounted, but the accounting functions are not patched in
+		 * yet, we'll lose accounting.
+		 *
+		 * We never race with the readers in sock_update_memcg(),
+		 * because when this value change, the code to process it is not
+		 * patched in yet.
+		 *
+		 * The activated bit is used to guarantee that no two writers
+		 * will do the update in the same memcg. Without that, we can't
+		 * properly shutdown the static key.
+		 */
+		if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+			static_key_slow_inc(&memcg_socket_limit_enabled);
+		set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+	}
 
 	return 0;
 }
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7fee13b331d1..f56f045778ae 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1286,6 +1286,8 @@ call_reserveresult(struct rpc_task *task)
 	}
 
 	switch (status) {
+	case -ENOMEM:
+		rpc_delay(task, HZ >> 2);
 	case -EAGAIN:	/* woken up; retry */
 		task->tk_action = call_reserve;
 		return;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index fd2423991c2d..04040476082e 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -120,7 +120,7 @@ EXPORT_SYMBOL_GPL(rpc_pipe_generic_upcall);
 
 /**
  * rpc_queue_upcall - queue an upcall message to userspace
- * @inode: inode of upcall pipe on which to queue given message
+ * @pipe: upcall pipe on which to queue given message
  * @msg: message to queue
  *
  * Call with an @inode created by rpc_mkpipe() to queue an upcall.
@@ -819,9 +819,7 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
  * @parent: dentry of directory to create new "pipe" in
  * @name: name of pipe
  * @private: private data to associate with the pipe, for the caller's use
- * @ops: operations defining the behavior of the pipe: upcall, downcall,
- *	release_pipe, open_pipe, and destroy_msg.
- * @flags: rpc_pipe flags
+ * @pipe: &rpc_pipe containing input parameters
  *
  * Data is made available for userspace to read by calls to
  * rpc_queue_upcall().  The actual reads will result in calls to
@@ -943,7 +941,7 @@ struct dentry *rpc_create_client_dir(struct dentry *dentry,
 
 /**
  * rpc_remove_client_dir - Remove a directory created with rpc_create_client_dir()
- * @clnt: rpc client
+ * @dentry: dentry for the pipe
  */
 int rpc_remove_client_dir(struct dentry *dentry)
 {
@@ -1115,7 +1113,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_op = &s_ops;
 	sb->s_time_gran = 1;
 
-	inode = rpc_get_inode(sb, S_IFDIR | 0755);
+	inode = rpc_get_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO);
 	sb->s_root = root = d_make_root(inode);
 	if (!root)
 		return -ENOMEM;
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 78ac39fd9fe7..3c0653439f3d 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -394,6 +394,7 @@ static int rpcb_register_call(struct rpc_clnt *clnt, struct rpc_message *msg)
 
 /**
  * rpcb_register - set or unset a port registration with the local rpcbind svc
+ * @net: target network namespace
  * @prog: RPC program number to bind
  * @vers: RPC version number to bind
  * @prot: transport protocol to register
@@ -521,6 +522,7 @@ static int rpcb_unregister_all_protofamilies(struct sunrpc_net *sn,
 
 /**
  * rpcb_v4_register - set or unset a port registration with the local rpcbind
+ * @net: target network namespace
  * @program: RPC program number of service to (un)register
  * @version: RPC version number of service to (un)register
  * @address: address family, IP address, and port to (un)register
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 6fe2dcead150..3c83035cdaa9 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -979,20 +979,21 @@ static void xprt_alloc_slot(struct rpc_task *task)
 		list_del(&req->rq_list);
 		goto out_init_req;
 	}
-	req = xprt_dynamic_alloc_slot(xprt, GFP_NOWAIT);
+	req = xprt_dynamic_alloc_slot(xprt, GFP_NOWAIT|__GFP_NOWARN);
 	if (!IS_ERR(req))
 		goto out_init_req;
 	switch (PTR_ERR(req)) {
 	case -ENOMEM:
-		rpc_delay(task, HZ >> 2);
 		dprintk("RPC:       dynamic allocation of request slot "
 				"failed! Retrying\n");
+		task->tk_status = -ENOMEM;
 		break;
 	case -EAGAIN:
 		rpc_sleep_on(&xprt->backlog, task, NULL);
 		dprintk("RPC:       waiting for request slot\n");
+	default:
+		task->tk_status = -EAGAIN;
 	}
-	task->tk_status = -EAGAIN;
 	return;
 out_init_req:
 	task->tk_status = 0;
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 7dab7b25b5c6..f576971f6556 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -35,6 +35,7 @@
 #include <sys/mount.h>
 #include <sys/statfs.h>
 #include "../../include/linux/magic.h"
+#include "../../include/linux/kernel-page-flags.h"
 
 
 #ifndef MAX_PATH
@@ -73,33 +74,6 @@
 #define KPF_BYTES		8
 #define PROC_KPAGEFLAGS		"/proc/kpageflags"
 
-/* copied from kpageflags_read() */
-#define KPF_LOCKED		0
-#define KPF_ERROR		1
-#define KPF_REFERENCED		2
-#define KPF_UPTODATE		3
-#define KPF_DIRTY		4
-#define KPF_LRU			5
-#define KPF_ACTIVE		6
-#define KPF_SLAB		7
-#define KPF_WRITEBACK		8
-#define KPF_RECLAIM		9
-#define KPF_BUDDY		10
-
-/* [11-20] new additions in 2.6.31 */
-#define KPF_MMAP		11
-#define KPF_ANON		12
-#define KPF_SWAPCACHE		13
-#define KPF_SWAPBACKED		14
-#define KPF_COMPOUND_HEAD	15
-#define KPF_COMPOUND_TAIL	16
-#define KPF_HUGE		17
-#define KPF_UNEVICTABLE		18
-#define KPF_HWPOISON		19
-#define KPF_NOPAGE		20
-#define KPF_KSM			21
-#define KPF_THP			22
-
 /* [32-] kernel hacking assistances */
 #define KPF_RESERVED		32
 #define KPF_MLOCKED		33
@@ -326,7 +300,7 @@ static char *page_flag_name(uint64_t flags)
 {
 	static char buf[65];
 	int present;
-	int i, j;
+	size_t i, j;
 
 	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
 		present = (flags >> i) & 1;
@@ -344,7 +318,7 @@ static char *page_flag_name(uint64_t flags)
 static char *page_flag_longname(uint64_t flags)
 {
 	static char buf[1024];
-	int i, n;
+	size_t i, n;
 
 	for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
 		if (!page_flag_names[i])
@@ -402,7 +376,7 @@ static void show_page(unsigned long voffset,
 
 static void show_summary(void)
 {
-	int i;
+	size_t i;
 
 	printf("             flags\tpage-count       MB"
 		"  symbolic-flags\t\t\tlong-symbolic-flags\n");
@@ -500,7 +474,7 @@ static int debugfs_valid_mountpoint(const char *debugfs)
 /* find the path to the mounted debugfs */
 static const char *debugfs_find_mountpoint(void)
 {
-	const char **ptr;
+	const char *const *ptr;
 	char type[100];
 	FILE *fp;
 
@@ -537,7 +511,7 @@ static const char *debugfs_find_mountpoint(void)
 
 static void debugfs_mount(void)
 {
-	const char **ptr;
+	const char *const *ptr;
 
 	/* see if it's already mounted */
 	if (debugfs_find_mountpoint())
@@ -614,10 +588,10 @@ static int unpoison_page(unsigned long offset)
  * page frame walker
  */
 
-static int hash_slot(uint64_t flags)
+static size_t hash_slot(uint64_t flags)
 {
-	int k = HASH_KEY(flags);
-	int i;
+	size_t k = HASH_KEY(flags);
+	size_t i;
 
 	/* Explicitly reserve slot 0 for flags 0: the following logic
 	 * cannot distinguish an unoccupied slot from slot (flags==0).
@@ -670,7 +644,7 @@ static void walk_pfn(unsigned long voffset,
 {
 	uint64_t buf[KPAGEFLAGS_BATCH];
 	unsigned long batch;
-	long pages;
+	unsigned long pages;
 	unsigned long i;
 
 	while (count) {
@@ -779,7 +753,7 @@ static const char *page_flag_type(uint64_t flag)
 
 static void usage(void)
 {
-	int i, j;
+	size_t i, j;
 
 	printf(
 "page-types [options]\n"
@@ -938,7 +912,7 @@ static void add_bits_filter(uint64_t mask, uint64_t bits)
 
 static uint64_t parse_flag_name(const char *str, int len)
 {
-	int i;
+	size_t i;
 
 	if (!*str || !len)
 		return 0;