diff options
128 files changed, 5383 insertions, 3222 deletions
diff --git a/Documentation/ABI/testing/pstore b/Documentation/ABI/testing/pstore index ddf451ee2a08..ff1df4e3b059 100644 --- a/Documentation/ABI/testing/pstore +++ b/Documentation/ABI/testing/pstore @@ -39,3 +39,9 @@ Description: Generic interface to platform dependent persistent storage. multiple) files based on the record size of the underlying persistent storage until at least this amount is reached. Default is 10 Kbytes. + + Pstore only supports one backend at a time. If multiple + backends are available, the preferred backend may be + set by passing the pstore.backend= argument to the kernel at + boot time. + diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index 6b5c42dbbe84..2c656ae43ba7 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -4,7 +4,8 @@ dm-crypt Device-Mapper's "crypt" target provides transparent encryption of block devices using the kernel crypto API. -Parameters: <cipher> <key> <iv_offset> <device path> <offset> +Parameters: <cipher> <key> <iv_offset> <device path> \ + <offset> [<#opt_params> <opt_params>] <cipher> Encryption cipher and an optional IV generation mode. @@ -37,6 +38,24 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset> <offset> Starting sector within the device where the encrypted data begins. +<#opt_params> + Number of optional parameters. If there are no optional parameters, + the optional paramaters section can be skipped or #opt_params can be zero. + Otherwise #opt_params is the number of following arguments. + + Example of optional parameters section: + 1 allow_discards + +allow_discards + Block discard requests (a.k.a. TRIM) are passed through the crypt device. + The default is to ignore discard requests. + + WARNING: Assess the specific security risks carefully before enabling this + option. For example, allowing discards on encrypted devices may lead to + the leak of information about the ciphertext device (filesystem type, + used space etc.) if the discarded blocks can be located easily on the + device later. + Example scripts =============== LUKS (Linux Unified Key Setup) is now the preferred way to set up disk diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt index c8efdfd19a65..6ff5c2327227 100644 --- a/Documentation/device-mapper/dm-flakey.txt +++ b/Documentation/device-mapper/dm-flakey.txt @@ -1,17 +1,53 @@ dm-flakey ========= -This target is the same as the linear target except that it returns I/O -errors periodically. It's been found useful in simulating failing -devices for testing purposes. +This target is the same as the linear target except that it exhibits +unreliable behaviour periodically. It's been found useful in simulating +failing devices for testing purposes. Starting from the time the table is loaded, the device is available for -<up interval> seconds, then returns errors for <down interval> seconds, -and then this cycle repeats. +<up interval> seconds, then exhibits unreliable behaviour for <down +interval> seconds, and then this cycle repeats. -Parameters: <dev path> <offset> <up interval> <down interval> +Also, consider using this in combination with the dm-delay target too, +which can delay reads and writes and/or send them to different +underlying devices. + +Table parameters +---------------- + <dev path> <offset> <up interval> <down interval> \ + [<num_features> [<feature arguments>]] + +Mandatory parameters: <dev path>: Full pathname to the underlying block-device, or a "major:minor" device-number. <offset>: Starting sector within the device. <up interval>: Number of seconds device is available. <down interval>: Number of seconds device returns errors. + +Optional feature parameters: + If no feature parameters are present, during the periods of + unreliability, all I/O returns errors. + + drop_writes: + All write I/O is silently ignored. + Read I/O is handled correctly. + + corrupt_bio_byte <Nth_byte> <direction> <value> <flags>: + During <down interval>, replace <Nth_byte> of the data of + each matching bio with <value>. + + <Nth_byte>: The offset of the byte to replace. + Counting starts at 1, to replace the first byte. + <direction>: Either 'r' to corrupt reads or 'w' to corrupt writes. + 'w' is incompatible with drop_writes. + <value>: The value (from 0-255) to write. + <flags>: Perform the replacement only if bio->bi_rw has all the + selected flags set. + +Examples: + corrupt_bio_byte 32 r 1 0 + - replaces the 32nd byte of READ bios with the value 1 + + corrupt_bio_byte 224 w 0 32 + - replaces the 224th byte of REQ_META (=32) bios with the value 0 diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 33b6b7071ac8..2a8c11331d2d 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -1,70 +1,108 @@ -Device-mapper RAID (dm-raid) is a bridge from DM to MD. It -provides a way to use device-mapper interfaces to access the MD RAID -drivers. +dm-raid +------- -As with all device-mapper targets, the nominal public interfaces are the -constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO -and STATUSTYPE_TABLE). The CTR table looks like the following: +The device-mapper RAID (dm-raid) target provides a bridge from DM to MD. +It allows the MD RAID drivers to be accessed using a device-mapper +interface. -1: <s> <l> raid \ -2: <raid_type> <#raid_params> <raid_params> \ -3: <#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN> - -Line 1 contains the standard first three arguments to any device-mapper -target - the start, length, and target type fields. The target type in -this case is "raid". - -Line 2 contains the arguments that define the particular raid -type/personality/level, the required arguments for that raid type, and -any optional arguments. Possible raid types include: raid4, raid5_la, -raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc. (raid1 is -planned for the future.) The list of required and optional parameters -is the same for all the current raid types. The required parameters are -positional, while the optional parameters are given as key/value pairs. -The possible parameters are as follows: - <chunk_size> Chunk size in sectors. - [[no]sync] Force/Prevent RAID initialization - [rebuild <idx>] Rebuild the drive indicated by the index - [daemon_sleep <ms>] Time between bitmap daemon work to clear bits - [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization - [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization - [max_write_behind <sectors>] See '-write-behind=' (man mdadm) - [stripe_cache <sectors>] Stripe cache size for higher RAIDs - -Line 3 contains the list of devices that compose the array in -metadata/data device pairs. If the metadata is stored separately, a '-' -is given for the metadata device position. If a drive has failed or is -missing at creation time, a '-' can be given for both the metadata and -data drives for a given position. - -NB. Currently all metadata devices must be specified as '-'. - -Examples: -# RAID4 - 4 data drives, 1 parity +The target is named "raid" and it accepts the following parameters: + + <raid_type> <#raid_params> <raid_params> \ + <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>] + +<raid_type>: + raid1 RAID1 mirroring + raid4 RAID4 dedicated parity disk + raid5_la RAID5 left asymmetric + - rotating parity 0 with data continuation + raid5_ra RAID5 right asymmetric + - rotating parity N with data continuation + raid5_ls RAID5 left symmetric + - rotating parity 0 with data restart + raid5_rs RAID5 right symmetric + - rotating parity N with data restart + raid6_zr RAID6 zero restart + - rotating parity zero (left-to-right) with data restart + raid6_nr RAID6 N restart + - rotating parity N (right-to-left) with data restart + raid6_nc RAID6 N continue + - rotating parity N (right-to-left) with data continuation + + Refererence: Chapter 4 of + http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf + +<#raid_params>: The number of parameters that follow. + +<raid_params> consists of + Mandatory parameters: + <chunk_size>: Chunk size in sectors. This parameter is often known as + "stripe size". It is the only mandatory parameter and + is placed first. + + followed by optional parameters (in any order): + [sync|nosync] Force or prevent RAID initialization. + + [rebuild <idx>] Rebuild drive number idx (first drive is 0). + + [daemon_sleep <ms>] + Interval between runs of the bitmap daemon that + clear bits. A longer interval means less bitmap I/O but + resyncing after a failure is likely to take longer. + + [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization + [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization + [write_mostly <idx>] Drive index is write-mostly + [max_write_behind <sectors>] See '-write-behind=' (man mdadm) + [stripe_cache <sectors>] Stripe cache size (higher RAIDs only) + [region_size <sectors>] + The region_size multiplied by the number of regions is the + logical size of the array. The bitmap records the device + synchronisation state for each region. + +<#raid_devs>: The number of devices composing the array. + Each device consists of two entries. The first is the device + containing the metadata (if any); the second is the one containing the + data. + + If a drive has failed or is missing at creation time, a '-' can be + given for both the metadata and data drives for a given position. + + +Example tables +-------------- +# RAID4 - 4 data drives, 1 parity (no metadata devices) # No metadata devices specified to hold superblock/bitmap info # Chunk size of 1MiB # (Lines separated for easy reading) + 0 1960893648 raid \ raid4 1 2048 \ 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 -# RAID4 - 4 data drives, 1 parity (no metadata devices) +# RAID4 - 4 data drives, 1 parity (with metadata devices) # Chunk size of 1MiB, force RAID initialization, # min recovery rate at 20 kiB/sec/disk + 0 1960893648 raid \ - raid4 4 2048 min_recovery_rate 20 sync\ - 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + raid4 4 2048 sync min_recovery_rate 20 \ + 5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82 -Performing a 'dmsetup table' should display the CTR table used to -construct the mapping (with possible reordering of optional -parameters). +'dmsetup table' displays the table used to construct the mapping. +The optional parameters are always printed in the order listed +above with "sync" or "nosync" always output ahead of the other +arguments, regardless of the order used when originally loading the table. +Arguments that can be repeated are ordered by value. -Performing a 'dmsetup status' will yield information on the state and -health of the array. The output is as follows: +'dmsetup status' yields information on the state and health of the +array. +The output is as follows: 1: <s> <l> raid \ 2: <raid_type> <#devices> <1 health char for each dev> <resync_ratio> -Line 1 is standard DM output. Line 2 is best shown by example: +Line 1 is the standard output produced by device-mapper. +Line 2 is produced by the raid target, and best explained by example: 0 1960893648 raid raid4 5 AAAAA 2/490221568 Here we can see the RAID type is raid4, there are 5 devices - all of which are 'A'live, and the array is 2/490221568 complete with recovery. +Faulty or missing devices are marked 'D'. Devices that are out-of-sync +are marked 'a'. diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt index 5a0cb1ef6164..94b7e0f96b38 100644 --- a/Documentation/dmaengine.txt +++ b/Documentation/dmaengine.txt @@ -10,87 +10,181 @@ NOTE: For DMA Engine usage in async_tx please see: Below is a guide to device driver writers on how to use the Slave-DMA API of the DMA Engine. This is applicable only for slave DMA usage only. -The slave DMA usage consists of following steps +The slave DMA usage consists of following steps: 1. Allocate a DMA slave channel 2. Set slave and controller specific parameters 3. Get a descriptor for transaction -4. Submit the transaction and wait for callback notification +4. Submit the transaction +5. Issue pending requests and wait for callback notification 1. Allocate a DMA slave channel -Channel allocation is slightly different in the slave DMA context, client -drivers typically need a channel from a particular DMA controller only and even -in some cases a specific channel is desired. To request a channel -dma_request_channel() API is used. - -Interface: -struct dma_chan *dma_request_channel(dma_cap_mask_t mask, - dma_filter_fn filter_fn, - void *filter_param); -where dma_filter_fn is defined as: -typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); - -When the optional 'filter_fn' parameter is set to NULL dma_request_channel -simply returns the first channel that satisfies the capability mask. Otherwise, -when the mask parameter is insufficient for specifying the necessary channel, -the filter_fn routine can be used to disposition the available channels in the -system. The filter_fn routine is called once for each free channel in the -system. Upon seeing a suitable channel filter_fn returns DMA_ACK which flags -that channel to be the return value from dma_request_channel. A channel -allocated via this interface is exclusive to the caller, until -dma_release_channel() is called. + + Channel allocation is slightly different in the slave DMA context, + client drivers typically need a channel from a particular DMA + controller only and even in some cases a specific channel is desired. + To request a channel dma_request_channel() API is used. + + Interface: + struct dma_chan *dma_request_channel(dma_cap_mask_t mask, + dma_filter_fn filter_fn, + void *filter_param); + where dma_filter_fn is defined as: + typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); + + The 'filter_fn' parameter is optional, but highly recommended for + slave and cyclic channels as they typically need to obtain a specific + DMA channel. + + When the optional 'filter_fn' parameter is NULL, dma_request_channel() + simply returns the first channel that satisfies the capability mask. + + Otherwise, the 'filter_fn' routine will be called once for each free + channel which has a capability in 'mask'. 'filter_fn' is expected to + return 'true' when the desired DMA channel is found. + + A channel allocated via this interface is exclusive to the caller, + until dma_release_channel() is called. 2. Set slave and controller specific parameters -Next step is always to pass some specific information to the DMA driver. Most of -the generic information which a slave DMA can use is in struct dma_slave_config. -It allows the clients to specify DMA direction, DMA addresses, bus widths, DMA -burst lengths etc. If some DMA controllers have more parameters to be sent then -they should try to embed struct dma_slave_config in their controller specific -structure. That gives flexibility to client to pass more parameters, if -required. - -Interface: -int dmaengine_slave_config(struct dma_chan *chan, - struct dma_slave_config *config) + + Next step is always to pass some specific information to the DMA + driver. Most of the generic information which a slave DMA can use + is in struct dma_slave_config. This allows the clients to specify + DMA direction, DMA addresses, bus widths, DMA burst lengths etc + for the peripheral. + + If some DMA controllers have more parameters to be sent then they + should try to embed struct dma_slave_config in their controller + specific structure. That gives flexibility to client to pass more + parameters, if required. + + Interface: + int dmaengine_slave_config(struct dma_chan *chan, + struct dma_slave_config *config) + + Please see the dma_slave_config structure definition in dmaengine.h + for a detailed explaination of the struct members. Please note + that the 'direction' member will be going away as it duplicates the + direction given in the prepare call. 3. Get a descriptor for transaction -For slave usage the various modes of slave transfers supported by the -DMA-engine are: -slave_sg - DMA a list of scatter gather buffers from/to a peripheral -dma_cyclic - Perform a cyclic DMA operation from/to a peripheral till the + + For slave usage the various modes of slave transfers supported by the + DMA-engine are: + + slave_sg - DMA a list of scatter gather buffers from/to a peripheral + dma_cyclic - Perform a cyclic DMA operation from/to a peripheral till the operation is explicitly stopped. -The non NULL return of this transfer API represents a "descriptor" for the given -transaction. - -Interface: -struct dma_async_tx_descriptor *(*chan->device->device_prep_dma_sg)( - struct dma_chan *chan, - struct scatterlist *dst_sg, unsigned int dst_nents, - struct scatterlist *src_sg, unsigned int src_nents, + + A non-NULL return of this transfer API represents a "descriptor" for + the given transaction. + + Interface: + struct dma_async_tx_descriptor *(*chan->device->device_prep_slave_sg)( + struct dma_chan *chan, struct scatterlist *sgl, + unsigned int sg_len, enum dma_data_direction direction, unsigned long flags); -struct dma_async_tx_descriptor *(*chan->device->device_prep_dma_cyclic)( + + struct dma_async_tx_descriptor *(*chan->device->device_prep_dma_cyclic)( struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, size_t period_len, enum dma_data_direction direction); -4. Submit the transaction and wait for callback notification -To schedule the transaction to be scheduled by dma device, the "descriptor" -returned in above (3) needs to be submitted. -To tell the dma driver that a transaction is ready to be serviced, the -descriptor->submit() callback needs to be invoked. This chains the descriptor to -the pending queue. -The transactions in the pending queue can be activated by calling the -issue_pending API. If channel is idle then the first transaction in queue is -started and subsequent ones queued up. -On completion of the DMA operation the next in queue is submitted and a tasklet -triggered. The tasklet would then call the client driver completion callback -routine for notification, if set. -Interface: -void dma_async_issue_pending(struct dma_chan *chan); - -============================================================================== - -Additional usage notes for dma driver writers -1/ Although DMA engine specifies that completion callback routines cannot submit -any new operations, but typically for slave DMA subsequent transaction may not -be available for submit prior to callback routine being called. This requirement -is not a requirement for DMA-slave devices. But they should take care to drop -the spin-lock they might be holding before calling the callback routine + The peripheral driver is expected to have mapped the scatterlist for + the DMA operation prior to calling device_prep_slave_sg, and must + keep the scatterlist mapped until the DMA operation has completed. + The scatterlist must be mapped using the DMA struct device. So, + normal setup should look like this: + + nr_sg = dma_map_sg(chan->device->dev, sgl, sg_len); + if (nr_sg == 0) + /* error */ + + desc = chan->device->device_prep_slave_sg(chan, sgl, nr_sg, + direction, flags); + + Once a descriptor has been obtained, the callback information can be + added and the descriptor must then be submitted. Some DMA engine + drivers may hold a spinlock between a successful preparation and + submission so it is important that these two operations are closely + paired. + + Note: + Although the async_tx API specifies that completion callback + routines cannot submit any new operations, this is not the + case for slave/cyclic DMA. + + For slave DMA, the subsequent transaction may not be available + for submission prior to callback function being invoked, so + slave DMA callbacks are permitted to prepare and submit a new + transaction. + + For cyclic DMA, a callback function may wish to terminate the + DMA via dmaengine_terminate_all(). + + Therefore, it is important that DMA engine drivers drop any + locks before calling the callback function which may cause a + deadlock. + + Note that callbacks will always be invoked from the DMA + engines tasklet, never from interrupt context. + +4. Submit the transaction + + Once the descriptor has been prepared and the callback information + added, it must be placed on the DMA engine drivers pending queue. + + Interface: + dma_cookie_t dmaengine_submit(struct dma_async_tx_descriptor *desc) + + This returns a cookie can be used to check the progress of DMA engine + activity via other DMA engine calls not covered in this document. + + dmaengine_submit() will not start the DMA operation, it merely adds + it to the pending queue. For this, see step 5, dma_async_issue_pending. + +5. Issue pending DMA requests and wait for callback notification + + The transactions in the pending queue can be activated by calling the + issue_pending API. If channel is idle then the first transaction in + queue is started and subsequent ones queued up. + + On completion of each DMA operation, the next in queue is started and + a tasklet triggered. The tasklet will then call the client driver + completion callback routine for notification, if set. + + Interface: + void dma_async_issue_pending(struct dma_chan *chan); + +Further APIs: + +1. int dmaengine_terminate_all(struct dma_chan *chan) + + This causes all activity for the DMA channel to be stopped, and may + discard data in the DMA FIFO which hasn't been fully transferred. + No callback functions will be called for any incomplete transfers. + +2. int dmaengine_pause(struct dma_chan *chan) + + This pauses activity on the DMA channel without data loss. + +3. int dmaengine_resume(struct dma_chan *chan) + + Resume a previously paused DMA channel. It is invalid to resume a + channel which is not currently paused. + +4. enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, + dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used) + + This can be used to check the status of the channel. Please see + the documentation in include/linux/dmaengine.h for a more complete + description of this API. + + This can be used in conjunction with dma_async_is_complete() and + the cookie returned from 'descriptor->submit()' to check for + completion of a specific DMA transaction. + + Note: + Not all DMA engine drivers can return reliable information for + a running DMA channel. It is recommended that DMA engine users + pause or stop (via dmaengine_terminate_all) the channel before + using this API. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4ca93898fbd3..26a83743af19 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2153,6 +2153,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [HW,MOUSE] Controls Logitech smartscroll autorepeat. 0 = disabled, 1 = enabled (default). + pstore.backend= Specify the name of the pstore backend to use + pt. [PARIDE] See Documentation/blockdev/paride.txt. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 137b277f7e56..64c7ab7e7a81 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -27,6 +27,7 @@ config IA64 select GENERIC_PENDING_IRQ if SMP select IRQ_PER_CPU select GENERIC_IRQ_SHOW + select ARCH_WANT_OPTIONAL_GPIOLIB default y help The Itanium Processor Family is Intel's 64-bit successor to @@ -89,6 +90,9 @@ config GENERIC_TIME_VSYSCALL config HAVE_SETUP_PER_CPU_AREA def_bool y +config GENERIC_GPIO + def_bool y + config DMI bool default y diff --git a/arch/ia64/include/asm/gpio.h b/arch/ia64/include/asm/gpio.h new file mode 100644 index 000000000000..590a20debc4e --- /dev/null +++ b/arch/ia64/include/asm/gpio.h @@ -0,0 +1,55 @@ +/* + * Generic GPIO API implementation for IA-64. + * + * A stright copy of that for PowerPC which was: + * + * Copyright (c) 2007-2008 MontaVista Software, Inc. + * + * Author: Anton Vorontsov <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef _ASM_IA64_GPIO_H +#define _ASM_IA64_GPIO_H + +#include <linux/errno.h> +#include <asm-generic/gpio.h> + +#ifdef CONFIG_GPIOLIB + +/* + * We don't (yet) implement inlined/rapid versions for on-chip gpios. + * Just call gpiolib. + */ +static inline int gpio_get_value(unsigned int gpio) +{ + return __gpio_get_value(gpio); +} + +static inline void gpio_set_value(unsigned int gpio, int value) +{ + __gpio_set_value(gpio, value); +} + +static inline int gpio_cansleep(unsigned int gpio) +{ + return __gpio_cansleep(gpio); +} + +static inline int gpio_to_irq(unsigned int gpio) +{ + return __gpio_to_irq(gpio); +} + +static inline int irq_to_gpio(unsigned int irq) +{ + return -EINVAL; +} + +#endif /* CONFIG_GPIOLIB */ + +#endif /* _ASM_IA64_GPIO_H */ diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index e6cef8e1b534..6053f4780df9 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -932,8 +932,11 @@ static int erst_check_table(struct acpi_table_erst *erst_tab) static int erst_open_pstore(struct pstore_info *psi); static int erst_close_pstore(struct pstore_info *psi); static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, - struct timespec *time); -static u64 erst_writer(enum pstore_type_id type, size_t size); + struct timespec *time, struct pstore_info *psi); +static u64 erst_writer(enum pstore_type_id type, unsigned int part, + size_t size, struct pstore_info *psi); +static int erst_clearer(enum pstore_type_id type, u64 id, + struct pstore_info *psi); static struct pstore_info erst_info = { .owner = THIS_MODULE, @@ -942,7 +945,7 @@ static struct pstore_info erst_info = { .close = erst_close_pstore, .read = erst_reader, .write = erst_writer, - .erase = erst_clear + .erase = erst_clearer }; #define CPER_CREATOR_PSTORE \ @@ -983,7 +986,7 @@ static int erst_close_pstore(struct pstore_info *psi) } static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, - struct timespec *time) + struct timespec *time, struct pstore_info *psi) { int rc; ssize_t len = 0; @@ -1037,7 +1040,8 @@ out: return (rc < 0) ? rc : (len - sizeof(*rcd)); } -static u64 erst_writer(enum pstore_type_id type, size_t size) +static u64 erst_writer(enum pstore_type_id type, unsigned int part, + size_t size, struct pstore_info *psi) { struct cper_pstore_record *rcd = (struct cper_pstore_record *) (erst_info.buf - sizeof(*rcd)); @@ -1080,6 +1084,12 @@ static u64 erst_writer(enum pstore_type_id type, size_t size) return rcd->hdr.record_id; } +static int erst_clearer(enum pstore_type_id type, u64 id, + struct pstore_info *psi) +{ + return erst_clear(id); +} + static int __init erst_init(void) { int rc = 0; diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index b89fffc1d777..33e1bed68fdd 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -166,7 +166,7 @@ static int create_path(const char *nodepath) { char *path; char *s; - int err; + int err = 0; /* parent directories do not exist, create them */ path = kstrdup(nodepath, GFP_KERNEL); diff --git a/drivers/dma/TODO b/drivers/dma/TODO index a4af8589330c..734ed0206cd5 100644 --- a/drivers/dma/TODO +++ b/drivers/dma/TODO @@ -9,6 +9,5 @@ TODO for slave dma - mxs-dma.c - dw_dmac - intel_mid_dma - - ste_dma40 4. Check other subsystems for dma drivers and merge/move to dmaengine 5. Remove dma_slave_config's dma direction. diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index e6d7228b1479..196a7378d332 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -156,14 +156,10 @@ struct pl08x_driver_data { #define PL08X_BOUNDARY_SHIFT (10) /* 1KB 0x400 */ #define PL08X_BOUNDARY_SIZE (1 << PL08X_BOUNDARY_SHIFT) -/* Minimum period between work queue runs */ -#define PL08X_WQ_PERIODMIN 20 - /* Size (bytes) of each LLI buffer allocated for one transfer */ # define PL08X_LLI_TSFR_SIZE 0x2000 /* Maximum times we call dma_pool_alloc on this pool without freeing */ -#define PL08X_MAX_ALLOCS 0x40 #define MAX_NUM_TSFR_LLIS (PL08X_LLI_TSFR_SIZE/sizeof(struct pl08x_lli)) #define PL08X_ALIGN 8 @@ -495,10 +491,10 @@ static inline u32 pl08x_cctl_bits(u32 cctl, u8 srcwidth, u8 dstwidth, struct pl08x_lli_build_data { struct pl08x_txd *txd; - struct pl08x_driver_data *pl08x; struct pl08x_bus_data srcbus; struct pl08x_bus_data dstbus; size_t remainder; + u32 lli_bus; }; /* @@ -551,8 +547,7 @@ static void pl08x_fill_lli_for_desc(struct pl08x_lli_build_data *bd, llis_va[num_llis].src = bd->srcbus.addr; llis_va[num_llis].dst = bd->dstbus.addr; llis_va[num_llis].lli = llis_bus + (num_llis + 1) * sizeof(struct pl08x_lli); - if (bd->pl08x->lli_buses & PL08X_AHB2) - llis_va[num_llis].lli |= PL080_LLI_LM_AHB2; + llis_va[num_llis].lli |= bd->lli_bus; if (cctl & PL080_CONTROL_SRC_INCR) bd->srcbus.addr += len; @@ -605,9 +600,9 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, cctl = txd->cctl; bd.txd = txd; - bd.pl08x = pl08x; bd.srcbus.addr = txd->src_addr; bd.dstbus.addr = txd->dst_addr; + bd.lli_bus = (pl08x->lli_buses & PL08X_AHB2) ? PL080_LLI_LM_AHB2 : 0; /* Find maximum width of the source bus */ bd.srcbus.maxwidth = @@ -622,25 +617,15 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, /* Set up the bus widths to the maximum */ bd.srcbus.buswidth = bd.srcbus.maxwidth; bd.dstbus.buswidth = bd.dstbus.maxwidth; - dev_vdbg(&pl08x->adev->dev, - "%s source bus is %d bytes wide, dest bus is %d bytes wide\n", - __func__, bd.srcbus.buswidth, bd.dstbus.buswidth); - /* * Bytes transferred == tsize * MIN(buswidths), not max(buswidths) */ max_bytes_per_lli = min(bd.srcbus.buswidth, bd.dstbus.buswidth) * PL080_CONTROL_TRANSFER_SIZE_MASK; - dev_vdbg(&pl08x->adev->dev, - "%s max bytes per lli = %zu\n", - __func__, max_bytes_per_lli); /* We need to count this down to zero */ bd.remainder = txd->len; - dev_vdbg(&pl08x->adev->dev, - "%s remainder = %zu\n", - __func__, bd.remainder); /* * Choose bus to align to @@ -649,6 +634,16 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, */ pl08x_choose_master_bus(&bd, &mbus, &sbus, cctl); + dev_vdbg(&pl08x->adev->dev, "src=0x%08x%s/%u dst=0x%08x%s/%u len=%zu llimax=%zu\n", + bd.srcbus.addr, cctl & PL080_CONTROL_SRC_INCR ? "+" : "", + bd.srcbus.buswidth, + bd.dstbus.addr, cctl & PL080_CONTROL_DST_INCR ? "+" : "", + bd.dstbus.buswidth, + bd.remainder, max_bytes_per_lli); + dev_vdbg(&pl08x->adev->dev, "mbus=%s sbus=%s\n", + mbus == &bd.srcbus ? "src" : "dst", + sbus == &bd.srcbus ? "src" : "dst"); + if (txd->len < mbus->buswidth) { /* Less than a bus width available - send as single bytes */ while (bd.remainder) { @@ -840,15 +835,14 @@ static int pl08x_fill_llis_for_desc(struct pl08x_driver_data *pl08x, { int i; + dev_vdbg(&pl08x->adev->dev, + "%-3s %-9s %-10s %-10s %-10s %s\n", + "lli", "", "csrc", "cdst", "clli", "cctl"); for (i = 0; i < num_llis; i++) { dev_vdbg(&pl08x->adev->dev, - "lli %d @%p: csrc=0x%08x, cdst=0x%08x, cctl=0x%08x, clli=0x%08x\n", - i, - &llis_va[i], - llis_va[i].src, - llis_va[i].dst, - llis_va[i].cctl, - llis_va[i].lli + "%3d @%p: 0x%08x 0x%08x 0x%08x 0x%08x\n", + i, &llis_va[i], llis_va[i].src, + llis_va[i].dst, llis_va[i].lli, llis_va[i].cctl ); } } @@ -1054,64 +1048,105 @@ pl08x_dma_tx_status(struct dma_chan *chan, /* PrimeCell DMA extension */ struct burst_table { - int burstwords; + u32 burstwords; u32 reg; }; static const struct burst_table burst_sizes[] = { { .burstwords = 256, - .reg = (PL080_BSIZE_256 << PL080_CONTROL_SB_SIZE_SHIFT) | - (PL080_BSIZE_256 << PL080_CONTROL_DB_SIZE_SHIFT), + .reg = PL080_BSIZE_256, }, { .burstwords = 128, - .reg = (PL080_BSIZE_128 << PL080_CONTROL_SB_SIZE_SHIFT) | - (PL080_BSIZE_128 << PL080_CONTROL_DB_SIZE_SHIFT), + .reg = PL080_BSIZE_128, }, { .burstwords = 64, - .reg = (PL080_BSIZE_64 << PL080_CONTROL_SB_SIZE_SHIFT) | - (PL080_BSIZE_64 << PL080_CONTROL_DB_SIZE_SHIFT), + .reg = PL080_BSIZE_64, }, { .burstwords = 32, - .reg = (PL080_BSIZE_32 << PL080_CONTROL_SB_SIZE_SHIFT) | - (PL080_BSIZE_32 << PL080_CONTROL_DB_SIZE_SHIFT), + .reg = PL080_BSIZE_32, }, { .burstwords = 16, - .reg = (PL080_BSIZE_16 << PL080_CONTROL_SB_SIZE_SHIFT) | - (PL080_BSIZE_16 << PL080_CONTROL_DB_SIZE_SHIFT), + .reg = PL080_BSIZE_16, }, { .burstwords = 8, - .reg = (PL080_BSIZE_8 << PL080_CONTROL_SB_SIZE_SHIFT) | - (PL080_BSIZE_8 << PL080_CONTROL_DB_SIZE_SHIFT), + .reg = PL080_BSIZE_8, }, { .burstwords = 4, - .reg = (PL080_BSIZE_4 << PL080_CONTROL_SB_SIZE_SHIFT) | - (PL080_BSIZE_4 << PL080_CONTROL_DB_SIZE_SHIFT), + .reg = PL080_BSIZE_4, }, { - .burstwords = 1, - .reg = (PL080_BSIZE_1 << PL080_CONTROL_SB_SIZE_SHIFT) | - (PL080_BSIZE_1 << PL080_CONTROL_DB_SIZE_SHIFT), + .burstwords = 0, + .reg = PL080_BSIZE_1, }, }; +/* + * Given the source and destination available bus masks, select which + * will be routed to each port. We try to have source and destination + * on separate ports, but always respect the allowable settings. + */ +static u32 pl08x_select_bus(u8 src, u8 dst) +{ + u32 cctl = 0; + + if (!(dst & PL08X_AHB1) || ((dst & PL08X_AHB2) && (src & PL08X_AHB1))) + cctl |= PL080_CONTROL_DST_AHB2; + if (!(src & PL08X_AHB1) || ((src & PL08X_AHB2) && !(dst & PL08X_AHB2))) + cctl |= PL080_CONTROL_SRC_AHB2; + + return cctl; +} + +static u32 pl08x_cctl(u32 cctl) +{ + cctl &= ~(PL080_CONTROL_SRC_AHB2 | PL080_CONTROL_DST_AHB2 | + PL080_CONTROL_SRC_INCR | PL080_CONTROL_DST_INCR | + PL080_CONTROL_PROT_MASK); + + /* Access the cell in privileged mode, non-bufferable, non-cacheable */ + return cctl | PL080_CONTROL_PROT_SYS; +} + +static u32 pl08x_width(enum dma_slave_buswidth width) +{ + switch (width) { + case DMA_SLAVE_BUSWIDTH_1_BYTE: + return PL080_WIDTH_8BIT; + case DMA_SLAVE_BUSWIDTH_2_BYTES: + return PL080_WIDTH_16BIT; + case DMA_SLAVE_BUSWIDTH_4_BYTES: + return PL080_WIDTH_32BIT; + default: + return ~0; + } +} + +static u32 pl08x_burst(u32 maxburst) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(burst_sizes); i++) + if (burst_sizes[i].burstwords <= maxburst) + break; + + return burst_sizes[i].reg; +} + static int dma_set_runtime_config(struct dma_chan *chan, struct dma_slave_config *config) { struct pl08x_dma_chan *plchan = to_pl08x_chan(chan); struct pl08x_driver_data *pl08x = plchan->host; - struct pl08x_channel_data *cd = plchan->cd; enum dma_slave_buswidth addr_width; - dma_addr_t addr; - u32 maxburst; + u32 width, burst, maxburst; u32 cctl = 0; - int i; if (!plchan->slave) return -EINVAL; @@ -1119,11 +1154,9 @@ static int dma_set_runtime_config(struct dma_chan *chan, /* Transfer direction */ plchan->runtime_direction = config->direction; if (config->direction == DMA_TO_DEVICE) { - addr = config->dst_addr; addr_width = config->dst_addr_width; maxburst = config->dst_maxburst; } else if (config->direction == DMA_FROM_DEVICE) { - addr = config->src_addr; addr_width = config->src_addr_width; maxburst = config->src_maxburst; } else { @@ -1132,46 +1165,40 @@ static int dma_set_runtime_config(struct dma_chan *chan, return -EINVAL; } - switch (addr_width) { - case DMA_SLAVE_BUSWIDTH_1_BYTE: - cctl |= (PL080_WIDTH_8BIT << PL080_CONTROL_SWIDTH_SHIFT) | - (PL080_WIDTH_8BIT << PL080_CONTROL_DWIDTH_SHIFT); - break; - case DMA_SLAVE_BUSWIDTH_2_BYTES: - cctl |= (PL080_WIDTH_16BIT << PL080_CONTROL_SWIDTH_SHIFT) | - (PL080_WIDTH_16BIT << PL080_CONTROL_DWIDTH_SHIFT); - break; - case DMA_SLAVE_BUSWIDTH_4_BYTES: - cctl |= (PL080_WIDTH_32BIT << PL080_CONTROL_SWIDTH_SHIFT) | - (PL080_WIDTH_32BIT << PL080_CONTROL_DWIDTH_SHIFT); - break; - default: + width = pl08x_width(addr_width); + if (width == ~0) { dev_err(&pl08x->adev->dev, "bad runtime_config: alien address width\n"); return -EINVAL; } + cctl |= width << PL080_CONTROL_SWIDTH_SHIFT; + cctl |= width << PL080_CONTROL_DWIDTH_SHIFT; + /* - * Now decide on a maxburst: * If this channel will only request single transfers, set this * down to ONE element. Also select one element if no maxburst * is specified. */ - if (plchan->cd->single || maxburst == 0) { - cctl |= (PL080_BSIZE_1 << PL080_CONTROL_SB_SIZE_SHIFT) | - (PL080_BSIZE_1 << PL080_CONTROL_DB_SIZE_SHIFT); + if (plchan->cd->single) + maxburst = 1; + + burst = pl08x_burst(maxburst); + cctl |= burst << PL080_CONTROL_SB_SIZE_SHIFT; + cctl |= burst << PL080_CONTROL_DB_SIZE_SHIFT; + + if (plchan->runtime_direction == DMA_FROM_DEVICE) { + plchan->src_addr = config->src_addr; + plchan->src_cctl = pl08x_cctl(cctl) | PL080_CONTROL_DST_INCR | + pl08x_select_bus(plchan->cd->periph_buses, + pl08x->mem_buses); } else { - for (i = 0; i < ARRAY_SIZE(burst_sizes); i++) - if (burst_sizes[i].burstwords <= maxburst) - break; - cctl |= burst_sizes[i].reg; + plchan->dst_addr = config->dst_addr; + plchan->dst_cctl = pl08x_cctl(cctl) | PL080_CONTROL_SRC_INCR | + pl08x_select_bus(pl08x->mem_buses, + plchan->cd->periph_buses); } - plchan->runtime_addr = addr; - - /* Modify the default channel data to fit PrimeCell request */ - cd->cctl = cctl; - dev_dbg(&pl08x->adev->dev, "configured channel %s (%s) for %s, data width %d, " "maxburst %d words, LE, CCTL=0x%08x\n", @@ -1270,23 +1297,6 @@ static int pl08x_prep_channel_resources(struct pl08x_dma_chan *plchan, return 0; } -/* - * Given the source and destination available bus masks, select which - * will be routed to each port. We try to have source and destination - * on separate ports, but always respect the allowable settings. - */ -static u32 pl08x_select_bus(struct pl08x_driver_data *pl08x, u8 src, u8 dst) -{ - u32 cctl = 0; - - if (!(dst & PL08X_AHB1) || ((dst & PL08X_AHB2) && (src & PL08X_AHB1))) - cctl |= PL080_CONTROL_DST_AHB2; - if (!(src & PL08X_AHB1) || ((src & PL08X_AHB2) && !(dst & PL08X_AHB2))) - cctl |= PL080_CONTROL_SRC_AHB2; - - return cctl; -} - static struct pl08x_txd *pl08x_get_txd(struct pl08x_dma_chan *plchan, unsigned long flags) { @@ -1338,8 +1348,8 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy( txd->cctl |= PL080_CONTROL_SRC_INCR | PL080_CONTROL_DST_INCR; if (pl08x->vd->dualmaster) - txd->cctl |= pl08x_select_bus(pl08x, - pl08x->mem_buses, pl08x->mem_buses); + txd->cctl |= pl08x_select_bus(pl08x->mem_buses, + pl08x->mem_buses); ret = pl08x_prep_channel_resources(plchan, txd); if (ret) @@ -1356,7 +1366,6 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( struct pl08x_dma_chan *plchan = to_pl08x_chan(chan); struct pl08x_driver_data *pl08x = plchan->host; struct pl08x_txd *txd; - u8 src_buses, dst_buses; int ret; /* @@ -1390,42 +1399,22 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg( txd->direction = direction; txd->len = sgl->length; - txd->cctl = plchan->cd->cctl & - ~(PL080_CONTROL_SRC_AHB2 | PL080_CONTROL_DST_AHB2 | - PL080_CONTROL_SRC_INCR | PL080_CONTROL_DST_INCR | - PL080_CONTROL_PROT_MASK); - - /* Access the cell in privileged mode, non-bufferable, non-cacheable */ - txd->cctl |= PL080_CONTROL_PROT_SYS; - if (direction == DMA_TO_DEVICE) { txd->ccfg |= PL080_FLOW_MEM2PER << PL080_CONFIG_FLOW_CONTROL_SHIFT; - txd->cctl |= PL080_CONTROL_SRC_INCR; + txd->cctl = plchan->dst_cctl; txd->src_addr = sgl->dma_address; - if (plchan->runtime_addr) - txd->dst_addr = plchan->runtime_addr; - else - txd->dst_addr = plchan->cd->addr; - src_buses = pl08x->mem_buses; - dst_buses = plchan->cd->periph_buses; + txd->dst_addr = plchan->dst_addr; } else if (direction == DMA_FROM_DEVICE) { txd->ccfg |= PL080_FLOW_PER2MEM << PL080_CONFIG_FLOW_CONTROL_SHIFT; - txd->cctl |= PL080_CONTROL_DST_INCR; - if (plchan->runtime_addr) - txd->src_addr = plchan->runtime_addr; - else - txd->src_addr = plchan->cd->addr; + txd->cctl = plchan->src_cctl; + txd->src_addr = plchan->src_addr; txd->dst_addr = sgl->dma_address; - src_buses = plchan->cd->periph_buses; - dst_buses = pl08x->mem_buses; } else { dev_err(&pl08x->adev->dev, "%s direction unsupported\n", __func__); return NULL; } - txd->cctl |= pl08x_select_bus(pl08x, src_buses, dst_buses); - ret = pl08x_prep_channel_resources(plchan, txd); if (ret) return NULL; @@ -1676,6 +1665,20 @@ static irqreturn_t pl08x_irq(int irq, void *dev) return mask ? IRQ_HANDLED : IRQ_NONE; } +static void pl08x_dma_slave_init(struct pl08x_dma_chan *chan) +{ + u32 cctl = pl08x_cctl(chan->cd->cctl); + + chan->slave = true; + chan->name = chan->cd->bus_id; + chan->src_addr = chan->cd->addr; + chan->dst_addr = chan->cd->addr; + chan->src_cctl = cctl | PL080_CONTROL_DST_INCR | + pl08x_select_bus(chan->cd->periph_buses, chan->host->mem_buses); + chan->dst_cctl = cctl | PL080_CONTROL_SRC_INCR | + pl08x_select_bus(chan->host->mem_buses, chan->cd->periph_buses); +} + /* * Initialise the DMAC memcpy/slave channels. * Make a local wrapper to hold required data @@ -1707,9 +1710,8 @@ static int pl08x_dma_init_virtual_channels(struct pl08x_driver_data *pl08x, chan->state = PL08X_CHAN_IDLE; if (slave) { - chan->slave = true; - chan->name = pl08x->pd->slave_channels[i].bus_id; chan->cd = &pl08x->pd->slave_channels[i]; + pl08x_dma_slave_init(chan); } else { chan->cd = &pl08x->pd->memcpy_channel; chan->name = kasprintf(GFP_KERNEL, "memcpy%d", i); diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index 36144f88d718..6a483eac7b3f 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -1216,7 +1216,7 @@ static int __init at_dma_probe(struct platform_device *pdev) atdma->dma_common.cap_mask = pdata->cap_mask; atdma->all_chan_mask = (1 << pdata->nr_channels) - 1; - size = io->end - io->start + 1; + size = resource_size(io); if (!request_mem_region(io->start, size, pdev->dev.driver->name)) { err = -EBUSY; goto err_kfree; @@ -1362,7 +1362,7 @@ static int __exit at_dma_remove(struct platform_device *pdev) atdma->regs = NULL; io = platform_get_resource(pdev, IORESOURCE_MEM, 0); - release_mem_region(io->start, io->end - io->start + 1); + release_mem_region(io->start, resource_size(io)); kfree(atdma); diff --git a/drivers/dma/coh901318.c b/drivers/dma/coh901318.c index a92d95eac86b..4234f416ef11 100644 --- a/drivers/dma/coh901318.c +++ b/drivers/dma/coh901318.c @@ -41,6 +41,8 @@ struct coh901318_desc { struct coh901318_lli *lli; enum dma_data_direction dir; unsigned long flags; + u32 head_config; + u32 head_ctrl; }; struct coh901318_base { @@ -661,6 +663,9 @@ static struct coh901318_desc *coh901318_queue_start(struct coh901318_chan *cohc) coh901318_desc_submit(cohc, cohd); + /* Program the transaction head */ + coh901318_set_conf(cohc, cohd->head_config); + coh901318_set_ctrl(cohc, cohd->head_ctrl); coh901318_prep_linked_list(cohc, cohd->lli); /* start dma job on this channel */ @@ -1091,8 +1096,6 @@ coh901318_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, } else goto err_direction; - coh901318_set_conf(cohc, config); - /* The dma only supports transmitting packages up to * MAX_DMA_PACKET_SIZE. Calculate to total number of * dma elemts required to send the entire sg list @@ -1129,16 +1132,18 @@ coh901318_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, if (ret) goto err_lli_fill; - /* - * Set the default ctrl for the channel to the one from the lli, - * things may have changed due to odd buffer alignment etc. - */ - coh901318_set_ctrl(cohc, lli->control); COH_DBG(coh901318_list_print(cohc, lli)); /* Pick a descriptor to handle this transfer */ cohd = coh901318_desc_get(cohc); + cohd->head_config = config; + /* + * Set the default head ctrl for the channel to the one from the + * lli, things may have changed due to odd buffer alignment + * etc. + */ + cohd->head_ctrl = lli->control; cohd->dir = direction; cohd->flags = flags; cohd->desc.tx_submit = coh901318_tx_submit; diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 48694c34d96b..26374b2a55a2 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -510,8 +510,8 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v dma_chan_name(chan)); list_del_rcu(&device->global_node); } else if (err) - pr_err("dmaengine: failed to get %s: (%d)\n", - dma_chan_name(chan), err); + pr_debug("dmaengine: failed to get %s: (%d)\n", + dma_chan_name(chan), err); else break; if (--device->privatecnt == 0) diff --git a/drivers/dma/ep93xx_dma.c b/drivers/dma/ep93xx_dma.c index 0766c1e53b1d..5d7a49bd7c26 100644 --- a/drivers/dma/ep93xx_dma.c +++ b/drivers/dma/ep93xx_dma.c @@ -902,7 +902,7 @@ static void ep93xx_dma_free_chan_resources(struct dma_chan *chan) * * Returns a valid DMA descriptor or %NULL in case of failure. */ -struct dma_async_tx_descriptor * +static struct dma_async_tx_descriptor * ep93xx_dma_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, size_t len, unsigned long flags) { diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 1eb60ded2f0d..7bd7e98548cd 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1305,8 +1305,10 @@ static int __init sdma_probe(struct platform_device *pdev) goto err_request_irq; sdma->script_addrs = kzalloc(sizeof(*sdma->script_addrs), GFP_KERNEL); - if (!sdma->script_addrs) + if (!sdma->script_addrs) { + ret = -ENOMEM; goto err_alloc; + } if (of_id) pdev->id_entry = of_id->data; diff --git a/drivers/dma/intel_mid_dma.c b/drivers/dma/intel_mid_dma.c index f653517ef744..8a3fdd87db97 100644 --- a/drivers/dma/intel_mid_dma.c +++ b/drivers/dma/intel_mid_dma.c @@ -1351,7 +1351,6 @@ int dma_suspend(struct pci_dev *pci, pm_message_t state) return -EAGAIN; } device->state = SUSPENDED; - pci_set_drvdata(pci, device); pci_save_state(pci); pci_disable_device(pci); pci_set_power_state(pci, PCI_D3hot); @@ -1380,7 +1379,6 @@ int dma_resume(struct pci_dev *pci) } device->state = RUNNING; iowrite32(REG_BIT0, device->dma_base + DMA_CFG); - pci_set_drvdata(pci, device); return 0; } diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c index fd7d2b308cf2..6815905a772f 100644 --- a/drivers/dma/ipu/ipu_idmac.c +++ b/drivers/dma/ipu/ipu_idmac.c @@ -1706,16 +1706,14 @@ static int __init ipu_probe(struct platform_device *pdev) ipu_data.irq_fn, ipu_data.irq_err, ipu_data.irq_base); /* Remap IPU common registers */ - ipu_data.reg_ipu = ioremap(mem_ipu->start, - mem_ipu->end - mem_ipu->start + 1); + ipu_data.reg_ipu = ioremap(mem_ipu->start, resource_size(mem_ipu)); if (!ipu_data.reg_ipu) { ret = -ENOMEM; goto err_ioremap_ipu; } /* Remap Image Converter and Image DMA Controller registers */ - ipu_data.reg_ic = ioremap(mem_ic->start, - mem_ic->end - mem_ic->start + 1); + ipu_data.reg_ic = ioremap(mem_ic->start, resource_size(mem_ic)); if (!ipu_data.reg_ic) { ret = -ENOMEM; goto err_ioremap_ic; diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 06f9f27dbe7c..9a353c2216d0 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -1304,7 +1304,8 @@ static int mv_xor_shared_probe(struct platform_device *pdev) if (!res) return -ENODEV; - msp->xor_base = devm_ioremap(&pdev->dev, res->start, resource_size(res)); + msp->xor_base = devm_ioremap(&pdev->dev, res->start, + resource_size(res)); if (!msp->xor_base) return -EBUSY; diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c index 88aad4f54002..be641cbd36fc 100644 --- a/drivers/dma/mxs-dma.c +++ b/drivers/dma/mxs-dma.c @@ -327,10 +327,12 @@ static int mxs_dma_alloc_chan_resources(struct dma_chan *chan) memset(mxs_chan->ccw, 0, PAGE_SIZE); - ret = request_irq(mxs_chan->chan_irq, mxs_dma_int_handler, - 0, "mxs-dma", mxs_dma); - if (ret) - goto err_irq; + if (mxs_chan->chan_irq != NO_IRQ) { + ret = request_irq(mxs_chan->chan_irq, mxs_dma_int_handler, + 0, "mxs-dma", mxs_dma); + if (ret) + goto err_irq; + } ret = clk_enable(mxs_dma->clk); if (ret) @@ -535,6 +537,7 @@ static int mxs_dma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, switch (cmd) { case DMA_TERMINATE_ALL: mxs_dma_disable_chan(mxs_chan); + mxs_dma_reset_chan(mxs_chan); break; case DMA_PAUSE: mxs_dma_pause_chan(mxs_chan); @@ -707,6 +710,8 @@ static struct platform_device_id mxs_dma_type[] = { }, { .name = "mxs-dma-apbx", .driver_data = MXS_DMA_APBX, + }, { + /* end of list */ } }; diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c index ff5b38f9d45b..1ac8d4b580b7 100644 --- a/drivers/dma/pch_dma.c +++ b/drivers/dma/pch_dma.c @@ -45,7 +45,8 @@ #define DMA_STATUS_MASK_BITS 0x3 #define DMA_STATUS_SHIFT_BITS 16 #define DMA_STATUS_IRQ(x) (0x1 << (x)) -#define DMA_STATUS_ERR(x) (0x1 << ((x) + 8)) +#define DMA_STATUS0_ERR(x) (0x1 << ((x) + 8)) +#define DMA_STATUS2_ERR(x) (0x1 << (x)) #define DMA_DESC_WIDTH_SHIFT_BITS 12 #define DMA_DESC_WIDTH_1_BYTE (0x3 << DMA_DESC_WIDTH_SHIFT_BITS) @@ -61,6 +62,9 @@ #define MAX_CHAN_NR 8 +#define DMA_MASK_CTL0_MODE 0x33333333 +#define DMA_MASK_CTL2_MODE 0x00003333 + static unsigned int init_nr_desc_per_channel = 64; module_param(init_nr_desc_per_channel, uint, 0644); MODULE_PARM_DESC(init_nr_desc_per_channel, @@ -133,6 +137,7 @@ struct pch_dma { #define PCH_DMA_CTL3 0x0C #define PCH_DMA_STS0 0x10 #define PCH_DMA_STS1 0x14 +#define PCH_DMA_STS2 0x18 #define dma_readl(pd, name) \ readl((pd)->membase + PCH_DMA_##name) @@ -183,13 +188,19 @@ static void pdc_enable_irq(struct dma_chan *chan, int enable) { struct pch_dma *pd = to_pd(chan->device); u32 val; + int pos; + + if (chan->chan_id < 8) + pos = chan->chan_id; + else + pos = chan->chan_id + 8; val = dma_readl(pd, CTL2); if (enable) - val |= 0x1 << chan->chan_id; + val |= 0x1 << pos; else - val &= ~(0x1 << chan->chan_id); + val &= ~(0x1 << pos); dma_writel(pd, CTL2, val); @@ -202,10 +213,17 @@ static void pdc_set_dir(struct dma_chan *chan) struct pch_dma_chan *pd_chan = to_pd_chan(chan); struct pch_dma *pd = to_pd(chan->device); u32 val; + u32 mask_mode; + u32 mask_ctl; if (chan->chan_id < 8) { val = dma_readl(pd, CTL0); + mask_mode = DMA_CTL0_MODE_MASK_BITS << + (DMA_CTL0_BITS_PER_CH * chan->chan_id); + mask_ctl = DMA_MASK_CTL0_MODE & ~(DMA_CTL0_MODE_MASK_BITS << + (DMA_CTL0_BITS_PER_CH * chan->chan_id)); + val &= mask_mode; if (pd_chan->dir == DMA_TO_DEVICE) val |= 0x1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id + DMA_CTL0_DIR_SHIFT_BITS); @@ -213,18 +231,24 @@ static void pdc_set_dir(struct dma_chan *chan) val &= ~(0x1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id + DMA_CTL0_DIR_SHIFT_BITS)); + val |= mask_ctl; dma_writel(pd, CTL0, val); } else { int ch = chan->chan_id - 8; /* ch8-->0 ch9-->1 ... ch11->3 */ val = dma_readl(pd, CTL3); + mask_mode = DMA_CTL0_MODE_MASK_BITS << + (DMA_CTL0_BITS_PER_CH * ch); + mask_ctl = DMA_MASK_CTL2_MODE & ~(DMA_CTL0_MODE_MASK_BITS << + (DMA_CTL0_BITS_PER_CH * ch)); + val &= mask_mode; if (pd_chan->dir == DMA_TO_DEVICE) val |= 0x1 << (DMA_CTL0_BITS_PER_CH * ch + DMA_CTL0_DIR_SHIFT_BITS); else val &= ~(0x1 << (DMA_CTL0_BITS_PER_CH * ch + DMA_CTL0_DIR_SHIFT_BITS)); - + val |= mask_ctl; dma_writel(pd, CTL3, val); } @@ -236,33 +260,37 @@ static void pdc_set_mode(struct dma_chan *chan, u32 mode) { struct pch_dma *pd = to_pd(chan->device); u32 val; + u32 mask_ctl; + u32 mask_dir; if (chan->chan_id < 8) { + mask_ctl = DMA_MASK_CTL0_MODE & ~(DMA_CTL0_MODE_MASK_BITS << + (DMA_CTL0_BITS_PER_CH * chan->chan_id)); + mask_dir = 1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id +\ + DMA_CTL0_DIR_SHIFT_BITS); val = dma_readl(pd, CTL0); - - val &= ~(DMA_CTL0_MODE_MASK_BITS << - (DMA_CTL0_BITS_PER_CH * chan->chan_id)); + val &= mask_dir; val |= mode << (DMA_CTL0_BITS_PER_CH * chan->chan_id); - + val |= mask_ctl; dma_writel(pd, CTL0, val); } else { int ch = chan->chan_id - 8; /* ch8-->0 ch9-->1 ... ch11->3 */ - + mask_ctl = DMA_MASK_CTL2_MODE & ~(DMA_CTL0_MODE_MASK_BITS << + (DMA_CTL0_BITS_PER_CH * ch)); + mask_dir = 1 << (DMA_CTL0_BITS_PER_CH * ch +\ + DMA_CTL0_DIR_SHIFT_BITS); val = dma_readl(pd, CTL3); - - val &= ~(DMA_CTL0_MODE_MASK_BITS << - (DMA_CTL0_BITS_PER_CH * ch)); + val &= mask_dir; val |= mode << (DMA_CTL0_BITS_PER_CH * ch); - + val |= mask_ctl; dma_writel(pd, CTL3, val); - } dev_dbg(chan2dev(chan), "pdc_set_mode: chan %d -> %x\n", chan->chan_id, val); } -static u32 pdc_get_status(struct pch_dma_chan *pd_chan) +static u32 pdc_get_status0(struct pch_dma_chan *pd_chan) { struct pch_dma *pd = to_pd(pd_chan->chan.device); u32 val; @@ -272,9 +300,27 @@ static u32 pdc_get_status(struct pch_dma_chan *pd_chan) DMA_STATUS_BITS_PER_CH * pd_chan->chan.chan_id)); } +static u32 pdc_get_status2(struct pch_dma_chan *pd_chan) +{ + struct pch_dma *pd = to_pd(pd_chan->chan.device); + u32 val; + + val = dma_readl(pd, STS2); + return DMA_STATUS_MASK_BITS & (val >> (DMA_STATUS_SHIFT_BITS + + DMA_STATUS_BITS_PER_CH * (pd_chan->chan.chan_id - 8))); +} + static bool pdc_is_idle(struct pch_dma_chan *pd_chan) { - if (pdc_get_status(pd_chan) == DMA_STATUS_IDLE) + u32 sts; + + if (pd_chan->chan.chan_id < 8) + sts = pdc_get_status0(pd_chan); + else + sts = pdc_get_status2(pd_chan); + + + if (sts == DMA_STATUS_IDLE) return true; else return false; @@ -495,11 +541,11 @@ static int pd_alloc_chan_resources(struct dma_chan *chan) list_add_tail(&desc->desc_node, &tmp_list); } - spin_lock_bh(&pd_chan->lock); + spin_lock_irq(&pd_chan->lock); list_splice(&tmp_list, &pd_chan->free_list); pd_chan->descs_allocated = i; pd_chan->completed_cookie = chan->cookie = 1; - spin_unlock_bh(&pd_chan->lock); + spin_unlock_irq(&pd_chan->lock); pdc_enable_irq(chan, 1); @@ -517,10 +563,10 @@ static void pd_free_chan_resources(struct dma_chan *chan) BUG_ON(!list_empty(&pd_chan->active_list)); BUG_ON(!list_empty(&pd_chan->queue)); - spin_lock_bh(&pd_chan->lock); + spin_lock_irq(&pd_chan->lock); list_splice_init(&pd_chan->free_list, &tmp_list); pd_chan->descs_allocated = 0; - spin_unlock_bh(&pd_chan->lock); + spin_unlock_irq(&pd_chan->lock); list_for_each_entry_safe(desc, _d, &tmp_list, desc_node) pci_pool_free(pd->pool, desc, desc->txd.phys); @@ -536,10 +582,10 @@ static enum dma_status pd_tx_status(struct dma_chan *chan, dma_cookie_t cookie, dma_cookie_t last_completed; int ret; - spin_lock_bh(&pd_chan->lock); + spin_lock_irq(&pd_chan->lock); last_completed = pd_chan->completed_cookie; last_used = chan->cookie; - spin_unlock_bh(&pd_chan->lock); + spin_unlock_irq(&pd_chan->lock); ret = dma_async_is_complete(cookie, last_completed, last_used); @@ -654,7 +700,7 @@ static int pd_device_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, if (cmd != DMA_TERMINATE_ALL) return -ENXIO; - spin_lock_bh(&pd_chan->lock); + spin_lock_irq(&pd_chan->lock); pdc_set_mode(&pd_chan->chan, DMA_CTL0_DISABLE); @@ -664,7 +710,7 @@ static int pd_device_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, list_for_each_entry_safe(desc, _d, &list, desc_node) pdc_chain_complete(pd_chan, desc); - spin_unlock_bh(&pd_chan->lock); + spin_unlock_irq(&pd_chan->lock); return 0; } @@ -693,30 +739,45 @@ static irqreturn_t pd_irq(int irq, void *devid) struct pch_dma *pd = (struct pch_dma *)devid; struct pch_dma_chan *pd_chan; u32 sts0; + u32 sts2; int i; - int ret = IRQ_NONE; + int ret0 = IRQ_NONE; + int ret2 = IRQ_NONE; sts0 = dma_readl(pd, STS0); + sts2 = dma_readl(pd, STS2); dev_dbg(pd->dma.dev, "pd_irq sts0: %x\n", sts0); for (i = 0; i < pd->dma.chancnt; i++) { pd_chan = &pd->channels[i]; - if (sts0 & DMA_STATUS_IRQ(i)) { - if (sts0 & DMA_STATUS_ERR(i)) - set_bit(0, &pd_chan->err_status); + if (i < 8) { + if (sts0 & DMA_STATUS_IRQ(i)) { + if (sts0 & DMA_STATUS0_ERR(i)) + set_bit(0, &pd_chan->err_status); - tasklet_schedule(&pd_chan->tasklet); - ret = IRQ_HANDLED; - } + tasklet_schedule(&pd_chan->tasklet); + ret0 = IRQ_HANDLED; + } + } else { + if (sts2 & DMA_STATUS_IRQ(i - 8)) { + if (sts2 & DMA_STATUS2_ERR(i)) + set_bit(0, &pd_chan->err_status); + tasklet_schedule(&pd_chan->tasklet); + ret2 = IRQ_HANDLED; + } + } } /* clear interrupt bits in status register */ - dma_writel(pd, STS0, sts0); + if (ret0) + dma_writel(pd, STS0, sts0); + if (ret2) + dma_writel(pd, STS2, sts2); - return ret; + return ret0 | ret2; } #ifdef CONFIG_PM diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index 6abe1ec1f2ce..00eee59e8b33 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -82,7 +82,7 @@ struct dma_pl330_dmac { spinlock_t pool_lock; /* Peripheral channels connected to this DMAC */ - struct dma_pl330_chan peripherals[0]; /* keep at end */ + struct dma_pl330_chan *peripherals; /* keep at end */ }; struct dma_pl330_desc { @@ -451,8 +451,13 @@ static struct dma_pl330_desc *pl330_get_desc(struct dma_pl330_chan *pch) desc->txd.cookie = 0; async_tx_ack(&desc->txd); - desc->req.rqtype = peri->rqtype; - desc->req.peri = peri->peri_id; + if (peri) { + desc->req.rqtype = peri->rqtype; + desc->req.peri = peri->peri_id; + } else { + desc->req.rqtype = MEMTOMEM; + desc->req.peri = 0; + } dma_async_tx_descriptor_init(&desc->txd, &pch->chan); @@ -529,10 +534,10 @@ pl330_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dst, struct pl330_info *pi; int burst; - if (unlikely(!pch || !len || !peri)) + if (unlikely(!pch || !len)) return NULL; - if (peri->rqtype != MEMTOMEM) + if (peri && peri->rqtype != MEMTOMEM) return NULL; pi = &pch->dmac->pif; @@ -577,7 +582,7 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, int i, burst_size; dma_addr_t addr; - if (unlikely(!pch || !sgl || !sg_len)) + if (unlikely(!pch || !sgl || !sg_len || !peri)) return NULL; /* Make sure the direction is consistent */ @@ -666,17 +671,12 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) struct dma_device *pd; struct resource *res; int i, ret, irq; + int num_chan; pdat = adev->dev.platform_data; - if (!pdat || !pdat->nr_valid_peri) { - dev_err(&adev->dev, "platform data missing\n"); - return -ENODEV; - } - /* Allocate a new DMAC and its Channels */ - pdmac = kzalloc(pdat->nr_valid_peri * sizeof(*pch) - + sizeof(*pdmac), GFP_KERNEL); + pdmac = kzalloc(sizeof(*pdmac), GFP_KERNEL); if (!pdmac) { dev_err(&adev->dev, "unable to allocate mem\n"); return -ENOMEM; @@ -685,7 +685,7 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) pi = &pdmac->pif; pi->dev = &adev->dev; pi->pl330_data = NULL; - pi->mcbufsz = pdat->mcbuf_sz; + pi->mcbufsz = pdat ? pdat->mcbuf_sz : 0; res = &adev->res; request_mem_region(res->start, resource_size(res), "dma-pl330"); @@ -717,27 +717,35 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) INIT_LIST_HEAD(&pd->channels); /* Initialize channel parameters */ - for (i = 0; i < pdat->nr_valid_peri; i++) { - struct dma_pl330_peri *peri = &pdat->peri[i]; - pch = &pdmac->peripherals[i]; + num_chan = max(pdat ? pdat->nr_valid_peri : 0, (u8)pi->pcfg.num_chan); + pdmac->peripherals = kzalloc(num_chan * sizeof(*pch), GFP_KERNEL); - switch (peri->rqtype) { - case MEMTOMEM: + for (i = 0; i < num_chan; i++) { + pch = &pdmac->peripherals[i]; + if (pdat) { + struct dma_pl330_peri *peri = &pdat->peri[i]; + + switch (peri->rqtype) { + case MEMTOMEM: + dma_cap_set(DMA_MEMCPY, pd->cap_mask); + break; + case MEMTODEV: + case DEVTOMEM: + dma_cap_set(DMA_SLAVE, pd->cap_mask); + break; + default: + dev_err(&adev->dev, "DEVTODEV Not Supported\n"); + continue; + } + pch->chan.private = peri; + } else { dma_cap_set(DMA_MEMCPY, pd->cap_mask); - break; - case MEMTODEV: - case DEVTOMEM: - dma_cap_set(DMA_SLAVE, pd->cap_mask); - break; - default: - dev_err(&adev->dev, "DEVTODEV Not Supported\n"); - continue; + pch->chan.private = NULL; } INIT_LIST_HEAD(&pch->work_list); spin_lock_init(&pch->lock); pch->pl330_chid = NULL; - pch->chan.private = peri; pch->chan.device = pd; pch->chan.chan_id = i; pch->dmac = pdmac; diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c index 29d1addbe0cf..cd3a7c726bf8 100644 --- a/drivers/dma/ste_dma40.c +++ b/drivers/dma/ste_dma40.c @@ -14,6 +14,7 @@ #include <linux/clk.h> #include <linux/delay.h> #include <linux/err.h> +#include <linux/amba/bus.h> #include <plat/ste_dma40.h> @@ -45,9 +46,6 @@ #define D40_ALLOC_PHY (1 << 30) #define D40_ALLOC_LOG_FREE 0 -/* Hardware designer of the block */ -#define D40_HW_DESIGNER 0x8 - /** * enum 40_command - The different commands and/or statuses. * @@ -186,6 +184,8 @@ struct d40_base; * @log_def: Default logical channel settings. * @lcla: Space for one dst src pair for logical channel transfers. * @lcpa: Pointer to dst and src lcpa settings. + * @runtime_addr: runtime configured address. + * @runtime_direction: runtime configured direction. * * This struct can either "be" a logical or a physical channel. */ @@ -200,6 +200,7 @@ struct d40_chan { struct dma_chan chan; struct tasklet_struct tasklet; struct list_head client; + struct list_head pending_queue; struct list_head active; struct list_head queue; struct stedma40_chan_cfg dma_cfg; @@ -645,7 +646,20 @@ static struct d40_desc *d40_first_active_get(struct d40_chan *d40c) static void d40_desc_queue(struct d40_chan *d40c, struct d40_desc *desc) { - list_add_tail(&desc->node, &d40c->queue); + list_add_tail(&desc->node, &d40c->pending_queue); +} + +static struct d40_desc *d40_first_pending(struct d40_chan *d40c) +{ + struct d40_desc *d; + + if (list_empty(&d40c->pending_queue)) + return NULL; + + d = list_first_entry(&d40c->pending_queue, + struct d40_desc, + node); + return d; } static struct d40_desc *d40_first_queued(struct d40_chan *d40c) @@ -802,6 +816,11 @@ static void d40_term_all(struct d40_chan *d40c) d40_desc_free(d40c, d40d); } + /* Release pending descriptors */ + while ((d40d = d40_first_pending(d40c))) { + d40_desc_remove(d40d); + d40_desc_free(d40c, d40d); + } d40c->pending_tx = 0; d40c->busy = false; @@ -2092,7 +2111,7 @@ dma40_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t dma_addr, struct scatterlist *sg; int i; - sg = kcalloc(periods + 1, sizeof(struct scatterlist), GFP_KERNEL); + sg = kcalloc(periods + 1, sizeof(struct scatterlist), GFP_NOWAIT); for (i = 0; i < periods; i++) { sg_dma_address(&sg[i]) = dma_addr; sg_dma_len(&sg[i]) = period_len; @@ -2152,24 +2171,87 @@ static void d40_issue_pending(struct dma_chan *chan) spin_lock_irqsave(&d40c->lock, flags); - /* Busy means that pending jobs are already being processed */ + list_splice_tail_init(&d40c->pending_queue, &d40c->queue); + + /* Busy means that queued jobs are already being processed */ if (!d40c->busy) (void) d40_queue_start(d40c); spin_unlock_irqrestore(&d40c->lock, flags); } +static int +dma40_config_to_halfchannel(struct d40_chan *d40c, + struct stedma40_half_channel_info *info, + enum dma_slave_buswidth width, + u32 maxburst) +{ + enum stedma40_periph_data_width addr_width; + int psize; + + switch (width) { + case DMA_SLAVE_BUSWIDTH_1_BYTE: + addr_width = STEDMA40_BYTE_WIDTH; + break; + case DMA_SLAVE_BUSWIDTH_2_BYTES: + addr_width = STEDMA40_HALFWORD_WIDTH; + break; + case DMA_SLAVE_BUSWIDTH_4_BYTES: + addr_width = STEDMA40_WORD_WIDTH; + break; + case DMA_SLAVE_BUSWIDTH_8_BYTES: + addr_width = STEDMA40_DOUBLEWORD_WIDTH; + break; + default: + dev_err(d40c->base->dev, + "illegal peripheral address width " + "requested (%d)\n", + width); + return -EINVAL; + } + + if (chan_is_logical(d40c)) { + if (maxburst >= 16) + psize = STEDMA40_PSIZE_LOG_16; + else if (maxburst >= 8) + psize = STEDMA40_PSIZE_LOG_8; + else if (maxburst >= 4) + psize = STEDMA40_PSIZE_LOG_4; + else + psize = STEDMA40_PSIZE_LOG_1; + } else { + if (maxburst >= 16) + psize = STEDMA40_PSIZE_PHY_16; + else if (maxburst >= 8) + psize = STEDMA40_PSIZE_PHY_8; + else if (maxburst >= 4) + psize = STEDMA40_PSIZE_PHY_4; + else + psize = STEDMA40_PSIZE_PHY_1; + } + + info->data_width = addr_width; + info->psize = psize; + info->flow_ctrl = STEDMA40_NO_FLOW_CTRL; + + return 0; +} + /* Runtime reconfiguration extension */ -static void d40_set_runtime_config(struct dma_chan *chan, - struct dma_slave_config *config) +static int d40_set_runtime_config(struct dma_chan *chan, + struct dma_slave_config *config) { struct d40_chan *d40c = container_of(chan, struct d40_chan, chan); struct stedma40_chan_cfg *cfg = &d40c->dma_cfg; - enum dma_slave_buswidth config_addr_width; + enum dma_slave_buswidth src_addr_width, dst_addr_width; dma_addr_t config_addr; - u32 config_maxburst; - enum stedma40_periph_data_width addr_width; - int psize; + u32 src_maxburst, dst_maxburst; + int ret; + + src_addr_width = config->src_addr_width; + src_maxburst = config->src_maxburst; + dst_addr_width = config->dst_addr_width; + dst_maxburst = config->dst_maxburst; if (config->direction == DMA_FROM_DEVICE) { dma_addr_t dev_addr_rx = @@ -2188,8 +2270,11 @@ static void d40_set_runtime_config(struct dma_chan *chan, cfg->dir); cfg->dir = STEDMA40_PERIPH_TO_MEM; - config_addr_width = config->src_addr_width; - config_maxburst = config->src_maxburst; + /* Configure the memory side */ + if (dst_addr_width == DMA_SLAVE_BUSWIDTH_UNDEFINED) + dst_addr_width = src_addr_width; + if (dst_maxburst == 0) + dst_maxburst = src_maxburst; } else if (config->direction == DMA_TO_DEVICE) { dma_addr_t dev_addr_tx = @@ -2208,68 +2293,39 @@ static void d40_set_runtime_config(struct dma_chan *chan, cfg->dir); cfg->dir = STEDMA40_MEM_TO_PERIPH; - config_addr_width = config->dst_addr_width; - config_maxburst = config->dst_maxburst; - + /* Configure the memory side */ + if (src_addr_width == DMA_SLAVE_BUSWIDTH_UNDEFINED) + src_addr_width = dst_addr_width; + if (src_maxburst == 0) + src_maxburst = dst_maxburst; } else { dev_err(d40c->base->dev, "unrecognized channel direction %d\n", config->direction); - return; + return -EINVAL; } - switch (config_addr_width) { - case DMA_SLAVE_BUSWIDTH_1_BYTE: - addr_width = STEDMA40_BYTE_WIDTH; - break; - case DMA_SLAVE_BUSWIDTH_2_BYTES: - addr_width = STEDMA40_HALFWORD_WIDTH; - break; - case DMA_SLAVE_BUSWIDTH_4_BYTES: - addr_width = STEDMA40_WORD_WIDTH; - break; - case DMA_SLAVE_BUSWIDTH_8_BYTES: - addr_width = STEDMA40_DOUBLEWORD_WIDTH; - break; - default: + if (src_maxburst * src_addr_width != dst_maxburst * dst_addr_width) { dev_err(d40c->base->dev, - "illegal peripheral address width " - "requested (%d)\n", - config->src_addr_width); - return; + "src/dst width/maxburst mismatch: %d*%d != %d*%d\n", + src_maxburst, + src_addr_width, + dst_maxburst, + dst_addr_width); + return -EINVAL; } - if (chan_is_logical(d40c)) { - if (config_maxburst >= 16) - psize = STEDMA40_PSIZE_LOG_16; - else if (config_maxburst >= 8) - psize = STEDMA40_PSIZE_LOG_8; - else if (config_maxburst >= 4) - psize = STEDMA40_PSIZE_LOG_4; - else - psize = STEDMA40_PSIZE_LOG_1; - } else { - if (config_maxburst >= 16) - psize = STEDMA40_PSIZE_PHY_16; - else if (config_maxburst >= 8) - psize = STEDMA40_PSIZE_PHY_8; - else if (config_maxburst >= 4) - psize = STEDMA40_PSIZE_PHY_4; - else if (config_maxburst >= 2) - psize = STEDMA40_PSIZE_PHY_2; - else - psize = STEDMA40_PSIZE_PHY_1; - } + ret = dma40_config_to_halfchannel(d40c, &cfg->src_info, + src_addr_width, + src_maxburst); + if (ret) + return ret; - /* Set up all the endpoint configs */ - cfg->src_info.data_width = addr_width; - cfg->src_info.psize = psize; - cfg->src_info.big_endian = false; - cfg->src_info.flow_ctrl = STEDMA40_NO_FLOW_CTRL; - cfg->dst_info.data_width = addr_width; - cfg->dst_info.psize = psize; - cfg->dst_info.big_endian = false; - cfg->dst_info.flow_ctrl = STEDMA40_NO_FLOW_CTRL; + ret = dma40_config_to_halfchannel(d40c, &cfg->dst_info, + dst_addr_width, + dst_maxburst); + if (ret) + return ret; /* Fill in register values */ if (chan_is_logical(d40c)) @@ -2282,12 +2338,14 @@ static void d40_set_runtime_config(struct dma_chan *chan, d40c->runtime_addr = config_addr; d40c->runtime_direction = config->direction; dev_dbg(d40c->base->dev, - "configured channel %s for %s, data width %d, " - "maxburst %d bytes, LE, no flow control\n", + "configured channel %s for %s, data width %d/%d, " + "maxburst %d/%d elements, LE, no flow control\n", dma_chan_name(chan), (config->direction == DMA_FROM_DEVICE) ? "RX" : "TX", - config_addr_width, - config_maxburst); + src_addr_width, dst_addr_width, + src_maxburst, dst_maxburst); + + return 0; } static int d40_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, @@ -2308,9 +2366,8 @@ static int d40_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, case DMA_RESUME: return d40_resume(d40c); case DMA_SLAVE_CONFIG: - d40_set_runtime_config(chan, + return d40_set_runtime_config(chan, (struct dma_slave_config *) arg); - return 0; default: break; } @@ -2341,6 +2398,7 @@ static void __init d40_chan_init(struct d40_base *base, struct dma_device *dma, INIT_LIST_HEAD(&d40c->active); INIT_LIST_HEAD(&d40c->queue); + INIT_LIST_HEAD(&d40c->pending_queue); INIT_LIST_HEAD(&d40c->client); tasklet_init(&d40c->tasklet, dma_tasklet, @@ -2502,25 +2560,6 @@ static int __init d40_phy_res_init(struct d40_base *base) static struct d40_base * __init d40_hw_detect_init(struct platform_device *pdev) { - static const struct d40_reg_val dma_id_regs[] = { - /* Peripheral Id */ - { .reg = D40_DREG_PERIPHID0, .val = 0x0040}, - { .reg = D40_DREG_PERIPHID1, .val = 0x0000}, - /* - * D40_DREG_PERIPHID2 Depends on HW revision: - * DB8500ed has 0x0008, - * ? has 0x0018, - * DB8500v1 has 0x0028 - * DB8500v2 has 0x0038 - */ - { .reg = D40_DREG_PERIPHID3, .val = 0x0000}, - - /* PCell Id */ - { .reg = D40_DREG_CELLID0, .val = 0x000d}, - { .reg = D40_DREG_CELLID1, .val = 0x00f0}, - { .reg = D40_DREG_CELLID2, .val = 0x0005}, - { .reg = D40_DREG_CELLID3, .val = 0x00b1} - }; struct stedma40_platform_data *plat_data; struct clk *clk = NULL; void __iomem *virtbase = NULL; @@ -2529,8 +2568,9 @@ static struct d40_base * __init d40_hw_detect_init(struct platform_device *pdev) int num_log_chans = 0; int num_phy_chans; int i; - u32 val; - u32 rev; + u32 pid; + u32 cid; + u8 rev; clk = clk_get(&pdev->dev, NULL); @@ -2554,32 +2594,32 @@ static struct d40_base * __init d40_hw_detect_init(struct platform_device *pdev) if (!virtbase) goto failure; - /* HW version check */ - for (i = 0; i < ARRAY_SIZE(dma_id_regs); i++) { - if (dma_id_regs[i].val != - readl(virtbase + dma_id_regs[i].reg)) { - d40_err(&pdev->dev, - "Unknown hardware! Expected 0x%x at 0x%x but got 0x%x\n", - dma_id_regs[i].val, - dma_id_regs[i].reg, - readl(virtbase + dma_id_regs[i].reg)); - goto failure; - } - } + /* This is just a regular AMBA PrimeCell ID actually */ + for (pid = 0, i = 0; i < 4; i++) + pid |= (readl(virtbase + resource_size(res) - 0x20 + 4 * i) + & 255) << (i * 8); + for (cid = 0, i = 0; i < 4; i++) + cid |= (readl(virtbase + resource_size(res) - 0x10 + 4 * i) + & 255) << (i * 8); - /* Get silicon revision and designer */ - val = readl(virtbase + D40_DREG_PERIPHID2); - - if ((val & D40_DREG_PERIPHID2_DESIGNER_MASK) != - D40_HW_DESIGNER) { + if (cid != AMBA_CID) { + d40_err(&pdev->dev, "Unknown hardware! No PrimeCell ID\n"); + goto failure; + } + if (AMBA_MANF_BITS(pid) != AMBA_VENDOR_ST) { d40_err(&pdev->dev, "Unknown designer! Got %x wanted %x\n", - val & D40_DREG_PERIPHID2_DESIGNER_MASK, - D40_HW_DESIGNER); + AMBA_MANF_BITS(pid), + AMBA_VENDOR_ST); goto failure; } - - rev = (val & D40_DREG_PERIPHID2_REV_MASK) >> - D40_DREG_PERIPHID2_REV_POS; + /* + * HW revision: + * DB8500ed has revision 0 + * ? has revision 1 + * DB8500v1 has revision 2 + * DB8500v2 has revision 3 + */ + rev = AMBA_REV_BITS(pid); /* The number of physical channels on this HW */ num_phy_chans = 4 * (readl(virtbase + D40_DREG_ICFG) & 0x7) + 4; diff --git a/drivers/dma/ste_dma40_ll.h b/drivers/dma/ste_dma40_ll.h index 195ee65ee7f3..b44c455158de 100644 --- a/drivers/dma/ste_dma40_ll.h +++ b/drivers/dma/ste_dma40_ll.h @@ -184,9 +184,6 @@ #define D40_DREG_PERIPHID0 0xFE0 #define D40_DREG_PERIPHID1 0xFE4 #define D40_DREG_PERIPHID2 0xFE8 -#define D40_DREG_PERIPHID2_REV_POS 4 -#define D40_DREG_PERIPHID2_REV_MASK (0xf << D40_DREG_PERIPHID2_REV_POS) -#define D40_DREG_PERIPHID2_DESIGNER_MASK 0xf #define D40_DREG_PERIPHID3 0xFEC #define D40_DREG_CELLID0 0xFF0 #define D40_DREG_CELLID1 0xFF4 diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index 5f29aafd4462..eacb05e6cfb3 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -78,6 +78,7 @@ #include <linux/kobject.h> #include <linux/device.h> #include <linux/slab.h> +#include <linux/pstore.h> #include <asm/uaccess.h> @@ -89,6 +90,8 @@ MODULE_DESCRIPTION("sysfs interface to EFI Variables"); MODULE_LICENSE("GPL"); MODULE_VERSION(EFIVARS_VERSION); +#define DUMP_NAME_LEN 52 + /* * The maximum size of VariableName + Data = 1024 * Therefore, it's reasonable to save that much @@ -119,6 +122,10 @@ struct efivar_attribute { ssize_t (*store)(struct efivar_entry *entry, const char *buf, size_t count); }; +#define PSTORE_EFI_ATTRIBUTES \ + (EFI_VARIABLE_NON_VOLATILE | \ + EFI_VARIABLE_BOOTSERVICE_ACCESS | \ + EFI_VARIABLE_RUNTIME_ACCESS) #define EFIVAR_ATTR(_name, _mode, _show, _store) \ struct efivar_attribute efivar_attr_##_name = { \ @@ -141,38 +148,72 @@ efivar_create_sysfs_entry(struct efivars *efivars, /* Return the number of unicode characters in data */ static unsigned long -utf8_strlen(efi_char16_t *data, unsigned long maxlength) +utf16_strnlen(efi_char16_t *s, size_t maxlength) { unsigned long length = 0; - while (*data++ != 0 && length < maxlength) + while (*s++ != 0 && length < maxlength) length++; return length; } +static unsigned long +utf16_strlen(efi_char16_t *s) +{ + return utf16_strnlen(s, ~0UL); +} + /* * Return the number of bytes is the length of this string * Note: this is NOT the same as the number of unicode characters */ static inline unsigned long -utf8_strsize(efi_char16_t *data, unsigned long maxlength) +utf16_strsize(efi_char16_t *data, unsigned long maxlength) { - return utf8_strlen(data, maxlength/sizeof(efi_char16_t)) * sizeof(efi_char16_t); + return utf16_strnlen(data, maxlength/sizeof(efi_char16_t)) * sizeof(efi_char16_t); +} + +static inline int +utf16_strncmp(const efi_char16_t *a, const efi_char16_t *b, size_t len) +{ + while (1) { + if (len == 0) + return 0; + if (*a < *b) + return -1; + if (*a > *b) + return 1; + if (*a == 0) /* implies *b == 0 */ + return 0; + a++; + b++; + len--; + } } static efi_status_t -get_var_data(struct efivars *efivars, struct efi_variable *var) +get_var_data_locked(struct efivars *efivars, struct efi_variable *var) { efi_status_t status; - spin_lock(&efivars->lock); var->DataSize = 1024; status = efivars->ops->get_variable(var->VariableName, &var->VendorGuid, &var->Attributes, &var->DataSize, var->Data); + return status; +} + +static efi_status_t +get_var_data(struct efivars *efivars, struct efi_variable *var) +{ + efi_status_t status; + + spin_lock(&efivars->lock); + status = get_var_data_locked(efivars, var); spin_unlock(&efivars->lock); + if (status != EFI_SUCCESS) { printk(KERN_WARNING "efivars: get_variable() failed 0x%lx!\n", status); @@ -387,12 +428,180 @@ static struct kobj_type efivar_ktype = { .default_attrs = def_attrs, }; +static struct pstore_info efi_pstore_info; + static inline void efivar_unregister(struct efivar_entry *var) { kobject_put(&var->kobj); } +#ifdef CONFIG_PSTORE + +static int efi_pstore_open(struct pstore_info *psi) +{ + struct efivars *efivars = psi->data; + + spin_lock(&efivars->lock); + efivars->walk_entry = list_first_entry(&efivars->list, + struct efivar_entry, list); + return 0; +} + +static int efi_pstore_close(struct pstore_info *psi) +{ + struct efivars *efivars = psi->data; + + spin_unlock(&efivars->lock); + return 0; +} + +static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type, + struct timespec *timespec, struct pstore_info *psi) +{ + efi_guid_t vendor = LINUX_EFI_CRASH_GUID; + struct efivars *efivars = psi->data; + char name[DUMP_NAME_LEN]; + int i; + unsigned int part, size; + unsigned long time; + + while (&efivars->walk_entry->list != &efivars->list) { + if (!efi_guidcmp(efivars->walk_entry->var.VendorGuid, + vendor)) { + for (i = 0; i < DUMP_NAME_LEN; i++) { + name[i] = efivars->walk_entry->var.VariableName[i]; + } + if (sscanf(name, "dump-type%u-%u-%lu", type, &part, &time) == 3) { + *id = part; + timespec->tv_sec = time; + timespec->tv_nsec = 0; + get_var_data_locked(efivars, &efivars->walk_entry->var); + size = efivars->walk_entry->var.DataSize; + memcpy(psi->buf, efivars->walk_entry->var.Data, size); + efivars->walk_entry = list_entry(efivars->walk_entry->list.next, + struct efivar_entry, list); + return size; + } + } + efivars->walk_entry = list_entry(efivars->walk_entry->list.next, + struct efivar_entry, list); + } + return 0; +} + +static u64 efi_pstore_write(enum pstore_type_id type, unsigned int part, + size_t size, struct pstore_info *psi) +{ + char name[DUMP_NAME_LEN]; + char stub_name[DUMP_NAME_LEN]; + efi_char16_t efi_name[DUMP_NAME_LEN]; + efi_guid_t vendor = LINUX_EFI_CRASH_GUID; + struct efivars *efivars = psi->data; + struct efivar_entry *entry, *found = NULL; + int i; + + sprintf(stub_name, "dump-type%u-%u-", type, part); + sprintf(name, "%s%lu", stub_name, get_seconds()); + + spin_lock(&efivars->lock); + + for (i = 0; i < DUMP_NAME_LEN; i++) + efi_name[i] = stub_name[i]; + + /* + * Clean up any entries with the same name + */ + + list_for_each_entry(entry, &efivars->list, list) { + get_var_data_locked(efivars, &entry->var); + + if (efi_guidcmp(entry->var.VendorGuid, vendor)) + continue; + if (utf16_strncmp(entry->var.VariableName, efi_name, + utf16_strlen(efi_name))) + continue; + /* Needs to be a prefix */ + if (entry->var.VariableName[utf16_strlen(efi_name)] == 0) + continue; + + /* found */ + found = entry; + efivars->ops->set_variable(entry->var.VariableName, + &entry->var.VendorGuid, + PSTORE_EFI_ATTRIBUTES, + 0, NULL); + } + + if (found) + list_del(&found->list); + + for (i = 0; i < DUMP_NAME_LEN; i++) + efi_name[i] = name[i]; + + efivars->ops->set_variable(efi_name, &vendor, PSTORE_EFI_ATTRIBUTES, + size, psi->buf); + + spin_unlock(&efivars->lock); + + if (found) + efivar_unregister(found); + + if (size) + efivar_create_sysfs_entry(efivars, + utf16_strsize(efi_name, + DUMP_NAME_LEN * 2), + efi_name, &vendor); + + return part; +}; + +static int efi_pstore_erase(enum pstore_type_id type, u64 id, + struct pstore_info *psi) +{ + efi_pstore_write(type, id, 0, psi); + + return 0; +} +#else +static int efi_pstore_open(struct pstore_info *psi) +{ + return 0; +} + +static int efi_pstore_close(struct pstore_info *psi) +{ + return 0; +} + +static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type, + struct timespec *time, struct pstore_info *psi) +{ + return -1; +} + +static u64 efi_pstore_write(enum pstore_type_id type, int part, size_t size, + struct pstore_info *psi) +{ + return 0; +} + +static int efi_pstore_erase(enum pstore_type_id type, u64 id, + struct pstore_info *psi) +{ + return 0; +} +#endif + +static struct pstore_info efi_pstore_info = { + .owner = THIS_MODULE, + .name = "efi", + .open = efi_pstore_open, + .close = efi_pstore_close, + .read = efi_pstore_read, + .write = efi_pstore_write, + .erase = efi_pstore_erase, +}; static ssize_t efivar_create(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, @@ -414,8 +623,8 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj, * Does this variable already exist? */ list_for_each_entry_safe(search_efivar, n, &efivars->list, list) { - strsize1 = utf8_strsize(search_efivar->var.VariableName, 1024); - strsize2 = utf8_strsize(new_var->VariableName, 1024); + strsize1 = utf16_strsize(search_efivar->var.VariableName, 1024); + strsize2 = utf16_strsize(new_var->VariableName, 1024); if (strsize1 == strsize2 && !memcmp(&(search_efivar->var.VariableName), new_var->VariableName, strsize1) && @@ -447,8 +656,8 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj, /* Create the entry in sysfs. Locking is not required here */ status = efivar_create_sysfs_entry(efivars, - utf8_strsize(new_var->VariableName, - 1024), + utf16_strsize(new_var->VariableName, + 1024), new_var->VariableName, &new_var->VendorGuid); if (status) { @@ -477,8 +686,8 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj, * Does this variable already exist? */ list_for_each_entry_safe(search_efivar, n, &efivars->list, list) { - strsize1 = utf8_strsize(search_efivar->var.VariableName, 1024); - strsize2 = utf8_strsize(del_var->VariableName, 1024); + strsize1 = utf16_strsize(search_efivar->var.VariableName, 1024); + strsize2 = utf16_strsize(del_var->VariableName, 1024); if (strsize1 == strsize2 && !memcmp(&(search_efivar->var.VariableName), del_var->VariableName, strsize1) && @@ -763,6 +972,16 @@ int register_efivars(struct efivars *efivars, if (error) unregister_efivars(efivars); + efivars->efi_pstore_info = efi_pstore_info; + + efivars->efi_pstore_info.buf = kmalloc(4096, GFP_KERNEL); + if (efivars->efi_pstore_info.buf) { + efivars->efi_pstore_info.bufsize = 1024; + efivars->efi_pstore_info.data = efivars; + mutex_init(&efivars->efi_pstore_info.buf_mutex); + pstore_register(&efivars->efi_pstore_info); + } + out: kfree(variable_name); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 8420129fc5ee..f75a66e7d312 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -241,12 +241,13 @@ config DM_MIRROR needed for live data migration tools such as 'pvmove'. config DM_RAID - tristate "RAID 4/5/6 target (EXPERIMENTAL)" + tristate "RAID 1/4/5/6 target (EXPERIMENTAL)" depends on BLK_DEV_DM && EXPERIMENTAL + select MD_RAID1 select MD_RAID456 select BLK_DEV_MD ---help--- - A dm target that supports RAID4, RAID5 and RAID6 mappings + A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings A RAID-5 set of N drives with a capacity of C MB per drive provides the capacity of C * (N - 1) MB, and protects against a failure diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index bae6c4e23d3f..49da55c1528a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -30,7 +30,6 @@ #include <linux/device-mapper.h> #define DM_MSG_PREFIX "crypt" -#define MESG_STR(x) x, sizeof(x) /* * context holding the current state of a multi-part conversion @@ -239,7 +238,7 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); + *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); return 0; } @@ -248,7 +247,7 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); return 0; } @@ -415,7 +414,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); crypto_cipher_encrypt_one(essiv_tfm, iv, iv); return 0; @@ -1575,11 +1574,17 @@ bad_mem: static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct crypt_config *cc; - unsigned int key_size; + unsigned int key_size, opt_params; unsigned long long tmpll; int ret; + struct dm_arg_set as; + const char *opt_string; + + static struct dm_arg _args[] = { + {0, 1, "Invalid number of feature args"}, + }; - if (argc != 5) { + if (argc < 5) { ti->error = "Not enough arguments"; return -EINVAL; } @@ -1648,6 +1653,30 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->start = tmpll; + argv += 5; + argc -= 5; + + /* Optional parameters */ + if (argc) { + as.argc = argc; + as.argv = argv; + + ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (ret) + goto bad; + + opt_string = dm_shift_arg(&as); + + if (opt_params == 1 && opt_string && + !strcasecmp(opt_string, "allow_discards")) + ti->num_discard_requests = 1; + else if (opt_params) { + ret = -EINVAL; + ti->error = "Invalid feature arguments"; + goto bad; + } + } + ret = -ENOMEM; cc->io_queue = alloc_workqueue("kcryptd_io", WQ_NON_REENTRANT| @@ -1682,9 +1711,16 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, struct dm_crypt_io *io; struct crypt_config *cc; - if (bio->bi_rw & REQ_FLUSH) { + /* + * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues. + * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight + * - for REQ_DISCARD caller must use flush if IO ordering matters + */ + if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { cc = ti->private; bio->bi_bdev = cc->dev->bdev; + if (bio_sectors(bio)) + bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); return DM_MAPIO_REMAPPED; } @@ -1727,6 +1763,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, cc->dev->name, (unsigned long long)cc->start); + + if (ti->num_discard_requests) + DMEMIT(" 1 allow_discards"); + break; } return 0; @@ -1770,12 +1810,12 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) if (argc < 2) goto error; - if (!strnicmp(argv[0], MESG_STR("key"))) { + if (!strcasecmp(argv[0], "key")) { if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) { DMWARN("not suspended during key manipulation."); return -EINVAL; } - if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) { + if (argc == 3 && !strcasecmp(argv[1], "set")) { ret = crypt_set_key(cc, argv[2]); if (ret) return ret; @@ -1783,7 +1823,7 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) ret = cc->iv_gen_ops->init(cc); return ret; } - if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) { + if (argc == 2 && !strcasecmp(argv[1], "wipe")) { if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { ret = cc->iv_gen_ops->wipe(cc); if (ret) @@ -1823,7 +1863,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 10, 0}, + .version = {1, 11, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index ea790623c30b..89f73ca22cfa 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2003 Sistina Software (UK) Limited. - * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -15,6 +15,9 @@ #define DM_MSG_PREFIX "flakey" +#define all_corrupt_bio_flags_match(bio, fc) \ + (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) + /* * Flakey: Used for testing only, simulates intermittent, * catastrophic device failure. @@ -25,60 +28,189 @@ struct flakey_c { sector_t start; unsigned up_interval; unsigned down_interval; + unsigned long flags; + unsigned corrupt_bio_byte; + unsigned corrupt_bio_rw; + unsigned corrupt_bio_value; + unsigned corrupt_bio_flags; +}; + +enum feature_flag_bits { + DROP_WRITES }; +static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, + struct dm_target *ti) +{ + int r; + unsigned argc; + const char *arg_name; + + static struct dm_arg _args[] = { + {0, 6, "Invalid number of feature args"}, + {1, UINT_MAX, "Invalid corrupt bio byte"}, + {0, 255, "Invalid corrupt value to write into bio byte (0-255)"}, + {0, UINT_MAX, "Invalid corrupt bio flags mask"}, + }; + + /* No feature arguments supplied. */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return r; + + while (argc) { + arg_name = dm_shift_arg(as); + argc--; + + /* + * drop_writes + */ + if (!strcasecmp(arg_name, "drop_writes")) { + if (test_and_set_bit(DROP_WRITES, &fc->flags)) { + ti->error = "Feature drop_writes duplicated"; + return -EINVAL; + } + + continue; + } + + /* + * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags> + */ + if (!strcasecmp(arg_name, "corrupt_bio_byte")) { + if (!argc) + ti->error = "Feature corrupt_bio_byte requires parameters"; + + r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error); + if (r) + return r; + argc--; + + /* + * Direction r or w? + */ + arg_name = dm_shift_arg(as); + if (!strcasecmp(arg_name, "w")) + fc->corrupt_bio_rw = WRITE; + else if (!strcasecmp(arg_name, "r")) + fc->corrupt_bio_rw = READ; + else { + ti->error = "Invalid corrupt bio direction (r or w)"; + return -EINVAL; + } + argc--; + + /* + * Value of byte (0-255) to write in place of correct one. + */ + r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error); + if (r) + return r; + argc--; + + /* + * Only corrupt bios with these flags set. + */ + r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error); + if (r) + return r; + argc--; + + continue; + } + + ti->error = "Unrecognised flakey feature requested"; + return -EINVAL; + } + + if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { + ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + return -EINVAL; + } + + return 0; +} + /* - * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval> + * Construct a flakey mapping: + * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*] + * + * Feature args: + * [drop_writes] + * [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>] + * + * Nth_byte starts from 1 for the first byte. + * Direction is r for READ or w for WRITE. + * bio_flags is ignored if 0. */ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) { + static struct dm_arg _args[] = { + {0, UINT_MAX, "Invalid up interval"}, + {0, UINT_MAX, "Invalid down interval"}, + }; + + int r; struct flakey_c *fc; - unsigned long long tmp; + unsigned long long tmpll; + struct dm_arg_set as; + const char *devname; - if (argc != 4) { - ti->error = "dm-flakey: Invalid argument count"; + as.argc = argc; + as.argv = argv; + + if (argc < 4) { + ti->error = "Invalid argument count"; return -EINVAL; } - fc = kmalloc(sizeof(*fc), GFP_KERNEL); + fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (!fc) { - ti->error = "dm-flakey: Cannot allocate linear context"; + ti->error = "Cannot allocate linear context"; return -ENOMEM; } fc->start_time = jiffies; - if (sscanf(argv[1], "%llu", &tmp) != 1) { - ti->error = "dm-flakey: Invalid device sector"; + devname = dm_shift_arg(&as); + + if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) { + ti->error = "Invalid device sector"; goto bad; } - fc->start = tmp; + fc->start = tmpll; - if (sscanf(argv[2], "%u", &fc->up_interval) != 1) { - ti->error = "dm-flakey: Invalid up interval"; + r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error); + if (r) goto bad; - } - if (sscanf(argv[3], "%u", &fc->down_interval) != 1) { - ti->error = "dm-flakey: Invalid down interval"; + r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error); + if (r) goto bad; - } if (!(fc->up_interval + fc->down_interval)) { - ti->error = "dm-flakey: Total (up + down) interval is zero"; + ti->error = "Total (up + down) interval is zero"; goto bad; } if (fc->up_interval + fc->down_interval < fc->up_interval) { - ti->error = "dm-flakey: Interval overflow"; + ti->error = "Interval overflow"; goto bad; } - if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) { - ti->error = "dm-flakey: Device lookup failed"; + r = parse_features(&as, fc, ti); + if (r) + goto bad; + + if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) { + ti->error = "Device lookup failed"; goto bad; } ti->num_flush_requests = 1; + ti->num_discard_requests = 1; ti->private = fc; return 0; @@ -99,7 +231,7 @@ static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector) { struct flakey_c *fc = ti->private; - return fc->start + (bi_sector - ti->begin); + return fc->start + dm_target_offset(ti, bi_sector); } static void flakey_map_bio(struct dm_target *ti, struct bio *bio) @@ -111,6 +243,25 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); } +static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) +{ + unsigned bio_bytes = bio_cur_bytes(bio); + char *data = bio_data(bio); + + /* + * Overwrite the Nth byte of the data returned. + */ + if (data && bio_bytes >= fc->corrupt_bio_byte) { + data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value; + + DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " + "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n", + bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, + (bio_data_dir(bio) == WRITE) ? 'w' : 'r', + bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes); + } +} + static int flakey_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { @@ -119,18 +270,71 @@ static int flakey_map(struct dm_target *ti, struct bio *bio, /* Are we alive ? */ elapsed = (jiffies - fc->start_time) / HZ; - if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) + if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { + /* + * Flag this bio as submitted while down. + */ + map_context->ll = 1; + + /* + * Map reads as normal. + */ + if (bio_data_dir(bio) == READ) + goto map_bio; + + /* + * Drop writes? + */ + if (test_bit(DROP_WRITES, &fc->flags)) { + bio_endio(bio, 0); + return DM_MAPIO_SUBMITTED; + } + + /* + * Corrupt matching writes. + */ + if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) { + if (all_corrupt_bio_flags_match(bio, fc)) + corrupt_bio_data(bio, fc); + goto map_bio; + } + + /* + * By default, error all I/O. + */ return -EIO; + } +map_bio: flakey_map_bio(ti, bio); return DM_MAPIO_REMAPPED; } +static int flakey_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + struct flakey_c *fc = ti->private; + unsigned bio_submitted_while_down = map_context->ll; + + /* + * Corrupt successful READs while in down state. + * If flags were specified, only corrupt those that match. + */ + if (!error && bio_submitted_while_down && + (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && + all_corrupt_bio_flags_match(bio, fc)) + corrupt_bio_data(bio, fc); + + return error; +} + static int flakey_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { + unsigned sz = 0; struct flakey_c *fc = ti->private; + unsigned drop_writes; switch (type) { case STATUSTYPE_INFO: @@ -138,9 +342,22 @@ static int flakey_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name, - (unsigned long long)fc->start, fc->up_interval, - fc->down_interval); + DMEMIT("%s %llu %u %u ", fc->dev->name, + (unsigned long long)fc->start, fc->up_interval, + fc->down_interval); + + drop_writes = test_bit(DROP_WRITES, &fc->flags); + DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5); + + if (drop_writes) + DMEMIT("drop_writes "); + + if (fc->corrupt_bio_byte) + DMEMIT("corrupt_bio_byte %u %c %u %u ", + fc->corrupt_bio_byte, + (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r', + fc->corrupt_bio_value, fc->corrupt_bio_flags); + break; } return 0; @@ -177,11 +394,12 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_ static struct target_type flakey_target = { .name = "flakey", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = flakey_ctr, .dtr = flakey_dtr, .map = flakey_map, + .end_io = flakey_end_io, .status = flakey_status, .ioctl = flakey_ioctl, .merge = flakey_merge, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 2067288f61f9..ad2eba40e319 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -38,6 +38,8 @@ struct io { struct dm_io_client *client; io_notify_fn callback; void *context; + void *vma_invalidate_address; + unsigned long vma_invalidate_size; } __attribute__((aligned(DM_IO_MAX_REGIONS))); static struct kmem_cache *_dm_io_cache; @@ -116,6 +118,10 @@ static void dec_count(struct io *io, unsigned int region, int error) set_bit(region, &io->error_bits); if (atomic_dec_and_test(&io->count)) { + if (io->vma_invalidate_size) + invalidate_kernel_vmap_range(io->vma_invalidate_address, + io->vma_invalidate_size); + if (io->sleeper) wake_up_process(io->sleeper); @@ -159,6 +165,9 @@ struct dpages { unsigned context_u; void *context_ptr; + + void *vma_invalidate_address; + unsigned long vma_invalidate_size; }; /* @@ -377,6 +386,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, io->sleeper = current; io->client = client; + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; + dispatch_io(rw, num_regions, where, dp, io, 1); while (1) { @@ -415,13 +427,21 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io->callback = fn; io->context = context; + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; + dispatch_io(rw, num_regions, where, dp, io, 0); return 0; } -static int dp_init(struct dm_io_request *io_req, struct dpages *dp) +static int dp_init(struct dm_io_request *io_req, struct dpages *dp, + unsigned long size) { /* Set up dpages based on memory type */ + + dp->vma_invalidate_address = NULL; + dp->vma_invalidate_size = 0; + switch (io_req->mem.type) { case DM_IO_PAGE_LIST: list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); @@ -432,6 +452,11 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp) break; case DM_IO_VMA: + flush_kernel_vmap_range(io_req->mem.ptr.vma, size); + if ((io_req->bi_rw & RW_MASK) == READ) { + dp->vma_invalidate_address = io_req->mem.ptr.vma; + dp->vma_invalidate_size = size; + } vm_dp_init(dp, io_req->mem.ptr.vma); break; @@ -460,7 +485,7 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions, int r; struct dpages dp; - r = dp_init(io_req, &dp); + r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT); if (r) return r; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4cacdad2270a..2e9a3ca37bdd 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -128,6 +128,24 @@ static struct hash_cell *__get_uuid_cell(const char *str) return NULL; } +static struct hash_cell *__get_dev_cell(uint64_t dev) +{ + struct mapped_device *md; + struct hash_cell *hc; + + md = dm_get_md(huge_decode_dev(dev)); + if (!md) + return NULL; + + hc = dm_get_mdptr(md); + if (!hc) { + dm_put(md); + return NULL; + } + + return hc; +} + /*----------------------------------------------------------------- * Inserting, removing and renaming a device. *---------------------------------------------------------------*/ @@ -718,25 +736,45 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) */ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) { - struct mapped_device *md; - void *mdptr = NULL; + struct hash_cell *hc = NULL; - if (*param->uuid) - return __get_uuid_cell(param->uuid); + if (*param->uuid) { + if (*param->name || param->dev) + return NULL; - if (*param->name) - return __get_name_cell(param->name); + hc = __get_uuid_cell(param->uuid); + if (!hc) + return NULL; + } else if (*param->name) { + if (param->dev) + return NULL; - md = dm_get_md(huge_decode_dev(param->dev)); - if (!md) - goto out; + hc = __get_name_cell(param->name); + if (!hc) + return NULL; + } else if (param->dev) { + hc = __get_dev_cell(param->dev); + if (!hc) + return NULL; + } else + return NULL; - mdptr = dm_get_mdptr(md); - if (!mdptr) - dm_put(md); + /* + * Sneakily write in both the name and the uuid + * while we have the cell. + */ + strlcpy(param->name, hc->name, sizeof(param->name)); + if (hc->uuid) + strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); + else + param->uuid[0] = '\0'; -out: - return mdptr; + if (hc->new_map) + param->flags |= DM_INACTIVE_PRESENT_FLAG; + else + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + return hc; } static struct mapped_device *find_device(struct dm_ioctl *param) @@ -746,24 +784,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param) down_read(&_hash_lock); hc = __find_device_hash_cell(param); - if (hc) { + if (hc) md = hc->md; - - /* - * Sneakily write in both the name and the uuid - * while we have the cell. - */ - strlcpy(param->name, hc->name, sizeof(param->name)); - if (hc->uuid) - strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); - else - param->uuid[0] = '\0'; - - if (hc->new_map) - param->flags |= DM_INACTIVE_PRESENT_FLAG; - else - param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - } up_read(&_hash_lock); return md; @@ -1402,6 +1424,11 @@ static int target_message(struct dm_ioctl *param, size_t param_size) goto out; } + if (!argc) { + DMWARN("Empty message received."); + goto out; + } + table = dm_get_live_table(md); if (!table) goto out_argv; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 320401dec104..f82147029636 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -224,8 +224,6 @@ struct kcopyd_job { unsigned int num_dests; struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS]; - sector_t offset; - unsigned int nr_pages; struct page_list *pages; /* @@ -380,7 +378,7 @@ static int run_io_job(struct kcopyd_job *job) .bi_rw = job->rw, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, - .mem.offset = job->offset, + .mem.offset = 0, .notify.fn = complete_io, .notify.context = job, .client = job->kc->io_client, @@ -397,10 +395,9 @@ static int run_io_job(struct kcopyd_job *job) static int run_pages_job(struct kcopyd_job *job) { int r; + unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9); - job->nr_pages = dm_div_up(job->dests[0].count + job->offset, - PAGE_SIZE >> 9); - r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages); + r = kcopyd_get_pages(job->kc, nr_pages, &job->pages); if (!r) { /* this job is ready for io */ push(&job->kc->io_jobs, job); @@ -602,8 +599,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, job->num_dests = num_dests; memcpy(&job->dests, dests, sizeof(*dests) * num_dests); - job->offset = 0; - job->nr_pages = 0; job->pages = NULL; job->fn = fn; @@ -622,6 +617,37 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, } EXPORT_SYMBOL(dm_kcopyd_copy); +void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, + dm_kcopyd_notify_fn fn, void *context) +{ + struct kcopyd_job *job; + + job = mempool_alloc(kc->job_pool, GFP_NOIO); + + memset(job, 0, sizeof(struct kcopyd_job)); + job->kc = kc; + job->fn = fn; + job->context = context; + + atomic_inc(&kc->nr_jobs); + + return job; +} +EXPORT_SYMBOL(dm_kcopyd_prepare_callback); + +void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) +{ + struct kcopyd_job *job = j; + struct dm_kcopyd_client *kc = job->kc; + + job->read_err = read_err; + job->write_err = write_err; + + push(&kc->complete_jobs, job); + wake(kc); +} +EXPORT_SYMBOL(dm_kcopyd_do_callback); + /* * Cancels a kcopyd job, eg. someone might be deactivating a * mirror. diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index aa2e0c374ab3..1021c8986011 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -394,8 +394,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list) group[count] = fe->region; count++; - list_del(&fe->list); - list_add(&fe->list, &tmp_list); + list_move(&fe->list, &tmp_list); type = fe->type; if (count >= MAX_FLUSH_GROUP_COUNT) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 948e3f4925bf..3b52bb72bd1f 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -197,15 +197,21 @@ EXPORT_SYMBOL(dm_dirty_log_destroy); #define MIRROR_DISK_VERSION 2 #define LOG_OFFSET 2 -struct log_header { - uint32_t magic; +struct log_header_disk { + __le32 magic; /* * Simple, incrementing version. no backward * compatibility. */ + __le32 version; + __le64 nr_regions; +} __packed; + +struct log_header_core { + uint32_t magic; uint32_t version; - sector_t nr_regions; + uint64_t nr_regions; }; struct log_c { @@ -239,10 +245,10 @@ struct log_c { int log_dev_failed; int log_dev_flush_failed; struct dm_dev *log_dev; - struct log_header header; + struct log_header_core header; struct dm_io_region header_location; - struct log_header *disk_header; + struct log_header_disk *disk_header; }; /* @@ -251,34 +257,34 @@ struct log_c { */ static inline int log_test_bit(uint32_t *bs, unsigned bit) { - return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0; + return test_bit_le(bit, bs) ? 1 : 0; } static inline void log_set_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - __test_and_set_bit_le(bit, (unsigned long *) bs); + __set_bit_le(bit, bs); l->touched_cleaned = 1; } static inline void log_clear_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - __test_and_clear_bit_le(bit, (unsigned long *) bs); + __clear_bit_le(bit, bs); l->touched_dirtied = 1; } /*---------------------------------------------------------------- * Header IO *--------------------------------------------------------------*/ -static void header_to_disk(struct log_header *core, struct log_header *disk) +static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk) { disk->magic = cpu_to_le32(core->magic); disk->version = cpu_to_le32(core->version); disk->nr_regions = cpu_to_le64(core->nr_regions); } -static void header_from_disk(struct log_header *core, struct log_header *disk) +static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk) { core->magic = le32_to_cpu(disk->magic); core->version = le32_to_cpu(disk->version); @@ -486,7 +492,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); lc->sync_count = (sync == NOSYNC) ? region_count : 0; - lc->recovering_bits = vmalloc(bitset_size); + lc->recovering_bits = vzalloc(bitset_size); if (!lc->recovering_bits) { DMWARN("couldn't allocate sync bitset"); vfree(lc->sync_bits); @@ -498,7 +504,6 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, kfree(lc); return -ENOMEM; } - memset(lc->recovering_bits, 0, bitset_size); lc->sync_search = 0; log->context = lc; @@ -739,8 +744,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) return 0; do { - *region = find_next_zero_bit_le( - (unsigned long *) lc->sync_bits, + *region = find_next_zero_bit_le(lc->sync_bits, lc->region_count, lc->sync_search); lc->sync_search = *region + 1; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index c3547016f0f1..5e0090ef4182 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -22,7 +22,6 @@ #include <linux/atomic.h> #define DM_MSG_PREFIX "multipath" -#define MESG_STR(x) x, sizeof(x) #define DM_PG_INIT_DELAY_MSECS 2000 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) @@ -505,80 +504,29 @@ static void trigger_event(struct work_struct *work) * <#paths> <#per-path selector args> * [<path> [<arg>]* ]+ ]+ *---------------------------------------------------------------*/ -struct param { - unsigned min; - unsigned max; - char *error; -}; - -static int read_param(struct param *param, char *str, unsigned *v, char **error) -{ - if (!str || - (sscanf(str, "%u", v) != 1) || - (*v < param->min) || - (*v > param->max)) { - *error = param->error; - return -EINVAL; - } - - return 0; -} - -struct arg_set { - unsigned argc; - char **argv; -}; - -static char *shift(struct arg_set *as) -{ - char *r; - - if (as->argc) { - as->argc--; - r = *as->argv; - as->argv++; - return r; - } - - return NULL; -} - -static void consume(struct arg_set *as, unsigned n) -{ - BUG_ON (as->argc < n); - as->argc -= n; - as->argv += n; -} - -static int parse_path_selector(struct arg_set *as, struct priority_group *pg, +static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, struct dm_target *ti) { int r; struct path_selector_type *pst; unsigned ps_argc; - static struct param _params[] = { + static struct dm_arg _args[] = { {0, 1024, "invalid number of path selector args"}, }; - pst = dm_get_path_selector(shift(as)); + pst = dm_get_path_selector(dm_shift_arg(as)); if (!pst) { ti->error = "unknown path selector type"; return -EINVAL; } - r = read_param(_params, shift(as), &ps_argc, &ti->error); + r = dm_read_arg_group(_args, as, &ps_argc, &ti->error); if (r) { dm_put_path_selector(pst); return -EINVAL; } - if (ps_argc > as->argc) { - dm_put_path_selector(pst); - ti->error = "not enough arguments for path selector"; - return -EINVAL; - } - r = pst->create(&pg->ps, ps_argc, as->argv); if (r) { dm_put_path_selector(pst); @@ -587,12 +535,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, } pg->ps.type = pst; - consume(as, ps_argc); + dm_consume_args(as, ps_argc); return 0; } -static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, +static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, struct dm_target *ti) { int r; @@ -609,7 +557,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, if (!p) return ERR_PTR(-ENOMEM); - r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table), + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), &p->path.dev); if (r) { ti->error = "error getting device"; @@ -660,16 +608,16 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, return ERR_PTR(r); } -static struct priority_group *parse_priority_group(struct arg_set *as, +static struct priority_group *parse_priority_group(struct dm_arg_set *as, struct multipath *m) { - static struct param _params[] = { + static struct dm_arg _args[] = { {1, 1024, "invalid number of paths"}, {0, 1024, "invalid number of selector args"} }; int r; - unsigned i, nr_selector_args, nr_params; + unsigned i, nr_selector_args, nr_args; struct priority_group *pg; struct dm_target *ti = m->ti; @@ -693,26 +641,26 @@ static struct priority_group *parse_priority_group(struct arg_set *as, /* * read the paths */ - r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); + r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error); if (r) goto bad; - r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); + r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error); if (r) goto bad; - nr_params = 1 + nr_selector_args; + nr_args = 1 + nr_selector_args; for (i = 0; i < pg->nr_pgpaths; i++) { struct pgpath *pgpath; - struct arg_set path_args; + struct dm_arg_set path_args; - if (as->argc < nr_params) { + if (as->argc < nr_args) { ti->error = "not enough path parameters"; r = -EINVAL; goto bad; } - path_args.argc = nr_params; + path_args.argc = nr_args; path_args.argv = as->argv; pgpath = parse_path(&path_args, &pg->ps, ti); @@ -723,7 +671,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as, pgpath->pg = pg; list_add_tail(&pgpath->list, &pg->pgpaths); - consume(as, nr_params); + dm_consume_args(as, nr_args); } return pg; @@ -733,28 +681,23 @@ static struct priority_group *parse_priority_group(struct arg_set *as, return ERR_PTR(r); } -static int parse_hw_handler(struct arg_set *as, struct multipath *m) +static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) { unsigned hw_argc; int ret; struct dm_target *ti = m->ti; - static struct param _params[] = { + static struct dm_arg _args[] = { {0, 1024, "invalid number of hardware handler args"}, }; - if (read_param(_params, shift(as), &hw_argc, &ti->error)) + if (dm_read_arg_group(_args, as, &hw_argc, &ti->error)) return -EINVAL; if (!hw_argc) return 0; - if (hw_argc > as->argc) { - ti->error = "not enough arguments for hardware handler"; - return -EINVAL; - } - - m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); + m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); request_module("scsi_dh_%s", m->hw_handler_name); if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { ti->error = "unknown hardware handler type"; @@ -778,7 +721,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) j = sprintf(p, "%s", as->argv[i]); } - consume(as, hw_argc - 1); + dm_consume_args(as, hw_argc - 1); return 0; fail: @@ -787,20 +730,20 @@ fail: return ret; } -static int parse_features(struct arg_set *as, struct multipath *m) +static int parse_features(struct dm_arg_set *as, struct multipath *m) { int r; unsigned argc; struct dm_target *ti = m->ti; - const char *param_name; + const char *arg_name; - static struct param _params[] = { + static struct dm_arg _args[] = { {0, 5, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, }; - r = read_param(_params, shift(as), &argc, &ti->error); + r = dm_read_arg_group(_args, as, &argc, &ti->error); if (r) return -EINVAL; @@ -808,26 +751,24 @@ static int parse_features(struct arg_set *as, struct multipath *m) return 0; do { - param_name = shift(as); + arg_name = dm_shift_arg(as); argc--; - if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) { + if (!strcasecmp(arg_name, "queue_if_no_path")) { r = queue_if_no_path(m, 1, 0); continue; } - if (!strnicmp(param_name, MESG_STR("pg_init_retries")) && + if (!strcasecmp(arg_name, "pg_init_retries") && (argc >= 1)) { - r = read_param(_params + 1, shift(as), - &m->pg_init_retries, &ti->error); + r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); argc--; continue; } - if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && + if (!strcasecmp(arg_name, "pg_init_delay_msecs") && (argc >= 1)) { - r = read_param(_params + 2, shift(as), - &m->pg_init_delay_msecs, &ti->error); + r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error); argc--; continue; } @@ -842,15 +783,15 @@ static int parse_features(struct arg_set *as, struct multipath *m) static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) { - /* target parameters */ - static struct param _params[] = { + /* target arguments */ + static struct dm_arg _args[] = { {0, 1024, "invalid number of priority groups"}, {0, 1024, "invalid initial priority group number"}, }; int r; struct multipath *m; - struct arg_set as; + struct dm_arg_set as; unsigned pg_count = 0; unsigned next_pg_num; @@ -871,11 +812,11 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, if (r) goto bad; - r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); + r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error); if (r) goto bad; - r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); + r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error); if (r) goto bad; @@ -1505,10 +1446,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) } if (argc == 1) { - if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { + if (!strcasecmp(argv[0], "queue_if_no_path")) { r = queue_if_no_path(m, 1, 0); goto out; - } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) { + } else if (!strcasecmp(argv[0], "fail_if_no_path")) { r = queue_if_no_path(m, 0, 0); goto out; } @@ -1519,18 +1460,18 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) goto out; } - if (!strnicmp(argv[0], MESG_STR("disable_group"))) { + if (!strcasecmp(argv[0], "disable_group")) { r = bypass_pg_num(m, argv[1], 1); goto out; - } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) { + } else if (!strcasecmp(argv[0], "enable_group")) { r = bypass_pg_num(m, argv[1], 0); goto out; - } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) { + } else if (!strcasecmp(argv[0], "switch_group")) { r = switch_pg_num(m, argv[1]); goto out; - } else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) + } else if (!strcasecmp(argv[0], "reinstate_path")) action = reinstate_path; - else if (!strnicmp(argv[0], MESG_STR("fail_path"))) + else if (!strcasecmp(argv[0], "fail_path")) action = fail_path; else { DMWARN("Unrecognised multipath message received."); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e5d8904fc8f6..a002dd85db1e 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -8,19 +8,19 @@ #include <linux/slab.h> #include "md.h" +#include "raid1.h" #include "raid5.h" -#include "dm.h" #include "bitmap.h" +#include <linux/device-mapper.h> + #define DM_MSG_PREFIX "raid" /* - * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then - * make it so the flag doesn't set anything. + * The following flags are used by dm-raid.c to set up the array state. + * They must be cleared before md_run is called. */ -#ifndef MD_SYNC_STATE_FORCED -#define MD_SYNC_STATE_FORCED 0 -#endif +#define FirstUse 10 /* rdev flag */ struct raid_dev { /* @@ -43,14 +43,15 @@ struct raid_dev { /* * Flags for rs->print_flags field. */ -#define DMPF_DAEMON_SLEEP 0x1 -#define DMPF_MAX_WRITE_BEHIND 0x2 -#define DMPF_SYNC 0x4 -#define DMPF_NOSYNC 0x8 -#define DMPF_STRIPE_CACHE 0x10 -#define DMPF_MIN_RECOVERY_RATE 0x20 -#define DMPF_MAX_RECOVERY_RATE 0x40 - +#define DMPF_SYNC 0x1 +#define DMPF_NOSYNC 0x2 +#define DMPF_REBUILD 0x4 +#define DMPF_DAEMON_SLEEP 0x8 +#define DMPF_MIN_RECOVERY_RATE 0x10 +#define DMPF_MAX_RECOVERY_RATE 0x20 +#define DMPF_MAX_WRITE_BEHIND 0x40 +#define DMPF_STRIPE_CACHE 0x80 +#define DMPF_REGION_SIZE 0X100 struct raid_set { struct dm_target *ti; @@ -72,6 +73,7 @@ static struct raid_type { const unsigned level; /* RAID level. */ const unsigned algorithm; /* RAID algorithm. */ } raid_types[] = { + {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, @@ -105,7 +107,8 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra } sectors_per_dev = ti->len; - if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { + if ((raid_type->level > 1) && + sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { ti->error = "Target length not divisible by number of data devices"; return ERR_PTR(-EINVAL); } @@ -147,9 +150,16 @@ static void context_free(struct raid_set *rs) { int i; - for (i = 0; i < rs->md.raid_disks; i++) + for (i = 0; i < rs->md.raid_disks; i++) { + if (rs->dev[i].meta_dev) + dm_put_device(rs->ti, rs->dev[i].meta_dev); + if (rs->dev[i].rdev.sb_page) + put_page(rs->dev[i].rdev.sb_page); + rs->dev[i].rdev.sb_page = NULL; + rs->dev[i].rdev.sb_loaded = 0; if (rs->dev[i].data_dev) dm_put_device(rs->ti, rs->dev[i].data_dev); + } kfree(rs); } @@ -159,7 +169,16 @@ static void context_free(struct raid_set *rs) * <meta_dev>: meta device name or '-' if missing * <data_dev>: data device name or '-' if missing * - * This code parses those words. + * The following are permitted: + * - - + * - <data_dev> + * <meta_dev> <data_dev> + * + * The following is not allowed: + * <meta_dev> - + * + * This code parses those words. If there is a failure, + * the caller must use context_free to unwind the operations. */ static int dev_parms(struct raid_set *rs, char **argv) { @@ -182,8 +201,16 @@ static int dev_parms(struct raid_set *rs, char **argv) rs->dev[i].rdev.mddev = &rs->md; if (strcmp(argv[0], "-")) { - rs->ti->error = "Metadata devices not supported"; - return -EINVAL; + ret = dm_get_device(rs->ti, argv[0], + dm_table_get_mode(rs->ti->table), + &rs->dev[i].meta_dev); + rs->ti->error = "RAID metadata device lookup failure"; + if (ret) + return ret; + + rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL); + if (!rs->dev[i].rdev.sb_page) + return -ENOMEM; } if (!strcmp(argv[1], "-")) { @@ -193,6 +220,10 @@ static int dev_parms(struct raid_set *rs, char **argv) return -EINVAL; } + rs->ti->error = "No data device supplied with metadata device"; + if (rs->dev[i].meta_dev) + return -EINVAL; + continue; } @@ -204,6 +235,10 @@ static int dev_parms(struct raid_set *rs, char **argv) return ret; } + if (rs->dev[i].meta_dev) { + metadata_available = 1; + rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev; + } rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) @@ -235,33 +270,109 @@ static int dev_parms(struct raid_set *rs, char **argv) } /* + * validate_region_size + * @rs + * @region_size: region size in sectors. If 0, pick a size (4MiB default). + * + * Set rs->md.bitmap_info.chunksize (which really refers to 'region size'). + * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_region_size(struct raid_set *rs, unsigned long region_size) +{ + unsigned long min_region_size = rs->ti->len / (1 << 21); + + if (!region_size) { + /* + * Choose a reasonable default. All figures in sectors. + */ + if (min_region_size > (1 << 13)) { + DMINFO("Choosing default region size of %lu sectors", + region_size); + region_size = min_region_size; + } else { + DMINFO("Choosing default region size of 4MiB"); + region_size = 1 << 13; /* sectors */ + } + } else { + /* + * Validate user-supplied value. + */ + if (region_size > rs->ti->len) { + rs->ti->error = "Supplied region size is too large"; + return -EINVAL; + } + + if (region_size < min_region_size) { + DMERR("Supplied region_size (%lu sectors) below minimum (%lu)", + region_size, min_region_size); + rs->ti->error = "Supplied region size is too small"; + return -EINVAL; + } + + if (!is_power_of_2(region_size)) { + rs->ti->error = "Region size is not a power of 2"; + return -EINVAL; + } + + if (region_size < rs->md.chunk_sectors) { + rs->ti->error = "Region size is smaller than the chunk size"; + return -EINVAL; + } + } + + /* + * Convert sectors to bytes. + */ + rs->md.bitmap_info.chunksize = (region_size << 9); + + return 0; +} + +/* * Possible arguments are... - * RAID456: * <chunk_size> [optional_args] * - * Optional args: - * [[no]sync] Force or prevent recovery of the entire array + * Argument definitions + * <chunk_size> The number of sectors per disk that + * will form the "stripe" + * [[no]sync] Force or prevent recovery of the + * entire array * [rebuild <idx>] Rebuild the drive indicated by the index - * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits + * [daemon_sleep <ms>] Time between bitmap daemon work to + * clear bits * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization + * [write_mostly <idx>] Indicate a write mostly drive via index * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) * [stripe_cache <sectors>] Stripe cache size for higher RAIDs + * [region_size <sectors>] Defines granularity of bitmap */ static int parse_raid_params(struct raid_set *rs, char **argv, unsigned num_raid_params) { unsigned i, rebuild_cnt = 0; - unsigned long value; + unsigned long value, region_size = 0; char *key; /* * First, parse the in-order required arguments + * "chunk_size" is the only argument of this type. */ - if ((strict_strtoul(argv[0], 10, &value) < 0) || - !is_power_of_2(value) || (value < 8)) { + if ((strict_strtoul(argv[0], 10, &value) < 0)) { rs->ti->error = "Bad chunk size"; return -EINVAL; + } else if (rs->raid_type->level == 1) { + if (value) + DMERR("Ignoring chunk size parameter for RAID 1"); + value = 0; + } else if (!is_power_of_2(value)) { + rs->ti->error = "Chunk size must be a power of 2"; + return -EINVAL; + } else if (value < 8) { + rs->ti->error = "Chunk size value is too small"; + return -EINVAL; } rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; @@ -269,22 +380,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv, num_raid_params--; /* - * Second, parse the unordered optional arguments + * We set each individual device as In_sync with a completed + * 'recovery_offset'. If there has been a device failure or + * replacement then one of the following cases applies: + * + * 1) User specifies 'rebuild'. + * - Device is reset when param is read. + * 2) A new device is supplied. + * - No matching superblock found, resets device. + * 3) Device failure was transient and returns on reload. + * - Failure noticed, resets device for bitmap replay. + * 4) Device hadn't completed recovery after previous failure. + * - Superblock is read and overrides recovery_offset. + * + * What is found in the superblocks of the devices is always + * authoritative, unless 'rebuild' or '[no]sync' was specified. */ - for (i = 0; i < rs->md.raid_disks; i++) + for (i = 0; i < rs->md.raid_disks; i++) { set_bit(In_sync, &rs->dev[i].rdev.flags); + rs->dev[i].rdev.recovery_offset = MaxSector; + } + /* + * Second, parse the unordered optional arguments + */ for (i = 0; i < num_raid_params; i++) { - if (!strcmp(argv[i], "nosync")) { + if (!strcasecmp(argv[i], "nosync")) { rs->md.recovery_cp = MaxSector; rs->print_flags |= DMPF_NOSYNC; - rs->md.flags |= MD_SYNC_STATE_FORCED; continue; } - if (!strcmp(argv[i], "sync")) { + if (!strcasecmp(argv[i], "sync")) { rs->md.recovery_cp = 0; rs->print_flags |= DMPF_SYNC; - rs->md.flags |= MD_SYNC_STATE_FORCED; continue; } @@ -300,9 +428,13 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; } - if (!strcmp(key, "rebuild")) { - if (++rebuild_cnt > rs->raid_type->parity_devs) { - rs->ti->error = "Too many rebuild drives given"; + if (!strcasecmp(key, "rebuild")) { + rebuild_cnt++; + if (((rs->raid_type->level != 1) && + (rebuild_cnt > rs->raid_type->parity_devs)) || + ((rs->raid_type->level == 1) && + (rebuild_cnt > (rs->md.raid_disks - 1)))) { + rs->ti->error = "Too many rebuild devices specified for given RAID type"; return -EINVAL; } if (value > rs->md.raid_disks) { @@ -311,7 +443,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } clear_bit(In_sync, &rs->dev[value].rdev.flags); rs->dev[value].rdev.recovery_offset = 0; - } else if (!strcmp(key, "max_write_behind")) { + rs->print_flags |= DMPF_REBUILD; + } else if (!strcasecmp(key, "write_mostly")) { + if (rs->raid_type->level != 1) { + rs->ti->error = "write_mostly option is only valid for RAID1"; + return -EINVAL; + } + if (value > rs->md.raid_disks) { + rs->ti->error = "Invalid write_mostly drive index given"; + return -EINVAL; + } + set_bit(WriteMostly, &rs->dev[value].rdev.flags); + } else if (!strcasecmp(key, "max_write_behind")) { + if (rs->raid_type->level != 1) { + rs->ti->error = "max_write_behind option is only valid for RAID1"; + return -EINVAL; + } rs->print_flags |= DMPF_MAX_WRITE_BEHIND; /* @@ -324,14 +471,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; } rs->md.bitmap_info.max_write_behind = value; - } else if (!strcmp(key, "daemon_sleep")) { + } else if (!strcasecmp(key, "daemon_sleep")) { rs->print_flags |= DMPF_DAEMON_SLEEP; if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { rs->ti->error = "daemon sleep period out of range"; return -EINVAL; } rs->md.bitmap_info.daemon_sleep = value; - } else if (!strcmp(key, "stripe_cache")) { + } else if (!strcasecmp(key, "stripe_cache")) { rs->print_flags |= DMPF_STRIPE_CACHE; /* @@ -348,20 +495,23 @@ static int parse_raid_params(struct raid_set *rs, char **argv, rs->ti->error = "Bad stripe_cache size"; return -EINVAL; } - } else if (!strcmp(key, "min_recovery_rate")) { + } else if (!strcasecmp(key, "min_recovery_rate")) { rs->print_flags |= DMPF_MIN_RECOVERY_RATE; if (value > INT_MAX) { rs->ti->error = "min_recovery_rate out of range"; return -EINVAL; } rs->md.sync_speed_min = (int)value; - } else if (!strcmp(key, "max_recovery_rate")) { + } else if (!strcasecmp(key, "max_recovery_rate")) { rs->print_flags |= DMPF_MAX_RECOVERY_RATE; if (value > INT_MAX) { rs->ti->error = "max_recovery_rate out of range"; return -EINVAL; } rs->md.sync_speed_max = (int)value; + } else if (!strcasecmp(key, "region_size")) { + rs->print_flags |= DMPF_REGION_SIZE; + region_size = value; } else { DMERR("Unable to parse RAID parameter: %s", key); rs->ti->error = "Unable to parse RAID parameters"; @@ -369,6 +519,19 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } } + if (validate_region_size(rs, region_size)) + return -EINVAL; + + if (rs->md.chunk_sectors) + rs->ti->split_io = rs->md.chunk_sectors; + else + rs->ti->split_io = region_size; + + if (rs->md.chunk_sectors) + rs->ti->split_io = rs->md.chunk_sectors; + else + rs->ti->split_io = region_size; + /* Assume there are no metadata devices until the drives are parsed */ rs->md.persistent = 0; rs->md.external = 1; @@ -387,17 +550,351 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) { struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + if (rs->raid_type->level == 1) + return md_raid1_congested(&rs->md, bits); + return md_raid5_congested(&rs->md, bits); } /* + * This structure is never routinely used by userspace, unlike md superblocks. + * Devices with this superblock should only ever be accessed via device-mapper. + */ +#define DM_RAID_MAGIC 0x64526D44 +struct dm_raid_superblock { + __le32 magic; /* "DmRd" */ + __le32 features; /* Used to indicate possible future changes */ + + __le32 num_devices; /* Number of devices in this array. (Max 64) */ + __le32 array_position; /* The position of this drive in the array */ + + __le64 events; /* Incremented by md when superblock updated */ + __le64 failed_devices; /* Bit field of devices to indicate failures */ + + /* + * This offset tracks the progress of the repair or replacement of + * an individual drive. + */ + __le64 disk_recovery_offset; + + /* + * This offset tracks the progress of the initial array + * synchronisation/parity calculation. + */ + __le64 array_resync_offset; + + /* + * RAID characteristics + */ + __le32 level; + __le32 layout; + __le32 stripe_sectors; + + __u8 pad[452]; /* Round struct to 512 bytes. */ + /* Always set to 0 when writing. */ +} __packed; + +static int read_disk_sb(mdk_rdev_t *rdev, int size) +{ + BUG_ON(!rdev->sb_page); + + if (rdev->sb_loaded) + return 0; + + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { + DMERR("Failed to read device superblock"); + return -EINVAL; + } + + rdev->sb_loaded = 1; + + return 0; +} + +static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdk_rdev_t *r, *t; + uint64_t failed_devices; + struct dm_raid_superblock *sb; + + sb = page_address(rdev->sb_page); + failed_devices = le64_to_cpu(sb->failed_devices); + + rdev_for_each(r, t, mddev) + if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) + failed_devices |= (1ULL << r->raid_disk); + + memset(sb, 0, sizeof(*sb)); + + sb->magic = cpu_to_le32(DM_RAID_MAGIC); + sb->features = cpu_to_le32(0); /* No features yet */ + + sb->num_devices = cpu_to_le32(mddev->raid_disks); + sb->array_position = cpu_to_le32(rdev->raid_disk); + + sb->events = cpu_to_le64(mddev->events); + sb->failed_devices = cpu_to_le64(failed_devices); + + sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); + sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); + + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); + sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors); +} + +/* + * super_load + * + * This function creates a superblock if one is not found on the device + * and will decide which superblock to use if there's a choice. + * + * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise + */ +static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev) +{ + int ret; + struct dm_raid_superblock *sb; + struct dm_raid_superblock *refsb; + uint64_t events_sb, events_refsb; + + rdev->sb_start = 0; + rdev->sb_size = sizeof(*sb); + + ret = read_disk_sb(rdev, rdev->sb_size); + if (ret) + return ret; + + sb = page_address(rdev->sb_page); + if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) { + super_sync(rdev->mddev, rdev); + + set_bit(FirstUse, &rdev->flags); + + /* Force writing of superblocks to disk */ + set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); + + /* Any superblock is better than none, choose that if given */ + return refdev ? 0 : 1; + } + + if (!refdev) + return 1; + + events_sb = le64_to_cpu(sb->events); + + refsb = page_address(refdev->sb_page); + events_refsb = le64_to_cpu(refsb->events); + + return (events_sb > events_refsb) ? 1 : 0; +} + +static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int role; + struct raid_set *rs = container_of(mddev, struct raid_set, md); + uint64_t events_sb; + uint64_t failed_devices; + struct dm_raid_superblock *sb; + uint32_t new_devs = 0; + uint32_t rebuilds = 0; + mdk_rdev_t *r, *t; + struct dm_raid_superblock *sb2; + + sb = page_address(rdev->sb_page); + events_sb = le64_to_cpu(sb->events); + failed_devices = le64_to_cpu(sb->failed_devices); + + /* + * Initialise to 1 if this is a new superblock. + */ + mddev->events = events_sb ? : 1; + + /* + * Reshaping is not currently allowed + */ + if ((le32_to_cpu(sb->level) != mddev->level) || + (le32_to_cpu(sb->layout) != mddev->layout) || + (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { + DMERR("Reshaping arrays not yet supported."); + return -EINVAL; + } + + /* We can only change the number of devices in RAID1 right now */ + if ((rs->raid_type->level != 1) && + (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { + DMERR("Reshaping arrays not yet supported."); + return -EINVAL; + } + + if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))) + mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); + + /* + * During load, we set FirstUse if a new superblock was written. + * There are two reasons we might not have a superblock: + * 1) The array is brand new - in which case, all of the + * devices must have their In_sync bit set. Also, + * recovery_cp must be 0, unless forced. + * 2) This is a new device being added to an old array + * and the new device needs to be rebuilt - in which + * case the In_sync bit will /not/ be set and + * recovery_cp must be MaxSector. + */ + rdev_for_each(r, t, mddev) { + if (!test_bit(In_sync, &r->flags)) { + if (!test_bit(FirstUse, &r->flags)) + DMERR("Superblock area of " + "rebuild device %d should have been " + "cleared.", r->raid_disk); + set_bit(FirstUse, &r->flags); + rebuilds++; + } else if (test_bit(FirstUse, &r->flags)) + new_devs++; + } + + if (!rebuilds) { + if (new_devs == mddev->raid_disks) { + DMINFO("Superblocks created for new array"); + set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); + } else if (new_devs) { + DMERR("New device injected " + "into existing array without 'rebuild' " + "parameter specified"); + return -EINVAL; + } + } else if (new_devs) { + DMERR("'rebuild' devices cannot be " + "injected into an array with other first-time devices"); + return -EINVAL; + } else if (mddev->recovery_cp != MaxSector) { + DMERR("'rebuild' specified while array is not in-sync"); + return -EINVAL; + } + + /* + * Now we set the Faulty bit for those devices that are + * recorded in the superblock as failed. + */ + rdev_for_each(r, t, mddev) { + if (!r->sb_page) + continue; + sb2 = page_address(r->sb_page); + sb2->failed_devices = 0; + + /* + * Check for any device re-ordering. + */ + if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { + role = le32_to_cpu(sb2->array_position); + if (role != r->raid_disk) { + if (rs->raid_type->level != 1) { + rs->ti->error = "Cannot change device " + "positions in RAID array"; + return -EINVAL; + } + DMINFO("RAID1 device #%d now at position #%d", + role, r->raid_disk); + } + + /* + * Partial recovery is performed on + * returning failed devices. + */ + if (failed_devices & (1 << role)) + set_bit(Faulty, &r->flags); + } + } + + return 0; +} + +static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ + struct dm_raid_superblock *sb = page_address(rdev->sb_page); + + /* + * If mddev->events is not set, we know we have not yet initialized + * the array. + */ + if (!mddev->events && super_init_validation(mddev, rdev)) + return -EINVAL; + + mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */ + rdev->mddev->bitmap_info.default_offset = 4096 >> 9; + if (!test_bit(FirstUse, &rdev->flags)) { + rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); + if (rdev->recovery_offset != MaxSector) + clear_bit(In_sync, &rdev->flags); + } + + /* + * If a device comes back, set it as not In_sync and no longer faulty. + */ + if (test_bit(Faulty, &rdev->flags)) { + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->recovery_offset = 0; + } + + clear_bit(FirstUse, &rdev->flags); + + return 0; +} + +/* + * Analyse superblocks and select the freshest. + */ +static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) +{ + int ret; + mdk_rdev_t *rdev, *freshest, *tmp; + mddev_t *mddev = &rs->md; + + freshest = NULL; + rdev_for_each(rdev, tmp, mddev) { + if (!rdev->meta_bdev) + continue; + + ret = super_load(rdev, freshest); + + switch (ret) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + ti->error = "Failed to load superblock"; + return ret; + } + } + + if (!freshest) + return 0; + + /* + * Validation of the freshest device provides the source of + * validation for the remaining devices. + */ + ti->error = "Unable to assemble array: Invalid superblocks"; + if (super_validate(mddev, freshest)) + return -EINVAL; + + rdev_for_each(rdev, tmp, mddev) + if ((rdev != freshest) && super_validate(mddev, rdev)) + return -EINVAL; + + return 0; +} + +/* * Construct a RAID4/5/6 mapping: * Args: * <raid_type> <#raid_params> <raid_params> \ * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } * - * ** metadata devices are not supported yet, use '-' instead ** - * * <raid_params> varies by <raid_type>. See 'parse_raid_params' for * details on possible <raid_params>. */ @@ -465,8 +962,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) if (ret) goto bad; + rs->md.sync_super = super_sync; + ret = analyse_superblocks(ti, rs); + if (ret) + goto bad; + INIT_WORK(&rs->md.event_work, do_table_event); - ti->split_io = rs->md.chunk_sectors; ti->private = rs; mutex_lock(&rs->md.reconfig_mutex); @@ -482,6 +983,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) rs->callbacks.congested_fn = raid_is_congested; dm_table_add_target_callbacks(ti->table, &rs->callbacks); + mddev_suspend(&rs->md); return 0; bad: @@ -546,12 +1048,17 @@ static int raid_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: /* The string you would use to construct this array */ - for (i = 0; i < rs->md.raid_disks; i++) - if (rs->dev[i].data_dev && + for (i = 0; i < rs->md.raid_disks; i++) { + if ((rs->print_flags & DMPF_REBUILD) && + rs->dev[i].data_dev && !test_bit(In_sync, &rs->dev[i].rdev.flags)) - raid_param_cnt++; /* for rebuilds */ + raid_param_cnt += 2; /* for rebuilds */ + if (rs->dev[i].data_dev && + test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + raid_param_cnt += 2; + } - raid_param_cnt += (hweight64(rs->print_flags) * 2); + raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2); if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) raid_param_cnt--; @@ -565,7 +1072,8 @@ static int raid_status(struct dm_target *ti, status_type_t type, DMEMIT(" nosync"); for (i = 0; i < rs->md.raid_disks; i++) - if (rs->dev[i].data_dev && + if ((rs->print_flags & DMPF_REBUILD) && + rs->dev[i].data_dev && !test_bit(In_sync, &rs->dev[i].rdev.flags)) DMEMIT(" rebuild %u", i); @@ -579,6 +1087,11 @@ static int raid_status(struct dm_target *ti, status_type_t type, if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + DMEMIT(" write_mostly %u", i); + if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) DMEMIT(" max_write_behind %lu", rs->md.bitmap_info.max_write_behind); @@ -591,9 +1104,16 @@ static int raid_status(struct dm_target *ti, status_type_t type, conf ? conf->max_nr_stripes * 2 : 0); } + if (rs->print_flags & DMPF_REGION_SIZE) + DMEMIT(" region_size %lu", + rs->md.bitmap_info.chunksize >> 9); + DMEMIT(" %d", rs->md.raid_disks); for (i = 0; i < rs->md.raid_disks; i++) { - DMEMIT(" -"); /* metadata device */ + if (rs->dev[i].meta_dev) + DMEMIT(" %s", rs->dev[i].meta_dev->name); + else + DMEMIT(" -"); if (rs->dev[i].data_dev) DMEMIT(" %s", rs->dev[i].data_dev->name); @@ -650,12 +1170,13 @@ static void raid_resume(struct dm_target *ti) { struct raid_set *rs = ti->private; + bitmap_load(&rs->md); mddev_resume(&rs->md); } static struct target_type raid_target = { .name = "raid", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 135c2f1fdbfc..d1f1d7017103 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -58,25 +58,30 @@ #define NUM_SNAPSHOT_HDR_CHUNKS 1 struct disk_header { - uint32_t magic; + __le32 magic; /* * Is this snapshot valid. There is no way of recovering * an invalid snapshot. */ - uint32_t valid; + __le32 valid; /* * Simple, incrementing version. no backward * compatibility. */ - uint32_t version; + __le32 version; /* In sectors */ - uint32_t chunk_size; -}; + __le32 chunk_size; +} __packed; struct disk_exception { + __le64 old_chunk; + __le64 new_chunk; +} __packed; + +struct core_exception { uint64_t old_chunk; uint64_t new_chunk; }; @@ -169,10 +174,9 @@ static int alloc_area(struct pstore *ps) if (!ps->area) goto err_area; - ps->zero_area = vmalloc(len); + ps->zero_area = vzalloc(len); if (!ps->zero_area) goto err_zero_area; - memset(ps->zero_area, 0, len); ps->header_area = vmalloc(len); if (!ps->header_area) @@ -396,32 +400,32 @@ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) } static void read_exception(struct pstore *ps, - uint32_t index, struct disk_exception *result) + uint32_t index, struct core_exception *result) { - struct disk_exception *e = get_exception(ps, index); + struct disk_exception *de = get_exception(ps, index); /* copy it */ - result->old_chunk = le64_to_cpu(e->old_chunk); - result->new_chunk = le64_to_cpu(e->new_chunk); + result->old_chunk = le64_to_cpu(de->old_chunk); + result->new_chunk = le64_to_cpu(de->new_chunk); } static void write_exception(struct pstore *ps, - uint32_t index, struct disk_exception *de) + uint32_t index, struct core_exception *e) { - struct disk_exception *e = get_exception(ps, index); + struct disk_exception *de = get_exception(ps, index); /* copy it */ - e->old_chunk = cpu_to_le64(de->old_chunk); - e->new_chunk = cpu_to_le64(de->new_chunk); + de->old_chunk = cpu_to_le64(e->old_chunk); + de->new_chunk = cpu_to_le64(e->new_chunk); } static void clear_exception(struct pstore *ps, uint32_t index) { - struct disk_exception *e = get_exception(ps, index); + struct disk_exception *de = get_exception(ps, index); /* clear it */ - e->old_chunk = 0; - e->new_chunk = 0; + de->old_chunk = 0; + de->new_chunk = 0; } /* @@ -437,13 +441,13 @@ static int insert_exceptions(struct pstore *ps, { int r; unsigned int i; - struct disk_exception de; + struct core_exception e; /* presume the area is full */ *full = 1; for (i = 0; i < ps->exceptions_per_area; i++) { - read_exception(ps, i, &de); + read_exception(ps, i, &e); /* * If the new_chunk is pointing at the start of @@ -451,7 +455,7 @@ static int insert_exceptions(struct pstore *ps, * is we know that we've hit the end of the * exceptions. Therefore the area is not full. */ - if (de.new_chunk == 0LL) { + if (e.new_chunk == 0LL) { ps->current_committed = i; *full = 0; break; @@ -460,13 +464,13 @@ static int insert_exceptions(struct pstore *ps, /* * Keep track of the start of the free chunks. */ - if (ps->next_free <= de.new_chunk) - ps->next_free = de.new_chunk + 1; + if (ps->next_free <= e.new_chunk) + ps->next_free = e.new_chunk + 1; /* * Otherwise we add the exception to the snapshot. */ - r = callback(callback_context, de.old_chunk, de.new_chunk); + r = callback(callback_context, e.old_chunk, e.new_chunk); if (r) return r; } @@ -563,7 +567,7 @@ static int persistent_read_metadata(struct dm_exception_store *store, ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / sizeof(struct disk_exception); ps->callbacks = dm_vcalloc(ps->exceptions_per_area, - sizeof(*ps->callbacks)); + sizeof(*ps->callbacks)); if (!ps->callbacks) return -ENOMEM; @@ -641,12 +645,12 @@ static void persistent_commit_exception(struct dm_exception_store *store, { unsigned int i; struct pstore *ps = get_info(store); - struct disk_exception de; + struct core_exception ce; struct commit_callback *cb; - de.old_chunk = e->old_chunk; - de.new_chunk = e->new_chunk; - write_exception(ps, ps->current_committed++, &de); + ce.old_chunk = e->old_chunk; + ce.new_chunk = e->new_chunk; + write_exception(ps, ps->current_committed++, &ce); /* * Add the callback to the back of the array. This code @@ -670,7 +674,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, * If we completely filled the current area, then wipe the next one. */ if ((ps->current_committed == ps->exceptions_per_area) && - zero_disk_area(ps, ps->current_area + 1)) + zero_disk_area(ps, ps->current_area + 1)) ps->valid = 0; /* @@ -701,7 +705,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store, chunk_t *last_new_chunk) { struct pstore *ps = get_info(store); - struct disk_exception de; + struct core_exception ce; int nr_consecutive; int r; @@ -722,9 +726,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store, ps->current_committed = ps->exceptions_per_area; } - read_exception(ps, ps->current_committed - 1, &de); - *last_old_chunk = de.old_chunk; - *last_new_chunk = de.new_chunk; + read_exception(ps, ps->current_committed - 1, &ce); + *last_old_chunk = ce.old_chunk; + *last_new_chunk = ce.new_chunk; /* * Find number of consecutive chunks within the current area, @@ -733,9 +737,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store, for (nr_consecutive = 1; nr_consecutive < ps->current_committed; nr_consecutive++) { read_exception(ps, ps->current_committed - 1 - nr_consecutive, - &de); - if (de.old_chunk != *last_old_chunk - nr_consecutive || - de.new_chunk != *last_new_chunk - nr_consecutive) + &ce); + if (ce.old_chunk != *last_old_chunk - nr_consecutive || + ce.new_chunk != *last_new_chunk - nr_consecutive) break; } @@ -753,7 +757,7 @@ static int persistent_commit_merge(struct dm_exception_store *store, for (i = 0; i < nr_merged; i++) clear_exception(ps, ps->current_committed - 1 - i); - r = area_io(ps, WRITE); + r = area_io(ps, WRITE_FLUSH_FUA); if (r < 0) return r; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 9ecff5f3023a..6f758870fc19 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -30,16 +30,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; ((ti)->type->name == dm_snapshot_merge_target_name) /* - * The percentage increment we will wake up users at - */ -#define WAKE_UP_PERCENT 5 - -/* - * kcopyd priority of snapshot operations - */ -#define SNAPSHOT_COPY_PRIORITY 2 - -/* * The size of the mempool used to track chunks in use. */ #define MIN_IOS 256 @@ -180,6 +170,13 @@ struct dm_snap_pending_exception { * kcopyd. */ int started; + + /* + * For writing a complete chunk, bypassing the copy. + */ + struct bio *full_bio; + bio_end_io_t *full_bio_end_io; + void *full_bio_private; }; /* @@ -1055,8 +1052,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) { - ti->error = "Cannot allocate snapshot context private " - "structure"; + ti->error = "Cannot allocate private snapshot structure"; r = -ENOMEM; goto bad; } @@ -1380,6 +1376,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) struct dm_snapshot *s = pe->snap; struct bio *origin_bios = NULL; struct bio *snapshot_bios = NULL; + struct bio *full_bio = NULL; int error = 0; if (!success) { @@ -1415,10 +1412,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) */ dm_insert_exception(&s->complete, e); - out: +out: dm_remove_exception(&pe->e); snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = bio_list_get(&pe->origin_bios); + full_bio = pe->full_bio; + if (full_bio) { + full_bio->bi_end_io = pe->full_bio_end_io; + full_bio->bi_private = pe->full_bio_private; + } free_pending_exception(pe); increment_pending_exceptions_done_count(); @@ -1426,10 +1428,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) up_write(&s->lock); /* Submit any pending write bios */ - if (error) + if (error) { + if (full_bio) + bio_io_error(full_bio); error_bios(snapshot_bios); - else + } else { + if (full_bio) + bio_endio(full_bio, 0); flush_bios(snapshot_bios); + } retry_origin_bios(s, origin_bios); } @@ -1480,8 +1487,33 @@ static void start_copy(struct dm_snap_pending_exception *pe) dest.count = src.count; /* Hand over to kcopyd */ - dm_kcopyd_copy(s->kcopyd_client, - &src, 1, &dest, 0, copy_callback, pe); + dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); +} + +static void full_bio_end_io(struct bio *bio, int error) +{ + void *callback_data = bio->bi_private; + + dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0); +} + +static void start_full_bio(struct dm_snap_pending_exception *pe, + struct bio *bio) +{ + struct dm_snapshot *s = pe->snap; + void *callback_data; + + pe->full_bio = bio; + pe->full_bio_end_io = bio->bi_end_io; + pe->full_bio_private = bio->bi_private; + + callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, + copy_callback, pe); + + bio->bi_end_io = full_bio_end_io; + bio->bi_private = callback_data; + + generic_make_request(bio); } static struct dm_snap_pending_exception * @@ -1519,6 +1551,7 @@ __find_pending_exception(struct dm_snapshot *s, bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); pe->started = 0; + pe->full_bio = NULL; if (s->store->type->prepare_exception(s->store, &pe->e)) { free_pending_exception(pe); @@ -1612,10 +1645,19 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, } remap_exception(s, &pe->e, bio, chunk); - bio_list_add(&pe->snapshot_bios, bio); r = DM_MAPIO_SUBMITTED; + if (!pe->started && + bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { + pe->started = 1; + up_write(&s->lock); + start_full_bio(pe, bio); + goto out; + } + + bio_list_add(&pe->snapshot_bios, bio); + if (!pe->started) { /* this is protected by snap->lock */ pe->started = 1; @@ -1628,9 +1670,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, map_context->ptr = track_chunk(s, chunk); } - out_unlock: +out_unlock: up_write(&s->lock); - out: +out: return r; } @@ -1974,7 +2016,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, pe_to_start_now = pe; } - next_snapshot: +next_snapshot: up_write(&snap->lock); if (pe_to_start_now) { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index bfe9c2333cea..986b8754bb08 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -54,7 +54,6 @@ struct dm_table { sector_t *highs; struct dm_target *targets; - unsigned discards_supported:1; unsigned integrity_supported:1; /* @@ -154,12 +153,11 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) return NULL; size = nmemb * elem_size; - addr = vmalloc(size); - if (addr) - memset(addr, 0, size); + addr = vzalloc(size); return addr; } +EXPORT_SYMBOL(dm_vcalloc); /* * highs, and targets are managed as dynamic arrays during a @@ -209,7 +207,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode, INIT_LIST_HEAD(&t->devices); INIT_LIST_HEAD(&t->target_callbacks); atomic_set(&t->holders, 0); - t->discards_supported = 1; if (!num_targets) num_targets = KEYS_PER_NODE; @@ -281,6 +278,7 @@ void dm_table_get(struct dm_table *t) { atomic_inc(&t->holders); } +EXPORT_SYMBOL(dm_table_get); void dm_table_put(struct dm_table *t) { @@ -290,6 +288,7 @@ void dm_table_put(struct dm_table *t) smp_mb__before_atomic_dec(); atomic_dec(&t->holders); } +EXPORT_SYMBOL(dm_table_put); /* * Checks to see if we need to extend highs or targets. @@ -455,13 +454,14 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, * Add a device to the list, or just increment the usage count if * it's already present. */ -static int __table_get_device(struct dm_table *t, struct dm_target *ti, - const char *path, fmode_t mode, struct dm_dev **result) +int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result) { int r; dev_t uninitialized_var(dev); struct dm_dev_internal *dd; unsigned int major, minor; + struct dm_table *t = ti->table; BUG_ON(!t); @@ -509,6 +509,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, *result = &dd->dm_dev; return 0; } +EXPORT_SYMBOL(dm_get_device); int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) @@ -539,23 +540,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, * If not we'll force DM to use PAGE_SIZE or * smaller I/O, just to be safe. */ - - if (q->merge_bvec_fn && !ti->type->merge) + if (dm_queue_merge_is_compulsory(q) && !ti->type->merge) blk_limits_max_hw_sectors(limits, (unsigned int) (PAGE_SIZE >> 9)); return 0; } EXPORT_SYMBOL_GPL(dm_set_device_limits); -int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, - struct dm_dev **result) -{ - return __table_get_device(ti->table, ti, path, mode, result); -} - - /* - * Decrement a devices use count and remove it if necessary. + * Decrement a device's use count and remove it if necessary. */ void dm_put_device(struct dm_target *ti, struct dm_dev *d) { @@ -568,6 +561,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d) kfree(dd); } } +EXPORT_SYMBOL(dm_put_device); /* * Checks to see if the target joins onto the end of the table. @@ -791,8 +785,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; - if (!tgt->num_discard_requests) - t->discards_supported = 0; + if (!tgt->num_discard_requests && tgt->discards_supported) + DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.", + dm_device_name(t->md), type); return 0; @@ -802,6 +797,63 @@ int dm_table_add_target(struct dm_table *t, const char *type, return r; } +/* + * Target argument parsing helpers. + */ +static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error, unsigned grouped) +{ + const char *arg_str = dm_shift_arg(arg_set); + + if (!arg_str || + (sscanf(arg_str, "%u", value) != 1) || + (*value < arg->min) || + (*value > arg->max) || + (grouped && arg_set->argc < *value)) { + *error = arg->error; + return -EINVAL; + } + + return 0; +} + +int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error) +{ + return validate_next_arg(arg, arg_set, value, error, 0); +} +EXPORT_SYMBOL(dm_read_arg); + +int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error) +{ + return validate_next_arg(arg, arg_set, value, error, 1); +} +EXPORT_SYMBOL(dm_read_arg_group); + +const char *dm_shift_arg(struct dm_arg_set *as) +{ + char *r; + + if (as->argc) { + as->argc--; + r = *as->argv; + as->argv++; + return r; + } + + return NULL; +} +EXPORT_SYMBOL(dm_shift_arg); + +void dm_consume_args(struct dm_arg_set *as, unsigned num_args) +{ + BUG_ON(as->argc < num_args); + as->argc -= num_args; + as->argv += num_args; +} +EXPORT_SYMBOL(dm_consume_args); + static int dm_table_set_type(struct dm_table *t) { unsigned i; @@ -1077,11 +1129,13 @@ void dm_table_event(struct dm_table *t) t->event_fn(t->event_context); mutex_unlock(&_event_lock); } +EXPORT_SYMBOL(dm_table_event); sector_t dm_table_get_size(struct dm_table *t) { return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; } +EXPORT_SYMBOL(dm_table_get_size); struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) { @@ -1194,9 +1248,45 @@ static void dm_table_set_integrity(struct dm_table *t) blk_get_integrity(template_disk)); } +static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + unsigned flush = (*(unsigned *)data); + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && (q->flush_flags & flush); +} + +static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) +{ + struct dm_target *ti; + unsigned i = 0; + + /* + * Require at least one underlying device to support flushes. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting flushes must provide. + */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_flush_requests) + continue; + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_flush_capable, &flush)) + return 1; + } + + return 0; +} + void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { + unsigned flush = 0; + /* * Copy table's limits to the DM device's request_queue */ @@ -1207,6 +1297,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + if (dm_table_supports_flush(t, REQ_FLUSH)) { + flush |= REQ_FLUSH; + if (dm_table_supports_flush(t, REQ_FUA)) + flush |= REQ_FUA; + } + blk_queue_flush(q, flush); + dm_table_set_integrity(t); /* @@ -1237,6 +1334,7 @@ fmode_t dm_table_get_mode(struct dm_table *t) { return t->mode; } +EXPORT_SYMBOL(dm_table_get_mode); static void suspend_targets(struct dm_table *t, unsigned postsuspend) { @@ -1345,6 +1443,7 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) { return t->md; } +EXPORT_SYMBOL(dm_table_get_md); static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) @@ -1359,19 +1458,19 @@ bool dm_table_supports_discards(struct dm_table *t) struct dm_target *ti; unsigned i = 0; - if (!t->discards_supported) - return 0; - /* * Unless any target used by the table set discards_supported, * require at least one underlying device to support discards. * t->devices includes internal dm devices such as mirror logs * so we need to use iterate_devices here, which targets - * supporting discard must provide. + * supporting discard selectively must provide. */ while (i < dm_table_get_num_targets(t)) { ti = dm_table_get_target(t, i++); + if (!ti->num_discard_requests) + continue; + if (ti->discards_supported) return 1; @@ -1382,13 +1481,3 @@ bool dm_table_supports_discards(struct dm_table *t) return 0; } - -EXPORT_SYMBOL(dm_vcalloc); -EXPORT_SYMBOL(dm_get_device); -EXPORT_SYMBOL(dm_put_device); -EXPORT_SYMBOL(dm_table_event); -EXPORT_SYMBOL(dm_table_get_size); -EXPORT_SYMBOL(dm_table_get_mode); -EXPORT_SYMBOL(dm_table_get_md); -EXPORT_SYMBOL(dm_table_put); -EXPORT_SYMBOL(dm_table_get); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0cf68b478878..52b39f335bb3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -37,6 +37,8 @@ static const char *_name = DM_NAME; static unsigned int major = 0; static unsigned int _major = 0; +static DEFINE_IDR(_minor_idr); + static DEFINE_SPINLOCK(_minor_lock); /* * For bio-based dm. @@ -109,6 +111,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_FREEING 3 #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 +#define DMF_MERGE_IS_OPTIONAL 6 /* * Work processed by per-device workqueue. @@ -313,6 +316,12 @@ static void __exit dm_exit(void) while (i--) _exits[i](); + + /* + * Should be empty by this point. + */ + idr_remove_all(&_minor_idr); + idr_destroy(&_minor_idr); } /* @@ -1171,7 +1180,8 @@ static int __clone_and_map_discard(struct clone_info *ci) /* * Even though the device advertised discard support, - * reconfiguration might have changed that since the + * that does not mean every target supports it, and + * reconfiguration might also have changed that since the * check was performed. */ if (!ti->num_discard_requests) @@ -1705,8 +1715,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits) /*----------------------------------------------------------------- * An IDR is used to keep track of allocated minor numbers. *---------------------------------------------------------------*/ -static DEFINE_IDR(_minor_idr); - static void free_minor(int minor) { spin_lock(&_minor_lock); @@ -1800,7 +1808,6 @@ static void dm_init_md_queue(struct mapped_device *md) blk_queue_make_request(md->queue, dm_request); blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); blk_queue_merge_bvec(md->queue, dm_merge_bvec); - blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA); } /* @@ -1986,6 +1993,59 @@ static void __set_size(struct mapped_device *md, sector_t size) } /* + * Return 1 if the queue has a compulsory merge_bvec_fn function. + * + * If this function returns 0, then the device is either a non-dm + * device without a merge_bvec_fn, or it is a dm device that is + * able to split any bios it receives that are too big. + */ +int dm_queue_merge_is_compulsory(struct request_queue *q) +{ + struct mapped_device *dev_md; + + if (!q->merge_bvec_fn) + return 0; + + if (q->make_request_fn == dm_request) { + dev_md = q->queuedata; + if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) + return 0; + } + + return 1; +} + +static int dm_device_merge_is_compulsory(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); + + return dm_queue_merge_is_compulsory(q); +} + +/* + * Return 1 if it is acceptable to ignore merge_bvec_fn based + * on the properties of the underlying devices. + */ +static int dm_table_merge_is_optional(struct dm_table *table) +{ + unsigned i = 0; + struct dm_target *ti; + + while (i < dm_table_get_num_targets(table)) { + ti = dm_table_get_target(table, i++); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) + return 0; + } + + return 1; +} + +/* * Returns old map, which caller must destroy. */ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, @@ -1995,6 +2055,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, struct request_queue *q = md->queue; sector_t size; unsigned long flags; + int merge_is_optional; size = dm_table_get_size(t); @@ -2020,10 +2081,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, __bind_mempools(md, t); + merge_is_optional = dm_table_merge_is_optional(t); + write_lock_irqsave(&md->map_lock, flags); old_map = md->map; md->map = t; dm_table_set_restrictions(t, q, limits); + if (merge_is_optional) + set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); + else + clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); write_unlock_irqrestore(&md->map_lock, flags); return old_map; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1aaf16746da8..6745dbd278a4 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -66,6 +66,8 @@ int dm_table_alloc_md_mempools(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); +int dm_queue_merge_is_compulsory(struct request_queue *q); + void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); void dm_set_md_type(struct mapped_device *md, unsigned type); diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index d3e38790906e..d8e6a429e8ba 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -20,6 +20,7 @@ #include <linux/debugfs.h> #include <linux/device.h> #include <linux/slab.h> +#include <linux/async.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/suspend.h> @@ -33,6 +34,8 @@ #include "dummy.h" +#define rdev_crit(rdev, fmt, ...) \ + pr_crit("%s: " fmt, rdev_get_name(rdev), ##__VA_ARGS__) #define rdev_err(rdev, fmt, ...) \ pr_err("%s: " fmt, rdev_get_name(rdev), ##__VA_ARGS__) #define rdev_warn(rdev, fmt, ...) \ @@ -78,11 +81,13 @@ struct regulator { char *supply_name; struct device_attribute dev_attr; struct regulator_dev *rdev; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs; +#endif }; static int _regulator_is_enabled(struct regulator_dev *rdev); -static int _regulator_disable(struct regulator_dev *rdev, - struct regulator_dev **supply_rdev_ptr); +static int _regulator_disable(struct regulator_dev *rdev); static int _regulator_get_voltage(struct regulator_dev *rdev); static int _regulator_get_current_limit(struct regulator_dev *rdev); static unsigned int _regulator_get_mode(struct regulator_dev *rdev); @@ -90,6 +95,9 @@ static void _notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data); static int _regulator_do_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV); +static struct regulator *create_regulator(struct regulator_dev *rdev, + struct device *dev, + const char *supply_name); static const char *rdev_get_name(struct regulator_dev *rdev) { @@ -143,8 +151,11 @@ static int regulator_check_voltage(struct regulator_dev *rdev, if (*min_uV < rdev->constraints->min_uV) *min_uV = rdev->constraints->min_uV; - if (*min_uV > *max_uV) + if (*min_uV > *max_uV) { + rdev_err(rdev, "unsupportable voltage range: %d-%duV\n", + *min_uV, *max_uV); return -EINVAL; + } return 0; } @@ -197,8 +208,11 @@ static int regulator_check_current_limit(struct regulator_dev *rdev, if (*min_uA < rdev->constraints->min_uA) *min_uA = rdev->constraints->min_uA; - if (*min_uA > *max_uA) + if (*min_uA > *max_uA) { + rdev_err(rdev, "unsupportable current range: %d-%duA\n", + *min_uA, *max_uA); return -EINVAL; + } return 0; } @@ -213,6 +227,7 @@ static int regulator_mode_constrain(struct regulator_dev *rdev, int *mode) case REGULATOR_MODE_STANDBY: break; default: + rdev_err(rdev, "invalid mode %x specified\n", *mode); return -EINVAL; } @@ -779,7 +794,6 @@ static int machine_constraints_voltage(struct regulator_dev *rdev, if (ret < 0) { rdev_err(rdev, "failed to apply %duV constraint\n", rdev->constraints->min_uV); - rdev->constraints = NULL; return ret; } } @@ -882,7 +896,6 @@ static int set_machine_constraints(struct regulator_dev *rdev, ret = suspend_prepare(rdev, rdev->constraints->initial_state); if (ret < 0) { rdev_err(rdev, "failed to set suspend state\n"); - rdev->constraints = NULL; goto out; } } @@ -909,13 +922,15 @@ static int set_machine_constraints(struct regulator_dev *rdev, ret = ops->enable(rdev); if (ret < 0) { rdev_err(rdev, "failed to enable\n"); - rdev->constraints = NULL; goto out; } } print_constraints(rdev); + return 0; out: + kfree(rdev->constraints); + rdev->constraints = NULL; return ret; } @@ -929,21 +944,20 @@ out: * core if it's child is enabled. */ static int set_supply(struct regulator_dev *rdev, - struct regulator_dev *supply_rdev) + struct regulator_dev *supply_rdev) { int err; - err = sysfs_create_link(&rdev->dev.kobj, &supply_rdev->dev.kobj, - "supply"); - if (err) { - rdev_err(rdev, "could not add device link %s err %d\n", - supply_rdev->dev.kobj.name, err); - goto out; + rdev_info(rdev, "supplied by %s\n", rdev_get_name(supply_rdev)); + + rdev->supply = create_regulator(supply_rdev, &rdev->dev, "SUPPLY"); + if (IS_ERR(rdev->supply)) { + err = PTR_ERR(rdev->supply); + rdev->supply = NULL; + return err; } - rdev->supply = supply_rdev; - list_add(&rdev->slist, &supply_rdev->supply_list); -out: - return err; + + return 0; } /** @@ -1032,7 +1046,7 @@ static void unset_regulator_supplies(struct regulator_dev *rdev) } } -#define REG_STR_SIZE 32 +#define REG_STR_SIZE 64 static struct regulator *create_regulator(struct regulator_dev *rdev, struct device *dev, @@ -1052,8 +1066,9 @@ static struct regulator *create_regulator(struct regulator_dev *rdev, if (dev) { /* create a 'requested_microamps_name' sysfs entry */ - size = scnprintf(buf, REG_STR_SIZE, "microamps_requested_%s", - supply_name); + size = scnprintf(buf, REG_STR_SIZE, + "microamps_requested_%s-%s", + dev_name(dev), supply_name); if (size >= REG_STR_SIZE) goto overflow_err; @@ -1088,7 +1103,28 @@ static struct regulator *create_regulator(struct regulator_dev *rdev, dev->kobj.name, err); goto link_name_err; } + } else { + regulator->supply_name = kstrdup(supply_name, GFP_KERNEL); + if (regulator->supply_name == NULL) + goto attr_err; + } + +#ifdef CONFIG_DEBUG_FS + regulator->debugfs = debugfs_create_dir(regulator->supply_name, + rdev->debugfs); + if (IS_ERR_OR_NULL(regulator->debugfs)) { + rdev_warn(rdev, "Failed to create debugfs directory\n"); + regulator->debugfs = NULL; + } else { + debugfs_create_u32("uA_load", 0444, regulator->debugfs, + ®ulator->uA_load); + debugfs_create_u32("min_uV", 0444, regulator->debugfs, + ®ulator->min_uV); + debugfs_create_u32("max_uV", 0444, regulator->debugfs, + ®ulator->max_uV); } +#endif + mutex_unlock(&rdev->mutex); return regulator; link_name_err: @@ -1267,13 +1303,17 @@ void regulator_put(struct regulator *regulator) mutex_lock(®ulator_list_mutex); rdev = regulator->rdev; +#ifdef CONFIG_DEBUG_FS + debugfs_remove_recursive(regulator->debugfs); +#endif + /* remove any sysfs entries */ if (regulator->dev) { sysfs_remove_link(&rdev->dev.kobj, regulator->supply_name); - kfree(regulator->supply_name); device_remove_file(regulator->dev, ®ulator->dev_attr); kfree(regulator->dev_attr.attr.name); } + kfree(regulator->supply_name); list_del(®ulator->list); kfree(regulator); @@ -1301,19 +1341,6 @@ static int _regulator_enable(struct regulator_dev *rdev) { int ret, delay; - if (rdev->use_count == 0) { - /* do we need to enable the supply regulator first */ - if (rdev->supply) { - mutex_lock(&rdev->supply->mutex); - ret = _regulator_enable(rdev->supply); - mutex_unlock(&rdev->supply->mutex); - if (ret < 0) { - rdev_err(rdev, "failed to enable: %d\n", ret); - return ret; - } - } - } - /* check voltage and requested load before enabling */ if (rdev->constraints && (rdev->constraints->valid_ops_mask & REGULATOR_CHANGE_DRMS)) @@ -1388,19 +1415,27 @@ int regulator_enable(struct regulator *regulator) struct regulator_dev *rdev = regulator->rdev; int ret = 0; + if (rdev->supply) { + ret = regulator_enable(rdev->supply); + if (ret != 0) + return ret; + } + mutex_lock(&rdev->mutex); ret = _regulator_enable(rdev); mutex_unlock(&rdev->mutex); + + if (ret != 0) + regulator_disable(rdev->supply); + return ret; } EXPORT_SYMBOL_GPL(regulator_enable); /* locks held by regulator_disable() */ -static int _regulator_disable(struct regulator_dev *rdev, - struct regulator_dev **supply_rdev_ptr) +static int _regulator_disable(struct regulator_dev *rdev) { int ret = 0; - *supply_rdev_ptr = NULL; if (WARN(rdev->use_count <= 0, "unbalanced disables for %s\n", rdev_get_name(rdev))) @@ -1427,9 +1462,6 @@ static int _regulator_disable(struct regulator_dev *rdev, NULL); } - /* decrease our supplies ref count and disable if required */ - *supply_rdev_ptr = rdev->supply; - rdev->use_count = 0; } else if (rdev->use_count > 1) { @@ -1440,6 +1472,7 @@ static int _regulator_disable(struct regulator_dev *rdev, rdev->use_count--; } + return ret; } @@ -1458,29 +1491,21 @@ static int _regulator_disable(struct regulator_dev *rdev, int regulator_disable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; - struct regulator_dev *supply_rdev = NULL; int ret = 0; mutex_lock(&rdev->mutex); - ret = _regulator_disable(rdev, &supply_rdev); + ret = _regulator_disable(rdev); mutex_unlock(&rdev->mutex); - /* decrease our supplies ref count and disable if required */ - while (supply_rdev != NULL) { - rdev = supply_rdev; - - mutex_lock(&rdev->mutex); - _regulator_disable(rdev, &supply_rdev); - mutex_unlock(&rdev->mutex); - } + if (ret == 0 && rdev->supply) + regulator_disable(rdev->supply); return ret; } EXPORT_SYMBOL_GPL(regulator_disable); /* locks held by regulator_force_disable() */ -static int _regulator_force_disable(struct regulator_dev *rdev, - struct regulator_dev **supply_rdev_ptr) +static int _regulator_force_disable(struct regulator_dev *rdev) { int ret = 0; @@ -1497,10 +1522,6 @@ static int _regulator_force_disable(struct regulator_dev *rdev, REGULATOR_EVENT_DISABLE, NULL); } - /* decrease our supplies ref count and disable if required */ - *supply_rdev_ptr = rdev->supply; - - rdev->use_count = 0; return ret; } @@ -1516,16 +1537,16 @@ static int _regulator_force_disable(struct regulator_dev *rdev, int regulator_force_disable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; - struct regulator_dev *supply_rdev = NULL; int ret; mutex_lock(&rdev->mutex); regulator->uA_load = 0; - ret = _regulator_force_disable(rdev, &supply_rdev); + ret = _regulator_force_disable(regulator->rdev); mutex_unlock(&rdev->mutex); - if (supply_rdev) - regulator_disable(get_device_regulator(rdev_get_dev(supply_rdev))); + if (rdev->supply) + while (rdev->open_count--) + regulator_disable(rdev->supply); return ret; } @@ -2136,7 +2157,7 @@ int regulator_set_optimum_mode(struct regulator *regulator, int uA_load) /* get input voltage */ input_uV = 0; if (rdev->supply) - input_uV = _regulator_get_voltage(rdev->supply); + input_uV = regulator_get_voltage(rdev->supply); if (input_uV <= 0) input_uV = rdev->constraints->input_uV; if (input_uV <= 0) { @@ -2206,17 +2227,8 @@ EXPORT_SYMBOL_GPL(regulator_unregister_notifier); static void _notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) { - struct regulator_dev *_rdev; - /* call rdev chain first */ blocking_notifier_call_chain(&rdev->notifier, event, NULL); - - /* now notify regulator we supply */ - list_for_each_entry(_rdev, &rdev->supply_list, slist) { - mutex_lock(&_rdev->mutex); - _notifier_call_chain(_rdev, event, data); - mutex_unlock(&_rdev->mutex); - } } /** @@ -2264,6 +2276,13 @@ err: } EXPORT_SYMBOL_GPL(regulator_bulk_get); +static void regulator_bulk_enable_async(void *data, async_cookie_t cookie) +{ + struct regulator_bulk_data *bulk = data; + + bulk->ret = regulator_enable(bulk->consumer); +} + /** * regulator_bulk_enable - enable multiple regulator consumers * @@ -2279,21 +2298,33 @@ EXPORT_SYMBOL_GPL(regulator_bulk_get); int regulator_bulk_enable(int num_consumers, struct regulator_bulk_data *consumers) { + LIST_HEAD(async_domain); int i; - int ret; + int ret = 0; + + for (i = 0; i < num_consumers; i++) + async_schedule_domain(regulator_bulk_enable_async, + &consumers[i], &async_domain); + + async_synchronize_full_domain(&async_domain); + /* If any consumer failed we need to unwind any that succeeded */ for (i = 0; i < num_consumers; i++) { - ret = regulator_enable(consumers[i].consumer); - if (ret != 0) + if (consumers[i].ret != 0) { + ret = consumers[i].ret; goto err; + } } return 0; err: - pr_err("Failed to enable %s: %d\n", consumers[i].supply, ret); - for (--i; i >= 0; --i) - regulator_disable(consumers[i].consumer); + for (i = 0; i < num_consumers; i++) + if (consumers[i].ret == 0) + regulator_disable(consumers[i].consumer); + else + pr_err("Failed to enable %s: %d\n", + consumers[i].supply, consumers[i].ret); return ret; } @@ -2589,9 +2620,7 @@ struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, rdev->owner = regulator_desc->owner; rdev->desc = regulator_desc; INIT_LIST_HEAD(&rdev->consumer_list); - INIT_LIST_HEAD(&rdev->supply_list); INIT_LIST_HEAD(&rdev->list); - INIT_LIST_HEAD(&rdev->slist); BLOCKING_INIT_NOTIFIER_HEAD(&rdev->notifier); /* preform any regulator specific init */ @@ -2672,6 +2701,7 @@ unset_supplies: unset_regulator_supplies(rdev); scrub: + kfree(rdev->constraints); device_unregister(&rdev->dev); /* device core frees rdev */ rdev = ERR_PTR(ret); @@ -2703,7 +2733,7 @@ void regulator_unregister(struct regulator_dev *rdev) unset_regulator_supplies(rdev); list_del(&rdev->list); if (rdev->supply) - sysfs_remove_link(&rdev->dev.kobj, "supply"); + regulator_put(rdev->supply); device_unregister(&rdev->dev); kfree(rdev->constraints); mutex_unlock(®ulator_list_mutex); diff --git a/drivers/regulator/dummy.c b/drivers/regulator/dummy.c index c7410bde7b5d..f6ef6694ab98 100644 --- a/drivers/regulator/dummy.c +++ b/drivers/regulator/dummy.c @@ -36,6 +36,29 @@ static struct regulator_desc dummy_desc = { .ops = &dummy_ops, }; +static int __devinit dummy_regulator_probe(struct platform_device *pdev) +{ + int ret; + + dummy_regulator_rdev = regulator_register(&dummy_desc, NULL, + &dummy_initdata, NULL); + if (IS_ERR(dummy_regulator_rdev)) { + ret = PTR_ERR(dummy_regulator_rdev); + pr_err("Failed to register regulator: %d\n", ret); + return ret; + } + + return 0; +} + +static struct platform_driver dummy_regulator_driver = { + .probe = dummy_regulator_probe, + .driver = { + .name = "reg-dummy", + .owner = THIS_MODULE, + }, +}; + static struct platform_device *dummy_pdev; void __init regulator_dummy_init(void) @@ -55,12 +78,9 @@ void __init regulator_dummy_init(void) return; } - dummy_regulator_rdev = regulator_register(&dummy_desc, NULL, - &dummy_initdata, NULL); - if (IS_ERR(dummy_regulator_rdev)) { - ret = PTR_ERR(dummy_regulator_rdev); - pr_err("Failed to register regulator: %d\n", ret); + ret = platform_driver_register(&dummy_regulator_driver); + if (ret != 0) { + pr_err("Failed to register dummy regulator driver: %d\n", ret); platform_device_unregister(dummy_pdev); - return; } } diff --git a/drivers/regulator/tps65910-regulator.c b/drivers/regulator/tps65910-regulator.c index 55dd4e6650db..66d2d60b436a 100644 --- a/drivers/regulator/tps65910-regulator.c +++ b/drivers/regulator/tps65910-regulator.c @@ -49,7 +49,6 @@ #define TPS65911_REG_LDO7 11 #define TPS65911_REG_LDO8 12 -#define TPS65910_NUM_REGULATOR 13 #define TPS65910_SUPPLY_STATE_ENABLED 0x1 /* supported VIO voltages in milivolts */ @@ -264,11 +263,12 @@ static struct tps_info tps65911_regs[] = { }; struct tps65910_reg { - struct regulator_desc desc[TPS65910_NUM_REGULATOR]; + struct regulator_desc *desc; struct tps65910 *mfd; - struct regulator_dev *rdev[TPS65910_NUM_REGULATOR]; - struct tps_info *info[TPS65910_NUM_REGULATOR]; + struct regulator_dev **rdev; + struct tps_info **info; struct mutex mutex; + int num_regulators; int mode; int (*get_ctrl_reg)(int); }; @@ -759,8 +759,13 @@ static int tps65910_list_voltage_dcdc(struct regulator_dev *dev, mult = (selector / VDD1_2_NUM_VOLTS) + 1; volt = VDD1_2_MIN_VOLT + (selector % VDD1_2_NUM_VOLTS) * VDD1_2_OFFSET; + break; case TPS65911_REG_VDDCTRL: volt = VDDCTRL_MIN_VOLT + (selector * VDDCTRL_OFFSET); + break; + default: + BUG(); + return -EINVAL; } return volt * 100 * mult; @@ -897,16 +902,42 @@ static __devinit int tps65910_probe(struct platform_device *pdev) switch(tps65910_chip_id(tps65910)) { case TPS65910: pmic->get_ctrl_reg = &tps65910_get_ctrl_register; + pmic->num_regulators = ARRAY_SIZE(tps65910_regs); info = tps65910_regs; + break; case TPS65911: pmic->get_ctrl_reg = &tps65911_get_ctrl_register; + pmic->num_regulators = ARRAY_SIZE(tps65911_regs); info = tps65911_regs; + break; default: pr_err("Invalid tps chip version\n"); + kfree(pmic); return -ENODEV; } - for (i = 0; i < TPS65910_NUM_REGULATOR; i++, info++, reg_data++) { + pmic->desc = kcalloc(pmic->num_regulators, + sizeof(struct regulator_desc), GFP_KERNEL); + if (!pmic->desc) { + err = -ENOMEM; + goto err_free_pmic; + } + + pmic->info = kcalloc(pmic->num_regulators, + sizeof(struct tps_info *), GFP_KERNEL); + if (!pmic->info) { + err = -ENOMEM; + goto err_free_desc; + } + + pmic->rdev = kcalloc(pmic->num_regulators, + sizeof(struct regulator_dev *), GFP_KERNEL); + if (!pmic->rdev) { + err = -ENOMEM; + goto err_free_info; + } + + for (i = 0; i < pmic->num_regulators; i++, info++, reg_data++) { /* Register the regulators */ pmic->info[i] = info; @@ -938,7 +969,7 @@ static __devinit int tps65910_probe(struct platform_device *pdev) "failed to register %s regulator\n", pdev->name); err = PTR_ERR(rdev); - goto err; + goto err_unregister_regulator; } /* Save regulator for cleanup */ @@ -946,23 +977,31 @@ static __devinit int tps65910_probe(struct platform_device *pdev) } return 0; -err: +err_unregister_regulator: while (--i >= 0) regulator_unregister(pmic->rdev[i]); - + kfree(pmic->rdev); +err_free_info: + kfree(pmic->info); +err_free_desc: + kfree(pmic->desc); +err_free_pmic: kfree(pmic); return err; } static int __devexit tps65910_remove(struct platform_device *pdev) { - struct tps65910_reg *tps65910_reg = platform_get_drvdata(pdev); + struct tps65910_reg *pmic = platform_get_drvdata(pdev); int i; - for (i = 0; i < TPS65910_NUM_REGULATOR; i++) - regulator_unregister(tps65910_reg->rdev[i]); + for (i = 0; i < pmic->num_regulators; i++) + regulator_unregister(pmic->rdev[i]); - kfree(tps65910_reg); + kfree(pmic->rdev); + kfree(pmic->info); + kfree(pmic->desc); + kfree(pmic); return 0; } diff --git a/drivers/regulator/twl-regulator.c b/drivers/regulator/twl-regulator.c index 87fe0f75a56e..ee8747f4fa08 100644 --- a/drivers/regulator/twl-regulator.c +++ b/drivers/regulator/twl-regulator.c @@ -835,8 +835,8 @@ static struct regulator_ops twlsmps_ops = { remap_conf) \ TWL_FIXED_LDO(label, offset, mVolts, num, turnon_delay, \ remap_conf, TWL4030, twl4030fixed_ops) -#define TWL6030_FIXED_LDO(label, offset, mVolts, num, turnon_delay) \ - TWL_FIXED_LDO(label, offset, mVolts, num, turnon_delay, \ +#define TWL6030_FIXED_LDO(label, offset, mVolts, turnon_delay) \ + TWL_FIXED_LDO(label, offset, mVolts, 0x0, turnon_delay, \ 0x0, TWL6030, twl6030fixed_ops) #define TWL4030_ADJUSTABLE_LDO(label, offset, num, turnon_delay, remap_conf) { \ @@ -856,24 +856,22 @@ static struct regulator_ops twlsmps_ops = { }, \ } -#define TWL6030_ADJUSTABLE_LDO(label, offset, min_mVolts, max_mVolts, num) { \ +#define TWL6030_ADJUSTABLE_LDO(label, offset, min_mVolts, max_mVolts) { \ .base = offset, \ - .id = num, \ .min_mV = min_mVolts, \ .max_mV = max_mVolts, \ .desc = { \ .name = #label, \ .id = TWL6030_REG_##label, \ - .n_voltages = (max_mVolts - min_mVolts)/100, \ + .n_voltages = (max_mVolts - min_mVolts)/100 + 1, \ .ops = &twl6030ldo_ops, \ .type = REGULATOR_VOLTAGE, \ .owner = THIS_MODULE, \ }, \ } -#define TWL6025_ADJUSTABLE_LDO(label, offset, min_mVolts, max_mVolts, num) { \ +#define TWL6025_ADJUSTABLE_LDO(label, offset, min_mVolts, max_mVolts) { \ .base = offset, \ - .id = num, \ .min_mV = min_mVolts, \ .max_mV = max_mVolts, \ .desc = { \ @@ -903,9 +901,8 @@ static struct regulator_ops twlsmps_ops = { }, \ } -#define TWL6030_FIXED_RESOURCE(label, offset, num, turnon_delay) { \ +#define TWL6030_FIXED_RESOURCE(label, offset, turnon_delay) { \ .base = offset, \ - .id = num, \ .delay = turnon_delay, \ .desc = { \ .name = #label, \ @@ -916,9 +913,8 @@ static struct regulator_ops twlsmps_ops = { }, \ } -#define TWL6025_ADJUSTABLE_SMPS(label, offset, num) { \ +#define TWL6025_ADJUSTABLE_SMPS(label, offset) { \ .base = offset, \ - .id = num, \ .min_mV = 600, \ .max_mV = 2100, \ .desc = { \ @@ -961,32 +957,32 @@ static struct twlreg_info twl_regs[] = { /* 6030 REG with base as PMC Slave Misc : 0x0030 */ /* Turnon-delay and remap configuration values for 6030 are not verified since the specification is not public */ - TWL6030_ADJUSTABLE_LDO(VAUX1_6030, 0x54, 1000, 3300, 1), - TWL6030_ADJUSTABLE_LDO(VAUX2_6030, 0x58, 1000, 3300, 2), - TWL6030_ADJUSTABLE_LDO(VAUX3_6030, 0x5c, 1000, 3300, 3), - TWL6030_ADJUSTABLE_LDO(VMMC, 0x68, 1000, 3300, 4), - TWL6030_ADJUSTABLE_LDO(VPP, 0x6c, 1000, 3300, 5), - TWL6030_ADJUSTABLE_LDO(VUSIM, 0x74, 1000, 3300, 7), - TWL6030_FIXED_LDO(VANA, 0x50, 2100, 15, 0), - TWL6030_FIXED_LDO(VCXIO, 0x60, 1800, 16, 0), - TWL6030_FIXED_LDO(VDAC, 0x64, 1800, 17, 0), - TWL6030_FIXED_LDO(VUSB, 0x70, 3300, 18, 0), - TWL6030_FIXED_RESOURCE(CLK32KG, 0x8C, 48, 0), + TWL6030_ADJUSTABLE_LDO(VAUX1_6030, 0x54, 1000, 3300), + TWL6030_ADJUSTABLE_LDO(VAUX2_6030, 0x58, 1000, 3300), + TWL6030_ADJUSTABLE_LDO(VAUX3_6030, 0x5c, 1000, 3300), + TWL6030_ADJUSTABLE_LDO(VMMC, 0x68, 1000, 3300), + TWL6030_ADJUSTABLE_LDO(VPP, 0x6c, 1000, 3300), + TWL6030_ADJUSTABLE_LDO(VUSIM, 0x74, 1000, 3300), + TWL6030_FIXED_LDO(VANA, 0x50, 2100, 0), + TWL6030_FIXED_LDO(VCXIO, 0x60, 1800, 0), + TWL6030_FIXED_LDO(VDAC, 0x64, 1800, 0), + TWL6030_FIXED_LDO(VUSB, 0x70, 3300, 0), + TWL6030_FIXED_RESOURCE(CLK32KG, 0x8C, 0), /* 6025 are renamed compared to 6030 versions */ - TWL6025_ADJUSTABLE_LDO(LDO2, 0x54, 1000, 3300, 1), - TWL6025_ADJUSTABLE_LDO(LDO4, 0x58, 1000, 3300, 2), - TWL6025_ADJUSTABLE_LDO(LDO3, 0x5c, 1000, 3300, 3), - TWL6025_ADJUSTABLE_LDO(LDO5, 0x68, 1000, 3300, 4), - TWL6025_ADJUSTABLE_LDO(LDO1, 0x6c, 1000, 3300, 5), - TWL6025_ADJUSTABLE_LDO(LDO7, 0x74, 1000, 3300, 7), - TWL6025_ADJUSTABLE_LDO(LDO6, 0x60, 1000, 3300, 16), - TWL6025_ADJUSTABLE_LDO(LDOLN, 0x64, 1000, 3300, 17), - TWL6025_ADJUSTABLE_LDO(LDOUSB, 0x70, 1000, 3300, 18), - - TWL6025_ADJUSTABLE_SMPS(SMPS3, 0x34, 1), - TWL6025_ADJUSTABLE_SMPS(SMPS4, 0x10, 2), - TWL6025_ADJUSTABLE_SMPS(VIO, 0x16, 3), + TWL6025_ADJUSTABLE_LDO(LDO2, 0x54, 1000, 3300), + TWL6025_ADJUSTABLE_LDO(LDO4, 0x58, 1000, 3300), + TWL6025_ADJUSTABLE_LDO(LDO3, 0x5c, 1000, 3300), + TWL6025_ADJUSTABLE_LDO(LDO5, 0x68, 1000, 3300), + TWL6025_ADJUSTABLE_LDO(LDO1, 0x6c, 1000, 3300), + TWL6025_ADJUSTABLE_LDO(LDO7, 0x74, 1000, 3300), + TWL6025_ADJUSTABLE_LDO(LDO6, 0x60, 1000, 3300), + TWL6025_ADJUSTABLE_LDO(LDOLN, 0x64, 1000, 3300), + TWL6025_ADJUSTABLE_LDO(LDOUSB, 0x70, 1000, 3300), + + TWL6025_ADJUSTABLE_SMPS(SMPS3, 0x34), + TWL6025_ADJUSTABLE_SMPS(SMPS4, 0x10), + TWL6025_ADJUSTABLE_SMPS(VIO, 0x16), }; static u8 twl_get_smps_offset(void) diff --git a/drivers/regulator/wm831x-dcdc.c b/drivers/regulator/wm831x-dcdc.c index a0982e809851..bd3531d8b2ac 100644 --- a/drivers/regulator/wm831x-dcdc.c +++ b/drivers/regulator/wm831x-dcdc.c @@ -267,23 +267,6 @@ static int wm831x_buckv_select_min_voltage(struct regulator_dev *rdev, return vsel; } -static int wm831x_buckv_select_max_voltage(struct regulator_dev *rdev, - int min_uV, int max_uV) -{ - u16 vsel; - - if (max_uV < 600000 || max_uV > 1800000) - return -EINVAL; - - vsel = ((max_uV - 600000) / 12500) + 8; - - if (wm831x_buckv_list_voltage(rdev, vsel) < min_uV || - wm831x_buckv_list_voltage(rdev, vsel) < max_uV) - return -EINVAL; - - return vsel; -} - static int wm831x_buckv_set_dvs(struct regulator_dev *rdev, int state) { struct wm831x_dcdc *dcdc = rdev_get_drvdata(rdev); @@ -338,28 +321,23 @@ static int wm831x_buckv_set_voltage(struct regulator_dev *rdev, if (ret < 0) return ret; - /* Set the high voltage as the DVS voltage. This is optimised - * for CPUfreq usage, most processors will keep the maximum - * voltage constant and lower the minimum with the frequency. */ - vsel = wm831x_buckv_select_max_voltage(rdev, min_uV, max_uV); - if (vsel < 0) { - /* This should never happen - at worst the same vsel - * should be chosen */ - WARN_ON(vsel < 0); - return 0; + /* + * If this VSEL is higher than the last one we've seen then + * remember it as the DVS VSEL. This is optimised for CPUfreq + * usage where we want to get to the highest voltage very + * quickly. + */ + if (vsel > dcdc->dvs_vsel) { + ret = wm831x_set_bits(wm831x, dvs_reg, + WM831X_DC1_DVS_VSEL_MASK, + dcdc->dvs_vsel); + if (ret == 0) + dcdc->dvs_vsel = vsel; + else + dev_warn(wm831x->dev, + "Failed to set DCDC DVS VSEL: %d\n", ret); } - /* Don't bother if it's the same VSEL we're already using */ - if (vsel == dcdc->on_vsel) - return 0; - - ret = wm831x_set_bits(wm831x, dvs_reg, WM831X_DC1_DVS_VSEL_MASK, vsel); - if (ret == 0) - dcdc->dvs_vsel = vsel; - else - dev_warn(wm831x->dev, "Failed to set DCDC DVS VSEL: %d\n", - ret); - return 0; } @@ -456,27 +434,6 @@ static __devinit void wm831x_buckv_dvs_init(struct wm831x_dcdc *dcdc, if (!pdata || !pdata->dvs_gpio) return; - switch (pdata->dvs_control_src) { - case 1: - ctrl = 2 << WM831X_DC1_DVS_SRC_SHIFT; - break; - case 2: - ctrl = 3 << WM831X_DC1_DVS_SRC_SHIFT; - break; - default: - dev_err(wm831x->dev, "Invalid DVS control source %d for %s\n", - pdata->dvs_control_src, dcdc->name); - return; - } - - ret = wm831x_set_bits(wm831x, dcdc->base + WM831X_DCDC_DVS_CONTROL, - WM831X_DC1_DVS_SRC_MASK, ctrl); - if (ret < 0) { - dev_err(wm831x->dev, "Failed to set %s DVS source: %d\n", - dcdc->name, ret); - return; - } - ret = gpio_request(pdata->dvs_gpio, "DCDC DVS"); if (ret < 0) { dev_err(wm831x->dev, "Failed to get %s DVS GPIO: %d\n", @@ -498,17 +455,57 @@ static __devinit void wm831x_buckv_dvs_init(struct wm831x_dcdc *dcdc, } dcdc->dvs_gpio = pdata->dvs_gpio; + + switch (pdata->dvs_control_src) { + case 1: + ctrl = 2 << WM831X_DC1_DVS_SRC_SHIFT; + break; + case 2: + ctrl = 3 << WM831X_DC1_DVS_SRC_SHIFT; + break; + default: + dev_err(wm831x->dev, "Invalid DVS control source %d for %s\n", + pdata->dvs_control_src, dcdc->name); + return; + } + + /* If DVS_VSEL is set to the minimum value then raise it to ON_VSEL + * to make bootstrapping a bit smoother. + */ + if (!dcdc->dvs_vsel) { + ret = wm831x_set_bits(wm831x, + dcdc->base + WM831X_DCDC_DVS_CONTROL, + WM831X_DC1_DVS_VSEL_MASK, dcdc->on_vsel); + if (ret == 0) + dcdc->dvs_vsel = dcdc->on_vsel; + else + dev_warn(wm831x->dev, "Failed to set DVS_VSEL: %d\n", + ret); + } + + ret = wm831x_set_bits(wm831x, dcdc->base + WM831X_DCDC_DVS_CONTROL, + WM831X_DC1_DVS_SRC_MASK, ctrl); + if (ret < 0) { + dev_err(wm831x->dev, "Failed to set %s DVS source: %d\n", + dcdc->name, ret); + } } static __devinit int wm831x_buckv_probe(struct platform_device *pdev) { struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent); struct wm831x_pdata *pdata = wm831x->dev->platform_data; - int id = pdev->id % ARRAY_SIZE(pdata->dcdc); + int id; struct wm831x_dcdc *dcdc; struct resource *res; int ret, irq; + if (pdata && pdata->wm831x_num) + id = (pdata->wm831x_num * 10) + 1; + else + id = 0; + id = pdev->id - id; + dev_dbg(&pdev->dev, "Probing DCDC%d\n", id + 1); if (pdata == NULL || pdata->dcdc[id] == NULL) @@ -545,7 +542,7 @@ static __devinit int wm831x_buckv_probe(struct platform_device *pdev) } dcdc->on_vsel = ret & WM831X_DC1_ON_VSEL_MASK; - ret = wm831x_reg_read(wm831x, dcdc->base + WM831X_DCDC_ON_CONFIG); + ret = wm831x_reg_read(wm831x, dcdc->base + WM831X_DCDC_DVS_CONTROL); if (ret < 0) { dev_err(wm831x->dev, "Failed to read DVS VSEL: %d\n", ret); goto err; @@ -709,11 +706,17 @@ static __devinit int wm831x_buckp_probe(struct platform_device *pdev) { struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent); struct wm831x_pdata *pdata = wm831x->dev->platform_data; - int id = pdev->id % ARRAY_SIZE(pdata->dcdc); + int id; struct wm831x_dcdc *dcdc; struct resource *res; int ret, irq; + if (pdata && pdata->wm831x_num) + id = (pdata->wm831x_num * 10) + 1; + else + id = 0; + id = pdev->id - id; + dev_dbg(&pdev->dev, "Probing DCDC%d\n", id + 1); if (pdata == NULL || pdata->dcdc[id] == NULL) @@ -1046,3 +1049,4 @@ MODULE_DESCRIPTION("WM831x DC-DC convertor driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:wm831x-buckv"); MODULE_ALIAS("platform:wm831x-buckp"); +MODULE_ALIAS("platform:wm831x-epe"); diff --git a/drivers/regulator/wm831x-ldo.c b/drivers/regulator/wm831x-ldo.c index 2220cf8defb1..6709710a059e 100644 --- a/drivers/regulator/wm831x-ldo.c +++ b/drivers/regulator/wm831x-ldo.c @@ -310,11 +310,17 @@ static __devinit int wm831x_gp_ldo_probe(struct platform_device *pdev) { struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent); struct wm831x_pdata *pdata = wm831x->dev->platform_data; - int id = pdev->id % ARRAY_SIZE(pdata->ldo); + int id; struct wm831x_ldo *ldo; struct resource *res; int ret, irq; + if (pdata && pdata->wm831x_num) + id = (pdata->wm831x_num * 10) + 1; + else + id = 0; + id = pdev->id - id; + dev_dbg(&pdev->dev, "Probing LDO%d\n", id + 1); if (pdata == NULL || pdata->ldo[id] == NULL) @@ -574,11 +580,17 @@ static __devinit int wm831x_aldo_probe(struct platform_device *pdev) { struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent); struct wm831x_pdata *pdata = wm831x->dev->platform_data; - int id = pdev->id % ARRAY_SIZE(pdata->ldo); + int id; struct wm831x_ldo *ldo; struct resource *res; int ret, irq; + if (pdata && pdata->wm831x_num) + id = (pdata->wm831x_num * 10) + 1; + else + id = 0; + id = pdev->id - id; + dev_dbg(&pdev->dev, "Probing LDO%d\n", id + 1); if (pdata == NULL || pdata->ldo[id] == NULL) @@ -764,11 +776,18 @@ static __devinit int wm831x_alive_ldo_probe(struct platform_device *pdev) { struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent); struct wm831x_pdata *pdata = wm831x->dev->platform_data; - int id = pdev->id % ARRAY_SIZE(pdata->ldo); + int id; struct wm831x_ldo *ldo; struct resource *res; int ret; + if (pdata && pdata->wm831x_num) + id = (pdata->wm831x_num * 10) + 1; + else + id = 0; + id = pdev->id - id; + + dev_dbg(&pdev->dev, "Probing LDO%d\n", id + 1); if (pdata == NULL || pdata->ldo[id] == NULL) diff --git a/drivers/regulator/wm8994-regulator.c b/drivers/regulator/wm8994-regulator.c index 35b2958d5106..1a6a690f24db 100644 --- a/drivers/regulator/wm8994-regulator.c +++ b/drivers/regulator/wm8994-regulator.c @@ -43,7 +43,7 @@ static int wm8994_ldo_enable(struct regulator_dev *rdev) if (!ldo->enable) return 0; - gpio_set_value(ldo->enable, 1); + gpio_set_value_cansleep(ldo->enable, 1); ldo->is_enabled = true; return 0; @@ -57,7 +57,7 @@ static int wm8994_ldo_disable(struct regulator_dev *rdev) if (!ldo->enable) return -EINVAL; - gpio_set_value(ldo->enable, 0); + gpio_set_value_cansleep(ldo->enable, 0); ldo->is_enabled = false; return 0; diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index f441726ddf2b..86b0735e6aa0 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -36,9 +36,6 @@ config WATCHDOG_CORE and gives them the /dev/watchdog interface (and later also the sysfs interface). - To compile this driver as a module, choose M here: the module will - be called watchdog. - config WATCHDOG_NOWAYOUT bool "Disable watchdog shutdown on close" help diff --git a/drivers/watchdog/nv_tco.c b/drivers/watchdog/nv_tco.c index afa78a54711e..809f41c30c44 100644 --- a/drivers/watchdog/nv_tco.c +++ b/drivers/watchdog/nv_tco.c @@ -458,7 +458,15 @@ static int __devexit nv_tco_remove(struct platform_device *dev) static void nv_tco_shutdown(struct platform_device *dev) { + u32 val; + tco_timer_stop(); + + /* Some BIOSes fail the POST (once) if the NO_REBOOT flag is not + * unset during shutdown. */ + pci_read_config_dword(tco_pci, MCP51_SMBUS_SETUP_B, &val); + val &= ~MCP51_SMBUS_SETUP_B_TCO_REBOOT; + pci_write_config_dword(tco_pci, MCP51_SMBUS_SETUP_B, val); } static struct platform_driver nv_tco_driver = { diff --git a/drivers/watchdog/shwdt.c b/drivers/watchdog/shwdt.c index db84f2322d1a..a267dc078daf 100644 --- a/drivers/watchdog/shwdt.c +++ b/drivers/watchdog/shwdt.c @@ -64,7 +64,7 @@ * misses its deadline, the kernel timer will allow the WDT to overflow. */ static int clock_division_ratio = WTCSR_CKS_4096; -#define next_ping_period(cks) msecs_to_jiffies(cks - 4) +#define next_ping_period(cks) (jiffies + msecs_to_jiffies(cks - 4)) static const struct watchdog_info sh_wdt_info; static struct platform_device *sh_wdt_dev; diff --git a/fs/9p/acl.c b/fs/9p/acl.c index e9cb57f07546..9a1d42630751 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -182,11 +182,11 @@ int v9fs_set_create_acl(struct dentry *dentry, return 0; } -int v9fs_acl_mode(struct inode *dir, mode_t *modep, +int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl) { int retval = 0; - mode_t mode = *modep; + umode_t mode = *modep; struct posix_acl *acl = NULL; if (!S_ISLNK(mode)) { @@ -319,7 +319,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; retval = posix_acl_equiv_mode(acl, &mode); if (retval < 0) goto err_out; diff --git a/fs/9p/acl.h b/fs/9p/acl.h index ddb7ae19d971..559556411965 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -20,7 +20,7 @@ extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type); extern int v9fs_acl_chmod(struct dentry *); extern int v9fs_set_create_acl(struct dentry *, struct posix_acl **, struct posix_acl **); -extern int v9fs_acl_mode(struct inode *dir, mode_t *modep, +extern int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl); #else #define v9fs_iop_get_acl NULL @@ -38,7 +38,7 @@ static inline int v9fs_set_create_acl(struct dentry *dentry, { return 0; } -static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep, +static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl) { diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 9a26dce5a99f..b6c8ed205192 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -206,7 +206,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, int err = 0; gid_t gid; int flags; - mode_t mode; + umode_t mode; char *name = NULL; struct file *filp; struct p9_qid qid; @@ -348,7 +348,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct p9_fid *fid = NULL, *dfid = NULL; gid_t gid; char *name; - mode_t mode; + umode_t mode; struct inode *inode; struct p9_qid qid; struct dentry *dir_dentry; @@ -751,7 +751,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, int err; gid_t gid; char *name; - mode_t mode; + umode_t mode; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; diff --git a/fs/block_dev.c b/fs/block_dev.c index f55aad4d1611..f28680553288 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -552,6 +552,7 @@ struct block_device *bdget(dev_t dev) if (inode->i_state & I_NEW) { bdev->bd_contains = NULL; + bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_block_size = (1 << inode->i_blkbits); bdev->bd_part_count = 0; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 65a735d8f6e4..4cc5c0164ed6 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -111,7 +111,6 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, int ret, size = 0; const char *name; char *value = NULL; - mode_t mode; if (acl) { ret = posix_acl_valid(acl); @@ -122,13 +121,11 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, switch (type) { case ACL_TYPE_ACCESS: - mode = inode->i_mode; name = POSIX_ACL_XATTR_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &mode); + ret = posix_acl_equiv_mode(acl, &inode->i_mode); if (ret < 0) return ret; - inode->i_mode = mode; } ret = 0; break; @@ -222,19 +219,16 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, } if (IS_POSIXACL(dir) && acl) { - mode_t mode = inode->i_mode; - if (S_ISDIR(inode->i_mode)) { ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_DEFAULT); if (ret) goto failed; } - ret = posix_acl_create(&acl, GFP_NOFS, &mode); + ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (ret < 0) return ret; - inode->i_mode = mode; if (ret > 0) { /* we need an acl */ ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 13e6255182e3..ae762dab37f8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3993,12 +3993,19 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *sub_root = root; struct btrfs_key location; int index; - int ret; + int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ret = btrfs_inode_by_name(dir, dentry, &location); + if (unlikely(d_need_lookup(dentry))) { + memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); + kfree(dentry->d_fsdata); + dentry->d_fsdata = NULL; + d_clear_need_lookup(dentry); + } else { + ret = btrfs_inode_by_name(dir, dentry, &location); + } if (ret < 0) return ERR_PTR(ret); @@ -4053,6 +4060,12 @@ static int btrfs_dentry_delete(const struct dentry *dentry) return 0; } +static void btrfs_dentry_release(struct dentry *dentry) +{ + if (dentry->d_fsdata) + kfree(dentry->d_fsdata); +} + static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { @@ -4075,6 +4088,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, struct btrfs_path *path; struct list_head ins_list; struct list_head del_list; + struct qstr q; int ret; struct extent_buffer *leaf; int slot; @@ -4164,6 +4178,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, while (di_cur < di_total) { struct btrfs_key location; + struct dentry *tmp; if (verify_dir_item(root, leaf, di)) break; @@ -4184,6 +4199,33 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; btrfs_dir_item_key_to_cpu(leaf, di, &location); + q.name = name_ptr; + q.len = name_len; + q.hash = full_name_hash(q.name, q.len); + tmp = d_lookup(filp->f_dentry, &q); + if (!tmp) { + struct btrfs_key *newkey; + + newkey = kzalloc(sizeof(struct btrfs_key), + GFP_NOFS); + if (!newkey) + goto no_dentry; + tmp = d_alloc(filp->f_dentry, &q); + if (!tmp) { + kfree(newkey); + dput(tmp); + goto no_dentry; + } + memcpy(newkey, &location, + sizeof(struct btrfs_key)); + tmp->d_fsdata = newkey; + tmp->d_flags |= DCACHE_NEED_LOOKUP; + d_rehash(tmp); + dput(tmp); + } else { + dput(tmp); + } +no_dentry: /* is this a reference to our own snapshot? If so * skip it */ @@ -7430,4 +7472,5 @@ static const struct inode_operations btrfs_symlink_inode_operations = { const struct dentry_operations btrfs_dentry_operations = { .d_delete = btrfs_dentry_delete, + .d_release = btrfs_dentry_release, }; diff --git a/fs/dcache.c b/fs/dcache.c index b05aac3a8cfc..2347cdb15abb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -301,6 +301,27 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) return parent; } +/* + * Unhash a dentry without inserting an RCU walk barrier or checking that + * dentry->d_lock is locked. The caller must take care of that, if + * appropriate. + */ +static void __d_shrink(struct dentry *dentry) +{ + if (!d_unhashed(dentry)) { + struct hlist_bl_head *b; + if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + b = &dentry->d_sb->s_anon; + else + b = d_hash(dentry->d_parent, dentry->d_name.hash); + + hlist_bl_lock(b); + __hlist_bl_del(&dentry->d_hash); + dentry->d_hash.pprev = NULL; + hlist_bl_unlock(b); + } +} + /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -319,17 +340,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { - struct hlist_bl_head *b; - if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) - b = &dentry->d_sb->s_anon; - else - b = d_hash(dentry->d_parent, dentry->d_name.hash); - - hlist_bl_lock(b); - __hlist_bl_del(&dentry->d_hash); - dentry->d_hash.pprev = NULL; - hlist_bl_unlock(b); - + __d_shrink(dentry); dentry_rcuwalk_barrier(dentry); } } @@ -828,44 +839,24 @@ EXPORT_SYMBOL(shrink_dcache_sb); static void shrink_dcache_for_umount_subtree(struct dentry *dentry) { struct dentry *parent; - unsigned detached = 0; BUG_ON(!IS_ROOT(dentry)); - /* detach this root from the system */ - spin_lock(&dentry->d_lock); - dentry_lru_del(dentry); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - for (;;) { /* descend to the first leaf in the current subtree */ - while (!list_empty(&dentry->d_subdirs)) { - struct dentry *loop; - - /* this is a branch with children - detach all of them - * from the system in one go */ - spin_lock(&dentry->d_lock); - list_for_each_entry(loop, &dentry->d_subdirs, - d_u.d_child) { - spin_lock_nested(&loop->d_lock, - DENTRY_D_LOCK_NESTED); - dentry_lru_del(loop); - __d_drop(loop); - spin_unlock(&loop->d_lock); - } - spin_unlock(&dentry->d_lock); - - /* move to the first child */ + while (!list_empty(&dentry->d_subdirs)) dentry = list_entry(dentry->d_subdirs.next, struct dentry, d_u.d_child); - } /* consume the dentries from this leaf up through its parents * until we find one with children or run out altogether */ do { struct inode *inode; + /* detach from the system */ + dentry_lru_del(dentry); + __d_shrink(dentry); + if (dentry->d_count != 0) { printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%s}" @@ -886,14 +877,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) list_del(&dentry->d_u.d_child); } else { parent = dentry->d_parent; - spin_lock(&parent->d_lock); parent->d_count--; list_del(&dentry->d_u.d_child); - spin_unlock(&parent->d_lock); } - detached++; - inode = dentry->d_inode; if (inode) { dentry->d_inode = NULL; @@ -938,9 +925,7 @@ void shrink_dcache_for_umount(struct super_block *sb) dentry = sb->s_root; sb->s_root = NULL; - spin_lock(&dentry->d_lock); dentry->d_count--; - spin_unlock(&dentry->d_lock); shrink_dcache_for_umount_subtree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 52c053763942..35d6a3cfd9ff 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -194,12 +194,10 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); if (error == 0) @@ -253,16 +251,14 @@ ext2_init_acl(struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - mode_t mode = inode->i_mode; if (S_ISDIR(inode->i_mode)) { error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_KERNEL, &mode); + error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (error < 0) return error; - inode->i_mode = mode; if (error > 0) { /* This is an extended ACL */ error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 6c29bf0df04a..3091f62e55b6 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -199,12 +199,10 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); if (error == 0) @@ -261,19 +259,16 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - mode_t mode = inode->i_mode; - if (S_ISDIR(inode->i_mode)) { error = ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT, acl); if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_NOFS, &mode); + error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (error < 0) return error; - inode->i_mode = mode; if (error > 0) { /* This is an extended ACL */ error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 04109460ba9e..56fd8f865930 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o + mmp.o indirect.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index dca2d1ded931..a5c29bb3b835 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -198,12 +198,10 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); if (error == 0) @@ -259,19 +257,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - mode_t mode = inode->i_mode; - if (S_ISDIR(inode->i_mode)) { error = ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, acl); if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_NOFS, &mode); + error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (error < 0) return error; - inode->i_mode = mode; if (error > 0) { /* This is an extended ACL */ error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 264f6949511e..f8224adf496e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -620,3 +620,51 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } +/** + * ext4_inode_to_goal_block - return a hint for block allocation + * @inode: inode for block allocation + * + * Return the ideal location to start allocating blocks for a + * newly created inode. + */ +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_group_t block_group; + ext4_grpblk_t colour; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour; +} + diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index fac90f3fba80..8efb2f0a3447 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -246,3 +246,24 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, return 1; } +int ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, __le32 *p, unsigned int max) +{ + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + __le32 *bref = p; + unsigned int blk; + + while (bref < p+max) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { + es->s_last_error_block = cpu_to_le64(blk); + ext4_error_inode(inode, function, line, blk, + "invalid block"); + return -EIO; + } + } + return 0; +} + diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fa44df879711..e717dfd2f2b4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -526,6 +526,7 @@ struct ext4_new_group_data { #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 /* * ioctl commands @@ -939,6 +940,8 @@ struct ext4_inode_info { #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le +extern void ext4_set_bits(void *bm, int cur, int len); + /* * Maximal mount counts between two filesystem checks */ @@ -1126,7 +1129,8 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; struct mutex s_orphan_lock; - struct mutex s_resize_lock; + unsigned long s_resize_flags; /* Flags indicating if there + is a resizer */ unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; @@ -1214,6 +1218,9 @@ struct ext4_sb_info { /* Kernel thread for multiple mount protection */ struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1743,6 +1750,7 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc); #define ext4_free_blocks_after_init(sb, group, desc) \ ext4_init_block_bitmap(sb, NULL, group, desc) +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, @@ -1793,7 +1801,7 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode, unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); -extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); @@ -1834,6 +1842,17 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); +extern void ext4_ind_truncate(struct inode *inode); + /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); @@ -1855,6 +1874,9 @@ extern int ext4_group_extend(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern void ext4_kvfree(void *ptr); extern void __ext4_error(struct super_block *, const char *, unsigned int, const char *, ...) __attribute__ ((format (printf, 4, 5))); @@ -2067,11 +2089,19 @@ struct ext4_group_info { * 5 free 8-block regions. */ }; -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 @@ -2123,6 +2153,19 @@ static inline void ext4_mark_super_dirty(struct super_block *sb) } /* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* * Inodes and files operations */ @@ -2151,6 +2194,8 @@ extern void ext4_exit_system_zone(void); extern int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); @@ -2230,6 +2275,10 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; +#define EXT4_RESIZING 0 +extern int ext4_resize_begin(struct super_block *sb); +extern void ext4_resize_end(struct super_block *sb); + #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f815cc81e7a2..57cf568a98ab 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -114,12 +114,6 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { - struct ext4_inode_info *ei = EXT4_I(inode); - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); int depth; if (path) { @@ -161,36 +155,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - /* - * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME - * block groups per flexgroup, reserve the first block - * group for directories and special files. Regular - * files will start at the second block group. This - * tends to speed up directory access and improves - * fsck times. - */ - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need take - * colour into account. - */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour + block; + return ext4_inode_to_goal_block(inode); } /* @@ -776,6 +741,16 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, logical, le32_to_cpu(curp->p_idx->ei_block)); return -EIO; } + + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + >= le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "eh_entries %d >= eh_max %d!", + le16_to_cpu(curp->p_hdr->eh_entries), + le16_to_cpu(curp->p_hdr->eh_max)); + return -EIO; + } + len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ @@ -805,13 +780,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); - if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) - > le16_to_cpu(curp->p_hdr->eh_max))) { - EXT4_ERROR_INODE(inode, - "logical %d == ei_block %d!", - logical, le32_to_cpu(curp->p_idx->ei_block)); - return -EIO; - } if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); return -EIO; @@ -1446,8 +1414,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) * ext4_ext_next_leaf_block: * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ -static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, - struct ext4_ext_path *path) +static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) { int depth; @@ -1757,7 +1724,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, goto merge; } -repeat: depth = ext_depth(inode); eh = path[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) @@ -1765,9 +1731,10 @@ repeat: /* probably next leaf has space for us? */ fex = EXT_LAST_EXTENT(eh); - next = ext4_ext_next_leaf_block(inode, path); - if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) - && next != EXT_MAX_BLOCKS) { + next = EXT_MAX_BLOCKS; + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) + next = ext4_ext_next_leaf_block(path); + if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %d\n", next); BUG_ON(npath != NULL); npath = ext4_ext_find_extent(inode, next, NULL); @@ -1779,7 +1746,7 @@ repeat: ext_debug("next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); path = npath; - goto repeat; + goto has_space; } ext_debug("next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -1839,7 +1806,7 @@ has_space: ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), - nearex, len, nearex + 1, nearex + 2); + nearex, len, nearex, nearex + 1); memmove(nearex + 1, nearex, len); path[depth].p_ext = nearex; } @@ -2052,7 +2019,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } /* - * ext4_ext_in_cache() + * ext4_ext_check_cache() * Checks to see if the given block is in the cache. * If it is, the cached extent is stored in the given * cache extent pointer. If the cached extent is a hole, @@ -2134,8 +2101,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, /* * ext4_ext_rm_idx: * removes index from the index block. - * It's used in truncate case only, thus all requests are for - * last index in the block only. */ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) @@ -2153,6 +2118,13 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path); if (err) return err; + + if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { + int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; + len *= sizeof(struct ext4_extent_idx); + memmove(path->p_idx, path->p_idx + 1, len); + } + le16_add_cpu(&path->p_hdr->eh_entries, -1); err = ext4_ext_dirty(handle, inode, path); if (err) @@ -2534,8 +2506,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2575,7 +2546,7 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - start, end); + start, EXT_MAX_BLOCKS - 1); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -3107,12 +3078,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct ext4_ext_path *path) { struct ext4_extent *ex; - struct ext4_extent_header *eh; int depth; int err = 0; depth = ext_depth(inode); - eh = path[depth].p_hdr; ex = path[depth].p_ext; ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" @@ -3357,8 +3326,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ - if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && - ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { + if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && + ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* @@ -3497,8 +3466,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_ext_mark_uninitialized(ex); - err = ext4_ext_remove_space(inode, map->m_lblk, - map->m_lblk + punched_out); + ext4_ext_invalidate_cache(inode); + + err = ext4_ext_rm_leaf(handle, inode, path, + map->m_lblk, map->m_lblk + punched_out); + + if (!err && path->p_hdr->eh_entries == 0) { + /* + * Punch hole freed all of this sub tree, + * so we need to correct eh_depth + */ + err = ext4_ext_get_access(handle, inode, path); + if (err == 0) { + ext_inode_hdr(inode)->eh_depth = 0; + ext_inode_hdr(inode)->eh_max = + cpu_to_le16(ext4_ext_space_root( + inode, 0)); + + err = ext4_ext_dirty( + handle, inode, path); + } + } goto out2; } @@ -3596,17 +3584,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); - if (err) - goto out2; - - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (!err) + err = ext4_ext_insert_extent(handle, inode, path, + &newex, flags); if (err) { + int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? + EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; /* free data blocks we just allocated */ /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), 0); + ext4_ext_get_actual_len(&newex), fb_flags); goto out2; } @@ -3699,7 +3688,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + err = ext4_ext_remove_space(inode, last_block); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -3835,7 +3824,7 @@ retry: blkbits) >> blkbits)) new_size = offset + len; else - new_size = (map.m_lblk + ret) << blkbits; + new_size = ((loff_t) map.m_lblk + ret) << blkbits; ext4_falloc_update_inode(inode, mode, new_size, (map.m_flags & EXT4_MAP_NEW)); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index da3bed3e0c29..036f78f7a1ef 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -129,15 +129,30 @@ static int ext4_sync_parent(struct inode *inode) { struct writeback_control wbc; struct dentry *dentry = NULL; + struct inode *next; int ret = 0; - while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { + if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + return 0; + inode = igrab(inode); + while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); - dentry = list_entry(inode->i_dentry.next, - struct dentry, d_alias); - if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) + dentry = NULL; + spin_lock(&inode->i_lock); + if (!list_empty(&inode->i_dentry)) { + dentry = list_first_entry(&inode->i_dentry, + struct dentry, d_alias); + dget(dentry); + } + spin_unlock(&inode->i_lock); + if (!dentry) break; - inode = dentry->d_parent->d_inode; + next = igrab(dentry->d_parent->d_inode); + dput(dentry); + if (!next) + break; + iput(inode); + inode = next; ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; @@ -148,6 +163,7 @@ static int ext4_sync_parent(struct inode *inode) if (ret) break; } + iput(inode); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 21bb2f61e502..9c63f273b550 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1287,7 +1287,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; - goto out; + goto err_out; } blk = ext4_inode_table(sb, gdp) + used_blks; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c new file mode 100644 index 000000000000..b8602cde5b5a --- /dev/null +++ b/fs/ext4/indirect.c @@ -0,0 +1,1482 @@ +/* + * linux/fs/ext4/indirect.c + * + * from + * + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card ([email protected]) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * ([email protected]), 1993, 1998 + */ + +#include <linux/module.h> +#include "ext4_jbd2.h" +#include "truncate.h" + +#include <trace/events/ext4.h> + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext4_block_to_path(struct inode *inode, + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples <key, p, bh> and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, + ext4_lblk_t *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_getblk(sb, le32_to_cpu(p->key)); + if (unlikely(!bh)) + goto failure; + + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + return ext4_inode_to_goal_block(inode); +} + +/** + * ext4_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. + */ +static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) +{ + ext4_fsblk_t goal; + + /* + * XXX need to get goal block from mballoc's data structures + */ + + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; +} + +/** + * ext4_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, + int blocks_to_boundary) +{ + unsigned int count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_blocks: multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: inode which needs allocated blocks + * @iblock: the logical block to start allocated at + * @goal: preferred physical block of allocation + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * @blks: number of desired blocks + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @err: on return it will store the error code + * + * This function will return the number of blocks allocated as + * requested by the passed-in parameters. + */ +static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) +{ + struct ext4_allocation_request ar; + int target, i; + unsigned long count = 0, blk_allocated = 0; + int index = 0; + ext4_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext4_new_meta_blocks(handle, inode, goal, + 0, &count, err); + if (*err) + goto failed_out; + + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + count %lu > %d!", + current_block, count, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); + break; + } + } + + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = target; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + /* enable in-core preallocation only for regular files */ + ar.flags = EXT4_MB_HINT_DATA; + + current_block = ext4_mb_new_blocks(handle, &ar, err); + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + ar.len %d > %d!", + current_block, ar.len, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += ar.len; + } +allocated: + /* total number of blocks allocated for direct blocks */ + ret = blk_allocated; + *err = 0; + return ret; +failed_out: + for (i = 0; i < index; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + return ret; +} + +/** + * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction + * @inode: owner + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @goal: preferred place for allocation + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ +static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext4_fsblk_t new_blocks[4]; + ext4_fsblk_t current_block; + + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -EIO; + goto failed; + } + + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (err) { + /* Don't brelse(bh) here; it's done in + * ext4_journal_forget() below */ + unlock_buffer(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if (n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i = 1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); + for (i = 1; i <= n ; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); + } + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + + ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); + + return err; +} + +/** + * ext4_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext4_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t block, Indirect *where, int num, + int blks) +{ + int i; + int err = 0; + ext4_fsblk_t current_block; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + */ + ext4_mark_inode_dirty(handle, inode); + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); + } + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), + blks, 0); + + return err; +} + +/* + * The ext4_ind_map_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_map_blocks(). + * + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + * + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. + */ +int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext4_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + int count = 0; + ext4_fsblk_t first_block = 0; + + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + depth = ext4_block_to_path(inode, map->m_lblk, offsets, + &blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + count++; + /*map more blocks*/ + while (count < map->m_len && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + goto cleanup; + + /* + * Okay, we need to do block allocation. + */ + goal = ext4_find_goal(inode, map->m_lblk, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext4_splice_branch(handle, inode, map->m_lblk, + partial, indirect_blks, count); + if (err) + goto cleanup; + + map->m_flags |= EXT4_MAP_NEW; + + ext4_update_inode_fsync_trans(handle, inode, 1); +got_it: + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; + if (count > blocks_to_boundary) + map->m_flags |= EXT4_MAP_BOUNDARY; + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +out: + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); + return err; +} + +/* + * O_DIRECT for ext3 (or indirect map) based files + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + int retries = 0; + + if (rw == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } + } + +retry: + if (rw == READ && ext4_should_dioread_nolock(inode)) + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL, NULL, 0); + else { + ret = blockdev_direct_IO(rw, iocb, inode, iov, + offset, nr_segs, ext4_get_block); + + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + ext4_truncate_failed_write(inode); + } + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a new block at @lblocks for non extent file based file + */ +int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); + int blk_bits; + + if (lblock < EXT4_NDIR_BLOCKS) + return 0; + + lblock -= EXT4_NDIR_BLOCKS; + + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = order_base_2(lblock); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; +} + +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block + */ + return DIV_ROUND_UP(nrblocks, + EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext4_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext4_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext4_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext4_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext4_find_shared(struct inode *inode, int depth, + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext4_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext4. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. + */ +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) +{ + __le32 *p; + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int err; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); + return 1; + } + + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + goto out_err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + goto out_err; + err = ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + if (unlikely(err)) + goto out_err; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out_err; + } + } + + for (p = first; p < last; p++) + *p = 0; + + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); + return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; +} + +/** + * ext4_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext4_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext4_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err = 0; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) + break; + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) + ext4_handle_dirty_metadata(handle, inode, this_bh); + else + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); + } +} + +/** + * ext4_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext4_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext4_fsblk_t nr; + __le32 *p; + + if (ext4_handle_is_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); + break; + } + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext4_free_branches(handle, inode, bh, + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); + brelse(bh); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (ext4_handle_is_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext4_mark_inode_dirty(handle, inode); + ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + } + + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext4_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext4_free_data(handle, inode, parent_bh, first, last); + } +} + +void ext4_ind_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n = 0; + ext4_lblk_t last_block, max_block; + unsigned blocksize = inode->i_sb->s_blocksize; + + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ + + last_block = (inode->i_size + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; + + if (last_block != max_block) { + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + } + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + + ext4_discard_preallocations(inode); + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext4 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + if (last_block == max_block) { + /* + * It is unnecessary to free any data blocks if last_block is + * equal to the indirect block limit. + */ + goto out_unlock; + } else if (n == 1) { /* direct blocks */ + ext4_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + +out_unlock: + up_write(&ei->i_data_sem); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + ext4_handle_sync(handle); +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + ext4_journal_stop(handle); + trace_ext4_truncate_exit(inode); +} + diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3e5191f9f398..d47264cafee0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -12,10 +12,6 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * - * Goal-directed block allocation by Stephen Tweedie - * ([email protected]), 1993, 1998 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller ([email protected]), 1995 * 64-bit file support on 64-bit platforms by Jakub Jelinek * @@ -47,6 +43,7 @@ #include "xattr.h" #include "acl.h" #include "ext4_extents.h" +#include "truncate.h" #include <trace/events/ext4.h> @@ -89,72 +86,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) } /* - * Work out how many blocks we need to proceed with the next chunk of a - * truncate transaction. - */ -static unsigned long blocks_for_truncate(struct inode *inode) -{ - ext4_lblk_t needed; - - needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); - - /* Give ourselves just enough room to cope with inodes in which - * i_blocks is corrupt: we've seen disk corruptions in the past - * which resulted in random data in an inode which looked enough - * like a regular file for ext4 to try to delete it. Things - * will go a bit crazy if that happens, but at least we should - * try not to panic the whole kernel. */ - if (needed < 2) - needed = 2; - - /* But we need to bound the transaction so we don't overflow the - * journal. */ - if (needed > EXT4_MAX_TRANS_DATA) - needed = EXT4_MAX_TRANS_DATA; - - return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; -} - -/* - * Truncate transactions can be complex and absolutely huge. So we need to - * be able to restart the transaction at a conventient checkpoint to make - * sure we don't overflow the journal. - * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If - * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext4_journal_start(inode, blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext4_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. - * - * Returns 0 if we managed to create more room. If we can't create more - * room, and the transaction must be restarted we return 1. - */ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) -{ - if (!ext4_handle_valid(handle)) - return 0; - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) - return 0; - return 1; -} - -/* * Restart the transaction associated with *handle. This does a commit, * so before we call here everything must be consistently dirtied against * this transaction. @@ -190,6 +121,33 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); if (inode->i_nlink) { + /* + * When journalling data dirty buffers are tracked only in the + * journal. So although mm thinks everything is clean and + * ready for reaping the inode might still have some pages to + * write in the running transaction or waiting to be + * checkpointed. Thus calling jbd2_journal_invalidatepage() + * (via truncate_inode_pages()) to discard these buffers can + * cause data loss. Also even if we did not discard these + * buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to + * read them before the transaction is checkpointed. So be + * careful and force everything to disk here... We use + * ei->i_datasync_tid to store the newest transaction + * containing inode's data. + * + * Note that directories do not have this problem because they + * don't use page cache. + */ + if (ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; + + jbd2_log_start_commit(journal, commit_tid); + jbd2_log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } truncate_inode_pages(&inode->i_data, 0); goto no_delete; } @@ -204,7 +162,7 @@ void ext4_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; - handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); + handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -277,793 +235,6 @@ no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } -typedef struct { - __le32 *p; - __le32 key; - struct buffer_head *bh; -} Indirect; - -static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) -{ - p->key = *(p->p = v); - p->bh = bh; -} - -/** - * ext4_block_to_path - parse the block number into array of offsets - * @inode: inode in question (we are only interested in its superblock) - * @i_block: block number to be parsed - * @offsets: array to store the offsets in - * @boundary: set this non-zero if the referred-to block is likely to be - * followed (on disk) by an indirect block. - * - * To store the locations of file's data ext4 uses a data structure common - * for UNIX filesystems - tree of pointers anchored in the inode, with - * data blocks at leaves and indirect blocks in intermediate nodes. - * This function translates the block number into path in that tree - - * return value is the path length and @offsets[n] is the offset of - * pointer to (n+1)th node in the nth one. If @block is out of range - * (negative or too large) warning is printed and zero returned. - * - * Note: function doesn't find node addresses, so no IO is needed. All - * we need to know is the capacity of indirect blocks (taken from the - * inode->i_sb). - */ - -/* - * Portability note: the last comparison (check that we fit into triple - * indirect block) is spelled differently, because otherwise on an - * architecture with 32-bit longs and 8Kb pages we might get into trouble - * if our filesystem had 8Kb blocks. We might use long long, but that would - * kill us on x86. Oh, well, at least the sign propagation does not matter - - * i_block would have to be negative in the very beginning, so we would not - * get there at all. - */ - -static int ext4_block_to_path(struct inode *inode, - ext4_lblk_t i_block, - ext4_lblk_t offsets[4], int *boundary) -{ - int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); - const long direct_blocks = EXT4_NDIR_BLOCKS, - indirect_blocks = ptrs, - double_blocks = (1 << (ptrs_bits * 2)); - int n = 0; - int final = 0; - - if (i_block < direct_blocks) { - offsets[n++] = i_block; - final = direct_blocks; - } else if ((i_block -= direct_blocks) < indirect_blocks) { - offsets[n++] = EXT4_IND_BLOCK; - offsets[n++] = i_block; - final = ptrs; - } else if ((i_block -= indirect_blocks) < double_blocks) { - offsets[n++] = EXT4_DIND_BLOCK; - offsets[n++] = i_block >> ptrs_bits; - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { - offsets[n++] = EXT4_TIND_BLOCK; - offsets[n++] = i_block >> (ptrs_bits * 2); - offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else { - ext4_warning(inode->i_sb, "block %lu > max in inode %lu", - i_block + direct_blocks + - indirect_blocks + double_blocks, inode->i_ino); - } - if (boundary) - *boundary = final - 1 - (i_block & (ptrs - 1)); - return n; -} - -static int __ext4_check_blockref(const char *function, unsigned int line, - struct inode *inode, - __le32 *p, unsigned int max) -{ - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - __le32 *bref = p; - unsigned int blk; - - while (bref < p+max) { - blk = le32_to_cpu(*bref++); - if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), - blk, 1))) { - es->s_last_error_block = cpu_to_le64(blk); - ext4_error_inode(inode, function, line, blk, - "invalid block"); - return -EIO; - } - } - return 0; -} - - -#define ext4_check_indirect_blockref(inode, bh) \ - __ext4_check_blockref(__func__, __LINE__, inode, \ - (__le32 *)(bh)->b_data, \ - EXT4_ADDR_PER_BLOCK((inode)->i_sb)) - -#define ext4_check_inode_blockref(inode) \ - __ext4_check_blockref(__func__, __LINE__, inode, \ - EXT4_I(inode)->i_data, \ - EXT4_NDIR_BLOCKS) - -/** - * ext4_get_branch - read the chain of indirect blocks leading to data - * @inode: inode in question - * @depth: depth of the chain (1 - direct pointer, etc.) - * @offsets: offsets of pointers in inode/indirect blocks - * @chain: place to store the result - * @err: here we store the error value - * - * Function fills the array of triples <key, p, bh> and returns %NULL - * if everything went OK or the pointer to the last filled triple - * (incomplete one) otherwise. Upon the return chain[i].key contains - * the number of (i+1)-th block in the chain (as it is stored in memory, - * i.e. little-endian 32-bit), chain[i].p contains the address of that - * number (it points into struct inode for i==0 and into the bh->b_data - * for i>0) and chain[i].bh points to the buffer_head of i-th indirect - * block for i>0 and NULL for i==0. In other words, it holds the block - * numbers of the chain, addresses they were taken from (and where we can - * verify that chain did not change) and buffer_heads hosting these - * numbers. - * - * Function stops when it stumbles upon zero pointer (absent block) - * (pointer to last triple returned, *@err == 0) - * or when it gets an IO error reading an indirect block - * (ditto, *@err == -EIO) - * or when it reads all @depth-1 indirect blocks successfully and finds - * the whole chain, all way to the data (returns %NULL, *err == 0). - * - * Need to be called with - * down_read(&EXT4_I(inode)->i_data_sem) - */ -static Indirect *ext4_get_branch(struct inode *inode, int depth, - ext4_lblk_t *offsets, - Indirect chain[4], int *err) -{ - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; - - *err = 0; - /* i_data is not going away, no lock needed */ - add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); - if (!p->key) - goto no_block; - while (--depth) { - bh = sb_getblk(sb, le32_to_cpu(p->key)); - if (unlikely(!bh)) - goto failure; - - if (!bh_uptodate_or_lock(bh)) { - if (bh_submit_read(bh) < 0) { - put_bh(bh); - goto failure; - } - /* validate block references */ - if (ext4_check_indirect_blockref(inode, bh)) { - put_bh(bh); - goto failure; - } - } - - add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); - /* Reader: end */ - if (!p->key) - goto no_block; - } - return NULL; - -failure: - *err = -EIO; -no_block: - return p; -} - -/** - * ext4_find_near - find a place for allocation with sufficient locality - * @inode: owner - * @ind: descriptor of indirect block. - * - * This function returns the preferred place for block allocation. - * It is used when heuristic for sequential allocation fails. - * Rules are: - * + if there is a block to the left of our position - allocate near it. - * + if pointer will live in indirect block - allocate near that block. - * + if pointer will live in inode - allocate in the same - * cylinder group. - * - * In the latter case we colour the starting block by the callers PID to - * prevent it from clashing with concurrent allocations for a different inode - * in the same block group. The PID is used here so that functionally related - * files will be close-by on-disk. - * - * Caller must make sure that @ind is valid and will stay that way. - */ -static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; - __le32 *p; - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); - - /* Try to find previous block */ - for (p = ind->p - 1; p >= start; p--) { - if (*p) - return le32_to_cpu(*p); - } - - /* No such thing, so let's try location of indirect block */ - if (ind->bh) - return ind->bh->b_blocknr; - - /* - * It is going to be referred to from the inode itself? OK, just put it - * into the same cylinder group then. - */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need take - * colour into account. - */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour; -} - -/** - * ext4_find_goal - find a preferred place for allocation. - * @inode: owner - * @block: block we want - * @partial: pointer to the last triple within a chain - * - * Normally this function find the preferred place for block allocation, - * returns it. - * Because this is only used for non-extent files, we limit the block nr - * to 32 bits. - */ -static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, - Indirect *partial) -{ - ext4_fsblk_t goal; - - /* - * XXX need to get goal block from mballoc's data structures - */ - - goal = ext4_find_near(inode, partial); - goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - return goal; -} - -/** - * ext4_blks_to_allocate - Look up the block map and count the number - * of direct blocks need to be allocated for the given branch. - * - * @branch: chain of indirect blocks - * @k: number of blocks need for indirect blocks - * @blks: number of data blocks to be mapped. - * @blocks_to_boundary: the offset in the indirect block - * - * return the total number of blocks to be allocate, including the - * direct and indirect blocks. - */ -static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, - int blocks_to_boundary) -{ - unsigned int count = 0; - - /* - * Simple case, [t,d]Indirect block(s) has not allocated yet - * then it's clear blocks on that path have not allocated - */ - if (k > 0) { - /* right now we don't handle cross boundary allocation */ - if (blks < blocks_to_boundary + 1) - count += blks; - else - count += blocks_to_boundary + 1; - return count; - } - - count++; - while (count < blks && count <= blocks_to_boundary && - le32_to_cpu(*(branch[0].p + count)) == 0) { - count++; - } - return count; -} - -/** - * ext4_alloc_blocks: multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: inode which needs allocated blocks - * @iblock: the logical block to start allocated at - * @goal: preferred physical block of allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of desired blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: on return it will store the error code - * - * This function will return the number of blocks allocated as - * requested by the passed-in parameters. - */ -static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) -{ - struct ext4_allocation_request ar; - int target, i; - unsigned long count = 0, blk_allocated = 0; - int index = 0; - ext4_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. That's the - * minimum number of blocks need to allocate(required) - */ - /* first we try to allocate the indirect blocks */ - target = indirect_blks; - while (target > 0) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); - if (*err) - goto failed_out; - - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + count %lu > %d!", - current_block, count, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - if (count > 0) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - printk(KERN_INFO "%s returned more blocks than " - "requested\n", __func__); - WARN_ON(1); - break; - } - } - - target = blks - count ; - blk_allocated = count; - if (!target) - goto allocated; - /* Now allocate data blocks */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = target; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - /* enable in-core preallocation only for regular files */ - ar.flags = EXT4_MB_HINT_DATA; - - current_block = ext4_mb_new_blocks(handle, &ar, err); - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + ar.len %d > %d!", - current_block, ar.len, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - if (*err && (target == blks)) { - /* - * if the allocation failed and we didn't allocate - * any blocks before - */ - goto failed_out; - } - if (!*err) { - if (target == blks) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - } - blk_allocated += ar.len; - } -allocated: - /* total number of blocks allocated for direct blocks */ - ret = blk_allocated; - *err = 0; - return ret; -failed_out: - for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - return ret; -} - -/** - * ext4_alloc_branch - allocate and set up a chain of blocks. - * @handle: handle for this transaction - * @inode: owner - * @indirect_blks: number of allocated indirect blocks - * @blks: number of allocated direct blocks - * @goal: preferred place for allocation - * @offsets: offsets (in the blocks) to store the pointers to next. - * @branch: place to store the chain in. - * - * This function allocates blocks, zeroes out all but the last one, - * links them into chain and (if we are synchronous) writes them to disk. - * In other words, it prepares a branch that can be spliced onto the - * inode. It stores the information about that chain in the branch[], in - * the same format as ext4_get_branch() would do. We are calling it after - * we had read the existing part of chain and partial points to the last - * triple of that (one with zero ->key). Upon the exit we have the same - * picture as after the successful ext4_get_block(), except that in one - * place chain is disconnected - *branch->p is still zero (we did not - * set the last link), but branch->key contains the number that should - * be placed into *branch->p to fill that gap. - * - * If allocation fails we free all blocks we've allocated (and forget - * their buffer_heads) and return the error value the from failed - * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain - * as described above and return 0. - */ -static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, int indirect_blks, - int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) -{ - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext4_fsblk_t new_blocks[4]; - ext4_fsblk_t current_block; - - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; - - branch[0].key = cpu_to_le32(new_blocks[0]); - /* - * metadata blocks and data blocks are allocated. - */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); - if (unlikely(!bh)) { - err = -EIO; - goto failed; - } - - branch[n].bh = bh; - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); - if (err) { - /* Don't brelse(bh) here; it's done in - * ext4_journal_forget() below */ - unlock_buffer(bh); - goto failed; - } - - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if (n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i = 1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (err) - goto failed; - } - *blks = num; - return err; -failed: - /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); - for (i = 1; i <= n ; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, - EXT4_FREE_BLOCKS_FORGET); - } - for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); - - return err; -} - -/** - * ext4_splice_branch - splice the allocated branch onto inode. - * @handle: handle for this transaction - * @inode: owner - * @block: (logical) number of block we are adding - * @chain: chain of indirect blocks (with a missing link - see - * ext4_alloc_branch) - * @where: location of missing link - * @num: number of indirect blocks we are adding - * @blks: number of direct blocks we are adding - * - * This function fills the missing link and does all housekeeping needed in - * inode (->i_blocks, etc.). In case of success we end up with the full - * chain to new block and return 0. - */ -static int ext4_splice_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t block, Indirect *where, int num, - int blks) -{ - int i; - int err = 0; - ext4_fsblk_t current_block; - - /* - * If we're splicing into a [td]indirect block (as opposed to the - * inode) then we need to get write access to the [td]indirect block - * before the splice. - */ - if (where->bh) { - BUFFER_TRACE(where->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, where->bh); - if (err) - goto err_out; - } - /* That's it */ - - *where->p = where->key; - - /* - * Update the host buffer_head or inode to point to more just allocated - * direct blocks blocks - */ - if (num == 0 && blks > 1) { - current_block = le32_to_cpu(where->key) + 1; - for (i = 1; i < blks; i++) - *(where->p + i) = cpu_to_le32(current_block++); - } - - /* We are done with atomic stuff, now do the rest of housekeeping */ - /* had we spliced it onto indirect block? */ - if (where->bh) { - /* - * If we spliced it onto an indirect block, we haven't - * altered the inode. Note however that if it is being spliced - * onto an indirect block at the very end of the file (the - * file is growing) then we *will* alter the inode to reflect - * the new i_size. But that is not done here - it is done in - * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. - */ - jbd_debug(5, "splicing indirect only\n"); - BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, where->bh); - if (err) - goto err_out; - } else { - /* - * OK, we spliced it into the inode itself on a direct block. - */ - ext4_mark_inode_dirty(handle, inode); - jbd_debug(5, "splicing direct\n"); - } - return err; - -err_out: - for (i = 1; i <= num; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, where[i].bh, 0, 1, - EXT4_FREE_BLOCKS_FORGET); - } - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), - blks, 0); - - return err; -} - -/* - * The ext4_ind_map_blocks() function handles non-extents inodes - * (i.e., using the traditional indirect/double-indirect i_blocks - * scheme) for ext4_map_blocks(). - * - * Allocation strategy is simple: if we have to allocate something, we will - * have to go the whole way to leaf. So let's do it before attaching anything - * to tree, set linkage between the newborn blocks, write them if sync is - * required, recheck the path, free and repeat if check fails, otherwise - * set the last missing link (that will protect us from any truncate-generated - * removals - all blocks on the path are immune now) and possibly force the - * write on the parent block. - * That has a nice additional property: no special recovery from the failed - * allocations is needed - we simply release blocks and do not touch anything - * reachable from inode. - * - * `handle' can be NULL if create == 0. - * - * return > 0, # of blocks mapped or allocated. - * return = 0, if plain lookup failed. - * return < 0, error case. - * - * The ext4_ind_get_blocks() function should be called with - * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem - * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system - * blocks. - */ -static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, - int flags) -{ - int err = -EIO; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - ext4_fsblk_t goal; - int indirect_blks; - int blocks_to_boundary = 0; - int depth; - int count = 0; - ext4_fsblk_t first_block = 0; - - trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); - J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); - J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); - depth = ext4_block_to_path(inode, map->m_lblk, offsets, - &blocks_to_boundary); - - if (depth == 0) - goto out; - - partial = ext4_get_branch(inode, depth, offsets, chain, &err); - - /* Simplest case - block found, no allocation needed */ - if (!partial) { - first_block = le32_to_cpu(chain[depth - 1].key); - count++; - /*map more blocks*/ - while (count < map->m_len && count <= blocks_to_boundary) { - ext4_fsblk_t blk; - - blk = le32_to_cpu(*(chain[depth-1].p + count)); - - if (blk == first_block + count) - count++; - else - break; - } - goto got_it; - } - - /* Next simple case - plain lookup or failed read of indirect block */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) - goto cleanup; - - /* - * Okay, we need to do block allocation. - */ - goal = ext4_find_goal(inode, map->m_lblk, partial); - - /* the number of blocks need to allocate for [d,t]indirect blocks */ - indirect_blks = (chain + depth) - partial - 1; - - /* - * Next look up the indirect map to count the totoal number of - * direct blocks to allocate for this branch. - */ - count = ext4_blks_to_allocate(partial, indirect_blks, - map->m_len, blocks_to_boundary); - /* - * Block out ext4_truncate while we alter the tree - */ - err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, - &count, goal, - offsets + (partial - chain), partial); - - /* - * The ext4_splice_branch call will free and forget any buffers - * on the new chain if there is a failure, but that risks using - * up transaction credits, especially for bitmaps where the - * credits cannot be returned. Can we handle this somehow? We - * may need to return -EAGAIN upwards in the worst case. --sct - */ - if (!err) - err = ext4_splice_branch(handle, inode, map->m_lblk, - partial, indirect_blks, count); - if (err) - goto cleanup; - - map->m_flags |= EXT4_MAP_NEW; - - ext4_update_inode_fsync_trans(handle, inode, 1); -got_it: - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = le32_to_cpu(chain[depth-1].key); - map->m_len = count; - if (count > blocks_to_boundary) - map->m_flags |= EXT4_MAP_BOUNDARY; - err = count; - /* Clean up and exit */ - partial = chain + depth - 1; /* the whole chain */ -cleanup: - while (partial > chain) { - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } -out: - trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, - map->m_pblk, map->m_len, err); - return err; -} - #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { @@ -1073,33 +244,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) /* * Calculate the number of metadata blocks need to reserve - * to allocate a new block at @lblocks for non extent file based file - */ -static int ext4_indirect_calc_metadata_amount(struct inode *inode, - sector_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); - int blk_bits; - - if (lblock < EXT4_NDIR_BLOCKS) - return 0; - - lblock -= EXT4_NDIR_BLOCKS; - - if (ei->i_da_metadata_calc_len && - (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { - ei->i_da_metadata_calc_len++; - return 0; - } - ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; - ei->i_da_metadata_calc_len = 1; - blk_bits = order_base_2(lblock); - return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; -} - -/* - * Calculate the number of metadata blocks need to reserve * to allocate a block located at @lblock */ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) @@ -1107,7 +251,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_calc_metadata_amount(inode, lblock); - return ext4_indirect_calc_metadata_amount(inode, lblock); + return ext4_ind_calc_metadata_amount(inode, lblock); } /* @@ -1589,16 +733,6 @@ static int do_journal_get_write_access(handle_t *handle, return ret; } -/* - * Truncate blocks that were not used by write. We have to truncate the - * pagecache as well so that corresponding buffers get properly unmapped. - */ -static void ext4_truncate_failed_write(struct inode *inode) -{ - truncate_inode_pages(inode->i_mapping, inode->i_size); - ext4_truncate(inode); -} - static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, @@ -1863,6 +997,7 @@ static int ext4_journalled_write_end(struct file *file, if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; if (new_i_size > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, new_i_size); ret2 = ext4_mark_inode_dirty(handle, inode); @@ -2571,6 +1706,7 @@ static int __ext4_journalled_writepage(struct page *page, write_end_fn); if (ret == 0) ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; err = ext4_journal_stop(handle); if (!ret) ret = err; @@ -3450,112 +2586,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } /* - * O_DIRECT for ext3 (or indirect map) based files - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - * If the O_DIRECT write is intantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. But current - * VFS code falls back into buffered path in that case so we are safe. - */ -static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - handle_t *handle; - ssize_t ret; - int orphan = 0; - size_t count = iov_length(iov, nr_segs); - int retries = 0; - - if (rw == WRITE) { - loff_t final_size = offset + count; - - if (final_size > inode->i_size) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ei->i_disksize = inode->i_size; - ext4_journal_stop(handle); - } - } - -retry: - if (rw == READ && ext4_should_dioread_nolock(inode)) - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block, NULL, NULL, 0); - else { - ret = blockdev_direct_IO(rw, iocb, inode, iov, - offset, nr_segs, ext4_get_block); - - if (unlikely((rw & WRITE) && ret < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); - - if (end > isize) - ext4_truncate_failed_write(inode); - } - } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - if (orphan) { - int err; - - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; - } -out: - return ret; -} - -/* * ext4_get_block used when preparing for a DIO write or buffer write. * We allocate an uinitialized extent if blocks haven't been allocated. * The extent will be converted to initialized after the IO is complete. @@ -4033,383 +3063,6 @@ unlock: return err; } -/* - * Probably it should be a library function... search for first non-zero word - * or memcmp with zero_page, whatever is better for particular architecture. - * Linus? - */ -static inline int all_zeroes(__le32 *p, __le32 *q) -{ - while (p < q) - if (*p++) - return 0; - return 1; -} - -/** - * ext4_find_shared - find the indirect blocks for partial truncation. - * @inode: inode in question - * @depth: depth of the affected branch - * @offsets: offsets of pointers in that branch (see ext4_block_to_path) - * @chain: place to store the pointers to partial indirect blocks - * @top: place to the (detached) top of branch - * - * This is a helper function used by ext4_truncate(). - * - * When we do truncate() we may have to clean the ends of several - * indirect blocks but leave the blocks themselves alive. Block is - * partially truncated if some data below the new i_size is referred - * from it (and it is on the path to the first completely truncated - * data block, indeed). We have to free the top of that path along - * with everything to the right of the path. Since no allocation - * past the truncation point is possible until ext4_truncate() - * finishes, we may safely do the latter, but top of branch may - * require special attention - pageout below the truncation point - * might try to populate it. - * - * We atomically detach the top of branch from the tree, store the - * block number of its root in *@top, pointers to buffer_heads of - * partially truncated blocks - in @chain[].bh and pointers to - * their last elements that should not be removed - in - * @chain[].p. Return value is the pointer to last filled element - * of @chain. - * - * The work left to caller to do the actual freeing of subtrees: - * a) free the subtree starting from *@top - * b) free the subtrees whose roots are stored in - * (@chain[i].p+1 .. end of @chain[i].bh->b_data) - * c) free the subtrees growing from the inode past the @chain[0]. - * (no partially truncated stuff there). */ - -static Indirect *ext4_find_shared(struct inode *inode, int depth, - ext4_lblk_t offsets[4], Indirect chain[4], - __le32 *top) -{ - Indirect *partial, *p; - int k, err; - - *top = 0; - /* Make k index the deepest non-null offset + 1 */ - for (k = depth; k > 1 && !offsets[k-1]; k--) - ; - partial = ext4_get_branch(inode, k, offsets, chain, &err); - /* Writer: pointers */ - if (!partial) - partial = chain + k-1; - /* - * If the branch acquired continuation since we've looked at it - - * fine, it should all survive and (new) top doesn't belong to us. - */ - if (!partial->key && *partial->p) - /* Writer: end */ - goto no_top; - for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) - ; - /* - * OK, we've found the last block that must survive. The rest of our - * branch should be detached before unlocking. However, if that rest - * of branch is all ours and does not grow immediately from the inode - * it's easier to cheat and just decrement partial->p. - */ - if (p == chain + k - 1 && p > chain) { - p->p--; - } else { - *top = *p->p; - /* Nope, don't do this in ext4. Must leave the tree intact */ -#if 0 - *p->p = 0; -#endif - } - /* Writer: end */ - - while (partial > p) { - brelse(partial->bh); - partial--; - } -no_top: - return partial; -} - -/* - * Zero a number of block pointers in either an inode or an indirect block. - * If we restart the transaction we must again get write access to the - * indirect block for further modification. - * - * We release `count' blocks on disk, but (last - first) may be greater - * than `count' because there can be holes in there. - * - * Return 0 on success, 1 on invalid block range - * and < 0 on fatal error. - */ -static int ext4_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, - ext4_fsblk_t block_to_free, - unsigned long count, __le32 *first, - __le32 *last) -{ - __le32 *p; - int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; - int err; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, - count)) { - EXT4_ERROR_INODE(inode, "attempt to clear invalid " - "blocks %llu len %lu", - (unsigned long long) block_to_free, count); - return 1; - } - - if (try_to_extend_transaction(handle, inode)) { - if (bh) { - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (unlikely(err)) - goto out_err; - } - err = ext4_mark_inode_dirty(handle, inode); - if (unlikely(err)) - goto out_err; - err = ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); - if (unlikely(err)) - goto out_err; - if (bh) { - BUFFER_TRACE(bh, "retaking write access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) - goto out_err; - } - } - - for (p = first; p < last; p++) - *p = 0; - - ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); - return 0; -out_err: - ext4_std_error(inode->i_sb, err); - return err; -} - -/** - * ext4_free_data - free a list of data blocks - * @handle: handle for this transaction - * @inode: inode we are dealing with - * @this_bh: indirect buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: points immediately past the end of array - * - * We are freeing all blocks referred from that array (numbers are stored as - * little-endian 32-bit) and updating @inode->i_blocks appropriately. - * - * We accumulate contiguous runs of blocks to free. Conveniently, if these - * blocks are contiguous then releasing them at one time will only affect one - * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't - * actually use a lot of journal space. - * - * @this_bh will be %NULL if @first and @last point into the inode's direct - * block pointers. - */ -static void ext4_free_data(handle_t *handle, struct inode *inode, - struct buffer_head *this_bh, - __le32 *first, __le32 *last) -{ - ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ - unsigned long count = 0; /* Number of blocks in the run */ - __le32 *block_to_free_p = NULL; /* Pointer into inode/ind - corresponding to - block_to_free */ - ext4_fsblk_t nr; /* Current block # */ - __le32 *p; /* Pointer into inode/ind - for current block */ - int err = 0; - - if (this_bh) { /* For indirect block */ - BUFFER_TRACE(this_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, this_bh); - /* Important: if we can't update the indirect pointers - * to the blocks, we can't free them. */ - if (err) - return; - } - - for (p = first; p < last; p++) { - nr = le32_to_cpu(*p); - if (nr) { - /* accumulate blocks to free if they're contiguous */ - if (count == 0) { - block_to_free = nr; - block_to_free_p = p; - count = 1; - } else if (nr == block_to_free + count) { - count++; - } else { - err = ext4_clear_blocks(handle, inode, this_bh, - block_to_free, count, - block_to_free_p, p); - if (err) - break; - block_to_free = nr; - block_to_free_p = p; - count = 1; - } - } - } - - if (!err && count > 0) - err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, - count, block_to_free_p, p); - if (err < 0) - /* fatal error */ - return; - - if (this_bh) { - BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); - - /* - * The buffer head should have an attached journal head at this - * point. However, if the data is corrupted and an indirect - * block pointed to itself, it would have been detached when - * the block was cleared. Check for this instead of OOPSing. - */ - if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) - ext4_handle_dirty_metadata(handle, inode, this_bh); - else - EXT4_ERROR_INODE(inode, - "circular indirect block detected at " - "block %llu", - (unsigned long long) this_bh->b_blocknr); - } -} - -/** - * ext4_free_branches - free an array of branches - * @handle: JBD handle for this transaction - * @inode: inode we are dealing with - * @parent_bh: the buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: pointer immediately past the end of array - * @depth: depth of the branches to free - * - * We are freeing all blocks referred from these branches (numbers are - * stored as little-endian 32-bit) and updating @inode->i_blocks - * appropriately. - */ -static void ext4_free_branches(handle_t *handle, struct inode *inode, - struct buffer_head *parent_bh, - __le32 *first, __le32 *last, int depth) -{ - ext4_fsblk_t nr; - __le32 *p; - - if (ext4_handle_is_aborted(handle)) - return; - - if (depth--) { - struct buffer_head *bh; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - p = last; - while (--p >= first) { - nr = le32_to_cpu(*p); - if (!nr) - continue; /* A hole */ - - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), - nr, 1)) { - EXT4_ERROR_INODE(inode, - "invalid indirect mapped " - "block %lu (level %d)", - (unsigned long) nr, depth); - break; - } - - /* Go read the buffer for the next level down */ - bh = sb_bread(inode->i_sb, nr); - - /* - * A read failure? Report error and clear slot - * (should be rare). - */ - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, nr, - "Read failure"); - continue; - } - - /* This zaps the entire block. Bottom up. */ - BUFFER_TRACE(bh, "free child branches"); - ext4_free_branches(handle, inode, bh, - (__le32 *) bh->b_data, - (__le32 *) bh->b_data + addr_per_block, - depth); - brelse(bh); - - /* - * Everything below this this pointer has been - * released. Now let this top-of-subtree go. - * - * We want the freeing of this indirect block to be - * atomic in the journal with the updating of the - * bitmap block which owns it. So make some room in - * the journal. - * - * We zero the parent pointer *after* freeing its - * pointee in the bitmaps, so if extend_transaction() - * for some reason fails to put the bitmap changes and - * the release into the same transaction, recovery - * will merely complain about releasing a free block, - * rather than leaking blocks. - */ - if (ext4_handle_is_aborted(handle)) - return; - if (try_to_extend_transaction(handle, inode)) { - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); - } - - /* - * The forget flag here is critical because if - * we are journaling (and not doing data - * journaling), we have to make sure a revoke - * record is written to prevent the journal - * replay from overwriting the (former) - * indirect block if it gets reallocated as a - * data block. This must happen in the same - * transaction where the data blocks are - * actually freed. - */ - ext4_free_blocks(handle, inode, NULL, nr, 1, - EXT4_FREE_BLOCKS_METADATA| - EXT4_FREE_BLOCKS_FORGET); - - if (parent_bh) { - /* - * The block which we have just freed is - * pointed to by an indirect block: journal it - */ - BUFFER_TRACE(parent_bh, "get_write_access"); - if (!ext4_journal_get_write_access(handle, - parent_bh)){ - *p = 0; - BUFFER_TRACE(parent_bh, - "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, - inode, - parent_bh); - } - } - } - } else { - /* We have reached the bottom of the tree. */ - BUFFER_TRACE(parent_bh, "free data blocks"); - ext4_free_data(handle, inode, parent_bh, first, last); - } -} - int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) @@ -4476,19 +3129,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) */ void ext4_truncate(struct inode *inode) { - handle_t *handle; - struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *i_data = ei->i_data; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - __le32 nr = 0; - int n = 0; - ext4_lblk_t last_block, max_block; - unsigned blocksize = inode->i_sb->s_blocksize; - trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -4499,149 +3139,11 @@ void ext4_truncate(struct inode *inode) if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ext4_ext_truncate(inode); - trace_ext4_truncate_exit(inode); - return; - } - - handle = start_transaction(inode); - if (IS_ERR(handle)) - return; /* AKPM: return what? */ - - last_block = (inode->i_size + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - - if (inode->i_size & (blocksize - 1)) - if (ext4_block_truncate_page(handle, mapping, inode->i_size)) - goto out_stop; - - if (last_block != max_block) { - n = ext4_block_to_path(inode, last_block, offsets, NULL); - if (n == 0) - goto out_stop; /* error */ - } - - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - - ext4_discard_preallocations(inode); - - /* - * The orphan list entry will now protect us from any crash which - * occurs before the truncate completes, so it is now safe to propagate - * the new, shorter inode size (held for now in i_size) into the - * on-disk inode. We do this via i_disksize, which is the value which - * ext4 *really* writes onto the disk inode. - */ - ei->i_disksize = inode->i_size; - - if (last_block == max_block) { - /* - * It is unnecessary to free any data blocks if last_block is - * equal to the indirect block limit. - */ - goto out_unlock; - } else if (n == 1) { /* direct blocks */ - ext4_free_data(handle, inode, NULL, i_data+offsets[0], - i_data + EXT4_NDIR_BLOCKS); - goto do_indirects; - } - - partial = ext4_find_shared(inode, n, offsets, chain, &nr); - /* Kill the top of shared branch (not detached) */ - if (nr) { - if (partial == chain) { - /* Shared branch grows from the inode */ - ext4_free_branches(handle, inode, NULL, - &nr, &nr+1, (chain+n-1) - partial); - *partial->p = 0; - /* - * We mark the inode dirty prior to restart, - * and prior to stop. No need for it here. - */ - } else { - /* Shared branch grows from an indirect block */ - BUFFER_TRACE(partial->bh, "get_write_access"); - ext4_free_branches(handle, inode, partial->bh, - partial->p, - partial->p+1, (chain+n-1) - partial); - } - } - /* Clear the ends of indirect blocks on the shared branch */ - while (partial > chain) { - ext4_free_branches(handle, inode, partial->bh, partial->p + 1, - (__le32*)partial->bh->b_data+addr_per_block, - (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } -do_indirects: - /* Kill the remaining (whole) subtrees */ - switch (offsets[0]) { - default: - nr = i_data[EXT4_IND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); - i_data[EXT4_IND_BLOCK] = 0; - } - case EXT4_IND_BLOCK: - nr = i_data[EXT4_DIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); - i_data[EXT4_DIND_BLOCK] = 0; - } - case EXT4_DIND_BLOCK: - nr = i_data[EXT4_TIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); - i_data[EXT4_TIND_BLOCK] = 0; - } - case EXT4_TIND_BLOCK: - ; - } - -out_unlock: - up_write(&ei->i_data_sem); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); + else + ext4_ind_truncate(inode); - ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); } @@ -5012,7 +3514,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) { /* Validate block references which are part of inode */ - ret = ext4_check_inode_blockref(inode); + ret = ext4_ind_check_inode(inode); } if (ret) goto bad_inode; @@ -5459,34 +3961,10 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, - int chunk) -{ - int indirects; - - /* if nrblocks are contiguous */ - if (chunk) { - /* - * With N contiguous data blocks, we need at most - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, - * 2 dindirect blocks, and 1 tindirect block - */ - return DIV_ROUND_UP(nrblocks, - EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; - } - /* - * if nrblocks are not contiguous, worse case, each block touch - * a indirect block, and each indirect block touch a double indirect - * block, plus a triple indirect block - */ - indirects = nrblocks * 2 + 1; - return indirects; -} - static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return ext4_indirect_trans_blocks(inode, nrblocks, chunk); + return ext4_ind_trans_blocks(inode, nrblocks, chunk); return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 808c554e773f..f18bfe37aff8 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -202,8 +202,9 @@ setversion_out: struct super_block *sb = inode->i_sb; int err, err2=0; - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; + err = ext4_resize_begin(sb); + if (err) + return err; if (get_user(n_blocks_count, (__u32 __user *)arg)) return -EFAULT; @@ -221,6 +222,7 @@ setversion_out: if (err == 0) err = err2; mnt_drop_write(filp->f_path.mnt); + ext4_resize_end(sb); return err; } @@ -271,8 +273,9 @@ mext_out: struct super_block *sb = inode->i_sb; int err, err2=0; - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; + err = ext4_resize_begin(sb); + if (err) + return err; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, sizeof(input))) @@ -291,6 +294,7 @@ mext_out: if (err == 0) err = err2; mnt_drop_write(filp->f_path.mnt); + ext4_resize_end(sb); return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6ed859d56850..17a5a57c415a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -75,8 +75,8 @@ * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc - * space we will consume the particular prealloc space. This make sure that - * that the we have contiguous physical blocks representing the file blocks + * space we will consume the particular prealloc space. This makes sure that + * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except @@ -84,7 +84,7 @@ * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group - * prealloc space. These are per CPU prealloc list repreasented as + * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * @@ -128,12 +128,13 @@ * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is * 512 blocks. This can be tuned via - * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in + * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the - * stripe value (sbi->s_stripe) + * the smallest multiple of the stripe value (sbi->s_stripe) which is + * greater than the default mb_group_prealloc. * - * The regular allocator(using the buddy cache) supports few tunables. + * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan @@ -152,7 +153,7 @@ * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it - * can used for allocation. ext4_mb_good_group explains how the groups are + * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * Both the prealloc space are getting populated as above. So for the first @@ -492,10 +493,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { - printk(KERN_ERR "corruption in group %u " - "at byte %u(%u): %x in copy != %x " - "on disk/prealloc\n", - e4b->bd_group, i, i * 8, b1[i], b2[i]); + ext4_msg(e4b->bd_sb, KERN_ERR, + "corruption in group %u " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc", + e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } @@ -1125,7 +1127,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; - e4b->bd_info = ext4_get_group_info(sb, group); + e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_page = NULL; @@ -1281,7 +1283,7 @@ static void mb_clear_bits(void *bm, int cur, int len) } } -static void mb_set_bits(void *bm, int cur, int len) +void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1510,7 +1512,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -2223,8 +2225,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate mem for a " - "buddy group\n"); + ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " + "for a buddy group"); goto exit_meta_group_info; } sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = @@ -2237,7 +2239,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); goto exit_group_info; } memset(meta_group_info[i], 0, kmem_cache_size(cachep)); @@ -2279,8 +2281,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, exit_group_info: /* If a meta_group_info table has been allocated, release it now */ - if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; + } exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */ @@ -2328,23 +2332,26 @@ static int ext4_mb_init_backend(struct super_block *sb) /* An 8TB filesystem with 64-bit pointers requires a 4096 byte * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. * So a two level scheme suffices for now. */ - sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); + sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); if (sbi->s_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; } sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { - printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } - sbi->s_buddy_cache->i_ino = get_next_ino(); + /* To avoid potentially colliding with an valid on-disk inode number, + * use EXT4_BAD_INO for the buddy cache inode number. This inode is + * not in the inode hash, so it should never be found by iget(), but + * this will avoid confusion if it ever shows up during debugging. */ + sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { - printk(KERN_ERR - "EXT4-fs: can't read descriptor %u\n", i); + ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) @@ -2362,7 +2369,7 @@ err_freebuddy: kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); return -ENOMEM; } @@ -2404,14 +2411,15 @@ static int ext4_groupinfo_create_slab(size_t size) slab_size, 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_groupinfo_caches[cache_index] = cachep; + mutex_unlock(&ext4_grpinfo_slab_create_mutex); if (!cachep) { - printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); + printk(KERN_EMERG + "EXT4-fs: no memory for groupinfo slab cache\n"); return -ENOMEM; } - ext4_groupinfo_caches[cache_index] = cachep; - return 0; } @@ -2457,12 +2465,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) i++; } while (i <= sb->s_blocksize_bits + 1); - /* init file for buddy data */ - ret = ext4_mb_init_backend(sb); - if (ret != 0) { - goto out; - } - spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); @@ -2472,6 +2474,18 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + /* + * If there is a s_stripe > 1, then we set the s_mb_group_prealloc + * to the lowest multiple of s_stripe which is bigger than + * the s_mb_group_prealloc as determined above. We want + * the preallocation size to be an exact multiple of the + * RAID stripe size so that preallocations don't fragment + * the stripes. + */ + if (sbi->s_stripe > 1) { + sbi->s_mb_group_prealloc = roundup( + sbi->s_mb_group_prealloc, sbi->s_stripe); + } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { @@ -2487,6 +2501,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) { + goto out; + } + if (sbi->s_proc) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); @@ -2544,32 +2564,32 @@ int ext4_mb_release(struct super_block *sb) EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(sbi->s_group_info[i]); - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); if (sbi->s_buddy_cache) iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { - printk(KERN_INFO - "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u blocks %u reqs (%u success)", atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); - printk(KERN_INFO - "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " - "%u 2^N hits, %u breaks, %u lost\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); - printk(KERN_INFO - "EXT4-fs: mballoc: %lu generated and it took %Lu\n", - sbi->s_mb_buddies_generated++, + ext4_msg(sb, KERN_INFO, + "mballoc: %lu generated and it took %Lu", + sbi->s_mb_buddies_generated, sbi->s_mb_generation_time); - printk(KERN_INFO - "EXT4-fs: mballoc: %u preallocated, %u discarded\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } @@ -2628,6 +2648,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) rb_erase(&entry->node, &(db->bb_free_root)); mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); + if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() @@ -2771,8 +2800,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * We leak some of the blocks here. */ ext4_lock_group(sb, ac->ac_b_ex.fe_group); - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) @@ -2790,7 +2819,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_blks_set(sb, gdp, @@ -2830,8 +2860,9 @@ out_err: /* * here we normalize request for locality group - * Group request are normalized to s_strip size if we set the same via mount - * option. If not we set it to s_mb_group_prealloc which can be configured via + * Group request are normalized to s_mb_group_prealloc, which goes to + * s_strip if we set the same via mount option. + * s_mb_group_prealloc can be configured via * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? @@ -2842,10 +2873,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); - if (EXT4_SB(sb)->s_stripe) - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; - else - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(1, "#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } @@ -3001,9 +3029,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, if (start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical) { - printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", - (unsigned long) start, (unsigned long) size, - (unsigned long) ac->ac_o_ex.fe_logical); + ext4_msg(ac->ac_sb, KERN_ERR, + "start %lu, size %lu, fe_logical %lu", + (unsigned long) start, (unsigned long) size, + (unsigned long) ac->ac_o_ex.fe_logical); } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); @@ -3262,7 +3291,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, while (n) { entry = rb_entry(n, struct ext4_free_data, node); - mb_set_bits(bitmap, entry->start_blk, entry->count); + ext4_set_bits(bitmap, entry->start_blk, entry->count); n = rb_next(n); } return; @@ -3304,7 +3333,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); - mb_set_bits(bitmap, start, len); + ext4_set_bits(bitmap, start, len); preallocated += len; count++; } @@ -3584,10 +3613,11 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, bit = next + 1; } if (free != pa->pa_free) { - printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", - pa, (unsigned long) pa->pa_lstart, - (unsigned long) pa->pa_pstart, - (unsigned long) pa->pa_len); + ext4_msg(e4b->bd_sb, KERN_CRIT, + "pa %p: logic %lu, phys. %lu, len %lu", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + (unsigned long) pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* @@ -3775,7 +3805,8 @@ repeat: * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); spin_unlock(&ei->i_prealloc_lock); - printk(KERN_ERR "uh-oh! used pa while discarding\n"); + ext4_msg(sb, KERN_ERR, + "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; @@ -3852,12 +3883,13 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; - printk(KERN_ERR "EXT4-fs: Can't allocate:" - " Allocation context details:\n"); - printk(KERN_ERR "EXT4-fs: status %d flags %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" + " Allocation context details:"); + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", ac->ac_status, ac->ac_flags); - printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " - "best %lu/%lu/%lu@%lu cr %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, @@ -3871,9 +3903,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, - ac->ac_found); - printk(KERN_ERR "EXT4-fs: groups: \n"); + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", + ac->ac_ex_scanned, ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); @@ -4637,7 +4669,7 @@ do_more: } ext4_mark_super_dirty(sb); error_return: - if (freed) + if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); @@ -4645,7 +4677,7 @@ error_return: } /** - * ext4_add_groupblocks() -- Add given blocks to an existing group + * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block * @block: start physcial block to add to the block group @@ -4653,7 +4685,7 @@ error_return: * * This marks the blocks as free in the bitmap and buddy. */ -void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, +int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { struct buffer_head *bitmap_bh = NULL; @@ -4666,25 +4698,35 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, struct ext4_buddy e4b; int err = 0, ret, blk_free_count; ext4_grpblk_t blocks_freed; - struct ext4_group_info *grp; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + if (count == 0) + return 0; + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - grp = ext4_get_group_info(sb, block_group); /* * Check to see if we are freeing blocks across a group * boundary. */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_warning(sb, "too much blocks added to group %u\n", + block_group); + err = -EINVAL; goto error_return; + } bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) + if (!bitmap_bh) { + err = -EIO; goto error_return; + } + desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) + if (!desc) { + err = -EIO; goto error_return; + } if (in_range(ext4_block_bitmap(sb, desc), block, count) || in_range(ext4_inode_bitmap(sb, desc), block, count) || @@ -4694,6 +4736,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); + err = -EINVAL; goto error_return; } @@ -4762,7 +4805,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, error_return: brelse(bitmap_bh); ext4_std_error(sb, err); - return; + return err; } /** @@ -4782,6 +4825,8 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, { struct ext4_free_extent ex; + trace_ext4_trim_extent(sb, group, start, count); + assert_spin_locked(ext4_group_lock_ptr(sb, group)); ex.fe_start = start; @@ -4802,7 +4847,7 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, /** * ext4_trim_all_free -- function to trim all free space in alloc. group * @sb: super block for file system - * @e4b: ext4 buddy + * @group: group to be trimmed * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count @@ -4823,10 +4868,12 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t minblocks) { void *bitmap; - ext4_grpblk_t next, count = 0; + ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; int ret; + trace_ext4_trim_all_free(sb, group, start, max); + ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { ext4_error(sb, "Error in loading buddy " @@ -4836,6 +4883,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); + if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && + minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) + goto out; + start = (e4b.bd_info->bb_first_free > start) ? e4b.bd_info->bb_first_free : start; @@ -4850,6 +4901,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next - start, group, &e4b); count += next - start; } + free_count += next - start; start = next + 1; if (fatal_signal_pending(current)) { @@ -4863,9 +4915,13 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); } - if ((e4b.bd_info->bb_free - count) < minblocks) + if ((e4b.bd_info->bb_free - free_count) < minblocks) break; } + + if (!ret) + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); +out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -4904,6 +4960,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; + if (start + len <= first_data_blk) + goto out; if (start < first_data_blk) { len -= first_data_blk - start; start = first_data_blk; @@ -4952,5 +5010,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } range->len = trimmed * sb->s_blocksize; + if (!ret) + atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); + +out: return ret; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 20b5e7bfebd1..9d4a636b546c 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -187,7 +187,6 @@ struct ext4_allocation_context { __u16 ac_flags; /* allocation hints */ __u8 ac_status; __u8 ac_criteria; - __u8 ac_repeats; __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8c9babac43dc..565a154e22d4 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -289,7 +289,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent while (len--) printk("%c", *name++); ext4fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT4_DIR_REC_LEN(de->name_len); names++; @@ -1013,7 +1013,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q *err = -ENOENT; errout: - dxtrace(printk(KERN_DEBUG "%s not found\n", name)); + dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); dx_release (frames); return NULL; } @@ -1985,18 +1985,11 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!list_empty(&EXT4_I(inode)->i_orphan)) goto out_unlock; - /* Orphan handling is only valid for files with data blocks - * being truncated, or files being unlinked. */ - - /* @@@ FIXME: Observation from aviro: - * I think I can trigger J_ASSERT in ext4_orphan_add(). We block - * here (on s_orphan_lock), so race with ext4_link() which might bump - * ->i_nlink. For, say it, character device. Not a regular file, - * not a directory, not a symlink and ->i_nlink > 0. - * - * tytso, 4/25/2009: I'm not sure how that could happen; - * shouldn't the fs core protect us from these sort of - * unlink()/link() races? + /* + * Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. Note that we either + * hold i_mutex, or the inode can not be referenced from outside, + * so i_nlink should not be bumped due to race */ J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7bb8f76d470a..430c401d0895 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -285,11 +285,7 @@ static int io_submit_init(struct ext4_io_submit *io, io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_end) return -ENOMEM; - do { - bio = bio_alloc(GFP_NOIO, nvecs); - nvecs >>= 1; - } while (bio == NULL); - + bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_private = io->io_end = io_end; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 80bbc9c60c24..707d3f16f7ce 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -16,6 +16,35 @@ #include "ext4_jbd2.h" +int ext4_resize_begin(struct super_block *sb) +{ + int ret = 0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + /* + * We are not allowed to do online-resizing on a filesystem mounted + * with error, because it can destroy the filesystem easily. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + ext4_warning(sb, "There are errors in the filesystem, " + "so online resizing is not allowed\n"); + return -EPERM; + } + + if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) + ret = -EBUSY; + + return ret; +} + +void ext4_resize_end(struct super_block *sb) +{ + clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); + smp_mb__after_clear_bit(); +} + #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) @@ -118,10 +147,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, brelse(bh); bh = ERR_PTR(err); } else { - lock_buffer(bh); memset(bh->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bh); - unlock_buffer(bh); } return bh; @@ -132,8 +159,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, * If that fails, restart the transaction & regain write access for the * buffer head which is used for block_bitmap modifications. */ -static int extend_or_restart_transaction(handle_t *handle, int thresh, - struct buffer_head *bh) +static int extend_or_restart_transaction(handle_t *handle, int thresh) { int err; @@ -144,9 +170,8 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, if (err < 0) return err; if (err) { - if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) - return err; - if ((err = ext4_journal_get_write_access(handle, bh))) + err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); + if (err) return err; } @@ -181,21 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - mutex_lock(&sbi->s_resize_lock); - if (input->group != sbi->s_groups_count) { - err = -EBUSY; - goto exit_journal; - } - - if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; - } - - if (ext4_bg_has_super(sb, input->group)) { - ext4_debug("mark backup superblock %#04llx (+0)\n", start); - ext4_set_bit(0, bh->b_data); - } + BUG_ON(input->group != sbi->s_groups_count); /* Copy all of the GDT blocks into the backup in this group */ for (i = 0, bit = 1, block = start + 1; @@ -203,29 +214,26 @@ static int setup_new_group_blocks(struct super_block *sb, struct buffer_head *gdb; ext4_debug("update backup group %#04llx (+%d)\n", block, bit); - - if ((err = extend_or_restart_transaction(handle, 1, bh))) - goto exit_bh; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto exit_journal; gdb = sb_getblk(sb, block); if (!gdb) { err = -EIO; - goto exit_bh; + goto exit_journal; } if ((err = ext4_journal_get_write_access(handle, gdb))) { brelse(gdb); - goto exit_bh; + goto exit_journal; } - lock_buffer(gdb); memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); - unlock_buffer(gdb); err = ext4_handle_dirty_metadata(handle, NULL, gdb); if (unlikely(err)) { brelse(gdb); - goto exit_bh; + goto exit_journal; } - ext4_set_bit(bit, bh->b_data); brelse(gdb); } @@ -235,9 +243,22 @@ static int setup_new_group_blocks(struct super_block *sb, err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, GFP_NOFS); if (err) - goto exit_bh; - for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) - ext4_set_bit(bit, bh->b_data); + goto exit_journal; + + err = extend_or_restart_transaction(handle, 2); + if (err) + goto exit_journal; + + bh = bclean(handle, sb, input->block_bitmap); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto exit_journal; + } + + if (ext4_bg_has_super(sb, input->group)) { + ext4_debug("mark backup group tables %#04llx (+0)\n", start); + ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); + } ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, input->block_bitmap - start); @@ -253,12 +274,9 @@ static int setup_new_group_blocks(struct super_block *sb, err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); if (err) goto exit_bh; - for (i = 0, bit = input->inode_table - start; - i < sbi->s_itb_per_group; i++, bit++) - ext4_set_bit(bit, bh->b_data); + ext4_set_bits(bh->b_data, input->inode_table - start, + sbi->s_itb_per_group); - if ((err = extend_or_restart_transaction(handle, 2, bh))) - goto exit_bh; ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); @@ -285,7 +303,6 @@ exit_bh: brelse(bh); exit_journal: - mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; @@ -377,15 +394,15 @@ static int verify_reserved_gdb(struct super_block *sb, * fail once we start modifying the data on disk, because JBD has no rollback. */ static int add_new_gdb(handle_t *handle, struct inode *inode, - struct ext4_new_group_data *input, - struct buffer_head **primary) + ext4_group_t group) { struct super_block *sb = inode->i_sb; struct ext4_super_block *es = EXT4_SB(sb)->s_es; - unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; struct buffer_head **o_group_desc, **n_group_desc; struct buffer_head *dind; + struct buffer_head *gdb_bh; int gdbackups; struct ext4_iloc iloc; __le32 *data; @@ -408,11 +425,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return -EPERM; } - *primary = sb_bread(sb, gdblock); - if (!*primary) + gdb_bh = sb_bread(sb, gdblock); + if (!gdb_bh) return -EIO; - if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + gdbackups = verify_reserved_gdb(sb, gdb_bh); + if (gdbackups < 0) { err = gdbackups; goto exit_bh; } @@ -427,7 +445,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { ext4_warning(sb, "new group %u GDT block %llu not reserved", - input->group, gdblock); + group, gdblock); err = -EINVAL; goto exit_dind; } @@ -436,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) goto exit_dind; - err = ext4_journal_get_write_access(handle, *primary); + err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) goto exit_sbh; @@ -449,12 +467,13 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) goto exit_dindj; - n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), - GFP_NOFS); + n_group_desc = ext4_kvmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), + GFP_NOFS); if (!n_group_desc) { err = -ENOMEM; - ext4_warning(sb, - "not enough memory for %lu groups", gdb_num + 1); + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); goto exit_inode; } @@ -475,8 +494,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; ext4_mark_iloc_dirty(handle, inode, &iloc); - memset((*primary)->b_data, 0, sb->s_blocksize); - err = ext4_handle_dirty_metadata(handle, NULL, *primary); + memset(gdb_bh->b_data, 0, sb->s_blocksize); + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); goto exit_inode; @@ -486,10 +505,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, o_group_desc = EXT4_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); - n_group_desc[gdb_num] = *primary; + n_group_desc[gdb_num] = gdb_bh; EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; - kfree(o_group_desc); + ext4_kvfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); @@ -499,6 +518,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return err; exit_inode: + ext4_kvfree(n_group_desc); /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: @@ -508,7 +528,7 @@ exit_sbh: exit_dind: brelse(dind); exit_bh: - brelse(*primary); + brelse(gdb_bh); ext4_debug("leaving with error %d\n", err); return err; @@ -528,7 +548,7 @@ exit_bh: * backup GDT blocks are stored in their reserved primary GDT block. */ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, - struct ext4_new_group_data *input) + ext4_group_t group) { struct super_block *sb = inode->i_sb; int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); @@ -599,7 +619,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, * Finally we can add each of the reserved backup GDT blocks from * the new group to its reserved primary GDT block. */ - blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); + blk = group * EXT4_BLOCKS_PER_GROUP(sb); for (i = 0; i < reserved_gdb; i++) { int err2; data = (__le32 *)primary[i]->b_data; @@ -799,13 +819,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) goto exit_put; } - mutex_lock(&sbi->s_resize_lock); - if (input->group != sbi->s_groups_count) { - ext4_warning(sb, "multiple resizers run on filesystem!"); - err = -EBUSY; - goto exit_journal; - } - if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) goto exit_journal; @@ -820,16 +833,25 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if ((err = ext4_journal_get_write_access(handle, primary))) goto exit_journal; - if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && - (err = reserve_backup_gdb(handle, inode, input))) + if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) { + err = reserve_backup_gdb(handle, inode, input->group); + if (err) + goto exit_journal; + } + } else { + /* + * Note that we can access new group descriptor block safely + * only if add_new_gdb() succeeds. + */ + err = add_new_gdb(handle, inode, input->group); + if (err) goto exit_journal; - } else if ((err = add_new_gdb(handle, inode, input, &primary))) - goto exit_journal; + primary = sbi->s_group_desc[gdb_num]; + } /* * OK, now we've set up the new group. Time to make it active. * - * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -886,13 +908,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold s_resize_lock - * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold s_resize_lock over the access - * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. * @@ -937,10 +955,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ext4_handle_dirty_super(handle, sb); exit_journal: - mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; - if (!err) { + if (!err && primary) { update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, sizeof(struct ext4_super_block)); update_backups(sb, primary->b_blocknr, primary->b_data, @@ -969,16 +986,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t add; struct buffer_head *bh; handle_t *handle; - int err; + int err, err2; ext4_group_t group; - /* We don't need to worry about locking wrt other resizers just - * yet: we're going to revalidate es->s_blocks_count after - * taking the s_resize_lock below. */ o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", + printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) @@ -995,7 +1009,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, if (n_blocks_count < o_blocks_count) { ext4_warning(sb, "can't shrink FS - resize aborted"); - return -EBUSY; + return -EINVAL; } /* Handle the remaining blocks in the last group only. */ @@ -1038,32 +1052,25 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } - mutex_lock(&EXT4_SB(sb)->s_resize_lock); - if (o_blocks_count != ext4_blocks_count(es)) { - ext4_warning(sb, "multiple resizers run on filesystem!"); - mutex_unlock(&EXT4_SB(sb)->s_resize_lock); - ext4_journal_stop(handle); - err = -EBUSY; - goto exit_put; - } - if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) { ext4_warning(sb, "error %d on journal write access", err); - mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); - mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ - ext4_add_groupblocks(handle, sb, o_blocks_count, add); + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); ext4_handle_dirty_super(handle, sb); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); - if ((err = ext4_journal_stop(handle))) + err2 = ext4_journal_stop(handle); + if (!err && err2) + err = err2; + + if (err) goto exit_put; if (test_opt(sb, DEBUG)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9ea71aa864b3..e2d88baf91d3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -110,6 +110,35 @@ static struct file_system_type ext3_fs_type = { #define IS_EXT3_SB(sb) (0) #endif +void *ext4_kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + +void *ext4_kvzalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags); + if (!ret) + ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); + return ret; +} + +void ext4_kvfree(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); + +} + ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { @@ -269,6 +298,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) journal_t *journal; handle_t *handle; + trace_ext4_journal_start(sb, nblocks, _RET_IP_); if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); @@ -789,11 +819,8 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); - if (is_vmalloc_addr(sbi->s_flex_groups)) - vfree(sbi->s_flex_groups); - else - kfree(sbi->s_flex_groups); + ext4_kvfree(sbi->s_group_desc); + ext4_kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -1976,15 +2003,11 @@ static int ext4_fill_flex_info(struct super_block *sb) ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; size = flex_group_count * sizeof(struct flex_groups); - sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); + sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); if (sbi->s_flex_groups == NULL) { - sbi->s_flex_groups = vzalloc(size); - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, - "not enough memory for %u flex groups", - flex_group_count); - goto failed; - } + ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", + flex_group_count); + goto failed; } for (i = 0; i < sbi->s_groups_count; i++) { @@ -2383,17 +2406,25 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); + int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) - return sbi->s_stripe; - - if (stripe_width <= sbi->s_blocks_per_group) - return stripe_width; + ret = sbi->s_stripe; + else if (stripe_width <= sbi->s_blocks_per_group) + ret = stripe_width; + else if (stride <= sbi->s_blocks_per_group) + ret = stride; + else + ret = 0; - if (stride <= sbi->s_blocks_per_group) - return stride; + /* + * If the stripe width is 1, this makes no sense and + * we set it to 0 to turn off stripe handling code. + */ + if (ret <= 1) + ret = 0; - return 0; + return ret; } /* sysfs supprt */ @@ -3408,8 +3439,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), - GFP_KERNEL); + sbi->s_group_desc = ext4_kvmalloc(db_count * + sizeof(struct buffer_head *), + GFP_KERNEL); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); goto failed_mount; @@ -3491,7 +3523,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); - mutex_init(&sbi->s_resize_lock); + sbi->s_resize_flags = 0; sb->s_root = NULL; @@ -3741,12 +3773,8 @@ failed_mount_wq: } failed_mount3: del_timer(&sbi->s_err_report); - if (sbi->s_flex_groups) { - if (is_vmalloc_addr(sbi->s_flex_groups)) - vfree(sbi->s_flex_groups); - else - kfree(sbi->s_flex_groups); - } + if (sbi->s_flex_groups) + ext4_kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -3756,7 +3784,7 @@ failed_mount3: failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + ext4_kvfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { remove_proc_entry(sb->s_id, ext4_proc_root); diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h new file mode 100644 index 000000000000..011ba6670d99 --- /dev/null +++ b/fs/ext4/truncate.h @@ -0,0 +1,43 @@ +/* + * linux/fs/ext4/truncate.h + * + * Common inline functions needed for truncate support + */ + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static inline void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) +{ + ext4_lblk_t needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + diff --git a/fs/generic_acl.c b/fs/generic_acl.c index d5e33a077a67..d0dddaceac59 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -82,18 +82,14 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value, return PTR_ERR(acl); } if (acl) { - mode_t mode; - error = posix_acl_valid(acl); if (error) goto failed; switch (type) { case ACL_TYPE_ACCESS: - mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) goto failed; - inode->i_mode = mode; inode->i_ctime = CURRENT_TIME; if (error == 0) { posix_acl_release(acl); @@ -125,21 +121,20 @@ int generic_acl_init(struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; - mode_t mode = inode->i_mode; int error; - inode->i_mode = mode & ~current_umask(); if (!S_ISLNK(inode->i_mode)) acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); if (acl) { if (S_ISDIR(inode->i_mode)) set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - error = posix_acl_create(&acl, GFP_KERNEL, &mode); + error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (error < 0) return error; - inode->i_mode = mode; if (error > 0) set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + } else { + inode->i_mode &= ~current_umask(); } error = 0; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 884c9af0542f..34501b64bc47 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -72,7 +72,7 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) return gfs2_acl_get(GFS2_I(inode), type); } -static int gfs2_set_mode(struct inode *inode, mode_t mode) +static int gfs2_set_mode(struct inode *inode, umode_t mode) { int error = 0; @@ -117,7 +117,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct posix_acl *acl; - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; int error = 0; if (!sdp->sd_args.ar_posix_acl) @@ -276,7 +276,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, goto out_release; if (type == ACL_TYPE_ACCESS) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error <= 0) { diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 8635be5ffd97..970ea987b3f6 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -16,6 +16,7 @@ #include <linux/statfs.h> #include <linux/types.h> #include <linux/pid_namespace.h> +#include <linux/namei.h> #include <asm/uaccess.h> #include "os.h" diff --git a/fs/inode.c b/fs/inode.c index d0c72ff6b30e..5aab80dc008c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -399,12 +399,12 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) EXPORT_SYMBOL(__insert_inode_hash); /** - * remove_inode_hash - remove an inode from the hash + * __remove_inode_hash - remove an inode from the hash * @inode: inode to unhash * * Remove an inode from the superblock. */ -void remove_inode_hash(struct inode *inode) +void __remove_inode_hash(struct inode *inode) { spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); @@ -412,7 +412,7 @@ void remove_inode_hash(struct inode *inode) spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } -EXPORT_SYMBOL(remove_inode_hash); +EXPORT_SYMBOL(__remove_inode_hash); void end_writeback(struct inode *inode) { @@ -454,7 +454,9 @@ static void evict(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - inode_wb_list_del(inode); + if (!list_empty(&inode->i_wb_list)) + inode_wb_list_del(inode); + inode_sb_list_del(inode); if (op->evict_inode) { @@ -1328,7 +1330,8 @@ static void iput_final(struct inode *inode) } inode->i_state |= I_FREEING; - inode_lru_list_del(inode); + if (!list_empty(&inode->i_lru)) + inode_lru_list_del(inode); spin_unlock(&inode->i_lock); evict(inode); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 2c62c5aae82f..16a698bd906d 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -257,9 +257,12 @@ static void __flush_batch(journal_t *journal, int *batch_count) { int i; + struct blk_plug plug; + blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE); + write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = journal->j_chkpt_bhs[i]; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0dfa5b598e68..f24df13adc4e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2390,73 +2390,6 @@ static void __exit journal_exit(void) jbd2_journal_destroy_caches(); } -/* - * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 - * tracing infrastructure to map a dev_t to a device name. - * - * The caller should use rcu_read_lock() in order to make sure the - * device name stays valid until its done with it. We use - * rcu_read_lock() as well to make sure we're safe in case the caller - * gets sloppy, and because rcu_read_lock() is cheap and can be safely - * nested. - */ -struct devname_cache { - struct rcu_head rcu; - dev_t device; - char devname[BDEVNAME_SIZE]; -}; -#define CACHE_SIZE_BITS 6 -static struct devname_cache *devcache[1 << CACHE_SIZE_BITS]; -static DEFINE_SPINLOCK(devname_cache_lock); - -static void free_devcache(struct rcu_head *rcu) -{ - kfree(rcu); -} - -const char *jbd2_dev_to_name(dev_t device) -{ - int i = hash_32(device, CACHE_SIZE_BITS); - char *ret; - struct block_device *bd; - static struct devname_cache *new_dev; - - rcu_read_lock(); - if (devcache[i] && devcache[i]->device == device) { - ret = devcache[i]->devname; - rcu_read_unlock(); - return ret; - } - rcu_read_unlock(); - - new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); - if (!new_dev) - return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ - bd = bdget(device); - spin_lock(&devname_cache_lock); - if (devcache[i]) { - if (devcache[i]->device == device) { - kfree(new_dev); - bdput(bd); - ret = devcache[i]->devname; - spin_unlock(&devname_cache_lock); - return ret; - } - call_rcu(&devcache[i]->rcu, free_devcache); - } - devcache[i] = new_dev; - devcache[i]->device = device; - if (bd) { - bdevname(bd, devcache[i]->devname); - bdput(bd); - } else - __bdevname(device, devcache[i]->devname); - ret = devcache[i]->devname; - spin_unlock(&devname_cache_lock); - return ret; -} -EXPORT_SYMBOL(jbd2_dev_to_name); - MODULE_LICENSE("GPL"); module_init(journal_init); module_exit(journal_exit); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 27c511a1cf05..926d02068a14 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -227,7 +227,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; rc = posix_acl_equiv_mode(acl, &mode); if (rc < 0) return rc; @@ -259,7 +259,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, mode_t *i_mode) +int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode) { struct posix_acl *acl; int rc; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index b3421c78d9f8..9b477246f2a6 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,7 @@ struct jffs2_acl_header { struct posix_acl *jffs2_get_acl(struct inode *inode, int type); extern int jffs2_acl_chmod(struct inode *); -extern int jffs2_init_acl_pre(struct inode *, struct inode *, mode_t *); +extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); extern const struct xattr_handler jffs2_acl_access_xattr_handler; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index b81b35ddf4e4..bbcb9755dd2b 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -406,7 +406,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, fill in the raw_inode while you're at it. */ -struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, struct jffs2_raw_inode *ri) +struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri) { struct inode *inode; struct super_block *sb = dir_i->i_sb; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 526979c607b6..6c1755c59c0f 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -173,7 +173,7 @@ int jffs2_do_setattr (struct inode *, struct iattr *); struct inode *jffs2_iget(struct super_block *, unsigned long); void jffs2_evict_inode (struct inode *); void jffs2_dirty_inode(struct inode *inode, int flags); -struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, +struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri); int jffs2_statfs (struct dentry *, struct kstatfs *); int jffs2_remount_fs (struct super_block *, int *, char *); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index b3a32caf2b45..45559dc3ea2f 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -127,16 +127,14 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) return PTR_ERR(acl); if (acl) { - mode_t mode = inode->i_mode; if (S_ISDIR(inode->i_mode)) { rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); if (rc) goto cleanup; } - rc = posix_acl_create(&acl, GFP_KERNEL, &mode); + rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (rc < 0) goto cleanup; /* posix_acl_release(NULL) is no-op */ - inode->i_mode = mode; if (rc > 0) rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); cleanup: diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 24838f1eeee5..e87fedef23db 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -693,8 +693,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, return rc; } if (acl) { - mode_t mode = inode->i_mode; - rc = posix_acl_equiv_mode(acl, &mode); + rc = posix_acl_equiv_mode(acl, &inode->i_mode); posix_acl_release(acl); if (rc < 0) { printk(KERN_ERR @@ -702,7 +701,6 @@ static int can_set_system_xattr(struct inode *inode, const char *name, rc); return rc; } - inode->i_mode = mode; mark_inode_dirty(inode); } /* diff --git a/fs/namei.c b/fs/namei.c index f8c69d373793..445fd5da11fa 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -716,19 +716,25 @@ static int follow_automount(struct path *path, unsigned flags, if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT)) return -EISDIR; /* we actually want to stop here */ - /* We want to mount if someone is trying to open/create a file of any - * type under the mountpoint, wants to traverse through the mountpoint - * or wants to open the mounted directory. - * + /* * We don't want to mount if someone's just doing a stat and they've * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and * appended a '/' to the name. */ - if (!(flags & LOOKUP_FOLLOW) && - !(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE))) - return -EISDIR; - + if (!(flags & LOOKUP_FOLLOW)) { + /* We do, however, want to mount if someone wants to open or + * create a file of any type under the mountpoint, wants to + * traverse through the mountpoint or wants to open the mounted + * directory. + * Also, autofs may mark negative dentries as being automount + * points. These will need the attentions of the daemon to + * instantiate them before they can be used. + */ + if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE)) && + path->dentry->d_inode) + return -EISDIR; + } current->total_link_count++; if (current->total_link_count >= 40) return -ELOOP; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index e49e73107e62..7ef23979896d 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -415,7 +415,7 @@ fail: } int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, - mode_t mode) + umode_t mode) { struct posix_acl *dfacl, *acl; int error = 0; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 38053d823eb0..85f1690ca08c 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -316,7 +316,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags, struct nfs_open_context *ctx) { struct nfs3_createdata *data; - mode_t mode = sattr->ia_mode; + umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call create %s\n", dentry->d_name.name); @@ -562,7 +562,7 @@ static int nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct nfs3_createdata *data; - int mode = sattr->ia_mode; + umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mkdir %s\n", dentry->d_name.name); @@ -681,7 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { struct nfs3_createdata *data; - mode_t mode = sattr->ia_mode; + umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 783c58d9daf1..a7219075b4de 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -247,7 +247,7 @@ static int ocfs2_set_acl(handle_t *handle, case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; ret = posix_acl_equiv_mode(acl, &mode); if (ret < 0) return ret; @@ -351,7 +351,7 @@ int ocfs2_init_acl(handle_t *handle, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl = NULL; int ret = 0, ret2; - mode_t mode; + umode_t mode; if (!S_ISLNK(inode->i_mode)) { if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { diff --git a/fs/posix_acl.c b/fs/posix_acl.c index d43729a760e2..10027b42b7e2 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -149,10 +149,10 @@ posix_acl_valid(const struct posix_acl *acl) * file mode permission bits, or else 1. Returns -E... on error. */ int -posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) +posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) { const struct posix_acl_entry *pa, *pe; - mode_t mode = 0; + umode_t mode = 0; int not_equiv = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { @@ -188,7 +188,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) * Create an ACL representing the file mode permission bits of an inode. */ struct posix_acl * -posix_acl_from_mode(mode_t mode, gfp_t flags) +posix_acl_from_mode(umode_t mode, gfp_t flags) { struct posix_acl *acl = posix_acl_alloc(3, flags); if (!acl) @@ -279,11 +279,11 @@ check_perm: * system calls. All permissions that are not granted by the acl are removed. * The permissions in the acl are changed to reflect the mode_p parameter. */ -static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) +static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) { struct posix_acl_entry *pa, *pe; struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; - mode_t mode = *mode_p; + umode_t mode = *mode_p; int not_equiv = 0; /* assert(atomic_read(acl->a_refcount) == 1); */ @@ -336,7 +336,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) /* * Modify the ACL for the chmod syscall. */ -static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) +static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) { struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; struct posix_acl_entry *pa, *pe; @@ -382,7 +382,7 @@ static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) } int -posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p) +posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; @@ -400,7 +400,7 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p) EXPORT_SYMBOL(posix_acl_create); int -posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, mode_t mode) +posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 977ed2723845..893b961dcfd8 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -39,8 +39,9 @@ #define PSTORE_NAMELEN 64 struct pstore_private { + struct pstore_info *psi; + enum pstore_type_id type; u64 id; - int (*erase)(u64); ssize_t size; char data[]; }; @@ -73,7 +74,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = dentry->d_inode->i_private; - p->erase(p->id); + p->psi->erase(p->type, p->id, p->psi); return simple_unlink(dir, dentry); } @@ -175,8 +176,8 @@ int pstore_is_mounted(void) * Set the mtime & ctime to the date that this record was originally stored. */ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, - char *data, size_t size, - struct timespec time, int (*erase)(u64)) + char *data, size_t size, struct timespec time, + struct pstore_info *psi) { struct dentry *root = pstore_sb->s_root; struct dentry *dentry; @@ -192,8 +193,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, private = kmalloc(sizeof *private + size, GFP_KERNEL); if (!private) goto fail_alloc; + private->type = type; private->id = id; - private->erase = erase; + private->psi = psi; switch (type) { case PSTORE_TYPE_DMESG: diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 8c9f23eb1645..611c1b3c46fa 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -2,5 +2,5 @@ extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(void); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, char *data, size_t size, - struct timespec time, int (*erase)(u64)); + struct timespec time, struct pstore_info *psi); extern int pstore_is_mounted(void); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f2c3ff20ea68..c5300ec31696 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -37,6 +37,8 @@ static DEFINE_SPINLOCK(pstore_lock); static struct pstore_info *psinfo; +static char *backend; + /* How much of the console log to snapshot */ static unsigned long kmsg_bytes = 10240; @@ -67,7 +69,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long size, total = 0; char *dst, *why; u64 id; - int hsize, part = 1; + int hsize; + unsigned int part = 1; if (reason < ARRAY_SIZE(reason_str)) why = reason_str[reason]; @@ -78,7 +81,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, oopscount++; while (total < kmsg_bytes) { dst = psinfo->buf; - hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++); + hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part); size = psinfo->bufsize - hsize; dst += hsize; @@ -94,14 +97,16 @@ static void pstore_dump(struct kmsg_dumper *dumper, memcpy(dst, s1 + s1_start, l1_cpy); memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); - id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy); + id = psinfo->write(PSTORE_TYPE_DMESG, part, + hsize + l1_cpy + l2_cpy, psinfo); if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, hsize + l1_cpy + l2_cpy, - CURRENT_TIME, psinfo->erase); + CURRENT_TIME, psinfo); l1 -= l1_cpy; l2 -= l2_cpy; total += l1_cpy + l2_cpy; + part++; } mutex_unlock(&psinfo->buf_mutex); } @@ -128,6 +133,12 @@ int pstore_register(struct pstore_info *psi) spin_unlock(&pstore_lock); return -EBUSY; } + + if (backend && strcmp(backend, psi->name)) { + spin_unlock(&pstore_lock); + return -EINVAL; + } + psinfo = psi; spin_unlock(&pstore_lock); @@ -166,9 +177,9 @@ void pstore_get_records(void) if (rc) goto out; - while ((size = psi->read(&id, &type, &time)) > 0) { + while ((size = psi->read(&id, &type, &time, psi)) > 0) { if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, - time, psi->erase)) + time, psi)) failed++; } psi->close(psi); @@ -196,12 +207,15 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size) mutex_lock(&psinfo->buf_mutex); memcpy(psinfo->buf, buf, size); - id = psinfo->write(type, size); + id = psinfo->write(type, 0, size, psinfo); if (pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, - size, CURRENT_TIME, psinfo->erase); + size, CURRENT_TIME, psinfo); mutex_unlock(&psinfo->buf_mutex); return 0; } EXPORT_SYMBOL_GPL(pstore_write); + +module_param(backend, charp, 0444); +MODULE_PARM_DESC(backend, "Pstore backend to use"); diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 7362cf4c946a..6da0396e5052 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -272,12 +272,10 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; if (error == 0) acl = NULL; } @@ -354,8 +352,6 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, return PTR_ERR(acl); if (acl) { - mode_t mode = inode->i_mode; - /* Copy the default ACL to the default ACL of a new directory */ if (S_ISDIR(inode->i_mode)) { err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, @@ -366,12 +362,10 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, /* Now we reconcile the new ACL and the mode, potentially modifying both */ - err = posix_acl_create(&acl, GFP_NOFS, &mode); + err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (err < 0) return err; - inode->i_mode = mode; - /* If we need an ACL.. */ if (err > 0) err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 44ce51656804..b6c4b3795c4a 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -221,7 +221,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } static int -xfs_set_mode(struct inode *inode, mode_t mode) +xfs_set_mode(struct inode *inode, umode_t mode) { int error = 0; @@ -267,7 +267,7 @@ posix_acl_default_exists(struct inode *inode) int xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; int error = 0, inherit = 0; if (S_ISDIR(inode->i_mode)) { @@ -381,7 +381,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, goto out_release; if (type == ACL_TYPE_ACCESS) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error <= 0) { diff --git a/include/linux/amba/pl08x.h b/include/linux/amba/pl08x.h index 3111385b8ca7..e6e28f37d8ec 100644 --- a/include/linux/amba/pl08x.h +++ b/include/linux/amba/pl08x.h @@ -172,8 +172,11 @@ struct pl08x_dma_chan { int phychan_hold; struct tasklet_struct tasklet; char *name; - struct pl08x_channel_data *cd; - dma_addr_t runtime_addr; + const struct pl08x_channel_data *cd; + dma_addr_t src_addr; + dma_addr_t dst_addr; + u32 src_cctl; + u32 dst_cctl; enum dma_data_direction runtime_direction; dma_cookie_t lc; struct list_head pend_list; @@ -202,7 +205,7 @@ struct pl08x_dma_chan { * @mem_buses: buses which memory can be accessed from: PL08X_AHB1 | PL08X_AHB2 */ struct pl08x_platform_data { - struct pl08x_channel_data *slave_channels; + const struct pl08x_channel_data *slave_channels; unsigned int num_slave_channels; struct pl08x_channel_data memcpy_channel; int (*get_signal)(struct pl08x_dma_chan *); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 4427e0454051..3fa1f3d90ce0 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -208,6 +208,49 @@ struct dm_target_callbacks { int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); +/* + * Target argument parsing. + */ +struct dm_arg_set { + unsigned argc; + char **argv; +}; + +/* + * The minimum and maximum value of a numeric argument, together with + * the error message to use if the number is found to be outside that range. + */ +struct dm_arg { + unsigned min; + unsigned max; + char *error; +}; + +/* + * Validate the next argument, either returning it as *value or, if invalid, + * returning -EINVAL and setting *error. + */ +int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error); + +/* + * Process the next argument as the start of a group containing between + * arg->min and arg->max further arguments. Either return the size as + * *num_args or, if invalid, return -EINVAL and set *error. + */ +int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *num_args, char **error); + +/* + * Return the current argument and shift to the next. + */ +const char *dm_shift_arg(struct dm_arg_set *as); + +/* + * Move through num_args arguments. + */ +void dm_consume_args(struct dm_arg_set *as, unsigned num_args); + /*----------------------------------------------------------------- * Functions for creating and manipulating mapped devices. * Drop the reference with dm_put when you finish with the object. diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 3708455ee6c3..0cb8eff76bd6 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 20 +#define DM_VERSION_MINOR 21 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2011-02-02)" +#define DM_VERSION_EXTRA "-ioctl (2011-07-06)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ diff --git a/include/linux/dm-kcopyd.h b/include/linux/dm-kcopyd.h index 298d587e349b..5e54458e920f 100644 --- a/include/linux/dm-kcopyd.h +++ b/include/linux/dm-kcopyd.h @@ -42,5 +42,20 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, unsigned num_dests, struct dm_io_region *dests, unsigned flags, dm_kcopyd_notify_fn fn, void *context); +/* + * Prepare a callback and submit it via the kcopyd thread. + * + * dm_kcopyd_prepare_callback allocates a callback structure and returns it. + * It must not be called from interrupt context. + * The returned value should be passed into dm_kcopyd_do_callback. + * + * dm_kcopyd_do_callback submits the callback. + * It may be called from interrupt context. + * The callback is issued from the kcopyd thread. + */ +void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, + dm_kcopyd_notify_fn fn, void *context); +void dm_kcopyd_do_callback(void *job, int read_err, unsigned long write_err); + #endif /* __KERNEL__ */ #endif /* _LINUX_DM_KCOPYD_H */ diff --git a/include/linux/efi.h b/include/linux/efi.h index ec2572693925..2362a0bc7f0d 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -19,6 +19,7 @@ #include <linux/rtc.h> #include <linux/ioport.h> #include <linux/pfn.h> +#include <linux/pstore.h> #include <asm/page.h> #include <asm/system.h> @@ -232,6 +233,9 @@ typedef efi_status_t efi_query_capsule_caps_t(efi_capsule_header_t **capsules, #define UV_SYSTEM_TABLE_GUID \ EFI_GUID( 0x3b13a7d4, 0x633e, 0x11dd, 0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93 ) +#define LINUX_EFI_CRASH_GUID \ + EFI_GUID( 0xcfc8fc79, 0xbe2e, 0x4ddc, 0x97, 0xf0, 0x9f, 0x98, 0xbf, 0xe2, 0x98, 0xa0 ) + typedef struct { efi_guid_t guid; unsigned long table; @@ -458,6 +462,8 @@ struct efivars { struct kset *kset; struct bin_attribute *new_var, *del_var; const struct efivar_operations *ops; + struct efivar_entry *walk_entry; + struct pstore_info efi_pstore_info; }; int register_efivars(struct efivars *efivars, diff --git a/include/linux/fs.h b/include/linux/fs.h index f23bcb77260c..786b3b1113cf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2317,11 +2317,18 @@ extern int should_remove_suid(struct dentry *); extern int file_remove_suid(struct file *); extern void __insert_inode_hash(struct inode *, unsigned long hashval); -extern void remove_inode_hash(struct inode *); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } + +extern void __remove_inode_hash(struct inode *); +static inline void remove_inode_hash(struct inode *inode) +{ + if (!inode_unhashed(inode)) + __remove_inode_hash(inode); +} + extern void inode_sb_list_add(struct inode *inode); #ifdef CONFIG_BLOCK diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d087c2e7b2aa..38f307b8c334 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1329,12 +1329,6 @@ extern int jbd_blocks_per_page(struct inode *inode); #define BUFFER_TRACE2(bh, bh2, info) do {} while (0) #define JBUFFER_TRACE(jh, info) do {} while (0) -/* - * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 - * tracing infrastructure to map a dev_t to a device name. - */ -extern const char *jbd2_dev_to_name(dev_t device); - #endif /* __KERNEL__ */ #endif /* _LINUX_JBD2_H */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b96fb99072ff..eaac770f886e 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -569,12 +569,12 @@ extern struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type); extern int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl); extern int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, - mode_t mode); + umode_t mode); extern void nfs3_forget_cached_acls(struct inode *inode); #else static inline int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, - mode_t mode) + umode_t mode) { return 0; } diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 9a53b99818e2..951bba82d50d 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -75,10 +75,10 @@ extern void posix_acl_init(struct posix_acl *, int); extern struct posix_acl *posix_acl_alloc(int, gfp_t); extern int posix_acl_valid(const struct posix_acl *); extern int posix_acl_permission(struct inode *, const struct posix_acl *, int); -extern struct posix_acl *posix_acl_from_mode(mode_t, gfp_t); -extern int posix_acl_equiv_mode(const struct posix_acl *, mode_t *); -extern int posix_acl_create(struct posix_acl **, gfp_t, mode_t *); -extern int posix_acl_chmod(struct posix_acl **, gfp_t, mode_t); +extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); +extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); +extern int posix_acl_create(struct posix_acl **, gfp_t, umode_t *); +extern int posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct inode *, int, struct posix_acl *); diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 2455ef2683f0..cc03bbf5c4b8 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -38,9 +38,12 @@ struct pstore_info { int (*open)(struct pstore_info *psi); int (*close)(struct pstore_info *psi); ssize_t (*read)(u64 *id, enum pstore_type_id *type, - struct timespec *time); - u64 (*write)(enum pstore_type_id type, size_t size); - int (*erase)(u64 id); + struct timespec *time, struct pstore_info *psi); + u64 (*write)(enum pstore_type_id type, unsigned int part, + size_t size, struct pstore_info *psi); + int (*erase)(enum pstore_type_id type, u64 id, + struct pstore_info *psi); + void *data; }; #ifdef CONFIG_PSTORE diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 9e87c1cb7270..26f6ea4444e3 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -122,6 +122,9 @@ struct regulator; struct regulator_bulk_data { const char *supply; struct regulator *consumer; + + /* Internal use */ + int ret; }; #if defined(CONFIG_REGULATOR) diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 6c433b89c80d..1a80bc77517d 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -188,18 +188,16 @@ struct regulator_dev { /* lists we belong to */ struct list_head list; /* list of all regulators */ - struct list_head slist; /* list of supplied regulators */ /* lists we own */ struct list_head consumer_list; /* consumers we supply */ - struct list_head supply_list; /* regulators we supply */ struct blocking_notifier_head notifier; struct mutex mutex; /* consumer lock */ struct module *owner; struct device dev; struct regulation_constraints *constraints; - struct regulator_dev *supply; /* for tree */ + struct regulator *supply; /* for tree */ void *reg_data; /* regulator_dev data */ diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6363193a3418..b50a54736242 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -23,7 +23,7 @@ TRACE_EVENT(ext4_free_inode, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( __u16, mode ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u64, blocks ) @@ -52,7 +52,7 @@ TRACE_EVENT(ext4_request_inode, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, dir ) - __field( umode_t, mode ) + __field( __u16, mode ) ), TP_fast_assign( @@ -75,7 +75,7 @@ TRACE_EVENT(ext4_allocate_inode, __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, dir ) - __field( umode_t, mode ) + __field( __u16, mode ) ), TP_fast_assign( @@ -725,7 +725,7 @@ TRACE_EVENT(ext4_free_blocks, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( __u16, mode ) __field( __u64, block ) __field( unsigned long, count ) __field( int, flags ) @@ -1012,7 +1012,7 @@ TRACE_EVENT(ext4_forget, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( __u16, mode ) __field( int, is_metadata ) __field( __u64, block ) ), @@ -1039,7 +1039,7 @@ TRACE_EVENT(ext4_da_update_reserve_space, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( __u16, mode ) __field( __u64, i_blocks ) __field( int, used_blocks ) __field( int, reserved_data_blocks ) @@ -1076,7 +1076,7 @@ TRACE_EVENT(ext4_da_reserve_space, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( __u16, mode ) __field( __u64, i_blocks ) __field( int, md_needed ) __field( int, reserved_data_blocks ) @@ -1110,7 +1110,7 @@ TRACE_EVENT(ext4_da_release_space, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( umode_t, mode ) + __field( __u16, mode ) __field( __u64, i_blocks ) __field( int, freed_blocks ) __field( int, reserved_data_blocks ) @@ -1518,6 +1518,77 @@ TRACE_EVENT(ext4_load_inode, (unsigned long) __entry->ino) ); +TRACE_EVENT(ext4_journal_start, + TP_PROTO(struct super_block *sb, int nblocks, unsigned long IP), + + TP_ARGS(sb, nblocks, IP), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, nblocks ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->nblocks = nblocks; + __entry->ip = IP; + ), + + TP_printk("dev %d,%d nblocks %d caller %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nblocks, (void *)__entry->ip) +); + +DECLARE_EVENT_CLASS(ext4__trim, + TP_PROTO(struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), + + TP_ARGS(sb, group, start, len), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( __u32, group ) + __field( int, start ) + __field( int, len ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); + __entry->group = group; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("dev %d,%d group %u, start %d, len %d", + __entry->dev_major, __entry->dev_minor, + __entry->group, __entry->start, __entry->len) +); + +DEFINE_EVENT(ext4__trim, ext4_trim_extent, + + TP_PROTO(struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), + + TP_ARGS(sb, group, start, len) +); + +DEFINE_EVENT(ext4__trim, ext4_trim_all_free, + + TP_PROTO(struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), + + TP_ARGS(sb, group, start, len) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index bf16545cc977..75964412ddbb 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -26,8 +26,8 @@ TRACE_EVENT(jbd2_checkpoint, __entry->result = result; ), - TP_printk("dev %s result %d", - jbd2_dev_to_name(__entry->dev), __entry->result) + TP_printk("dev %d,%d result %d", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->result) ); DECLARE_EVENT_CLASS(jbd2_commit, @@ -48,9 +48,9 @@ DECLARE_EVENT_CLASS(jbd2_commit, __entry->transaction = commit_transaction->t_tid; ), - TP_printk("dev %s transaction %d sync %d", - jbd2_dev_to_name(__entry->dev), __entry->transaction, - __entry->sync_commit) + TP_printk("dev %d,%d transaction %d sync %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit) ); DEFINE_EVENT(jbd2_commit, jbd2_start_commit, @@ -100,9 +100,9 @@ TRACE_EVENT(jbd2_end_commit, __entry->head = journal->j_tail_sequence; ), - TP_printk("dev %s transaction %d sync %d head %d", - jbd2_dev_to_name(__entry->dev), __entry->transaction, - __entry->sync_commit, __entry->head) + TP_printk("dev %d,%d transaction %d sync %d head %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit, __entry->head) ); TRACE_EVENT(jbd2_submit_inode_data, @@ -120,8 +120,9 @@ TRACE_EVENT(jbd2_submit_inode_data, __entry->ino = inode->i_ino; ), - TP_printk("dev %s ino %lu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) + TP_printk("dev %d,%d ino %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino) ); TRACE_EVENT(jbd2_run_stats, @@ -156,9 +157,9 @@ TRACE_EVENT(jbd2_run_stats, __entry->blocks_logged = stats->rs_blocks_logged; ), - TP_printk("dev %s tid %lu wait %u running %u locked %u flushing %u " + TP_printk("dev %d,%d tid %lu wait %u running %u locked %u flushing %u " "logging %u handle_count %u blocks %u blocks_logged %u", - jbd2_dev_to_name(__entry->dev), __entry->tid, + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->wait), jiffies_to_msecs(__entry->running), jiffies_to_msecs(__entry->locked), @@ -192,9 +193,9 @@ TRACE_EVENT(jbd2_checkpoint_stats, __entry->dropped = stats->cs_dropped; ), - TP_printk("dev %s tid %lu chp_time %u forced_to_close %u " + TP_printk("dev %d,%d tid %lu chp_time %u forced_to_close %u " "written %u dropped %u", - jbd2_dev_to_name(__entry->dev), __entry->tid, + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->chp_time), __entry->forced_to_close, __entry->written, __entry->dropped) ); @@ -222,9 +223,10 @@ TRACE_EVENT(jbd2_cleanup_journal_tail, __entry->freed = freed; ), - TP_printk("dev %s from %u to %u offset %lu freed %lu", - jbd2_dev_to_name(__entry->dev), __entry->tail_sequence, - __entry->first_tid, __entry->block_nr, __entry->freed) + TP_printk("dev %d,%d from %u to %u offset %lu freed %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tail_sequence, __entry->first_tid, + __entry->block_nr, __entry->freed) ); #endif /* _TRACE_JBD2_H */ diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index a11db956dd62..34872482315e 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -42,6 +42,8 @@ /* Our I/O buffers. */ static char remcom_in_buffer[BUFMAX]; static char remcom_out_buffer[BUFMAX]; +static int gdbstub_use_prev_in_buf; +static int gdbstub_prev_in_buf_pos; /* Storage for the registers, in GDB format. */ static unsigned long gdb_regs[(NUMREGBYTES + @@ -58,6 +60,13 @@ static int gdbstub_read_wait(void) int ret = -1; int i; + if (unlikely(gdbstub_use_prev_in_buf)) { + if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf) + return remcom_in_buffer[gdbstub_prev_in_buf_pos++]; + else + gdbstub_use_prev_in_buf = 0; + } + /* poll any additional I/O interfaces that are defined */ while (ret < 0) for (i = 0; kdb_poll_funcs[i] != NULL; i++) { @@ -109,7 +118,6 @@ static void get_packet(char *buffer) buffer[count] = ch; count = count + 1; } - buffer[count] = 0; if (ch == '#') { xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; @@ -124,6 +132,7 @@ static void get_packet(char *buffer) if (dbg_io_ops->flush) dbg_io_ops->flush(); } + buffer[count] = 0; } while (checksum != xmitcsum); } @@ -1082,12 +1091,11 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) case 'c': strcpy(remcom_in_buffer, cmd); return 0; - case '?': - gdb_cmd_status(ks); - break; - case '\0': - strcpy(remcom_out_buffer, ""); - break; + case '$': + strcpy(remcom_in_buffer, cmd); + gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); + gdbstub_prev_in_buf_pos = 0; + return 0; } dbg_io_ops->write_char('+'); put_packet(remcom_out_buffer); diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 2f62fe85f16a..7179eac7b41c 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -112,9 +112,8 @@ kdb_bt(int argc, const char **argv) unsigned long addr; long offset; - kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ - kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each - * proc in bta */ + /* Prompt after each proc in bta */ + kdbgetintenv("BTAPROMPT", &btaprompt); if (strcmp(argv[0], "bta") == 0) { struct task_struct *g, *p; diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds index 56c88e4db309..9834ad303ab6 100644 --- a/kernel/debug/kdb/kdb_cmds +++ b/kernel/debug/kdb/kdb_cmds @@ -18,16 +18,12 @@ defcmd dumpcommon "" "Common kdb debugging" endefcmd defcmd dumpall "" "First line debugging" - set BTSYMARG 1 - set BTARGS 9 pid R -dumpcommon -bta endefcmd defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" - set BTSYMARG 1 - set BTARGS 9 pid R -dumpcommon -btc diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index dd0b1b7dd02c..d9ca9aa481ec 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -30,6 +30,8 @@ EXPORT_SYMBOL_GPL(kdb_poll_funcs); int kdb_poll_idx = 1; EXPORT_SYMBOL_GPL(kdb_poll_idx); +static struct kgdb_state *kdb_ks; + int kdb_stub(struct kgdb_state *ks) { int error = 0; @@ -39,6 +41,7 @@ int kdb_stub(struct kgdb_state *ks) kdb_dbtrap_t db_result = KDB_DB_NOBPT; int i; + kdb_ks = ks; if (KDB_STATE(REENTRY)) { reason = KDB_REASON_SWITCH; KDB_STATE_CLEAR(REENTRY); @@ -123,20 +126,8 @@ int kdb_stub(struct kgdb_state *ks) KDB_STATE_CLEAR(PAGER); kdbnearsym_cleanup(); if (error == KDB_CMD_KGDB) { - if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { - /* - * This inteface glue which allows kdb to transition in into - * the gdb stub. In order to do this the '?' or '' gdb serial - * packet response is processed here. And then control is - * passed to the gdbstub. - */ - if (KDB_STATE(DOING_KGDB)) - gdbstub_state(ks, "?"); - else - gdbstub_state(ks, ""); + if (KDB_STATE(DOING_KGDB)) KDB_STATE_CLEAR(DOING_KGDB); - KDB_STATE_CLEAR(DOING_KGDB2); - } return DBG_PASS_EVENT; } kdb_bp_install(ks->linux_regs); @@ -166,3 +157,7 @@ int kdb_stub(struct kgdb_state *ks) return kgdb_info[ks->cpu].ret_state; } +void kdb_gdb_state_pass(char *buf) +{ + gdbstub_state(kdb_ks, buf); +} diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 96fdaac46a80..4802eb5840e1 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -31,15 +31,21 @@ char kdb_prompt_str[CMD_BUFLEN]; int kdb_trap_printk; -static void kgdb_transition_check(char *buffer) +static int kgdb_transition_check(char *buffer) { - int slen = strlen(buffer); - if (strncmp(buffer, "$?#3f", slen) != 0 && - strncmp(buffer, "$qSupported#37", slen) != 0 && - strncmp(buffer, "+$qSupported#37", slen) != 0) { + if (buffer[0] != '+' && buffer[0] != '$') { KDB_STATE_SET(KGDB_TRANS); kdb_printf("%s", buffer); + } else { + int slen = strlen(buffer); + if (slen > 3 && buffer[slen - 3] == '#') { + kdb_gdb_state_pass(buffer); + strcpy(buffer, "kgdb"); + KDB_STATE_SET(DOING_KGDB); + return 1; + } } + return 0; } static int kdb_read_get_key(char *buffer, size_t bufsize) @@ -251,6 +257,10 @@ poll_again: case 13: /* enter */ *lastchar++ = '\n'; *lastchar++ = '\0'; + if (!KDB_STATE(KGDB_TRANS)) { + KDB_STATE_SET(KGDB_TRANS); + kdb_printf("%s", buffer); + } kdb_printf("\n"); return buffer; case 4: /* Del */ @@ -382,22 +392,26 @@ poll_again: * printed characters if we think that * kgdb is connecting, until the check * fails */ - if (!KDB_STATE(KGDB_TRANS)) - kgdb_transition_check(buffer); - else + if (!KDB_STATE(KGDB_TRANS)) { + if (kgdb_transition_check(buffer)) + return buffer; + } else { kdb_printf("%c", key); + } } /* Special escape to kgdb */ if (lastchar - buffer >= 5 && strcmp(lastchar - 5, "$?#3f") == 0) { + kdb_gdb_state_pass(lastchar - 5); strcpy(buffer, "kgdb"); KDB_STATE_SET(DOING_KGDB); return buffer; } - if (lastchar - buffer >= 14 && - strcmp(lastchar - 14, "$qSupported#37") == 0) { + if (lastchar - buffer >= 11 && + strcmp(lastchar - 11, "$qSupported") == 0) { + kdb_gdb_state_pass(lastchar - 11); strcpy(buffer, "kgdb"); - KDB_STATE_SET(DOING_KGDB2); + KDB_STATE_SET(DOING_KGDB); return buffer; } } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index be14779bcef6..63786e71a3cd 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -145,7 +145,6 @@ static char *__env[] = { #endif "RADIX=16", "MDCOUNT=8", /* lines of md output */ - "BTARGS=9", /* 9 possible args in bt */ KDB_PLATFORM_ENV, "DTABCOUNT=30", "NOSECT=1", @@ -172,6 +171,7 @@ static char *__env[] = { (char *)0, (char *)0, (char *)0, + (char *)0, }; static const int __nenv = (sizeof(__env) / sizeof(char *)); @@ -1386,7 +1386,7 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, } if (result == KDB_CMD_KGDB) { - if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) + if (!KDB_STATE(DOING_KGDB)) kdb_printf("Entering please attach debugger " "or use $D#44+ or $3#33\n"); break; diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 35d69ed1dfb5..e381d105b40b 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -21,7 +21,6 @@ #define KDB_CMD_SS (-1003) #define KDB_CMD_SSB (-1004) #define KDB_CMD_KGDB (-1005) -#define KDB_CMD_KGDB2 (-1006) /* Internal debug flags */ #define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ @@ -146,7 +145,6 @@ extern int kdb_state; * keyboard on this cpu */ #define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ #define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ -#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */ #define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ #define KDB_STATE_ARCH 0xff000000 /* Reserved for arch * specific use */ @@ -218,6 +216,7 @@ extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); extern void kdb_meminfo_proc_show(void); extern char *kdb_getstr(char *, size_t, char *); +extern void kdb_gdb_state_pass(char *buf); /* Defines for kdb_symbol_print */ #define KDB_SP_SPACEB 0x0001 /* Space before string */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eafff89b3dd6..626303b52f3c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -303,7 +303,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, do_each_thread(g, p) { unsigned int points; - if (!p->mm) + if (p->exit_state) continue; if (oom_unkillable_task(p, mem, nodemask)) continue; @@ -319,6 +319,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, */ if (test_tsk_thread_flag(p, TIF_MEMDIE)) return ERR_PTR(-1UL); + if (!p->mm) + continue; if (p->flags & PF_EXITING) { /* diff --git a/sound/core/pcm_compat.c b/sound/core/pcm_compat.c index 5fb2e28e796f..91cdf9435fec 100644 --- a/sound/core/pcm_compat.c +++ b/sound/core/pcm_compat.c @@ -342,7 +342,7 @@ static int snd_pcm_ioctl_xfern_compat(struct snd_pcm_substream *substream, kfree(bufs); return -EFAULT; } - bufs[ch] = compat_ptr(ptr); + bufs[i] = compat_ptr(ptr); bufptr++; } if (dir == SNDRV_PCM_STREAM_PLAYBACK) diff --git a/sound/core/rtctimer.c b/sound/core/rtctimer.c index 0851cd13e303..e85e72baff9e 100644 --- a/sound/core/rtctimer.c +++ b/sound/core/rtctimer.c @@ -22,7 +22,7 @@ #include <linux/init.h> #include <linux/interrupt.h> -#include <linux/moduleparam.h> +#include <linux/module.h> #include <linux/log2.h> #include <sound/core.h> #include <sound/timer.h> diff --git a/sound/pci/asihpi/hpidspcd.c b/sound/pci/asihpi/hpidspcd.c index 3a7afa31c1d8..71d32c868c92 100644 --- a/sound/pci/asihpi/hpidspcd.c +++ b/sound/pci/asihpi/hpidspcd.c @@ -43,6 +43,7 @@ short hpi_dsp_code_open(u32 adapter, void *os_data, struct dsp_code *dsp_code, struct pci_dev *dev = os_data; struct code_header header; char fw_name[20]; + short err_ret = HPI_ERROR_DSP_FILE_NOT_FOUND; int err; sprintf(fw_name, "asihpi/dsp%04x.bin", adapter); @@ -85,8 +86,10 @@ short hpi_dsp_code_open(u32 adapter, void *os_data, struct dsp_code *dsp_code, HPI_DEBUG_LOG(DEBUG, "dsp code %s opened\n", fw_name); dsp_code->pvt = kmalloc(sizeof(*dsp_code->pvt), GFP_KERNEL); - if (!dsp_code->pvt) - return HPI_ERROR_MEMORY_ALLOC; + if (!dsp_code->pvt) { + err_ret = HPI_ERROR_MEMORY_ALLOC; + goto error2; + } dsp_code->pvt->dev = dev; dsp_code->pvt->firmware = firmware; @@ -99,7 +102,7 @@ error2: release_firmware(firmware); error1: dsp_code->block_length = 0; - return HPI_ERROR_DSP_FILE_NOT_FOUND; + return err_ret; } /*-------------------------------------------------------------------*/ diff --git a/sound/pci/asihpi/hpioctl.c b/sound/pci/asihpi/hpioctl.c index 9683f84ecdc8..a32502e796de 100644 --- a/sound/pci/asihpi/hpioctl.c +++ b/sound/pci/asihpi/hpioctl.c @@ -177,16 +177,21 @@ long asihpi_hpi_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } else { u16 __user *ptr = NULL; u32 size = 0; - + u32 adapter_present; /* -1=no data 0=read from user mem, 1=write to user mem */ int wrflag = -1; - u32 adapter = hm->h.adapter_index; - struct hpi_adapter *pa = &adapters[adapter]; + struct hpi_adapter *pa; + + if (hm->h.adapter_index < HPI_MAX_ADAPTERS) { + pa = &adapters[hm->h.adapter_index]; + adapter_present = pa->type; + } else { + adapter_present = 0; + } - if ((adapter >= HPI_MAX_ADAPTERS) || (!pa->type)) { - hpi_init_response(&hr->r0, HPI_OBJ_ADAPTER, - HPI_ADAPTER_OPEN, - HPI_ERROR_BAD_ADAPTER_NUMBER); + if (!adapter_present) { + hpi_init_response(&hr->r0, hm->h.object, + hm->h.function, HPI_ERROR_BAD_ADAPTER_NUMBER); uncopied_bytes = copy_to_user(puhr, hr, sizeof(hr->h)); diff --git a/sound/pci/rme9652/hdspm.c b/sound/pci/rme9652/hdspm.c index af130ee0c45d..6edc67ced905 100644 --- a/sound/pci/rme9652/hdspm.c +++ b/sound/pci/rme9652/hdspm.c @@ -521,6 +521,7 @@ MODULE_SUPPORTED_DEVICE("{{RME HDSPM-MADI}}"); #define HDSPM_DMA_AREA_KILOBYTES (HDSPM_DMA_AREA_BYTES/1024) /* revisions >= 230 indicate AES32 card */ +#define HDSPM_MADI_ANCIENT_REV 204 #define HDSPM_MADI_OLD_REV 207 #define HDSPM_MADI_REV 210 #define HDSPM_RAYDAT_REV 211 @@ -1217,6 +1218,22 @@ static int hdspm_external_sample_rate(struct hdspm *hdspm) rate = 0; break; } + + /* QS and DS rates normally can not be detected + * automatically by the card. Only exception is MADI + * in 96k frame mode. + * + * So if we read SS values (32 .. 48k), check for + * user-provided DS/QS bits in the control register + * and multiply the base frequency accordingly. + */ + if (rate <= 48000) { + if (hdspm->control_register & HDSPM_QuadSpeed) + rate *= 4; + else if (hdspm->control_register & + HDSPM_DoubleSpeed) + rate *= 2; + } } break; } @@ -3415,6 +3432,91 @@ static int snd_hdspm_put_qs_wire(struct snd_kcontrol *kcontrol, return change; } +#define HDSPM_MADI_SPEEDMODE(xname, xindex) \ +{ .iface = SNDRV_CTL_ELEM_IFACE_MIXER, \ + .name = xname, \ + .index = xindex, \ + .info = snd_hdspm_info_madi_speedmode, \ + .get = snd_hdspm_get_madi_speedmode, \ + .put = snd_hdspm_put_madi_speedmode \ +} + +static int hdspm_madi_speedmode(struct hdspm *hdspm) +{ + if (hdspm->control_register & HDSPM_QuadSpeed) + return 2; + if (hdspm->control_register & HDSPM_DoubleSpeed) + return 1; + return 0; +} + +static int hdspm_set_madi_speedmode(struct hdspm *hdspm, int mode) +{ + hdspm->control_register &= ~(HDSPM_DoubleSpeed | HDSPM_QuadSpeed); + switch (mode) { + case 0: + break; + case 1: + hdspm->control_register |= HDSPM_DoubleSpeed; + break; + case 2: + hdspm->control_register |= HDSPM_QuadSpeed; + break; + } + hdspm_write(hdspm, HDSPM_controlRegister, hdspm->control_register); + + return 0; +} + +static int snd_hdspm_info_madi_speedmode(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + static char *texts[] = { "Single", "Double", "Quad" }; + + uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; + uinfo->count = 1; + uinfo->value.enumerated.items = 3; + + if (uinfo->value.enumerated.item >= uinfo->value.enumerated.items) + uinfo->value.enumerated.item = + uinfo->value.enumerated.items - 1; + strcpy(uinfo->value.enumerated.name, + texts[uinfo->value.enumerated.item]); + + return 0; +} + +static int snd_hdspm_get_madi_speedmode(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct hdspm *hdspm = snd_kcontrol_chip(kcontrol); + + spin_lock_irq(&hdspm->lock); + ucontrol->value.enumerated.item[0] = hdspm_madi_speedmode(hdspm); + spin_unlock_irq(&hdspm->lock); + return 0; +} + +static int snd_hdspm_put_madi_speedmode(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct hdspm *hdspm = snd_kcontrol_chip(kcontrol); + int change; + int val; + + if (!snd_hdspm_use_is_exclusive(hdspm)) + return -EBUSY; + val = ucontrol->value.integer.value[0]; + if (val < 0) + val = 0; + if (val > 2) + val = 2; + spin_lock_irq(&hdspm->lock); + change = val != hdspm_madi_speedmode(hdspm); + hdspm_set_madi_speedmode(hdspm, val); + spin_unlock_irq(&hdspm->lock); + return change; +} #define HDSPM_MIXER(xname, xindex) \ { .iface = SNDRV_CTL_ELEM_IFACE_HWDEP, \ @@ -4289,7 +4391,8 @@ static struct snd_kcontrol_new snd_hdspm_controls_madi[] = { HDSPM_TX_64("TX 64 channels mode", 0), HDSPM_C_TMS("Clear Track Marker", 0), HDSPM_SAFE_MODE("Safe Mode", 0), - HDSPM_INPUT_SELECT("Input Select", 0) + HDSPM_INPUT_SELECT("Input Select", 0), + HDSPM_MADI_SPEEDMODE("MADI Speed Mode", 0) }; @@ -4302,7 +4405,8 @@ static struct snd_kcontrol_new snd_hdspm_controls_madiface[] = { HDSPM_SYNC_CHECK("MADI SyncCheck", 0), HDSPM_TX_64("TX 64 channels mode", 0), HDSPM_C_TMS("Clear Track Marker", 0), - HDSPM_SAFE_MODE("Safe Mode", 0) + HDSPM_SAFE_MODE("Safe Mode", 0), + HDSPM_MADI_SPEEDMODE("MADI Speed Mode", 0) }; static struct snd_kcontrol_new snd_hdspm_controls_aio[] = { @@ -6381,6 +6485,7 @@ static int __devinit snd_hdspm_create(struct snd_card *card, switch (hdspm->firmware_rev) { case HDSPM_MADI_REV: case HDSPM_MADI_OLD_REV: + case HDSPM_MADI_ANCIENT_REV: hdspm->io_type = MADI; hdspm->card_name = "RME MADI"; hdspm->midiPorts = 3; diff --git a/sound/soc/txx9/txx9aclc.c b/sound/soc/txx9/txx9aclc.c index 34aa972669ed..3de99af8cb82 100644 --- a/sound/soc/txx9/txx9aclc.c +++ b/sound/soc/txx9/txx9aclc.c @@ -290,6 +290,7 @@ static void txx9aclc_pcm_free_dma_buffers(struct snd_pcm *pcm) static int txx9aclc_pcm_new(struct snd_soc_pcm_runtime *rtd) { + struct snd_card *card = rtd->card->snd_card; struct snd_soc_dai *dai = rtd->cpu_dai; struct snd_pcm *pcm = rtd->pcm; struct platform_device *pdev = to_platform_device(dai->platform->dev); |