aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block/badblocks.c1618
-rw-r--r--block/blk-flush.c11
-rw-r--r--block/blk-mq-debugfs.c1
-rw-r--r--block/blk-mq.c45
-rw-r--r--block/blk-mq.h57
-rw-r--r--block/partitions/ibm.c98
-rw-r--r--drivers/block/aoe/aoenet.c3
-rw-r--r--drivers/block/null_blk/main.c22
-rw-r--r--drivers/block/virtio_blk.c2
-rw-r--r--drivers/cdrom/cdrom.c1
-rw-r--r--drivers/md/dm-raid.c17
-rw-r--r--drivers/md/md-autodetect.c4
-rw-r--r--drivers/md/md-bitmap.c61
-rw-r--r--drivers/md/md-linear.c28
-rw-r--r--drivers/md/md-linear.h2
-rw-r--r--drivers/md/md.c805
-rw-r--r--drivers/md/md.h52
-rw-r--r--drivers/md/raid1.c6
-rw-r--r--drivers/md/raid10.c3
-rw-r--r--drivers/md/raid5-cache.c64
-rw-r--r--drivers/md/raid5.c59
-rw-r--r--drivers/nvme/host/pci.c1
-rw-r--r--include/linux/badblocks.h30
-rw-r--r--include/linux/blk-mq.h2
24 files changed, 2094 insertions, 898 deletions
diff --git a/block/badblocks.c b/block/badblocks.c
index 3afb550c0f7b..fc92d4e18aa3 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -16,119 +16,830 @@
#include <linux/types.h>
#include <linux/slab.h>
-/**
- * badblocks_check() - check a given range for bad sectors
- * @bb: the badblocks structure that holds all badblock information
- * @s: sector (start) at which to check for badblocks
- * @sectors: number of sectors to check for badblocks
- * @first_bad: pointer to store location of the first badblock
- * @bad_sectors: pointer to store number of badblocks after @first_bad
+/*
+ * The purpose of badblocks set/clear is to manage bad blocks ranges which are
+ * identified by LBA addresses.
*
- * We can record which blocks on each device are 'bad' and so just
- * fail those blocks, or that stripe, rather than the whole device.
- * Entries in the bad-block table are 64bits wide. This comprises:
- * Length of bad-range, in sectors: 0-511 for lengths 1-512
- * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
- * A 'shift' can be set so that larger blocks are tracked and
- * consequently larger devices can be covered.
- * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ * When the caller of badblocks_set() wants to set a range of bad blocks, the
+ * setting range can be acked or unacked. And the setting range may merge,
+ * overwrite, skip the overlapped already set range, depends on who they are
+ * overlapped or adjacent, and the acknowledgment type of the ranges. It can be
+ * more complicated when the setting range covers multiple already set bad block
+ * ranges, with restrictions of maximum length of each bad range and the bad
+ * table space limitation.
*
- * Locking of the bad-block table uses a seqlock so badblocks_check
- * might need to retry if it is very unlucky.
- * We will sometimes want to check for bad blocks in a bi_end_io function,
- * so we use the write_seqlock_irq variant.
+ * It is difficult and unnecessary to take care of all the possible situations,
+ * for setting a large range of bad blocks, we can handle it by dividing the
+ * large range into smaller ones when encounter overlap, max range length or
+ * bad table full conditions. Every time only a smaller piece of the bad range
+ * is handled with a limited number of conditions how it is interacted with
+ * possible overlapped or adjacent already set bad block ranges. Then the hard
+ * complicated problem can be much simpler to handle in proper way.
*
- * When looking for a bad block we specify a range and want to
- * know if any block in the range is bad. So we binary-search
- * to the last range that starts at-or-before the given endpoint,
- * (or "before the sector after the target range")
- * then see if it ends after the given start.
+ * When setting a range of bad blocks to the bad table, the simplified situations
+ * to be considered are, (The already set bad blocks ranges are naming with
+ * prefix E, and the setting bad blocks range is naming with prefix S)
*
- * Return:
- * 0: there are no known bad blocks in the range
- * 1: there are known bad block which are all acknowledged
- * -1: there are bad blocks which have not yet been acknowledged in metadata.
- * plus the start/length of the first bad section we overlap.
+ * 1) A setting range is not overlapped or adjacent to any other already set bad
+ * block range.
+ * +--------+
+ * | S |
+ * +--------+
+ * +-------------+ +-------------+
+ * | E1 | | E2 |
+ * +-------------+ +-------------+
+ * For this situation if the bad blocks table is not full, just allocate a
+ * free slot from the bad blocks table to mark the setting range S. The
+ * result is,
+ * +-------------+ +--------+ +-------------+
+ * | E1 | | S | | E2 |
+ * +-------------+ +--------+ +-------------+
+ * 2) A setting range starts exactly at a start LBA of an already set bad blocks
+ * range.
+ * 2.1) The setting range size < already set range size
+ * +--------+
+ * | S |
+ * +--------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 2.1.1) If S and E are both acked or unacked range, the setting range S can
+ * be merged into existing bad range E. The result is,
+ * +-------------+
+ * | S |
+ * +-------------+
+ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and
+ * the result is,
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E.
+ * An extra slot from the bad blocks table will be allocated for S, and head
+ * of E will move to end of the inserted range S. The result is,
+ * +--------+----+
+ * | S | E |
+ * +--------+----+
+ * 2.2) The setting range size == already set range size
+ * 2.2.1) If S and E are both acked or unacked range, the setting range S can
+ * be merged into existing bad range E. The result is,
+ * +-------------+
+ * | S |
+ * +-------------+
+ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and
+ * the result is,
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of
+ bad blocks range E. The result is,
+ * +-------------+
+ * | S |
+ * +-------------+
+ * 2.3) The setting range size > already set range size
+ * +-------------------+
+ * | S |
+ * +-------------------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * For such situation, the setting range S can be treated as two parts, the
+ * first part (S1) is as same size as the already set range E, the second
+ * part (S2) is the rest of setting range.
+ * +-------------+-----+ +-------------+ +-----+
+ * | S1 | S2 | | S1 | | S2 |
+ * +-------------+-----+ ===> +-------------+ +-----+
+ * +-------------+ +-------------+
+ * | E | | E |
+ * +-------------+ +-------------+
+ * Now we only focus on how to handle the setting range S1 and already set
+ * range E, which are already explained in 2.2), for the rest S2 it will be
+ * handled later in next loop.
+ * 3) A setting range starts before the start LBA of an already set bad blocks
+ * range.
+ * +-------------+
+ * | S |
+ * +-------------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * For this situation, the setting range S can be divided into two parts, the
+ * first (S1) ends at the start LBA of already set range E, the second part
+ * (S2) starts exactly at a start LBA of the already set range E.
+ * +----+---------+ +----+ +---------+
+ * | S1 | S2 | | S1 | | S2 |
+ * +----+---------+ ===> +----+ +---------+
+ * +-------------+ +-------------+
+ * | E | | E |
+ * +-------------+ +-------------+
+ * Now only the first part S1 should be handled in this loop, which is in
+ * similar condition as 1). The rest part S2 has exact same start LBA address
+ * of the already set range E, they will be handled in next loop in one of
+ * situations in 2).
+ * 4) A setting range starts after the start LBA of an already set bad blocks
+ * range.
+ * 4.1) If the setting range S exactly matches the tail part of already set bad
+ * blocks range E, like the following chart shows,
+ * +---------+
+ * | S |
+ * +---------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked),
+ * they will be merged into one, the result is,
+ * +-------------+
+ * | S |
+ * +-------------+
+ * 4.1.2) If range E is acked and the setting range S is unacked, the setting
+ * request of S will be rejected, the result is,
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may
+ * overwrite the overlapped range of E, the result is,
+ * +---+---------+
+ * | E | S |
+ * +---+---------+
+ * 4.2) If the setting range S stays in middle of an already set range E, like
+ * the following chart shows,
+ * +----+
+ * | S |
+ * +----+
+ * +--------------+
+ * | E |
+ * +--------------+
+ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked),
+ * they will be merged into one, the result is,
+ * +--------------+
+ * | S |
+ * +--------------+
+ * 4.2.2) If range E is acked and the setting range S is unacked, the setting
+ * request of S will be rejected, the result is also,
+ * +--------------+
+ * | E |
+ * +--------------+
+ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will
+ * inserted into middle of E and split previous range E into two parts (E1
+ * and E2), the result is,
+ * +----+----+----+
+ * | E1 | S | E2 |
+ * +----+----+----+
+ * 4.3) If the setting bad blocks range S is overlapped with an already set bad
+ * blocks range E. The range S starts after the start LBA of range E, and
+ * ends after the end LBA of range E, as the following chart shows,
+ * +-------------------+
+ * | S |
+ * +-------------------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * For this situation the range S can be divided into two parts, the first
+ * part (S1) ends at end range E, and the second part (S2) has rest range of
+ * origin S.
+ * +---------+---------+ +---------+ +---------+
+ * | S1 | S2 | | S1 | | S2 |
+ * +---------+---------+ ===> +---------+ +---------+
+ * +-------------+ +-------------+
+ * | E | | E |
+ * +-------------+ +-------------+
+ * Now in this loop the setting range S1 and already set range E can be
+ * handled as the situations 4.1), the rest range S2 will be handled in next
+ * loop and ignored in this loop.
+ * 5) A setting bad blocks range S is adjacent to one or more already set bad
+ * blocks range(s), and they are all acked or unacked range.
+ * 5.1) Front merge: If the already set bad blocks range E is before setting
+ * range S and they are adjacent,
+ * +------+
+ * | S |
+ * +------+
+ * +-------+
+ * | E |
+ * +-------+
+ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge
+ * values are same, the setting range S can front merges into range E. The
+ * result is,
+ * +--------------+
+ * | S |
+ * +--------------+
+ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting
+ * range S right after already set range E into the bad blocks table. The
+ * result is,
+ * +--------+------+
+ * | E | S |
+ * +--------+------+
+ * 6) Special cases which above conditions cannot handle
+ * 6.1) Multiple already set ranges may merge into less ones in a full bad table
+ * +-------------------------------------------------------+
+ * | S |
+ * +-------------------------------------------------------+
+ * |<----- BB_MAX_LEN ----->|
+ * +-----+ +-----+ +-----+
+ * | E1 | | E2 | | E3 |
+ * +-----+ +-----+ +-----+
+ * In the above example, when the bad blocks table is full, inserting the
+ * first part of setting range S will fail because no more available slot
+ * can be allocated from bad blocks table. In this situation a proper
+ * setting method should be go though all the setting bad blocks range and
+ * look for chance to merge already set ranges into less ones. When there
+ * is available slot from bad blocks table, re-try again to handle more
+ * setting bad blocks ranges as many as possible.
+ * +------------------------+
+ * | S3 |
+ * +------------------------+
+ * |<----- BB_MAX_LEN ----->|
+ * +-----+-----+-----+---+-----+--+
+ * | S1 | S2 |
+ * +-----+-----+-----+---+-----+--+
+ * The above chart shows although the first part (S3) cannot be inserted due
+ * to no-space in bad blocks table, but the following E1, E2 and E3 ranges
+ * can be merged with rest part of S into less range S1 and S2. Now there is
+ * 1 free slot in bad blocks table.
+ * +------------------------+-----+-----+-----+---+-----+--+
+ * | S3 | S1 | S2 |
+ * +------------------------+-----+-----+-----+---+-----+--+
+ * Since the bad blocks table is not full anymore, re-try again for the
+ * origin setting range S. Now the setting range S3 can be inserted into the
+ * bad blocks table with previous freed slot from multiple ranges merge.
+ * 6.2) Front merge after overwrite
+ * In the following example, in bad blocks table, E1 is an acked bad blocks
+ * range and E2 is an unacked bad blocks range, therefore they are not able
+ * to merge into a larger range. The setting bad blocks range S is acked,
+ * therefore part of E2 can be overwritten by S.
+ * +--------+
+ * | S | acknowledged
+ * +--------+ S: 1
+ * +-------+-------------+ E1: 1
+ * | E1 | E2 | E2: 0
+ * +-------+-------------+
+ * With previous simplified routines, after overwriting part of E2 with S,
+ * the bad blocks table should be (E3 is remaining part of E2 which is not
+ * overwritten by S),
+ * acknowledged
+ * +-------+--------+----+ S: 1
+ * | E1 | S | E3 | E1: 1
+ * +-------+--------+----+ E3: 0
+ * The above result is correct but not perfect. Range E1 and S in the bad
+ * blocks table are all acked, merging them into a larger one range may
+ * occupy less bad blocks table space and make badblocks_check() faster.
+ * Therefore in such situation, after overwriting range S, the previous range
+ * E1 should be checked for possible front combination. Then the ideal
+ * result can be,
+ * +----------------+----+ acknowledged
+ * | E1 | E3 | E1: 1
+ * +----------------+----+ E3: 0
+ * 6.3) Behind merge: If the already set bad blocks range E is behind the setting
+ * range S and they are adjacent. Normally we don't need to care about this
+ * because front merge handles this while going though range S from head to
+ * tail, except for the tail part of range S. When the setting range S are
+ * fully handled, all the above simplified routine doesn't check whether the
+ * tail LBA of range S is adjacent to the next already set range and not
+ * merge them even it is possible.
+ * +------+
+ * | S |
+ * +------+
+ * +-------+
+ * | E |
+ * +-------+
+ * For the above special situation, when the setting range S are all handled
+ * and the loop ends, an extra check is necessary for whether next already
+ * set range E is right after S and mergeable.
+ * 6.3.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge
+ * values are same, the setting range S can behind merges into range E. The
+ * result is,
+ * +--------------+
+ * | S |
+ * +--------------+
+ * 6.3.2) Otherwise these two ranges cannot merge, just insert the setting range
+ * S in front of the already set range E in the bad blocks table. The result
+ * is,
+ * +------+-------+
+ * | S | E |
+ * +------+-------+
+ *
+ * All the above 5 simplified situations and 3 special cases may cover 99%+ of
+ * the bad block range setting conditions. Maybe there is some rare corner case
+ * is not considered and optimized, it won't hurt if badblocks_set() fails due
+ * to no space, or some ranges are not merged to save bad blocks table space.
+ *
+ * Inside badblocks_set() each loop starts by jumping to re_insert label, every
+ * time for the new loop prev_badblocks() is called to find an already set range
+ * which starts before or at current setting range. Since the setting bad blocks
+ * range is handled from head to tail, most of the cases it is unnecessary to do
+ * the binary search inside prev_badblocks(), it is possible to provide a hint
+ * to prev_badblocks() for a fast path, then the expensive binary search can be
+ * avoided. In my test with the hint to prev_badblocks(), except for the first
+ * loop, all rested calls to prev_badblocks() can go into the fast path and
+ * return correct bad blocks table index immediately.
+ *
+ *
+ * Clearing a bad blocks range from the bad block table has similar idea as
+ * setting does, but much more simpler. The only thing needs to be noticed is
+ * when the clearing range hits middle of a bad block range, the existing bad
+ * block range will split into two, and one more item should be added into the
+ * bad block table. The simplified situations to be considered are, (The already
+ * set bad blocks ranges in bad block table are naming with prefix E, and the
+ * clearing bad blocks range is naming with prefix C)
+ *
+ * 1) A clearing range is not overlapped to any already set ranges in bad block
+ * table.
+ * +-----+ | +-----+ | +-----+
+ * | C | | | C | | | C |
+ * +-----+ or +-----+ or +-----+
+ * +---+ | +----+ +----+ | +---+
+ * | E | | | E1 | | E2 | | | E |
+ * +---+ | +----+ +----+ | +---+
+ * For the above situations, no bad block to be cleared and no failure
+ * happens, simply returns 0.
+ * 2) The clearing range hits middle of an already setting bad blocks range in
+ * the bad block table.
+ * +---+
+ * | C |
+ * +---+
+ * +-----------------+
+ * | E |
+ * +-----------------+
+ * In this situation if the bad block table is not full, the range E will be
+ * split into two ranges E1 and E2. The result is,
+ * +------+ +------+
+ * | E1 | | E2 |
+ * +------+ +------+
+ * 3) The clearing range starts exactly at same LBA as an already set bad block range
+ * from the bad block table.
+ * 3.1) Partially covered at head part
+ * +------------+
+ * | C |
+ * +------------+
+ * +-----------------+
+ * | E |
+ * +-----------------+
+ * For this situation, the overlapped already set range will update the
+ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No
+ * item deleted from bad block table. The result is,
+ * +----+
+ * | E1 |
+ * +----+
+ * 3.2) Exact fully covered
+ * +-----------------+
+ * | C |
+ * +-----------------+
+ * +-----------------+
+ * | E |
+ * +-----------------+
+ * For this situation the whole bad blocks range E will be cleared and its
+ * corresponded item is deleted from the bad block table.
+ * 4) The clearing range exactly ends at same LBA as an already set bad block
+ * range.
+ * +-------+
+ * | C |
+ * +-------+
+ * +-----------------+
+ * | E |
+ * +-----------------+
+ * For the above situation, the already set range E is updated to shrink its
+ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C).
+ * The result is,
+ * +---------+
+ * | E |
+ * +---------+
+ * 5) The clearing range is partially overlapped with an already set bad block
+ * range from the bad block table.
+ * 5.1) The already set bad block range is front overlapped with the clearing
+ * range.
+ * +----------+
+ * | C |
+ * +----------+
+ * +------------+
+ * | E |
+ * +------------+
+ * For such situation, the clearing range C can be treated as two parts. The
+ * first part ends at the start LBA of range E, and the second part starts at
+ * same LBA of range E.
+ * +----+-----+ +----+ +-----+
+ * | C1 | C2 | | C1 | | C2 |
+ * +----+-----+ ===> +----+ +-----+
+ * +------------+ +------------+
+ * | E | | E |
+ * +------------+ +------------+
+ * Now the first part C1 can be handled as condition 1), and the second part C2 can be
+ * handled as condition 3.1) in next loop.
+ * 5.2) The already set bad block range is behind overlaopped with the clearing
+ * range.
+ * +----------+
+ * | C |
+ * +----------+
+ * +------------+
+ * | E |
+ * +------------+
+ * For such situation, the clearing range C can be treated as two parts. The
+ * first part C1 ends at same end LBA of range E, and the second part starts
+ * at end LBA of range E.
+ * +----+-----+ +----+ +-----+
+ * | C1 | C2 | | C1 | | C2 |
+ * +----+-----+ ===> +----+ +-----+
+ * +------------+ +------------+
+ * | E | | E |
+ * +------------+ +------------+
+ * Now the first part clearing range C1 can be handled as condition 4), and
+ * the second part clearing range C2 can be handled as condition 1) in next
+ * loop.
+ *
+ * All bad blocks range clearing can be simplified into the above 5 situations
+ * by only handling the head part of the clearing range in each run of the
+ * while-loop. The idea is similar to bad blocks range setting but much
+ * simpler.
*/
-int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
+
+/*
+ * Find the range starts at-or-before 's' from bad table. The search
+ * starts from index 'hint' and stops at index 'hint_end' from the bad
+ * table.
+ */
+static int prev_by_hint(struct badblocks *bb, sector_t s, int hint)
{
- int hi;
- int lo;
+ int hint_end = hint + 2;
u64 *p = bb->page;
- int rv;
- sector_t target = s + sectors;
- unsigned seq;
+ int ret = -1;
- if (bb->shift > 0) {
- /* round the start down, and the end up */
- s >>= bb->shift;
- target += (1<<bb->shift) - 1;
- target >>= bb->shift;
+ while ((hint < hint_end) && ((hint + 1) <= bb->count) &&
+ (BB_OFFSET(p[hint]) <= s)) {
+ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) {
+ ret = hint;
+ break;
+ }
+ hint++;
+ }
+
+ return ret;
+}
+
+/*
+ * Find the range starts at-or-before bad->start. If 'hint' is provided
+ * (hint >= 0) then search in the bad table from hint firstly. It is
+ * very probably the wanted bad range can be found from the hint index,
+ * then the unnecessary while-loop iteration can be avoided.
+ */
+static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad,
+ int hint)
+{
+ sector_t s = bad->start;
+ int ret = -1;
+ int lo, hi;
+ u64 *p;
+
+ if (!bb->count)
+ goto out;
+
+ if (hint >= 0) {
+ ret = prev_by_hint(bb, s, hint);
+ if (ret >= 0)
+ goto out;
}
- /* 'target' is now the first block after the bad range */
-retry:
- seq = read_seqbegin(&bb->lock);
lo = 0;
- rv = 0;
hi = bb->count;
+ p = bb->page;
- /* Binary search between lo and hi for 'target'
- * i.e. for the last range that starts before 'target'
- */
- /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
- * are known not to be the last range before target.
- * VARIANT: hi-lo is the number of possible
- * ranges, and decreases until it reaches 1
- */
+ /* The following bisect search might be unnecessary */
+ if (BB_OFFSET(p[lo]) > s)
+ return -1;
+ if (BB_OFFSET(p[hi - 1]) <= s)
+ return hi - 1;
+
+ /* Do bisect search in bad table */
while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
+ int mid = (lo + hi)/2;
sector_t a = BB_OFFSET(p[mid]);
- if (a < target)
- /* This could still be the one, earlier ranges
- * could not.
- */
+ if (a == s) {
+ ret = mid;
+ goto out;
+ }
+
+ if (a < s)
lo = mid;
else
- /* This and later ranges are definitely out. */
hi = mid;
}
- /* 'lo' might be the last that started before target, but 'hi' isn't */
- if (hi > lo) {
- /* need to check all range that end after 's' to see if
- * any are unacknowledged.
+
+ if (BB_OFFSET(p[lo]) <= s)
+ ret = lo;
+out:
+ return ret;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' can be backward merged
+ * with the bad range (from the bad table) index by 'behind'.
+ */
+static bool can_merge_behind(struct badblocks *bb,
+ struct badblocks_context *bad, int behind)
+{
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+
+ if ((s < BB_OFFSET(p[behind])) &&
+ ((s + sectors) >= BB_OFFSET(p[behind])) &&
+ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
+ BB_ACK(p[behind]) == bad->ack)
+ return true;
+ return false;
+}
+
+/*
+ * Do backward merge for range indicated by 'bad' and the bad range
+ * (from the bad table) indexed by 'behind'. The return value is merged
+ * sectors from bad->len.
+ */
+static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
+ int behind)
+{
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+ int merged = 0;
+
+ WARN_ON(s >= BB_OFFSET(p[behind]));
+ WARN_ON((s + sectors) < BB_OFFSET(p[behind]));
+
+ if (s < BB_OFFSET(p[behind])) {
+ merged = BB_OFFSET(p[behind]) - s;
+ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack);
+
+ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
+ }
+
+ return merged;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' can be forward
+ * merged with the bad range (from the bad table) indexed by 'prev'.
+ */
+static bool can_merge_front(struct badblocks *bb, int prev,
+ struct badblocks_context *bad)
+{
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+
+ if (BB_ACK(p[prev]) == bad->ack &&
+ (s < BB_END(p[prev]) ||
+ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN))))
+ return true;
+ return false;
+}
+
+/*
+ * Do forward merge for range indicated by 'bad' and the bad range
+ * (from bad table) indexed by 'prev'. The return value is sectors
+ * merged from bad->len.
+ */
+static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad)
+{
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+ int merged = 0;
+
+ WARN_ON(s > BB_END(p[prev]));
+
+ if (s < BB_END(p[prev])) {
+ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s);
+ } else {
+ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev]));
+ if ((prev + 1) < bb->count &&
+ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) {
+ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]);
+ }
+
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ BB_LEN(p[prev]) + merged, bad->ack);
+ }
+
+ return merged;
+}
+
+/*
+ * 'Combine' is a special case which can_merge_front() is not able to
+ * handle: If a bad range (indexed by 'prev' from bad table) exactly
+ * starts as bad->start, and the bad range ahead of 'prev' (indexed by
+ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and
+ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then
+ * these two bad range (from bad table) can be combined.
+ *
+ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad
+ * table can be combined.
+ */
+static bool can_combine_front(struct badblocks *bb, int prev,
+ struct badblocks_context *bad)
+{
+ u64 *p = bb->page;
+
+ if ((prev > 0) &&
+ (BB_OFFSET(p[prev]) == bad->start) &&
+ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) &&
+ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) &&
+ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev])))
+ return true;
+ return false;
+}
+
+/*
+ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad
+ * table) into one larger bad range, and the new range is indexed by
+ * 'prev - 1'.
+ * The caller of front_combine() will decrease bb->count, therefore
+ * it is unnecessary to clear p[perv] after front merge.
+ */
+static void front_combine(struct badblocks *bb, int prev)
+{
+ u64 *p = bb->page;
+
+ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]),
+ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]),
+ BB_ACK(p[prev]));
+ if ((prev + 1) < bb->count)
+ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8);
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' is exactly forward
+ * overlapped with the bad range (from bad table) indexed by 'front'.
+ * Exactly forward overlap means the bad range (from bad table) indexed
+ * by 'prev' does not cover the whole range indicated by 'bad'.
+ */
+static bool overlap_front(struct badblocks *bb, int front,
+ struct badblocks_context *bad)
+{
+ u64 *p = bb->page;
+
+ if (bad->start >= BB_OFFSET(p[front]) &&
+ bad->start < BB_END(p[front]))
+ return true;
+ return false;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' is exactly backward
+ * overlapped with the bad range (from bad table) indexed by 'behind'.
+ */
+static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad,
+ int behind)
+{
+ u64 *p = bb->page;
+
+ if (bad->start < BB_OFFSET(p[behind]) &&
+ (bad->start + bad->len) > BB_OFFSET(p[behind]))
+ return true;
+ return false;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' can overwrite the bad
+ * range (from bad table) indexed by 'prev'.
+ *
+ * The range indicated by 'bad' can overwrite the bad range indexed by
+ * 'prev' when,
+ * 1) The whole range indicated by 'bad' can cover partial or whole bad
+ * range (from bad table) indexed by 'prev'.
+ * 2) The ack value of 'bad' is larger or equal to the ack value of bad
+ * range 'prev'.
+ *
+ * If the overwriting doesn't cover the whole bad range (from bad table)
+ * indexed by 'prev', new range might be split from existing bad range,
+ * 1) The overwrite covers head or tail part of existing bad range, 1
+ * extra bad range will be split and added into the bad table.
+ * 2) The overwrite covers middle of existing bad range, 2 extra bad
+ * ranges will be split (ahead and after the overwritten range) and
+ * added into the bad table.
+ * The number of extra split ranges of the overwriting is stored in
+ * 'extra' and returned for the caller.
+ */
+static bool can_front_overwrite(struct badblocks *bb, int prev,
+ struct badblocks_context *bad, int *extra)
+{
+ u64 *p = bb->page;
+ int len;
+
+ WARN_ON(!overlap_front(bb, prev, bad));
+
+ if (BB_ACK(p[prev]) >= bad->ack)
+ return false;
+
+ if (BB_END(p[prev]) <= (bad->start + bad->len)) {
+ len = BB_END(p[prev]) - bad->start;
+ if (BB_OFFSET(p[prev]) == bad->start)
+ *extra = 0;
+ else
+ *extra = 1;
+
+ bad->len = len;
+ } else {
+ if (BB_OFFSET(p[prev]) == bad->start)
+ *extra = 1;
+ else
+ /*
+ * prev range will be split into two, beside the overwritten
+ * one, an extra slot needed from bad table.
*/
- while (lo >= 0 &&
- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
- if (BB_OFFSET(p[lo]) < target) {
- /* starts before the end, and finishes after
- * the start, so they must overlap
- */
- if (rv != -1 && BB_ACK(p[lo]))
- rv = 1;
- else
- rv = -1;
- *first_bad = BB_OFFSET(p[lo]);
- *bad_sectors = BB_LEN(p[lo]);
- }
- lo--;
+ *extra = 2;
+ }
+
+ if ((bb->count + (*extra)) >= MAX_BADBLOCKS)
+ return false;
+
+ return true;
+}
+
+/*
+ * Do the overwrite from the range indicated by 'bad' to the bad range
+ * (from bad table) indexed by 'prev'.
+ * The previously called can_front_overwrite() will provide how many
+ * extra bad range(s) might be split and added into the bad table. All
+ * the splitting cases in the bad table will be handled here.
+ */
+static int front_overwrite(struct badblocks *bb, int prev,
+ struct badblocks_context *bad, int extra)
+{
+ u64 *p = bb->page;
+ sector_t orig_end = BB_END(p[prev]);
+ int orig_ack = BB_ACK(p[prev]);
+
+ switch (extra) {
+ case 0:
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]),
+ bad->ack);
+ break;
+ case 1:
+ if (BB_OFFSET(p[prev]) == bad->start) {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ bad->len, bad->ack);
+ memmove(p + prev + 2, p + prev + 1,
+ (bb->count - prev - 1) * 8);
+ p[prev + 1] = BB_MAKE(bad->start + bad->len,
+ orig_end - BB_END(p[prev]),
+ orig_ack);
+ } else {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ bad->start - BB_OFFSET(p[prev]),
+ orig_ack);
+ /*
+ * prev +2 -> prev + 1 + 1, which is for,
+ * 1) prev + 1: the slot index of the previous one
+ * 2) + 1: one more slot for extra being 1.
+ */
+ memmove(p + prev + 2, p + prev + 1,
+ (bb->count - prev - 1) * 8);
+ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
}
+ break;
+ case 2:
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ bad->start - BB_OFFSET(p[prev]),
+ orig_ack);
+ /*
+ * prev + 3 -> prev + 1 + 2, which is for,
+ * 1) prev + 1: the slot index of the previous one
+ * 2) + 2: two more slots for extra being 2.
+ */
+ memmove(p + prev + 3, p + prev + 1,
+ (bb->count - prev - 1) * 8);
+ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
+ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]),
+ orig_end - BB_END(p[prev + 1]),
+ orig_ack);
+ break;
+ default:
+ break;
}
- if (read_seqretry(&bb->lock, seq))
- goto retry;
+ return bad->len;
+}
- return rv;
+/*
+ * Explicitly insert a range indicated by 'bad' to the bad table, where
+ * the location is indexed by 'at'.
+ */
+static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad)
+{
+ u64 *p = bb->page;
+ int len;
+
+ WARN_ON(badblocks_full(bb));
+
+ len = min_t(sector_t, bad->len, BB_MAX_LEN);
+ if (at < bb->count)
+ memmove(p + at + 1, p + at, (bb->count - at) * 8);
+ p[at] = BB_MAKE(bad->start, len, bad->ack);
+
+ return len;
}
-EXPORT_SYMBOL_GPL(badblocks_check);
static void badblocks_update_acked(struct badblocks *bb)
{
+ bool unacked = false;
u64 *p = bb->page;
int i;
- bool unacked = false;
if (!bb->unacked_exist)
return;
@@ -144,281 +855,600 @@ static void badblocks_update_acked(struct badblocks *bb)
bb->unacked_exist = 0;
}
-/**
- * badblocks_set() - Add a range of bad blocks to the table.
- * @bb: the badblocks structure that holds all badblock information
- * @s: first sector to mark as bad
- * @sectors: number of sectors to mark as bad
- * @acknowledged: weather to mark the bad sectors as acknowledged
- *
- * This might extend the table, or might contract it if two adjacent ranges
- * can be merged. We binary-search to find the 'insertion' point, then
- * decide how best to handle it.
- *
- * Return:
- * 0: success
- * 1: failed to set badblocks (out of space)
- */
-int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged)
+/* Do exact work to set bad block range into the bad block table */
+static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
{
- u64 *p;
- int lo, hi;
- int rv = 0;
+ int retried = 0, space_desired = 0;
+ int orig_len, len = 0, added = 0;
+ struct badblocks_context bad;
+ int prev = -1, hint = -1;
+ sector_t orig_start;
unsigned long flags;
+ int rv = 0;
+ u64 *p;
if (bb->shift < 0)
/* badblocks are disabled */
return 1;
+ if (sectors == 0)
+ /* Invalid sectors number */
+ return 1;
+
if (bb->shift) {
/* round the start down, and the end up */
sector_t next = s + sectors;
- s >>= bb->shift;
- next += (1<<bb->shift) - 1;
- next >>= bb->shift;
+ rounddown(s, bb->shift);
+ roundup(next, bb->shift);
sectors = next - s;
}
write_seqlock_irqsave(&bb->lock, flags);
+ orig_start = s;
+ orig_len = sectors;
+ bad.ack = acknowledged;
p = bb->page;
- lo = 0;
- hi = bb->count;
- /* Find the last range that starts at-or-before 's' */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a <= s)
- lo = mid;
- else
- hi = mid;
+re_insert:
+ bad.start = s;
+ bad.len = sectors;
+ len = 0;
+
+ if (badblocks_empty(bb)) {
+ len = insert_at(bb, 0, &bad);
+ bb->count++;
+ added++;
+ goto update_sectors;
}
- if (hi > lo && BB_OFFSET(p[lo]) > s)
- hi = lo;
- if (hi > lo) {
- /* we found a range that might merge with the start
- * of our new range
- */
- sector_t a = BB_OFFSET(p[lo]);
- sector_t e = a + BB_LEN(p[lo]);
- int ack = BB_ACK(p[lo]);
-
- if (e >= s) {
- /* Yes, we can merge with a previous range */
- if (s == a && s + sectors >= e)
- /* new range covers old */
- ack = acknowledged;
- else
- ack = ack && acknowledged;
-
- if (e < s + sectors)
- e = s + sectors;
- if (e - a <= BB_MAX_LEN) {
- p[lo] = BB_MAKE(a, e-a, ack);
- s = e;
+ prev = prev_badblocks(bb, &bad, hint);
+
+ /* start before all badblocks */
+ if (prev < 0) {
+ if (!badblocks_full(bb)) {
+ /* insert on the first */
+ if (bad.len > (BB_OFFSET(p[0]) - bad.start))
+ bad.len = BB_OFFSET(p[0]) - bad.start;
+ len = insert_at(bb, 0, &bad);
+ bb->count++;
+ added++;
+ hint = 0;
+ goto update_sectors;
+ }
+
+ /* No sapce, try to merge */
+ if (overlap_behind(bb, &bad, 0)) {
+ if (can_merge_behind(bb, &bad, 0)) {
+ len = behind_merge(bb, &bad, 0);
+ added++;
} else {
- /* does not all fit in one range,
- * make p[lo] maximal
- */
- if (BB_LEN(p[lo]) != BB_MAX_LEN)
- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
- s = a + BB_MAX_LEN;
+ len = BB_OFFSET(p[0]) - s;
+ space_desired = 1;
}
- sectors = e - s;
+ hint = 0;
+ goto update_sectors;
}
+
+ /* no table space and give up */
+ goto out;
}
- if (sectors && hi < bb->count) {
- /* 'hi' points to the first range that starts after 's'.
- * Maybe we can merge with the start of that range
- */
- sector_t a = BB_OFFSET(p[hi]);
- sector_t e = a + BB_LEN(p[hi]);
- int ack = BB_ACK(p[hi]);
-
- if (a <= s + sectors) {
- /* merging is possible */
- if (e <= s + sectors) {
- /* full overlap */
- e = s + sectors;
- ack = acknowledged;
- } else
- ack = ack && acknowledged;
-
- a = s;
- if (e - a <= BB_MAX_LEN) {
- p[hi] = BB_MAKE(a, e-a, ack);
- s = e;
- } else {
- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
- s = a + BB_MAX_LEN;
+
+ /* in case p[prev-1] can be merged with p[prev] */
+ if (can_combine_front(bb, prev, &bad)) {
+ front_combine(bb, prev);
+ bb->count--;
+ added++;
+ hint = prev;
+ goto update_sectors;
+ }
+
+ if (overlap_front(bb, prev, &bad)) {
+ if (can_merge_front(bb, prev, &bad)) {
+ len = front_merge(bb, prev, &bad);
+ added++;
+ } else {
+ int extra = 0;
+
+ if (!can_front_overwrite(bb, prev, &bad, &extra)) {
+ len = min_t(sector_t,
+ BB_END(p[prev]) - s, sectors);
+ hint = prev;
+ goto update_sectors;
+ }
+
+ len = front_overwrite(bb, prev, &bad, extra);
+ added++;
+ bb->count += extra;
+
+ if (can_combine_front(bb, prev, &bad)) {
+ front_combine(bb, prev);
+ bb->count--;
}
- sectors = e - s;
- lo = hi;
- hi++;
}
+ hint = prev;
+ goto update_sectors;
+ }
+
+ if (can_merge_front(bb, prev, &bad)) {
+ len = front_merge(bb, prev, &bad);
+ added++;
+ hint = prev;
+ goto update_sectors;
}
- if (sectors == 0 && hi < bb->count) {
- /* we might be able to combine lo and hi */
- /* Note: 's' is at the end of 'lo' */
- sector_t a = BB_OFFSET(p[hi]);
- int lolen = BB_LEN(p[lo]);
- int hilen = BB_LEN(p[hi]);
- int newlen = lolen + hilen - (s - a);
-
- if (s >= a && newlen < BB_MAX_LEN) {
- /* yes, we can combine them */
- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
-
- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
- memmove(p + hi, p + hi + 1,
- (bb->count - hi - 1) * 8);
- bb->count--;
+
+ /* if no space in table, still try to merge in the covered range */
+ if (badblocks_full(bb)) {
+ /* skip the cannot-merge range */
+ if (((prev + 1) < bb->count) &&
+ overlap_behind(bb, &bad, prev + 1) &&
+ ((s + sectors) >= BB_END(p[prev + 1]))) {
+ len = BB_END(p[prev + 1]) - s;
+ hint = prev + 1;
+ goto update_sectors;
}
+
+ /* no retry any more */
+ len = sectors;
+ space_desired = 1;
+ hint = -1;
+ goto update_sectors;
}
- while (sectors) {
- /* didn't merge (it all).
- * Need to add a range just before 'hi'
- */
- if (bb->count >= MAX_BADBLOCKS) {
- /* No room for more */
- rv = 1;
- break;
- } else {
- int this_sectors = sectors;
- memmove(p + hi + 1, p + hi,
- (bb->count - hi) * 8);
- bb->count++;
+ /* cannot merge and there is space in bad table */
+ if ((prev + 1) < bb->count &&
+ overlap_behind(bb, &bad, prev + 1))
+ bad.len = min_t(sector_t,
+ bad.len, BB_OFFSET(p[prev + 1]) - bad.start);
- if (this_sectors > BB_MAX_LEN)
- this_sectors = BB_MAX_LEN;
- p[hi] = BB_MAKE(s, this_sectors, acknowledged);
- sectors -= this_sectors;
- s += this_sectors;
- }
+ len = insert_at(bb, prev + 1, &bad);
+ bb->count++;
+ added++;
+ hint = prev + 1;
+
+update_sectors:
+ s += len;
+ sectors -= len;
+
+ if (sectors > 0)
+ goto re_insert;
+
+ WARN_ON(sectors < 0);
+
+ /*
+ * Check whether the following already set range can be
+ * merged. (prev < 0) condition is not handled here,
+ * because it's already complicated enough.
+ */
+ if (prev >= 0 &&
+ (prev + 1) < bb->count &&
+ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
+ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
+ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
+ BB_ACK(p[prev]));
+
+ if ((prev + 2) < bb->count)
+ memmove(p + prev + 1, p + prev + 2,
+ (bb->count - (prev + 2)) * 8);
+ bb->count--;
+ }
+
+ if (space_desired && !badblocks_full(bb)) {
+ s = orig_start;
+ sectors = orig_len;
+ space_desired = 0;
+ if (retried++ < 3)
+ goto re_insert;
+ }
+
+out:
+ if (added) {
+ set_changed(bb);
+
+ if (!acknowledged)
+ bb->unacked_exist = 1;
+ else
+ badblocks_update_acked(bb);
}
- bb->changed = 1;
- if (!acknowledged)
- bb->unacked_exist = 1;
- else
- badblocks_update_acked(bb);
write_sequnlock_irqrestore(&bb->lock, flags);
+ if (!added)
+ rv = 1;
+
return rv;
}
-EXPORT_SYMBOL_GPL(badblocks_set);
-/**
- * badblocks_clear() - Remove a range of bad blocks to the table.
- * @bb: the badblocks structure that holds all badblock information
- * @s: first sector to mark as bad
- * @sectors: number of sectors to mark as bad
- *
- * This may involve extending the table if we spilt a region,
- * but it must not fail. So if the table becomes full, we just
- * drop the remove request.
- *
- * Return:
- * 0: success
- * 1: failed to clear badblocks
+/*
+ * Clear the bad block range from bad block table which is front overlapped
+ * with the clearing range. The return value is how many sectors from an
+ * already set bad block range are cleared. If the whole bad block range is
+ * covered by the clearing range and fully cleared, 'delete' is set as 1 for
+ * the caller to reduce bb->count.
*/
-int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+static int front_clear(struct badblocks *bb, int prev,
+ struct badblocks_context *bad, int *deleted)
{
- u64 *p;
- int lo, hi;
- sector_t target = s + sectors;
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+ int cleared = 0;
+
+ *deleted = 0;
+ if (s == BB_OFFSET(p[prev])) {
+ if (BB_LEN(p[prev]) > sectors) {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors,
+ BB_LEN(p[prev]) - sectors,
+ BB_ACK(p[prev]));
+ cleared = sectors;
+ } else {
+ /* BB_LEN(p[prev]) <= sectors */
+ cleared = BB_LEN(p[prev]);
+ if ((prev + 1) < bb->count)
+ memmove(p + prev, p + prev + 1,
+ (bb->count - prev - 1) * 8);
+ *deleted = 1;
+ }
+ } else if (s > BB_OFFSET(p[prev])) {
+ if (BB_END(p[prev]) <= (s + sectors)) {
+ cleared = BB_END(p[prev]) - s;
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ s - BB_OFFSET(p[prev]),
+ BB_ACK(p[prev]));
+ } else {
+ /* Splitting is handled in front_splitting_clear() */
+ BUG();
+ }
+ }
+
+ return cleared;
+}
+
+/*
+ * Handle the condition that the clearing range hits middle of an already set
+ * bad block range from bad block table. In this condition the existing bad
+ * block range is split into two after the middle part is cleared.
+ */
+static int front_splitting_clear(struct badblocks *bb, int prev,
+ struct badblocks_context *bad)
+{
+ u64 *p = bb->page;
+ u64 end = BB_END(p[prev]);
+ int ack = BB_ACK(p[prev]);
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ s - BB_OFFSET(p[prev]),
+ ack);
+ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8);
+ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack);
+ return sectors;
+}
+
+/* Do the exact work to clear bad block range from the bad block table */
+static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+{
+ struct badblocks_context bad;
+ int prev = -1, hint = -1;
+ int len = 0, cleared = 0;
int rv = 0;
+ u64 *p;
+
+ if (bb->shift < 0)
+ /* badblocks are disabled */
+ return 1;
+
+ if (sectors == 0)
+ /* Invalid sectors number */
+ return 1;
+
+ if (bb->shift) {
+ sector_t target;
- if (bb->shift > 0) {
/* When clearing we round the start up and the end down.
* This should not matter as the shift should align with
* the block size and no rounding should ever be needed.
* However it is better the think a block is bad when it
* isn't than to think a block is not bad when it is.
*/
- s += (1<<bb->shift) - 1;
- s >>= bb->shift;
- target >>= bb->shift;
+ target = s + sectors;
+ roundup(s, bb->shift);
+ rounddown(target, bb->shift);
+ sectors = target - s;
}
write_seqlock_irq(&bb->lock);
+ bad.ack = true;
p = bb->page;
- lo = 0;
- hi = bb->count;
- /* Find the last range that starts before 'target' */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a < target)
- lo = mid;
- else
- hi = mid;
+re_clear:
+ bad.start = s;
+ bad.len = sectors;
+
+ if (badblocks_empty(bb)) {
+ len = sectors;
+ cleared++;
+ goto update_sectors;
}
- if (hi > lo) {
- /* p[lo] is the last range that could overlap the
- * current range. Earlier ranges could also overlap,
- * but only this one can overlap the end of the range.
- */
- if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
- (BB_OFFSET(p[lo]) < target)) {
- /* Partial overlap, leave the tail of this range */
- int ack = BB_ACK(p[lo]);
- sector_t a = BB_OFFSET(p[lo]);
- sector_t end = a + BB_LEN(p[lo]);
-
- if (a < s) {
- /* we need to split this range */
- if (bb->count >= MAX_BADBLOCKS) {
- rv = -ENOSPC;
- goto out;
- }
- memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
- bb->count++;
- p[lo] = BB_MAKE(a, s-a, ack);
- lo++;
- }
- p[lo] = BB_MAKE(target, end - target, ack);
- /* there is no longer an overlap */
- hi = lo;
- lo--;
- }
- while (lo >= 0 &&
- (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
- (BB_OFFSET(p[lo]) < target)) {
- /* This range does overlap */
- if (BB_OFFSET(p[lo]) < s) {
- /* Keep the early parts of this range. */
- int ack = BB_ACK(p[lo]);
- sector_t start = BB_OFFSET(p[lo]);
-
- p[lo] = BB_MAKE(start, s - start, ack);
- /* now low doesn't overlap, so.. */
- break;
- }
- lo--;
+
+
+ prev = prev_badblocks(bb, &bad, hint);
+
+ /* Start before all badblocks */
+ if (prev < 0) {
+ if (overlap_behind(bb, &bad, 0)) {
+ len = BB_OFFSET(p[0]) - s;
+ hint = 0;
+ } else {
+ len = sectors;
}
- /* 'lo' is strictly before, 'hi' is strictly after,
- * anything between needs to be discarded
+ /*
+ * Both situations are to clear non-bad range,
+ * should be treated as successful
*/
- if (hi - lo > 1) {
- memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
- bb->count -= (hi - lo - 1);
+ cleared++;
+ goto update_sectors;
+ }
+
+ /* Start after all badblocks */
+ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
+ len = sectors;
+ cleared++;
+ goto update_sectors;
+ }
+
+ /* Clear will split a bad record but the table is full */
+ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) &&
+ (BB_END(p[prev]) > (bad.start + sectors))) {
+ len = sectors;
+ goto update_sectors;
+ }
+
+ if (overlap_front(bb, prev, &bad)) {
+ if ((BB_OFFSET(p[prev]) < bad.start) &&
+ (BB_END(p[prev]) > (bad.start + bad.len))) {
+ /* Splitting */
+ if ((bb->count + 1) < MAX_BADBLOCKS) {
+ len = front_splitting_clear(bb, prev, &bad);
+ bb->count += 1;
+ cleared++;
+ } else {
+ /* No space to split, give up */
+ len = sectors;
+ }
+ } else {
+ int deleted = 0;
+
+ len = front_clear(bb, prev, &bad, &deleted);
+ bb->count -= deleted;
+ cleared++;
+ hint = prev;
}
+
+ goto update_sectors;
+ }
+
+ /* Not front overlap, but behind overlap */
+ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
+ len = BB_OFFSET(p[prev + 1]) - bad.start;
+ hint = prev + 1;
+ /* Clear non-bad range should be treated as successful */
+ cleared++;
+ goto update_sectors;
+ }
+
+ /* Not cover any badblocks range in the table */
+ len = sectors;
+ /* Clear non-bad range should be treated as successful */
+ cleared++;
+
+update_sectors:
+ s += len;
+ sectors -= len;
+
+ if (sectors > 0)
+ goto re_clear;
+
+ WARN_ON(sectors < 0);
+
+ if (cleared) {
+ badblocks_update_acked(bb);
+ set_changed(bb);
}
- badblocks_update_acked(bb);
- bb->changed = 1;
-out:
write_sequnlock_irq(&bb->lock);
+
+ if (!cleared)
+ rv = 1;
+
return rv;
}
+
+/* Do the exact work to check bad blocks range from the bad block table */
+static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ int unacked_badblocks, acked_badblocks;
+ int prev = -1, hint = -1, set = 0;
+ struct badblocks_context bad;
+ unsigned int seq;
+ int len, rv;
+ u64 *p;
+
+ WARN_ON(bb->shift < 0 || sectors == 0);
+
+ if (bb->shift > 0) {
+ sector_t target;
+
+ /* round the start down, and the end up */
+ target = s + sectors;
+ rounddown(s, bb->shift);
+ roundup(target, bb->shift);
+ sectors = target - s;
+ }
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ p = bb->page;
+ unacked_badblocks = 0;
+ acked_badblocks = 0;
+
+re_check:
+ bad.start = s;
+ bad.len = sectors;
+
+ if (badblocks_empty(bb)) {
+ len = sectors;
+ goto update_sectors;
+ }
+
+ prev = prev_badblocks(bb, &bad, hint);
+
+ /* start after all badblocks */
+ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
+ len = sectors;
+ goto update_sectors;
+ }
+
+ if (overlap_front(bb, prev, &bad)) {
+ if (BB_ACK(p[prev]))
+ acked_badblocks++;
+ else
+ unacked_badblocks++;
+
+ if (BB_END(p[prev]) >= (s + sectors))
+ len = sectors;
+ else
+ len = BB_END(p[prev]) - s;
+
+ if (set == 0) {
+ *first_bad = BB_OFFSET(p[prev]);
+ *bad_sectors = BB_LEN(p[prev]);
+ set = 1;
+ }
+ goto update_sectors;
+ }
+
+ /* Not front overlap, but behind overlap */
+ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
+ len = BB_OFFSET(p[prev + 1]) - bad.start;
+ hint = prev + 1;
+ goto update_sectors;
+ }
+
+ /* not cover any badblocks range in the table */
+ len = sectors;
+
+update_sectors:
+ s += len;
+ sectors -= len;
+
+ if (sectors > 0)
+ goto re_check;
+
+ WARN_ON(sectors < 0);
+
+ if (unacked_badblocks > 0)
+ rv = -1;
+ else if (acked_badblocks > 0)
+ rv = 1;
+ else
+ rv = 0;
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return rv;
+}
+
+/**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: sector (start) at which to check for badblocks
+ * @sectors: number of sectors to check for badblocks
+ * @first_bad: pointer to store location of the first badblock
+ * @bad_sectors: pointer to store number of badblocks after @first_bad
+ *
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64bits wide. This comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ * A 'shift' can be set so that larger blocks are tracked and
+ * consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ *
+ * Locking of the bad-block table uses a seqlock so badblocks_check
+ * might need to retry if it is very unlucky.
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
+ * so we use the write_seqlock_irq variant.
+ *
+ * When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad. So we binary-search
+ * to the last range that starts at-or-before the given endpoint,
+ * (or "before the sector after the target range")
+ * then see if it ends after the given start.
+ *
+ * Return:
+ * 0: there are no known bad blocks in the range
+ * 1: there are known bad block which are all acknowledged
+ * -1: there are bad blocks which have not yet been acknowledged in metadata.
+ * plus the start/length of the first bad section we overlap.
+ */
+int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
+}
+EXPORT_SYMBOL_GPL(badblocks_check);
+
+/**
+ * badblocks_set() - Add a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to mark as bad
+ * @sectors: number of sectors to mark as bad
+ * @acknowledged: weather to mark the bad sectors as acknowledged
+ *
+ * This might extend the table, or might contract it if two adjacent ranges
+ * can be merged. We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ *
+ * Return:
+ * 0: success
+ * 1: failed to set badblocks (out of space)
+ */
+int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
+{
+ return _badblocks_set(bb, s, sectors, acknowledged);
+}
+EXPORT_SYMBOL_GPL(badblocks_set);
+
+/**
+ * badblocks_clear() - Remove a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to mark as bad
+ * @sectors: number of sectors to mark as bad
+ *
+ * This may involve extending the table if we spilt a region,
+ * but it must not fail. So if the table becomes full, we just
+ * drop the remove request.
+ *
+ * Return:
+ * 0: success
+ * 1: failed to clear badblocks
+ */
+int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+{
+ return _badblocks_clear(bb, s, sectors);
+}
EXPORT_SYMBOL_GPL(badblocks_clear);
/**
diff --git a/block/blk-flush.c b/block/blk-flush.c
index e73dc22d05c1..3f4d41952ef2 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -323,16 +323,9 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->mq_ctx = first_rq->mq_ctx;
flush_rq->mq_hctx = first_rq->mq_hctx;
- if (!q->elevator) {
+ if (!q->elevator)
flush_rq->tag = first_rq->tag;
-
- /*
- * We borrow data request's driver tag, so have to mark
- * this flush request as INFLIGHT for avoiding double
- * account of this driver tag
- */
- flush_rq->rq_flags |= RQF_MQ_INFLIGHT;
- } else
+ else
flush_rq->internal_tag = first_rq->internal_tag;
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index c3b5930106b2..5cbeb9344f2f 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -246,7 +246,6 @@ static const char *const rqf_name[] = {
RQF_NAME(STARTED),
RQF_NAME(FLUSH_SEQ),
RQF_NAME(MIXED_MERGE),
- RQF_NAME(MQ_INFLIGHT),
RQF_NAME(DONTPREP),
RQF_NAME(SCHED_TAGS),
RQF_NAME(USE_SCHED),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1fafd54dce3c..e2d11183f62e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -426,6 +426,8 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
rq_list_add(data->cached_rq, rq);
nr++;
}
+ if (!(data->rq_flags & RQF_SCHED_TAGS))
+ blk_mq_add_active_requests(data->hctx, nr);
/* caller already holds a reference, add for remainder */
percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
data->nr_tags -= nr;
@@ -510,6 +512,8 @@ retry:
goto retry;
}
+ if (!(data->rq_flags & RQF_SCHED_TAGS))
+ blk_mq_inc_active_requests(data->hctx);
rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
blk_mq_rq_time_init(rq, alloc_time_ns);
return rq;
@@ -669,6 +673,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
tag = blk_mq_get_tag(&data);
if (tag == BLK_MQ_NO_TAG)
goto out_queue_exit;
+ if (!(data.rq_flags & RQF_SCHED_TAGS))
+ blk_mq_inc_active_requests(data.hctx);
rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
blk_mq_rq_time_init(rq, alloc_time_ns);
rq->__data_len = 0;
@@ -708,11 +714,10 @@ static void __blk_mq_free_request(struct request *rq)
blk_pm_mark_last_busy(rq);
rq->mq_hctx = NULL;
- if (rq->rq_flags & RQF_MQ_INFLIGHT)
- __blk_mq_dec_active_requests(hctx);
-
- if (rq->tag != BLK_MQ_NO_TAG)
+ if (rq->tag != BLK_MQ_NO_TAG) {
+ blk_mq_dec_active_requests(hctx);
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
+ }
if (sched_tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
@@ -1061,12 +1066,7 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
{
struct request_queue *q = hctx->queue;
- /*
- * All requests should have been marked as RQF_MQ_INFLIGHT, so
- * update hctx->nr_active in batch
- */
- if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
- __blk_mq_sub_active_requests(hctx, nr_tags);
+ blk_mq_sub_active_requests(hctx, nr_tags);
blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
percpu_ref_put_many(&q->q_usage_counter, nr_tags);
@@ -1259,6 +1259,7 @@ void blk_mq_start_request(struct request *rq)
blk_add_timer(rq);
WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
+ rq->mq_hctx->tags->rqs[rq->tag] = rq;
#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
@@ -1748,7 +1749,7 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
return data.rq;
}
-static bool __blk_mq_alloc_driver_tag(struct request *rq)
+bool __blk_mq_alloc_driver_tag(struct request *rq)
{
struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
@@ -1769,20 +1770,7 @@ static bool __blk_mq_alloc_driver_tag(struct request *rq)
return false;
rq->tag = tag + tag_offset;
- return true;
-}
-
-bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
-{
- if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
- return false;
-
- if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
- !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
- rq->rq_flags |= RQF_MQ_INFLIGHT;
- __blk_mq_inc_active_requests(hctx);
- }
- hctx->tags->rqs[rq->tag] = rq;
+ blk_mq_inc_active_requests(rq->mq_hctx);
return true;
}
@@ -2794,13 +2782,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
* If we do, we can dispatch the whole plug list in one go. We
* already know at this point that all requests belong to the
* same queue, caller must ensure that's the case.
- *
- * Since we pass off the full list to the driver at this point,
- * we do not increment the active request count for the queue.
- * Bypass shared tags for now because of that.
*/
- if (q->mq_ops->queue_rqs &&
- !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
+ if (q->mq_ops->queue_rqs) {
blk_mq_run_dispatch_ops(q,
__blk_mq_flush_plug_list(q, plug));
if (rq_list_empty(plug->mq_list))
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 1743857e0b01..f75a9ecfebde 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -271,12 +271,18 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
return -1;
}
-static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
+static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
+ int val)
{
if (blk_mq_is_shared_tags(hctx->flags))
- atomic_inc(&hctx->queue->nr_active_requests_shared_tags);
+ atomic_add(val, &hctx->queue->nr_active_requests_shared_tags);
else
- atomic_inc(&hctx->nr_active);
+ atomic_add(val, &hctx->nr_active);
+}
+
+static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ __blk_mq_add_active_requests(hctx, 1);
}
static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
@@ -293,6 +299,32 @@ static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
__blk_mq_sub_active_requests(hctx, 1);
}
+static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
+ int val)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_add_active_requests(hctx, val);
+}
+
+static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_inc_active_requests(hctx);
+}
+
+static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
+ int val)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_sub_active_requests(hctx, val);
+}
+
+static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_dec_active_requests(hctx);
+}
+
static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_shared_tags(hctx->flags))
@@ -302,13 +334,9 @@ static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
+ blk_mq_dec_active_requests(hctx);
blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
rq->tag = BLK_MQ_NO_TAG;
-
- if (rq->rq_flags & RQF_MQ_INFLIGHT) {
- rq->rq_flags &= ~RQF_MQ_INFLIGHT;
- __blk_mq_dec_active_requests(hctx);
- }
}
static inline void blk_mq_put_driver_tag(struct request *rq)
@@ -319,19 +347,14 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
__blk_mq_put_driver_tag(rq->mq_hctx, rq);
}
-bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
+bool __blk_mq_alloc_driver_tag(struct request *rq);
static inline bool blk_mq_get_driver_tag(struct request *rq)
{
- struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
-
- if (rq->tag != BLK_MQ_NO_TAG &&
- !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
- hctx->tags->rqs[rq->tag] = rq;
- return true;
- }
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
+ return false;
- return __blk_mq_get_driver_tag(hctx, rq);
+ return true;
}
static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c
index 403756dbd50d..82d9c4c3fb41 100644
--- a/block/partitions/ibm.c
+++ b/block/partitions/ibm.c
@@ -61,6 +61,47 @@ static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo)
ptr->b;
}
+/* Volume Label Type/ID Length */
+#define DASD_VOL_TYPE_LEN 4
+#define DASD_VOL_ID_LEN 6
+
+/* Volume Label Types */
+#define DASD_VOLLBL_TYPE_VOL1 0
+#define DASD_VOLLBL_TYPE_LNX1 1
+#define DASD_VOLLBL_TYPE_CMS1 2
+
+struct dasd_vollabel {
+ char *type;
+ int idx;
+};
+
+static struct dasd_vollabel dasd_vollabels[] = {
+ [DASD_VOLLBL_TYPE_VOL1] = {
+ .type = "VOL1",
+ .idx = DASD_VOLLBL_TYPE_VOL1,
+ },
+ [DASD_VOLLBL_TYPE_LNX1] = {
+ .type = "LNX1",
+ .idx = DASD_VOLLBL_TYPE_LNX1,
+ },
+ [DASD_VOLLBL_TYPE_CMS1] = {
+ .type = "CMS1",
+ .idx = DASD_VOLLBL_TYPE_CMS1,
+ },
+};
+
+static int get_label_by_type(const char *type)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(dasd_vollabels); i++) {
+ if (!memcmp(type, dasd_vollabels[i].type, DASD_VOL_TYPE_LEN))
+ return dasd_vollabels[i].idx;
+ }
+
+ return -1;
+}
+
static int find_label(struct parsed_partitions *state,
dasd_information2_t *info,
struct hd_geometry *geo,
@@ -70,12 +111,10 @@ static int find_label(struct parsed_partitions *state,
char type[],
union label_t *label)
{
- Sector sect;
- unsigned char *data;
sector_t testsect[3];
- unsigned char temp[5];
- int found = 0;
int i, testcount;
+ Sector sect;
+ void *data;
/* There a three places where we may find a valid label:
* - on an ECKD disk it's block 2
@@ -103,31 +142,27 @@ static int find_label(struct parsed_partitions *state,
if (data == NULL)
continue;
memcpy(label, data, sizeof(*label));
- memcpy(temp, data, 4);
- temp[4] = 0;
- EBCASC(temp, 4);
+ memcpy(type, data, DASD_VOL_TYPE_LEN);
+ EBCASC(type, DASD_VOL_TYPE_LEN);
put_dev_sector(sect);
- if (!strcmp(temp, "VOL1") ||
- !strcmp(temp, "LNX1") ||
- !strcmp(temp, "CMS1")) {
- if (!strcmp(temp, "VOL1")) {
- strncpy(type, label->vol.vollbl, 4);
- strncpy(name, label->vol.volid, 6);
- } else {
- strncpy(type, label->lnx.vollbl, 4);
- strncpy(name, label->lnx.volid, 6);
- }
- EBCASC(type, 4);
- EBCASC(name, 6);
+ switch (get_label_by_type(type)) {
+ case DASD_VOLLBL_TYPE_VOL1:
+ memcpy(name, label->vol.volid, DASD_VOL_ID_LEN);
+ EBCASC(name, DASD_VOL_ID_LEN);
+ *labelsect = testsect[i];
+ return 1;
+ case DASD_VOLLBL_TYPE_LNX1:
+ case DASD_VOLLBL_TYPE_CMS1:
+ memcpy(name, label->lnx.volid, DASD_VOL_ID_LEN);
+ EBCASC(name, DASD_VOL_ID_LEN);
*labelsect = testsect[i];
- found = 1;
+ return 1;
+ default:
break;
}
}
- if (!found)
- memset(label, 0, sizeof(*label));
- return found;
+ return 0;
}
static int find_vol1_partitions(struct parsed_partitions *state,
@@ -297,8 +332,8 @@ int ibm_partition(struct parsed_partitions *state)
sector_t nr_sectors;
dasd_information2_t *info;
struct hd_geometry *geo;
- char type[5] = {0,};
- char name[7] = {0,};
+ char type[DASD_VOL_TYPE_LEN + 1] = "";
+ char name[DASD_VOL_ID_LEN + 1] = "";
sector_t labelsect;
union label_t *label;
@@ -330,18 +365,21 @@ int ibm_partition(struct parsed_partitions *state)
info = NULL;
}
- if (find_label(state, info, geo, blocksize, &labelsect, name, type,
- label)) {
- if (!strncmp(type, "VOL1", 4)) {
+ if (find_label(state, info, geo, blocksize, &labelsect, name, type, label)) {
+ switch (get_label_by_type(type)) {
+ case DASD_VOLLBL_TYPE_VOL1:
res = find_vol1_partitions(state, geo, blocksize, name,
label);
- } else if (!strncmp(type, "LNX1", 4)) {
+ break;
+ case DASD_VOLLBL_TYPE_LNX1:
res = find_lnx1_partitions(state, geo, blocksize, name,
label, labelsect, nr_sectors,
info);
- } else if (!strncmp(type, "CMS1", 4)) {
+ break;
+ case DASD_VOLLBL_TYPE_CMS1:
res = find_cms1_partitions(state, geo, blocksize, name,
label, labelsect);
+ break;
}
} else if (info) {
/*
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index 63773a90581d..c51ea95bc2ce 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -39,8 +39,7 @@ static struct ktstate kts;
#ifndef MODULE
static int __init aoe_iflist_setup(char *str)
{
- strncpy(aoe_iflist, str, IFLISTSZ);
- aoe_iflist[IFLISTSZ - 1] = '\0';
+ strscpy(aoe_iflist, str, IFLISTSZ);
return 1;
}
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 968090935eb2..22a3cf7f32e2 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1750,6 +1750,25 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
return null_handle_cmd(cmd, sector, nr_sectors, req_op(rq));
}
+static void null_queue_rqs(struct request **rqlist)
+{
+ struct request *requeue_list = NULL;
+ struct request **requeue_lastp = &requeue_list;
+ struct blk_mq_queue_data bd = { };
+ blk_status_t ret;
+
+ do {
+ struct request *rq = rq_list_pop(rqlist);
+
+ bd.rq = rq;
+ ret = null_queue_rq(rq->mq_hctx, &bd);
+ if (ret != BLK_STS_OK)
+ rq_list_add_tail(&requeue_lastp, rq);
+ } while (!rq_list_empty(*rqlist));
+
+ *rqlist = requeue_list;
+}
+
static void cleanup_queue(struct nullb_queue *nq)
{
bitmap_free(nq->tag_map);
@@ -1802,6 +1821,7 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
static const struct blk_mq_ops null_mq_ops = {
.queue_rq = null_queue_rq,
+ .queue_rqs = null_queue_rqs,
.complete = null_complete_rq,
.timeout = null_timeout_rq,
.poll = null_poll,
@@ -1946,7 +1966,7 @@ static int null_gendisk_register(struct nullb *nullb)
else
disk->fops = &null_bio_ops;
disk->private_data = nullb;
- strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
+ strscpy_pad(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
if (nullb->dev->zoned) {
int ret = null_register_zoned_dev(nullb);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1fe011676d07..4689ac2e0c0e 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -470,8 +470,6 @@ static bool virtblk_prep_rq_batch(struct request *req)
struct virtio_blk *vblk = req->mq_hctx->queue->queuedata;
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
- req->mq_hctx->tags->rqs[req->tag] = req;
-
return virtblk_prep_rq(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK;
}
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index cc2839805983..a5e07270e0d4 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -3655,7 +3655,6 @@ static struct ctl_table cdrom_table[] = {
.mode = 0644,
.proc_handler = cdrom_sysctl_handler
},
- { }
};
static struct ctl_table_header *cdrom_sysctl_header;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 5f9991765f27..a4692f8f98ee 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -749,7 +749,11 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
return ERR_PTR(-ENOMEM);
}
- mddev_init(&rs->md);
+ if (mddev_init(&rs->md)) {
+ kfree(rs);
+ ti->error = "Cannot initialize raid context";
+ return ERR_PTR(-ENOMEM);
+ }
rs->raid_disks = raid_devs;
rs->delta_disks = 0;
@@ -798,6 +802,7 @@ static void raid_set_free(struct raid_set *rs)
dm_put_device(rs->ti, rs->dev[i].data_dev);
}
+ mddev_destroy(&rs->md);
kfree(rs);
}
@@ -3239,7 +3244,7 @@ size_check:
set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
/* Has to be held on running the array */
- mddev_lock_nointr(&rs->md);
+ mddev_suspend_and_lock_nointr(&rs->md);
r = md_run(&rs->md);
rs->md.in_sync = 0; /* Assume already marked dirty */
if (r) {
@@ -3263,7 +3268,6 @@ size_check:
}
}
- mddev_suspend(&rs->md);
set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
@@ -3793,9 +3797,7 @@ static void raid_postsuspend(struct dm_target *ti)
if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery))
md_stop_writes(&rs->md);
- mddev_lock_nointr(&rs->md);
- mddev_suspend(&rs->md);
- mddev_unlock(&rs->md);
+ mddev_suspend(&rs->md, false);
}
}
@@ -4054,8 +4056,7 @@ static void raid_resume(struct dm_target *ti)
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
mddev->ro = 0;
mddev->in_sync = 0;
- mddev_resume(mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
}
}
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index 6eaa0eab40f9..4b80165afd23 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -175,7 +175,7 @@ static void __init md_setup_drive(struct md_setup_args *args)
return;
}
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err) {
pr_err("md: failed to lock array %s\n", name);
goto out_mddev_put;
@@ -221,7 +221,7 @@ static void __init md_setup_drive(struct md_setup_args *args)
if (err)
pr_warn("md: starting %s failed\n", name);
out_unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
out_mddev_put:
mddev_put(mddev);
}
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 6f9ff14971f9..9672f75c3050 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1861,7 +1861,7 @@ void md_bitmap_destroy(struct mddev *mddev)
md_bitmap_wait_behind_writes(mddev);
if (!mddev->serialize_policy)
- mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev_destroy_serial_pool(mddev, NULL);
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
@@ -1977,7 +1977,7 @@ int md_bitmap_load(struct mddev *mddev)
goto out;
rdev_for_each(rdev, mddev)
- mddev_create_serial_pool(mddev, rdev, true);
+ mddev_create_serial_pool(mddev, rdev);
if (mddev_is_clustered(mddev))
md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
@@ -2348,14 +2348,11 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
{
int rv;
- rv = mddev_lock(mddev);
+ rv = mddev_suspend_and_lock(mddev);
if (rv)
return rv;
+
if (mddev->pers) {
- if (!mddev->pers->quiesce) {
- rv = -EBUSY;
- goto out;
- }
if (mddev->recovery || mddev->sync_thread) {
rv = -EBUSY;
goto out;
@@ -2369,11 +2366,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
rv = -EBUSY;
goto out;
}
- if (mddev->pers) {
- mddev_suspend(mddev);
- md_bitmap_destroy(mddev);
- mddev_resume(mddev);
- }
+
+ md_bitmap_destroy(mddev);
mddev->bitmap_info.offset = 0;
if (mddev->bitmap_info.file) {
struct file *f = mddev->bitmap_info.file;
@@ -2383,6 +2377,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
} else {
/* No bitmap, OK to set a location */
long long offset;
+ struct bitmap *bitmap;
+
if (strncmp(buf, "none", 4) == 0)
/* nothing to be done */;
else if (strncmp(buf, "file:", 5) == 0) {
@@ -2406,25 +2402,20 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
rv = -EINVAL;
goto out;
}
+
mddev->bitmap_info.offset = offset;
- if (mddev->pers) {
- struct bitmap *bitmap;
- bitmap = md_bitmap_create(mddev, -1);
- mddev_suspend(mddev);
- if (IS_ERR(bitmap))
- rv = PTR_ERR(bitmap);
- else {
- mddev->bitmap = bitmap;
- rv = md_bitmap_load(mddev);
- if (rv)
- mddev->bitmap_info.offset = 0;
- }
- if (rv) {
- md_bitmap_destroy(mddev);
- mddev_resume(mddev);
- goto out;
- }
- mddev_resume(mddev);
+ bitmap = md_bitmap_create(mddev, -1);
+ if (IS_ERR(bitmap)) {
+ rv = PTR_ERR(bitmap);
+ goto out;
+ }
+
+ mddev->bitmap = bitmap;
+ rv = md_bitmap_load(mddev);
+ if (rv) {
+ mddev->bitmap_info.offset = 0;
+ md_bitmap_destroy(mddev);
+ goto out;
}
}
}
@@ -2437,7 +2428,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
}
rv = 0;
out:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
if (rv)
return rv;
return len;
@@ -2546,7 +2537,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (backlog > COUNTER_MAX)
return -EINVAL;
- rv = mddev_lock(mddev);
+ rv = mddev_suspend_and_lock(mddev);
if (rv)
return rv;
@@ -2571,16 +2562,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (!backlog && mddev->serial_info_pool) {
/* serial_info_pool is not needed if backlog is zero */
if (!mddev->serialize_policy)
- mddev_destroy_serial_pool(mddev, NULL, false);
+ mddev_destroy_serial_pool(mddev, NULL);
} else if (backlog && !mddev->serial_info_pool) {
/* serial_info_pool is needed since backlog is not zero */
rdev_for_each(rdev, mddev)
- mddev_create_serial_pool(mddev, rdev, false);
+ mddev_create_serial_pool(mddev, rdev);
}
if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return len;
}
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 71ac99646827..8eca7693b793 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -69,6 +69,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
if (!conf)
return NULL;
+ /*
+ * conf->raid_disks is copy of mddev->raid_disks. The reason to
+ * keep a copy of mddev->raid_disks in struct linear_conf is,
+ * mddev->raid_disks may not be consistent with pointers number of
+ * conf->disks[] when it is updated in linear_add() and used to
+ * iterate old conf->disks[] earray in linear_congested().
+ * Here conf->raid_disks is always consitent with number of
+ * pointers in conf->disks[] array, and mddev->private is updated
+ * with rcu_assign_pointer() in linear_addr(), such race can be
+ * avoided.
+ */
+ conf->raid_disks = raid_disks;
+
cnt = 0;
conf->array_sectors = 0;
@@ -112,19 +125,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
conf->disks[i-1].end_sector +
conf->disks[i].rdev->sectors;
- /*
- * conf->raid_disks is copy of mddev->raid_disks. The reason to
- * keep a copy of mddev->raid_disks in struct linear_conf is,
- * mddev->raid_disks may not be consistent with pointers number of
- * conf->disks[] when it is updated in linear_add() and used to
- * iterate old conf->disks[] earray in linear_congested().
- * Here conf->raid_disks is always consitent with number of
- * pointers in conf->disks[] array, and mddev->private is updated
- * with rcu_assign_pointer() in linear_addr(), such race can be
- * avoided.
- */
- conf->raid_disks = raid_disks;
-
return conf;
out:
@@ -183,7 +183,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
* in linear_congested(), therefore kfree_rcu() is used to free
* oldconf until no one uses it anymore.
*/
- mddev_suspend(mddev);
oldconf = rcu_dereference_protected(mddev->private,
lockdep_is_held(&mddev->reconfig_mutex));
mddev->raid_disks++;
@@ -192,7 +191,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
- mddev_resume(mddev);
kfree_rcu(oldconf, rcu);
return 0;
}
diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h
index 24e97db50ebb..5587eeedb882 100644
--- a/drivers/md/md-linear.h
+++ b/drivers/md/md-linear.h
@@ -12,6 +12,6 @@ struct linear_conf
struct rcu_head rcu;
sector_t array_sectors;
int raid_disks; /* a copy of mddev->raid_disks */
- struct dev_info disks[];
+ struct dev_info disks[] __counted_by(raid_disks);
};
#endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a104a025084d..8ee079c4dc1e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -206,8 +206,7 @@ static int rdev_need_serial(struct md_rdev *rdev)
* 1. rdev is the first device which return true from rdev_enable_serial.
* 2. rdev is NULL, means we want to enable serialization for all rdevs.
*/
-void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend)
+void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
int ret = 0;
@@ -215,15 +214,12 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
!test_bit(CollisionCheck, &rdev->flags))
return;
- if (!is_suspend)
- mddev_suspend(mddev);
-
if (!rdev)
ret = rdevs_init_serial(mddev);
else
ret = rdev_init_serial(rdev);
if (ret)
- goto abort;
+ return;
if (mddev->serial_info_pool == NULL) {
/*
@@ -238,10 +234,6 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
pr_err("can't alloc memory pool for serialization\n");
}
}
-
-abort:
- if (!is_suspend)
- mddev_resume(mddev);
}
/*
@@ -250,8 +242,7 @@ abort:
* 2. when bitmap is destroyed while policy is not enabled.
* 3. for disable policy, the pool is destroyed only when no rdev needs it.
*/
-void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend)
+void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
if (rdev && !test_bit(CollisionCheck, &rdev->flags))
return;
@@ -260,8 +251,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
struct md_rdev *temp;
int num = 0; /* used to track if other rdevs need the pool */
- if (!is_suspend)
- mddev_suspend(mddev);
rdev_for_each(temp, mddev) {
if (!rdev) {
if (!mddev->serialize_policy ||
@@ -283,8 +272,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
mempool_destroy(mddev->serial_info_pool);
mddev->serial_info_pool = NULL;
}
- if (!is_suspend)
- mddev_resume(mddev);
}
}
@@ -359,11 +346,11 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio)
return true;
if (bio_data_dir(bio) != WRITE)
return false;
- if (mddev->suspend_lo >= mddev->suspend_hi)
+ if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
return false;
- if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
+ if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
return false;
- if (bio_end_sector(bio) < mddev->suspend_lo)
+ if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
return false;
return true;
}
@@ -431,42 +418,73 @@ static void md_submit_bio(struct bio *bio)
md_handle_request(mddev, bio);
}
-/* mddev_suspend makes sure no new requests are submitted
- * to the device, and that any requests that have been submitted
- * are completely handled.
- * Once mddev_detach() is called and completes, the module will be
- * completely unused.
+/*
+ * Make sure no new requests are submitted to the device, and any requests that
+ * have been submitted are completely handled.
*/
-void mddev_suspend(struct mddev *mddev)
+int mddev_suspend(struct mddev *mddev, bool interruptible)
{
- struct md_thread *thread = rcu_dereference_protected(mddev->thread,
- lockdep_is_held(&mddev->reconfig_mutex));
+ int err = 0;
- WARN_ON_ONCE(thread && current == thread->tsk);
- if (mddev->suspended++)
- return;
- wake_up(&mddev->sb_wait);
- set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
- percpu_ref_kill(&mddev->active_io);
+ /*
+ * hold reconfig_mutex to wait for normal io will deadlock, because
+ * other context can't update super_block, and normal io can rely on
+ * updating super_block.
+ */
+ lockdep_assert_not_held(&mddev->reconfig_mutex);
- if (mddev->pers->prepare_suspend)
- mddev->pers->prepare_suspend(mddev);
+ if (interruptible)
+ err = mutex_lock_interruptible(&mddev->suspend_mutex);
+ else
+ mutex_lock(&mddev->suspend_mutex);
+ if (err)
+ return err;
+
+ if (mddev->suspended) {
+ WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
+ mutex_unlock(&mddev->suspend_mutex);
+ return 0;
+ }
+
+ percpu_ref_kill(&mddev->active_io);
+ if (interruptible)
+ err = wait_event_interruptible(mddev->sb_wait,
+ percpu_ref_is_zero(&mddev->active_io));
+ else
+ wait_event(mddev->sb_wait,
+ percpu_ref_is_zero(&mddev->active_io));
+ if (err) {
+ percpu_ref_resurrect(&mddev->active_io);
+ mutex_unlock(&mddev->suspend_mutex);
+ return err;
+ }
- wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
- clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
- wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
+ /*
+ * For raid456, io might be waiting for reshape to make progress,
+ * allow new reshape to start while waiting for io to be done to
+ * prevent deadlock.
+ */
+ WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
del_timer_sync(&mddev->safemode_timer);
/* restrict memory reclaim I/O during raid array is suspend */
mddev->noio_flag = memalloc_noio_save();
+
+ mutex_unlock(&mddev->suspend_mutex);
+ return 0;
}
EXPORT_SYMBOL_GPL(mddev_suspend);
void mddev_resume(struct mddev *mddev)
{
- lockdep_assert_held(&mddev->reconfig_mutex);
- if (--mddev->suspended)
+ lockdep_assert_not_held(&mddev->reconfig_mutex);
+
+ mutex_lock(&mddev->suspend_mutex);
+ WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
+ if (mddev->suspended) {
+ mutex_unlock(&mddev->suspend_mutex);
return;
+ }
/* entred the memalloc scope from mddev_suspend() */
memalloc_noio_restore(mddev->noio_flag);
@@ -477,6 +495,8 @@ void mddev_resume(struct mddev *mddev)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
+
+ mutex_unlock(&mddev->suspend_mutex);
}
EXPORT_SYMBOL_GPL(mddev_resume);
@@ -616,34 +636,63 @@ static inline struct mddev *mddev_get(struct mddev *mddev)
static void mddev_delayed_delete(struct work_struct *ws);
+static void __mddev_put(struct mddev *mddev)
+{
+ if (mddev->raid_disks || !list_empty(&mddev->disks) ||
+ mddev->ctime || mddev->hold_active)
+ return;
+
+ /* Array is not configured at all, and not held active, so destroy it */
+ set_bit(MD_DELETED, &mddev->flags);
+
+ /*
+ * Call queue_work inside the spinlock so that flush_workqueue() after
+ * mddev_find will succeed in waiting for the work to be done.
+ */
+ queue_work(md_misc_wq, &mddev->del_work);
+}
+
void mddev_put(struct mddev *mddev)
{
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
- if (!mddev->raid_disks && list_empty(&mddev->disks) &&
- mddev->ctime == 0 && !mddev->hold_active) {
- /* Array is not configured at all, and not held active,
- * so destroy it */
- set_bit(MD_DELETED, &mddev->flags);
- /*
- * Call queue_work inside the spinlock so that
- * flush_workqueue() after mddev_find will succeed in waiting
- * for the work to be done.
- */
- INIT_WORK(&mddev->del_work, mddev_delayed_delete);
- queue_work(md_misc_wq, &mddev->del_work);
- }
+ __mddev_put(mddev);
spin_unlock(&all_mddevs_lock);
}
static void md_safemode_timeout(struct timer_list *t);
+static void md_start_sync(struct work_struct *ws);
+
+static void active_io_release(struct percpu_ref *ref)
+{
+ struct mddev *mddev = container_of(ref, struct mddev, active_io);
+
+ wake_up(&mddev->sb_wait);
+}
+
+static void no_op(struct percpu_ref *r) {}
-void mddev_init(struct mddev *mddev)
+int mddev_init(struct mddev *mddev)
{
+
+ if (percpu_ref_init(&mddev->active_io, active_io_release,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (percpu_ref_init(&mddev->writes_pending, no_op,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+ percpu_ref_exit(&mddev->active_io);
+ return -ENOMEM;
+ }
+
+ /* We want to start with the refcount at zero */
+ percpu_ref_put(&mddev->writes_pending);
+
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->sync_mutex);
+ mutex_init(&mddev->suspend_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
@@ -662,9 +711,21 @@ void mddev_init(struct mddev *mddev)
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
+
+ INIT_WORK(&mddev->sync_work, md_start_sync);
+ INIT_WORK(&mddev->del_work, mddev_delayed_delete);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(mddev_init);
+void mddev_destroy(struct mddev *mddev)
+{
+ percpu_ref_exit(&mddev->active_io);
+ percpu_ref_exit(&mddev->writes_pending);
+}
+EXPORT_SYMBOL_GPL(mddev_destroy);
+
static struct mddev *mddev_find_locked(dev_t unit)
{
struct mddev *mddev;
@@ -708,13 +769,16 @@ static struct mddev *mddev_alloc(dev_t unit)
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (!new)
return ERR_PTR(-ENOMEM);
- mddev_init(new);
+
+ error = mddev_init(new);
+ if (error)
+ goto out_free_new;
spin_lock(&all_mddevs_lock);
if (unit) {
error = -EEXIST;
if (mddev_find_locked(unit))
- goto out_free_new;
+ goto out_destroy_new;
new->unit = unit;
if (MAJOR(unit) == MD_MAJOR)
new->md_minor = MINOR(unit);
@@ -725,7 +789,7 @@ static struct mddev *mddev_alloc(dev_t unit)
error = -ENODEV;
new->unit = mddev_alloc_unit();
if (!new->unit)
- goto out_free_new;
+ goto out_destroy_new;
new->md_minor = MINOR(new->unit);
new->hold_active = UNTIL_STOP;
}
@@ -733,8 +797,11 @@ static struct mddev *mddev_alloc(dev_t unit)
list_add(&new->all_mddevs, &all_mddevs);
spin_unlock(&all_mddevs_lock);
return new;
-out_free_new:
+
+out_destroy_new:
spin_unlock(&all_mddevs_lock);
+ mddev_destroy(new);
+out_free_new:
kfree(new);
return ERR_PTR(error);
}
@@ -745,6 +812,7 @@ static void mddev_free(struct mddev *mddev)
list_del(&mddev->all_mddevs);
spin_unlock(&all_mddevs_lock);
+ mddev_destroy(mddev);
kfree(mddev);
}
@@ -2412,7 +2480,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
pr_debug("md: bind<%s>\n", b);
if (mddev->raid_disks)
- mddev_create_serial_pool(mddev, rdev, false);
+ mddev_create_serial_pool(mddev, rdev);
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
@@ -2465,7 +2533,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%pg>\n", rdev->bdev);
- mddev_destroy_serial_pool(rdev->mddev, rdev, false);
+ mddev_destroy_serial_pool(rdev->mddev, rdev);
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
@@ -2795,11 +2863,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
*/
super_types[mddev->major_version].
validate_super(mddev, rdev);
- if (add_journal)
- mddev_suspend(mddev);
err = mddev->pers->hot_add_disk(mddev, rdev);
- if (add_journal)
- mddev_resume(mddev);
if (err) {
md_kick_rdev_from_array(rdev);
return err;
@@ -2936,11 +3000,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
- mddev_create_serial_pool(rdev->mddev, rdev, false);
+ mddev_create_serial_pool(rdev->mddev, rdev);
need_update_sb = true;
err = 0;
} else if (cmd_match(buf, "-writemostly")) {
- mddev_destroy_serial_pool(rdev->mddev, rdev, false);
+ mddev_destroy_serial_pool(rdev->mddev, rdev);
clear_bit(WriteMostly, &rdev->flags);
need_update_sb = true;
err = 0;
@@ -3552,6 +3616,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
struct kernfs_node *kn = NULL;
+ bool suspend = false;
ssize_t rv;
struct mddev *mddev = rdev->mddev;
@@ -3559,17 +3624,25 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
+ if (!mddev)
+ return -ENODEV;
- if (entry->store == state_store && cmd_match(page, "remove"))
- kn = sysfs_break_active_protection(kobj, attr);
+ if (entry->store == state_store) {
+ if (cmd_match(page, "remove"))
+ kn = sysfs_break_active_protection(kobj, attr);
+ if (cmd_match(page, "remove") || cmd_match(page, "re-add") ||
+ cmd_match(page, "writemostly") ||
+ cmd_match(page, "-writemostly"))
+ suspend = true;
+ }
- rv = mddev ? mddev_lock(mddev) : -ENODEV;
+ rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev);
if (!rv) {
if (rdev->mddev == NULL)
rv = -ENODEV;
else
rv = entry->store(rdev, page, length);
- mddev_unlock(mddev);
+ suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
}
if (kn)
@@ -3874,12 +3947,12 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
if (slen == 0 || slen >= sizeof(clevel))
return -EINVAL;
- rv = mddev_lock(mddev);
+ rv = mddev_suspend_and_lock(mddev);
if (rv)
return rv;
if (mddev->pers == NULL) {
- strncpy(mddev->clevel, buf, slen);
+ memcpy(mddev->clevel, buf, slen);
if (mddev->clevel[slen-1] == '\n')
slen--;
mddev->clevel[slen] = 0;
@@ -3912,7 +3985,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
}
/* Now find the new personality */
- strncpy(clevel, buf, slen);
+ memcpy(clevel, buf, slen);
if (clevel[slen-1] == '\n')
slen--;
clevel[slen] = 0;
@@ -3967,7 +4040,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
}
/* Looks like we have a winner */
- mddev_suspend(mddev);
mddev_detach(mddev);
spin_lock(&mddev->lock);
@@ -4053,14 +4125,13 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
blk_set_stacking_limits(&mddev->queue->limits);
pers->run(mddev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
- mddev_resume(mddev);
if (!mddev->thread)
md_update_sb(mddev, 1);
sysfs_notify_dirent_safe(mddev->sysfs_level);
md_new_event();
rv = len;
out_unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return rv;
}
@@ -4368,6 +4439,18 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
int err = 0;
enum array_state st = match_word(buf, array_states);
+ /* No lock dependent actions */
+ switch (st) {
+ case suspended: /* not supported yet */
+ case write_pending: /* cannot be set */
+ case active_idle: /* cannot be set */
+ case broken: /* cannot be set */
+ case bad_word:
+ return -EINVAL;
+ default:
+ break;
+ }
+
if (mddev->pers && (st == active || st == clean) &&
mddev->ro != MD_RDONLY) {
/* don't take reconfig_mutex when toggling between
@@ -4392,23 +4475,16 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = mddev_lock(mddev);
if (err)
return err;
- err = -EINVAL;
- switch(st) {
- case bad_word:
- break;
- case clear:
- /* stopping an active array */
- err = do_md_stop(mddev, 0, NULL);
- break;
+
+ switch (st) {
case inactive:
- /* stopping an active array */
+ /* stop an active array, return 0 otherwise */
if (mddev->pers)
err = do_md_stop(mddev, 2, NULL);
- else
- err = 0; /* already inactive */
break;
- case suspended:
- break; /* not supported yet */
+ case clear:
+ err = do_md_stop(mddev, 0, NULL);
+ break;
case readonly:
if (mddev->pers)
err = md_set_readonly(mddev, NULL);
@@ -4459,10 +4535,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = do_md_run(mddev);
}
break;
- case write_pending:
- case active_idle:
- case broken:
- /* these cannot be set */
+ default:
+ err = -EINVAL;
break;
}
@@ -4535,7 +4609,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev))
return -EOVERFLOW;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
if (mddev->persistent) {
@@ -4556,14 +4630,14 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
rdev = md_import_device(dev, -1, -1);
if (IS_ERR(rdev)) {
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return PTR_ERR(rdev);
}
err = bind_rdev_to_array(rdev, mddev);
out:
if (err)
export_rdev(rdev, mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
if (!err)
md_new_event();
return err ? err : len;
@@ -4698,7 +4772,7 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len)
size_t namelen = len-9;
if (namelen >= sizeof(mddev->metadata_type))
namelen = sizeof(mddev->metadata_type)-1;
- strncpy(mddev->metadata_type, buf+9, namelen);
+ memcpy(mddev->metadata_type, buf+9, namelen);
mddev->metadata_type[namelen] = 0;
if (namelen && mddev->metadata_type[namelen-1] == '\n')
mddev->metadata_type[--namelen] = 0;
@@ -4872,6 +4946,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
/* A write to sync_action is enough to justify
* canceling read-auto mode
*/
+ flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR;
md_wakeup_thread(mddev->sync_thread);
}
@@ -5128,7 +5203,8 @@ __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
static ssize_t
suspend_lo_show(struct mddev *mddev, char *page)
{
- return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
+ return sprintf(page, "%llu\n",
+ (unsigned long long)READ_ONCE(mddev->suspend_lo));
}
static ssize_t
@@ -5143,21 +5219,14 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
if (new != (sector_t)new)
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend(mddev, true);
if (err)
return err;
- err = -EINVAL;
- if (mddev->pers == NULL ||
- mddev->pers->quiesce == NULL)
- goto unlock;
- mddev_suspend(mddev);
- mddev->suspend_lo = new;
+
+ WRITE_ONCE(mddev->suspend_lo, new);
mddev_resume(mddev);
- err = 0;
-unlock:
- mddev_unlock(mddev);
- return err ?: len;
+ return len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -5165,7 +5234,8 @@ __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
static ssize_t
suspend_hi_show(struct mddev *mddev, char *page)
{
- return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
+ return sprintf(page, "%llu\n",
+ (unsigned long long)READ_ONCE(mddev->suspend_hi));
}
static ssize_t
@@ -5180,21 +5250,14 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
if (new != (sector_t)new)
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend(mddev, true);
if (err)
return err;
- err = -EINVAL;
- if (mddev->pers == NULL)
- goto unlock;
- mddev_suspend(mddev);
- mddev->suspend_hi = new;
+ WRITE_ONCE(mddev->suspend_hi, new);
mddev_resume(mddev);
- err = 0;
-unlock:
- mddev_unlock(mddev);
- return err ?: len;
+ return len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -5441,7 +5504,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
if (value == mddev->serialize_policy)
return len;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
if (mddev->pers == NULL || (mddev->pers->level != 1)) {
@@ -5450,15 +5513,13 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
goto unlock;
}
- mddev_suspend(mddev);
if (value)
- mddev_create_serial_pool(mddev, NULL, true);
+ mddev_create_serial_pool(mddev, NULL);
else
- mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev_destroy_serial_pool(mddev, NULL);
mddev->serialize_policy = value;
- mddev_resume(mddev);
unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -5597,21 +5658,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
kobject_put(&mddev->kobj);
}
-static void no_op(struct percpu_ref *r) {}
-
-int mddev_init_writes_pending(struct mddev *mddev)
-{
- if (mddev->writes_pending.percpu_count_ptr)
- return 0;
- if (percpu_ref_init(&mddev->writes_pending, no_op,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
- return -ENOMEM;
- /* We want to start with the refcount at zero */
- percpu_ref_put(&mddev->writes_pending);
- return 0;
-}
-EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
-
struct mddev *md_alloc(dev_t dev, char *name)
{
/*
@@ -5783,12 +5829,6 @@ static void md_safemode_timeout(struct timer_list *t)
}
static int start_dirty_degraded;
-static void active_io_release(struct percpu_ref *ref)
-{
- struct mddev *mddev = container_of(ref, struct mddev, active_io);
-
- wake_up(&mddev->sb_wait);
-}
int md_run(struct mddev *mddev)
{
@@ -5869,15 +5909,10 @@ int md_run(struct mddev *mddev)
nowait = nowait && bdev_nowait(rdev->bdev);
}
- err = percpu_ref_init(&mddev->active_io, active_io_release,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
- if (err)
- return err;
-
if (!bioset_initialized(&mddev->bio_set)) {
err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
if (err)
- goto exit_active_io;
+ return err;
}
if (!bioset_initialized(&mddev->sync_set)) {
err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
@@ -6074,8 +6109,6 @@ exit_sync_set:
bioset_exit(&mddev->sync_set);
exit_bio_set:
bioset_exit(&mddev->bio_set);
-exit_active_io:
- percpu_ref_exit(&mddev->active_io);
return err;
}
EXPORT_SYMBOL_GPL(md_run);
@@ -6249,7 +6282,7 @@ static void __md_stop_writes(struct mddev *mddev)
}
/* disable policy to guarantee rdevs free resources for serialization */
mddev->serialize_policy = 0;
- mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev_destroy_serial_pool(mddev, NULL);
}
void md_stop_writes(struct mddev *mddev)
@@ -6291,7 +6324,6 @@ static void __md_stop(struct mddev *mddev)
module_put(pers->owner);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- percpu_ref_exit(&mddev->active_io);
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
bioset_exit(&mddev->io_clone_set);
@@ -6306,7 +6338,6 @@ void md_stop(struct mddev *mddev)
*/
__md_stop_writes(mddev);
__md_stop(mddev);
- percpu_ref_exit(&mddev->writes_pending);
}
EXPORT_SYMBOL_GPL(md_stop);
@@ -6543,13 +6574,13 @@ static void autorun_devices(int part)
if (IS_ERR(mddev))
break;
- if (mddev_lock(mddev))
+ if (mddev_suspend_and_lock(mddev))
pr_warn("md: %s locked, cannot run\n", mdname(mddev));
else if (mddev->raid_disks || mddev->major_version
|| !list_empty(&mddev->disks)) {
pr_warn("md: %s already running, cannot run %pg\n",
mdname(mddev), rdev0->bdev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
} else {
pr_debug("md: created %s\n", mdname(mddev));
mddev->persistent = 1;
@@ -6559,7 +6590,7 @@ static void autorun_devices(int part)
export_rdev(rdev, mddev);
}
autorun_array(mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
}
/* on success, candidates will be empty, on error
* it won't...
@@ -7109,7 +7140,6 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
struct bitmap *bitmap;
bitmap = md_bitmap_create(mddev, -1);
- mddev_suspend(mddev);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
err = md_bitmap_load(mddev);
@@ -7119,11 +7149,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
md_bitmap_destroy(mddev);
fd = -1;
}
- mddev_resume(mddev);
} else if (fd < 0) {
- mddev_suspend(mddev);
md_bitmap_destroy(mddev);
- mddev_resume(mddev);
}
}
if (fd < 0) {
@@ -7412,7 +7439,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
bitmap = md_bitmap_create(mddev, -1);
- mddev_suspend(mddev);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
rv = md_bitmap_load(mddev);
@@ -7420,7 +7446,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
rv = PTR_ERR(bitmap);
if (rv)
md_bitmap_destroy(mddev);
- mddev_resume(mddev);
} else {
/* remove the bitmap */
if (!mddev->bitmap) {
@@ -7445,9 +7470,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
module_put(md_cluster_mod);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
- mddev_suspend(mddev);
md_bitmap_destroy(mddev);
- mddev_resume(mddev);
mddev->bitmap_info.offset = 0;
}
}
@@ -7518,6 +7541,20 @@ static inline bool md_ioctl_valid(unsigned int cmd)
}
}
+static bool md_ioctl_need_suspend(unsigned int cmd)
+{
+ switch (cmd) {
+ case ADD_NEW_DISK:
+ case HOT_ADD_DISK:
+ case HOT_REMOVE_DISK:
+ case SET_BITMAP_FILE:
+ case SET_ARRAY_INFO:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int __md_set_array_info(struct mddev *mddev, void __user *argp)
{
mdu_array_info_t info;
@@ -7646,7 +7683,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
mutex_unlock(&mddev->open_mutex);
sync_blockdev(bdev);
}
- err = mddev_lock(mddev);
+
+ if (!md_is_rdwr(mddev))
+ flush_work(&mddev->sync_work);
+
+ err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
+ mddev_lock(mddev);
if (err) {
pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
err, cmd);
@@ -7774,7 +7816,10 @@ unlock:
if (mddev->hold_active == UNTIL_IOCTL &&
err != -EINVAL)
mddev->hold_active = 0;
- mddev_unlock(mddev);
+
+ md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
+ mddev_unlock(mddev);
+
out:
if(did_set_md_closing)
clear_bit(MD_CLOSING, &mddev->flags);
@@ -7886,7 +7931,6 @@ static void md_free_disk(struct gendisk *disk)
{
struct mddev *mddev = disk->private_data;
- percpu_ref_exit(&mddev->writes_pending);
mddev_free(mddev);
}
@@ -8202,105 +8246,46 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
}
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&all_mddevs_lock)
{
- struct list_head *tmp;
- loff_t l = *pos;
- struct mddev *mddev;
+ struct md_personality *pers;
- if (l == 0x10000) {
- ++*pos;
- return (void *)2;
- }
- if (l > 0x10000)
- return NULL;
- if (!l--)
- /* header */
- return (void*)1;
+ seq_puts(seq, "Personalities : ");
+ spin_lock(&pers_lock);
+ list_for_each_entry(pers, &pers_list, list)
+ seq_printf(seq, "[%s] ", pers->name);
+
+ spin_unlock(&pers_lock);
+ seq_puts(seq, "\n");
+ seq->poll_event = atomic_read(&md_event_count);
spin_lock(&all_mddevs_lock);
- list_for_each(tmp,&all_mddevs)
- if (!l--) {
- mddev = list_entry(tmp, struct mddev, all_mddevs);
- if (!mddev_get(mddev))
- continue;
- spin_unlock(&all_mddevs_lock);
- return mddev;
- }
- spin_unlock(&all_mddevs_lock);
- if (!l--)
- return (void*)2;/* tail */
- return NULL;
+
+ return seq_list_start(&all_mddevs, *pos);
}
static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct list_head *tmp;
- struct mddev *next_mddev, *mddev = v;
- struct mddev *to_put = NULL;
-
- ++*pos;
- if (v == (void*)2)
- return NULL;
-
- spin_lock(&all_mddevs_lock);
- if (v == (void*)1) {
- tmp = all_mddevs.next;
- } else {
- to_put = mddev;
- tmp = mddev->all_mddevs.next;
- }
-
- for (;;) {
- if (tmp == &all_mddevs) {
- next_mddev = (void*)2;
- *pos = 0x10000;
- break;
- }
- next_mddev = list_entry(tmp, struct mddev, all_mddevs);
- if (mddev_get(next_mddev))
- break;
- mddev = next_mddev;
- tmp = mddev->all_mddevs.next;
- }
- spin_unlock(&all_mddevs_lock);
-
- if (to_put)
- mddev_put(to_put);
- return next_mddev;
-
+ return seq_list_next(v, &all_mddevs, pos);
}
static void md_seq_stop(struct seq_file *seq, void *v)
+ __releases(&all_mddevs_lock)
{
- struct mddev *mddev = v;
-
- if (mddev && v != (void*)1 && v != (void*)2)
- mddev_put(mddev);
+ status_unused(seq);
+ spin_unlock(&all_mddevs_lock);
}
static int md_seq_show(struct seq_file *seq, void *v)
{
- struct mddev *mddev = v;
+ struct mddev *mddev = list_entry(v, struct mddev, all_mddevs);
sector_t sectors;
struct md_rdev *rdev;
- if (v == (void*)1) {
- struct md_personality *pers;
- seq_printf(seq, "Personalities : ");
- spin_lock(&pers_lock);
- list_for_each_entry(pers, &pers_list, list)
- seq_printf(seq, "[%s] ", pers->name);
-
- spin_unlock(&pers_lock);
- seq_printf(seq, "\n");
- seq->poll_event = atomic_read(&md_event_count);
- return 0;
- }
- if (v == (void*)2) {
- status_unused(seq);
+ if (!mddev_get(mddev))
return 0;
- }
+ spin_unlock(&all_mddevs_lock);
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : %sactive", mdname(mddev),
@@ -8371,6 +8356,9 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
spin_unlock(&mddev->lock);
+ spin_lock(&all_mddevs_lock);
+ if (atomic_dec_and_test(&mddev->active))
+ __mddev_put(mddev);
return 0;
}
@@ -8570,6 +8558,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
BUG_ON(mddev->ro == MD_RDONLY);
if (mddev->ro == MD_AUTO_READ) {
/* need to switch to read/write */
+ flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -9161,6 +9150,85 @@ void md_do_sync(struct md_thread *thread)
}
EXPORT_SYMBOL_GPL(md_do_sync);
+static bool rdev_removeable(struct md_rdev *rdev)
+{
+ /* rdev is not used. */
+ if (rdev->raid_disk < 0)
+ return false;
+
+ /* There are still inflight io, don't remove this rdev. */
+ if (atomic_read(&rdev->nr_pending))
+ return false;
+
+ /*
+ * An error occurred but has not yet been acknowledged by the metadata
+ * handler, don't remove this rdev.
+ */
+ if (test_bit(Blocked, &rdev->flags))
+ return false;
+
+ /* Fautly rdev is not used, it's safe to remove it. */
+ if (test_bit(Faulty, &rdev->flags))
+ return true;
+
+ /* Journal disk can only be removed if it's faulty. */
+ if (test_bit(Journal, &rdev->flags))
+ return false;
+
+ /*
+ * 'In_sync' is cleared while 'raid_disk' is valid, which means
+ * replacement has just become active from pers->spare_active(), and
+ * then pers->hot_remove_disk() will replace this rdev with replacement.
+ */
+ if (!test_bit(In_sync, &rdev->flags))
+ return true;
+
+ return false;
+}
+
+static bool rdev_is_spare(struct md_rdev *rdev)
+{
+ return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
+ !test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags);
+}
+
+static bool rdev_addable(struct md_rdev *rdev)
+{
+ /* rdev is already used, don't add it again. */
+ if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
+ test_bit(Faulty, &rdev->flags))
+ return false;
+
+ /* Allow to add journal disk. */
+ if (test_bit(Journal, &rdev->flags))
+ return true;
+
+ /* Allow to add if array is read-write. */
+ if (md_is_rdwr(rdev->mddev))
+ return true;
+
+ /*
+ * For read-only array, only allow to readd a rdev. And if bitmap is
+ * used, don't allow to readd a rdev that is too old.
+ */
+ if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))
+ return true;
+
+ return false;
+}
+
+static bool md_spares_need_change(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ if (rdev_removeable(rdev) || rdev_addable(rdev))
+ return true;
+ return false;
+}
+
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this)
{
@@ -9193,12 +9261,8 @@ static int remove_and_add_spares(struct mddev *mddev,
synchronize_rcu();
rdev_for_each(rdev, mddev) {
if ((this == NULL || rdev == this) &&
- rdev->raid_disk >= 0 &&
- !test_bit(Blocked, &rdev->flags) &&
- ((test_bit(RemoveSynchronized, &rdev->flags) ||
- (!test_bit(In_sync, &rdev->flags) &&
- !test_bit(Journal, &rdev->flags))) &&
- atomic_read(&rdev->nr_pending)==0)) {
+ (test_bit(RemoveSynchronized, &rdev->flags) ||
+ rdev_removeable(rdev))) {
if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
@@ -9220,25 +9284,12 @@ static int remove_and_add_spares(struct mddev *mddev,
rdev_for_each(rdev, mddev) {
if (this && this != rdev)
continue;
- if (test_bit(Candidate, &rdev->flags))
- continue;
- if (rdev->raid_disk >= 0 &&
- !test_bit(In_sync, &rdev->flags) &&
- !test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags))
+ if (rdev_is_spare(rdev))
spares++;
- if (rdev->raid_disk >= 0)
+ if (!rdev_addable(rdev))
continue;
- if (test_bit(Faulty, &rdev->flags))
- continue;
- if (!test_bit(Journal, &rdev->flags)) {
- if (!md_is_rdwr(mddev) &&
- !(rdev->saved_raid_disk >= 0 &&
- !test_bit(Bitmap_sync, &rdev->flags)))
- continue;
-
+ if (!test_bit(Journal, &rdev->flags))
rdev->recovery_offset = 0;
- }
if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
/* failure here is OK */
sysfs_link_rdev(mddev, rdev);
@@ -9254,9 +9305,86 @@ no_add:
return spares;
}
+static bool md_choose_sync_action(struct mddev *mddev, int *spares)
+{
+ /* Check if reshape is in progress first. */
+ if (mddev->reshape_position != MaxSector) {
+ if (mddev->pers->check_reshape == NULL ||
+ mddev->pers->check_reshape(mddev) != 0)
+ return false;
+
+ set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ return true;
+ }
+
+ /*
+ * Remove any failed drives, then add spares if possible. Spares are
+ * also removed and re-added, to allow the personality to fail the
+ * re-add.
+ */
+ *spares = remove_and_add_spares(mddev, NULL);
+ if (*spares) {
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+
+ /* Start new recovery. */
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ return true;
+ }
+
+ /* Check if recovery is in progress. */
+ if (mddev->recovery_cp < MaxSector) {
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ return true;
+ }
+
+ /* Delay to choose resync/check/repair in md_do_sync(). */
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ return true;
+
+ /* Nothing to be done */
+ return false;
+}
+
static void md_start_sync(struct work_struct *ws)
{
- struct mddev *mddev = container_of(ws, struct mddev, del_work);
+ struct mddev *mddev = container_of(ws, struct mddev, sync_work);
+ int spares = 0;
+ bool suspend = false;
+
+ if (md_spares_need_change(mddev))
+ suspend = true;
+
+ suspend ? mddev_suspend_and_lock_nointr(mddev) :
+ mddev_lock_nointr(mddev);
+
+ if (!md_is_rdwr(mddev)) {
+ /*
+ * On a read-only array we can:
+ * - remove failed devices
+ * - add already-in_sync devices if the array itself is in-sync.
+ * As we only add devices that are already in-sync, we can
+ * activate the spares immediately.
+ */
+ remove_and_add_spares(mddev, NULL);
+ goto not_running;
+ }
+
+ if (!md_choose_sync_action(mddev, &spares))
+ goto not_running;
+
+ if (!mddev->pers->sync_request)
+ goto not_running;
+
+ /*
+ * We are adding a device or devices to an array which has the bitmap
+ * stored on all devices. So make sure all bitmap pages get written.
+ */
+ if (spares)
+ md_bitmap_write_all(mddev->bitmap);
rcu_assign_pointer(mddev->sync_thread,
md_register_thread(md_do_sync, mddev, "resync"));
@@ -9264,20 +9392,27 @@ static void md_start_sync(struct work_struct *ws)
pr_warn("%s: could not start resync thread...\n",
mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
- clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
- clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
- clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- wake_up(&resync_wait);
- if (test_and_clear_bit(MD_RECOVERY_RECOVER,
- &mddev->recovery))
- if (mddev->sysfs_action)
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- } else
- md_wakeup_thread(mddev->sync_thread);
+ goto not_running;
+ }
+
+ suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
+ md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
md_new_event();
+ return;
+
+not_running:
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
+
+ wake_up(&resync_wait);
+ if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
+ mddev->sysfs_action)
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
}
/*
@@ -9304,19 +9439,7 @@ static void md_start_sync(struct work_struct *ws)
*/
void md_check_recovery(struct mddev *mddev)
{
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
- /* Write superblock - thread that called mddev_suspend()
- * holds reconfig_mutex for us.
- */
- set_bit(MD_UPDATING_SB, &mddev->flags);
- smp_mb__after_atomic();
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
- md_update_sb(mddev, 0);
- clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
- wake_up(&mddev->sb_wait);
- }
-
- if (is_md_suspended(mddev))
+ if (READ_ONCE(mddev->suspended))
return;
if (mddev->bitmap)
@@ -9345,7 +9468,6 @@ void md_check_recovery(struct mddev *mddev)
return;
if (mddev_trylock(mddev)) {
- int spares = 0;
bool try_set_sync = mddev->safemode != 0;
if (!mddev->external && mddev->safemode == 1)
@@ -9353,30 +9475,43 @@ void md_check_recovery(struct mddev *mddev)
if (!md_is_rdwr(mddev)) {
struct md_rdev *rdev;
+
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+ /* sync_work already queued. */
+ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ goto unlock;
+ }
+
if (!mddev->external && mddev->in_sync)
- /* 'Blocked' flag not needed as failed devices
+ /*
+ * 'Blocked' flag not needed as failed devices
* will be recorded if array switched to read/write.
* Leaving it set will prevent the device
* from being removed.
*/
rdev_for_each(rdev, mddev)
clear_bit(Blocked, &rdev->flags);
- /* On a read-only array we can:
- * - remove failed devices
- * - add already-in_sync devices if the array itself
- * is in-sync.
- * As we only add devices that are already in-sync,
- * we can activate the spares immediately.
- */
- remove_and_add_spares(mddev, NULL);
- /* There is no thread, but we need to call
+
+ /*
+ * There is no thread, but we need to call
* ->spare_active and clear saved_raid_disk
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
+
+ /*
+ * Let md_start_sync() to remove and add rdevs to the
+ * array.
+ */
+ if (md_spares_need_change(mddev)) {
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ queue_work(md_misc_wq, &mddev->sync_work);
+ }
+
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
+
goto unlock;
}
@@ -9432,56 +9567,14 @@ void md_check_recovery(struct mddev *mddev)
clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
- if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
- test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
- goto not_running;
- /* no recovery is running.
- * remove any failed drives, then
- * add spares if possible.
- * Spares are also removed and re-added, to allow
- * the personality to fail the re-add.
- */
-
- if (mddev->reshape_position != MaxSector) {
- if (mddev->pers->check_reshape == NULL ||
- mddev->pers->check_reshape(mddev) != 0)
- /* Cannot proceed */
- goto not_running;
- set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
- clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- } else if ((spares = remove_and_add_spares(mddev, NULL))) {
- clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
- clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- } else if (mddev->recovery_cp < MaxSector) {
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
- /* nothing to be done ... */
- goto not_running;
-
- if (mddev->pers->sync_request) {
- if (spares) {
- /* We are adding a device or devices to an array
- * which has the bitmap stored on all devices.
- * So make sure all bitmap pages get written
- */
- md_bitmap_write_all(mddev->bitmap);
- }
- INIT_WORK(&mddev->del_work, md_start_sync);
- queue_work(md_misc_wq, &mddev->del_work);
- goto unlock;
- }
- not_running:
- if (!mddev->sync_thread) {
+ if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+ queue_work(md_misc_wq, &mddev->sync_work);
+ } else {
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
wake_up(&resync_wait);
- if (test_and_clear_bit(MD_RECOVERY_RECOVER,
- &mddev->recovery))
- if (mddev->sysfs_action)
- sysfs_notify_dirent_safe(mddev->sysfs_action);
}
+
unlock:
wake_up(&mddev->sb_wait);
mddev_unlock(mddev);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7c9c13abd7ca..55d01d431418 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -248,10 +248,6 @@ struct md_cluster_info;
* become failed.
* @MD_HAS_PPL: The raid array has PPL feature set.
* @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set.
- * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata
- * without taking reconfig_mutex.
- * @MD_UPDATING_SB: md_check_recovery is updating the metadata without
- * explicitly holding reconfig_mutex.
* @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that
* array is ready yet.
* @MD_BROKEN: This is used to stop writes and mark array as failed.
@@ -268,8 +264,6 @@ enum mddev_flags {
MD_FAILFAST_SUPPORTED,
MD_HAS_PPL,
MD_HAS_MULTIPLE_PPLS,
- MD_ALLOW_SB_UPDATE,
- MD_UPDATING_SB,
MD_NOT_READY,
MD_BROKEN,
MD_DELETED,
@@ -316,6 +310,7 @@ struct mddev {
unsigned long sb_flags;
int suspended;
+ struct mutex suspend_mutex;
struct percpu_ref active_io;
int ro;
int sysfs_active; /* set when sysfs deletes
@@ -453,7 +448,10 @@ struct mddev {
struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */
struct kernfs_node *sysfs_level; /*handle for 'level' */
- struct work_struct del_work; /* used for delayed sysfs removal */
+ /* used for delayed sysfs removal */
+ struct work_struct del_work;
+ /* used for register new sync thread */
+ struct work_struct sync_work;
/* "lock" protects:
* flush_bio transition from NULL to !NULL
@@ -768,7 +766,6 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t
extern void md_wakeup_thread(struct md_thread __rcu *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
-extern int mddev_init_writes_pending(struct mddev *mddev);
extern bool md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
@@ -795,7 +792,8 @@ extern int md_integrity_register(struct mddev *mddev);
extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
-extern void mddev_init(struct mddev *mddev);
+extern int mddev_init(struct mddev *mddev);
+extern void mddev_destroy(struct mddev *mddev);
struct mddev *md_alloc(dev_t dev, char *name);
void mddev_put(struct mddev *mddev);
extern int md_run(struct mddev *mddev);
@@ -806,15 +804,14 @@ extern int md_rdev_init(struct md_rdev *rdev);
extern void md_rdev_clear(struct md_rdev *rdev);
extern void md_handle_request(struct mddev *mddev, struct bio *bio);
-extern void mddev_suspend(struct mddev *mddev);
+extern int mddev_suspend(struct mddev *mddev, bool interruptible);
extern void mddev_resume(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
-extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend);
-extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend);
+extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev);
+extern void mddev_destroy_serial_pool(struct mddev *mddev,
+ struct md_rdev *rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
@@ -852,6 +849,33 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio
mddev->queue->limits.max_write_zeroes_sectors = 0;
}
+static inline int mddev_suspend_and_lock(struct mddev *mddev)
+{
+ int ret;
+
+ ret = mddev_suspend(mddev, true);
+ if (ret)
+ return ret;
+
+ ret = mddev_lock(mddev);
+ if (ret)
+ mddev_resume(mddev);
+
+ return ret;
+}
+
+static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev)
+{
+ mddev_suspend(mddev, false);
+ mutex_lock(&mddev->reconfig_mutex);
+}
+
+static inline void mddev_unlock_and_resume(struct mddev *mddev)
+{
+ mddev_unlock(mddev);
+ mddev_resume(mddev);
+}
+
struct mdu_array_info_s;
struct mdu_disk_info_s;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2aabac773fe7..35d12948e0a9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1345,6 +1345,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
int first_clone;
int max_sectors;
bool write_behind = false;
+ bool is_discard = (bio_op(bio) == REQ_OP_DISCARD);
if (mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE,
@@ -1405,7 +1406,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
* write-mostly, which means we could allocate write behind
* bio later.
*/
- if (rdev && test_bit(WriteMostly, &rdev->flags))
+ if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags))
write_behind = true;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -3122,8 +3123,7 @@ static int raid1_run(struct mddev *mddev)
mdname(mddev));
return -EIO;
}
- if (mddev_init_writes_pending(mddev) < 0)
- return -ENOMEM;
+
/*
* copy the already verified devices into our private RAID1
* bookkeeping area. [whatever we allocate in run(),
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 023413120851..a5927e98dc67 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4154,9 +4154,6 @@ static int raid10_run(struct mddev *mddev)
sector_t min_offset_diff = 0;
int first = 1;
- if (mddev_init_writes_pending(mddev) < 0)
- return -ENOMEM;
-
if (mddev->private == NULL) {
conf = setup_conf(mddev);
if (IS_ERR(conf))
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 518b7cfa78b9..6157f5beb9fe 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -327,8 +327,9 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
void r5c_check_stripe_cache_usage(struct r5conf *conf)
{
int total_cached;
+ struct r5l_log *log = READ_ONCE(conf->log);
- if (!r5c_is_writeback(conf->log))
+ if (!r5c_is_writeback(log))
return;
total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
@@ -344,7 +345,7 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf)
*/
if (total_cached > conf->min_nr_stripes * 1 / 2 ||
atomic_read(&conf->empty_inactive_list_nr) > 0)
- r5l_wake_reclaim(conf->log, 0);
+ r5l_wake_reclaim(log, 0);
}
/*
@@ -353,7 +354,9 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf)
*/
void r5c_check_cached_full_stripe(struct r5conf *conf)
{
- if (!r5c_is_writeback(conf->log))
+ struct r5l_log *log = READ_ONCE(conf->log);
+
+ if (!r5c_is_writeback(log))
return;
/*
@@ -363,7 +366,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
if (atomic_read(&conf->r5c_cached_full_stripes) >=
min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
- r5l_wake_reclaim(conf->log, 0);
+ r5l_wake_reclaim(log, 0);
}
/*
@@ -396,7 +399,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
*/
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
if (!r5c_is_writeback(log))
return 0;
@@ -449,7 +452,7 @@ static inline void r5c_update_log_state(struct r5l_log *log)
void r5c_make_stripe_write_out(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
BUG_ON(!r5c_is_writeback(log));
@@ -491,7 +494,7 @@ static void r5c_handle_parity_cached(struct stripe_head *sh)
*/
static void r5c_finish_cache_stripe(struct stripe_head *sh)
{
- struct r5l_log *log = sh->raid_conf->log;
+ struct r5l_log *log = READ_ONCE(sh->raid_conf->log);
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
@@ -683,7 +686,6 @@ static void r5c_disable_writeback_async(struct work_struct *work)
disable_writeback_work);
struct mddev *mddev = log->rdev->mddev;
struct r5conf *conf = mddev->private;
- int locked = 0;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return;
@@ -692,14 +694,14 @@ static void r5c_disable_writeback_async(struct work_struct *work)
/* wait superblock change before suspend */
wait_event(mddev->sb_wait,
- conf->log == NULL ||
- (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
- (locked = mddev_trylock(mddev))));
- if (locked) {
- mddev_suspend(mddev);
+ !READ_ONCE(conf->log) ||
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
+ log = READ_ONCE(conf->log);
+ if (log) {
+ mddev_suspend(mddev, false);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
mddev_resume(mddev);
- mddev_unlock(mddev);
}
}
@@ -1151,7 +1153,7 @@ static void r5l_run_no_space_stripes(struct r5l_log *log)
static sector_t r5c_calculate_new_cp(struct r5conf *conf)
{
struct stripe_head *sh;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
sector_t new_cp;
unsigned long flags;
@@ -1159,12 +1161,12 @@ static sector_t r5c_calculate_new_cp(struct r5conf *conf)
return log->next_checkpoint;
spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
- if (list_empty(&conf->log->stripe_in_journal_list)) {
+ if (list_empty(&log->stripe_in_journal_list)) {
/* all stripes flushed */
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
return log->next_checkpoint;
}
- sh = list_first_entry(&conf->log->stripe_in_journal_list,
+ sh = list_first_entry(&log->stripe_in_journal_list,
struct stripe_head, r5c);
new_cp = sh->log_start;
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
@@ -1399,7 +1401,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
struct stripe_head *sh, *next;
lockdep_assert_held(&conf->device_lock);
- if (!conf->log)
+ if (!READ_ONCE(conf->log))
return;
count = 0;
@@ -1420,7 +1422,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
static void r5c_do_reclaim(struct r5conf *conf)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
struct stripe_head *sh;
int count = 0;
unsigned long flags;
@@ -1549,7 +1551,7 @@ static void r5l_reclaim_thread(struct md_thread *thread)
{
struct mddev *mddev = thread->mddev;
struct r5conf *conf = mddev->private;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
if (!log)
return;
@@ -1591,7 +1593,7 @@ void r5l_quiesce(struct r5l_log *log, int quiesce)
bool r5l_log_disk_error(struct r5conf *conf)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
/* don't allow write if journal disk is missing */
if (!log)
@@ -2583,9 +2585,7 @@ int r5c_journal_mode_set(struct mddev *mddev, int mode)
mode == R5C_JOURNAL_MODE_WRITE_BACK)
return -EINVAL;
- mddev_suspend(mddev);
conf->log->r5c_journal_mode = mode;
- mddev_resume(mddev);
pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
mdname(mddev), mode, r5c_journal_mode_str[mode]);
@@ -2610,11 +2610,11 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
if (strlen(r5c_journal_mode_str[mode]) == len &&
!strncmp(page, r5c_journal_mode_str[mode], len))
break;
- ret = mddev_lock(mddev);
+ ret = mddev_suspend_and_lock(mddev);
if (ret)
return ret;
ret = r5c_journal_mode_set(mddev, mode);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return ret ?: length;
}
@@ -2635,7 +2635,7 @@ int r5c_try_caching_write(struct r5conf *conf,
struct stripe_head_state *s,
int disks)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
int i;
struct r5dev *dev;
int to_cache = 0;
@@ -2802,7 +2802,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
struct stripe_head *sh,
struct stripe_head_state *s)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
int i;
int do_wakeup = 0;
sector_t tree_index;
@@ -2941,7 +2941,7 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
/* check whether this big stripe is in write back cache. */
bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
sector_t tree_index;
void *slot;
@@ -3049,14 +3049,14 @@ int r5l_start(struct r5l_log *log)
void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
{
struct r5conf *conf = mddev->private;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
if (!log)
return;
if ((raid5_calc_degraded(conf) > 0 ||
test_bit(Journal, &rdev->flags)) &&
- conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
+ log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
schedule_work(&log->disable_writeback_work);
}
@@ -3145,7 +3145,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
spin_lock_init(&log->stripe_in_journal_lock);
atomic_set(&log->stripe_in_journal_count, 0);
- conf->log = log;
+ WRITE_ONCE(conf->log, log);
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0;
@@ -3173,7 +3173,7 @@ void r5l_exit_log(struct r5conf *conf)
* 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to
* ensure disable_writeback_work wakes up and exits.
*/
- conf->log = NULL;
+ WRITE_ONCE(conf->log, NULL);
wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4cb9c608ee19..d6de084a85e5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -70,6 +70,8 @@ MODULE_PARM_DESC(devices_handle_discard_safely,
"Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;
+static void raid5_quiesce(struct mddev *mddev, int quiesce);
+
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
@@ -2492,15 +2494,12 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
unsigned long cpu;
int err = 0;
- /*
- * Never shrink. And mddev_suspend() could deadlock if this is called
- * from raid5d. In that case, scribble_disks and scribble_sectors
- * should equal to new_disks and new_sectors
- */
+ /* Never shrink. */
if (conf->scribble_disks >= new_disks &&
conf->scribble_sectors >= new_sectors)
return 0;
- mddev_suspend(conf->mddev);
+
+ raid5_quiesce(conf->mddev, true);
cpus_read_lock();
for_each_present_cpu(cpu) {
@@ -2514,7 +2513,8 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
}
cpus_read_unlock();
- mddev_resume(conf->mddev);
+ raid5_quiesce(conf->mddev, false);
+
if (!err) {
conf->scribble_disks = new_disks;
conf->scribble_sectors = new_sectors;
@@ -7025,7 +7025,7 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
new != roundup_pow_of_two(new))
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
@@ -7049,7 +7049,6 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
goto out_unlock;
}
- mddev_suspend(mddev);
mutex_lock(&conf->cache_size_mutex);
size = conf->max_nr_stripes;
@@ -7064,10 +7063,9 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
err = -ENOMEM;
}
mutex_unlock(&conf->cache_size_mutex);
- mddev_resume(mddev);
out_unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -7153,7 +7151,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
return -EINVAL;
new = !!new;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
conf = mddev->private;
@@ -7162,15 +7160,13 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
else if (new != conf->skip_copy) {
struct request_queue *q = mddev->queue;
- mddev_suspend(mddev);
conf->skip_copy = new;
if (new)
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
else
blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
- mddev_resume(mddev);
}
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -7225,15 +7221,13 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
if (new > 8192)
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
conf = mddev->private;
if (!conf)
err = -ENODEV;
else if (new != conf->worker_cnt_per_group) {
- mddev_suspend(mddev);
-
old_groups = conf->worker_groups;
if (old_groups)
flush_workqueue(raid5_wq);
@@ -7250,9 +7244,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
kfree(old_groups[0].workers);
kfree(old_groups);
}
- mddev_resume(mddev);
}
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -7778,9 +7771,6 @@ static int raid5_run(struct mddev *mddev)
long long min_offset_diff = 0;
int first = 1;
- if (mddev_init_writes_pending(mddev) < 0)
- return -ENOMEM;
-
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
mdname(mddev));
@@ -8561,8 +8551,8 @@ static int raid5_start_reshape(struct mddev *mddev)
* the reshape wasn't running - like Discard or Read - have
* completed.
*/
- mddev_suspend(mddev);
- mddev_resume(mddev);
+ raid5_quiesce(mddev, true);
+ raid5_quiesce(mddev, false);
/* Add some new drives, as many as will fit.
* We know there are enough to make the newly sized array work.
@@ -8977,12 +8967,12 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
struct r5conf *conf;
int err;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
conf = mddev->private;
if (!conf) {
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return -ENODEV;
}
@@ -8992,19 +8982,14 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
err = log_init(conf, NULL, true);
if (!err) {
err = resize_stripes(conf, conf->pool_size);
- if (err) {
- mddev_suspend(mddev);
+ if (err)
log_exit(conf);
- mddev_resume(mddev);
- }
}
} else
err = -EINVAL;
} else if (strncmp(buf, "resync", 6) == 0) {
if (raid5_has_ppl(conf)) {
- mddev_suspend(mddev);
log_exit(conf);
- mddev_resume(mddev);
err = resize_stripes(conf, conf->pool_size);
} else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
r5l_log_disk_error(conf)) {
@@ -9017,11 +9002,9 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
break;
}
- if (!journal_dev_exists) {
- mddev_suspend(mddev);
+ if (!journal_dev_exists)
clear_bit(MD_HAS_JOURNAL, &mddev->flags);
- mddev_resume(mddev);
- } else /* need remove journal device first */
+ else /* need remove journal device first */
err = -EBUSY;
} else
err = -EINVAL;
@@ -9032,7 +9015,7 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
if (!err)
md_update_sb(mddev, 1);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err;
}
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 347cb5daebc3..60a08dfe8d75 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -924,7 +924,6 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
return false;
- req->mq_hctx->tags->rqs[req->tag] = req;
return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
}
diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h
index 2426276b9bd3..670f2dae692f 100644
--- a/include/linux/badblocks.h
+++ b/include/linux/badblocks.h
@@ -15,6 +15,7 @@
#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
+#define BB_END(x) (BB_OFFSET(x) + BB_LEN(x))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
/* Bad block numbers are stored sorted in a single page.
@@ -41,6 +42,12 @@ struct badblocks {
sector_t size; /* in sectors */
};
+struct badblocks_context {
+ sector_t start;
+ sector_t len;
+ int ack;
+};
+
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
sector_t *first_bad, int *bad_sectors);
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
@@ -63,4 +70,27 @@ static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb)
}
badblocks_exit(bb);
}
+
+static inline int badblocks_full(struct badblocks *bb)
+{
+ return (bb->count >= MAX_BADBLOCKS);
+}
+
+static inline int badblocks_empty(struct badblocks *bb)
+{
+ return (bb->count == 0);
+}
+
+static inline void set_changed(struct badblocks *bb)
+{
+ if (bb->changed != 1)
+ bb->changed = 1;
+}
+
+static inline void clear_changed(struct badblocks *bb)
+{
+ if (bb->changed != 0)
+ bb->changed = 0;
+}
+
#endif
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 958ed7e89b30..1ab3081c82ed 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -32,8 +32,6 @@ typedef __u32 __bitwise req_flags_t;
#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4))
/* merge of different types, fail separately */
#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5))
-/* track inflight for MQ */
-#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6))
/* don't call prep for this one */
#define RQF_DONTPREP ((__force req_flags_t)(1 << 7))
/* use hctx->sched_tags */