Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--  drivers/md/raid5.c | 932
1 file changed, 601 insertions, 331 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3c7e106c12a2..aeeb8d6854e2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -55,12 +55,16 @@
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <linux/flex_array.h>
+#include <linux/sched/signal.h>
+
#include <trace/events/block.h>
+#include <linux/list_sort.h>
#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"
+#include "raid5-log.h"
#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
@@ -99,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
int i;
- local_irq_disable();
- spin_lock(conf->hash_locks);
+ spin_lock_irq(conf->hash_locks);
for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
spin_lock(&conf->device_lock);
@@ -110,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
int i;
spin_unlock(&conf->device_lock);
- for (i = NR_STRIPE_HASH_LOCKS; i; i--)
- spin_unlock(conf->hash_locks + i - 1);
- local_irq_enable();
+ for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
+ spin_unlock(conf->hash_locks + i);
+ spin_unlock_irq(conf->hash_locks);
}
/* Find first data disk in a raid6 stripe */
@@ -154,17 +157,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
return slot;
}
-static void return_io(struct bio_list *return_bi)
-{
- struct bio *bi;
- while ((bi = bio_list_pop(return_bi)) != NULL) {
- bi->bi_iter.bi_size = 0;
- trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
- bi, 0);
- bio_endio(bi);
- }
-}
-
static void print_raid5_conf (struct r5conf *conf);
static int stripe_operations_active(struct stripe_head *sh)
@@ -174,6 +166,13 @@ static int stripe_operations_active(struct stripe_head *sh)
test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
+static bool stripe_is_lowprio(struct stripe_head *sh)
+{
+ return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
+ test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
+ !test_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
@@ -189,7 +188,10 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
if (list_empty(&sh->lru)) {
struct r5worker_group *group;
group = conf->worker_groups + cpu_to_group(cpu);
- list_add_tail(&sh->lru, &group->handle_list);
+ if (stripe_is_lowprio(sh))
+ list_add_tail(&sh->lru, &group->loprio_list);
+ else
+ list_add_tail(&sh->lru, &group->handle_list);
group->stripes_cnt++;
sh->group = group;
}
@@ -231,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
if (test_bit(R5_InJournal, &sh->dev[i].flags))
injournal++;
/*
- * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
- * data in journal, so they are not released to cached lists
+ * In the following cases, the stripe cannot be released to cached
+ * lists. Therefore, we make the stripe write out and set
+ * STRIPE_HANDLE:
+ * 1. when quiesce in r5c write back;
+ * 2. when resync is requested for the stripe.
*/
- if (conf->quiesce && r5c_is_writeback(conf->log) &&
- !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
+ (conf->quiesce && r5c_is_writeback(conf->log) &&
+ !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
if (test_bit(STRIPE_R5C_CACHING, &sh->state))
r5c_make_stripe_write_out(sh);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -252,7 +258,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
clear_bit(STRIPE_DELAYED, &sh->state);
clear_bit(STRIPE_BIT_DELAY, &sh->state);
if (conf->worker_cnt_per_group == 0) {
- list_add_tail(&sh->lru, &conf->handle_list);
+ if (stripe_is_lowprio(sh))
+ list_add_tail(&sh->lru,
+ &conf->loprio_list);
+ else
+ list_add_tail(&sh->lru,
+ &conf->handle_list);
} else {
raid5_wakeup_stripe_thread(sh);
return;
@@ -281,13 +292,13 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
atomic_dec(&conf->r5c_cached_partial_stripes);
list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
r5c_check_cached_full_stripe(conf);
- } else {
- /* partial stripe */
- if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
- &sh->state))
- atomic_inc(&conf->r5c_cached_partial_stripes);
+ } else
+ /*
+ * STRIPE_R5C_PARTIAL_STRIPE is set in
+ * r5c_try_caching_write(). No need to
+ * set it again.
+ */
list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
- }
}
}
}
@@ -353,17 +364,15 @@ static void release_inactive_stripe_list(struct r5conf *conf,
static int release_stripe_list(struct r5conf *conf,
struct list_head *temp_inactive_list)
{
- struct stripe_head *sh;
+ struct stripe_head *sh, *t;
int count = 0;
struct llist_node *head;
head = llist_del_all(&conf->released_stripes);
head = llist_reverse_order(head);
- while (head) {
+ llist_for_each_entry_safe(sh, t, head, release_list) {
int hash;
- sh = llist_entry(head, struct stripe_head, release_list);
- head = llist_next(head);
/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
smp_mb();
clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
@@ -481,6 +490,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
sh->dev[i].page = page;
sh->dev[i].orig_page = page;
}
+
return 0;
}
@@ -707,12 +717,11 @@ static bool is_full_stripe_write(struct stripe_head *sh)
static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
- local_irq_disable();
if (sh1 > sh2) {
- spin_lock(&sh2->stripe_lock);
+ spin_lock_irq(&sh2->stripe_lock);
spin_lock_nested(&sh1->stripe_lock, 1);
} else {
- spin_lock(&sh1->stripe_lock);
+ spin_lock_irq(&sh1->stripe_lock);
spin_lock_nested(&sh2->stripe_lock, 1);
}
}
@@ -720,8 +729,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
spin_unlock(&sh1->stripe_lock);
- spin_unlock(&sh2->stripe_lock);
- local_irq_enable();
+ spin_unlock_irq(&sh2->stripe_lock);
}
/* Only freshly new full stripe normal write stripe can be added to a batch list */
@@ -729,7 +737,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -863,6 +871,109 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
return 1;
}
+static void dispatch_bio_list(struct bio_list *tmp)
+{
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(tmp)))
+ generic_make_request(bio);
+}
+
+static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
+{
+ const struct r5pending_data *da = list_entry(a,
+ struct r5pending_data, sibling);
+ const struct r5pending_data *db = list_entry(b,
+ struct r5pending_data, sibling);
+ if (da->sector > db->sector)
+ return 1;
+ if (da->sector < db->sector)
+ return -1;
+ return 0;
+}
+
+static void dispatch_defer_bios(struct r5conf *conf, int target,
+ struct bio_list *list)
+{
+ struct r5pending_data *data;
+ struct list_head *first, *next = NULL;
+ int cnt = 0;
+
+ if (conf->pending_data_cnt == 0)
+ return;
+
+ list_sort(NULL, &conf->pending_list, cmp_stripe);
+
+ first = conf->pending_list.next;
+
+ /* temporarily move the head */
+ if (conf->next_pending_data)
+ list_move_tail(&conf->pending_list,
+ &conf->next_pending_data->sibling);
+
+ while (!list_empty(&conf->pending_list)) {
+ data = list_first_entry(&conf->pending_list,
+ struct r5pending_data, sibling);
+ if (&data->sibling == first)
+ first = data->sibling.next;
+ next = data->sibling.next;
+
+ bio_list_merge(list, &data->bios);
+ list_move(&data->sibling, &conf->free_list);
+ cnt++;
+ if (cnt >= target)
+ break;
+ }
+ conf->pending_data_cnt -= cnt;
+ BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
+
+ if (next != &conf->pending_list)
+ conf->next_pending_data = list_entry(next,
+ struct r5pending_data, sibling);
+ else
+ conf->next_pending_data = NULL;
+ /* list isn't empty */
+ if (first != &conf->pending_list)
+ list_move_tail(&conf->pending_list, first);
+}
+
+static void flush_deferred_bios(struct r5conf *conf)
+{
+ struct bio_list tmp = BIO_EMPTY_LIST;
+
+ if (conf->pending_data_cnt == 0)
+ return;
+
+ spin_lock(&conf->pending_bios_lock);
+ dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
+ BUG_ON(conf->pending_data_cnt != 0);
+ spin_unlock(&conf->pending_bios_lock);
+
+ dispatch_bio_list(&tmp);
+}
+
+static void defer_issue_bios(struct r5conf *conf, sector_t sector,
+ struct bio_list *bios)
+{
+ struct bio_list tmp = BIO_EMPTY_LIST;
+ struct r5pending_data *ent;
+
+ spin_lock(&conf->pending_bios_lock);
+ ent = list_first_entry(&conf->free_list, struct r5pending_data,
+ sibling);
+ list_move_tail(&ent->sibling, &conf->pending_list);
+ ent->sector = sector;
+ bio_list_init(&ent->bios);
+ bio_list_merge(&ent->bios, bios);
+ conf->pending_data_cnt++;
+ if (conf->pending_data_cnt >= PENDING_IO_MAX)
+ dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
+
+ spin_unlock(&conf->pending_bios_lock);
+
+ dispatch_bio_list(&tmp);
+}
+
static void
raid5_end_read_request(struct bio *bi);
static void
@@ -873,21 +984,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
struct r5conf *conf = sh->raid_conf;
int i, disks = sh->disks;
struct stripe_head *head_sh = sh;
+ struct bio_list pending_bios = BIO_EMPTY_LIST;
+ bool should_defer;
might_sleep();
- if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
- /* writing out phase */
- if (s->waiting_extra_page)
- return;
- if (r5l_write_stripe(conf->log, sh) == 0)
- return;
- } else { /* caching phase */
- if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
- r5c_cache_data(conf->log, sh, s);
- return;
- }
- }
+ if (log_stripe(sh, s) == 0)
+ return;
+
+ should_defer = conf->batch_bio_dispatch && conf->group_cnt;
for (i = disks; i--; ) {
int op, op_flags = 0;
@@ -1043,7 +1148,10 @@ again:
trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
bi, disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
- generic_make_request(bi);
+ if (should_defer && op_is_write(op))
+ bio_list_add(&pending_bios, bi);
+ else
+ generic_make_request(bi);
}
if (rrdev) {
if (s->syncing || s->expanding || s->expanded
@@ -1088,7 +1196,10 @@ again:
trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
rbi, disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
- generic_make_request(rbi);
+ if (should_defer && op_is_write(op))
+ bio_list_add(&pending_bios, rbi);
+ else
+ generic_make_request(rbi);
}
if (!rdev && !rrdev) {
if (op_is_write(op))
@@ -1106,6 +1217,9 @@ again:
if (sh != head_sh)
goto again;
}
+
+ if (should_defer && !bio_list_empty(&pending_bios))
+ defer_issue_bios(conf, head_sh->sector, &pending_bios);
}
static struct dma_async_tx_descriptor *
@@ -1175,7 +1289,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
- struct bio_list return_bi = BIO_EMPTY_LIST;
int i;
pr_debug("%s: stripe %llu\n", __func__,
@@ -1199,16 +1312,13 @@ static void ops_complete_biofill(void *stripe_head_ref)
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(rbi))
- bio_list_add(&return_bi, rbi);
+ bio_endio(rbi);
rbi = rbi2;
}
}
}
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
- return_io(&return_bi);
-
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
}
@@ -1364,7 +1474,8 @@ static int set_syndrome_sources(struct page **srcs,
(test_bit(R5_Wantdrain, &dev->flags) ||
test_bit(R5_InJournal, &dev->flags))) ||
(srctype == SYNDROME_SRC_WRITTEN &&
- dev->written)) {
+ (dev->written ||
+ test_bit(R5_InJournal, &dev->flags)))) {
if (test_bit(R5_InJournal, &dev->flags))
srcs[slot] = sh->dev[i].orig_page;
else
@@ -1976,6 +2087,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
tx = ops_run_prexor6(sh, percpu, tx);
}
+ if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+ tx = ops_run_partial_parity(sh, percpu, tx);
+
if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
tx = ops_run_biodrain(sh, tx);
overlap_clear++;
@@ -2008,8 +2122,15 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
put_cpu();
}
+static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
+{
+ if (sh->ppl_page)
+ __free_page(sh->ppl_page);
+ kmem_cache_free(sc, sh);
+}
+
static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
- int disks)
+ int disks, struct r5conf *conf)
{
struct stripe_head *sh;
int i;
@@ -2023,6 +2144,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
INIT_LIST_HEAD(&sh->r5c);
INIT_LIST_HEAD(&sh->log_list);
atomic_set(&sh->count, 1);
+ sh->raid_conf = conf;
sh->log_start = MaxSector;
for (i = 0; i < disks; i++) {
struct r5dev *dev = &sh->dev[i];
@@ -2030,6 +2152,14 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
bio_init(&dev->req, &dev->vec, 1);
bio_init(&dev->rreq, &dev->rvec, 1);
}
+
+ if (raid5_has_ppl(conf)) {
+ sh->ppl_page = alloc_page(gfp);
+ if (!sh->ppl_page) {
+ free_stripe(sc, sh);
+ sh = NULL;
+ }
+ }
}
return sh;
}
@@ -2037,15 +2167,13 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
{
struct stripe_head *sh;
- sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size);
+ sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
if (!sh)
return 0;
- sh->raid_conf = conf;
-
if (grow_buffers(sh, gfp)) {
shrink_buffers(sh);
- kmem_cache_free(conf->slab_cache, sh);
+ free_stripe(conf->slab_cache, sh);
return 0;
}
sh->hash_lock_index =
@@ -2172,7 +2300,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
* pages have been transferred over, and the old kmem_cache is
* freed when all stripes are done.
* 3/ reallocate conf->disks to be suitable bigger. If this fails,
- * we simple return a failre status - no need to clean anything up.
+ * we simply return a failure status - no need to clean anything up.
* 4/ allocate new pages for the new slots in the new stripe_heads.
* If this fails, we don't bother trying the shrink the
* stripe_heads down again, we just leave them as they are.
@@ -2185,17 +2313,12 @@ static int resize_stripes(struct r5conf *conf, int newsize)
struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes);
struct disk_info *ndisks;
- int err;
+ int err = 0;
struct kmem_cache *sc;
int i;
int hash, cnt;
- if (newsize <= conf->pool_size)
- return 0; /* never bother to shrink */
-
- err = md_allow_write(conf->mddev);
- if (err)
- return err;
+ md_allow_write(conf->mddev);
/* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -2208,11 +2331,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
mutex_lock(&conf->cache_size_mutex);
for (i = conf->max_nr_stripes; i; i--) {
- nsh = alloc_stripe(sc, GFP_KERNEL, newsize);
+ nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
if (!nsh)
break;
- nsh->raid_conf = conf;
list_add(&nsh->lru, &newstripes);
}
if (i) {
@@ -2220,7 +2342,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
while (!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
list_del(&nsh->lru);
- kmem_cache_free(sc, nsh);
+ free_stripe(sc, nsh);
}
kmem_cache_destroy(sc);
mutex_unlock(&conf->cache_size_mutex);
@@ -2246,7 +2368,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
nsh->dev[i].orig_page = osh->dev[i].page;
}
nsh->hash_lock_index = hash;
- kmem_cache_free(conf->slab_cache, osh);
+ free_stripe(conf->slab_cache, osh);
cnt++;
if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
!!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
@@ -2285,6 +2407,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
err = -ENOMEM;
mutex_unlock(&conf->cache_size_mutex);
+
+ conf->slab_cache = sc;
+ conf->active_name = 1-conf->active_name;
+
/* Step 4, return new stripes to service */
while(!list_empty(&newstripes)) {
nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2302,8 +2428,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
}
/* critical section pass, GFP_NOIO no longer needed */
- conf->slab_cache = sc;
- conf->active_name = 1-conf->active_name;
if (!err)
conf->pool_size = newsize;
return err;
@@ -2321,7 +2445,7 @@ static int drop_one_stripe(struct r5conf *conf)
return 0;
BUG_ON(atomic_read(&sh->count));
shrink_buffers(sh);
- kmem_cache_free(conf->slab_cache, sh);
+ free_stripe(conf->slab_cache, sh);
atomic_dec(&conf->active_stripes);
conf->max_nr_stripes--;
return 1;
@@ -2352,7 +2476,7 @@ static void raid5_end_read_request(struct bio * bi)
pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
- bi->bi_error);
+ bi->bi_status);
if (i == disks) {
bio_reset(bi);
BUG();
@@ -2372,7 +2496,7 @@ static void raid5_end_read_request(struct bio * bi)
s = sh->sector + rdev->new_data_offset;
else
s = sh->sector + rdev->data_offset;
- if (!bi->bi_error) {
+ if (!bi->bi_status) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
/* Note that this cannot happen on a
@@ -2489,7 +2613,7 @@ static void raid5_end_write_request(struct bio *bi)
}
pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
- bi->bi_error);
+ bi->bi_status);
if (i == disks) {
bio_reset(bi);
BUG();
@@ -2497,14 +2621,14 @@ static void raid5_end_write_request(struct bio *bi)
}
if (replacement) {
- if (bi->bi_error)
+ if (bi->bi_status)
md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector,
STRIPE_SECTORS,
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
- if (bi->bi_error) {
+ if (bi->bi_status) {
set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
@@ -2525,7 +2649,7 @@ static void raid5_end_write_request(struct bio *bi)
}
rdev_dec_pending(rdev, conf->mddev);
- if (sh->batch_head && bi->bi_error && !replacement)
+ if (sh->batch_head && bi->bi_status && !replacement)
set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
bio_reset(bi);
@@ -2569,7 +2693,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
bdevname(rdev->bdev, b),
mdname(mddev),
conf->raid_disks - mddev->degraded);
- r5c_update_on_rdev_error(mddev);
+ r5c_update_on_rdev_error(mddev, rdev);
}
/*
@@ -2914,12 +3038,44 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
* like to flush data in journal to RAID disks first, so complex rmw
* is handled in the write patch (handle_stripe_dirtying).
*
+ * 2. when journal space is critical (R5C_LOG_CRITICAL=1)
+ *
+ * It is important to be able to flush all stripes in raid5-cache.
+ * Therefore, we need to reserve some space on the journal device for
+ * these flushes. If the flush operation includes pending writes to the
+ * stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
+ * for the flush out. If we exclude these pending writes from flush
+ * operation, we only need (conf->max_degraded + 1) pages per stripe.
+ * Therefore, excluding pending writes in these cases enables more
+ * efficient use of the journal device.
+ *
+ * Note: To make sure the stripe makes progress, we only delay
+ * towrite for stripes with data already in journal (injournal > 0).
+ * When LOG_CRITICAL, stripes with injournal == 0 will be sent to
+ * no_space_stripes list.
+ *
+ * 3. during journal failure
+ * In journal failure, we try to flush all cached data to raid disks
+ * based on data in stripe cache. The array is read-only to upper
+ * layers, so we would skip all pending writes.
+ *
*/
-static inline bool delay_towrite(struct r5dev *dev,
- struct stripe_head_state *s)
-{
- return !test_bit(R5_OVERWRITE, &dev->flags) &&
- !test_bit(R5_Insync, &dev->flags) && s->injournal;
+static inline bool delay_towrite(struct r5conf *conf,
+ struct r5dev *dev,
+ struct stripe_head_state *s)
+{
+ /* case 1 above */
+ if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+ !test_bit(R5_Insync, &dev->flags) && s->injournal)
+ return true;
+ /* case 2 above */
+ if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
+ s->injournal > 0)
+ return true;
+ /* case 3 above */
+ if (s->log_failed && s->injournal)
+ return true;
+ return false;
}
static void
@@ -2942,7 +3098,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (dev->towrite && !delay_towrite(dev, s)) {
+ if (dev->towrite && !delay_towrite(conf, dev, s)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantdrain, &dev->flags);
if (!expand)
@@ -3020,6 +3176,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
+ if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
+ test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+ !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+ test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+ set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
__func__, (unsigned long long)sh->sector,
s->locked, s->ops_request);
@@ -3041,14 +3203,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
(unsigned long long)bi->bi_iter.bi_sector,
(unsigned long long)sh->sector);
- /*
- * If several bio share a stripe. The bio bi_phys_segments acts as a
- * reference count to avoid race. The reference count should already be
- * increased before this function is called (for example, in
- * raid5_make_request()), so other bio sharing this stripe will not free the
- * stripe. If a stripe is owned by one stripe, the stripe lock will
- * protect it.
- */
spin_lock_irq(&sh->stripe_lock);
/* Don't allow new IO added to stripes in batch list */
if (sh->batch_head)
@@ -3067,6 +3221,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
+ if (forwrite && raid5_has_ppl(conf)) {
+ /*
+ * With PPL only writes to consecutive data chunks within a
+ * stripe are allowed because for a single stripe_head we can
+ * only have one PPL entry at a time, which describes one data
+ * range. Not really an overlap, but wait_for_overlap can be
+ * used to handle this.
+ */
+ sector_t sector;
+ sector_t first = 0;
+ sector_t last = 0;
+ int count = 0;
+ int i;
+
+ for (i = 0; i < sh->disks; i++) {
+ if (i != sh->pd_idx &&
+ (i == dd_idx || sh->dev[i].towrite)) {
+ sector = sh->dev[i].sector;
+ if (count == 0 || sector < first)
+ first = sector;
+ if (sector > last)
+ last = sector;
+ count++;
+ }
+ }
+
+ if (first + conf->chunk_sectors * (count - 1) != last)
+ goto overlap;
+ }
+
if (!forwrite || previous)
clear_bit(STRIPE_BATCH_READY, &sh->state);
@@ -3074,7 +3258,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip)
bi->bi_next = *bip;
*bip = bi;
- raid5_inc_bi_active_stripes(bi);
+ bio_inc_remaining(bi);
+ md_write_inc(conf->mddev, bi);
if (forwrite) {
/* check if page is covered */
@@ -3151,8 +3336,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
static void
handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
- struct stripe_head_state *s, int disks,
- struct bio_list *return_bi)
+ struct stripe_head_state *s, int disks)
{
int i;
BUG_ON(sh->batch_head);
@@ -3188,7 +3372,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
bitmap_end = 1;
- r5l_stripe_write_finished(sh);
+ log_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
@@ -3197,11 +3381,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
- bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, bi);
- }
+ bi->bi_status = BLK_STS_IOERR;
+ md_write_end(conf->mddev);
+ bio_endio(bi);
bi = nextbi;
}
if (bitmap_end)
@@ -3221,11 +3403,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
- bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, bi);
- }
+ bi->bi_status = BLK_STS_IOERR;
+ md_write_end(conf->mddev);
+ bio_endio(bi);
bi = bi2;
}
@@ -3249,9 +3429,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct bio *nextbi =
r5_next_bio(bi, sh->dev[i].sector);
- bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi))
- bio_list_add(return_bi, bi);
+ bi->bi_status = BLK_STS_IOERR;
+ bio_endio(bi);
bi = nextbi;
}
}
@@ -3387,7 +3566,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
/* Pre-reads at not permitted until after short delay
* to gather multiple requests. However if this
- * device is no Insync, the block could only be be computed
+ * device is not Insync, the block could only be computed
* and there is no need to delay that.
*/
return 0;
@@ -3406,7 +3585,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
/* If we are forced to do a reconstruct-write, either because
* the current RAID6 implementation only supports that, or
- * or because parity cannot be trusted and we are currently
+ * because parity cannot be trusted and we are currently
* recovering it, there is extra need to be careful.
* If one of the devices that we would need to read, because
* it is not being overwritten (and maybe not written at all)
@@ -3446,9 +3625,20 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
BUG_ON(test_bit(R5_Wantread, &dev->flags));
BUG_ON(sh->batch_head);
+
+ /*
+ * In the raid6 case if the only non-uptodate disk is P
+ * then we already trusted P to compute the other failed
+ * drives. It is safe to compute rather than re-read P.
+ * In other cases we only compute blocks from failed
+ * devices, otherwise check/repair might fail to detect
+ * a real inconsistency.
+ */
+
if ((s->uptodate == disks - 1) &&
+ ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
(s->failed && (disk_idx == s->failed_num[0] ||
- disk_idx == s->failed_num[1]))) {
+ disk_idx == s->failed_num[1])))) {
/* have disk failed, and we're requested to fetch it;
* do compute it
*/
@@ -3550,7 +3740,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
* never LOCKED, so we don't need to test 'failed' directly.
*/
static void handle_stripe_clean_event(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio_list *return_bi)
+ struct stripe_head *sh, int disks)
{
int i;
struct r5dev *dev;
@@ -3582,10 +3772,8 @@ returnbi:
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(wbi)) {
- md_write_end(conf->mddev);
- bio_list_add(return_bi, wbi);
- }
+ md_write_end(conf->mddev);
+ bio_endio(wbi);
wbi = wbi2;
}
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -3607,7 +3795,7 @@ returnbi:
discard_pending = 1;
}
- r5l_stripe_write_finished(sh);
+ log_stripe_write_finished(sh);
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -3694,7 +3882,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
- if (((dev->towrite && !delay_towrite(dev, s)) ||
+ if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
i == sh->pd_idx || i == sh->qd_idx ||
test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
@@ -3718,8 +3906,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
}
}
- pr_debug("for sector %llu, rmw=%d rcw=%d\n",
- (unsigned long long)sh->sector, rmw, rcw);
+ pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
+ (unsigned long long)sh->sector, sh->state, rmw, rcw);
set_bit(STRIPE_HANDLE, &sh->state);
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
@@ -3759,7 +3947,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (((dev->towrite && !delay_towrite(dev, s)) ||
+ if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
i == sh->pd_idx || i == sh->qd_idx ||
test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
@@ -3897,10 +4085,15 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
set_bit(STRIPE_INSYNC, &sh->state);
else {
atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
- if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+ if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
- else {
+ pr_warn_ratelimited("%s: mismatch sector in range "
+ "%llu-%llu\n", mdname(conf->mddev),
+ (unsigned long long) sh->sector,
+ (unsigned long long) sh->sector +
+ STRIPE_SECTORS);
+ } else {
sh->check_state = check_state_compute_run;
set_bit(STRIPE_COMPUTE_RUN, &sh->state);
set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
@@ -4049,10 +4242,15 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
}
} else {
atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
- if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+ if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
- else {
+ pr_warn_ratelimited("%s: mismatch sector in range "
+ "%llu-%llu\n", mdname(conf->mddev),
+ (unsigned long long) sh->sector,
+ (unsigned long long) sh->sector +
+ STRIPE_SECTORS);
+ } else {
int *target = &sh->ops.target;
sh->ops.target = -1;
@@ -4472,8 +4670,13 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
spin_lock(&sh->stripe_lock);
- /* Cannot process 'sync' concurrently with 'discard' */
- if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+ /*
+ * Cannot process 'sync' concurrently with 'discard'.
+ * Flush data in r5cache before 'sync'.
+ */
+ if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+ !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
+ !test_bit(STRIPE_DISCARD, &sh->state) &&
test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state);
@@ -4494,7 +4697,8 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
goto finish;
- if (s.handle_bad_blocks) {
+ if (s.handle_bad_blocks ||
+ test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
}
@@ -4519,15 +4723,20 @@ static void handle_stripe(struct stripe_head *sh)
" to_write=%d failed=%d failed_num=%d,%d\n",
s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
s.failed_num[0], s.failed_num[1]);
- /* check if the array has lost more than max_degraded devices and,
+ /*
+ * check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed.
+ *
+ * When journal device failed (log_failed), we will only process
+ * the stripe if there is data that needs to be written to raid disks
*/
- if (s.failed > conf->max_degraded || s.log_failed) {
+ if (s.failed > conf->max_degraded ||
+ (s.log_failed && s.injournal == 0)) {
sh->check_state = 0;
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
if (s.to_read+s.to_write+s.written)
- handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+ handle_failed_stripe(conf, sh, &s, disks);
if (s.syncing + s.replacing)
handle_failed_sync(conf, sh, &s);
}
@@ -4593,11 +4802,11 @@ static void handle_stripe(struct stripe_head *sh)
&& !test_bit(R5_LOCKED, &qdev->flags)
&& (test_bit(R5_UPTODATE, &qdev->flags) ||
test_bit(R5_Discard, &qdev->flags))))))
- handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+ handle_stripe_clean_event(conf, sh, disks);
if (s.just_cached)
- r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
- r5l_stripe_write_finished(sh);
+ r5c_handle_cached_data_endio(conf, sh, disks);
+ log_stripe_write_finished(sh);
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
@@ -4824,16 +5033,6 @@ finish:
md_wakeup_thread(conf->mddev->thread);
}
- if (!bio_list_empty(&s.return_bi)) {
- if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
- spin_lock_irq(&conf->device_lock);
- bio_list_merge(&conf->return_bi, &s.return_bi);
- spin_unlock_irq(&conf->device_lock);
- md_wakeup_thread(conf->mddev->thread);
- } else
- return_io(&s.return_bi);
- }
-
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}
@@ -4922,12 +5121,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
md_wakeup_thread(conf->mddev->thread);
}
-static struct bio *remove_bio_from_retry(struct r5conf *conf)
+static struct bio *remove_bio_from_retry(struct r5conf *conf,
+ unsigned int *offset)
{
struct bio *bi;
bi = conf->retry_read_aligned;
if (bi) {
+ *offset = conf->retry_read_offset;
conf->retry_read_aligned = NULL;
return bi;
}
@@ -4935,11 +5136,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
if(bi) {
conf->retry_read_aligned_list = bi->bi_next;
bi->bi_next = NULL;
- /*
- * this sets the active strip count to 1 and the processed
- * strip count to zero (upper 8 bits)
- */
- raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
+ *offset = 0;
}
return bi;
@@ -4957,7 +5154,7 @@ static void raid5_align_endio(struct bio *bi)
struct mddev *mddev;
struct r5conf *conf;
struct md_rdev *rdev;
- int error = bi->bi_error;
+ blk_status_t error = bi->bi_status;
bio_put(bi);
@@ -4969,8 +5166,6 @@ static void raid5_align_endio(struct bio *bi)
rdev_dec_pending(rdev, conf->mddev);
if (!error) {
- trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
- raid_bi, 0);
bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
@@ -4995,9 +5190,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
return 0;
}
/*
- * use bio_clone_mddev to make a copy of the bio
+ * use bio_clone_fast to make a copy of the bio
*/
- align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
+ align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set);
if (!align_bi)
return 0;
/*
@@ -5025,6 +5220,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
rdev->recovery_offset >= end_sector)))
rdev = NULL;
}
+
+ if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) {
+ rcu_read_unlock();
+ bio_put(align_bi);
+ return 0;
+ }
+
if (rdev) {
sector_t first_bad;
int bad_sectors;
@@ -5069,24 +5271,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
{
struct bio *split;
+ sector_t sector = raid_bio->bi_iter.bi_sector;
+ unsigned chunk_sects = mddev->chunk_sectors;
+ unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
- do {
- sector_t sector = raid_bio->bi_iter.bi_sector;
- unsigned chunk_sects = mddev->chunk_sectors;
- unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
-
- if (sectors < bio_sectors(raid_bio)) {
- split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
- bio_chain(split, raid_bio);
- } else
- split = raid_bio;
+ if (sectors < bio_sectors(raid_bio)) {
+ struct r5conf *conf = mddev->private;
+ split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
+ bio_chain(split, raid_bio);
+ generic_make_request(raid_bio);
+ raid_bio = split;
+ }
- if (!raid5_read_one_chunk(mddev, split)) {
- if (split != raid_bio)
- generic_make_request(raid_bio);
- return split;
- }
- } while (split != raid_bio);
+ if (!raid5_read_one_chunk(mddev, raid_bio))
+ return raid_bio;
return NULL;
}
@@ -5103,19 +5301,29 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
*/
static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
{
- struct stripe_head *sh = NULL, *tmp;
+ struct stripe_head *sh, *tmp;
struct list_head *handle_list = NULL;
- struct r5worker_group *wg = NULL;
+ struct r5worker_group *wg;
+ bool second_try = !r5c_is_writeback(conf->log) &&
+ !r5l_log_disk_error(conf);
+ bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
+ r5l_log_disk_error(conf);
+again:
+ wg = NULL;
+ sh = NULL;
if (conf->worker_cnt_per_group == 0) {
- handle_list = &conf->handle_list;
+ handle_list = try_loprio ? &conf->loprio_list :
+ &conf->handle_list;
} else if (group != ANY_GROUP) {
- handle_list = &conf->worker_groups[group].handle_list;
+ handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
+ &conf->worker_groups[group].handle_list;
wg = &conf->worker_groups[group];
} else {
int i;
for (i = 0; i < conf->group_cnt; i++) {
- handle_list = &conf->worker_groups[i].handle_list;
+ handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
+ &conf->worker_groups[i].handle_list;
wg = &conf->worker_groups[i];
if (!list_empty(handle_list))
break;
@@ -5166,8 +5374,13 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
wg = NULL;
}
- if (!sh)
- return NULL;
+ if (!sh) {
+ if (second_try)
+ return NULL;
+ second_try = true;
+ try_loprio = !try_loprio;
+ goto again;
+ }
if (wg) {
wg->stripes_cnt--;
@@ -5256,7 +5469,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
struct r5conf *conf = mddev->private;
sector_t logical_sector, last_sector;
struct stripe_head *sh;
- int remaining;
int stripe_sectors;
if (mddev->reshape_position != MaxSector)
@@ -5267,7 +5479,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
bi->bi_next = NULL;
- bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
stripe_sectors = conf->chunk_sectors *
(conf->raid_disks - conf->max_degraded);
@@ -5313,7 +5524,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
continue;
sh->dev[d].towrite = bi;
set_bit(R5_OVERWRITE, &sh->dev[d].flags);
- raid5_inc_bi_active_stripes(bi);
+ bio_inc_remaining(bi);
+ md_write_inc(mddev, bi);
sh->overwrite_disks++;
}
spin_unlock_irq(&sh->stripe_lock);
@@ -5336,14 +5548,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
release_stripe_plug(mddev, sh);
}
- remaining = raid5_dec_bi_active_stripes(bi);
- if (remaining == 0) {
- md_write_end(mddev);
- bio_endio(bi);
- }
+ bio_endio(bi);
}
-static void raid5_make_request(struct mddev *mddev, struct bio * bi)
+static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
{
struct r5conf *conf = mddev->private;
int dd_idx;
@@ -5351,7 +5559,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
sector_t logical_sector, last_sector;
struct stripe_head *sh;
const int rw = bio_data_dir(bi);
- int remaining;
DEFINE_WAIT(w);
bool do_prepare;
bool do_flush = false;
@@ -5360,10 +5567,10 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
int ret = r5l_handle_flush_request(conf->log, bi);
if (ret == 0)
- return;
+ return true;
if (ret == -ENODEV) {
md_flush_request(mddev, bi);
- return;
+ return true;
}
/* ret == -EAGAIN, fallback */
/*
@@ -5373,30 +5580,29 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = bi->bi_opf & REQ_PREFLUSH;
}
- md_write_start(mddev, bi);
-
+ if (!md_write_start(mddev, bi))
+ return false;
/*
* If array is degraded, better not do chunk aligned read because
* later we might have to read it again in order to reconstruct
* data on failed drives.
*/
if (rw == READ && mddev->degraded == 0 &&
- !r5c_is_writeback(conf->log) &&
mddev->reshape_position == MaxSector) {
bi = chunk_aligned_read(mddev, bi);
if (!bi)
- return;
+ return true;
}
if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
make_discard_request(mddev, bi);
- return;
+ md_write_end(mddev);
+ return true;
}
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
- bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5487,12 +5693,15 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
* userspace, we want an interruptible
* wait.
*/
- flush_signals(current);
prepare_to_wait(&conf->wait_for_overlap,
&w, TASK_INTERRUPTIBLE);
if (logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
+ sigset_t full, old;
+ sigfillset(&full);
+ sigprocmask(SIG_BLOCK, &full, &old);
schedule();
+ sigprocmask(SIG_SETMASK, &old, NULL);
do_prepare = true;
}
goto retry;
@@ -5525,22 +5734,16 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
release_stripe_plug(mddev, sh);
} else {
/* cannot get stripe for read-ahead, just give-up */
- bi->bi_error = -EIO;
+ bi->bi_status = BLK_STS_IOERR;
break;
}
}
finish_wait(&conf->wait_for_overlap, &w);
- remaining = raid5_dec_bi_active_stripes(bi);
- if (remaining == 0) {
-
- if ( rw == WRITE )
- md_write_end(mddev);
-
- trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
- bi, 0);
- bio_endio(bi);
- }
+ if (rw == WRITE)
+ md_write_end(mddev);
+ bio_endio(bi);
+ return true;
}
static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
@@ -5889,7 +6092,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return STRIPE_SECTORS;
}
-static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
+static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
+ unsigned int offset)
{
/* We may not be able to submit a whole bio at once as there
* may not be enough stripe_heads available.
@@ -5905,7 +6109,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
int dd_idx;
sector_t sector, logical_sector, last_sector;
int scnt = 0;
- int remaining;
int handled = 0;
logical_sector = raid_bio->bi_iter.bi_sector &
@@ -5919,7 +6122,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
sector += STRIPE_SECTORS,
scnt++) {
- if (scnt < raid5_bi_processed_stripes(raid_bio))
+ if (scnt < offset)
/* already done this stripe */
continue;
@@ -5927,15 +6130,15 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
if (!sh) {
/* failed to get a stripe - must wait */
- raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
+ conf->retry_read_offset = scnt;
return handled;
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
raid5_release_stripe(sh);
- raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
+ conf->retry_read_offset = scnt;
return handled;
}
@@ -5944,12 +6147,9 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
raid5_release_stripe(sh);
handled++;
}
- remaining = raid5_dec_bi_active_stripes(raid_bio);
- if (remaining == 0) {
- trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
- raid_bio, 0);
- bio_endio(raid_bio);
- }
+
+ bio_endio(raid_bio);
+
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
return handled;
@@ -5992,7 +6192,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]);
- r5l_write_stripe_run(conf->log);
+ log_write_stripe_run(conf);
cond_resched();
@@ -6009,6 +6209,7 @@ static void raid5_do_work(struct work_struct *work)
struct r5worker *worker = container_of(work, struct r5worker, work);
struct r5worker_group *group = worker->group;
struct r5conf *conf = group->conf;
+ struct mddev *mddev = conf->mddev;
int group_id = group - conf->worker_groups;
int handled;
struct blk_plug plug;
@@ -6029,6 +6230,9 @@ static void raid5_do_work(struct work_struct *work)
if (!batch_size && !released)
break;
handled += batch_size;
+ wait_event_lock_irq(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+ conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
@@ -6056,24 +6260,13 @@ static void raid5d(struct md_thread *thread)
md_check_recovery(mddev);
- if (!bio_list_empty(&conf->return_bi) &&
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
- struct bio_list tmp = BIO_EMPTY_LIST;
- spin_lock_irq(&conf->device_lock);
- if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
- bio_list_merge(&tmp, &conf->return_bi);
- bio_list_init(&conf->return_bi);
- }
- spin_unlock_irq(&conf->device_lock);
- return_io(&tmp);
- }
-
blk_start_plug(&plug);
handled = 0;
spin_lock_irq(&conf->device_lock);
while (1) {
struct bio *bio;
int batch_size, released;
+ unsigned int offset;
released = release_stripe_list(conf, conf->temp_inactive_list);
if (released)
@@ -6091,10 +6284,10 @@ static void raid5d(struct md_thread *thread)
}
raid5_activate_delayed(conf);
- while ((bio = remove_bio_from_retry(conf))) {
+ while ((bio = remove_bio_from_retry(conf, &offset))) {
int ok;
spin_unlock_irq(&conf->device_lock);
- ok = retry_aligned_read(conf, bio);
+ ok = retry_aligned_read(conf, bio, offset);
spin_lock_irq(&conf->device_lock);
if (!ok)
break;
@@ -6126,6 +6319,8 @@ static void raid5d(struct md_thread *thread)
mutex_unlock(&conf->cache_size_mutex);
}
+ flush_deferred_bios(conf);
+
r5l_flush_stripe_to_raid(conf->log);
async_tx_issue_pending_all();
@@ -6151,7 +6346,6 @@ int
raid5_set_cache_size(struct mddev *mddev, int size)
{
struct r5conf *conf = mddev->private;
- int err;
if (size <= 16 || size > 32768)
return -EINVAL;
@@ -6163,10 +6357,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
;
mutex_unlock(&conf->cache_size_mutex);
-
- err = md_allow_write(mddev);
- if (err)
- return err;
+ md_allow_write(mddev);
mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes)
@@ -6331,10 +6522,10 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
mddev_suspend(mddev);
conf->skip_copy = new;
if (new)
- mddev->queue->backing_dev_info.capabilities |=
+ mddev->queue->backing_dev_info->capabilities |=
BDI_CAP_STABLE_WRITES;
else
- mddev->queue->backing_dev_info.capabilities &=
+ mddev->queue->backing_dev_info->capabilities &=
~BDI_CAP_STABLE_WRITES;
mddev_resume(mddev);
}
@@ -6476,6 +6667,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
group = &(*worker_groups)[i];
INIT_LIST_HEAD(&group->handle_list);
+ INIT_LIST_HEAD(&group->loprio_list);
group->conf = conf;
group->workers = workers + i * cnt;
@@ -6566,8 +6758,8 @@ static void free_conf(struct r5conf *conf)
{
int i;
- if (conf->log)
- r5l_exit_log(conf->log);
+ log_exit(conf);
+
if (conf->shrinker.nr_deferred)
unregister_shrinker(&conf->shrinker);
@@ -6578,7 +6770,10 @@ static void free_conf(struct r5conf *conf)
if (conf->disks[i].extra_page)
put_page(conf->disks[i].extra_page);
kfree(conf->disks);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf->stripe_hashtbl);
+ kfree(conf->pending_data);
kfree(conf);
}
@@ -6688,6 +6883,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
if (conf == NULL)
goto abort;
+ INIT_LIST_HEAD(&conf->free_list);
+ INIT_LIST_HEAD(&conf->pending_list);
+ conf->pending_data = kzalloc(sizeof(struct r5pending_data) *
+ PENDING_IO_MAX, GFP_KERNEL);
+ if (!conf->pending_data)
+ goto abort;
+ for (i = 0; i < PENDING_IO_MAX; i++)
+ list_add(&conf->pending_data[i].sibling, &conf->free_list);
/* Don't enable multi-threading by default*/
if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
&new_group)) {
@@ -6703,14 +6906,25 @@ static struct r5conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
+ INIT_LIST_HEAD(&conf->loprio_list);
INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
- bio_list_init(&conf->return_bi);
init_llist_head(&conf->released_stripes);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
atomic_set(&conf->active_aligned_reads, 0);
+ spin_lock_init(&conf->pending_bios_lock);
+ conf->batch_bio_dispatch = true;
+ rdev_for_each(rdev, mddev) {
+ if (test_bit(Journal, &rdev->flags))
+ continue;
+ if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
+ conf->batch_bio_dispatch = false;
+ break;
+ }
+ }
+
conf->bypass_threshold = BYPASS_THRESHOLD;
conf->recovery_disabled = mddev->recovery_disabled - 1;
@@ -6733,6 +6947,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
}
+ conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
+ if (!conf->bio_split)
+ goto abort;
conf->mddev = mddev;
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -6757,6 +6974,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
atomic_set(&conf->r5c_cached_partial_stripes, 0);
INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+ atomic_set(&conf->r5c_flushing_full_stripes, 0);
+ atomic_set(&conf->r5c_flushing_partial_stripes, 0);
conf->level = mddev->new_level;
conf->chunk_sectors = mddev->new_chunk_sectors;
@@ -6903,6 +7122,9 @@ static int raid5_run(struct mddev *mddev)
long long min_offset_diff = 0;
int first = 1;
+ if (mddev_init_writes_pending(mddev) < 0)
+ return -ENOMEM;
+
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
mdname(mddev));
@@ -7015,6 +7237,13 @@ static int raid5_run(struct mddev *mddev)
BUG_ON(mddev->delta_disks != 0);
}
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+ test_bit(MD_HAS_PPL, &mddev->flags)) {
+ pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+ mdname(mddev));
+ clear_bit(MD_HAS_PPL, &mddev->flags);
+ }
+
if (mddev->private == NULL)
conf = setup_conf(mddev);
else
@@ -7106,7 +7335,10 @@ static int raid5_run(struct mddev *mddev)
if (mddev->degraded > dirty_parity_disks &&
mddev->recovery_cp != MaxSector) {
- if (mddev->ok_start_degraded)
+ if (test_bit(MD_HAS_PPL, &mddev->flags))
+ pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
+ mdname(mddev));
+ else if (mddev->ok_start_degraded)
pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
mdname(mddev));
else {
@@ -7145,7 +7377,6 @@ static int raid5_run(struct mddev *mddev)
if (mddev->queue) {
int chunk_size;
- bool discard_supported = true;
/* read-ahead size must cover two whole stripes, which
* is 2 * (datadisks) * chunksize where 'n' is the
* number of raid devices
@@ -7153,8 +7384,8 @@ static int raid5_run(struct mddev *mddev)
int data_disks = conf->previous_raid_disks - conf->max_degraded;
int stripe = data_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
- if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
- mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+ if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
+ mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
@@ -7173,56 +7404,32 @@ static int raid5_run(struct mddev *mddev)
mddev->queue->limits.discard_alignment = stripe;
mddev->queue->limits.discard_granularity = stripe;
- /*
- * We use 16-bit counter of active stripes in bi_phys_segments
- * (minus one for over-loaded initialization)
- */
- blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
- blk_queue_max_discard_sectors(mddev->queue,
- 0xfffe * STRIPE_SECTORS);
-
- /*
- * unaligned part of discard request will be ignored, so can't
- * guarantee discard_zeroes_data
- */
- mddev->queue->limits.discard_zeroes_data = 0;
-
blk_queue_max_write_same_sectors(mddev->queue, 0);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
- /*
- * discard_zeroes_data is required, otherwise data
- * could be lost. Consider a scenario: discard a stripe
- * (the stripe could be inconsistent if
- * discard_zeroes_data is 0); write one disk of the
- * stripe (the stripe could be inconsistent again
- * depending on which disks are used to calculate
- * parity); the disk is broken; The stripe data of this
- * disk is lost.
- */
- if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
- !bdev_get_queue(rdev->bdev)->
- limits.discard_zeroes_data)
- discard_supported = false;
- /* Unfortunately, discard_zeroes_data is not currently
- * a guarantee - just a hint. So we only allow DISCARD
- * if the sysadmin has confirmed that only safe devices
- * are in use by setting a module parameter.
- */
- if (!devices_handle_discard_safely) {
- if (discard_supported) {
- pr_info("md/raid456: discard support disabled due to uncertainty.\n");
- pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
- }
- discard_supported = false;
- }
}
- if (discard_supported &&
+ /*
+ * zeroing is required, otherwise data
+ * could be lost. Consider a scenario: discard a stripe
+ * (the stripe could be inconsistent if
+ * discard_zeroes_data is 0); write one disk of the
+ * stripe (the stripe could be inconsistent again
+ * depending on which disks are used to calculate
+ * parity); the disk is broken; The stripe data of this
+ * disk is lost.
+ *
+ * We only allow DISCARD if the sysadmin has confirmed that
+ * only safe devices are in use by setting a module parameter.
+ * A better idea might be to turn DISCARD into WRITE_ZEROES
+ * requests, as that is required to be safe.
+ */
+ if (devices_handle_discard_safely &&
mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
mddev->queue->limits.discard_granularity >= stripe)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
@@ -7234,14 +7441,8 @@ static int raid5_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
}
- if (journal_dev) {
- char b[BDEVNAME_SIZE];
-
- pr_debug("md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(journal_dev->bdev, b));
- if (r5l_init_log(conf, journal_dev))
- goto abort;
- }
+ if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
+ goto abort;
return 0;
abort:
@@ -7355,17 +7556,18 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags) && conf->log) {
- struct r5l_log *log;
/*
* we can't wait pending write here, as this is called in
* raid5d, wait will deadlock.
+ * neilb: there is no locking about new writes here,
+ * so this cannot be safe.
*/
- if (atomic_read(&mddev->writes_pending))
+ if (atomic_read(&conf->active_stripes) ||
+ atomic_read(&conf->r5c_cached_full_stripes) ||
+ atomic_read(&conf->r5c_cached_partial_stripes)) {
return -EBUSY;
- log = conf->log;
- conf->log = NULL;
- synchronize_rcu();
- r5l_exit_log(log);
+ }
+ log_exit(conf);
return 0;
}
if (rdev == p->rdev)
@@ -7404,6 +7606,11 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
*rdevp = rdev;
}
}
+ if (!err) {
+ err = log_modify(conf, rdev, false);
+ if (err)
+ goto abort;
+ }
if (p->replacement) {
/* We must have just cleared 'rdev' */
p->rdev = p->replacement;
@@ -7412,12 +7619,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
* but will never see neither - if they are careful
*/
p->replacement = NULL;
- clear_bit(WantReplacement, &rdev->flags);
- } else
- /* We might have just removed the Replacement as faulty-
- * clear the bit just in case
- */
- clear_bit(WantReplacement, &rdev->flags);
+
+ if (!err)
+ err = log_modify(conf, p->rdev, true);
+ }
+
+ clear_bit(WantReplacement, &rdev->flags);
abort:
print_raid5_conf(conf);
@@ -7434,7 +7641,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int last = conf->raid_disks - 1;
if (test_bit(Journal, &rdev->flags)) {
- char b[BDEVNAME_SIZE];
if (conf->log)
return -EBUSY;
@@ -7443,9 +7649,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* The array is in readonly mode if journal is missing, so no
* write requests running. We should be safe
*/
- r5l_init_log(conf, rdev);
- pr_debug("md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(rdev->bdev, b));
+ log_init(conf, rdev, false);
return 0;
}
if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7472,10 +7676,12 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (p->rdev == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->raid_disk = disk;
- err = 0;
if (rdev->saved_raid_disk != disk)
conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev);
+
+ err = log_modify(conf, rdev, true);
+
goto out;
}
}
@@ -7509,7 +7715,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
sector_t newsize;
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return -EINVAL;
sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7522,8 +7728,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
return ret;
}
md_set_array_sectors(mddev, newsize);
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
mddev->recovery_cp = mddev->dev_sectors;
@@ -7562,7 +7766,7 @@ static int check_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (conf->log || raid5_has_ppl(conf))
return -EINVAL;
if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout &&
@@ -7595,6 +7799,9 @@ static int check_reshape(struct mddev *mddev)
mddev->chunk_sectors)
) < 0)
return -ENOMEM;
+
+ if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
+ return 0; /* never bother to shrink */
return resize_stripes(conf, (conf->previous_raid_disks
+ mddev->delta_disks));
}
@@ -7744,12 +7951,10 @@ static void end_reshape(struct r5conf *conf)
{
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
- struct md_rdev *rdev;
spin_lock_irq(&conf->device_lock);
conf->previous_raid_disks = conf->raid_disks;
- rdev_for_each(rdev, conf->mddev)
- rdev->data_offset = rdev->new_data_offset;
+ md_finish_reshape(conf->mddev);
smp_wmb();
conf->reshape_progress = MaxSector;
conf->mddev->reshape_position = MaxSector;
@@ -7763,8 +7968,8 @@ static void end_reshape(struct r5conf *conf)
int data_disks = conf->raid_disks - conf->max_degraded;
int stripe = data_disks * ((conf->chunk_sectors << 9)
/ PAGE_SIZE);
- if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
- conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+ if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
+ conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
}
}
}
@@ -8085,6 +8290,68 @@ static void *raid6_takeover(struct mddev *mddev)
return setup_conf(mddev);
}
+static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
+{
+ struct r5conf *conf;
+ int err;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf) {
+ mddev_unlock(mddev);
+ return -ENODEV;
+ }
+
+ if (strncmp(buf, "ppl", 3) == 0) {
+ /* ppl only works with RAID 5 */
+ if (!raid5_has_ppl(conf) && conf->level == 5) {
+ err = log_init(conf, NULL, true);
+ if (!err) {
+ err = resize_stripes(conf, conf->pool_size);
+ if (err)
+ log_exit(conf);
+ }
+ } else
+ err = -EINVAL;
+ } else if (strncmp(buf, "resync", 6) == 0) {
+ if (raid5_has_ppl(conf)) {
+ mddev_suspend(mddev);
+ log_exit(conf);
+ mddev_resume(mddev);
+ err = resize_stripes(conf, conf->pool_size);
+ } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
+ r5l_log_disk_error(conf)) {
+ bool journal_dev_exists = false;
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ if (test_bit(Journal, &rdev->flags)) {
+ journal_dev_exists = true;
+ break;
+ }
+
+ if (!journal_dev_exists) {
+ mddev_suspend(mddev);
+ clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+ mddev_resume(mddev);
+ } else /* need to remove journal device first */
+ err = -EBUSY;
+ } else
+ err = -EINVAL;
+ } else {
+ err = -EINVAL;
+ }
+
+ if (!err)
+ md_update_sb(mddev, 1);
+
+ mddev_unlock(mddev);
+
+ return err;
+}
+
static struct md_personality raid6_personality =
{
.name = "raid6",
@@ -8107,6 +8374,7 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.congested = raid5_congested,
+ .change_consistency_policy = raid5_change_consistency_policy,
};
static struct md_personality raid5_personality =
{
@@ -8130,6 +8398,7 @@ static struct md_personality raid5_personality =
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.congested = raid5_congested,
+ .change_consistency_policy = raid5_change_consistency_policy,
};
static struct md_personality raid4_personality =
@@ -8154,6 +8423,7 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.congested = raid5_congested,
+ .change_consistency_policy = raid5_change_consistency_policy,
};
static int __init raid5_init(void)