aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/volumes.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r--fs/btrfs/volumes.c557
1 files changed, 425 insertions, 132 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 58b9c419a2b6..bc3b33efddc5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -31,6 +31,7 @@
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
+#include "zoned.h"
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
@@ -374,6 +375,7 @@ void btrfs_free_device(struct btrfs_device *device)
rcu_string_free(device->name);
extent_io_tree_release(&device->alloc_state);
bio_put(device->flush_bio);
+ btrfs_destroy_dev_zone_info(device);
kfree(device);
}
@@ -419,7 +421,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
* Preallocate a bio that's always going to be used for flushing device
* barriers and matches the device lifespan
*/
- dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
+ dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
if (!dev->flush_bio) {
kfree(dev);
return ERR_PTR(-ENOMEM);
@@ -822,7 +824,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
} else {
mutex_lock(&fs_devices->device_list_mutex);
device = btrfs_find_device(fs_devices, devid,
- disk_super->dev_item.uuid, NULL, false);
+ disk_super->dev_item.uuid, NULL);
/*
* If this disk has been pulled into an fs devices created by
@@ -929,25 +931,30 @@ static noinline struct btrfs_device *device_list_add(const char *path,
* make sure it's the same device if the device is mounted
*/
if (device->bdev) {
- struct block_device *path_bdev;
+ int error;
+ dev_t path_dev;
- path_bdev = lookup_bdev(path);
- if (IS_ERR(path_bdev)) {
+ error = lookup_bdev(path, &path_dev);
+ if (error) {
mutex_unlock(&fs_devices->device_list_mutex);
- return ERR_CAST(path_bdev);
+ return ERR_PTR(error);
}
- if (device->bdev != path_bdev) {
- bdput(path_bdev);
+ if (device->bdev->bd_dev != path_dev) {
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_warn_in_rcu(device->fs_info,
+ /*
+ * device->fs_info may not be reliable here, so
+ * pass in a NULL instead. This avoids a
+ * possible use-after-free when the fs_info and
+ * fs_info->sb are already torn down.
+ */
+ btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
current->comm,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- bdput(path_bdev);
btrfs_info_in_rcu(device->fs_info,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
@@ -1038,7 +1045,7 @@ error:
}
static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
- int step, struct btrfs_device **latest_dev)
+ struct btrfs_device **latest_dev)
{
struct btrfs_device *device, *next;
@@ -1056,22 +1063,13 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
continue;
}
- if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
- /*
- * In the first step, keep the device which has
- * the correct fsid and the devid that is used
- * for the dev_replace procedure.
- * In the second step, the dev_replace state is
- * read from the device tree and it is known
- * whether the procedure is really active or
- * not, which means whether this device is
- * used or whether it should be removed.
- */
- if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state)) {
- continue;
- }
- }
+ /*
+ * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
+ * in btrfs_init_dev_replace() so just continue.
+ */
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+ continue;
+
if (device->bdev) {
blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
@@ -1080,9 +1078,6 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state))
- fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
@@ -1095,16 +1090,16 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
* After we have read the system tree and know devids belonging to this
* filesystem, remove the device which does not belong there.
*/
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *latest_dev = NULL;
struct btrfs_fs_devices *seed_dev;
mutex_lock(&uuid_mutex);
- __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
+ __btrfs_free_extra_devids(fs_devices, &latest_dev);
list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
- __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
+ __btrfs_free_extra_devids(seed_dev, &latest_dev);
fs_devices->latest_bdev = latest_dev->bdev;
@@ -1143,6 +1138,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
device->bdev = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ btrfs_destroy_dev_zone_info(device);
device->fs_info = NULL;
atomic_set(&device->dev_stats_ccnt, 0);
@@ -1223,6 +1219,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+ fs_devices->read_policy = BTRFS_READ_POLICY_PID;
return 0;
}
@@ -1274,7 +1271,7 @@ void btrfs_release_disk_super(struct btrfs_super_block *super)
}
static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
- u64 bytenr)
+ u64 bytenr, u64 bytenr_orig)
{
struct btrfs_super_block *disk_super;
struct page *page;
@@ -1305,7 +1302,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
/* align our pointer to the offset of the super block */
disk_super = p + offset_in_page(bytenr);
- if (btrfs_super_bytenr(disk_super) != bytenr ||
+ if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
btrfs_release_disk_super(p);
return ERR_PTR(-EINVAL);
@@ -1340,7 +1337,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct block_device *bdev;
- u64 bytenr;
+ u64 bytenr, bytenr_orig;
+ int ret;
lockdep_assert_held(&uuid_mutex);
@@ -1350,14 +1348,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
* So, we need to add a special mount option to scan for
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
- bytenr = btrfs_sb_offset(0);
flags |= FMODE_EXCL;
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
- disk_super = btrfs_read_disk_super(bdev, bytenr);
+ bytenr_orig = btrfs_sb_offset(0);
+ ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
@@ -1412,11 +1414,62 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
* make sure to start at an offset of at least 1MB.
*/
return max_t(u64, start, SZ_1M);
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ /*
+ * We don't care about the starting region like regular
+ * allocator, because we anyway use/reserve the first two zones
+ * for superblock logging.
+ */
+ return ALIGN(start, device->zone_info->zone_size);
default:
BUG();
}
}
+static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
+ u64 *hole_start, u64 *hole_size,
+ u64 num_bytes)
+{
+ u64 zone_size = device->zone_info->zone_size;
+ u64 pos;
+ int ret;
+ bool changed = false;
+
+ ASSERT(IS_ALIGNED(*hole_start, zone_size));
+
+ while (*hole_size > 0) {
+ pos = btrfs_find_allocatable_zones(device, *hole_start,
+ *hole_start + *hole_size,
+ num_bytes);
+ if (pos != *hole_start) {
+ *hole_size = *hole_start + *hole_size - pos;
+ *hole_start = pos;
+ changed = true;
+ if (*hole_size < num_bytes)
+ break;
+ }
+
+ ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
+
+ /* Range is ensured to be empty */
+ if (!ret)
+ return changed;
+
+ /* Given hole range was invalid (outside of device) */
+ if (ret == -ERANGE) {
+ *hole_start += *hole_size;
+ *hole_size = 0;
+ return 1;
+ }
+
+ *hole_start += zone_size;
+ *hole_size -= zone_size;
+ changed = true;
+ }
+
+ return changed;
+}
+
/**
* dev_extent_hole_check - check if specified hole is suitable for allocation
* @device: the device which we have the hole
@@ -1424,7 +1477,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
* @hole_size: the size of the hole
* @num_bytes: the size of the free space that we need
*
- * This function may modify @hole_start and @hole_end to reflect the suitable
+ * This function may modify @hole_start and @hole_size to reflect the suitable
* position for allocation. Returns 1 if hole position is updated, 0 otherwise.
*/
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
@@ -1433,24 +1486,39 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
bool changed = false;
u64 hole_end = *hole_start + *hole_size;
- /*
- * Check before we set max_hole_start, otherwise we could end up
- * sending back this offset anyway.
- */
- if (contains_pending_extent(device, hole_start, *hole_size)) {
- if (hole_end >= *hole_start)
- *hole_size = hole_end - *hole_start;
- else
- *hole_size = 0;
- changed = true;
- }
+ for (;;) {
+ /*
+ * Check before we set max_hole_start, otherwise we could end up
+ * sending back this offset anyway.
+ */
+ if (contains_pending_extent(device, hole_start, *hole_size)) {
+ if (hole_end >= *hole_start)
+ *hole_size = hole_end - *hole_start;
+ else
+ *hole_size = 0;
+ changed = true;
+ }
+
+ switch (device->fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ /* No extra check */
+ break;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ if (dev_extent_hole_check_zoned(device, hole_start,
+ hole_size, num_bytes)) {
+ changed = true;
+ /*
+ * The changed hole can contain pending extent.
+ * Loop again to check that.
+ */
+ continue;
+ }
+ break;
+ default:
+ BUG();
+ }
- switch (device->fs_devices->chunk_alloc_policy) {
- case BTRFS_CHUNK_ALLOC_REGULAR:
- /* No extra check */
break;
- default:
- BUG();
}
return changed;
@@ -1503,6 +1571,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
search_start = dev_extent_search_start(device, search_start);
+ WARN_ON(device->zone_info &&
+ !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2021,6 +2092,11 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
if (IS_ERR(disk_super))
continue;
+ if (bdev_is_zoned(bdev)) {
+ btrfs_reset_sb_log_zones(bdev, copy_num);
+ continue;
+ }
+
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
page = virt_to_page(disk_super);
@@ -2099,6 +2175,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
+ if (!ret)
+ btrfs_reada_remove_dev(device);
mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
@@ -2179,6 +2257,7 @@ out:
return ret;
error_undo:
+ btrfs_reada_undo_remove_dev(device);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
@@ -2296,10 +2375,10 @@ static struct btrfs_device *btrfs_find_device_by_path(
dev_uuid = disk_super->dev_item.uuid;
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->metadata_uuid, true);
+ disk_super->metadata_uuid);
else
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->fsid, true);
+ disk_super->fsid);
btrfs_release_disk_super(disk_super);
if (!device)
@@ -2319,7 +2398,7 @@ struct btrfs_device *btrfs_find_device_by_devspec(
if (devid) {
device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
- NULL, true);
+ NULL);
if (!device)
return ERR_PTR(-ENOENT);
return device;
@@ -2468,7 +2547,7 @@ next_slot:
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid, true);
+ fs_uuid);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -2510,6 +2589,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (IS_ERR(bdev))
return PTR_ERR(bdev);
+ if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
if (fs_devices->seeding) {
seeding_dev = 1;
down_write(&sb->s_umount);
@@ -2543,10 +2627,17 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
rcu_assign_pointer(device->name, name);
+ device->fs_info = fs_info;
+ device->bdev = bdev;
+
+ ret = btrfs_get_dev_zone_info(device);
+ if (ret)
+ goto error_free_device;
+
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto error_free_device;
+ goto error_free_zone;
}
q = bdev_get_queue(bdev);
@@ -2559,8 +2650,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
- device->fs_info = fs_info;
- device->bdev = bdev;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
@@ -2568,7 +2657,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
- sb->s_flags &= ~SB_RDONLY;
+ btrfs_clear_sb_rdonly(sb);
ret = btrfs_prepare_sprout(fs_info);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -2704,9 +2793,11 @@ error_sysfs:
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
if (seeding_dev)
- sb->s_flags |= SB_RDONLY;
+ btrfs_set_sb_rdonly(sb);
if (trans)
btrfs_end_transaction(trans);
+error_free_zone:
+ btrfs_destroy_dev_zone_info(device);
error_free_device:
btrfs_free_device(device);
error:
@@ -4291,6 +4382,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
+ btrfs_release_path(path);
+
mutex_lock(&fs_info->balance_mutex);
BUG_ON(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
@@ -4640,11 +4733,10 @@ again:
}
ret = btrfs_previous_item(root, path, 0, key.type);
- if (ret)
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- if (ret < 0)
- goto done;
if (ret) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ if (ret < 0)
+ goto done;
ret = 0;
btrfs_release_path(path);
break;
@@ -4876,6 +4968,37 @@ static void init_alloc_chunk_ctl_policy_regular(
ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
}
+static void init_alloc_chunk_ctl_policy_zoned(
+ struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ u64 zone_size = fs_devices->fs_info->zone_size;
+ u64 limit;
+ int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
+ int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
+ u64 min_chunk_size = min_data_stripes * zone_size;
+ u64 type = ctl->type;
+
+ ctl->max_stripe_size = zone_size;
+ if (type & BTRFS_BLOCK_GROUP_DATA) {
+ ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
+ zone_size);
+ } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+ ctl->max_chunk_size = ctl->max_stripe_size;
+ } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ctl->max_chunk_size = 2 * ctl->max_stripe_size;
+ ctl->devs_max = min_t(int, ctl->devs_max,
+ BTRFS_MAX_DEVS_SYS_CHUNK);
+ }
+
+ /* We don't want a chunk larger than 10% of writable space */
+ limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
+ zone_size),
+ min_chunk_size);
+ ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
+ ctl->dev_extent_min = zone_size * ctl->dev_stripes;
+}
+
static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl)
{
@@ -4896,6 +5019,9 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
case BTRFS_CHUNK_ALLOC_REGULAR:
init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
break;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
+ break;
default:
BUG();
}
@@ -5022,6 +5148,38 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
return 0;
}
+static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ u64 zone_size = devices_info[0].dev->zone_info->zone_size;
+ /* Number of stripes that count for block group size */
+ int data_stripes;
+
+ /*
+ * It should hold because:
+ * dev_extent_min == dev_extent_want == zone_size * dev_stripes
+ */
+ ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
+
+ ctl->stripe_size = zone_size;
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+
+ /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
+ ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
+ ctl->stripe_size) + ctl->nparity,
+ ctl->dev_stripes);
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+ ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
+ }
+
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
+
+ return 0;
+}
+
static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
@@ -5049,6 +5207,8 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
switch (fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
return decide_stripe_size_regular(ctl, devices_info);
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ return decide_stripe_size_zoned(ctl, devices_info);
default:
BUG();
}
@@ -5482,7 +5642,18 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
else
num_stripes = map->num_stripes;
- preferred_mirror = first + current->pid % num_stripes;
+ switch (fs_info->fs_devices->read_policy) {
+ default:
+ /* Shouldn't happen, just warn and use pid instead of failing */
+ btrfs_warn_rl(fs_info,
+ "unknown read_policy type %u, reset to pid",
+ fs_info->fs_devices->read_policy);
+ fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+ fallthrough;
+ case BTRFS_READ_POLICY_PID:
+ preferred_mirror = first + (current->pid % num_stripes);
+ break;
+ }
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
@@ -5802,9 +5973,29 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
return ret;
}
+static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
+{
+ struct btrfs_block_group *cache;
+ bool ret;
+
+ /* Non zoned filesystem does not use "to_copy" flag */
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+
+ spin_lock(&cache->lock);
+ ret = cache->to_copy;
+ spin_unlock(&cache->lock);
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
struct btrfs_bio **bbio_ret,
struct btrfs_dev_replace *dev_replace,
+ u64 logical,
int *num_stripes_ret, int *max_errors_ret)
{
struct btrfs_bio *bbio = *bbio_ret;
@@ -5818,6 +6009,13 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
int index_where_to_add;
/*
+ * A block group which have "to_copy" set will eventually
+ * copied by dev-replace process. We can avoid cloning IO here.
+ */
+ if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
+ return;
+
+ /*
* duplicate the write operations while the dev replace
* procedure is running. Since the copying of the old disk to
* the new disk takes place at run time while the filesystem is
@@ -5902,23 +6100,24 @@ static bool need_full_stripe(enum btrfs_map_op op)
}
/*
- * btrfs_get_io_geometry - calculates the geomery of a particular (address, len)
- * tuple. This information is used to calculate how big a
- * particular bio can get before it straddles a stripe.
+ * Calculate the geometry of a particular (address, len) tuple. This
+ * information is used to calculate how big a particular bio can get before it
+ * straddles a stripe.
*
- * @fs_info - the filesystem
- * @logical - address that we want to figure out the geometry of
- * @len - the length of IO we are going to perform, starting at @logical
- * @op - type of operation - write or read
- * @io_geom - pointer used to return values
+ * @fs_info: the filesystem
+ * @em: mapping containing the logical extent
+ * @op: type of operation - write or read
+ * @logical: address that we want to figure out the geometry of
+ * @len: the length of IO we are going to perform, starting at @logical
+ * @io_geom: pointer used to return values
*
* Returns < 0 in case a chunk for the given logical address cannot be found,
* usually shouldn't happen unless @logical is corrupted, 0 otherwise.
*/
-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
+ enum btrfs_map_op op, u64 logical, u64 len,
+ struct btrfs_io_geometry *io_geom)
{
- struct extent_map *em;
struct map_lookup *map;
u64 offset;
u64 stripe_offset;
@@ -5926,14 +6125,9 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 stripe_len;
u64 raid56_full_stripe_start = (u64)-1;
int data_stripes;
- int ret = 0;
ASSERT(op != BTRFS_MAP_DISCARD);
- em = btrfs_get_chunk_map(fs_info, logical, len);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
map = em->map_lookup;
/* Offset of this logical address in the chunk */
offset = logical - em->start;
@@ -5947,8 +6141,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
stripe_offset, offset, em->start, logical, stripe_len);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
/* stripe_offset is the offset of this block in its stripe */
@@ -5995,10 +6188,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
io_geom->stripe_offset = stripe_offset;
io_geom->raid56_stripe_offset = raid56_full_stripe_start;
-out:
- /* once for us */
- free_extent_map(em);
- return ret;
+ return 0;
}
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
@@ -6031,12 +6221,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
ASSERT(bbio_ret);
ASSERT(op != BTRFS_MAP_DISCARD);
- ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
+ em = btrfs_get_chunk_map(fs_info, logical, *length);
+ ASSERT(!IS_ERR(em));
+
+ ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom);
if (ret < 0)
return ret;
- em = btrfs_get_chunk_map(fs_info, logical, *length);
- ASSERT(!IS_ERR(em));
map = em->map_lookup;
*length = geom.len;
@@ -6212,8 +6403,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
- handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
- &max_errors);
+ handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
+ &num_stripes, &max_errors);
}
*bbio_ret = bbio;
@@ -6284,7 +6475,7 @@ static void btrfs_end_bio(struct bio *bio)
struct btrfs_device *dev = btrfs_io_bio(bio)->device;
ASSERT(dev->bdev);
- if (bio_op(bio) == REQ_OP_WRITE)
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else if (!(bio->bi_opf & REQ_RAHEAD))
@@ -6336,9 +6527,23 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
btrfs_io_bio(bio)->device = dev;
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
+ /*
+ * For zone append writing, bi_sector must point the beginning of the
+ * zone
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ if (btrfs_dev_is_sequential(dev, physical)) {
+ u64 zone_start = round_down(physical, fs_info->zone_size);
+
+ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
+ } else {
+ bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
+ bio->bi_opf |= REQ_OP_WRITE;
+ }
+ }
btrfs_debug_in_rcu(fs_info,
"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
- bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
+ bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
dev->devid, bio->bi_iter.bi_size);
bio_set_dev(bio, dev->bdev);
@@ -6370,7 +6575,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
{
struct btrfs_device *dev;
struct bio *first_bio = bio;
- u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+ u64 logical = bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
int ret;
@@ -6397,10 +6602,10 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
- ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
+ ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
- if (bio_op(bio) == REQ_OP_WRITE) {
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
ret = raid56_parity_write(fs_info, bio, bbio,
map_length);
} else {
@@ -6423,7 +6628,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
&dev->dev_state) ||
- (bio_op(first_bio) == REQ_OP_WRITE &&
+ (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
bbio_error(bbio, first_bio, logical);
continue;
@@ -6450,8 +6655,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
* If @seed is true, traverse through the seed devices.
*/
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid,
- bool seed)
+ u64 devid, u8 *uuid, u8 *fsid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *seed_devs;
@@ -6658,7 +6862,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
- devid, uuid, NULL, true);
+ devid, uuid, NULL);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
@@ -6797,7 +7001,7 @@ static int read_one_dev(struct extent_buffer *leaf,
}
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid, true);
+ fs_uuid);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@@ -6860,6 +7064,16 @@ static int read_one_dev(struct extent_buffer *leaf,
}
fill_device_from_item(leaf, dev_item, device);
+ if (device->bdev) {
+ u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
+
+ if (device->total_bytes > max_total_bytes) {
+ btrfs_err(fs_info,
+ "device total_bytes should be at most %llu but found %llu",
+ max_total_bytes, device->total_bytes);
+ return -EINVAL;
+ }
+ }
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
@@ -6894,11 +7108,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
* fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
* overallocate but we can keep it as-is, only the first page is used.
*/
- sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
+ sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
+ root->root_key.objectid, 0);
if (IS_ERR(sb))
return PTR_ERR(sb);
set_extent_buffer_uptodate(sb);
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
/*
* The sb extent buffer is artificial and just used to read the system array.
* set_extent_buffer_uptodate() call does not properly mark all it's
@@ -7062,12 +7276,8 @@ static void readahead_tree_node_children(struct extent_buffer *node)
int i;
const int nr_items = btrfs_header_nritems(node);
- for (i = 0; i < nr_items; i++) {
- u64 start;
-
- start = btrfs_node_blockptr(node, i);
- readahead_tree_block(node->fs_info, start);
- }
+ for (i = 0; i < nr_items; i++)
+ btrfs_readahead_node_child(node, i);
}
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
@@ -7454,8 +7664,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
- true);
+ dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -7586,28 +7795,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/* Make sure no dev extent is beyond device bondary */
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
- /* It's possible this device is a dummy for seed device */
- if (dev->disk_total_bytes == 0) {
- struct btrfs_fs_devices *devs;
-
- devs = list_first_entry(&fs_info->fs_devices->seed_list,
- struct btrfs_fs_devices, seed_list);
- dev = btrfs_find_device(devs, devid, NULL, NULL, false);
- if (!dev) {
- btrfs_err(fs_info, "failed to find seed devid %llu",
- devid);
- ret = -EUCLEAN;
- goto out;
- }
- }
-
if (physical_offset + physical_len > dev->disk_total_bytes) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
@@ -7616,6 +7810,20 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
ret = -EUCLEAN;
goto out;
}
+
+ if (dev->zone_info) {
+ u64 zone_size = dev->zone_info->zone_size;
+
+ if (!IS_ALIGNED(physical_offset, zone_size) ||
+ !IS_ALIGNED(physical_len, zone_size)) {
+ btrfs_err(fs_info,
+"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
+ devid, physical_offset, physical_len);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
out:
free_extent_map(em);
return ret;
@@ -7662,6 +7870,19 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
u64 prev_dev_ext_end = 0;
int ret = 0;
+ /*
+ * We don't have a dev_root because we mounted with ignorebadroots and
+ * failed to load the root, so we want to skip the verification in this
+ * case for sure.
+ *
+ * However if the dev root is fine, but the tree itself is corrupted
+ * we'd still fail to mount. This verification is only to make sure
+ * writes can happen safely, so instead just bypass this check
+ * completely in the case of IGNOREBADROOTS.
+ */
+ if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ return 0;
+
key.objectid = 1;
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
@@ -7759,3 +7980,75 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
spin_unlock(&fs_info->swapfile_pins_lock);
return node != NULL;
}
+
+static int relocating_repair_kthread(void *data)
+{
+ struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ u64 target;
+ int ret = 0;
+
+ target = cache->start;
+ btrfs_put_block_group(cache);
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ btrfs_info(fs_info,
+ "zoned: skip relocating block group %llu to repair: EBUSY",
+ target);
+ return -EBUSY;
+ }
+
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
+
+ /* Ensure block group still exists */
+ cache = btrfs_lookup_block_group(fs_info, target);
+ if (!cache)
+ goto out;
+
+ if (!cache->relocating_repair)
+ goto out;
+
+ ret = btrfs_may_alloc_data_chunk(fs_info, target);
+ if (ret < 0)
+ goto out;
+
+ btrfs_info(fs_info,
+ "zoned: relocating block group %llu to repair IO failure",
+ target);
+ ret = btrfs_relocate_chunk(fs_info, target);
+
+out:
+ if (cache)
+ btrfs_put_block_group(cache);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ btrfs_exclop_finish(fs_info);
+
+ return ret;
+}
+
+int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
+{
+ struct btrfs_block_group *cache;
+
+ /* Do not attempt to repair in degraded state */
+ if (btrfs_test_opt(fs_info, DEGRADED))
+ return 0;
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+ if (!cache)
+ return 0;
+
+ spin_lock(&cache->lock);
+ if (cache->relocating_repair) {
+ spin_unlock(&cache->lock);
+ btrfs_put_block_group(cache);
+ return 0;
+ }
+ cache->relocating_repair = 1;
+ spin_unlock(&cache->lock);
+
+ kthread_run(relocating_repair_kthread, cache,
+ "btrfs-relocating-repair");
+
+ return 0;
+}