diff options
Diffstat (limited to 'fs/btrfs/volumes.c')
| -rw-r--r-- | fs/btrfs/volumes.c | 557 |
1 files changed, 425 insertions, 132 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 58b9c419a2b6..bc3b33efddc5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -31,6 +31,7 @@ #include "space-info.h" #include "block-group.h" #include "discard.h" +#include "zoned.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -374,6 +375,7 @@ void btrfs_free_device(struct btrfs_device *device) rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state); bio_put(device->flush_bio); + btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -419,7 +421,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) * Preallocate a bio that's always going to be used for flushing device * barriers and matches the device lifespan */ - dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL); + dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); if (!dev->flush_bio) { kfree(dev); return ERR_PTR(-ENOMEM); @@ -822,7 +824,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, } else { mutex_lock(&fs_devices->device_list_mutex); device = btrfs_find_device(fs_devices, devid, - disk_super->dev_item.uuid, NULL, false); + disk_super->dev_item.uuid, NULL); /* * If this disk has been pulled into an fs devices created by @@ -929,25 +931,30 @@ static noinline struct btrfs_device *device_list_add(const char *path, * make sure it's the same device if the device is mounted */ if (device->bdev) { - struct block_device *path_bdev; + int error; + dev_t path_dev; - path_bdev = lookup_bdev(path); - if (IS_ERR(path_bdev)) { + error = lookup_bdev(path, &path_dev); + if (error) { mutex_unlock(&fs_devices->device_list_mutex); - return ERR_CAST(path_bdev); + return ERR_PTR(error); } - if (device->bdev != path_bdev) { - bdput(path_bdev); + if (device->bdev->bd_dev != path_dev) { mutex_unlock(&fs_devices->device_list_mutex); - btrfs_warn_in_rcu(device->fs_info, + /* + * device->fs_info may not be reliable here, so + * pass in a NULL instead. This avoids a + * possible use-after-free when the fs_info and + * fs_info->sb are already torn down. + */ + btrfs_warn_in_rcu(NULL, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", path, devid, found_transid, current->comm, task_pid_nr(current)); return ERR_PTR(-EEXIST); } - bdput(path_bdev); btrfs_info_in_rcu(device->fs_info, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, rcu_str_deref(device->name), @@ -1038,7 +1045,7 @@ error: } static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, - int step, struct btrfs_device **latest_dev) + struct btrfs_device **latest_dev) { struct btrfs_device *device, *next; @@ -1056,22 +1063,13 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, continue; } - if (device->devid == BTRFS_DEV_REPLACE_DEVID) { - /* - * In the first step, keep the device which has - * the correct fsid and the devid that is used - * for the dev_replace procedure. - * In the second step, the dev_replace state is - * read from the device tree and it is known - * whether the procedure is really active or - * not, which means whether this device is - * used or whether it should be removed. - */ - if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state)) { - continue; - } - } + /* + * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, + * in btrfs_init_dev_replace() so just continue. + */ + if (device->devid == BTRFS_DEV_REPLACE_DEVID) + continue; + if (device->bdev) { blkdev_put(device->bdev, device->mode); device->bdev = NULL; @@ -1080,9 +1078,6 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { list_del_init(&device->dev_alloc_list); clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state)) - fs_devices->rw_devices--; } list_del_init(&device->dev_list); fs_devices->num_devices--; @@ -1095,16 +1090,16 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, * After we have read the system tree and know devids belonging to this * filesystem, remove the device which does not belong there. */ -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *latest_dev = NULL; struct btrfs_fs_devices *seed_dev; mutex_lock(&uuid_mutex); - __btrfs_free_extra_devids(fs_devices, step, &latest_dev); + __btrfs_free_extra_devids(fs_devices, &latest_dev); list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) - __btrfs_free_extra_devids(seed_dev, step, &latest_dev); + __btrfs_free_extra_devids(seed_dev, &latest_dev); fs_devices->latest_bdev = latest_dev->bdev; @@ -1143,6 +1138,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) device->bdev = NULL; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + btrfs_destroy_dev_zone_info(device); device->fs_info = NULL; atomic_set(&device->dev_stats_ccnt, 0); @@ -1223,6 +1219,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; + fs_devices->read_policy = BTRFS_READ_POLICY_PID; return 0; } @@ -1274,7 +1271,7 @@ void btrfs_release_disk_super(struct btrfs_super_block *super) } static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, - u64 bytenr) + u64 bytenr, u64 bytenr_orig) { struct btrfs_super_block *disk_super; struct page *page; @@ -1305,7 +1302,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev /* align our pointer to the offset of the super block */ disk_super = p + offset_in_page(bytenr); - if (btrfs_super_bytenr(disk_super) != bytenr || + if (btrfs_super_bytenr(disk_super) != bytenr_orig || btrfs_super_magic(disk_super) != BTRFS_MAGIC) { btrfs_release_disk_super(p); return ERR_PTR(-EINVAL); @@ -1340,7 +1337,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct block_device *bdev; - u64 bytenr; + u64 bytenr, bytenr_orig; + int ret; lockdep_assert_held(&uuid_mutex); @@ -1350,14 +1348,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, * So, we need to add a special mount option to scan for * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ - bytenr = btrfs_sb_offset(0); flags |= FMODE_EXCL; bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) return ERR_CAST(bdev); - disk_super = btrfs_read_disk_super(bdev, bytenr); + bytenr_orig = btrfs_sb_offset(0); + ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); + if (ret) + return ERR_PTR(ret); + + disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; @@ -1412,11 +1414,62 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) * make sure to start at an offset of at least 1MB. */ return max_t(u64, start, SZ_1M); + case BTRFS_CHUNK_ALLOC_ZONED: + /* + * We don't care about the starting region like regular + * allocator, because we anyway use/reserve the first two zones + * for superblock logging. + */ + return ALIGN(start, device->zone_info->zone_size); default: BUG(); } } +static bool dev_extent_hole_check_zoned(struct btrfs_device *device, + u64 *hole_start, u64 *hole_size, + u64 num_bytes) +{ + u64 zone_size = device->zone_info->zone_size; + u64 pos; + int ret; + bool changed = false; + + ASSERT(IS_ALIGNED(*hole_start, zone_size)); + + while (*hole_size > 0) { + pos = btrfs_find_allocatable_zones(device, *hole_start, + *hole_start + *hole_size, + num_bytes); + if (pos != *hole_start) { + *hole_size = *hole_start + *hole_size - pos; + *hole_start = pos; + changed = true; + if (*hole_size < num_bytes) + break; + } + + ret = btrfs_ensure_empty_zones(device, pos, num_bytes); + + /* Range is ensured to be empty */ + if (!ret) + return changed; + + /* Given hole range was invalid (outside of device) */ + if (ret == -ERANGE) { + *hole_start += *hole_size; + *hole_size = 0; + return 1; + } + + *hole_start += zone_size; + *hole_size -= zone_size; + changed = true; + } + + return changed; +} + /** * dev_extent_hole_check - check if specified hole is suitable for allocation * @device: the device which we have the hole @@ -1424,7 +1477,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) * @hole_size: the size of the hole * @num_bytes: the size of the free space that we need * - * This function may modify @hole_start and @hole_end to reflect the suitable + * This function may modify @hole_start and @hole_size to reflect the suitable * position for allocation. Returns 1 if hole position is updated, 0 otherwise. */ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, @@ -1433,24 +1486,39 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, bool changed = false; u64 hole_end = *hole_start + *hole_size; - /* - * Check before we set max_hole_start, otherwise we could end up - * sending back this offset anyway. - */ - if (contains_pending_extent(device, hole_start, *hole_size)) { - if (hole_end >= *hole_start) - *hole_size = hole_end - *hole_start; - else - *hole_size = 0; - changed = true; - } + for (;;) { + /* + * Check before we set max_hole_start, otherwise we could end up + * sending back this offset anyway. + */ + if (contains_pending_extent(device, hole_start, *hole_size)) { + if (hole_end >= *hole_start) + *hole_size = hole_end - *hole_start; + else + *hole_size = 0; + changed = true; + } + + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + /* No extra check */ + break; + case BTRFS_CHUNK_ALLOC_ZONED: + if (dev_extent_hole_check_zoned(device, hole_start, + hole_size, num_bytes)) { + changed = true; + /* + * The changed hole can contain pending extent. + * Loop again to check that. + */ + continue; + } + break; + default: + BUG(); + } - switch (device->fs_devices->chunk_alloc_policy) { - case BTRFS_CHUNK_ALLOC_REGULAR: - /* No extra check */ break; - default: - BUG(); } return changed; @@ -1503,6 +1571,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device, search_start = dev_extent_search_start(device, search_start); + WARN_ON(device->zone_info && + !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2021,6 +2092,11 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, if (IS_ERR(disk_super)) continue; + if (bdev_is_zoned(bdev)) { + btrfs_reset_sb_log_zones(bdev, copy_num); + continue; + } + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); page = virt_to_page(disk_super); @@ -2099,6 +2175,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); + if (!ret) + btrfs_reada_remove_dev(device); mutex_lock(&uuid_mutex); if (ret) goto error_undo; @@ -2179,6 +2257,7 @@ out: return ret; error_undo: + btrfs_reada_undo_remove_dev(device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, @@ -2296,10 +2375,10 @@ static struct btrfs_device *btrfs_find_device_by_path( dev_uuid = disk_super->dev_item.uuid; if (btrfs_fs_incompat(fs_info, METADATA_UUID)) device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->metadata_uuid, true); + disk_super->metadata_uuid); else device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->fsid, true); + disk_super->fsid); btrfs_release_disk_super(disk_super); if (!device) @@ -2319,7 +2398,7 @@ struct btrfs_device *btrfs_find_device_by_devspec( if (devid) { device = btrfs_find_device(fs_info->fs_devices, devid, NULL, - NULL, true); + NULL); if (!device) return ERR_PTR(-ENOENT); return device; @@ -2468,7 +2547,7 @@ next_slot: read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid, true); + fs_uuid); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -2510,6 +2589,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (IS_ERR(bdev)) return PTR_ERR(bdev); + if (!btrfs_check_device_zone_type(fs_info, bdev)) { + ret = -EINVAL; + goto error; + } + if (fs_devices->seeding) { seeding_dev = 1; down_write(&sb->s_umount); @@ -2543,10 +2627,17 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } rcu_assign_pointer(device->name, name); + device->fs_info = fs_info; + device->bdev = bdev; + + ret = btrfs_get_dev_zone_info(device); + if (ret) + goto error_free_device; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto error_free_device; + goto error_free_zone; } q = bdev_get_queue(bdev); @@ -2559,8 +2650,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; - device->fs_info = fs_info; - device->bdev = bdev; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->mode = FMODE_EXCL; @@ -2568,7 +2657,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - sb->s_flags &= ~SB_RDONLY; + btrfs_clear_sb_rdonly(sb); ret = btrfs_prepare_sprout(fs_info); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2704,9 +2793,11 @@ error_sysfs: mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: if (seeding_dev) - sb->s_flags |= SB_RDONLY; + btrfs_set_sb_rdonly(sb); if (trans) btrfs_end_transaction(trans); +error_free_zone: + btrfs_destroy_dev_zone_info(device); error_free_device: btrfs_free_device(device); error: @@ -4291,6 +4382,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); + btrfs_release_path(path); + mutex_lock(&fs_info->balance_mutex); BUG_ON(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); @@ -4640,11 +4733,10 @@ again: } ret = btrfs_previous_item(root, path, 0, key.type); - if (ret) - mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret < 0) - goto done; if (ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + if (ret < 0) + goto done; ret = 0; btrfs_release_path(path); break; @@ -4876,6 +4968,37 @@ static void init_alloc_chunk_ctl_policy_regular( ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; } +static void init_alloc_chunk_ctl_policy_zoned( + struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + u64 zone_size = fs_devices->fs_info->zone_size; + u64 limit; + int min_num_stripes = ctl->devs_min * ctl->dev_stripes; + int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; + u64 min_chunk_size = min_data_stripes * zone_size; + u64 type = ctl->type; + + ctl->max_stripe_size = zone_size; + if (type & BTRFS_BLOCK_GROUP_DATA) { + ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, + zone_size); + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + ctl->max_chunk_size = ctl->max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + ctl->max_chunk_size = 2 * ctl->max_stripe_size; + ctl->devs_max = min_t(int, ctl->devs_max, + BTRFS_MAX_DEVS_SYS_CHUNK); + } + + /* We don't want a chunk larger than 10% of writable space */ + limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), + zone_size), + min_chunk_size); + ctl->max_chunk_size = min(limit, ctl->max_chunk_size); + ctl->dev_extent_min = zone_size * ctl->dev_stripes; +} + static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { @@ -4896,6 +5019,9 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, case BTRFS_CHUNK_ALLOC_REGULAR: init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); break; + case BTRFS_CHUNK_ALLOC_ZONED: + init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); + break; default: BUG(); } @@ -5022,6 +5148,38 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, return 0; } +static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + u64 zone_size = devices_info[0].dev->zone_info->zone_size; + /* Number of stripes that count for block group size */ + int data_stripes; + + /* + * It should hold because: + * dev_extent_min == dev_extent_want == zone_size * dev_stripes + */ + ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); + + ctl->stripe_size = zone_size; + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; + + /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ + if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { + ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, + ctl->stripe_size) + ctl->nparity, + ctl->dev_stripes); + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; + ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); + } + + ctl->chunk_size = ctl->stripe_size * data_stripes; + + return 0; +} + static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) @@ -5049,6 +5207,8 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, switch (fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: return decide_stripe_size_regular(ctl, devices_info); + case BTRFS_CHUNK_ALLOC_ZONED: + return decide_stripe_size_zoned(ctl, devices_info); default: BUG(); } @@ -5482,7 +5642,18 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; - preferred_mirror = first + current->pid % num_stripes; + switch (fs_info->fs_devices->read_policy) { + default: + /* Shouldn't happen, just warn and use pid instead of failing */ + btrfs_warn_rl(fs_info, + "unknown read_policy type %u, reset to pid", + fs_info->fs_devices->read_policy); + fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; + fallthrough; + case BTRFS_READ_POLICY_PID: + preferred_mirror = first + (current->pid % num_stripes); + break; + } if (dev_replace_is_ongoing && fs_info->dev_replace.cont_reading_from_srcdev_mode == @@ -5802,9 +5973,29 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, return ret; } +static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group *cache; + bool ret; + + /* Non zoned filesystem does not use "to_copy" flag */ + if (!btrfs_is_zoned(fs_info)) + return false; + + cache = btrfs_lookup_block_group(fs_info, logical); + + spin_lock(&cache->lock); + ret = cache->to_copy; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); + return ret; +} + static void handle_ops_on_dev_replace(enum btrfs_map_op op, struct btrfs_bio **bbio_ret, struct btrfs_dev_replace *dev_replace, + u64 logical, int *num_stripes_ret, int *max_errors_ret) { struct btrfs_bio *bbio = *bbio_ret; @@ -5818,6 +6009,13 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, int index_where_to_add; /* + * A block group which have "to_copy" set will eventually + * copied by dev-replace process. We can avoid cloning IO here. + */ + if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) + return; + + /* * duplicate the write operations while the dev replace * procedure is running. Since the copying of the old disk to * the new disk takes place at run time while the filesystem is @@ -5902,23 +6100,24 @@ static bool need_full_stripe(enum btrfs_map_op op) } /* - * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) - * tuple. This information is used to calculate how big a - * particular bio can get before it straddles a stripe. + * Calculate the geometry of a particular (address, len) tuple. This + * information is used to calculate how big a particular bio can get before it + * straddles a stripe. * - * @fs_info - the filesystem - * @logical - address that we want to figure out the geometry of - * @len - the length of IO we are going to perform, starting at @logical - * @op - type of operation - write or read - * @io_geom - pointer used to return values + * @fs_info: the filesystem + * @em: mapping containing the logical extent + * @op: type of operation - write or read + * @logical: address that we want to figure out the geometry of + * @len: the length of IO we are going to perform, starting at @logical + * @io_geom: pointer used to return values * * Returns < 0 in case a chunk for the given logical address cannot be found, * usually shouldn't happen unless @logical is corrupted, 0 otherwise. */ -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 len, struct btrfs_io_geometry *io_geom) +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, + enum btrfs_map_op op, u64 logical, u64 len, + struct btrfs_io_geometry *io_geom) { - struct extent_map *em; struct map_lookup *map; u64 offset; u64 stripe_offset; @@ -5926,14 +6125,9 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 stripe_len; u64 raid56_full_stripe_start = (u64)-1; int data_stripes; - int ret = 0; ASSERT(op != BTRFS_MAP_DISCARD); - em = btrfs_get_chunk_map(fs_info, logical, len); - if (IS_ERR(em)) - return PTR_ERR(em); - map = em->map_lookup; /* Offset of this logical address in the chunk */ offset = logical - em->start; @@ -5947,8 +6141,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, btrfs_crit(fs_info, "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", stripe_offset, offset, em->start, logical, stripe_len); - ret = -EINVAL; - goto out; + return -EINVAL; } /* stripe_offset is the offset of this block in its stripe */ @@ -5995,10 +6188,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom->stripe_offset = stripe_offset; io_geom->raid56_stripe_offset = raid56_full_stripe_start; -out: - /* once for us */ - free_extent_map(em); - return ret; + return 0; } static int __btrfs_map_block(struct btrfs_fs_info *fs_info, @@ -6031,12 +6221,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, ASSERT(bbio_ret); ASSERT(op != BTRFS_MAP_DISCARD); - ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); + em = btrfs_get_chunk_map(fs_info, logical, *length); + ASSERT(!IS_ERR(em)); + + ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom); if (ret < 0) return ret; - em = btrfs_get_chunk_map(fs_info, logical, *length); - ASSERT(!IS_ERR(em)); map = em->map_lookup; *length = geom.len; @@ -6212,8 +6403,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { - handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, - &max_errors); + handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, + &num_stripes, &max_errors); } *bbio_ret = bbio; @@ -6284,7 +6475,7 @@ static void btrfs_end_bio(struct bio *bio) struct btrfs_device *dev = btrfs_io_bio(bio)->device; ASSERT(dev->bdev); - if (bio_op(bio) == REQ_OP_WRITE) + if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); else if (!(bio->bi_opf & REQ_RAHEAD)) @@ -6336,9 +6527,23 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfs_io_bio(bio)->device = dev; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; + /* + * For zone append writing, bi_sector must point the beginning of the + * zone + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (btrfs_dev_is_sequential(dev, physical)) { + u64 zone_start = round_down(physical, fs_info->zone_size); + + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + } else { + bio->bi_opf &= ~REQ_OP_ZONE_APPEND; + bio->bi_opf |= REQ_OP_WRITE; + } + } btrfs_debug_in_rcu(fs_info, "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, + bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, bio->bi_iter.bi_size); bio_set_dev(bio, dev->bdev); @@ -6370,7 +6575,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, { struct btrfs_device *dev; struct bio *first_bio = bio; - u64 logical = (u64)bio->bi_iter.bi_sector << 9; + u64 logical = bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; int ret; @@ -6397,10 +6602,10 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, atomic_set(&bbio->stripes_pending, bbio->num_stripes); if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { + ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ - if (bio_op(bio) == REQ_OP_WRITE) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { ret = raid56_parity_write(fs_info, bio, bbio, map_length); } else { @@ -6423,7 +6628,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - (bio_op(first_bio) == REQ_OP_WRITE && + (btrfs_op(first_bio) == BTRFS_MAP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { bbio_error(bbio, first_bio, logical); continue; @@ -6450,8 +6655,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, * If @seed is true, traverse through the seed devices. */ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid, - bool seed) + u64 devid, u8 *uuid, u8 *fsid) { struct btrfs_device *device; struct btrfs_fs_devices *seed_devs; @@ -6658,7 +6862,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, - devid, uuid, NULL, true); + devid, uuid, NULL); if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); @@ -6797,7 +7001,7 @@ static int read_one_dev(struct extent_buffer *leaf, } device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid, true); + fs_uuid); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, @@ -6860,6 +7064,16 @@ static int read_one_dev(struct extent_buffer *leaf, } fill_device_from_item(leaf, dev_item, device); + if (device->bdev) { + u64 max_total_bytes = i_size_read(device->bdev->bd_inode); + + if (device->total_bytes > max_total_bytes) { + btrfs_err(fs_info, + "device total_bytes should be at most %llu but found %llu", + max_total_bytes, device->total_bytes); + return -EINVAL; + } + } set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { @@ -6894,11 +7108,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will * overallocate but we can keep it as-is, only the first page is used. */ - sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET); + sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, + root->root_key.objectid, 0); if (IS_ERR(sb)) return PTR_ERR(sb); set_extent_buffer_uptodate(sb); - btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* * The sb extent buffer is artificial and just used to read the system array. * set_extent_buffer_uptodate() call does not properly mark all it's @@ -7062,12 +7276,8 @@ static void readahead_tree_node_children(struct extent_buffer *node) int i; const int nr_items = btrfs_header_nritems(node); - for (i = 0; i < nr_items; i++) { - u64 start; - - start = btrfs_node_blockptr(node, i); - readahead_tree_block(node->fs_info, start); - } + for (i = 0; i < nr_items; i++) + btrfs_readahead_node_child(node, i); } int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) @@ -7454,8 +7664,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL, - true); + dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -7586,28 +7795,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } /* Make sure no dev extent is beyond device bondary */ - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; goto out; } - /* It's possible this device is a dummy for seed device */ - if (dev->disk_total_bytes == 0) { - struct btrfs_fs_devices *devs; - - devs = list_first_entry(&fs_info->fs_devices->seed_list, - struct btrfs_fs_devices, seed_list); - dev = btrfs_find_device(devs, devid, NULL, NULL, false); - if (!dev) { - btrfs_err(fs_info, "failed to find seed devid %llu", - devid); - ret = -EUCLEAN; - goto out; - } - } - if (physical_offset + physical_len > dev->disk_total_bytes) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", @@ -7616,6 +7810,20 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, ret = -EUCLEAN; goto out; } + + if (dev->zone_info) { + u64 zone_size = dev->zone_info->zone_size; + + if (!IS_ALIGNED(physical_offset, zone_size) || + !IS_ALIGNED(physical_len, zone_size)) { + btrfs_err(fs_info, +"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", + devid, physical_offset, physical_len); + ret = -EUCLEAN; + goto out; + } + } + out: free_extent_map(em); return ret; @@ -7662,6 +7870,19 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) u64 prev_dev_ext_end = 0; int ret = 0; + /* + * We don't have a dev_root because we mounted with ignorebadroots and + * failed to load the root, so we want to skip the verification in this + * case for sure. + * + * However if the dev root is fine, but the tree itself is corrupted + * we'd still fail to mount. This verification is only to make sure + * writes can happen safely, so instead just bypass this check + * completely in the case of IGNOREBADROOTS. + */ + if (btrfs_test_opt(fs_info, IGNOREBADROOTS)) + return 0; + key.objectid = 1; key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; @@ -7759,3 +7980,75 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) spin_unlock(&fs_info->swapfile_pins_lock); return node != NULL; } + +static int relocating_repair_kthread(void *data) +{ + struct btrfs_block_group *cache = (struct btrfs_block_group *)data; + struct btrfs_fs_info *fs_info = cache->fs_info; + u64 target; + int ret = 0; + + target = cache->start; + btrfs_put_block_group(cache); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + btrfs_info(fs_info, + "zoned: skip relocating block group %llu to repair: EBUSY", + target); + return -EBUSY; + } + + mutex_lock(&fs_info->delete_unused_bgs_mutex); + + /* Ensure block group still exists */ + cache = btrfs_lookup_block_group(fs_info, target); + if (!cache) + goto out; + + if (!cache->relocating_repair) + goto out; + + ret = btrfs_may_alloc_data_chunk(fs_info, target); + if (ret < 0) + goto out; + + btrfs_info(fs_info, + "zoned: relocating block group %llu to repair IO failure", + target); + ret = btrfs_relocate_chunk(fs_info, target); + +out: + if (cache) + btrfs_put_block_group(cache); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + btrfs_exclop_finish(fs_info); + + return ret; +} + +int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group *cache; + + /* Do not attempt to repair in degraded state */ + if (btrfs_test_opt(fs_info, DEGRADED)) + return 0; + + cache = btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return 0; + + spin_lock(&cache->lock); + if (cache->relocating_repair) { + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + return 0; + } + cache->relocating_repair = 1; + spin_unlock(&cache->lock); + + kthread_run(relocating_repair_kthread, cache, + "btrfs-relocating-repair"); + + return 0; +} |