diff options
Diffstat (limited to 'fs')
78 files changed, 1171 insertions, 1223 deletions
@@ -52,7 +52,8 @@ struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ - unsigned head; + unsigned head; /* Written to by userland or under ring_lock + * mutex by aio_read_events_ring(). */ unsigned tail; unsigned magic; @@ -243,6 +244,11 @@ static void aio_free_ring(struct kioctx *ctx) { int i; + /* Disconnect the kiotx from the ring file. This prevents future + * accesses to the kioctx from page migration. + */ + put_aio_ring_file(ctx); + for (i = 0; i < ctx->nr_pages; i++) { struct page *page; pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, @@ -254,8 +260,6 @@ static void aio_free_ring(struct kioctx *ctx) put_page(page); } - put_aio_ring_file(ctx); - if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) { kfree(ctx->ring_pages); ctx->ring_pages = NULL; @@ -283,29 +287,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, { struct kioctx *ctx; unsigned long flags; + pgoff_t idx; int rc; rc = 0; - /* Make sure the old page hasn't already been changed */ + /* mapping->private_lock here protects against the kioctx teardown. */ spin_lock(&mapping->private_lock); ctx = mapping->private_data; - if (ctx) { - pgoff_t idx; - spin_lock_irqsave(&ctx->completion_lock, flags); - idx = old->index; - if (idx < (pgoff_t)ctx->nr_pages) { - if (ctx->ring_pages[idx] != old) - rc = -EAGAIN; - } else - rc = -EINVAL; - spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (!ctx) { + rc = -EINVAL; + goto out; + } + + /* The ring_lock mutex. The prevents aio_read_events() from writing + * to the ring's head, and prevents page migration from mucking in + * a partially initialized kiotx. + */ + if (!mutex_trylock(&ctx->ring_lock)) { + rc = -EAGAIN; + goto out; + } + + idx = old->index; + if (idx < (pgoff_t)ctx->nr_pages) { + /* Make sure the old page hasn't already been changed */ + if (ctx->ring_pages[idx] != old) + rc = -EAGAIN; } else rc = -EINVAL; - spin_unlock(&mapping->private_lock); if (rc != 0) - return rc; + goto out_unlock; /* Writeback must be complete */ BUG_ON(PageWriteback(old)); @@ -314,38 +327,26 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); if (rc != MIGRATEPAGE_SUCCESS) { put_page(new); - return rc; + goto out_unlock; } - /* We can potentially race against kioctx teardown here. Use the - * address_space's private data lock to protect the mapping's - * private_data. + /* Take completion_lock to prevent other writes to the ring buffer + * while the old page is copied to the new. This prevents new + * events from being lost. */ - spin_lock(&mapping->private_lock); - ctx = mapping->private_data; - if (ctx) { - pgoff_t idx; - spin_lock_irqsave(&ctx->completion_lock, flags); - migrate_page_copy(new, old); - idx = old->index; - if (idx < (pgoff_t)ctx->nr_pages) { - /* And only do the move if things haven't changed */ - if (ctx->ring_pages[idx] == old) - ctx->ring_pages[idx] = new; - else - rc = -EAGAIN; - } else - rc = -EINVAL; - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else - rc = -EBUSY; - spin_unlock(&mapping->private_lock); + spin_lock_irqsave(&ctx->completion_lock, flags); + migrate_page_copy(new, old); + BUG_ON(ctx->ring_pages[idx] != old); + ctx->ring_pages[idx] = new; + spin_unlock_irqrestore(&ctx->completion_lock, flags); - if (rc == MIGRATEPAGE_SUCCESS) - put_page(old); - else - put_page(new); + /* The old page is no longer accessible. */ + put_page(old); +out_unlock: + mutex_unlock(&ctx->ring_lock); +out: + spin_unlock(&mapping->private_lock); return rc; } #endif @@ -380,7 +381,7 @@ static int aio_setup_ring(struct kioctx *ctx) file = aio_private_file(ctx, nr_pages); if (IS_ERR(file)) { ctx->aio_ring_file = NULL; - return -EAGAIN; + return -ENOMEM; } ctx->aio_ring_file = file; @@ -415,7 +416,7 @@ static int aio_setup_ring(struct kioctx *ctx) if (unlikely(i != nr_pages)) { aio_free_ring(ctx); - return -EAGAIN; + return -ENOMEM; } ctx->mmap_size = nr_pages * PAGE_SIZE; @@ -429,7 +430,7 @@ static int aio_setup_ring(struct kioctx *ctx) if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; aio_free_ring(ctx); - return -EAGAIN; + return -ENOMEM; } pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); @@ -556,6 +557,10 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); + /* While kioctx setup is in progress, + * we are protected from page migration + * changes ring_pages by ->ring_lock. + */ ring = kmap_atomic(ctx->ring_pages[0]); ring->id = ctx->id; kunmap_atomic(ring); @@ -640,24 +645,28 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; - if (percpu_ref_init(&ctx->users, free_ioctx_users)) - goto err; - - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) - goto err; - spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); + /* Protect against page migration throughout kiotx setup by keeping + * the ring_lock mutex held until setup is complete. */ + mutex_lock(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); + if (percpu_ref_init(&ctx->users, free_ioctx_users)) + goto err; + + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + goto err; + ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) goto err; - if (aio_setup_ring(ctx) < 0) + err = aio_setup_ring(ctx); + if (err < 0) goto err; atomic_set(&ctx->reqs_available, ctx->nr_events - 1); @@ -683,6 +692,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (err) goto err_cleanup; + /* Release the ring_lock mutex now that all setup is complete. */ + mutex_unlock(&ctx->ring_lock); + pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; @@ -692,6 +704,7 @@ err_cleanup: err_ctx: aio_free_ring(ctx); err: + mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); free_percpu(ctx->reqs.pcpu_count); free_percpu(ctx->users.pcpu_count); @@ -1024,6 +1037,7 @@ static long aio_read_events_ring(struct kioctx *ctx, mutex_lock(&ctx->ring_lock); + /* Access to ->ring_pages here is protected by ctx->ring_lock. */ ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; tail = ring->tail; @@ -1002,7 +1002,7 @@ struct bio_map_data { }; static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int is_our_pages) { memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); @@ -1022,7 +1022,7 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, sizeof(struct sg_iovec) * iov_count, gfp_mask); } -static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, +static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, int to_user, int from_user, int do_free_page) { int ret = 0, i; @@ -1120,7 +1120,7 @@ EXPORT_SYMBOL(bio_uncopy_user); */ struct bio *bio_copy_user_iov(struct request_queue *q, struct rq_map_data *map_data, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { struct bio_map_data *bmd; @@ -1259,7 +1259,7 @@ EXPORT_SYMBOL(bio_copy_user); static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { int i, j; @@ -1407,7 +1407,7 @@ EXPORT_SYMBOL(bio_map_user); * device. Returns an error pointer in case of error. */ struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { struct bio *bio; diff --git a/fs/block_dev.c b/fs/block_dev.c index ba0d2b05bb78..552a8d13bc32 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1518,7 +1518,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); blk_start_plug(&plug); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs); if (ret > 0) { ssize_t err; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index ecb5832c0967..5a201d81049c 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -323,6 +323,8 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max) { + if (!wq) + return; wq->normal->max_active = max; if (wq->high) wq->high->max_active = max; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index aad7201ad11b..10db21fa0926 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -330,7 +330,10 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - root_level = btrfs_old_root_level(root, time_seq); + if (path->search_commit_root) + root_level = btrfs_header_level(root->commit_root); + else + root_level = btrfs_old_root_level(root, time_seq); if (root_level + 1 == level) { srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -1099,9 +1102,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, * * returns 0 on success, < 0 on error. */ -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) +static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1137,6 +1140,20 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return 0; } +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) +{ + int ret; + + if (!trans) + down_read(&fs_info->commit_root_sem); + ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots); + if (!trans) + up_read(&fs_info->commit_root_sem); + return ret; +} + /* * this makes the path point to (inum INODE_ITEM ioff) */ @@ -1516,6 +1533,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, if (IS_ERR(trans)) return PTR_ERR(trans); btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + } else { + down_read(&fs_info->commit_root_sem); } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, @@ -1526,8 +1545,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { - ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots); + ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val, + tree_mod_seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -1549,6 +1568,8 @@ out: if (!search_commit_root) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); btrfs_end_transaction(trans, fs_info->extent_root); + } else { + up_read(&fs_info->commit_root_sem); } return ret; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 88d1b1eedc9c..1bcfcdb23cf4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2769,9 +2769,13 @@ again: * the commit roots are read only * so we always do read locks */ + if (p->need_commit_sem) + down_read(&root->fs_info->commit_root_sem); b = root->commit_root; extent_buffer_get(b); level = btrfs_header_level(b); + if (p->need_commit_sem) + up_read(&root->fs_info->commit_root_sem); if (!p->skip_locking) btrfs_tree_read_lock(b); } else { @@ -5360,7 +5364,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, { int ret; int cmp; - struct btrfs_trans_handle *trans = NULL; struct btrfs_path *left_path = NULL; struct btrfs_path *right_path = NULL; struct btrfs_key left_key; @@ -5378,9 +5381,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, u64 right_blockptr; u64 left_gen; u64 right_gen; - u64 left_start_ctransid; - u64 right_start_ctransid; - u64 ctransid; left_path = btrfs_alloc_path(); if (!left_path) { @@ -5404,21 +5404,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_path->search_commit_root = 1; right_path->skip_locking = 1; - spin_lock(&left_root->root_item_lock); - left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_item_lock); - - spin_lock(&right_root->root_item_lock); - right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_item_lock); - - trans = btrfs_join_transaction(left_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - /* * Strategy: Go to the first items of both trees. Then do * @@ -5455,6 +5440,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, * the right if possible or go up and right. */ + down_read(&left_root->fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; left_path->nodes[left_level] = left_root->commit_root; @@ -5464,6 +5450,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_root_level = right_level; right_path->nodes[right_level] = right_root->commit_root; extent_buffer_get(right_path->nodes[right_level]); + up_read(&left_root->fs_info->commit_root_sem); if (left_level == 0) btrfs_item_key_to_cpu(left_path->nodes[left_level], @@ -5482,67 +5469,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, advance_left = advance_right = 0; while (1) { - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. This means, we need to - * join and leave transactions for every item that we process. - */ - if (trans && btrfs_should_end_transaction(trans, left_root)) { - btrfs_release_path(left_path); - btrfs_release_path(right_path); - - ret = btrfs_end_transaction(trans, left_root); - trans = NULL; - if (ret < 0) - goto out; - } - /* now rejoin the transaction */ - if (!trans) { - trans = btrfs_join_transaction(left_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - - spin_lock(&left_root->root_item_lock); - ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_item_lock); - if (ctransid != left_start_ctransid) - left_start_ctransid = 0; - - spin_lock(&right_root->root_item_lock); - ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_item_lock); - if (ctransid != right_start_ctransid) - right_start_ctransid = 0; - - if (!left_start_ctransid || !right_start_ctransid) { - WARN(1, KERN_WARNING - "BTRFS: btrfs_compare_tree detected " - "a change in one of the trees while " - "iterating. This is probably a " - "bug.\n"); - ret = -EIO; - goto out; - } - - /* - * the commit root may have changed, so start again - * where we stopped - */ - left_path->lowest_level = left_level; - right_path->lowest_level = right_level; - ret = btrfs_search_slot(NULL, left_root, - &left_key, left_path, 0, 0); - if (ret < 0) - goto out; - ret = btrfs_search_slot(NULL, right_root, - &right_key, right_path, 0, 0); - if (ret < 0) - goto out; - } - if (advance_left && !left_end_reached) { ret = tree_advance(left_root, left_path, &left_level, left_root_level, @@ -5672,14 +5598,6 @@ out: btrfs_free_path(left_path); btrfs_free_path(right_path); kfree(tmp_buf); - - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, left_root); - else - btrfs_end_transaction(trans, left_root); - } - return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bc96c03dd259..4c48df572bd6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -609,6 +609,7 @@ struct btrfs_path { unsigned int skip_locking:1; unsigned int leave_spinning:1; unsigned int search_commit_root:1; + unsigned int need_commit_sem:1; }; /* @@ -986,7 +987,8 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) #define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) -#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE +#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ + BTRFS_SPACE_INFO_GLOBAL_RSV) enum btrfs_raid_types { BTRFS_RAID_RAID10, @@ -1018,6 +1020,12 @@ enum btrfs_raid_types { */ #define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) +/* + * A fake block group type that is used to communicate global block reserve + * size to userspace via the SPACE_INFO ioctl. + */ +#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) + #define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ BTRFS_AVAIL_ALLOC_BIT_SINGLE) @@ -1440,7 +1448,7 @@ struct btrfs_fs_info { */ struct mutex ordered_extent_flush_mutex; - struct rw_semaphore extent_commit_sem; + struct rw_semaphore commit_root_sem; struct rw_semaphore cleanup_work_sem; @@ -1711,7 +1719,6 @@ struct btrfs_root { struct btrfs_block_rsv *block_rsv; /* free ino cache stuff */ - struct mutex fs_commit_mutex; struct btrfs_free_space_ctl *free_ino_ctl; enum btrfs_caching_type cached; spinlock_t cache_lock; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bd0f752b797b..029d46c2e170 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -329,6 +329,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, { struct extent_state *cached_state = NULL; int ret; + bool need_lock = (current->journal_info == + (void *)BTRFS_SEND_TRANS_STUB); if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; @@ -336,6 +338,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; + if (need_lock) { + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } + lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 0, &cached_state); if (extent_buffer_uptodate(eb) && @@ -347,10 +354,21 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, "found %llu\n", eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; - clear_extent_buffer_uptodate(eb); + + /* + * Things reading via commit roots that don't have normal protection, + * like send, can have a really old block in cache that may point at a + * block that has been free'd and re-allocated. So don't clear uptodate + * if we find an eb that is under IO (dirty/writeback) because we could + * end up reading in the stale data and then writing it back out and + * making everybody very sad. + */ + if (!extent_buffer_under_io(eb)) + clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); + btrfs_tree_read_unlock_blocking(eb); return ret; } @@ -1546,7 +1564,6 @@ int btrfs_init_fs_root(struct btrfs_root *root) root->subv_writers = writers; btrfs_init_free_ino_ctl(root); - mutex_init(&root->fs_commit_mutex); spin_lock_init(&root->cache_lock); init_waitqueue_head(&root->cache_wait); @@ -2324,7 +2341,7 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); - init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); sema_init(&fs_info->uuid_tree_rescan_sem, 1); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c6b6a6e3e735..1306487c82cf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -419,7 +419,7 @@ static noinline void caching_thread(struct btrfs_work *work) again: mutex_lock(&caching_ctl->mutex); /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->extent_commit_sem); + down_read(&fs_info->commit_root_sem); next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -443,10 +443,10 @@ next: break; if (need_resched() || - rwsem_is_contended(&fs_info->extent_commit_sem)) { + rwsem_is_contended(&fs_info->commit_root_sem)) { caching_ctl->progress = last; btrfs_release_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); cond_resched(); goto again; @@ -513,7 +513,7 @@ next: err: btrfs_free_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); free_excluded_extents(extent_root, block_group); @@ -633,10 +633,10 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, return 0; } - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - up_write(&fs_info->extent_commit_sem); + up_write(&fs_info->commit_root_sem); btrfs_get_block_group(cache); @@ -2444,7 +2444,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&locked_ref->lock); spin_lock(&delayed_refs->lock); spin_lock(&locked_ref->lock); - if (rb_first(&locked_ref->ref_root)) { + if (rb_first(&locked_ref->ref_root) || + locked_ref->extent_op) { spin_unlock(&locked_ref->lock); spin_unlock(&delayed_refs->lock); continue; @@ -5470,7 +5471,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache; struct btrfs_space_info *space_info; - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { @@ -5489,7 +5490,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, else fs_info->pinned_extents = &fs_info->freed_extents[0]; - up_write(&fs_info->extent_commit_sem); + up_write(&fs_info->commit_root_sem); list_for_each_entry_rcu(space_info, &fs_info->space_info, list) percpu_counter_set(&space_info->total_bytes_pinned, 0); @@ -5744,6 +5745,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", bytenr, parent, root_objectid, owner_objectid, owner_offset); + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } else { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -8255,14 +8258,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; - down_write(&info->extent_commit_sem); + down_write(&info->commit_root_sem); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); put_caching_control(caching_ctl); } - up_write(&info->extent_commit_sem); + up_write(&info->commit_root_sem); spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { @@ -8336,9 +8339,15 @@ static void __link_block_group(struct btrfs_space_info *space_info, struct btrfs_block_group_cache *cache) { int index = get_block_group_index(cache); + bool first = false; down_write(&space_info->groups_sem); - if (list_empty(&space_info->block_groups[index])) { + if (list_empty(&space_info->block_groups[index])) + first = true; + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); + + if (first) { struct kobject *kobj = &space_info->block_group_kobjs[index]; int ret; @@ -8350,8 +8359,6 @@ static void __link_block_group(struct btrfs_space_info *space_info, kobject_put(&space_info->kobj); } } - list_add_tail(&cache->list, &space_info->block_groups[index]); - up_write(&space_info->groups_sem); } static struct btrfs_block_group_cache * diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ae69a00387e7..3955e475ceec 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -749,6 +749,7 @@ again: * our range starts */ node = tree_search(tree, start); +process_node: if (!node) break; @@ -769,7 +770,10 @@ again: if (start > end) break; - cond_resched_lock(&tree->lock); + if (!cond_resched_lock(&tree->lock)) { + node = rb_next(node); + goto process_node; + } } out: spin_unlock(&tree->lock); @@ -4306,7 +4310,7 @@ static void __free_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -static int extent_buffer_under_io(struct extent_buffer *eb) +int extent_buffer_under_io(struct extent_buffer *eb) { return (atomic_read(&eb->io_pages) || test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 58b27e5ab521..c488b45237bf 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -320,6 +320,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_uptodate(struct extent_buffer *eb); int clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_uptodate(struct extent_buffer *eb); +int extent_buffer_under_io(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c660527af838..eb742c07e7a4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -425,13 +425,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, struct page *page = prepared_pages[pg]; /* * Copy data from userspace to the current page - * - * Disable pagefault to avoid recursive lock since - * the pages are already locked */ - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, count); - pagefault_enable(); /* Flush processor's dcache for this page */ flush_dcache_page(page); @@ -1665,7 +1660,7 @@ again: static ssize_t __btrfs_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, - loff_t *ppos, size_t count, size_t ocount) + size_t count, size_t ocount) { struct file *file = iocb->ki_filp; struct iov_iter i; @@ -1674,7 +1669,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t endbyte; int err; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, count, ocount); if (written < 0 || written == count) @@ -1693,7 +1688,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, if (err) goto out; written += written_buffered; - *ppos = pos + written_buffered; + iocb->ki_pos = pos + written_buffered; invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); out: @@ -1725,8 +1720,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; - loff_t *ppos = &iocb->ki_pos; u64 start_pos; + u64 end_pos; ssize_t num_written = 0; ssize_t err = 0; size_t count, ocount; @@ -1781,7 +1776,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { - err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); + /* Expand hole size to cover write data, preventing empty gap */ + end_pos = round_up(pos + iov->iov_len, root->sectorsize); + err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { mutex_unlock(&inode->i_mutex); goto out; @@ -1793,7 +1790,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (unlikely(file->f_flags & O_DIRECT)) { num_written = __btrfs_direct_write(iocb, iov, nr_segs, - pos, ppos, count, ocount); + pos, count, ocount); } else { struct iov_iter i; @@ -1801,7 +1798,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = __btrfs_buffered_write(file, &i, pos); if (num_written > 0) - *ppos = pos + num_written; + iocb->ki_pos = pos + num_written; } mutex_unlock(&inode->i_mutex); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index ab485e57b6fe..cc8ca193d830 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -55,7 +55,7 @@ static int caching_kthread(void *data) key.type = BTRFS_INODE_ITEM_KEY; again: /* need to make sure the commit_root doesn't disappear */ - mutex_lock(&root->fs_commit_mutex); + down_read(&fs_info->commit_root_sem); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -88,7 +88,7 @@ again: btrfs_item_key_to_cpu(leaf, &key, 0); btrfs_release_path(path); root->cache_progress = last; - mutex_unlock(&root->fs_commit_mutex); + up_read(&fs_info->commit_root_sem); schedule_timeout(1); goto again; } else @@ -127,7 +127,7 @@ next: btrfs_unpin_free_ino(root); out: wake_up(&root->cache_wait); - mutex_unlock(&root->fs_commit_mutex); + up_read(&fs_info->commit_root_sem); btrfs_free_path(path); @@ -223,11 +223,11 @@ again: * or the caching work is done. */ - mutex_lock(&root->fs_commit_mutex); + down_write(&root->fs_info->commit_root_sem); spin_lock(&root->cache_lock); if (root->cached == BTRFS_CACHE_FINISHED) { spin_unlock(&root->cache_lock); - mutex_unlock(&root->fs_commit_mutex); + up_write(&root->fs_info->commit_root_sem); goto again; } spin_unlock(&root->cache_lock); @@ -240,7 +240,7 @@ again: else __btrfs_add_free_space(pinned, objectid, 1); - mutex_unlock(&root->fs_commit_mutex); + up_write(&root->fs_info->commit_root_sem); } } @@ -250,7 +250,7 @@ again: * and others will just be dropped, because the commit root we were * searching has changed. * - * Must be called with root->fs_commit_mutex held + * Must be called with root->fs_info->commit_root_sem held */ void btrfs_unpin_free_ino(struct btrfs_root *root) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06e9a4152b14..5f805bc944fa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -394,6 +394,14 @@ static noinline int compress_file_range(struct inode *inode, (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); + /* + * skip compression for a small file range(<=blocksize) that + * isn't an inline extent, since it dosen't save disk space at all. + */ + if ((end - start + 1) <= blocksize && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + goto cleanup_and_bail_uncompressed; + actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; @@ -1271,6 +1279,15 @@ next_slot: disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* + * if there are pending snapshots for this root, + * we fall into common COW way. + */ + if (!nolock) { + err = btrfs_start_nocow_write(root); + if (!err) + goto out_check; + } + /* * force cow if csum exists in the range. * this ensure that csum for a given extent are * either valid or do not exist. @@ -1289,6 +1306,8 @@ next_slot: out_check: if (extent_end <= start) { path->slots[0]++; + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto next_slot; } if (!nocow) { @@ -1306,8 +1325,11 @@ out_check: ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, page_started, nr_written, 1); - if (ret) + if (ret) { + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto error; + } cow_start = (u64)-1; } @@ -1354,8 +1376,11 @@ out_check: BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - if (ret) + if (ret) { + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto error; + } } extent_clear_unlock_delalloc(inode, cur_offset, @@ -1363,6 +1388,8 @@ out_check: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_SET_PRIVATE2); + if (!nolock && nocow) + btrfs_end_nocow_write(root); cur_offset = extent_end; if (cur_offset > end) break; @@ -8476,19 +8503,20 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, else iput(inode); ret = -ENOMEM; - break; + goto out; } list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); ret++; if (nr != -1 && ret >= nr) - break; + goto out; cond_resched(); spin_lock(&root->delalloc_lock); } spin_unlock(&root->delalloc_lock); +out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0401397b5c92..e79ff6b90cb7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1472,6 +1472,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; + char *retptr; char *devstr = NULL; int ret = 0; int mod = 0; @@ -1539,8 +1540,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, mod = 1; sizestr++; } - new_size = memparse(sizestr, NULL); - if (new_size == 0) { + new_size = memparse(sizestr, &retptr); + if (*retptr != '\0' || new_size == 0) { ret = -EINVAL; goto out_free; } @@ -3140,8 +3141,9 @@ process_slot: new_key.offset + datal, 1); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + if (ret != -EINVAL) + btrfs_abort_transaction(trans, + root, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3538,6 +3540,11 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) up_read(&info->groups_sem); } + /* + * Global block reserve, exported as a space_info + */ + slot_count++; + /* space_slots == 0 means they are asking for a count */ if (space_args.space_slots == 0) { space_args.total_spaces = slot_count; @@ -3596,6 +3603,21 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) up_read(&info->groups_sem); } + /* + * Add global block reserve + */ + if (slot_count) { + struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; + + spin_lock(&block_rsv->lock); + space.total_bytes = block_rsv->size; + space.used_bytes = block_rsv->size - block_rsv->reserved; + spin_unlock(&block_rsv->lock); + space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; + memcpy(dest, &space, sizeof(space)); + space_args.total_spaces++; + } + user_dest = (struct btrfs_ioctl_space_info __user *) (arg + sizeof(struct btrfs_ioctl_space_args)); @@ -4531,9 +4553,8 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, } args64 = kmalloc(sizeof(*args64), GFP_NOFS); - if (IS_ERR(args64)) { - ret = PTR_ERR(args64); - args64 = NULL; + if (!args64) { + ret = -ENOMEM; goto out; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index def428a25b2a..7f92ab1daa87 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2317,7 +2317,6 @@ void free_reloc_roots(struct list_head *list) static noinline_for_stack int merge_reloc_roots(struct reloc_control *rc) { - struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_root *reloc_root; u64 last_snap; @@ -2375,26 +2374,6 @@ again: list_add_tail(&reloc_root->root_list, &reloc_roots); goto out; - } else if (!ret) { - /* - * recover the last snapshot tranid to avoid - * the space balance break NOCOW. - */ - root = read_fs_root(rc->extent_root->fs_info, - objectid); - if (IS_ERR(root)) - continue; - - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - - /* Check if the fs/file tree was snapshoted or not. */ - if (btrfs_root_last_snapshot(&root->root_item) == - otransid - 1) - btrfs_set_root_last_snapshot(&root->root_item, - last_snap); - - btrfs_end_transaction(trans, root); } } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 93e6d7172844..0be77993378e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2235,6 +2235,47 @@ behind_scrub_pages: return 0; } +/* + * Given a physical address, this will calculate it's + * logical offset. if this is a parity stripe, it will return + * the most left data stripe's logical offset. + * + * return 0 if it is a data stripe, 1 means parity stripe. + */ +static int get_raid56_logic_offset(u64 physical, int num, + struct map_lookup *map, u64 *offset) +{ + int i; + int j = 0; + u64 stripe_nr; + u64 last_offset; + int stripe_index; + int rot; + + last_offset = (physical - map->stripes[num].physical) * + nr_data_stripes(map); + *offset = last_offset; + for (i = 0; i < nr_data_stripes(map); i++) { + *offset = last_offset + i * map->stripe_len; + + stripe_nr = *offset; + do_div(stripe_nr, map->stripe_len); + do_div(stripe_nr, nr_data_stripes(map)); + + /* Work out the disk rotation on this stripe-set */ + rot = do_div(stripe_nr, map->num_stripes); + /* calculate which stripe this data locates */ + rot += i; + stripe_index = rot % map->num_stripes; + if (stripe_index == num) + return 0; + if (stripe_index < num) + j++; + } + *offset = last_offset + j * map->stripe_len; + return 1; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, @@ -2256,6 +2297,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 physical; u64 logical; u64 logic_end; + u64 physical_end; u64 generation; int mirror_num; struct reada_control *reada1; @@ -2269,16 +2311,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 extent_len; struct btrfs_device *extent_dev; int extent_mirror_num; - int stop_loop; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { - if (num >= nr_data_stripes(map)) { - return 0; - } - } + int stop_loop = 0; nstripes = length; + physical = map->stripes[num].physical; offset = 0; do_div(nstripes, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { @@ -2296,6 +2332,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + get_raid56_logic_offset(physical, num, map, &offset); + increment = map->stripe_len * nr_data_stripes(map); + mirror_num = 1; } else { increment = map->stripe_len; mirror_num = 1; @@ -2319,7 +2360,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * to not hold off transaction commits */ logical = base + offset; - + physical_end = physical + nstripes * map->stripe_len; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + get_raid56_logic_offset(physical_end, num, + map, &logic_end); + logic_end += base; + } else { + logic_end = logical + increment * nstripes; + } wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); scrub_blocked_if_needed(fs_info); @@ -2328,7 +2377,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_start.objectid = logical; key_start.type = BTRFS_EXTENT_ITEM_KEY; key_start.offset = (u64)0; - key_end.objectid = base + offset + nstripes * increment; + key_end.objectid = logic_end; key_end.type = BTRFS_METADATA_ITEM_KEY; key_end.offset = (u64)-1; reada1 = btrfs_reada_add(root, &key_start, &key_end); @@ -2338,7 +2387,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_start.offset = logical; key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key_end.type = BTRFS_EXTENT_CSUM_KEY; - key_end.offset = base + offset + nstripes * increment; + key_end.offset = logic_end; reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); if (!IS_ERR(reada1)) @@ -2356,11 +2405,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* * now find all extents for each stripe and scrub them */ - logical = base + offset; - physical = map->stripes[num].physical; - logic_end = logical + increment * nstripes; ret = 0; - while (logical < logic_end) { + while (physical < physical_end) { + /* for raid56, we skip parity stripe */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + ret = get_raid56_logic_offset(physical, num, + map, &logical); + logical += base; + if (ret) + goto skip; + } /* * canceled? */ @@ -2504,15 +2559,29 @@ again: scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { - logical += increment; - physical += map->stripe_len; - + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + /* + * loop until we find next data stripe + * or we have finished all stripes. + */ + do { + physical += map->stripe_len; + ret = get_raid56_logic_offset( + physical, num, + map, &logical); + logical += base; + } while (physical < physical_end && ret); + } else { + physical += map->stripe_len; + logical += increment; + } if (logical < key.objectid + bytes) { cond_resched(); goto again; } - if (logical >= logic_end) { + if (physical >= physical_end) { stop_loop = 1; break; } @@ -2521,6 +2590,7 @@ next: path->slots[0]++; } btrfs_release_path(path); +skip: logical += increment; physical += map->stripe_len; spin_lock(&sctx->stat_lock); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9b6da9d55f9a..1ac3ca98c429 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -493,6 +493,7 @@ static struct btrfs_path *alloc_path_for_send(void) return NULL; path->search_commit_root = 1; path->skip_locking = 1; + path->need_commit_sem = 1; return path; } @@ -771,29 +772,22 @@ out: /* * Helper function to retrieve some fields from an inode item. */ -static int get_inode_info(struct btrfs_root *root, - u64 ino, u64 *size, u64 *gen, - u64 *mode, u64 *uid, u64 *gid, - u64 *rdev) +static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, + u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, + u64 *gid, u64 *rdev) { int ret; struct btrfs_inode_item *ii; struct btrfs_key key; - struct btrfs_path *path; - - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; if (ret) { - ret = -ENOENT; - goto out; + if (ret > 0) + ret = -ENOENT; + return ret; } ii = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -811,7 +805,22 @@ static int get_inode_info(struct btrfs_root *root, if (rdev) *rdev = btrfs_inode_rdev(path->nodes[0], ii); -out: + return ret; +} + +static int get_inode_info(struct btrfs_root *root, + u64 ino, u64 *size, u64 *gen, + u64 *mode, u64 *uid, u64 *gid, + u64 *rdev) +{ + struct btrfs_path *path; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, + rdev); btrfs_free_path(path); return ret; } @@ -1085,6 +1094,7 @@ out: struct backref_ctx { struct send_ctx *sctx; + struct btrfs_path *path; /* number of total found references */ u64 found; @@ -1155,8 +1165,9 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) * There are inodes that have extents that lie behind its i_size. Don't * accept clones from these extents. */ - ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, - NULL); + ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL, + NULL, NULL, NULL); + btrfs_release_path(bctx->path); if (ret < 0) return ret; @@ -1235,12 +1246,17 @@ static int find_extent_clone(struct send_ctx *sctx, if (!tmp_path) return -ENOMEM; + /* We only use this path under the commit sem */ + tmp_path->need_commit_sem = 0; + backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); if (!backref_ctx) { ret = -ENOMEM; goto out; } + backref_ctx->path = tmp_path; + if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. @@ -1268,8 +1284,10 @@ static int find_extent_clone(struct send_ctx *sctx, } logical = disk_byte + btrfs_file_extent_offset(eb, fi); + down_read(&sctx->send_root->fs_info->commit_root_sem); ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, &found_key, &flags); + up_read(&sctx->send_root->fs_info->commit_root_sem); btrfs_release_path(tmp_path); if (ret < 0) @@ -4418,6 +4436,9 @@ static int send_hole(struct send_ctx *sctx, u64 end) p = fs_path_alloc(); if (!p) return -ENOMEM; + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto tlv_put_failure; memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); while (offset < end) { len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); @@ -4425,9 +4446,6 @@ static int send_hole(struct send_ctx *sctx, u64 end) ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) break; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - break; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); @@ -4968,7 +4986,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (S_ISREG(sctx->cur_inode_mode)) { if (need_send_hole(sctx)) { - if (sctx->cur_inode_last_extent == (u64)-1) { + if (sctx->cur_inode_last_extent == (u64)-1 || + sctx->cur_inode_last_extent < + sctx->cur_inode_size) { ret = get_last_extent(sctx, (u64)-1); if (ret) goto out; @@ -5367,57 +5387,21 @@ out: static int full_send_tree(struct send_ctx *sctx) { int ret; - struct btrfs_trans_handle *trans = NULL; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; struct extent_buffer *eb; int slot; - u64 start_ctransid; - u64 ctransid; path = alloc_path_for_send(); if (!path) return -ENOMEM; - spin_lock(&send_root->root_item_lock); - start_ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_item_lock); - key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; -join_trans: - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. Join a transaction to prevent - * this. - */ - trans = btrfs_join_transaction(send_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - - /* - * Make sure the tree has not changed after re-joining. We detect this - * by comparing start_ctransid and ctransid. They should always match. - */ - spin_lock(&send_root->root_item_lock); - ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_item_lock); - - if (ctransid != start_ctransid) { - WARN(1, KERN_WARNING "BTRFS: the root that you're trying to " - "send was modified in between. This is " - "probably a bug.\n"); - ret = -EIO; - goto out; - } - ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) goto out; @@ -5425,19 +5409,6 @@ join_trans: goto out_finish; while (1) { - /* - * When someone want to commit while we iterate, end the - * joined transaction and rejoin. - */ - if (btrfs_should_end_transaction(trans, send_root)) { - ret = btrfs_end_transaction(trans, send_root); - trans = NULL; - if (ret < 0) - goto out; - btrfs_release_path(path); - goto join_trans; - } - eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); @@ -5465,12 +5436,6 @@ out_finish: out: btrfs_free_path(path); - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, send_root); - else - btrfs_end_transaction(trans, send_root); - } return ret; } @@ -5718,7 +5683,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) NULL); sort_clone_roots = 1; + current->journal_info = (void *)BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); + current->journal_info = NULL; if (ret < 0) goto out; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9dbf42395153..5011aadacab8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,6 +66,8 @@ static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; +static int btrfs_remount(struct super_block *sb, int *flags, char *data); + static const char *btrfs_decode_error(int errno) { char *errstr = "unknown"; @@ -1185,6 +1187,26 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); kfree(newargs); + + if (PTR_RET(mnt) == -EBUSY) { + if (flags & MS_RDONLY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, + newargs); + } else { + int r; + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, + newargs); + if (IS_ERR(mnt)) + return ERR_CAST(mnt); + + r = btrfs_remount(mnt->mnt_sb, &flags, NULL); + if (r < 0) { + /* FIXME: release vfsmount mnt ??*/ + return ERR_PTR(r); + } + } + } + if (IS_ERR(mnt)) return ERR_CAST(mnt); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a04707f740d6..7579f6d0b854 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -75,10 +75,21 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) } } -static noinline void switch_commit_root(struct btrfs_root *root) +static noinline void switch_commit_roots(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info) { - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + struct btrfs_root *root, *tmp; + + down_write(&fs_info->commit_root_sem); + list_for_each_entry_safe(root, tmp, &trans->switch_commits, + dirty_list) { + list_del_init(&root->dirty_list); + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); + if (is_fstree(root->objectid)) + btrfs_unpin_free_ino(root); + } + up_write(&fs_info->commit_root_sem); } static inline void extwriter_counter_inc(struct btrfs_transaction *trans, @@ -208,6 +219,7 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->ordered_operations); INIT_LIST_HEAD(&cur_trans->pending_chunks); + INIT_LIST_HEAD(&cur_trans->switch_commits); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -375,7 +387,8 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); - if (current->journal_info) { + if (current->journal_info && + current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; h->use_count++; @@ -919,9 +932,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return ret; } - if (root != root->fs_info->extent_root) - switch_commit_root(root); - return 0; } @@ -977,15 +987,16 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); + if (root != fs_info->extent_root) + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; } - down_write(&fs_info->extent_commit_sem); - switch_commit_root(fs_info->extent_root); - up_write(&fs_info->extent_commit_sem); - + list_add_tail(&fs_info->extent_root->dirty_list, + &trans->transaction->switch_commits); btrfs_after_dev_replace_commit(fs_info); return 0; @@ -1042,11 +1053,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, smp_wmb(); if (root->commit_root != root->node) { - mutex_lock(&root->fs_commit_mutex); - switch_commit_root(root); - btrfs_unpin_free_ino(root); - mutex_unlock(&root->fs_commit_mutex); - + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); btrfs_set_root_node(&root->root_item, root->node); } @@ -1857,11 +1865,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_set_root_node(&root->fs_info->tree_root->root_item, root->fs_info->tree_root->node); - switch_commit_root(root->fs_info->tree_root); + list_add_tail(&root->fs_info->tree_root->dirty_list, + &cur_trans->switch_commits); btrfs_set_root_node(&root->fs_info->chunk_root->root_item, root->fs_info->chunk_root->node); - switch_commit_root(root->fs_info->chunk_root); + list_add_tail(&root->fs_info->chunk_root->dirty_list, + &cur_trans->switch_commits); + + switch_commit_roots(cur_trans, root->fs_info); assert_qgroups_uptodate(trans); update_super_roots(root); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 6ac037e9f9f0..b57b924e8e03 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -57,6 +57,7 @@ struct btrfs_transaction { struct list_head pending_snapshots; struct list_head ordered_operations; struct list_head pending_chunks; + struct list_head switch_commits; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; @@ -78,6 +79,8 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ __TRANS_ATTACH) +#define BTRFS_SEND_TRANS_STUB 1 + struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d241130a32fd..49d7fab73360 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -448,6 +448,14 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } +/* + * Add new device to list of registered devices + * + * Returns: + * 1 - first time device is seen + * 0 - device already known + * < 0 - error + */ static noinline int device_list_add(const char *path, struct btrfs_super_block *disk_super, u64 devid, struct btrfs_fs_devices **fs_devices_ret) @@ -455,6 +463,7 @@ static noinline int device_list_add(const char *path, struct btrfs_device *device; struct btrfs_fs_devices *fs_devices; struct rcu_string *name; + int ret = 0; u64 found_transid = btrfs_super_generation(disk_super); fs_devices = find_fsid(disk_super->fsid); @@ -495,6 +504,7 @@ static noinline int device_list_add(const char *path, fs_devices->num_devices++; mutex_unlock(&fs_devices->device_list_mutex); + ret = 1; device->fs_devices = fs_devices; } else if (!device->name || strcmp(device->name->str, path)) { name = rcu_string_strdup(path, GFP_NOFS); @@ -513,7 +523,8 @@ static noinline int device_list_add(const char *path, fs_devices->latest_trans = found_transid; } *fs_devices_ret = fs_devices; - return 0; + + return ret; } static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) @@ -910,17 +921,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); - if (disk_super->label[0]) { - if (disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; - printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); - } else { - printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); - } - - printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); - ret = device_list_add(path, disk_super, devid, fs_devices_ret); + if (ret > 0) { + if (disk_super->label[0]) { + if (disk_super->label[BTRFS_LABEL_SIZE - 1]) + disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; + printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); + } else { + printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); + } + + printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); + ret = 0; + } if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; diff --git a/fs/buffer.c b/fs/buffer.c index 8c53a2b15ecb..9ddb9fc7d923 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2114,8 +2114,8 @@ EXPORT_SYMBOL(generic_write_end); * Returns true if all buffers which correspond to a file portion * we want to read are uptodate. */ -int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, - unsigned long from) +int block_is_partially_uptodate(struct page *page, unsigned long from, + unsigned long count) { unsigned block_start, block_end, blocksize; unsigned to; @@ -2127,7 +2127,7 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, head = page_buffers(page); blocksize = head->b_size; - to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); + to = min_t(unsigned, PAGE_CACHE_SIZE - from, count); to = from + to; if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) return 0; diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 622f4696e484..5b99bafc31d1 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) /* check parameters */ ret = -EOPNOTSUPP; if (!root->d_inode || - !root->d_inode->i_op || !root->d_inode->i_op->lookup || !root->d_inode->i_op->mkdir || !root->d_inode->i_op->setxattr || diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 6494d9f673aa..c0a681705104 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -779,8 +779,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, } ret = -EPERM; - if (!subdir->d_inode->i_op || - !subdir->d_inode->i_op->setxattr || + if (!subdir->d_inode->i_op->setxattr || !subdir->d_inode->i_op->getxattr || !subdir->d_inode->i_op->lookup || !subdir->d_inode->i_op->mkdir || diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 66075a4ad979..39da1c2efa50 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -601,7 +601,7 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, false); if (IS_ERR(req)) { ret = PTR_ERR(req); - goto out; + break; } num_pages = calc_pages_for(page_align, len); @@ -719,7 +719,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, false); if (IS_ERR(req)) { ret = PTR_ERR(req); - goto out; + break; } /* @@ -972,6 +972,7 @@ retry_snap: } } else { loff_t old_size = inode->i_size; + struct iov_iter from; /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -979,9 +980,10 @@ retry_snap: * are pending vmtruncate. So write and vmtruncate * can not run at the same time */ - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, &iocb->ki_pos, - count, 0); + iov_iter_init(&from, iov, nr_segs, count, 0); + written = generic_perform_write(file, &from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; if (inode->i_size > old_size) ceph_fscache_update_objectsize(inode); mutex_unlock(&inode->i_mutex); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index efbe08289292..fdf941b44ff1 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -1,9 +1,8 @@ +#include <linux/ceph/ceph_debug.h> #include <linux/in.h> #include "super.h" #include "mds_client.h" -#include <linux/ceph/ceph_debug.h> - #include "ioctl.h" diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2c70cbe35d39..5be1f997ecde 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -253,6 +253,11 @@ cifs_alloc_inode(struct super_block *sb) cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags); + clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags); + clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags); + spin_lock_init(&cifs_inode->writers_lock); + cifs_inode->writers = 0; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; @@ -732,19 +737,26 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct inode *inode = file_inode(iocb->ki_filp); + struct cifsInodeInfo *cinode = CIFS_I(inode); ssize_t written; int rc; + written = cifs_get_writer(cinode); + if (written) + return written; + written = generic_file_aio_write(iocb, iov, nr_segs, pos); if (CIFS_CACHE_WRITE(CIFS_I(inode))) - return written; + goto out; rc = filemap_fdatawrite(inode->i_mapping); if (rc) cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n", rc, inode); +out: + cifs_put_writer(cinode); return written; } @@ -850,7 +862,6 @@ const struct inode_operations cifs_file_inode_ops = { /* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, .getattr = cifs_getattr, /* do we need this anymore? */ - .rename = cifs_rename, .permission = cifs_permission, #ifdef CONFIG_CIFS_XATTR .setxattr = cifs_setxattr, diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c0f3718b77a8..30f6e9251a4a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -228,6 +228,8 @@ struct smb_version_operations { /* verify the message */ int (*check_message)(char *, unsigned int); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); + void (*downgrade_oplock)(struct TCP_Server_Info *, + struct cifsInodeInfo *, bool); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -1113,6 +1115,12 @@ struct cifsInodeInfo { unsigned int epoch; /* used to track lease state changes */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ + unsigned long flags; +#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ +#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ +#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */ + spinlock_t writers_lock; + unsigned int writers; /* Number of writers on this inode */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index acc4ee8ed075..ca7980a1e303 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); +extern int cifs_get_writer(struct cifsInodeInfo *cinode); +extern void cifs_put_writer(struct cifsInodeInfo *cinode); +extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode); extern int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f3264bd7a83d..6ce4e0954b98 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -6197,6 +6197,9 @@ QAllEAsRetry: cifs_dbg(FYI, "ea length %d\n", list_len); if (list_len <= 8) { cifs_dbg(FYI, "empty EA list returned from server\n"); + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; goto QAllEAsOut; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 216d7e99f921..5ed03e0b8b40 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2579,19 +2579,32 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, struct cifsInodeInfo *cinode = CIFS_I(inode); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; ssize_t rc = -EACCES; - loff_t lock_pos = pos; + loff_t lock_pos = iocb->ki_pos; - if (file->f_flags & O_APPEND) - lock_pos = i_size_read(inode); /* * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents writing. */ down_read(&cinode->lock_sem); + mutex_lock(&inode->i_mutex); + if (file->f_flags & O_APPEND) + lock_pos = i_size_read(inode); if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs), server->vals->exclusive_lock_type, NULL, - CIFS_WRITE_OP)) - rc = generic_file_aio_write(iocb, iov, nr_segs, pos); + CIFS_WRITE_OP)) { + rc = __generic_file_aio_write(iocb, iov, nr_segs); + mutex_unlock(&inode->i_mutex); + + if (rc > 0) { + ssize_t err; + + err = generic_write_sync(file, iocb->ki_pos - rc, rc); + if (err < 0) + rc = err; + } + } else { + mutex_unlock(&inode->i_mutex); + } up_read(&cinode->lock_sem); return rc; } @@ -2608,12 +2621,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); ssize_t written; + written = cifs_get_writer(cinode); + if (written) + return written; + if (CIFS_CACHE_WRITE(cinode)) { if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) - && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_write(iocb, iov, nr_segs, pos); - return cifs_writev(iocb, iov, nr_segs, pos); + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + written = generic_file_aio_write( + iocb, iov, nr_segs, pos); + goto out; + } + written = cifs_writev(iocb, iov, nr_segs, pos); + goto out; } /* * For non-oplocked files in strict cache mode we need to write the data @@ -2633,6 +2654,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, inode); cinode->oplock = 0; } +out: + cifs_put_writer(cinode); return written; } @@ -2727,56 +2750,27 @@ cifs_retry_async_readv(struct cifs_readdata *rdata) /** * cifs_readdata_to_iov - copy data from pages in response to an iovec * @rdata: the readdata response with list of pages holding data - * @iov: vector in which we should copy the data - * @nr_segs: number of segments in vector - * @offset: offset into file of the first iovec - * @copied: used to return the amount of data copied to the iov + * @iter: destination for our data * * This function copies data from a list of pages in a readdata response into * an array of iovecs. It will first calculate where the data should go * based on the info in the readdata and then copy the data into that spot. */ -static ssize_t -cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov, - unsigned long nr_segs, loff_t offset, ssize_t *copied) +static int +cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) { - int rc = 0; - struct iov_iter ii; - size_t pos = rdata->offset - offset; - ssize_t remaining = rdata->bytes; - unsigned char *pdata; + size_t remaining = rdata->bytes; unsigned int i; - /* set up iov_iter and advance to the correct offset */ - iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0); - iov_iter_advance(&ii, pos); - - *copied = 0; for (i = 0; i < rdata->nr_pages; i++) { - ssize_t copy; struct page *page = rdata->pages[i]; - - /* copy a whole page or whatever's left */ - copy = min_t(ssize_t, remaining, PAGE_SIZE); - - /* ...but limit it to whatever space is left in the iov */ - copy = min_t(ssize_t, copy, iov_iter_count(&ii)); - - /* go while there's data to be copied and no errors */ - if (copy && !rc) { - pdata = kmap(page); - rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset, - (int)copy); - kunmap(page); - if (!rc) { - *copied += copy; - remaining -= copy; - iov_iter_advance(&ii, copy); - } - } + size_t copy = min_t(size_t, remaining, PAGE_SIZE); + size_t written = copy_page_to_iter(page, 0, copy, iter); + remaining -= written; + if (written < copy && iov_iter_count(iter) > 0) + break; } - - return rc; + return remaining ? -EFAULT : 0; } static void @@ -2837,20 +2831,21 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, return total_read > 0 ? total_read : result; } -static ssize_t -cifs_iovec_read(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *poffset) +ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { + struct file *file = iocb->ki_filp; ssize_t rc; size_t len, cur_len; ssize_t total_read = 0; - loff_t offset = *poffset; + loff_t offset = pos; unsigned int npages; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; struct cifsFileInfo *open_file; struct cifs_readdata *rdata, *tmp; struct list_head rdata_list; + struct iov_iter to; pid_t pid; if (!nr_segs) @@ -2860,6 +2855,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, if (!len) return 0; + iov_iter_init(&to, iov, nr_segs, len, 0); + INIT_LIST_HEAD(&rdata_list); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); open_file = file->private_data; @@ -2885,7 +2882,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, cifs_uncached_readv_complete); if (!rdata) { rc = -ENOMEM; - goto error; + break; } rc = cifs_read_allocate_pages(rdata, npages); @@ -2917,55 +2914,44 @@ error: if (!list_empty(&rdata_list)) rc = 0; + len = iov_iter_count(&to); /* the loop below should proceed in the order of increasing offsets */ -restart_loop: list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { + again: if (!rc) { - ssize_t copied; - /* FIXME: freezable sleep too? */ rc = wait_for_completion_killable(&rdata->done); if (rc) rc = -EINTR; - else if (rdata->result) + else if (rdata->result) { rc = rdata->result; - else { - rc = cifs_readdata_to_iov(rdata, iov, - nr_segs, *poffset, - &copied); - total_read += copied; + /* resend call if it's a retryable error */ + if (rc == -EAGAIN) { + rc = cifs_retry_async_readv(rdata); + goto again; + } + } else { + rc = cifs_readdata_to_iov(rdata, &to); } - /* resend call if it's a retryable error */ - if (rc == -EAGAIN) { - rc = cifs_retry_async_readv(rdata); - goto restart_loop; - } } list_del_init(&rdata->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); } + total_read = len - iov_iter_count(&to); + cifs_stats_bytes_read(tcon, total_read); - *poffset += total_read; /* mask nodata case */ if (rc == -ENODATA) rc = 0; - return total_read ? total_read : rc; -} - -ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - ssize_t read; - - read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos); - if (read > 0) - iocb->ki_pos = pos; - - return read; + if (total_read) { + iocb->ki_pos = pos + total_read; + return total_read; + } + return rc; } ssize_t @@ -3645,6 +3631,13 @@ static int cifs_launder_page(struct page *page) return rc; } +static int +cifs_pending_writers_wait(void *unused) +{ + schedule(); + return 0; +} + void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -3652,8 +3645,15 @@ void cifs_oplock_break(struct work_struct *work) struct inode *inode = cfile->dentry->d_inode; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; + wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, + cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + + server->ops->downgrade_oplock(server, cinode, + test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); + if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && cifs_has_mand_locks(cinode)) { cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", @@ -3690,6 +3690,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + cifs_done_oplock_break(cinode); } /* diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 2f9f3790679d..3b0c62e622da 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -466,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) cifs_dbg(FYI, "file id match, oplock break\n"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - cifs_set_oplock_level(pCifsInode, - pSMB->OplockLevel ? OPLOCK_READ : 0); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &pCifsInode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (pSMB->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + queue_work(cifsiod_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; @@ -551,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->oplock = 0; } +static int +cifs_oplock_break_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +/* + * We wait for oplock breaks to be processed before we attempt to perform + * writes. + */ +int cifs_get_writer(struct cifsInodeInfo *cinode) +{ + int rc; + +start: + rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, + cifs_oplock_break_wait, TASK_KILLABLE); + if (rc) + return rc; + + spin_lock(&cinode->writers_lock); + if (!cinode->writers) + set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + cinode->writers++; + /* Check to see if we have started servicing an oplock break */ + if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) { + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); + goto start; + } + spin_unlock(&cinode->writers_lock); + return 0; +} + +void cifs_put_writer(struct cifsInodeInfo *cinode) +{ + spin_lock(&cinode->writers_lock); + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); +} + +void cifs_done_oplock_break(struct cifsInodeInfo *cinode) +{ + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK); +} + bool backup_cred(struct cifs_sb_info *cifs_sb) { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 526fb89f9230..d1fdfa848703 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) return 0; } +static void +cifs_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + cifs_set_oplock_level(cinode, OPLOCK_READ); + else + cifs_set_oplock_level(cinode, 0); +} + static bool cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server, char *buf, int malformed) @@ -1019,6 +1029,7 @@ struct smb_version_operations smb1_operations = { .clear_stats = cifs_clear_stats, .print_stats = cifs_print_stats, .is_oplock_break = is_valid_oplock_break, + .downgrade_oplock = cifs_downgrade_oplock, .check_trans2 = cifs_check_trans2, .need_neg = cifs_need_neg, .negotiate = cifs_negotiate, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index fb3966265b6e..b8021fde987d 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) else cfile->oplock_break_cancelled = false; - server->ops->set_oplock_level(cinode, - rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0, - 0, NULL); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &cinode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (rsp->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); queue_work(cifsiod_wq, &cfile->oplock_break); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 192f51a12cf1..35ddc3ed119d 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -905,6 +905,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, } static void +smb2_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II, + 0, NULL); + else + server->ops->set_oplock_level(cinode, 0, 0, NULL); +} + +static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { @@ -1110,6 +1121,7 @@ struct smb_version_operations smb20_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1184,6 +1196,7 @@ struct smb_version_operations smb21_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1259,6 +1272,7 @@ struct smb_version_operations smb30_operations = { .print_stats = smb2_print_stats, .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 860344701067..3802f8c94acc 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1352,7 +1352,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { int rc; - char *res_key = NULL; struct compress_ioctl fsctl_input; char *ret_data = NULL; @@ -1365,7 +1364,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, 2 /* in data len */, &ret_data /* out data */, NULL); cifs_dbg(FYI, "set compression rc %d\n", rc); - kfree(res_key); return rc; } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3190ca973dd6..1e5b45359509 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -424,7 +424,7 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) } /* Data available on socket or listen socket received a connect */ -static void lowcomms_data_ready(struct sock *sk, int count_unused) +static void lowcomms_data_ready(struct sock *sk) { struct connection *con = sock2con(sk); if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) diff --git a/fs/exec.c b/fs/exec.c index 9e81c630dfa7..476f3ebf437e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -813,7 +813,7 @@ EXPORT_SYMBOL(kernel_read); ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { - ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos); + ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_range(addr, addr + len); return res; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4e508fc83dcf..ca7502d89fde 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -146,7 +146,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, overwrite = 1; } - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs); mutex_unlock(&inode->i_mutex); if (ret > 0) { diff --git a/fs/file.c b/fs/file.c index b61293badfb1..8f294cfac697 100644 --- a/fs/file.c +++ b/fs/file.c @@ -25,7 +25,10 @@ int sysctl_nr_open __read_mostly = 1024*1024; int sysctl_nr_open_min = BITS_PER_LONG; -int sysctl_nr_open_max = 1024 * 1024; /* raised later */ +/* our max() is unusable in constant expressions ;-/ */ +#define __const_max(x, y) ((x) < (y) ? (x) : (y)) +int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) & + -BITS_PER_LONG; static void *alloc_fdmem(size_t size) { @@ -429,12 +432,6 @@ void exit_files(struct task_struct *tsk) } } -void __init files_defer_init(void) -{ - sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & - -BITS_PER_LONG; -} - struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, diff --git a/fs/file_table.c b/fs/file_table.c index 01071c4d752e..a374f5033e97 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -52,7 +52,6 @@ static void file_free_rcu(struct rcu_head *head) static inline void file_free(struct file *f) { percpu_counter_dec(&nr_files); - file_check_state(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -178,47 +177,12 @@ struct file *alloc_file(struct path *path, fmode_t mode, file->f_mapping = path->dentry->d_inode->i_mapping; file->f_mode = mode; file->f_op = fop; - - /* - * These mounts don't really matter in practice - * for r/o bind mounts. They aren't userspace- - * visible. We do this for consistency, and so - * that we can do debugging checks at __fput() - */ - if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { - file_take_write(file); - WARN_ON(mnt_clone_write(path->mnt)); - } if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); return file; } EXPORT_SYMBOL(alloc_file); -/** - * drop_file_write_access - give up ability to write to a file - * @file: the file to which we will stop writing - * - * This is a central place which will give up the ability - * to write to @file, along with access to write through - * its vfsmount. - */ -static void drop_file_write_access(struct file *file) -{ - struct vfsmount *mnt = file->f_path.mnt; - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - - put_write_access(inode); - - if (special_file(inode->i_mode)) - return; - if (file_check_writeable(file) != 0) - return; - __mnt_drop_write(mnt); - file_release_write(file); -} - /* the real guts of fput() - releasing the last reference to file */ static void __fput(struct file *file) @@ -253,8 +217,10 @@ static void __fput(struct file *file) put_pid(file->f_owner.pid); if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); - if (file->f_mode & FMODE_WRITE) - drop_file_write_access(file); + if (file->f_mode & FMODE_WRITER) { + put_write_access(inode); + __mnt_drop_write(mnt); + } file->f_path.dentry = NULL; file->f_path.mnt = NULL; file->f_inode = NULL; @@ -359,6 +325,5 @@ void __init files_init(unsigned long mempages) n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - files_defer_init(); percpu_counter_init(&nr_files, 0); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0a648bb455ae..aac71ce373e4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -667,15 +667,15 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) struct pipe_buffer *buf = cs->currbuf; if (!cs->write) { - buf->ops->unmap(cs->pipe, buf, cs->mapaddr); + kunmap_atomic(cs->mapaddr); } else { - kunmap(buf->page); + kunmap_atomic(cs->mapaddr); buf->len = PAGE_SIZE - cs->len; } cs->currbuf = NULL; cs->mapaddr = NULL; } else if (cs->mapaddr) { - kunmap(cs->pg); + kunmap_atomic(cs->mapaddr); if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); @@ -706,7 +706,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); + cs->mapaddr = kmap_atomic(buf->page); cs->len = buf->len; cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; @@ -726,7 +726,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap(page); + cs->mapaddr = kmap_atomic(page); cs->buf = cs->mapaddr; cs->len = PAGE_SIZE; cs->pipebufs++; @@ -745,7 +745,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) return err; BUG_ON(err != 1); offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap(cs->pg); + cs->mapaddr = kmap_atomic(cs->pg); cs->buf = cs->mapaddr + offset; cs->len = min(PAGE_SIZE - offset, cs->seglen); cs->seglen -= cs->len; @@ -874,7 +874,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) out_fallback_unlock: unlock_page(newpage); out_fallback: - cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->mapaddr = kmap_atomic(buf->page); cs->buf = cs->mapaddr + buf->offset; err = lock_request(cs->fc, cs->req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 48992cac714b..13f8bdec5110 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1086,9 +1086,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - pagefault_disable(); tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); - pagefault_enable(); flush_dcache_page(page); mark_page_accessed(page); @@ -1237,8 +1235,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, goto out; if (file->f_flags & O_DIRECT) { - written = generic_file_direct_write(iocb, iov, &nr_segs, - pos, &iocb->ki_pos, + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, count, ocount); if (written < 0 || written == count) goto out; diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index abb0f1f53d93..985217626e66 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -48,14 +48,18 @@ void __init kernfs_inode_init(void) static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { + static DEFINE_MUTEX(iattr_mutex); + struct kernfs_iattrs *ret; struct iattr *iattrs; + mutex_lock(&iattr_mutex); + if (kn->iattr) - return kn->iattr; + goto out_unlock; kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); if (!kn->iattr) - return NULL; + goto out_unlock; iattrs = &kn->iattr->ia_iattr; /* assign default attributes */ @@ -65,8 +69,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; simple_xattrs_init(&kn->iattr->xattrs); - - return kn->iattr; +out_unlock: + ret = kn->iattr; + mutex_unlock(&iattr_mutex); + return ret; } static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) diff --git a/fs/mount.h b/fs/mount.h index b29e42f05f34..d55297f2fa05 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -10,7 +10,7 @@ struct mnt_namespace { struct user_namespace *user_ns; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; - int event; + u64 event; }; struct mnt_pcp { @@ -104,6 +104,9 @@ struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); + void *cached_mount; + u64 cached_event; + loff_t cached_index; }; #define proc_mounts(p) (container_of((p), struct proc_mounts, m)) diff --git a/fs/namei.c b/fs/namei.c index 88339f59efb5..c6157c894fce 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -358,6 +358,7 @@ int generic_permission(struct inode *inode, int mask) return -EACCES; } +EXPORT_SYMBOL(generic_permission); /* * We _really_ want to just do "generic_permission()" without @@ -455,6 +456,7 @@ int inode_permission(struct inode *inode, int mask) return retval; return __inode_permission(inode, mask); } +EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path @@ -924,6 +926,7 @@ int follow_up(struct path *path) path->mnt = &parent->mnt; return 1; } +EXPORT_SYMBOL(follow_up); /* * Perform an automount @@ -1085,6 +1088,7 @@ int follow_down_one(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down_one); static inline bool managed_dentry_might_block(struct dentry *dentry) { @@ -1223,6 +1227,7 @@ int follow_down(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down); /* * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() @@ -2025,6 +2030,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path) *path = nd.path; return res; } +EXPORT_SYMBOL(kern_path); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair @@ -2049,6 +2055,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, *path = nd.path; return err; } +EXPORT_SYMBOL(vfs_path_lookup); /* * Restricted form of lookup. Doesn't follow links, single-component only, @@ -2111,6 +2118,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return __lookup_hash(&this, base, 0); } +EXPORT_SYMBOL(lookup_one_len); int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) @@ -2135,6 +2143,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, { return user_path_at_empty(dfd, name, flags, path, NULL); } +EXPORT_SYMBOL(user_path_at); /* * NB: most callers don't do anything directly with the reference to the @@ -2477,6 +2486,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); return NULL; } +EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { @@ -2486,6 +2496,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } +EXPORT_SYMBOL(unlock_rename); int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) @@ -2506,6 +2517,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_create); static int may_open(struct path *path, int acc_mode, int flag) { @@ -3375,6 +3387,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { @@ -3464,6 +3477,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) fsnotify_mkdir(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mkdir); SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { @@ -3518,6 +3532,7 @@ void dentry_unhash(struct dentry *dentry) __d_drop(dentry); spin_unlock(&dentry->d_lock); } +EXPORT_SYMBOL(dentry_unhash); int vfs_rmdir(struct inode *dir, struct dentry *dentry) { @@ -3555,6 +3570,7 @@ out: d_delete(dentry); return error; } +EXPORT_SYMBOL(vfs_rmdir); static long do_rmdir(int dfd, const char __user *pathname) { @@ -3672,6 +3688,7 @@ out: return error; } +EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its @@ -3785,6 +3802,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_symlink); SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) @@ -3893,6 +3911,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de fsnotify_link(dir, inode, new_dentry); return error; } +EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid @@ -4152,6 +4171,7 @@ out: return error; } +EXPORT_SYMBOL(vfs_rename); SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) @@ -4304,11 +4324,9 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) +int readlink_copy(char __user *buffer, int buflen, const char *link) { - int len; - - len = PTR_ERR(link); + int len = PTR_ERR(link); if (IS_ERR(link)) goto out; @@ -4320,6 +4338,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c out: return len; } +EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that @@ -4337,11 +4356,12 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(cookie)) return PTR_ERR(cookie); - res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + res = readlink_copy(buffer, buflen, nd_get_link(&nd)); if (dentry->d_inode->i_op->put_link) dentry->d_inode->i_op->put_link(dentry, &nd, cookie); return res; } +EXPORT_SYMBOL(generic_readlink); /* get the link contents into pagecache */ static char *page_getlink(struct dentry * dentry, struct page **ppage) @@ -4361,14 +4381,14 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage) int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct page *page = NULL; - char *s = page_getlink(dentry, &page); - int res = vfs_readlink(dentry,buffer,buflen,s); + int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); if (page) { kunmap(page); page_cache_release(page); } return res; } +EXPORT_SYMBOL(page_readlink); void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) { @@ -4376,6 +4396,7 @@ void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) nd_set_link(nd, page_getlink(dentry, &page)); return page; } +EXPORT_SYMBOL(page_follow_link_light); void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { @@ -4386,6 +4407,7 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) page_cache_release(page); } } +EXPORT_SYMBOL(page_put_link); /* * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS @@ -4423,45 +4445,18 @@ retry: fail: return err; } +EXPORT_SYMBOL(__page_symlink); int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); } +EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, }; - -EXPORT_SYMBOL(user_path_at); -EXPORT_SYMBOL(follow_down_one); -EXPORT_SYMBOL(follow_down); -EXPORT_SYMBOL(follow_up); -EXPORT_SYMBOL(get_write_access); /* nfsd */ -EXPORT_SYMBOL(lock_rename); -EXPORT_SYMBOL(lookup_one_len); -EXPORT_SYMBOL(page_follow_link_light); -EXPORT_SYMBOL(page_put_link); -EXPORT_SYMBOL(page_readlink); -EXPORT_SYMBOL(__page_symlink); -EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(kern_path); -EXPORT_SYMBOL(vfs_path_lookup); -EXPORT_SYMBOL(inode_permission); -EXPORT_SYMBOL(unlock_rename); -EXPORT_SYMBOL(vfs_create); -EXPORT_SYMBOL(vfs_link); -EXPORT_SYMBOL(vfs_mkdir); -EXPORT_SYMBOL(vfs_mknod); -EXPORT_SYMBOL(generic_permission); -EXPORT_SYMBOL(vfs_readlink); -EXPORT_SYMBOL(vfs_rename); -EXPORT_SYMBOL(vfs_rmdir); -EXPORT_SYMBOL(vfs_symlink); -EXPORT_SYMBOL(vfs_unlink); -EXPORT_SYMBOL(dentry_unhash); -EXPORT_SYMBOL(generic_readlink); diff --git a/fs/namespace.c b/fs/namespace.c index 2ffc5a2905d4..182bc41cd887 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -52,7 +52,7 @@ static int __init set_mphash_entries(char *str) } __setup("mphash_entries=", set_mphash_entries); -static int event; +static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); static DEFINE_SPINLOCK(mnt_id_lock); @@ -414,9 +414,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write); */ int __mnt_want_write_file(struct file *file) { - struct inode *inode = file_inode(file); - - if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) + if (!(file->f_mode & FMODE_WRITER)) return __mnt_want_write(file->f_path.mnt); else return mnt_clone_write(file->f_path.mnt); @@ -570,13 +568,17 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { kfree(mnt->mnt_devname); - mnt_free_id(mnt); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } +static void delayed_free_vfsmnt(struct rcu_head *head) +{ + free_vfsmnt(container_of(head, struct mount, mnt_rcu)); +} + /* call under rcu_read_lock */ bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { @@ -848,6 +850,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void root = mount_fs(type, flags, name, data); if (IS_ERR(root)) { + mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_CAST(root); } @@ -885,7 +888,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, goto out_free; } - mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); /* Don't allow unprivileged users to change mount flags */ if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; @@ -928,20 +931,11 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return mnt; out_free: + mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); } -static void delayed_free(struct rcu_head *head) -{ - struct mount *mnt = container_of(head, struct mount, mnt_rcu); - kfree(mnt->mnt_devname); -#ifdef CONFIG_SMP - free_percpu(mnt->mnt_pcp); -#endif - kmem_cache_free(mnt_cache, mnt); -} - static void mntput_no_expire(struct mount *mnt) { put_again: @@ -991,7 +985,7 @@ put_again: dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); mnt_free_id(mnt); - call_rcu(&mnt->mnt_rcu, delayed_free); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); } void mntput(struct vfsmount *mnt) @@ -1100,14 +1094,29 @@ static void *m_start(struct seq_file *m, loff_t *pos) struct proc_mounts *p = proc_mounts(m); down_read(&namespace_sem); - return seq_list_start(&p->ns->list, *pos); + if (p->cached_event == p->ns->event) { + void *v = p->cached_mount; + if (*pos == p->cached_index) + return v; + if (*pos == p->cached_index + 1) { + v = seq_list_next(v, &p->ns->list, &p->cached_index); + return p->cached_mount = v; + } + } + + p->cached_event = p->ns->event; + p->cached_mount = seq_list_start(&p->ns->list, *pos); + p->cached_index = *pos; + return p->cached_mount; } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = proc_mounts(m); - return seq_list_next(v, &p->ns->list, pos); + p->cached_mount = seq_list_next(v, &p->ns->list, pos); + p->cached_index = *pos; + return p->cached_mount; } static void m_stop(struct seq_file *m, void *v) @@ -1661,9 +1670,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); + lock_mount_hash(); if (err) goto out_cleanup_ids; - lock_mount_hash(); for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } else { @@ -1690,6 +1699,11 @@ static int attach_recursive_mnt(struct mount *source_mnt, return 0; out_cleanup_ids: + while (!hlist_empty(&tree_list)) { + child = hlist_entry(tree_list.first, struct mount, mnt_hash); + umount_tree(child, 0); + } + unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: return err; @@ -2044,7 +2058,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) struct mount *parent; int err; - mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT); + mnt_flags &= ~MNT_INTERNAL_FLAGS; mp = lock_mount(path); if (IS_ERR(mp)) diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 81b4f643ecef..e31e589369a4 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -470,9 +470,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) { struct ncp_mount_data_kernel data; struct ncp_server *server; - struct file *ncp_filp; struct inode *root_inode; - struct inode *sock_inode; struct socket *sock; int error; int default_bufsize; @@ -541,18 +539,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) || !gid_valid(data.gid)) goto out; - error = -EBADF; - ncp_filp = fget(data.ncp_fd); - if (!ncp_filp) - goto out; - error = -ENOTSOCK; - sock_inode = file_inode(ncp_filp); - if (!S_ISSOCK(sock_inode->i_mode)) - goto out_fput; - sock = SOCKET_I(sock_inode); + sock = sockfd_lookup(data.ncp_fd, &error); if (!sock) - goto out_fput; - + goto out; + if (sock->type == SOCK_STREAM) default_bufsize = 0xF000; else @@ -574,27 +564,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (error) goto out_fput; - server->ncp_filp = ncp_filp; server->ncp_sock = sock; if (data.info_fd != -1) { - struct socket *info_sock; - - error = -EBADF; - server->info_filp = fget(data.info_fd); - if (!server->info_filp) - goto out_bdi; - error = -ENOTSOCK; - sock_inode = file_inode(server->info_filp); - if (!S_ISSOCK(sock_inode->i_mode)) - goto out_fput2; - info_sock = SOCKET_I(sock_inode); + struct socket *info_sock = sockfd_lookup(data.info_fd, &error); if (!info_sock) - goto out_fput2; + goto out_bdi; + server->info_sock = info_sock; error = -EBADFD; if (info_sock->type != SOCK_STREAM) goto out_fput2; - server->info_sock = info_sock; } /* server->lock = 0; */ @@ -766,17 +745,12 @@ out_nls: mutex_destroy(&server->root_setup_lock); mutex_destroy(&server->mutex); out_fput2: - if (server->info_filp) - fput(server->info_filp); + if (server->info_sock) + sockfd_put(server->info_sock); out_bdi: bdi_destroy(&server->bdi); out_fput: - /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: - * - * The previously used put_filp(ncp_filp); was bogus, since - * it doesn't perform proper unlocking. - */ - fput(ncp_filp); + sockfd_put(sock); out: put_pid(data.wdog_pid); sb->s_fs_info = NULL; @@ -809,9 +783,9 @@ static void ncp_put_super(struct super_block *sb) mutex_destroy(&server->root_setup_lock); mutex_destroy(&server->mutex); - if (server->info_filp) - fput(server->info_filp); - fput(server->ncp_filp); + if (server->info_sock) + sockfd_put(server->info_sock); + sockfd_put(server->ncp_sock); kill_pid(server->m.wdog_pid, SIGTERM, 1); put_pid(server->m.wdog_pid); diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index b81e97adc5a9..55e26fd80886 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -45,9 +45,7 @@ struct ncp_server { __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; - struct file *ncp_filp; /* File pointer to ncp socket */ struct socket *ncp_sock;/* ncp socket */ - struct file *info_filp; struct socket *info_sock; u8 sequence; @@ -111,7 +109,7 @@ struct ncp_server { spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */ - void (*data_ready)(struct sock* sk, int len); + void (*data_ready)(struct sock* sk); void (*error_report)(struct sock* sk); void (*write_space)(struct sock* sk); /* STREAM mode only */ struct { @@ -153,7 +151,7 @@ extern void ncp_tcp_tx_proc(struct work_struct *work); extern void ncpdgram_rcv_proc(struct work_struct *work); extern void ncpdgram_timeout_proc(struct work_struct *work); extern void ncpdgram_timeout_call(unsigned long server); -extern void ncp_tcp_data_ready(struct sock* sk, int len); +extern void ncp_tcp_data_ready(struct sock* sk); extern void ncp_tcp_write_space(struct sock* sk); extern void ncp_tcp_error_report(struct sock* sk); diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 04a69a4d8e96..471bc3d1139e 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -97,11 +97,11 @@ static void ncp_req_put(struct ncp_request_reply *req) kfree(req); } -void ncp_tcp_data_ready(struct sock *sk, int len) +void ncp_tcp_data_ready(struct sock *sk) { struct ncp_server *server = sk->sk_user_data; - server->data_ready(sk, len); + server->data_ready(sk); schedule_work(&server->rcv.tq); } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 9d8153ebacfb..f47af5e6e230 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1704,8 +1704,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) iput(bvi); skip_large_index_stuff: /* Setup the operations for this index inode. */ - vi->i_op = NULL; - vi->i_fop = NULL; vi->i_mapping->a_ops = &ntfs_mst_aops; vi->i_blocks = ni->allocated_size >> 9; /* diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index eb649d23a4de..c6b90e670389 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -137,7 +137,7 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] = static void o2net_sc_connect_completed(struct work_struct *work); static void o2net_rx_until_empty(struct work_struct *work); static void o2net_shutdown_sc(struct work_struct *work); -static void o2net_listen_data_ready(struct sock *sk, int bytes); +static void o2net_listen_data_ready(struct sock *sk); static void o2net_sc_send_keep_req(struct work_struct *work); static void o2net_idle_timer(unsigned long data); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); @@ -597,9 +597,9 @@ static void o2net_set_nn_state(struct o2net_node *nn, } /* see o2net_register_callbacks() */ -static void o2net_data_ready(struct sock *sk, int bytes) +static void o2net_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); read_lock(&sk->sk_callback_lock); if (sk->sk_user_data) { @@ -613,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes) } read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } /* see o2net_register_callbacks() */ @@ -916,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { - int ret; - mm_segment_t oldfs; - struct kvec vec = { - .iov_len = len, - .iov_base = data, - }; - struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&vec, - .msg_flags = MSG_DONTWAIT, - }; - - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); - set_fs(oldfs); - - return ret; + struct kvec vec = { .iov_len = len, .iov_base = data, }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; + return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags); } static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, size_t veclen, size_t total) { int ret; - mm_segment_t oldfs; - struct msghdr msg = { - .msg_iov = (struct iovec *)vec, - .msg_iovlen = veclen, - }; + struct msghdr msg; if (sock == NULL) { ret = -EINVAL; goto out; } - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_sendmsg(sock, &msg, total); - set_fs(oldfs); - if (ret != total) { - mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, - total); - if (ret >= 0) - ret = -EPIPE; /* should be smarter, I bet */ - goto out; - } - - ret = 0; + ret = kernel_sendmsg(sock, &msg, vec, veclen, total); + if (likely(ret == total)) + return 0; + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ out: - if (ret < 0) - mlog(0, "returning error: %d\n", ret); + mlog(0, "returning error: %d\n", ret); return ret; } @@ -1953,9 +1926,9 @@ static void o2net_accept_many(struct work_struct *work) cond_resched(); } -static void o2net_listen_data_ready(struct sock *sk, int bytes) +static void o2net_listen_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); read_lock(&sk->sk_callback_lock); ready = sk->sk_user_data; @@ -1978,7 +1951,6 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes) */ if (sk->sk_state == TCP_LISTEN) { - mlog(ML_TCP, "bytes: %d\n", bytes); queue_work(o2net_wq, &o2net_listen_work); } else { ready = NULL; @@ -1987,7 +1959,7 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes) out: read_unlock(&sk->sk_callback_lock); if (ready != NULL) - ready(sk, bytes); + ready(sk); } static int o2net_open_listening_sock(__be32 addr, __be16 port) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4cbcb65784a3..dc024367110a 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -165,7 +165,7 @@ struct o2net_sock_container { /* original handlers for the sockets */ void (*sc_state_change)(struct sock *sk); - void (*sc_data_ready)(struct sock *sk, int bytes); + void (*sc_data_ready)(struct sock *sk); u32 sc_msg_key; u16 sc_msg_type; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ff33c5ef87f2..8970dcf74de5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2367,15 +2367,18 @@ relock: if (direct_io) { written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, - ppos, count, ocount); + count, ocount); if (written < 0) { ret = written; goto out_dio; } } else { + struct iov_iter from; + iov_iter_init(&from, iov, nr_segs, count, 0); current->backing_dev_info = file->f_mapping->backing_dev_info; - written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, - ppos, count, 0); + written = generic_perform_write(file, &from, *ppos); + if (likely(written >= 0)) + iocb->ki_pos = *ppos + written; current->backing_dev_info = NULL; } diff --git a/fs/open.c b/fs/open.c index 631aea815def..3d30eb1fc95e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -655,35 +655,6 @@ out: return error; } -/* - * You have to be very careful that these write - * counts get cleaned up in error cases and - * upon __fput(). This should probably never - * be called outside of __dentry_open(). - */ -static inline int __get_file_write_access(struct inode *inode, - struct vfsmount *mnt) -{ - int error; - error = get_write_access(inode); - if (error) - return error; - /* - * Do not take mount writer counts on - * special files since no writes to - * the mount itself will occur. - */ - if (!special_file(inode->i_mode)) { - /* - * Balanced in __fput() - */ - error = __mnt_want_write(mnt); - if (error) - put_write_access(inode); - } - return error; -} - int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ @@ -708,26 +679,28 @@ static int do_dentry_open(struct file *f, f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - if (unlikely(f->f_flags & O_PATH)) - f->f_mode = FMODE_PATH; - path_get(&f->f_path); inode = f->f_inode = f->f_path.dentry->d_inode; - if (f->f_mode & FMODE_WRITE) { - error = __get_file_write_access(inode, f->f_path.mnt); - if (error) - goto cleanup_file; - if (!special_file(inode->i_mode)) - file_take_write(f); - } - f->f_mapping = inode->i_mapping; - if (unlikely(f->f_mode & FMODE_PATH)) { + if (unlikely(f->f_flags & O_PATH)) { + f->f_mode = FMODE_PATH; f->f_op = &empty_fops; return 0; } + if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { + error = get_write_access(inode); + if (unlikely(error)) + goto cleanup_file; + error = __mnt_want_write(f->f_path.mnt); + if (unlikely(error)) { + put_write_access(inode); + goto cleanup_file; + } + f->f_mode |= FMODE_WRITER; + } + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; @@ -764,18 +737,9 @@ static int do_dentry_open(struct file *f, cleanup_all: fops_put(f->f_op); - if (f->f_mode & FMODE_WRITE) { + if (f->f_mode & FMODE_WRITER) { put_write_access(inode); - if (!special_file(inode->i_mode)) { - /* - * We don't consider this a real - * mnt_want/drop_write() pair - * because it all happenend right - * here, so just reset the state. - */ - file_reset_write(f); - __mnt_drop_write(f->f_path.mnt); - } + __mnt_drop_write(f->f_path.mnt); } cleanup_file: path_put(&f->f_path); diff --git a/fs/pipe.c b/fs/pipe.c index 78fd0d0788db..034bffac3f97 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -142,55 +142,6 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, return 0; } -static int -pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, - int atomic) -{ - unsigned long copy; - - while (len > 0) { - while (!iov->iov_len) - iov++; - copy = min_t(unsigned long, len, iov->iov_len); - - if (atomic) { - if (__copy_to_user_inatomic(iov->iov_base, from, copy)) - return -EFAULT; - } else { - if (copy_to_user(iov->iov_base, from, copy)) - return -EFAULT; - } - from += copy; - len -= copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - return 0; -} - -/* - * Attempt to pre-fault in the user memory, so we can use atomic copies. - * Returns the number of bytes not faulted in. - */ -static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) -{ - while (!iov->iov_len) - iov++; - - while (len > 0) { - unsigned long this_len; - - this_len = min_t(unsigned long, len, iov->iov_len); - if (fault_in_pages_writeable(iov->iov_base, this_len)) - break; - - len -= this_len; - iov++; - } - - return len; -} - /* * Pre-fault in the user memory, so we can use atomic copies. */ @@ -226,52 +177,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, } /** - * generic_pipe_buf_map - virtually map a pipe buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer that should be mapped - * @atomic: whether to use an atomic map - * - * Description: - * This function returns a kernel virtual address mapping for the - * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided - * and the caller has to be careful not to fault before calling - * the unmap function. - * - * Note that this function calls kmap_atomic() if @atomic != 0. - */ -void *generic_pipe_buf_map(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, int atomic) -{ - if (atomic) { - buf->flags |= PIPE_BUF_FLAG_ATOMIC; - return kmap_atomic(buf->page); - } - - return kmap(buf->page); -} -EXPORT_SYMBOL(generic_pipe_buf_map); - -/** - * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer that should be unmapped - * @map_data: the data that the mapping function returned - * - * Description: - * This function undoes the mapping that ->map() provided. - */ -void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, void *map_data) -{ - if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { - buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; - kunmap_atomic(map_data); - } else - kunmap(buf->page); -} -EXPORT_SYMBOL(generic_pipe_buf_unmap); - -/** * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal @@ -351,8 +256,6 @@ EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -361,8 +264,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { static const struct pipe_buf_operations packet_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -379,12 +280,15 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, ssize_t ret; struct iovec *iov = (struct iovec *)_iov; size_t total_len; + struct iov_iter iter; total_len = iov_length(iov, nr_segs); /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; + iov_iter_init(&iter, iov, nr_segs, total_len, 0); + do_wakeup = 0; ret = 0; __pipe_lock(pipe); @@ -394,9 +298,9 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, int curbuf = pipe->curbuf; struct pipe_buffer *buf = pipe->bufs + curbuf; const struct pipe_buf_operations *ops = buf->ops; - void *addr; size_t chars = buf->len; - int error, atomic; + size_t written; + int error; if (chars > total_len) chars = total_len; @@ -408,21 +312,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, break; } - atomic = !iov_fault_in_pages_write(iov, chars); -redo: - addr = ops->map(pipe, buf, atomic); - error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); - ops->unmap(pipe, buf, addr); - if (unlikely(error)) { - /* - * Just retry with the slow path if we failed. - */ - if (atomic) { - atomic = 0; - goto redo; - } + written = copy_page_to_iter(buf->page, buf->offset, chars, &iter); + if (unlikely(written < chars)) { if (!ret) - ret = error; + ret = -EFAULT; break; } ret += chars; @@ -538,10 +431,16 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, iov_fault_in_pages_read(iov, chars); redo1: - addr = ops->map(pipe, buf, atomic); + if (atomic) + addr = kmap_atomic(buf->page); + else + addr = kmap(buf->page); error = pipe_iov_copy_from_user(offset + addr, iov, chars, atomic); - ops->unmap(pipe, buf, addr); + if (atomic) + kunmap_atomic(addr); + else + kunmap(buf->page); ret = error; do_wakeup = 1; if (error) { diff --git a/fs/pnode.c b/fs/pnode.c index 88396df725b4..302bf22c4a30 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -164,46 +164,94 @@ static struct mount *propagation_next(struct mount *m, } } -/* - * return the source mount to be used for cloning - * - * @dest the current destination mount - * @last_dest the last seen destination mount - * @last_src the last seen source mount - * @type return CL_SLAVE if the new mount has to be - * cloned as a slave. - */ -static struct mount *get_source(struct mount *dest, - struct mount *last_dest, - struct mount *last_src, - int *type) +static struct mount *next_group(struct mount *m, struct mount *origin) { - struct mount *p_last_src = NULL; - struct mount *p_last_dest = NULL; - - while (last_dest != dest->mnt_master) { - p_last_dest = last_dest; - p_last_src = last_src; - last_dest = last_dest->mnt_master; - last_src = last_src->mnt_master; + while (1) { + while (1) { + struct mount *next; + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + return first_slave(m); + next = next_peer(m); + if (m->mnt_group_id == origin->mnt_group_id) { + if (next == origin) + return NULL; + } else if (m->mnt_slave.next != &next->mnt_slave) + break; + m = next; + } + /* m is the last peer */ + while (1) { + struct mount *master = m->mnt_master; + if (m->mnt_slave.next != &master->mnt_slave_list) + return next_slave(m); + m = next_peer(master); + if (master->mnt_group_id == origin->mnt_group_id) + break; + if (master->mnt_slave.next == &m->mnt_slave) + break; + m = master; + } + if (m == origin) + return NULL; } +} - if (p_last_dest) { - do { - p_last_dest = next_peer(p_last_dest); - } while (IS_MNT_NEW(p_last_dest)); - /* is that a peer of the earlier? */ - if (dest == p_last_dest) { - *type = CL_MAKE_SHARED; - return p_last_src; +/* all accesses are serialized by namespace_sem */ +static struct user_namespace *user_ns; +static struct mount *last_dest, *last_source, *dest_master; +static struct mountpoint *mp; +static struct hlist_head *list; + +static int propagate_one(struct mount *m) +{ + struct mount *child; + int type; + /* skip ones added by this propagate_mnt() */ + if (IS_MNT_NEW(m)) + return 0; + /* skip if mountpoint isn't covered by it */ + if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) + return 0; + if (m->mnt_group_id == last_dest->mnt_group_id) { + type = CL_MAKE_SHARED; + } else { + struct mount *n, *p; + for (n = m; ; n = p) { + p = n->mnt_master; + if (p == dest_master || IS_MNT_MARKED(p)) { + while (last_dest->mnt_master != p) { + last_source = last_source->mnt_master; + last_dest = last_source->mnt_parent; + } + if (n->mnt_group_id != last_dest->mnt_group_id) { + last_source = last_source->mnt_master; + last_dest = last_source->mnt_parent; + } + break; + } } + type = CL_SLAVE; + /* beginning of peer group among the slaves? */ + if (IS_MNT_SHARED(m)) + type |= CL_MAKE_SHARED; } - /* slave of the earlier, then */ - *type = CL_SLAVE; - /* beginning of peer group among the slaves? */ - if (IS_MNT_SHARED(dest)) - *type |= CL_MAKE_SHARED; - return last_src; + + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(last_source, last_source->mnt.mnt_root, type); + if (IS_ERR(child)) + return PTR_ERR(child); + mnt_set_mountpoint(m, mp, child); + last_dest = m; + last_source = child; + if (m->mnt_master != dest_master) { + read_seqlock_excl(&mount_lock); + SET_MNT_MARK(m->mnt_master); + read_sequnlock_excl(&mount_lock); + } + hlist_add_head(&child->mnt_hash, list); + return 0; } /* @@ -222,56 +270,48 @@ static struct mount *get_source(struct mount *dest, int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, struct mount *source_mnt, struct hlist_head *tree_list) { - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; - struct mount *m, *child; + struct mount *m, *n; int ret = 0; - struct mount *prev_dest_mnt = dest_mnt; - struct mount *prev_src_mnt = source_mnt; - HLIST_HEAD(tmp_list); - - for (m = propagation_next(dest_mnt, dest_mnt); m; - m = propagation_next(m, dest_mnt)) { - int type; - struct mount *source; - - if (IS_MNT_NEW(m)) - continue; - - source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); - - /* Notice when we are propagating across user namespaces */ - if (m->mnt_ns->user_ns != user_ns) - type |= CL_UNPRIVILEGED; - - child = copy_tree(source, source->mnt.mnt_root, type); - if (IS_ERR(child)) { - ret = PTR_ERR(child); - tmp_list = *tree_list; - tmp_list.first->pprev = &tmp_list.first; - INIT_HLIST_HEAD(tree_list); + + /* + * we don't want to bother passing tons of arguments to + * propagate_one(); everything is serialized by namespace_sem, + * so globals will do just fine. + */ + user_ns = current->nsproxy->mnt_ns->user_ns; + last_dest = dest_mnt; + last_source = source_mnt; + mp = dest_mp; + list = tree_list; + dest_master = dest_mnt->mnt_master; + + /* all peers of dest_mnt, except dest_mnt itself */ + for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { + ret = propagate_one(n); + if (ret) goto out; - } + } - if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { - mnt_set_mountpoint(m, dest_mp, child); - hlist_add_head(&child->mnt_hash, tree_list); - } else { - /* - * This can happen if the parent mount was bind mounted - * on some subdirectory of a shared/slave mount. - */ - hlist_add_head(&child->mnt_hash, &tmp_list); - } - prev_dest_mnt = m; - prev_src_mnt = child; + /* all slave groups */ + for (m = next_group(dest_mnt, dest_mnt); m; + m = next_group(m, dest_mnt)) { + /* everything in that slave group */ + n = m; + do { + ret = propagate_one(n); + if (ret) + goto out; + n = next_peer(n); + } while (n != m); } out: - lock_mount_hash(); - while (!hlist_empty(&tmp_list)) { - child = hlist_entry(tmp_list.first, struct mount, mnt_hash); - umount_tree(child, 0); + read_seqlock_excl(&mount_lock); + hlist_for_each_entry(n, tree_list, mnt_hash) { + m = n->mnt_parent; + if (m->mnt_master != dest_mnt->mnt_master) + CLEAR_MNT_MARK(m->mnt_master); } - unlock_mount_hash(); + read_sequnlock_excl(&mount_lock); return ret; } diff --git a/fs/pnode.h b/fs/pnode.h index fc28a27fa892..4a246358b031 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -16,6 +16,9 @@ #define IS_MNT_NEW(m) (!(m)->mnt_ns) #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) +#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) +#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) +#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 diff --git a/fs/proc/base.c b/fs/proc/base.c index 6b7087e2e8fb..2d696b0c93bf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -200,41 +200,9 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static int proc_pid_cmdline(struct task_struct *task, char * buffer) +static int proc_pid_cmdline(struct task_struct *task, char *buffer) { - int res = 0; - unsigned int len; - struct mm_struct *mm = get_task_mm(task); - if (!mm) - goto out; - if (!mm->arg_end) - goto out_mm; /* Shh! No looking before we're done */ - - len = mm->arg_end - mm->arg_start; - - if (len > PAGE_SIZE) - len = PAGE_SIZE; - - res = access_process_vm(task, mm->arg_start, buffer, len, 0); - - // If the nul at the end of args has been overwritten, then - // assume application is using setproctitle(3). - if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) { - len = strnlen(buffer, res); - if (len < res) { - res = len; - } else { - len = mm->env_end - mm->env_start; - if (len > PAGE_SIZE - res) - len = PAGE_SIZE - res; - res += access_process_vm(task, mm->env_start, buffer+res, len, 0); - res = strnlen(buffer, res); - } - } -out_mm: - mmput(mm); -out: - return res; + return get_cmdline(task, buffer, PAGE_SIZE); } static int proc_pid_auxv(struct task_struct *task, char *buffer) diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 9ae46b87470d..89026095f2b5 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -146,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl struct task_struct *task; void *ns; char name[50]; - int len = -EACCES; + int res = -EACCES; task = get_proc_task(inode); if (!task) @@ -155,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - len = -ENOENT; + res = -ENOENT; ns = ns_ops->get(task); if (!ns) goto out_put_task; snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); - len = strlen(name); - - if (len > buflen) - len = buflen; - if (copy_to_user(buffer, name, len)) - len = -EFAULT; - + res = readlink_copy(buffer, buflen, name); ns_ops->put(ns); out_put_task: put_task_struct(task); out: - return len; + return res; } static const struct inode_operations proc_ns_link_inode_operations = { diff --git a/fs/proc/self.c b/fs/proc/self.c index ffeb202ec942..4348bb8907c2 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, if (!tgid) return -ENOENT; sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); + return readlink_copy(buffer, buflen, tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 7be26f03a3f5..1a81373947f3 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -267,6 +267,7 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->root = root; p->m.poll_event = ns->event; p->show = show; + p->cached_event = ~0ULL; return 0; diff --git a/fs/splice.c b/fs/splice.c index 12028fa41def..9bc07d2b53cf 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -136,8 +136,6 @@ error: const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, @@ -156,8 +154,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, @@ -547,8 +543,6 @@ EXPORT_SYMBOL(generic_file_splice_read); static const struct pipe_buf_operations default_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -564,8 +558,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_nosteal, @@ -767,13 +759,13 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, goto out; if (buf->page != page) { - char *src = buf->ops->map(pipe, buf, 1); + char *src = kmap_atomic(buf->page); char *dst = kmap_atomic(page); memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); kunmap_atomic(dst); - buf->ops->unmap(pipe, buf, src); + kunmap_atomic(src); } ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, page, fsdata); @@ -1067,9 +1059,9 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, void *data; loff_t tmp = sd->pos; - data = buf->ops->map(pipe, buf, 0); + data = kmap(buf->page); ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); - buf->ops->unmap(pipe, buf, data); + kunmap(buf->page); return ret; } @@ -1528,116 +1520,48 @@ static int get_iovec_page_array(const struct iovec __user *iov, static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - char *src; - int ret; - - /* - * See if we can use the atomic maps, by prefaulting in the - * pages and doing an atomic copy - */ - if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { - src = buf->ops->map(pipe, buf, 1); - ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, - sd->len); - buf->ops->unmap(pipe, buf, src); - if (!ret) { - ret = sd->len; - goto out; - } - } - - /* - * No dice, use slow non-atomic map and copy - */ - src = buf->ops->map(pipe, buf, 0); - - ret = sd->len; - if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) - ret = -EFAULT; - - buf->ops->unmap(pipe, buf, src); -out: - if (ret > 0) - sd->u.userptr += ret; - return ret; + int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); + return n == sd->len ? n : -EFAULT; } /* * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ -static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, +static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; struct splice_desc sd; - ssize_t size; - int error; long ret; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t count = 0; pipe = get_pipe_info(file); if (!pipe) return -EBADF; - pipe_lock(pipe); - - error = ret = 0; - while (nr_segs) { - void __user *base; - size_t len; - - /* - * Get user address base and length for this iovec. - */ - error = get_user(base, &iov->iov_base); - if (unlikely(error)) - break; - error = get_user(len, &iov->iov_len); - if (unlikely(error)) - break; - - /* - * Sanity check this iovec. 0 read succeeds. - */ - if (unlikely(!len)) - break; - if (unlikely(!base)) { - error = -EFAULT; - break; - } - - if (unlikely(!access_ok(VERIFY_WRITE, base, len))) { - error = -EFAULT; - break; - } - - sd.len = 0; - sd.total_len = len; - sd.flags = flags; - sd.u.userptr = base; - sd.pos = 0; - - size = __splice_from_pipe(pipe, &sd, pipe_to_user); - if (size < 0) { - if (!ret) - ret = size; - - break; - } - - ret += size; + ret = rw_copy_check_uvector(READ, uiov, nr_segs, + ARRAY_SIZE(iovstack), iovstack, &iov); + if (ret <= 0) + return ret; - if (size < len) - break; + iov_iter_init(&iter, iov, nr_segs, count, 0); - nr_segs--; - iov++; - } + sd.len = 0; + sd.total_len = count; + sd.flags = flags; + sd.u.data = &iter; + sd.pos = 0; + pipe_lock(pipe); + ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); - if (!ret) - ret = error; + if (iov != iovstack) + kfree(iov); return ret; } diff --git a/fs/super.c b/fs/super.c index e9dc3c3fe159..48377f7463c0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -800,7 +800,10 @@ void emergency_remount(void) static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ -static int unnamed_dev_start = 0; /* don't bother trying below it */ +/* Many userspace utilities consider an FSID of 0 invalid. + * Always return at least 1 from get_anon_bdev. + */ +static int unnamed_dev_start = 1; int get_anon_bdev(dev_t *p) { diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1b8b91b67fdb..28cc1acd5439 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -453,95 +453,3 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); - -struct sysfs_schedule_callback_struct { - struct list_head workq_list; - struct kobject *kobj; - void (*func)(void *); - void *data; - struct module *owner; - struct work_struct work; -}; - -static struct workqueue_struct *sysfs_workqueue; -static DEFINE_MUTEX(sysfs_workq_mutex); -static LIST_HEAD(sysfs_workq); -static void sysfs_schedule_callback_work(struct work_struct *work) -{ - struct sysfs_schedule_callback_struct *ss = container_of(work, - struct sysfs_schedule_callback_struct, work); - - (ss->func)(ss->data); - kobject_put(ss->kobj); - module_put(ss->owner); - mutex_lock(&sysfs_workq_mutex); - list_del(&ss->workq_list); - mutex_unlock(&sysfs_workq_mutex); - kfree(ss); -} - -/** - * sysfs_schedule_callback - helper to schedule a callback for a kobject - * @kobj: object we're acting for. - * @func: callback function to invoke later. - * @data: argument to pass to @func. - * @owner: module owning the callback code - * - * sysfs attribute methods must not unregister themselves or their parent - * kobject (which would amount to the same thing). Attempts to do so will - * deadlock, since unregistration is mutually exclusive with driver - * callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @data as its - * argument in the workqueue's process context. @kobj will be pinned - * until @func returns. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available, - * -EAGAIN if a callback has already been scheduled for @kobj. - */ -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner) -{ - struct sysfs_schedule_callback_struct *ss, *tmp; - - if (!try_module_get(owner)) - return -ENODEV; - - mutex_lock(&sysfs_workq_mutex); - list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) - if (ss->kobj == kobj) { - module_put(owner); - mutex_unlock(&sysfs_workq_mutex); - return -EAGAIN; - } - mutex_unlock(&sysfs_workq_mutex); - - if (sysfs_workqueue == NULL) { - sysfs_workqueue = create_singlethread_workqueue("sysfsd"); - if (sysfs_workqueue == NULL) { - module_put(owner); - return -ENOMEM; - } - } - - ss = kmalloc(sizeof(*ss), GFP_KERNEL); - if (!ss) { - module_put(owner); - return -ENOMEM; - } - kobject_get(kobj); - ss->kobj = kobj; - ss->func = func; - ss->data = data; - ss->owner = owner; - INIT_WORK(&ss->work, sysfs_schedule_callback_work); - INIT_LIST_HEAD(&ss->workq_list); - mutex_lock(&sysfs_workq_mutex); - list_add_tail(&ss->workq_list, &sysfs_workq); - mutex_unlock(&sysfs_workq_mutex); - queue_work(sysfs_workqueue, &ss->work); - return 0; -} -EXPORT_SYMBOL_GPL(sysfs_schedule_callback); diff --git a/fs/udf/file.c b/fs/udf/file.c index 1037637957c7..d2c170f8b035 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -171,7 +171,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } else up_write(&iinfo->i_data_sem); - retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + retval = __generic_file_aio_write(iocb, iov, nr_segs); mutex_unlock(&inode->i_mutex); if (retval > 0) { diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 75df77d09f75..0479c32c5eb1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1344,6 +1344,14 @@ __xfs_get_blocks( /* * If this is O_DIRECT or the mpage code calling tell them how large * the mapping is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the + * mapping for blocks beyond EOF must be marked new so that sub block + * regions can be correctly zeroed. We can't do this for mappings within + * EOF unless the mapping was just allocated or is unwritten, otherwise + * the callers would overwrite existing data with zeros. Hence we have + * to split the mapping into a range up to and including EOF, and a + * second mapping for beyond EOF. */ if (direct || size > (1 << inode->i_blkbits)) { xfs_off_t mapping_size; @@ -1354,6 +1362,12 @@ __xfs_get_blocks( ASSERT(mapping_size > 0); if (mapping_size > size) mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; @@ -1566,6 +1580,16 @@ xfs_vm_write_failed( xfs_vm_kill_delalloc_range(inode, block_offset, block_offset + bh->b_size); + + /* + * This buffer does not contain data anymore. make sure anyone + * who finds it knows that for certain. + */ + clear_buffer_delay(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_dirty(bh); } } @@ -1599,12 +1623,21 @@ xfs_vm_write_begin( status = __block_write_begin(page, pos, len, xfs_get_blocks); if (unlikely(status)) { struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); xfs_vm_write_failed(inode, page, pos, len); unlock_page(page); - if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, i_size_read(inode)); + /* + * If the write is beyond EOF, we only want to kill blocks + * allocated in this write, not blocks that were previously + * written successfully. + */ + if (pos + len > isize) { + ssize_t start = max_t(ssize_t, pos, isize); + + truncate_pagecache_range(inode, start, pos + len); + } page_cache_release(page); page = NULL; @@ -1615,9 +1648,12 @@ xfs_vm_write_begin( } /* - * On failure, we only need to kill delalloc blocks beyond EOF because they - * will never be written. For blocks within EOF, generic_write_end() zeros them - * so they are safe to leave alone and be written with all the other valid data. + * On failure, we only need to kill delalloc blocks beyond EOF in the range of + * this specific write because they will never be written. Previous writes + * beyond EOF where block allocation succeeded do not need to be trashed, so + * only new blocks from this write should be trashed. For blocks within + * EOF, generic_write_end() zeros them so they are safe to leave alone and be + * written with all the other valid data. */ STATIC int xfs_vm_write_end( @@ -1640,8 +1676,11 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, isize); + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); } } return ret; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 5b6092ef51ef..f0efc7e970ef 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5413,6 +5413,7 @@ xfs_bmap_shift_extents( int whichfork = XFS_DATA_FORK; int logflags; xfs_filblks_t blockcount = 0; + int total_extents; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5429,7 +5430,6 @@ xfs_bmap_shift_extents( ASSERT(current_ext != NULL); ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); @@ -5456,7 +5456,6 @@ xfs_bmap_shift_extents( /* We are going to change core inode */ logflags = XFS_ILOG_CORE; - if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; @@ -5467,8 +5466,14 @@ xfs_bmap_shift_extents( logflags |= XFS_ILOG_DEXT; } - while (nexts++ < num_exts && - *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) { + /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot + * use the count of real extents here. Instead we have to calculate it + * from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + while (nexts++ < num_exts && *current_ext < total_extents) { gotp = xfs_iext_get_ext(ifp, *current_ext); xfs_bmbt_get_all(gotp, &got); @@ -5556,10 +5561,11 @@ xfs_bmap_shift_extents( } (*current_ext)++; + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); } /* Check if we are done */ - if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork)) + if (*current_ext == total_extents) *done = 1; del_cursor: @@ -5568,6 +5574,5 @@ del_cursor: error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_trans_log_inode(tp, ip, logflags); - return error; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 01f6a646caa1..296160b8e78c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1418,6 +1418,8 @@ xfs_zero_file_space( xfs_off_t end_boundary; int error; + trace_xfs_zero_file_space(ip); + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); /* @@ -1432,9 +1434,18 @@ xfs_zero_file_space( ASSERT(end_boundary <= offset + len); if (start_boundary < end_boundary - 1) { - /* punch out the page cache over the conversion range */ + /* + * punch out delayed allocation blocks and the page cache over + * the conversion range + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, start_boundary), + XFS_B_TO_FSB(mp, end_boundary - start_boundary)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); truncate_pagecache_range(VFS_I(ip), start_boundary, end_boundary - 1); + /* convert the blocks */ error = xfs_alloc_file_space(ip, start_boundary, end_boundary - start_boundary - 1, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 107f2fdfe41f..cb10a0aaab3a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1372,21 +1372,29 @@ xfs_buf_iorequest( xfs_buf_wait_unpin(bp); xfs_buf_hold(bp); - /* Set the count to 1 initially, this will stop an I/O + /* + * Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started * all the I/O from calling xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 1); + /* + * If _xfs_buf_ioapply failed, we'll get back here with + * only the reference we took above. _xfs_buf_ioend will + * drop it to zero, so we'd better not queue it for later, + * or we'll free it before it's done. + */ + _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); xfs_buf_rele(bp); } /* * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer. It - * returns the I/O error code, if any, or 0 if there was no error. + * no I/O is pending or there is already a pending error on the buffer, in which + * case nothing will ever complete. It returns the I/O error code, if any, or + * 0 if there was no error. */ int xfs_buf_iowait( diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 003c0051b62f..82afdcb33183 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -679,7 +679,7 @@ xfs_file_dio_aio_write( goto out; if (mapping->nrpages) { - ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, pos, -1); if (ret) goto out; @@ -699,7 +699,7 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, count, ocount); + &nr_segs, pos, count, ocount); out: xfs_rw_iunlock(ip, iolock); @@ -715,7 +715,7 @@ xfs_file_buffered_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount) + size_t count) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -724,7 +724,7 @@ xfs_file_buffered_aio_write( ssize_t ret; int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - size_t count = ocount; + struct iov_iter from; xfs_rw_ilock(ip, iolock); @@ -732,14 +732,15 @@ xfs_file_buffered_aio_write( if (ret) goto out; + iov_iter_init(&from, iovp, nr_segs, count, 0); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, 0); - + ret = generic_perform_write(file, &from, pos); + if (likely(ret >= 0)) + iocb->ki_pos = pos + ret; /* * If we just got an ENOSPC, try to write back all dirty inodes to * convert delalloc space to free up some of the excess reserved diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5e7a38fa6ee6..768087bedbac 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1334,7 +1334,8 @@ int xfs_create_tmpfile( struct xfs_inode *dp, struct dentry *dentry, - umode_t mode) + umode_t mode, + struct xfs_inode **ipp) { struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; @@ -1402,7 +1403,6 @@ xfs_create_tmpfile( xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); ip->i_d.di_nlink--; - d_tmpfile(dentry, VFS_I(ip)); error = xfs_iunlink(tp, ip); if (error) goto out_trans_abort; @@ -1415,6 +1415,7 @@ xfs_create_tmpfile( xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); + *ipp = ip; return 0; out_trans_abort: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 396cc1fafd0d..f2fcde52b66d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -334,7 +334,7 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, - umode_t mode); + umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bcfe61202115..0b18776b075e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -271,32 +271,6 @@ xfs_open_by_handle( return error; } -/* - * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's - * unused first argument. - */ -STATIC int -do_readlink( - char __user *buffer, - int buflen, - const char *link) -{ - int len; - - len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; - - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; - out: - return len; -} - - int xfs_readlink_by_handle( struct file *parfilp, @@ -334,7 +308,7 @@ xfs_readlink_by_handle( error = -xfs_readlink(XFS_I(dentry->d_inode), link); if (error) goto out_kfree; - error = do_readlink(hreq->ohandle, olen, link); + error = readlink_copy(hreq->ohandle, olen, link); if (error) goto out_kfree; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 89b07e43ca28..ef1ca010f417 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1053,11 +1053,25 @@ xfs_vn_tmpfile( struct dentry *dentry, umode_t mode) { - int error; + int error; + struct xfs_inode *ip; + struct inode *inode; - error = xfs_create_tmpfile(XFS_I(dir), dentry, mode); + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + if (unlikely(error)) + return -error; - return -error; + inode = VFS_I(ip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) { + iput(inode); + return -error; + } + + d_tmpfile(dentry, inode); + + return 0; } static const struct inode_operations xfs_inode_operations = { diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8497a00e399d..08624dc67317 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1181,11 +1181,14 @@ xlog_iodone(xfs_buf_t *bp) /* log I/O is always issued ASYNC */ ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); + /* - * do not reference the buffer (bp) here as we could race - * with it being freed after writing the unmount record to the - * log. + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. */ + xfs_buf_unlock(bp); } /* @@ -1368,8 +1371,16 @@ xlog_alloc_log( bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; - bp->b_iodone = xlog_iodone; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + bp->b_iodone = xlog_iodone; log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1398,6 +1409,9 @@ xlog_alloc_log( if (!bp) goto out_free_iclog; + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1422,7 +1436,6 @@ xlog_alloc_log( iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1631,6 +1644,12 @@ xlog_cksum( * we transition the iclogs to IOERROR state *after* flushing all existing * iclogs to disk. This is because we don't want anymore new transactions to be * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. */ STATIC int xlog_bdstrat( @@ -1638,6 +1657,7 @@ xlog_bdstrat( { struct xlog_in_core *iclog = bp->b_fspriv; + xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); xfs_buf_stale(bp); @@ -1645,7 +1665,8 @@ xlog_bdstrat( /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. */ return 0; } @@ -1847,14 +1868,28 @@ xlog_dealloc_log( xlog_cil_destroy(log); /* - * always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. + */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; - for (i=0; i<log->l_iclog_bufs; i++) { + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a4ae41c179a8..65d8c793a25c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink); DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); DEFINE_INODE_EVENT(xfs_collapse_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL |