diff options
Diffstat (limited to 'fs')
67 files changed, 1613 insertions, 917 deletions
diff --git a/fs/affs/super.c b/fs/affs/super.c index 3f89c9e05b40..5b50c4ca43a7 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -18,6 +18,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/writeback.h> +#include <linux/blkdev.h> #include "affs.h" static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); @@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) * blocks, we will have to change it. */ - size = sb->s_bdev->bd_inode->i_size >> 9; + size = i_size_read(sb->s_bdev->bd_inode) >> 9; pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size); affs_set_blocksize(sb, PAGE_SIZE); /* Try to find root block. Its location depends on the block size. */ - i = 512; - j = 4096; + i = bdev_logical_block_size(sb->s_bdev); + j = PAGE_SIZE; if (blocksize > 0) { i = j = blocksize; size = size / (blocksize / 512); } + for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) { sbi->s_root_block = root_block; if (root_block < 0) diff --git a/fs/block_dev.c b/fs/block_dev.c index 33b813e04f79..22ea424ee741 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -28,6 +28,7 @@ #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> +#include <linux/dax.h> #include <asm/uaccess.h> #include "internal.h" @@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page); * accessible at this address. */ long bdev_direct_access(struct block_device *bdev, sector_t sector, - void **addr, unsigned long *pfn, long size) + void __pmem **addr, unsigned long *pfn, long size) { long avail; const struct block_device_operations *ops = bdev->bd_disk->fops; @@ -462,7 +463,7 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector, sector += get_start_sect(bdev); if (sector % (PAGE_SIZE / 512)) return -EINVAL; - avail = ops->direct_access(bdev, sector, addr, pfn, size); + avail = ops->direct_access(bdev, sector, addr, pfn); if (!avail) return -ERANGE; return min(avail, size); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 1ce06c849a86..3e36e4adc4a3 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -42,8 +42,14 @@ struct __btrfs_workqueue { /* Thresholding related variants */ atomic_t pending; - int max_active; - int current_max; + + /* Up limit of concurrency workers */ + int limit_active; + + /* Current number of concurrency workers */ + int current_active; + + /* Threshold to change current_active */ int thresh; unsigned int count; spinlock_t thres_lock; @@ -88,7 +94,7 @@ BTRFS_WORK_HELPER(scrubnc_helper); BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, +__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -96,26 +102,31 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, if (!ret) return NULL; - ret->max_active = max_active; + ret->limit_active = limit_active; atomic_set(&ret->pending, 0); if (thresh == 0) thresh = DFT_THRESHOLD; /* For low threshold, disabling threshold is a better choice */ if (thresh < DFT_THRESHOLD) { - ret->current_max = max_active; + ret->current_active = limit_active; ret->thresh = NO_THRESHOLD; } else { - ret->current_max = 1; + /* + * For threshold-able wq, let its concurrency grow on demand. + * Use minimal max_active at alloc time to reduce resource + * usage. + */ + ret->current_active = 1; ret->thresh = thresh; } if (flags & WQ_HIGHPRI) ret->normal_wq = alloc_workqueue("%s-%s-high", flags, - ret->max_active, - "btrfs", name); + ret->current_active, "btrfs", + name); else ret->normal_wq = alloc_workqueue("%s-%s", flags, - ret->max_active, "btrfs", + ret->current_active, "btrfs", name); if (!ret->normal_wq) { kfree(ret); @@ -134,7 +145,7 @@ __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, unsigned int flags, - int max_active, + int limit_active, int thresh) { struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -143,14 +154,14 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, return NULL; ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, - max_active, thresh); + limit_active, thresh); if (!ret->normal) { kfree(ret); return NULL; } if (flags & WQ_HIGHPRI) { - ret->high = __btrfs_alloc_workqueue(name, flags, max_active, + ret->high = __btrfs_alloc_workqueue(name, flags, limit_active, thresh); if (!ret->high) { __btrfs_destroy_workqueue(ret->normal); @@ -180,7 +191,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) */ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) { - int new_max_active; + int new_current_active; long pending; int need_change = 0; @@ -197,7 +208,7 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) wq->count %= (wq->thresh / 4); if (!wq->count) goto out; - new_max_active = wq->current_max; + new_current_active = wq->current_active; /* * pending may be changed later, but it's OK since we really @@ -205,19 +216,19 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) */ pending = atomic_read(&wq->pending); if (pending > wq->thresh) - new_max_active++; + new_current_active++; if (pending < wq->thresh / 2) - new_max_active--; - new_max_active = clamp_val(new_max_active, 1, wq->max_active); - if (new_max_active != wq->current_max) { + new_current_active--; + new_current_active = clamp_val(new_current_active, 1, wq->limit_active); + if (new_current_active != wq->current_active) { need_change = 1; - wq->current_max = new_max_active; + wq->current_active = new_current_active; } out: spin_unlock(&wq->thres_lock); if (need_change) { - workqueue_set_max_active(wq->normal_wq, wq->current_max); + workqueue_set_max_active(wq->normal_wq, wq->current_active); } } @@ -351,13 +362,13 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) kfree(wq); } -void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max) +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active) { if (!wq) return; - wq->normal->max_active = max; + wq->normal->limit_active = limit_active; if (wq->high) - wq->high->max_active = max; + wq->high->limit_active = limit_active; } void btrfs_set_work_high_priority(struct btrfs_work *work) diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index b0b093b6afec..ad4d0647d1a6 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -69,7 +69,7 @@ BTRFS_WORK_HELPER_PROTO(scrubparity_helper); struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, unsigned int flags, - int max_active, + int limit_active, int thresh); void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, btrfs_func_t func, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 564a7de17d99..e54dd5905cee 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -183,8 +183,7 @@ no_valid_dev_replace_entry_found: } out: - if (path) - btrfs_free_path(path); + btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9ebd34f1c677..0d98aee34fee 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3443,6 +3443,26 @@ static int barrier_all_devices(struct btrfs_fs_info *info) return 0; } +int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) +{ + if ((flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_AVAIL_ALLOC_BIT_SINGLE)) || + ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)) + return 0; + + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID10)) + return 1; + + if (flags & BTRFS_BLOCK_GROUP_RAID6) + return 2; + + pr_warn("BTRFS: unknown raid type: %llu\n", flags); + return 0; +} + int btrfs_calc_num_tolerated_disk_barrier_failures( struct btrfs_fs_info *fs_info) { @@ -3452,13 +3472,12 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA, BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; - int num_types = 4; int i; int c; int num_tolerated_disk_barrier_failures = (int)fs_info->fs_devices->num_devices; - for (i = 0; i < num_types; i++) { + for (i = 0; i < ARRAY_SIZE(types); i++) { struct btrfs_space_info *tmp; sinfo = NULL; @@ -3476,44 +3495,21 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( down_read(&sinfo->groups_sem); for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { - if (!list_empty(&sinfo->block_groups[c])) { - u64 flags; - - btrfs_get_block_group_info( - &sinfo->block_groups[c], &space); - if (space.total_bytes == 0 || - space.used_bytes == 0) - continue; - flags = space.flags; - /* - * return - * 0: if dup, single or RAID0 is configured for - * any of metadata, system or data, else - * 1: if RAID5 is configured, or if RAID1 or - * RAID10 is configured and only two mirrors - * are used, else - * 2: if RAID6 is configured, else - * num_mirrors - 1: if RAID1 or RAID10 is - * configured and more than - * 2 mirrors are used. - */ - if (num_tolerated_disk_barrier_failures > 0 && - ((flags & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID0)) || - ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) - == 0))) - num_tolerated_disk_barrier_failures = 0; - else if (num_tolerated_disk_barrier_failures > 1) { - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID10)) { - num_tolerated_disk_barrier_failures = 1; - } else if (flags & - BTRFS_BLOCK_GROUP_RAID6) { - num_tolerated_disk_barrier_failures = 2; - } - } - } + u64 flags; + + if (list_empty(&sinfo->block_groups[c])) + continue; + + btrfs_get_block_group_info(&sinfo->block_groups[c], + &space); + if (space.total_bytes == 0 || space.used_bytes == 0) + continue; + flags = space.flags; + + num_tolerated_disk_barrier_failures = min( + num_tolerated_disk_barrier_failures, + btrfs_get_num_tolerated_disk_barrier_failures( + flags)); } up_read(&sinfo->groups_sem); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index d4cbfeeeedd4..bdfb479ea859 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -139,6 +139,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); +int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_calc_num_tolerated_disk_barrier_failures( struct btrfs_fs_info *fs_info); int __init btrfs_end_io_wq_init(void); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 237da012f7d0..a0fa7253a2d7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6909,8 +6909,7 @@ out: trace_btrfs_get_extent(root, em); - if (path) - btrfs_free_path(path); + btrfs_free_path(path); if (trans) { ret = btrfs_end_transaction(trans, root); if (!err) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9a11db0c47ee..a39f5d1144e8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3267,13 +3267,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, scrub_blocked_if_needed(fs_info); } - /* for raid56, we skip parity stripe */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = get_raid56_logic_offset(physical, num, map, &logical, &stripe_logical); logical += base; if (ret) { + /* it is parity strip */ stripe_logical += base; stripe_end = stripe_logical + increment; ret = scrub_raid56_parity(sctx, map, scrub_dev, @@ -3480,7 +3480,6 @@ out: static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, u64 dev_offset, int is_dev_replace) { @@ -3531,8 +3530,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_root *root = sctx->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u64 length; - u64 chunk_tree; - u64 chunk_objectid; u64 chunk_offset; int ret = 0; int slot; @@ -3596,8 +3593,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (found_key.offset + length <= start) goto skip; - chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); - chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); /* @@ -3630,9 +3625,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; - ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, - chunk_offset, length, found_key.offset, - is_dev_replace); + ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, + found_key.offset, is_dev_replace); /* * flush, submit all pending read and write bios, afterwards diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index a4b9c8b2d35a..f31db4325339 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -115,8 +115,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, ret = -EAGAIN; } out: - if (path) - btrfs_free_path(path); + btrfs_free_path(path); if (ret == -EAGAIN) { if (root->defrag_max.objectid > root->defrag_progress.objectid) goto done; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 76201d6f6ce4..6fc735869c18 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3585,23 +3585,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } while (read_seqretry(&fs_info->profiles_lock, seq)); if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - int num_tolerated_disk_barrier_failures; - u64 target = bctl->sys.target; - - num_tolerated_disk_barrier_failures = - btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); - if (num_tolerated_disk_barrier_failures > 0 && - (target & - (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_AVAIL_ALLOC_BIT_SINGLE))) - num_tolerated_disk_barrier_failures = 0; - else if (num_tolerated_disk_barrier_failures > 1 && - (target & - (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) - num_tolerated_disk_barrier_failures = 1; - - fs_info->num_tolerated_disk_barrier_failures = - num_tolerated_disk_barrier_failures; + fs_info->num_tolerated_disk_barrier_failures = min( + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info), + btrfs_get_num_tolerated_disk_barrier_failures( + bctl->sys.target)); } ret = insert_balance_item(fs_info->tree_root, bctl); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 890c50971a69..9d23e788d1df 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; - if (rc < 0) + if (rc < 0 && rc != ENOENT) goto unlock; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ @@ -717,8 +717,10 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { pr_warn("writepage_start %p on forced umount\n", inode); + truncate_pagecache(inode, 0); + mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ } if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) @@ -1593,7 +1595,7 @@ out: return err; } -static struct vm_operations_struct ceph_vmops = { +static const struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, }; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ddd5e9471290..27b566874bc1 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2413,6 +2413,14 @@ again: goto out_unlock; } + if (!__ceph_is_any_caps(ci) && + ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout("get_cap_refs %p forced umount\n", inode); + *err = -EIO; + ret = 1; + goto out_unlock; + } + dout("get_cap_refs %p have %s needed %s\n", inode, ceph_cap_string(have), ceph_cap_string(need)); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8b79d87eaf46..0c62868b5c56 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -136,7 +136,6 @@ int ceph_open(struct inode *inode, struct file *file) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; - struct inode *parent_inode = NULL; int err; int flags, fmode, wanted; @@ -210,10 +209,7 @@ int ceph_open(struct inode *inode, struct file *file) ihold(inode); req->r_num_caps = 1; - if (flags & O_CREAT) - parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); @@ -279,7 +275,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (err) goto out_req; - if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); if (d_unhashed(dentry)) { @@ -956,6 +952,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); + if (iocb->ki_flags & IOCB_APPEND) { + err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); + if (err < 0) + goto out; + } + err = generic_write_checks(iocb, from); if (err <= 0) goto out; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6aa07af67603..51cb02da75d9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2107,7 +2107,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, msg = create_request_message(mdsc, req, mds, drop_cap_releases); if (IS_ERR(msg)) { req->r_err = PTR_ERR(msg); - complete_request(mdsc, req); return PTR_ERR(msg); } req->r_request = msg; @@ -2135,7 +2134,7 @@ static int __do_request(struct ceph_mds_client *mdsc, { struct ceph_mds_session *session = NULL; int mds = -1; - int err = -EAGAIN; + int err = 0; if (req->r_err || req->r_got_result) { if (req->r_aborted) @@ -2149,6 +2148,11 @@ static int __do_request(struct ceph_mds_client *mdsc, err = -EIO; goto finish; } + if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout("do_request forced umount\n"); + err = -EIO; + goto finish; + } put_request_session(req); @@ -2196,13 +2200,15 @@ static int __do_request(struct ceph_mds_client *mdsc, out_session: ceph_put_mds_session(session); +finish: + if (err) { + dout("__do_request early error %d\n", err); + req->r_err = err; + complete_request(mdsc, req); + __unregister_request(mdsc, req); + } out: return err; - -finish: - req->r_err = err; - complete_request(mdsc, req); - goto out; } /* @@ -2289,8 +2295,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, if (req->r_err) { err = req->r_err; - __unregister_request(mdsc, req); - dout("do_request early error %d\n", err); goto out; } @@ -2411,7 +2415,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&mdsc->mutex); goto out; } - if (req->r_got_safe && !head->safe) { + if (req->r_got_safe) { pr_warn("got unsafe after safe on %llu from mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); @@ -2520,8 +2524,7 @@ out_err: if (err) { req->r_err = err; } else { - req->r_reply = msg; - ceph_msg_get(msg); + req->r_reply = ceph_msg_get(msg); req->r_got_result = true; } } else { @@ -3555,7 +3558,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush, want_snap; - if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) + if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return; dout("sync\n"); @@ -3584,7 +3587,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) */ static bool done_closing_sessions(struct ceph_mds_client *mdsc) { - if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) + if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return true; return atomic_read(&mdsc->num_sessions) == 0; } @@ -3643,6 +3646,34 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) dout("stopped\n"); } +void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_session *session; + int mds; + + dout("force umount\n"); + + mutex_lock(&mdsc->mutex); + for (mds = 0; mds < mdsc->max_sessions; mds++) { + session = __ceph_lookup_mds_session(mdsc, mds); + if (!session) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + __close_session(mdsc, session); + if (session->s_state == CEPH_MDS_SESSION_CLOSING) { + cleanup_session_requests(mdsc, session); + remove_session_caps(session); + } + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + kick_requests(mdsc, mds); + } + __wake_requests(mdsc, &mdsc->waiting_for_map); + mutex_unlock(&mdsc->mutex); +} + static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) { dout("stop\n"); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 762757e6cebf..f575eafe2261 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -366,6 +366,7 @@ extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, extern int ceph_mdsc_init(struct ceph_fs_client *fsc); extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); +extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc); extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 233d906aec02..4aa7122a8d38 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -338,12 +338,6 @@ static int build_snap_context(struct ceph_snap_realm *realm) return 0; } - if (num == 0 && realm->seq == ceph_empty_snapc->seq) { - ceph_get_snap_context(ceph_empty_snapc); - snapc = ceph_empty_snapc; - goto done; - } - /* alloc new snap context */ err = -ENOMEM; if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) @@ -381,7 +375,6 @@ static int build_snap_context(struct ceph_snap_realm *realm) realm->ino, realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); -done: ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; return 0; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 7b6bfcbf801c..f446afada328 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -708,6 +708,7 @@ static void ceph_umount_begin(struct super_block *sb) if (!fsc) return; fsc->mount_state = CEPH_MOUNT_SHUTDOWN; + ceph_mdsc_force_umount(fsc->mdsc); return; } diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h new file mode 100644 index 000000000000..0065256881d8 --- /dev/null +++ b/fs/cifs/cifs_ioctl.h @@ -0,0 +1,42 @@ +/* + * fs/cifs/cifs_ioctl.h + * + * Structure definitions for io control for cifs/smb3 + * + * Copyright (c) 2015 Steve French <steve.french@primarydata.com> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + */ + +struct smb_mnt_fs_info { + __u32 version; /* 0001 */ + __u16 protocol_id; + __u16 tcon_flags; + __u32 vol_serial_number; + __u32 vol_create_time; + __u32 share_caps; + __u32 share_flags; + __u32 sector_flags; + __u32 optimal_sector_size; + __u32 max_bytes_chunk; + __u32 fs_attributes; + __u32 max_path_component; + __u32 device_type; + __u32 device_characteristics; + __u32 maximal_access; + __u64 cifs_posix_caps; +} __packed; + +#define CIFS_IOCTL_MAGIC 0xCF +#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) +#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) +#define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index a782b22904e4..27aea110e923 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.06" +#define CIFS_VERSION "2.07" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 47b030da0781..f5b87303ce46 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2245,6 +2245,20 @@ typedef struct { #define FILE_DEVICE_VIRTUAL_DISK 0x00000024 #define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028 +/* Device Characteristics */ +#define FILE_REMOVABLE_MEDIA 0x00000001 +#define FILE_READ_ONLY_DEVICE 0x00000002 +#define FILE_FLOPPY_DISKETTE 0x00000004 +#define FILE_WRITE_ONCE_MEDIA 0x00000008 +#define FILE_REMOTE_DEVICE 0x00000010 +#define FILE_DEVICE_IS_MOUNTED 0x00000020 +#define FILE_VIRTUAL_VOLUME 0x00000040 +#define FILE_DEVICE_SECURE_OPEN 0x00000100 +#define FILE_CHARACTERISTIC_TS_DEVICE 0x00001000 +#define FILE_CHARACTERISTIC_WEBDAV_DEVICE 0x00002000 +#define FILE_PORTABLE_DEVICE 0x00004000 +#define FILE_DEVICE_ALLOW_APPCONTAINER_TRAVERSAL 0x00020000 + typedef struct { __le32 DeviceType; __le32 DeviceCharacteristics; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 672ef35c9f73..90b4f9f7de66 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -696,7 +696,9 @@ cifs_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(server, 1, CIFS_ECHO_OP); } @@ -1572,7 +1574,9 @@ cifs_readv_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &rdata->work); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(server, 1, 0); } @@ -2032,6 +2036,7 @@ cifs_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; @@ -2068,7 +2073,9 @@ cifs_writev_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &wdata->work); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(tcon->ses->server, 1, 0); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 3f50cee79df9..e2a6af1508af 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3216,7 +3216,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_LOCKED; } -static struct vm_operations_struct cifs_file_vm_ops = { +static const struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 49b8b6e41a18..c63f5227b681 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -31,12 +31,9 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#include "cifs_ioctl.h" #include <linux/btrfs.h> -#define CIFS_IOCTL_MAGIC 0xCF -#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) -#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) - static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, unsigned long srcfd, u64 off, u64 len, u64 destoff, bool dup_extents) @@ -135,6 +132,43 @@ out_drop_write: return rc; } +static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, + void __user *arg) +{ + int rc = 0; + struct smb_mnt_fs_info *fsinf; + + fsinf = kzalloc(sizeof(struct smb_mnt_fs_info), GFP_KERNEL); + if (fsinf == NULL) + return -ENOMEM; + + fsinf->version = 1; + fsinf->protocol_id = tcon->ses->server->vals->protocol_id; + fsinf->device_characteristics = + le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics); + fsinf->device_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); + fsinf->fs_attributes = le32_to_cpu(tcon->fsAttrInfo.Attributes); + fsinf->max_path_component = + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength); +#ifdef CONFIG_CIFS_SMB2 + fsinf->vol_serial_number = tcon->vol_serial_number; + fsinf->vol_create_time = le64_to_cpu(tcon->vol_create_time); + fsinf->share_flags = tcon->share_flags; + fsinf->share_caps = le32_to_cpu(tcon->capabilities); + fsinf->sector_flags = tcon->ss_flags; + fsinf->optimal_sector_size = tcon->perf_sector_size; + fsinf->max_bytes_chunk = tcon->max_bytes_chunk; + fsinf->maximal_access = tcon->maximal_access; +#endif /* SMB2 */ + fsinf->cifs_posix_caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + + if (copy_to_user(arg, fsinf, sizeof(struct smb_mnt_fs_info))) + rc = -EFAULT; + + kfree(fsinf); + return rc; +} + long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); @@ -148,8 +182,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); - cifs_dbg(FYI, "ioctl file %p cmd %u arg %lu\n", filep, command, arg); - cifs_sb = CIFS_SB(inode->i_sb); switch (command) { @@ -228,6 +260,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = -EOPNOTSUPP; break; + case CIFS_IOC_GET_MNT_INFO: + tcon = tlink_tcon(pSMBFile->tlink); + rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b8b4f08ee094..070fb2ad85ce 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1626,7 +1626,9 @@ smb2_echo_callback(struct mid_q_entry *mid) if (mid->mid_state == MID_RESPONSE_RECEIVED) credits_received = le16_to_cpu(smb2->hdr.CreditRequest); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(server, credits_received, CIFS_ECHO_OP); } @@ -1810,7 +1812,9 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_stats_fail_inc(tcon, SMB2_READ_HE); queue_work(cifsiod_wq, &rdata->work); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(server, credits_received, 0); } @@ -1938,6 +1942,7 @@ smb2_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; unsigned int credits_received = 1; @@ -1977,7 +1982,9 @@ smb2_writev_callback(struct mid_q_entry *mid) cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); queue_work(cifsiod_wq, &wdata->work); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); add_credits(tcon->ses->server, credits_received, 0); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 126f46b887cc..2a24c524fb9a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -644,7 +644,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) } spin_unlock(&GlobalMid_Lock); + mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); + mutex_unlock(&server->srv_mutex); return rc; } diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 9b1ffaa0572e..f6c6c8adbc01 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -353,7 +353,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid, char *result; insize = max_t(unsigned int, - INSIZE(readlink), OUTSIZE(readlink)+ *length + 1); + INSIZE(readlink), OUTSIZE(readlink)+ *length); UPARG(CODA_READLINK); inp->coda_readlink.VFid = *fid; @@ -361,8 +361,8 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid, error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); if (!error) { retlen = outp->coda_readlink.count; - if ( retlen > *length ) - retlen = *length; + if (retlen >= *length) + retlen = *length - 1; *length = retlen; result = (char *)outp + (long)outp->coda_readlink.data; memcpy(buffer, result, retlen); diff --git a/fs/coredump.c b/fs/coredump.c index c5ecde6f3eed..a8f75640ac86 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -513,10 +513,10 @@ void do_coredump(const siginfo_t *siginfo) const struct cred *old_cred; struct cred *cred; int retval = 0; - int flag = 0; int ispipe; struct files_struct *displaced; - bool need_nonrelative = false; + /* require nonrelative corefile path and be extra careful */ + bool need_suid_safe = false; bool core_dumped = false; static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { @@ -550,9 +550,8 @@ void do_coredump(const siginfo_t *siginfo) */ if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) { /* Setuid core dump mode */ - flag = O_EXCL; /* Stop rewrite attacks */ cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ - need_nonrelative = true; + need_suid_safe = true; } retval = coredump_wait(siginfo->si_signo, &core_state); @@ -633,7 +632,7 @@ void do_coredump(const siginfo_t *siginfo) if (cprm.limit < binfmt->min_coredump) goto fail_unlock; - if (need_nonrelative && cn.corename[0] != '/') { + if (need_suid_safe && cn.corename[0] != '/') { printk(KERN_WARNING "Pid %d(%s) can only dump core "\ "to fully qualified path!\n", task_tgid_vnr(current), current->comm); @@ -641,8 +640,35 @@ void do_coredump(const siginfo_t *siginfo) goto fail_unlock; } + /* + * Unlink the file if it exists unless this is a SUID + * binary - in that case, we're running around with root + * privs and don't want to unlink another user's coredump. + */ + if (!need_suid_safe) { + mm_segment_t old_fs; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + /* + * If it doesn't exist, that's fine. If there's some + * other problem, we'll catch it at the filp_open(). + */ + (void) sys_unlink((const char __user *)cn.corename); + set_fs(old_fs); + } + + /* + * There is a race between unlinking and creating the + * file, but if that causes an EEXIST here, that's + * fine - another process raced with us while creating + * the corefile, and the other process won. To userspace, + * what matters is that at least one of the two processes + * writes its coredump successfully, not which one. + */ cprm.file = filp_open(cn.corename, - O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, + O_CREAT | 2 | O_NOFOLLOW | + O_LARGEFILE | O_EXCL, 0600); if (IS_ERR(cprm.file)) goto fail_unlock; @@ -659,11 +685,15 @@ void do_coredump(const siginfo_t *siginfo) if (!S_ISREG(inode->i_mode)) goto close_fail; /* - * Dont allow local users get cute and trick others to coredump - * into their pre-created files. + * Don't dump core if the filesystem changed owner or mode + * of the file during file creation. This is an issue when + * a process dumps core while its cwd is e.g. on a vfat + * filesystem. */ if (!uid_eq(inode->i_uid, current_fsuid())) goto close_fail; + if ((inode->i_mode & 0677) != 0600) + goto close_fail; if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) @@ -17,12 +17,14 @@ #include <linux/atomic.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> +#include <linux/dax.h> #include <linux/fs.h> #include <linux/genhd.h> #include <linux/highmem.h> #include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/mutex.h> +#include <linux/pmem.h> #include <linux/sched.h> #include <linux/uio.h> #include <linux/vmstat.h> @@ -34,7 +36,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) might_sleep(); do { - void *addr; + void __pmem *addr; unsigned long pfn; long count; @@ -46,10 +48,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) unsigned pgsz = PAGE_SIZE - offset_in_page(addr); if (pgsz > count) pgsz = count; - if (pgsz < PAGE_SIZE) - memset(addr, 0, pgsz); - else - clear_page(addr); + clear_pmem(addr, pgsz); addr += pgsz; size -= pgsz; count -= pgsz; @@ -59,26 +58,29 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) } } while (size); + wmb_pmem(); return 0; } EXPORT_SYMBOL_GPL(dax_clear_blocks); -static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) +static long dax_get_addr(struct buffer_head *bh, void __pmem **addr, + unsigned blkbits) { unsigned long pfn; sector_t sector = bh->b_blocknr << (blkbits - 9); return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); } -static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, - loff_t end) +/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ +static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, + loff_t pos, loff_t end) { loff_t final = end - pos + first; /* The final byte of the buffer */ if (first > 0) - memset(addr, 0, first); + clear_pmem(addr, first); if (final < size) - memset(addr + final, 0, size - final); + clear_pmem(addr + final, size - final); } static bool buffer_written(struct buffer_head *bh) @@ -106,14 +108,15 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, loff_t pos = start; loff_t max = start; loff_t bh_max = start; - void *addr; + void __pmem *addr; bool hole = false; + bool need_wmb = false; if (iov_iter_rw(iter) != WRITE) end = min(end, i_size_read(inode)); while (pos < end) { - unsigned len; + size_t len; if (pos == max) { unsigned blkbits = inode->i_blkbits; sector_t block = pos >> blkbits; @@ -145,19 +148,23 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, retval = dax_get_addr(bh, &addr, blkbits); if (retval < 0) break; - if (buffer_unwritten(bh) || buffer_new(bh)) + if (buffer_unwritten(bh) || buffer_new(bh)) { dax_new_buf(addr, retval, first, pos, end); + need_wmb = true; + } addr += first; size = retval - first; } max = min(pos + size, end); } - if (iov_iter_rw(iter) == WRITE) - len = copy_from_iter_nocache(addr, max - pos, iter); - else if (!hole) - len = copy_to_iter(addr, max - pos, iter); + if (iov_iter_rw(iter) == WRITE) { + len = copy_from_iter_pmem(addr, max - pos, iter); + need_wmb = true; + } else if (!hole) + len = copy_to_iter((void __force *)addr, max - pos, + iter); else len = iov_iter_zero(max - pos, iter); @@ -168,6 +175,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, addr += len; } + if (need_wmb) + wmb_pmem(); + return (pos == start) ? retval : pos - start; } @@ -260,11 +270,13 @@ static int dax_load_hole(struct address_space *mapping, struct page *page, static int copy_user_bh(struct page *to, struct buffer_head *bh, unsigned blkbits, unsigned long vaddr) { - void *vfrom, *vto; + void __pmem *vfrom; + void *vto; + if (dax_get_addr(bh, &vfrom, blkbits) < 0) return -EIO; vto = kmap_atomic(to); - copy_user_page(vto, vfrom, vaddr, to); + copy_user_page(vto, (void __force *)vfrom, vaddr, to); kunmap_atomic(vto); return 0; } @@ -272,16 +284,13 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh, static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct address_space *mapping = inode->i_mapping; sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); unsigned long vaddr = (unsigned long)vmf->virtual_address; - void *addr; + void __pmem *addr; unsigned long pfn; pgoff_t size; int error; - i_mmap_lock_read(mapping); - /* * Check truncate didn't happen while we were allocating a block. * If it did, this block may or may not be still allocated to the @@ -303,14 +312,14 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, goto out; } - if (buffer_unwritten(bh) || buffer_new(bh)) - clear_page(addr); + if (buffer_unwritten(bh) || buffer_new(bh)) { + clear_pmem(addr, PAGE_SIZE); + wmb_pmem(); + } error = vm_insert_mixed(vma, vaddr, pfn); out: - i_mmap_unlock_read(mapping); - return error; } @@ -372,15 +381,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * from a read fault and we've raced with a truncate */ error = -EIO; - goto unlock_page; + goto unlock; } + } else { + i_mmap_lock_write(mapping); } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? */ if (error) - goto unlock_page; + goto unlock; if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { @@ -391,8 +402,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; if (error) - goto unlock_page; + goto unlock; } else { + i_mmap_unlock_write(mapping); return dax_load_hole(mapping, page, vmf); } } @@ -404,17 +416,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, else clear_user_highpage(new_page, vaddr); if (error) - goto unlock_page; + goto unlock; vmf->page = page; if (!page) { - i_mmap_lock_read(mapping); /* Check we didn't race with truncate */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) { - i_mmap_unlock_read(mapping); error = -EIO; - goto out; + goto unlock; } } return VM_FAULT_LOCKED; @@ -450,6 +460,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); } + if (!page) + i_mmap_unlock_write(mapping); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; @@ -458,11 +470,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; - unlock_page: + unlock: if (page) { unlock_page(page); page_cache_release(page); + } else { + i_mmap_unlock_write(mapping); } + goto out; } EXPORT_SYMBOL(__dax_fault); @@ -494,6 +509,177 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } EXPORT_SYMBOL_GPL(dax_fault); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below function. + */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) + +int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, get_block_t get_block, + dax_iodone_t complete_unwritten) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct buffer_head bh; + unsigned blkbits = inode->i_blkbits; + unsigned long pmd_addr = address & PMD_MASK; + bool write = flags & FAULT_FLAG_WRITE; + long length; + void __pmem *kaddr; + pgoff_t size, pgoff; + sector_t block, sector; + unsigned long pfn; + int result = 0; + + /* Fall back to PTEs if we're going to COW */ + if (write && !(vma->vm_flags & VM_SHARED)) + return VM_FAULT_FALLBACK; + /* If the PMD would extend outside the VMA */ + if (pmd_addr < vma->vm_start) + return VM_FAULT_FALLBACK; + if ((pmd_addr + PMD_SIZE) > vma->vm_end) + return VM_FAULT_FALLBACK; + + pgoff = linear_page_index(vma, pmd_addr); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= size) + return VM_FAULT_SIGBUS; + /* If the PMD would cover blocks out of the file */ + if ((pgoff | PG_PMD_COLOUR) >= size) + return VM_FAULT_FALLBACK; + + memset(&bh, 0, sizeof(bh)); + block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); + + bh.b_size = PMD_SIZE; + i_mmap_lock_write(mapping); + length = get_block(inode, block, &bh, write); + if (length) + return VM_FAULT_SIGBUS; + + /* + * If the filesystem isn't willing to tell us the length of a hole, + * just fall back to PTEs. Calling get_block 512 times in a loop + * would be silly. + */ + if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) + goto fallback; + + if (buffer_unwritten(&bh) || buffer_new(&bh)) { + int i; + for (i = 0; i < PTRS_PER_PMD; i++) + clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE); + wmb_pmem(); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + result |= VM_FAULT_MAJOR; + } + + /* + * If we allocated new storage, make sure no process has any + * zero pages covering this hole + */ + if (buffer_new(&bh)) { + i_mmap_unlock_write(mapping); + unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); + i_mmap_lock_write(mapping); + } + + /* + * If a truncate happened while we were allocating blocks, we may + * leave blocks allocated to the file that are beyond EOF. We can't + * take i_mutex here, so just leave them hanging; they'll be freed + * when the file is deleted. + */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= size) { + result = VM_FAULT_SIGBUS; + goto out; + } + if ((pgoff | PG_PMD_COLOUR) >= size) + goto fallback; + + if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { + spinlock_t *ptl; + pmd_t entry; + struct page *zero_page = get_huge_zero_page(); + + if (unlikely(!zero_page)) + goto fallback; + + ptl = pmd_lock(vma->vm_mm, pmd); + if (!pmd_none(*pmd)) { + spin_unlock(ptl); + goto fallback; + } + + entry = mk_pmd(zero_page, vma->vm_page_prot); + entry = pmd_mkhuge(entry); + set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); + result = VM_FAULT_NOPAGE; + spin_unlock(ptl); + } else { + sector = bh.b_blocknr << (blkbits - 9); + length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, + bh.b_size); + if (length < 0) { + result = VM_FAULT_SIGBUS; + goto out; + } + if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) + goto fallback; + + result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); + } + + out: + if (buffer_unwritten(&bh)) + complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); + + i_mmap_unlock_write(mapping); + + return result; + + fallback: + count_vm_event(THP_FAULT_FALLBACK); + result = VM_FAULT_FALLBACK; + goto out; +} +EXPORT_SYMBOL_GPL(__dax_pmd_fault); + +/** + * dax_pmd_fault - handle a PMD fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * pmd_fault handler for DAX files. + */ +int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, get_block_t get_block, + dax_iodone_t complete_unwritten) +{ + int result; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + if (flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + } + result = __dax_pmd_fault(vma, address, pmd, flags, get_block, + complete_unwritten); + if (flags & FAULT_FLAG_WRITE) + sb_end_pagefault(sb); + + return result; +} +EXPORT_SYMBOL_GPL(dax_pmd_fault); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + /** * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred @@ -548,11 +734,12 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, if (err < 0) return err; if (buffer_written(&bh)) { - void *addr; + void __pmem *addr; err = dax_get_addr(&bh, &addr, inode->i_blkbits); if (err < 0) return err; - memset(addr + offset, 0, length); + clear_pmem(addr + offset, length); + wmb_pmem(); } return 0; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 284f9aa0028b..6c55ade071c3 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -435,8 +435,8 @@ struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); -static ssize_t read_file_bool(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[3]; u32 *val = file->private_data; @@ -449,9 +449,10 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf, buf[2] = 0x00; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } +EXPORT_SYMBOL_GPL(debugfs_read_file_bool); -static ssize_t write_file_bool(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) { char buf[32]; size_t buf_size; @@ -468,10 +469,11 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, return count; } +EXPORT_SYMBOL_GPL(debugfs_write_file_bool); static const struct file_operations fops_bool = { - .read = read_file_bool, - .write = write_file_bool, + .read = debugfs_read_file_bool, + .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 97315f2f6816..80d6901493cf 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -258,8 +258,7 @@ void ecryptfs_destroy_mount_crypt_stat( &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { list_del(&auth_tok->mount_crypt_stat_list); - if (auth_tok->global_auth_tok_key - && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID)) + if (!(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID)) key_put(auth_tok->global_auth_tok_key); kmem_cache_free(ecryptfs_global_auth_tok_cache, auth_tok); } diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 8db0b464483f..63cd2c147221 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -45,20 +45,20 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - int rc; - - if (!(lower_dentry->d_flags & DCACHE_OP_REVALIDATE)) - return 1; + int rc = 1; if (flags & LOOKUP_RCU) return -ECHILD; - rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE) + rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); + if (d_really_is_positive(dentry)) { - struct inode *lower_inode = - ecryptfs_inode_to_lower(d_inode(dentry)); + struct inode *inode = d_inode(dentry); - fsstack_copy_attr_all(d_inode(dentry), lower_inode); + fsstack_copy_attr_all(inode, ecryptfs_inode_to_lower(inode)); + if (!inode->i_nlink) + return 0; } return rc; } diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 3b57c9f83c9b..1982c3f11aec 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -20,6 +20,7 @@ #include <linux/time.h> #include <linux/pagemap.h> +#include <linux/dax.h> #include <linux/quotaops.h> #include "ext2.h" #include "xattr.h" @@ -31,6 +32,12 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return dax_fault(vma, vmf, ext2_get_block, NULL); } +static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); +} + static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { return dax_mkwrite(vma, vmf, ext2_get_block, NULL); @@ -38,6 +45,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, + .pmd_fault = ext2_dax_pmd_fault, .page_mkwrite = ext2_dax_mkwrite, .pfn_mkwrite = dax_pfn_mkwrite, }; @@ -49,7 +57,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &ext2_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; return 0; } #else diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index a3a404c5df2e..c60a248c640c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -25,6 +25,7 @@ #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> +#include <linux/dax.h> #include <linux/quotaops.h> #include <linux/writeback.h> #include <linux/buffer_head.h> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 32071f5c1c26..fd1f28be5296 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2272,6 +2272,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +int ext4_get_block_dax(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bc313ac5d3fa..113837e7ba98 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/dax.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include <linux/uio.h> @@ -195,7 +196,7 @@ out: static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) { struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + /* XXX: breaks on 32-bit > 16TB. Is that even supported? */ loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; int err; if (!uptodate) @@ -206,17 +207,74 @@ static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten); - /* Is this the right get_block? */ + int result; + handle_t *handle = NULL; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + bool write = vmf->flags & FAULT_FLAG_WRITE; + + if (write) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + EXT4_DATA_TRANS_BLOCKS(sb)); + } + + if (IS_ERR(handle)) + result = VM_FAULT_SIGBUS; + else + result = __dax_fault(vma, vmf, ext4_get_block_dax, + ext4_end_io_unwritten); + + if (write) { + if (!IS_ERR(handle)) + ext4_journal_stop(handle); + sb_end_pagefault(sb); + } + + return result; +} + +static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + int result; + handle_t *handle = NULL; + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; + bool write = flags & FAULT_FLAG_WRITE; + + if (write) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + ext4_chunk_trans_blocks(inode, + PMD_SIZE / PAGE_SIZE)); + } + + if (IS_ERR(handle)) + result = VM_FAULT_SIGBUS; + else + result = __dax_pmd_fault(vma, addr, pmd, flags, + ext4_get_block_dax, ext4_end_io_unwritten); + + if (write) { + if (!IS_ERR(handle)) + ext4_journal_stop(handle); + sb_end_pagefault(sb); + } + + return result; } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten); + return dax_mkwrite(vma, vmf, ext4_get_block_dax, + ext4_end_io_unwritten); } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, + .pmd_fault = ext4_dax_pmd_fault, .page_mkwrite = ext4_dax_mkwrite, .pfn_mkwrite = dax_pfn_mkwrite, }; @@ -244,7 +302,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; } else { vma->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 4f6ac499f09e..2468261748b2 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -22,6 +22,7 @@ #include "ext4_jbd2.h" #include "truncate.h" +#include <linux/dax.h> #include <linux/uio.h> #include <trace/events/ext4.h> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 29f1af7c2cab..612fbcf76b5c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -22,6 +22,7 @@ #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> +#include <linux/dax.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/buffer_head.h> @@ -3020,6 +3021,17 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_NO_LOCK); } +int ext4_get_block_dax(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT; + if (create) + flags |= EXT4_GET_BLOCKS_CREATE; + ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, flags); +} + static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ae0f438c2ee6..587ac08eabb6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,8 +53,6 @@ struct wb_writeback_work { unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ - unsigned int single_wait:1; - unsigned int single_done:1; enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ @@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb) static void wb_queue_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { - trace_writeback_queue(wb->bdi, work); + trace_writeback_queue(wb, work); spin_lock_bh(&wb->work_lock); - if (!test_bit(WB_registered, &wb->state)) { - if (work->single_wait) - work->single_done = 1; + if (!test_bit(WB_registered, &wb->state)) goto out_unlock; - } if (work->done) atomic_inc(&work->done->cnt); list_add_tail(&work->list, &wb->work_list); @@ -706,7 +701,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io); /** * inode_congested - test whether an inode is congested - * @inode: inode to test for congestion + * @inode: inode to test for congestion (may be NULL) * @cong_bits: mask of WB_[a]sync_congested bits to test * * Tests whether @inode is congested. @cong_bits is the mask of congestion @@ -716,6 +711,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io); * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg * associated with @inode is congested; otherwise, the root wb's congestion * state is used. + * + * @inode is allowed to be NULL as this function is often called on + * mapping->host which is NULL for the swapper space. */ int inode_congested(struct inode *inode, int cong_bits) { @@ -738,32 +736,6 @@ int inode_congested(struct inode *inode, int cong_bits) EXPORT_SYMBOL_GPL(inode_congested); /** - * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work - * @bdi: bdi the work item was issued to - * @work: work item to wait for - * - * Wait for the completion of @work which was issued to one of @bdi's - * bdi_writeback's. The caller must have set @work->single_wait before - * issuing it. This wait operates independently fo - * wb_wait_for_completion() and also disables automatic freeing of @work. - */ -static void wb_wait_for_single_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) -{ - if (WARN_ON_ONCE(!work->single_wait)) - return; - - wait_event(bdi->wb_waitq, work->single_done); - - /* - * Paired with smp_wmb() in wb_do_writeback() and ensures that all - * modifications to @work prior to assertion of ->single_done is - * visible to the caller once this function returns. - */ - smp_rmb(); -} - -/** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to * @nr_pages: number of pages to write for the whole bdi @@ -792,38 +764,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) } /** - * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb - * @wb: target bdi_writeback - * @base_work: source wb_writeback_work - * - * Try to make a clone of @base_work and issue it to @wb. If cloning - * succeeds, %true is returned; otherwise, @base_work is issued directly - * and %false is returned. In the latter case, the caller is required to - * wait for @base_work's completion using wb_wait_for_single_work(). - * - * A clone is auto-freed on completion. @base_work never is. - */ -static bool wb_clone_and_queue_work(struct bdi_writeback *wb, - struct wb_writeback_work *base_work) -{ - struct wb_writeback_work *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - *work = *base_work; - work->auto_free = 1; - work->single_wait = 0; - } else { - work = base_work; - work->auto_free = 0; - work->single_wait = 1; - } - work->single_done = 0; - wb_queue_work(wb, work); - return work != base_work; -} - -/** * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi * @bdi: target backing_dev_info * @base_work: wb_writeback_work to issue @@ -838,15 +778,19 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { - long nr_pages = base_work->nr_pages; - int next_blkcg_id = 0; + int next_memcg_id = 0; struct bdi_writeback *wb; struct wb_iter iter; might_sleep(); restart: rcu_read_lock(); - bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) { + bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) { + DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done); + struct wb_writeback_work fallback_work; + struct wb_writeback_work *work; + long nr_pages; + /* SYNC_ALL writes out I_DIRTY_TIME too */ if (!wb_has_dirty_io(wb) && (base_work->sync_mode == WB_SYNC_NONE || @@ -855,13 +799,30 @@ restart: if (skip_if_busy && writeback_in_progress(wb)) continue; - base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages); - if (!wb_clone_and_queue_work(wb, base_work)) { - next_blkcg_id = wb->blkcg_css->id + 1; - rcu_read_unlock(); - wb_wait_for_single_work(bdi, base_work); - goto restart; + nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages); + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + *work = *base_work; + work->nr_pages = nr_pages; + work->auto_free = 1; + wb_queue_work(wb, work); + continue; } + + /* alloc failed, execute synchronously using on-stack fallback */ + work = &fallback_work; + *work = *base_work; + work->nr_pages = nr_pages; + work->auto_free = 0; + work->done = &fallback_work_done; + + wb_queue_work(wb, work); + + next_memcg_id = wb->memcg_css->id + 1; + rcu_read_unlock(); + wb_wait_for_completion(bdi, &fallback_work_done); + goto restart; } rcu_read_unlock(); } @@ -902,8 +863,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { base_work->auto_free = 0; - base_work->single_wait = 0; - base_work->single_done = 0; wb_queue_work(&bdi->wb, base_work); } } @@ -924,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - trace_writeback_nowork(wb->bdi); + trace_writeback_nowork(wb); wb_wakeup(wb); return; } @@ -954,7 +913,7 @@ void wb_start_background_writeback(struct bdi_writeback *wb) * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ - trace_writeback_wake_background(wb->bdi); + trace_writeback_wake_background(wb); wb_wakeup(wb); } @@ -1421,6 +1380,10 @@ static long writeback_chunk_size(struct bdi_writeback *wb, * Write a portion of b_io inodes which belong to @sb. * * Return the number of pages and/or inodes written. + * + * NOTE! This is called with wb->list_lock held, and will + * unlock and relock that for each inode it ends up doing + * IO for. */ static long writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, @@ -1439,9 +1402,7 @@ static long writeback_sb_inodes(struct super_block *sb, unsigned long start_time = jiffies; long write_chunk; long wrote = 0; /* count both pages and inodes */ - struct blk_plug plug; - blk_start_plug(&plug); while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -1539,7 +1500,6 @@ static long writeback_sb_inodes(struct super_block *sb, break; } } - blk_finish_plug(&plug); return wrote; } @@ -1586,12 +1546,15 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, .range_cyclic = 1, .reason = reason, }; + struct blk_plug plug; + blk_start_plug(&plug); spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, &work); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); + blk_finish_plug(&plug); return nr_pages - work.nr_pages; } @@ -1619,10 +1582,12 @@ static long wb_writeback(struct bdi_writeback *wb, unsigned long oldest_jif; struct inode *inode; long progress; + struct blk_plug plug; oldest_jif = jiffies; work->older_than_this = &oldest_jif; + blk_start_plug(&plug); spin_lock(&wb->list_lock); for (;;) { /* @@ -1660,14 +1625,14 @@ static long wb_writeback(struct bdi_writeback *wb, } else if (work->for_background) oldest_jif = jiffies; - trace_writeback_start(wb->bdi, work); + trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) queue_io(wb, work); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else progress = __writeback_inodes_wb(wb, work); - trace_writeback_written(wb->bdi, work); + trace_writeback_written(wb, work); wb_update_bandwidth(wb, wb_start); @@ -1692,7 +1657,7 @@ static long wb_writeback(struct bdi_writeback *wb, * we'll just busyloop. */ if (!list_empty(&wb->b_more_io)) { - trace_writeback_wait(wb->bdi, work); + trace_writeback_wait(wb, work); inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); spin_unlock(&wb->list_lock); @@ -1702,6 +1667,7 @@ static long wb_writeback(struct bdi_writeback *wb, } } spin_unlock(&wb->list_lock); + blk_finish_plug(&plug); return nr_pages - work->nr_pages; } @@ -1797,26 +1763,14 @@ static long wb_do_writeback(struct bdi_writeback *wb) set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { struct wb_completion *done = work->done; - bool need_wake_up = false; - trace_writeback_exec(wb->bdi, work); + trace_writeback_exec(wb, work); wrote += wb_writeback(wb, work); - if (work->single_wait) { - WARN_ON_ONCE(work->auto_free); - /* paired w/ rmb in wb_wait_for_single_work() */ - smp_wmb(); - work->single_done = 1; - need_wake_up = true; - } else if (work->auto_free) { + if (work->auto_free) kfree(work); - } - if (done && atomic_dec_and_test(&done->cnt)) - need_wake_up = true; - - if (need_wake_up) wake_up_all(&wb->bdi->wb_waitq); } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a38e38f7b6fc..9bd1244caf38 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -34,6 +34,7 @@ #include <linux/percpu.h> #include <linux/list_sort.h> #include <linux/lockref.h> +#include <linux/rhashtable.h> #include "gfs2.h" #include "incore.h" @@ -50,9 +51,8 @@ #include "trace_gfs2.h" struct gfs2_glock_iter { - int hash; /* hash bucket index */ - unsigned nhash; /* Index within current bucket */ struct gfs2_sbd *sdp; /* incore superblock */ + struct rhashtable_iter hti; /* rhashtable iterator */ struct gfs2_glock *gl; /* current glock struct */ loff_t last_pos; /* last position */ }; @@ -70,44 +70,19 @@ static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) -#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) -static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE]; -static struct dentry *gfs2_root; - -/** - * gl_hash() - Turn glock number into hash bucket number - * @lock: The glock number - * - * Returns: The number of the corresponding hash bucket - */ - -static unsigned int gl_hash(const struct gfs2_sbd *sdp, - const struct lm_lockname *name) -{ - unsigned int h; - - h = jhash(&name->ln_number, sizeof(u64), 0); - h = jhash(&name->ln_type, sizeof(unsigned int), h); - h = jhash(&sdp, sizeof(struct gfs2_sbd *), h); - h &= GFS2_GL_HASH_MASK; - - return h; -} - -static inline void spin_lock_bucket(unsigned int hash) -{ - hlist_bl_lock(&gl_hash_table[hash]); -} +static struct rhashtable_params ht_parms = { + .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, + .key_len = sizeof(struct lm_lockname), + .key_offset = offsetof(struct gfs2_glock, gl_name), + .head_offset = offsetof(struct gfs2_glock, gl_node), +}; -static inline void spin_unlock_bucket(unsigned int hash) -{ - hlist_bl_unlock(&gl_hash_table[hash]); -} +static struct rhashtable gl_hash_table; -static void gfs2_glock_dealloc(struct rcu_head *rcu) +void gfs2_glock_free(struct gfs2_glock *gl) { - struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (gl->gl_ops->go_flags & GLOF_ASPACE) { kmem_cache_free(gfs2_glock_aspace_cachep, gl); @@ -115,13 +90,6 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu) kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(gfs2_glock_cachep, gl); } -} - -void gfs2_glock_free(struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - - call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_glock_wait); } @@ -192,7 +160,7 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) void gfs2_glock_put(struct gfs2_glock *gl) { - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = gfs2_glock2aspace(gl); if (lockref_put_or_lock(&gl->gl_lockref)) @@ -202,9 +170,7 @@ void gfs2_glock_put(struct gfs2_glock *gl) gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); - spin_lock_bucket(gl->gl_hash); - hlist_bl_del_rcu(&gl->gl_list); - spin_unlock_bucket(gl->gl_hash); + rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); GLOCK_BUG_ON(gl, mapping && mapping->nrpages); trace_gfs2_glock_put(gl); @@ -212,33 +178,6 @@ void gfs2_glock_put(struct gfs2_glock *gl) } /** - * search_bucket() - Find struct gfs2_glock by lock number - * @bucket: the bucket to search - * @name: The lock name - * - * Returns: NULL, or the struct gfs2_glock with the requested number - */ - -static struct gfs2_glock *search_bucket(unsigned int hash, - const struct gfs2_sbd *sdp, - const struct lm_lockname *name) -{ - struct gfs2_glock *gl; - struct hlist_bl_node *h; - - hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) { - if (!lm_name_equal(&gl->gl_name, name)) - continue; - if (gl->gl_sbd != sdp) - continue; - if (lockref_get_not_dead(&gl->gl_lockref)) - return gl; - } - - return NULL; -} - -/** * may_grant - check if its ok to grant a new lock * @gl: The glock * @gh: The lock request which we wish to grant @@ -506,7 +445,7 @@ __releases(&gl->gl_spin) __acquires(&gl->gl_spin) { const struct gfs2_glock_operations *glops = gl->gl_ops; - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; unsigned int lck_flags = gh ? gh->gh_flags : 0; int ret; @@ -628,7 +567,7 @@ out_unlock: static void delete_work_func(struct work_struct *work) { struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_inode *ip; struct inode *inode; u64 no_addr = gl->gl_name.ln_number; @@ -704,15 +643,17 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct gfs2_glock **glp) { struct super_block *s = sdp->sd_vfs; - struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; - struct gfs2_glock *gl, *tmp; - unsigned int hash = gl_hash(sdp, &name); + struct lm_lockname name = { .ln_number = number, + .ln_type = glops->go_type, + .ln_sbd = sdp }; + struct gfs2_glock *gl, *tmp = NULL; struct address_space *mapping; struct kmem_cache *cachep; + int ret, tries = 0; - rcu_read_lock(); - gl = search_bucket(hash, sdp, &name); - rcu_read_unlock(); + gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); + if (gl && !lockref_get_not_dead(&gl->gl_lockref)) + gl = NULL; *glp = gl; if (gl) @@ -739,14 +680,13 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, } atomic_inc(&sdp->sd_glock_disposal); - gl->gl_sbd = sdp; + gl->gl_node.next = NULL; gl->gl_flags = 0; gl->gl_name = name; gl->gl_lockref.count = 1; gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; - gl->gl_hash = hash; gl->gl_ops = glops; gl->gl_dstamp = ktime_set(0, 0); preempt_disable(); @@ -771,22 +711,34 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->writeback_index = 0; } - spin_lock_bucket(hash); - tmp = search_bucket(hash, sdp, &name); - if (tmp) { - spin_unlock_bucket(hash); - kfree(gl->gl_lksb.sb_lvbptr); - kmem_cache_free(cachep, gl); - atomic_dec(&sdp->sd_glock_disposal); - gl = tmp; - } else { - hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]); - spin_unlock_bucket(hash); +again: + ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node, + ht_parms); + if (ret == 0) { + *glp = gl; + return 0; } - *glp = gl; + if (ret == -EEXIST) { + ret = 0; + tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); + if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) { + if (++tries < 100) { + cond_resched(); + goto again; + } + tmp = NULL; + ret = -ENOMEM; + } + } else { + WARN_ON_ONCE(ret); + } + kfree(gl->gl_lksb.sb_lvbptr); + kmem_cache_free(cachep, gl); + atomic_dec(&sdp->sd_glock_disposal); + *glp = tmp; - return 0; + return ret; } /** @@ -928,7 +880,7 @@ __releases(&gl->gl_spin) __acquires(&gl->gl_spin) { struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct list_head *insert_pt = NULL; struct gfs2_holder *gh2; int try_futile = 0; @@ -1006,7 +958,7 @@ trap_recursive: int gfs2_glock_nq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; int error = 0; if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) @@ -1313,7 +1265,7 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl) void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { - struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; spin_lock(&gl->gl_spin); gl->gl_reply = ret; @@ -1462,31 +1414,26 @@ static struct shrinker glock_shrinker = { * */ -static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp, - unsigned int hash) +static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; - struct hlist_bl_head *head = &gl_hash_table[hash]; - struct hlist_bl_node *pos; + struct rhash_head *pos, *next; + const struct bucket_table *tbl; + int i; rcu_read_lock(); - hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) { - if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) - examiner(gl); + tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table); + for (i = 0; i < tbl->size; i++) { + rht_for_each_entry_safe(gl, pos, next, tbl, i, gl_node) { + if ((gl->gl_name.ln_sbd == sdp) && + lockref_get_not_dead(&gl->gl_lockref)) + examiner(gl); + } } rcu_read_unlock(); cond_resched(); } -static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) -{ - unsigned x; - - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) - examine_bucket(examiner, sdp, x); -} - - /** * thaw_glock - thaw out a glock which has an unprocessed reply waiting * @gl: The glock to thaw @@ -1569,7 +1516,7 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip) int ret; ret = gfs2_truncatei_resume(ip); - gfs2_assert_withdraw(gl->gl_sbd, ret == 0); + gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0); spin_lock(&gl->gl_spin); clear_bit(GLF_LOCK, &gl->gl_flags); @@ -1733,17 +1680,17 @@ static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) { struct gfs2_glock *gl = iter_ptr; - seq_printf(seq, "G: n:%u/%llx rtt:%lld/%lld rttb:%lld/%lld irt:%lld/%lld dcnt: %lld qcnt: %lld\n", + seq_printf(seq, "G: n:%u/%llx rtt:%llu/%llu rttb:%llu/%llu irt:%llu/%llu dcnt: %llu qcnt: %llu\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, - (long long)gl->gl_stats.stats[GFS2_LKS_SRTT], - (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR], - (long long)gl->gl_stats.stats[GFS2_LKS_SRTTB], - (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB], - (long long)gl->gl_stats.stats[GFS2_LKS_SIRT], - (long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR], - (long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT], - (long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]); + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTT], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTB], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRT], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]); return 0; } @@ -1776,11 +1723,10 @@ static const char *gfs2_stype[] = { static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) { - struct gfs2_glock_iter *gi = seq->private; - struct gfs2_sbd *sdp = gi->sdp; - unsigned index = gi->hash >> 3; - unsigned subindex = gi->hash & 0x07; - s64 value; + struct gfs2_sbd *sdp = seq->private; + loff_t pos = *(loff_t *)iter_ptr; + unsigned index = pos >> 3; + unsigned subindex = pos & 0x07; int i; if (index == 0 && subindex != 0) @@ -1791,12 +1737,12 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) for_each_possible_cpu(i) { const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i); - if (index == 0) { - value = i; - } else { - value = lkstats->lkstats[index - 1].stats[subindex]; - } - seq_printf(seq, " %15lld", (long long)value); + + if (index == 0) + seq_printf(seq, " %15u", i); + else + seq_printf(seq, " %15llu", (unsigned long long)lkstats-> + lkstats[index - 1].stats[subindex]); } seq_putc(seq, '\n'); return 0; @@ -1804,20 +1750,24 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) int __init gfs2_glock_init(void) { - unsigned i; - for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { - INIT_HLIST_BL_HEAD(&gl_hash_table[i]); - } + int ret; + + ret = rhashtable_init(&gl_hash_table, &ht_parms); + if (ret < 0) + return ret; glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0); - if (!glock_workqueue) + if (!glock_workqueue) { + rhashtable_destroy(&gl_hash_table); return -ENOMEM; + } gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); if (!gfs2_delete_workqueue) { destroy_workqueue(glock_workqueue); + rhashtable_destroy(&gl_hash_table); return -ENOMEM; } @@ -1829,72 +1779,41 @@ int __init gfs2_glock_init(void) void gfs2_glock_exit(void) { unregister_shrinker(&glock_shrinker); + rhashtable_destroy(&gl_hash_table); destroy_workqueue(glock_workqueue); destroy_workqueue(gfs2_delete_workqueue); } -static inline struct gfs2_glock *glock_hash_chain(unsigned hash) +static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi) { - return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]), - struct gfs2_glock, gl_list); -} - -static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl) -{ - return hlist_bl_entry(rcu_dereference(gl->gl_list.next), - struct gfs2_glock, gl_list); -} - -static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) -{ - struct gfs2_glock *gl; - do { - gl = gi->gl; - if (gl) { - gi->gl = glock_hash_next(gl); - gi->nhash++; - } else { - if (gi->hash >= GFS2_GL_HASH_SIZE) { - rcu_read_unlock(); - return 1; - } - gi->gl = glock_hash_chain(gi->hash); - gi->nhash = 0; - } - while (gi->gl == NULL) { - gi->hash++; - if (gi->hash >= GFS2_GL_HASH_SIZE) { - rcu_read_unlock(); - return 1; - } - gi->gl = glock_hash_chain(gi->hash); - gi->nhash = 0; + gi->gl = rhashtable_walk_next(&gi->hti); + if (IS_ERR(gi->gl)) { + if (PTR_ERR(gi->gl) == -EAGAIN) + continue; + gi->gl = NULL; } /* Skip entries for other sb and dead entries */ - } while (gi->sdp != gi->gl->gl_sbd || - __lockref_is_dead(&gi->gl->gl_lockref)); - - return 0; + } while ((gi->gl) && ((gi->sdp != gi->gl->gl_name.ln_sbd) || + __lockref_is_dead(&gi->gl->gl_lockref))); } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; + int ret; if (gi->last_pos <= *pos) - n = gi->nhash + (*pos - gi->last_pos); - else - gi->hash = 0; + n = (*pos - gi->last_pos); - gi->nhash = 0; - rcu_read_lock(); + ret = rhashtable_walk_start(&gi->hti); + if (ret) + return NULL; do { - if (gfs2_glock_iter_next(gi)) - return NULL; - } while (n--); + gfs2_glock_iter_next(gi); + } while (gi->gl && n--); gi->last_pos = *pos; return gi->gl; @@ -1907,9 +1826,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, (*pos)++; gi->last_pos = *pos; - if (gfs2_glock_iter_next(gi)) - return NULL; - + gfs2_glock_iter_next(gi); return gi->gl; } @@ -1917,9 +1834,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) { struct gfs2_glock_iter *gi = seq->private; - if (gi->gl) - rcu_read_unlock(); gi->gl = NULL; + rhashtable_walk_stop(&gi->hti); } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) @@ -1930,26 +1846,19 @@ static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) { - struct gfs2_glock_iter *gi = seq->private; - - gi->hash = *pos; + preempt_disable(); if (*pos >= GFS2_NR_SBSTATS) return NULL; - preempt_disable(); - return SEQ_START_TOKEN; + return pos; } static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { - struct gfs2_glock_iter *gi = seq->private; (*pos)++; - gi->hash++; - if (gi->hash >= GFS2_NR_SBSTATS) { - preempt_enable(); + if (*pos >= GFS2_NR_SBSTATS) return NULL; - } - return SEQ_START_TOKEN; + return pos; } static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr) @@ -1987,14 +1896,28 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file) if (ret == 0) { struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + gi->last_pos = 0; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; + gi->gl = NULL; + ret = rhashtable_walk_init(&gl_hash_table, &gi->hti); } return ret; } +static int gfs2_glocks_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + + gi->gl = NULL; + rhashtable_walk_exit(&gi->hti); + return seq_release_private(inode, file); +} + static int gfs2_glstats_open(struct inode *inode, struct file *file) { int ret = seq_open_private(file, &gfs2_glstats_seq_ops, @@ -2003,21 +1926,22 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; + gi->last_pos = 0; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; + gi->gl = NULL; + ret = rhashtable_walk_init(&gl_hash_table, &gi->hti); } return ret; } static int gfs2_sbstats_open(struct inode *inode, struct file *file) { - int ret = seq_open_private(file, &gfs2_sbstats_seq_ops, - sizeof(struct gfs2_glock_iter)); + int ret = seq_open(file, &gfs2_sbstats_seq_ops); if (ret == 0) { struct seq_file *seq = file->private_data; - struct gfs2_glock_iter *gi = seq->private; - gi->sdp = inode->i_private; + seq->private = inode->i_private; /* sdp */ } return ret; } @@ -2027,7 +1951,7 @@ static const struct file_operations gfs2_glocks_fops = { .open = gfs2_glocks_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = gfs2_glocks_release, }; static const struct file_operations gfs2_glstats_fops = { @@ -2035,7 +1959,7 @@ static const struct file_operations gfs2_glstats_fops = { .open = gfs2_glstats_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = gfs2_glocks_release, }; static const struct file_operations gfs2_sbstats_fops = { @@ -2043,7 +1967,7 @@ static const struct file_operations gfs2_sbstats_fops = { .open = gfs2_sbstats_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release, }; int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index fa3fa5e94553..1f6c9c3fe5cb 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -32,13 +32,15 @@ struct workqueue_struct *gfs2_freeze_wq; static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) { - fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n", + fs_err(gl->gl_name.ln_sbd, + "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page " + "state 0x%lx\n", bh, (unsigned long long)bh->b_blocknr, bh->b_state, bh->b_page->mapping, bh->b_page->flags); - fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n", + fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n", gl->gl_name.ln_type, gl->gl_name.ln_number, gfs2_glock2aspace(gl)); - gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n"); + gfs2_lm_withdraw(gl->gl_name.ln_sbd, "AIL error\n"); } /** @@ -52,7 +54,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync, unsigned int nr_revokes) { - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct list_head *head = &gl->gl_ail_list; struct gfs2_bufdata *bd, *tmp; struct buffer_head *bh; @@ -80,7 +82,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync, static void gfs2_ail_empty_gl(struct gfs2_glock *gl) { - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_trans tr; memset(&tr, 0, sizeof(tr)); @@ -109,7 +111,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; unsigned int revokes = atomic_read(&gl->gl_ail_count); unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); int ret; @@ -139,7 +141,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) static void rgrp_go_sync(struct gfs2_glock *gl) { - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd; int error; @@ -179,7 +181,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl) static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd = gl->gl_object; @@ -218,7 +220,7 @@ static void inode_go_sync(struct gfs2_glock *gl) GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); - gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH); + gfs2_log_flush(gl->gl_name.ln_sbd, gl, NORMAL_FLUSH); filemap_fdatawrite(metamapping); if (ip) { struct address_space *mapping = ip->i_inode.i_mapping; @@ -252,7 +254,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) { struct gfs2_inode *ip = gl->gl_object; - gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); + gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count)); if (flags & DIO_METADATA) { struct address_space *mapping = gfs2_glock2aspace(gl); @@ -264,9 +266,9 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) } } - if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) { - gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH); - gl->gl_sbd->sd_rindex_uptodate = 0; + if (ip == GFS2_I(gl->gl_name.ln_sbd->sd_rindex)) { + gfs2_log_flush(gl->gl_name.ln_sbd, NULL, NORMAL_FLUSH); + gl->gl_name.ln_sbd->sd_rindex_uptodate = 0; } if (ip && S_ISREG(ip->i_inode.i_mode)) truncate_inode_pages(ip->i_inode.i_mapping, 0); @@ -281,7 +283,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) static int inode_go_demote_ok(const struct gfs2_glock *gl) { - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_holder *gh; if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) @@ -416,7 +418,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip) static int inode_go_lock(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_inode *ip = gl->gl_object; int error = 0; @@ -477,7 +479,7 @@ static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) static void freeze_go_sync(struct gfs2_glock *gl) { int error = 0; - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (gl->gl_state == LM_ST_SHARED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { @@ -500,7 +502,7 @@ static void freeze_go_sync(struct gfs2_glock *gl) static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) { - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); struct gfs2_glock *j_gl = ip->i_gl; struct gfs2_log_header_host head; @@ -545,7 +547,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl) static void iopen_go_callback(struct gfs2_glock *gl, bool remote) { struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY)) return; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a1ec7c20e498..121ed08d9d9f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -22,6 +22,7 @@ #include <linux/ktime.h> #include <linux/percpu.h> #include <linux/lockref.h> +#include <linux/rhashtable.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -203,13 +204,15 @@ enum { }; struct lm_lockname { + struct gfs2_sbd *ln_sbd; u64 ln_number; unsigned int ln_type; }; #define lm_name_equal(name1, name2) \ - (((name1)->ln_number == (name2)->ln_number) && \ - ((name1)->ln_type == (name2)->ln_type)) + (((name1)->ln_number == (name2)->ln_number) && \ + ((name1)->ln_type == (name2)->ln_type) && \ + ((name1)->ln_sbd == (name2)->ln_sbd)) struct gfs2_glock_operations { @@ -241,7 +244,7 @@ enum { }; struct gfs2_lkstats { - s64 stats[GFS2_NR_LKSTATS]; + u64 stats[GFS2_NR_LKSTATS]; }; enum { @@ -327,7 +330,6 @@ enum { struct gfs2_glock { struct hlist_bl_node gl_list; - struct gfs2_sbd *gl_sbd; unsigned long gl_flags; /* GLF_... */ struct lm_lockname gl_name; @@ -341,7 +343,6 @@ struct gfs2_glock { gl_req:2, /* State in last dlm request */ gl_reply:8; /* Last reply from the dlm */ - unsigned int gl_hash; unsigned long gl_demote_time; /* time of first demote request */ long gl_hold_time; struct list_head gl_holders; @@ -367,7 +368,7 @@ struct gfs2_glock { loff_t end; } gl_vm; }; - struct rcu_head gl_rcu; + struct rhash_head gl_node; }; #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ @@ -835,7 +836,7 @@ static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which) static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) { - const struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; preempt_disable(); this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++; preempt_enable(); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 641383a9c1bb..284c1542783e 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -31,7 +31,7 @@ extern struct workqueue_struct *gfs2_control_wq; * * @delta is the difference between the current rtt sample and the * running average srtt. We add 1/8 of that to the srtt in order to - * update the current srtt estimate. The varience estimate is a bit + * update the current srtt estimate. The variance estimate is a bit * more complicated. We subtract the abs value of the @delta from * the current variance estimate and add 1/4 of that to the running * total. @@ -80,7 +80,7 @@ static inline void gfs2_update_reply_times(struct gfs2_glock *gl) preempt_disable(); rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp)); - lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats); + lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats); gfs2_update_stats(&gl->gl_stats, index, rtt); /* Local */ gfs2_update_stats(&lks->lkstats[gltype], index, rtt); /* Global */ preempt_enable(); @@ -108,7 +108,7 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl) dstamp = gl->gl_dstamp; gl->gl_dstamp = ktime_get_real(); irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp)); - lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats); + lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats); gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt); /* Local */ gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt); /* Global */ preempt_enable(); @@ -253,7 +253,7 @@ static void gfs2_reverse_hex(char *c, u64 value) static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, unsigned int flags) { - struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; int req; u32 lkf; char strname[GDLM_STRNAME_BYTES] = ""; @@ -281,7 +281,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, static void gdlm_put_lock(struct gfs2_glock *gl) { - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; int lvb_needs_unlock = 0; int error; @@ -319,7 +319,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) static void gdlm_cancel(struct gfs2_glock *gl) { - struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 92324ac58290..d5369a109781 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -70,7 +70,7 @@ static bool buffer_is_rgrp(const struct gfs2_bufdata *bd) static void maybe_release_space(struct gfs2_bufdata *bd) { struct gfs2_glock *gl = bd->bd_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_rgrpd *rgd = gl->gl_object; unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; struct gfs2_bitmap *bi = rgd->rd_bits + index; @@ -578,7 +578,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, static void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; int error; if (mapping == NULL) @@ -588,7 +588,7 @@ static void gfs2_meta_sync(struct gfs2_glock *gl) error = filemap_fdatawait(mapping); if (error) - gfs2_io_error(gl->gl_sbd); + gfs2_io_error(gl->gl_name.ln_sbd); } static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index b984a6e190bc..0e1d4be5865a 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -114,7 +114,7 @@ const struct address_space_operations gfs2_rgrp_aops = { struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) { struct address_space *mapping = gfs2_glock2aspace(gl); - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct page *page; struct buffer_head *bh; unsigned int shift; @@ -200,7 +200,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head **bhp) { - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct buffer_head *bh; if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { @@ -362,7 +362,7 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) { - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct buffer_head *first_bh, *bh; u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >> sdp->sd_sb.sb_bsize_shift; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index ac5d8027d335..8ca161567a93 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -44,7 +44,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) { struct inode *inode = mapping->host; if (mapping->a_ops == &gfs2_meta_aops) - return (((struct gfs2_glock *)mapping) - 1)->gl_sbd; + return (((struct gfs2_glock *)mapping) - 1)->gl_name.ln_sbd; else if (mapping->a_ops == &gfs2_rgrp_aops) return container_of(mapping, struct gfs2_sbd, sd_aspace); else diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 9b61f92fcfdf..3a31226531ea 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -119,7 +119,7 @@ static void gfs2_qd_dispose(struct list_head *list) while (!list_empty(list)) { qd = list_entry(list->next, struct gfs2_quota_data, qd_lru); - sdp = qd->qd_gl->gl_sbd; + sdp = qd->qd_gl->gl_name.ln_sbd; list_del(&qd->qd_lru); @@ -302,7 +302,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, static void qd_hold(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref)); lockref_get(&qd->qd_lockref); } @@ -367,7 +367,7 @@ static void slot_put(struct gfs2_quota_data *qd) static int bh_get(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); unsigned int block, offset; struct buffer_head *bh; @@ -414,7 +414,7 @@ fail: static void bh_put(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; mutex_lock(&sdp->sd_quota_mutex); gfs2_assert(sdp, qd->qd_bh_count); @@ -486,7 +486,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) static void qd_unlock(struct gfs2_quota_data *qd) { - gfs2_assert_warn(qd->qd_gl->gl_sbd, + gfs2_assert_warn(qd->qd_gl->gl_name.ln_sbd, test_bit(QDF_LOCKED, &qd->qd_flags)); clear_bit(QDF_LOCKED, &qd->qd_flags); bh_put(qd); @@ -614,7 +614,7 @@ static int sort_qd(const void *a, const void *b) static void do_qc(struct gfs2_quota_data *qd, s64 change) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); struct gfs2_quota_change *qc = qd->qd_bh_qc; s64 x; @@ -831,7 +831,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) { - struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_name.ln_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned int data_blocks, ind_blocks; @@ -922,7 +922,7 @@ out: gfs2_glock_dq_uninit(&ghs[qx]); mutex_unlock(&ip->i_inode.i_mutex); kfree(ghs); - gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH); + gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH); return error; } @@ -954,7 +954,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) static int do_glock(struct gfs2_quota_data *qd, int force_refresh, struct gfs2_holder *q_gh) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_holder i_gh; int error; @@ -1037,7 +1037,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) static int need_sync(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; struct gfs2_tune *gt = &sdp->sd_tune; s64 value; unsigned int num, den; @@ -1125,7 +1125,7 @@ out: static int print_message(struct gfs2_quota_data *qd, char *type) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; fs_info(sdp, "quota %s for %s %u\n", type, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c6c62321dfd6..475985d14758 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1860,13 +1860,13 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) { const struct gfs2_glock *gl = rgd->rd_gl; - const struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_lkstats *st; - s64 r_dcount, l_dcount; - s64 l_srttb, a_srttb = 0; + u64 r_dcount, l_dcount; + u64 l_srttb, a_srttb = 0; s64 srttb_diff; - s64 sqr_diff; - s64 var; + u64 sqr_diff; + u64 var; int cpu, nonzero = 0; preempt_disable(); diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 20c007d747ab..49ac55da4e33 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -104,7 +104,7 @@ TRACE_EVENT(gfs2_glock_state_change, ), TP_fast_assign( - __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev; __entry->glnum = gl->gl_name.ln_number; __entry->gltype = gl->gl_name.ln_type; __entry->cur_state = glock_trace_state(gl->gl_state); @@ -140,7 +140,7 @@ TRACE_EVENT(gfs2_glock_put, ), TP_fast_assign( - __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev; __entry->gltype = gl->gl_name.ln_type; __entry->glnum = gl->gl_name.ln_number; __entry->cur_state = glock_trace_state(gl->gl_state); @@ -174,7 +174,7 @@ TRACE_EVENT(gfs2_demote_rq, ), TP_fast_assign( - __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev; __entry->gltype = gl->gl_name.ln_type; __entry->glnum = gl->gl_name.ln_number; __entry->cur_state = glock_trace_state(gl->gl_state); @@ -209,7 +209,7 @@ TRACE_EVENT(gfs2_promote, ), TP_fast_assign( - __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev; + __entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev; __entry->glnum = gh->gh_gl->gl_name.ln_number; __entry->gltype = gh->gh_gl->gl_name.ln_type; __entry->first = first; @@ -239,7 +239,7 @@ TRACE_EVENT(gfs2_glock_queue, ), TP_fast_assign( - __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev; + __entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev; __entry->glnum = gh->gh_gl->gl_name.ln_number; __entry->gltype = gh->gh_gl->gl_name.ln_type; __entry->queue = queue; @@ -267,18 +267,18 @@ TRACE_EVENT(gfs2_glock_lock_time, __field( int, status ) __field( char, flags ) __field( s64, tdiff ) - __field( s64, srtt ) - __field( s64, srttvar ) - __field( s64, srttb ) - __field( s64, srttvarb ) - __field( s64, sirt ) - __field( s64, sirtvar ) - __field( s64, dcount ) - __field( s64, qcount ) + __field( u64, srtt ) + __field( u64, srttvar ) + __field( u64, srttb ) + __field( u64, srttvarb ) + __field( u64, sirt ) + __field( u64, sirtvar ) + __field( u64, dcount ) + __field( u64, qcount ) ), TP_fast_assign( - __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev; __entry->glnum = gl->gl_name.ln_number; __entry->gltype = gl->gl_name.ln_type; __entry->status = gl->gl_lksb.sb_status; @@ -333,7 +333,7 @@ TRACE_EVENT(gfs2_pin, ), TP_fast_assign( - __entry->dev = bd->bd_gl->gl_sbd->sd_vfs->s_dev; + __entry->dev = bd->bd_gl->gl_name.ln_sbd->sd_vfs->s_dev; __entry->pin = pin; __entry->len = bd->bd_bh->b_size; __entry->block = bd->bd_bh->b_blocknr; @@ -449,7 +449,7 @@ TRACE_EVENT(gfs2_bmap, ), TP_fast_assign( - __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; + __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev; __entry->lblock = lblock; __entry->pblock = buffer_mapped(bh) ? bh->b_blocknr : 0; __entry->inum = ip->i_no_addr; @@ -489,7 +489,7 @@ TRACE_EVENT(gfs2_block_alloc, ), TP_fast_assign( - __entry->dev = rgd->rd_gl->gl_sbd->sd_vfs->s_dev; + __entry->dev = rgd->rd_gl->gl_name.ln_sbd->sd_vfs->s_dev; __entry->start = block; __entry->inum = ip->i_no_addr; __entry->len = len; diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 88bff2430669..b95d0d625f32 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -158,7 +158,7 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) { struct gfs2_trans *tr = current->journal_info; - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = bh->b_page->mapping; struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_bufdata *bd; @@ -224,7 +224,7 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) { - struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_bufdata *bd; lock_buffer(bh); diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index d3fa6bd9503e..221719eac5de 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page_cache_release(page); goto fail; } - page_cache_release(page); node->page[i] = page; } @@ -398,11 +397,11 @@ node_error: void hfs_bnode_free(struct hfs_bnode *node) { - //int i; + int i; - //for (i = 0; i < node->tree->pages_per_bnode; i++) - // if (node->page[i]) - // page_cache_release(node->page[i]); + for (i = 0; i < node->tree->pages_per_bnode; i++) + if (node->page[i]) + page_cache_release(node->page[i]); kfree(node); } diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 9f4ee7f52026..6fc766df0461 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -131,13 +131,16 @@ skip: hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); - if (new_node) { - /* update parent key if we inserted a key - * at the start of the first node - */ - if (!rec && new_node != node) - hfs_brec_update_parent(fd); + /* + * update parent key if we inserted a key + * at the start of the node and it is not the new node + */ + if (!rec && new_node != node) { + hfs_bnode_read_key(node, fd->search_key, data_off + size); + hfs_brec_update_parent(fd); + } + if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); @@ -166,9 +169,6 @@ skip: goto again; } - if (!rec) - hfs_brec_update_parent(fd); - return 0; } @@ -366,6 +366,8 @@ again: if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd); + if (fd->record < 0) + return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 759708fd9331..63924662aaf3 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page_cache_release(page); goto fail; } - page_cache_release(page); node->page[i] = page; } @@ -566,13 +565,11 @@ node_error: void hfs_bnode_free(struct hfs_bnode *node) { -#if 0 int i; for (i = 0; i < node->tree->pages_per_bnode; i++) if (node->page[i]) page_cache_release(node->page[i]); -#endif kfree(node); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 973c24ce59ad..316adb968b65 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -12,6 +12,7 @@ #include <linux/thread_info.h> #include <asm/current.h> #include <linux/sched.h> /* remove ASAP */ +#include <linux/falloc.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/file.h> @@ -84,6 +85,29 @@ static const match_table_t tokens = { {Opt_err, NULL}, }; +#ifdef CONFIG_NUMA +static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ + vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy, + index); +} + +static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) +{ + mpol_cond_put(vma->vm_policy); +} +#else +static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ +} + +static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) +{ +} +#endif + static void huge_pagevec_release(struct pagevec *pvec) { int i; @@ -293,26 +317,61 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void truncate_huge_page(struct page *page) +static void remove_huge_page(struct page *page) { ClearPageDirty(page); ClearPageUptodate(page); delete_from_page_cache(page); } -static void truncate_hugepages(struct inode *inode, loff_t lstart) + +/* + * remove_inode_hugepages handles two distinct cases: truncation and hole + * punch. There are subtle differences in operation for each case. + + * truncation is indicated by end of range being LLONG_MAX + * In this case, we first scan the range and release found pages. + * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv + * maps and global counts. + * hole punch is indicated if end is not LLONG_MAX + * In the hole punch case we scan the range and release found pages. + * Only when releasing a page is the associated region/reserv map + * deleted. The region/reserv map for ranges without associated + * pages are not modified. + * Note: If the passed end of range value is beyond the end of file, but + * not LLONG_MAX this routine still performs a hole punch operation. + */ +static void remove_inode_hugepages(struct inode *inode, loff_t lstart, + loff_t lend) { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> huge_page_shift(h); + const pgoff_t end = lend >> huge_page_shift(h); + struct vm_area_struct pseudo_vma; struct pagevec pvec; pgoff_t next; int i, freed = 0; + long lookup_nr = PAGEVEC_SIZE; + bool truncate_op = (lend == LLONG_MAX); + memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); + pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec, 0); next = start; - while (1) { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + while (next < end) { + /* + * Make sure to never grab more pages that we + * might possibly need. + */ + if (end - next < lookup_nr) + lookup_nr = end - next; + + /* + * This pagevec_lookup() may return pages past 'end', + * so we must check for page->index > end. + */ + if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) { if (next == start) break; next = start; @@ -321,26 +380,69 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart) for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; + u32 hash; + + hash = hugetlb_fault_mutex_hash(h, current->mm, + &pseudo_vma, + mapping, next, 0); + mutex_lock(&hugetlb_fault_mutex_table[hash]); lock_page(page); + if (page->index >= end) { + unlock_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + next = end; /* we are done */ + break; + } + + /* + * If page is mapped, it was faulted in after being + * unmapped. Do nothing in this race case. In the + * normal case page is not mapped. + */ + if (!page_mapped(page)) { + bool rsv_on_error = !PagePrivate(page); + /* + * We must free the huge page and remove + * from page cache (remove_huge_page) BEFORE + * removing the region/reserve map + * (hugetlb_unreserve_pages). In rare out + * of memory conditions, removal of the + * region/reserve map could fail. Before + * free'ing the page, note PagePrivate which + * is used in case of error. + */ + remove_huge_page(page); + freed++; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages( + inode, next, + next + 1, 1))) + hugetlb_fix_reserve_counts( + inode, rsv_on_error); + } + } + if (page->index > next) next = page->index; + ++next; - truncate_huge_page(page); unlock_page(page); - freed++; + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } huge_pagevec_release(&pvec); } - BUG_ON(!lstart && mapping->nrpages); - hugetlb_unreserve_pages(inode, start, freed); + + if (truncate_op) + (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed); } static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; - truncate_hugepages(inode, 0); + remove_inode_hugepages(inode, 0, LLONG_MAX); resv_map = (struct resv_map *)inode->i_mapping->private_data; /* root inode doesn't have the resv_map, so we should check it */ if (resv_map) @@ -349,11 +451,15 @@ static void hugetlbfs_evict_inode(struct inode *inode) } static inline void -hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) { struct vm_area_struct *vma; - vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { + /* + * end == 0 indicates that the entire range after + * start should be unmapped. + */ + vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { unsigned long v_offset; /* @@ -362,13 +468,20 @@ hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. */ - if (vma->vm_pgoff < pgoff) - v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; + if (vma->vm_pgoff < start) + v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; else v_offset = 0; - unmap_hugepage_range(vma, vma->vm_start + v_offset, - vma->vm_end, NULL); + if (end) { + end = ((end - start) << PAGE_SHIFT) + + vma->vm_start + v_offset; + if (end > vma->vm_end) + end = vma->vm_end; + } else + end = vma->vm_end; + + unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL); } } @@ -384,12 +497,164 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) - hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); + hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); i_mmap_unlock_write(mapping); - truncate_hugepages(inode, offset); + remove_inode_hugepages(inode, offset, LLONG_MAX); return 0; } +static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct hstate *h = hstate_inode(inode); + loff_t hpage_size = huge_page_size(h); + loff_t hole_start, hole_end; + + /* + * For hole punch round up the beginning offset of the hole and + * round down the end. + */ + hole_start = round_up(offset, hpage_size); + hole_end = round_down(offset + len, hpage_size); + + if (hole_end > hole_start) { + struct address_space *mapping = inode->i_mapping; + + mutex_lock(&inode->i_mutex); + i_mmap_lock_write(mapping); + if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + hugetlb_vmdelete_list(&mapping->i_mmap, + hole_start >> PAGE_SHIFT, + hole_end >> PAGE_SHIFT); + i_mmap_unlock_write(mapping); + remove_inode_hugepages(inode, hole_start, hole_end); + mutex_unlock(&inode->i_mutex); + } + + return 0; +} + +static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_inode(inode); + struct vm_area_struct pseudo_vma; + struct mm_struct *mm = current->mm; + loff_t hpage_size = huge_page_size(h); + unsigned long hpage_shift = huge_page_shift(h); + pgoff_t start, index, end; + int error; + u32 hash; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return hugetlbfs_punch_hole(inode, offset, len); + + /* + * Default preallocate case. + * For this range, start is rounded down and end is rounded up + * as well as being converted to page offsets. + */ + start = offset >> hpage_shift; + end = (offset + len + hpage_size - 1) >> hpage_shift; + + mutex_lock(&inode->i_mutex); + + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ + error = inode_newsize_ok(inode, offset + len); + if (error) + goto out; + + /* + * Initialize a pseudo vma as this is required by the huge page + * allocation routines. If NUMA is configured, use page index + * as input to create an allocation policy. + */ + memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); + pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); + pseudo_vma.vm_file = file; + + for (index = start; index < end; index++) { + /* + * This is supposed to be the vaddr where the page is being + * faulted in, but we have no vaddr here. + */ + struct page *page; + unsigned long addr; + int avoid_reserve = 0; + + cond_resched(); + + /* + * fallocate(2) manpage permits EINTR; we may have been + * interrupted because we are using up too much memory. + */ + if (signal_pending(current)) { + error = -EINTR; + break; + } + + /* Set numa allocation policy based on index */ + hugetlb_set_vma_policy(&pseudo_vma, inode, index); + + /* addr is the offset within the file (zero based) */ + addr = index * hpage_size; + + /* mutex taken here, fault path and hole punch */ + hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, + index, addr); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + /* See if already present in mapping to avoid alloc/free */ + page = find_get_page(mapping, index); + if (page) { + put_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + hugetlb_drop_vma_policy(&pseudo_vma); + continue; + } + + /* Allocate page and add to page cache */ + page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve); + hugetlb_drop_vma_policy(&pseudo_vma); + if (IS_ERR(page)) { + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + error = PTR_ERR(page); + goto out; + } + clear_huge_page(page, addr, pages_per_huge_page(h)); + __SetPageUptodate(page); + error = huge_add_to_page_cache(page, mapping, index); + if (unlikely(error)) { + put_page(page); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out; + } + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + + /* + * page_put due to reference from alloc_huge_page() + * unlock_page because locked by add_to_page_cache() + */ + put_page(page); + unlock_page(page); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) + i_size_write(inode, offset + len); + inode->i_ctime = CURRENT_TIME; + spin_lock(&inode->i_lock); + inode->i_private = NULL; + spin_unlock(&inode->i_lock); +out: + mutex_unlock(&inode->i_mutex); + return error; +} + static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -701,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = { .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, - .llseek = default_llseek, + .llseek = default_llseek, + .fallocate = hugetlbfs_fallocate, }; static const struct inode_operations hugetlbfs_dir_inode_operations = { diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 2d48d28e1640..91e004518237 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -92,6 +92,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) } /** + * kernfs_path_len - determine the length of the full path of a given node + * @kn: kernfs_node of interest + * + * The returned length doesn't include the space for the terminating '\0'. + */ +size_t kernfs_path_len(struct kernfs_node *kn) +{ + size_t len = 0; + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + do { + len += strlen(kn->name) + 1; + kn = kn->parent; + } while (kn && kn->parent); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + + return len; +} + +/** * kernfs_path - build full path of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into diff --git a/fs/namei.c b/fs/namei.c index 29b927938b8c..726d211db484 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2438,7 +2438,7 @@ done: /** * path_mountpoint - look up a path to be umounted - * @nameidata: lookup context + * @nd: lookup context * @flags: lookup flags * @path: pointer to container for result * diff --git a/fs/nsfs.c b/fs/nsfs.c index e4905fbf3396..8f20d6016e20 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -142,7 +142,8 @@ static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = dentry->d_fsdata; - return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); + seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); + return 0; } static const struct super_operations nsfs_ops = { diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index d0e436dc6437..ce12e0b1a31f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1776,7 +1776,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_migratable_lockres *mres) { struct dlm_migratable_lock *ml; - struct list_head *queue; + struct list_head *queue, *iter; struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; @@ -1821,7 +1821,9 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { tmpq = dlm_list_idx_to_ptr(res, j); - list_for_each_entry(lock, tmpq, list) { + list_for_each(iter, tmpq) { + lock = list_entry(iter, + struct dlm_lock, list); if (lock->ml.cookie == ml->cookie) break; lock = NULL; diff --git a/fs/proc/base.c b/fs/proc/base.c index aa50d1ac28fc..b25eee4cead5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1230,10 +1230,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); - char *page, *tmp; - ssize_t length; uid_t loginuid; kuid_t kloginuid; + int rv; rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { @@ -1242,46 +1241,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, } rcu_read_unlock(); - if (count >= PAGE_SIZE) - count = PAGE_SIZE - 1; - if (*ppos != 0) { /* No partial writes. */ return -EINVAL; } - page = (char*)__get_free_page(GFP_TEMPORARY); - if (!page) - return -ENOMEM; - length = -EFAULT; - if (copy_from_user(page, buf, count)) - goto out_free_page; - - page[count] = '\0'; - loginuid = simple_strtoul(page, &tmp, 10); - if (tmp == page) { - length = -EINVAL; - goto out_free_page; - } + rv = kstrtou32_from_user(buf, count, 10, &loginuid); + if (rv < 0) + return rv; /* is userspace tring to explicitly UNSET the loginuid? */ if (loginuid == AUDIT_UID_UNSET) { kloginuid = INVALID_UID; } else { kloginuid = make_kuid(file->f_cred->user_ns, loginuid); - if (!uid_valid(kloginuid)) { - length = -EINVAL; - goto out_free_page; - } + if (!uid_valid(kloginuid)) + return -EINVAL; } - length = audit_set_loginuid(kloginuid); - if (likely(length == 0)) - length = count; - -out_free_page: - free_page((unsigned long) page); - return length; + rv = audit_set_loginuid(kloginuid); + if (rv < 0) + return rv; + return count; } static const struct file_operations proc_loginuid_operations = { @@ -1335,8 +1316,9 @@ static ssize_t proc_fault_inject_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF], *end; + char buffer[PROC_NUMBUF]; int make_it_fail; + int rv; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -1345,9 +1327,9 @@ static ssize_t proc_fault_inject_write(struct file * file, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - make_it_fail = simple_strtol(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; + rv = kstrtoint(strstrip(buffer), 0, &make_it_fail); + if (rv < 0) + return rv; if (make_it_fail < 0 || make_it_fail > 1) return -EINVAL; @@ -1836,8 +1818,6 @@ end_instantiate: return dir_emit(ctx, name, len, 1, DT_UNKNOWN); } -#ifdef CONFIG_CHECKPOINT_RESTORE - /* * dname_to_vma_addr - maps a dentry name into two unsigned longs * which represent vma start and end addresses. @@ -1864,11 +1844,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - if (!capable(CAP_SYS_ADMIN)) { - status = -EPERM; - goto out_notask; - } - inode = d_inode(dentry); task = get_proc_task(inode); if (!task) @@ -1957,6 +1932,29 @@ struct map_files_info { unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; +/* + * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the + * symlinks may be used to bypass permissions on ancestor directories in the + * path to the file in question. + */ +static const char * +proc_map_files_follow_link(struct dentry *dentry, void **cookie) +{ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + return proc_pid_follow_link(dentry, NULL); +} + +/* + * Identical to proc_pid_link_inode_operations except for follow_link() + */ +static const struct inode_operations proc_map_files_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_map_files_follow_link, + .setattr = proc_setattr, +}; + static int proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) @@ -1972,7 +1970,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, ei = PROC_I(inode); ei->op.proc_get_link = proc_map_files_get_link; - inode->i_op = &proc_pid_link_inode_operations; + inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; inode->i_mode = S_IFLNK; @@ -1996,10 +1994,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, int result; struct mm_struct *mm; - result = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - result = -ENOENT; task = get_proc_task(dir); if (!task) @@ -2053,10 +2047,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) struct map_files_info *p; int ret; - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - ret = -ENOENT; task = get_proc_task(file_inode(file)); if (!task) @@ -2245,7 +2235,6 @@ static const struct file_operations proc_timers_operations = { .llseek = seq_lseek, .release = seq_release_private, }; -#endif /* CONFIG_CHECKPOINT_RESTORE */ static int proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) @@ -2481,32 +2470,20 @@ static ssize_t proc_coredump_filter_write(struct file *file, { struct task_struct *task; struct mm_struct *mm; - char buffer[PROC_NUMBUF], *end; unsigned int val; int ret; int i; unsigned long mask; - ret = -EFAULT; - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - goto out_no_task; - - ret = -EINVAL; - val = (unsigned int)simple_strtoul(buffer, &end, 0); - if (*end == '\n') - end++; - if (end - buffer == 0) - goto out_no_task; + ret = kstrtouint_from_user(buf, count, 0, &val); + if (ret < 0) + return ret; ret = -ESRCH; task = get_proc_task(file_inode(file)); if (!task) goto out_no_task; - ret = end - buffer; mm = get_task_mm(task); if (!mm) goto out_no_mm; @@ -2522,7 +2499,9 @@ static ssize_t proc_coredump_filter_write(struct file *file, out_no_mm: put_task_struct(task); out_no_task: - return ret; + if (ret < 0) + return ret; + return count; } static const struct file_operations proc_coredump_filter_operations = { @@ -2744,9 +2723,7 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), -#ifdef CONFIG_CHECKPOINT_RESTORE DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), -#endif DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET diff --git a/fs/proc/generic.c b/fs/proc/generic.c index e5dee5c3188e..ff3ffc76a937 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -26,7 +26,7 @@ #include "internal.h" -static DEFINE_SPINLOCK(proc_subdir_lock); +static DEFINE_RWLOCK(proc_subdir_lock); static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) { @@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, { int rv; - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); rv = __xlate_proc_name(name, ret, residual); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return rv; } @@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, { struct inode *inode; - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); if (de) { pde_get(de); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); @@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, d_add(dentry, inode); return NULL; } - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); } @@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, if (!dir_emit_dots(file, ctx)) return 0; - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); de = pde_subdir_first(de); i = ctx->pos - 2; for (;;) { if (!de) { - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return 0; } if (!i) @@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, do { struct proc_dir_entry *next; pde_get(de); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); if (!dir_emit(ctx, de->name, de->namelen, de->low_ino, de->mode >> 12)) { pde_put(de); return 0; } - spin_lock(&proc_subdir_lock); + read_lock(&proc_subdir_lock); ctx->pos++; next = pde_subdir_next(de); pde_put(de); de = next; } while (de); - spin_unlock(&proc_subdir_lock); + read_unlock(&proc_subdir_lock); return 1; } @@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp if (ret) return ret; - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); dp->parent = dir; if (pde_subdir_insert(dir, dp) == false) { WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); proc_free_inum(dp->low_ino); return -EEXIST; } - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return 0; } @@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) const char *fn = name; unsigned int len; - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return; } len = strlen(fn); @@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) rb_erase(&de->subdir_node, &parent->subdir); - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); return; @@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) const char *fn = name; unsigned int len; - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return -ENOENT; } len = strlen(fn); root = pde_subdir_find(parent, fn, len); if (!root) { - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); return -ENOENT; } rb_erase(&root->subdir_node, &parent->subdir); @@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) de = next; continue; } - spin_unlock(&proc_subdir_lock); + write_unlock(&proc_subdir_lock); proc_entry_rundown(de); next = de->parent; @@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) break; pde_put(de); - spin_lock(&proc_subdir_lock); + write_lock(&proc_subdir_lock); de = next; } pde_put(root); diff --git a/fs/proc/page.c b/fs/proc/page.c index 7eee2d8b97d9..93484034a03d 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -9,12 +9,16 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/hugetlb.h> +#include <linux/memcontrol.h> +#include <linux/mmu_notifier.h> +#include <linux/page_idle.h> #include <linux/kernel-page-flags.h> #include <asm/uaccess.h> #include "internal.h" #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) +#define KPMBITS (KPMSIZE * BITS_PER_BYTE) /* /proc/kpagecount - an array exposing page counts * @@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, pfn++; out++; count -= KPMSIZE; + + cond_resched(); } *ppos += (char __user *)out - buf; @@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page) if (PageBalloon(page)) u |= 1 << KPF_BALLOON; + if (page_is_idle(page)) + u |= 1 << KPF_IDLE; + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); @@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, pfn++; out++; count -= KPMSIZE; + + cond_resched(); } *ppos += (char __user *)out - buf; @@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = { .read = kpageflags_read, }; +#ifdef CONFIG_MEMCG +static ssize_t kpagecgroup_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 ino; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + + if (ppage) + ino = page_cgroup_ino(ppage); + else + ino = 0; + + if (put_user(ino, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + + cond_resched(); + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpagecgroup_operations = { + .llseek = mem_lseek, + .read = kpagecgroup_read, +}; +#endif /* CONFIG_MEMCG */ + static int __init proc_page_init(void) { proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); +#ifdef CONFIG_MEMCG + proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations); +#endif return 0; } fs_initcall(proc_page_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3b4d8255e806..e2d46adb54b4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -13,6 +13,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mmu_notifier.h> +#include <linux/page_idle.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -446,6 +447,7 @@ struct mem_size_stats { unsigned long anonymous_thp; unsigned long swap; u64 pss; + u64 swap_pss; }; static void smaps_account(struct mem_size_stats *mss, struct page *page, @@ -458,7 +460,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, mss->resident += size; /* Accumulate the size in pages that have been accessed. */ - if (young || PageReferenced(page)) + if (young || page_is_young(page) || PageReferenced(page)) mss->referenced += size; mapcount = page_mapcount(page); if (mapcount >= 2) { @@ -492,9 +494,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else if (is_swap_pte(*pte)) { swp_entry_t swpent = pte_to_swp_entry(*pte); - if (!non_swap_entry(swpent)) + if (!non_swap_entry(swpent)) { + int mapcount; + mss->swap += PAGE_SIZE; - else if (is_migration_entry(swpent)) + mapcount = swp_swapcount(swpent); + if (mapcount >= 2) { + u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; + + do_div(pss_delta, mapcount); + mss->swap_pss += pss_delta; + } else { + mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; + } + } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); } @@ -640,6 +653,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Anonymous: %8lu kB\n" "AnonHugePages: %8lu kB\n" "Swap: %8lu kB\n" + "SwapPss: %8lu kB\n" "KernelPageSize: %8lu kB\n" "MMUPageSize: %8lu kB\n" "Locked: %8lu kB\n", @@ -654,6 +668,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.anonymous >> 10, mss.anonymous_thp >> 10, mss.swap >> 10, + (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), vma_kernel_pagesize(vma) >> 10, vma_mmu_pagesize(vma) >> 10, (vma->vm_flags & VM_LOCKED) ? @@ -712,23 +727,6 @@ const struct file_operations proc_tid_smaps_operations = { .release = proc_map_release, }; -/* - * We do not want to have constant page-shift bits sitting in - * pagemap entries and are about to reuse them some time soon. - * - * Here's the "migration strategy": - * 1. when the system boots these bits remain what they are, - * but a warning about future change is printed in log; - * 2. once anyone clears soft-dirty bits via clear_refs file, - * these flag is set to denote, that user is aware of the - * new API and those page-shift bits change their meaning. - * The respective warning is printed in dmesg; - * 3. In a couple of releases we will remove all the mentions - * of page-shift in pagemap entries. - */ - -static bool soft_dirty_cleared __read_mostly; - enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, @@ -810,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, /* Clear accessed and referenced bits. */ pmdp_test_and_clear_young(vma, addr, pmd); + test_and_clear_page_young(page); ClearPageReferenced(page); out: spin_unlock(ptl); @@ -837,6 +836,7 @@ out: /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); + test_and_clear_page_young(page); ClearPageReferenced(page); } pte_unmap_unlock(pte - 1, ptl); @@ -889,13 +889,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; - if (type == CLEAR_REFS_SOFT_DIRTY) { - soft_dirty_cleared = true; - pr_warn_once("The pagemap bits 55-60 has changed their meaning!" - " See the linux/Documentation/vm/pagemap.txt for " - "details.\n"); - } - task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; @@ -963,36 +956,26 @@ typedef struct { struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; - bool v2; + bool show_pfn; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) #define PAGEMAP_WALK_MASK (PMD_MASK) -#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) -#define PM_STATUS_BITS 3 -#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) -#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) -#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) -#define PM_PSHIFT_BITS 6 -#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) -#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) -#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) -#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) -/* in "new" pagemap pshift bits are occupied with more status bits */ -#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) - -#define __PM_SOFT_DIRTY (1LL) -#define PM_PRESENT PM_STATUS(4LL) -#define PM_SWAP PM_STATUS(2LL) -#define PM_FILE PM_STATUS(1LL) -#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) +#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) +#define PM_SOFT_DIRTY BIT_ULL(55) +#define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_FILE BIT_ULL(61) +#define PM_SWAP BIT_ULL(62) +#define PM_PRESENT BIT_ULL(63) + #define PM_END_OF_BUFFER 1 -static inline pagemap_entry_t make_pme(u64 val) +static inline pagemap_entry_t make_pme(u64 frame, u64 flags) { - return (pagemap_entry_t) { .pme = val }; + return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, @@ -1013,7 +996,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); + pagemap_entry_t pme = make_pme(0, 0); /* End of address space hole, which we mark as non-present. */ unsigned long hole_end; @@ -1033,7 +1016,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, /* Addresses in the VMA. */ if (vma->vm_flags & VM_SOFTDIRTY) - pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); + pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { err = add_to_pagemap(addr, &pme, pm); if (err) @@ -1044,67 +1027,42 @@ out: return err; } -static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, +static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - u64 frame, flags; + u64 frame = 0, flags = 0; struct page *page = NULL; - int flags2 = 0; if (pte_present(pte)) { - frame = pte_pfn(pte); - flags = PM_PRESENT; + if (pm->show_pfn) + frame = pte_pfn(pte); + flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) - flags2 |= __PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) - flags2 |= __PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY; entry = pte_to_swp_entry(pte); frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); - flags = PM_SWAP; + flags |= PM_SWAP; if (is_migration_entry(entry)) page = migration_entry_to_page(entry); - } else { - if (vma->vm_flags & VM_SOFTDIRTY) - flags2 |= __PM_SOFT_DIRTY; - *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); - return; } if (page && !PageAnon(page)) flags |= PM_FILE; - if ((vma->vm_flags & VM_SOFTDIRTY)) - flags2 |= __PM_SOFT_DIRTY; - - *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); -} + if (page && page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset, int pmd_flags2) -{ - /* - * Currently pmd for thp is always present because thp can not be - * swapped-out, migrated, or HWPOISONed (split in such cases instead.) - * This if-check is just to prepare for future implementation. - */ - if (pmd_present(pmd)) - *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); - else - *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); + return make_pme(frame, flags); } -#else -static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset, int pmd_flags2) -{ -} -#endif -static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, +static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; @@ -1113,41 +1071,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte, *orig_pte; int err = 0; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - int pmd_flags2; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) { + u64 flags = 0, frame = 0; + pmd_t pmd = *pmdp; - if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) - pmd_flags2 = __PM_SOFT_DIRTY; - else - pmd_flags2 = 0; + if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + + /* + * Currently pmd for thp is always present because thp + * can not be swapped-out, migrated, or HWPOISONed + * (split in such cases instead.) + * This if-check is just to prepare for future implementation. + */ + if (pmd_present(pmd)) { + struct page *page = pmd_page(pmd); + + if (page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + + flags |= PM_PRESENT; + if (pm->show_pfn) + frame = pmd_pfn(pmd) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + } for (; addr != end; addr += PAGE_SIZE) { - unsigned long offset; - pagemap_entry_t pme; + pagemap_entry_t pme = make_pme(frame, flags); - offset = (addr & ~PAGEMAP_WALK_MASK) >> - PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); err = add_to_pagemap(addr, &pme, pm); if (err) break; + if (pm->show_pfn && (flags & PM_PRESENT)) + frame++; } spin_unlock(ptl); return err; } - if (pmd_trans_unstable(pmd)) + if (pmd_trans_unstable(pmdp)) return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * We can assume that @vma always points to a valid one and @end never * goes beyond vma->vm_end. */ - orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; - pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); + pme = pte_to_pagemap_entry(pm, vma, addr, *pte); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -1160,40 +1135,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pte_t pte, int offset, int flags2) -{ - if (pte_present(pte)) - *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | - PM_STATUS2(pm->v2, flags2) | - PM_PRESENT); - else - *pme = make_pme(PM_NOT_PRESENT(pm->v2) | - PM_STATUS2(pm->v2, flags2)); -} - /* This function walks within one hugetlb entry in the single call */ -static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, +static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct pagemapread *pm = walk->private; struct vm_area_struct *vma = walk->vma; + u64 flags = 0, frame = 0; int err = 0; - int flags2; - pagemap_entry_t pme; + pte_t pte; if (vma->vm_flags & VM_SOFTDIRTY) - flags2 = __PM_SOFT_DIRTY; - else - flags2 = 0; + flags |= PM_SOFT_DIRTY; + + pte = huge_ptep_get(ptep); + if (pte_present(pte)) { + struct page *page = pte_page(pte); + + if (!PageAnon(page)) + flags |= PM_FILE; + + if (page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + + flags |= PM_PRESENT; + if (pm->show_pfn) + frame = pte_pfn(pte) + + ((addr & ~hmask) >> PAGE_SHIFT); + } for (; addr != end; addr += PAGE_SIZE) { - int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); + pagemap_entry_t pme = make_pme(frame, flags); + err = add_to_pagemap(addr, &pme, pm); if (err) return err; + if (pm->show_pfn && (flags & PM_PRESENT)) + frame++; } cond_resched(); @@ -1211,7 +1190,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped - * Bits 55-60 page shift (page size = 1<<page shift) + * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) + * Bit 56 page exclusively mapped + * Bits 57-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present @@ -1229,42 +1210,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file_inode(file)); - struct mm_struct *mm; + struct mm_struct *mm = file->private_data; struct pagemapread pm; - int ret = -ESRCH; struct mm_walk pagemap_walk = {}; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; unsigned long end_vaddr; - int copied = 0; + int ret = 0, copied = 0; - if (!task) + if (!mm || !atomic_inc_not_zero(&mm->mm_users)) goto out; ret = -EINVAL; /* file position must be aligned */ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) - goto out_task; + goto out_mm; ret = 0; if (!count) - goto out_task; + goto out_mm; + + /* do not disclose physical addresses: attack vector */ + pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); - pm.v2 = soft_dirty_cleared; pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); ret = -ENOMEM; if (!pm.buffer) - goto out_task; - - mm = mm_access(task, PTRACE_MODE_READ); - ret = PTR_ERR(mm); - if (!mm || IS_ERR(mm)) - goto out_free; + goto out_mm; - pagemap_walk.pmd_entry = pagemap_pte_range; + pagemap_walk.pmd_entry = pagemap_pmd_range; pagemap_walk.pte_hole = pagemap_pte_hole; #ifdef CONFIG_HUGETLB_PAGE pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; @@ -1275,10 +1251,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, src = *ppos; svpfn = src / PM_ENTRY_BYTES; start_vaddr = svpfn << PAGE_SHIFT; - end_vaddr = TASK_SIZE_OF(task); + end_vaddr = mm->task_size; /* watch out for wraparound */ - if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + if (svpfn > mm->task_size >> PAGE_SHIFT) start_vaddr = end_vaddr; /* @@ -1305,7 +1281,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, len = min(count, PM_ENTRY_BYTES * pm.pos); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; - goto out_mm; + goto out_free; } copied += len; buf += len; @@ -1315,24 +1291,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!ret || ret == PM_END_OF_BUFFER) ret = copied; -out_mm: - mmput(mm); out_free: kfree(pm.buffer); -out_task: - put_task_struct(task); +out_mm: + mmput(mm); out: return ret; } static int pagemap_open(struct inode *inode, struct file *file) { - /* do not disclose physical addresses: attack vector */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " - "to stop being page-shift some time soon. See the " - "linux/Documentation/vm/pagemap.txt for details.\n"); + struct mm_struct *mm; + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) + return PTR_ERR(mm); + file->private_data = mm; + return 0; +} + +static int pagemap_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + + if (mm) + mmdrop(mm); return 0; } @@ -1340,6 +1323,7 @@ const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, + .release = pagemap_release, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ diff --git a/fs/seq_file.c b/fs/seq_file.c index ce9e39fd5daf..225586e141ca 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/cred.h> #include <linux/mm.h> +#include <linux/printk.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -371,16 +372,16 @@ EXPORT_SYMBOL(seq_release); * @esc: set of characters that need escaping * * Puts string into buffer, replacing each occurrence of character from - * @esc with usual octal escape. Returns 0 in case of success, -1 - in - * case of overflow. + * @esc with usual octal escape. + * Use seq_has_overflowed() to check for errors. */ -int seq_escape(struct seq_file *m, const char *s, const char *esc) +void seq_escape(struct seq_file *m, const char *s, const char *esc) { char *end = m->buf + m->size; - char *p; + char *p; char c; - for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) { + for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) { if (!strchr(esc, c)) { *p++ = c; continue; @@ -393,14 +394,13 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc) continue; } seq_set_overflow(m); - return -1; - } + return; + } m->count = p - m->buf; - return 0; } EXPORT_SYMBOL(seq_escape); -int seq_vprintf(struct seq_file *m, const char *f, va_list args) +void seq_vprintf(struct seq_file *m, const char *f, va_list args) { int len; @@ -408,24 +408,20 @@ int seq_vprintf(struct seq_file *m, const char *f, va_list args) len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); if (m->count + len < m->size) { m->count += len; - return 0; + return; } } seq_set_overflow(m); - return -1; } EXPORT_SYMBOL(seq_vprintf); -int seq_printf(struct seq_file *m, const char *f, ...) +void seq_printf(struct seq_file *m, const char *f, ...) { - int ret; va_list args; va_start(args, f); - ret = seq_vprintf(m, f, args); + seq_vprintf(m, f, args); va_end(args); - - return ret; } EXPORT_SYMBOL(seq_printf); @@ -663,26 +659,25 @@ int seq_open_private(struct file *filp, const struct seq_operations *ops, } EXPORT_SYMBOL(seq_open_private); -int seq_putc(struct seq_file *m, char c) +void seq_putc(struct seq_file *m, char c) { - if (m->count < m->size) { - m->buf[m->count++] = c; - return 0; - } - return -1; + if (m->count >= m->size) + return; + + m->buf[m->count++] = c; } EXPORT_SYMBOL(seq_putc); -int seq_puts(struct seq_file *m, const char *s) +void seq_puts(struct seq_file *m, const char *s) { int len = strlen(s); - if (m->count + len < m->size) { - memcpy(m->buf + m->count, s, len); - m->count += len; - return 0; + + if (m->count + len >= m->size) { + seq_set_overflow(m); + return; } - seq_set_overflow(m); - return -1; + memcpy(m->buf + m->count, s, len); + m->count += len; } EXPORT_SYMBOL(seq_puts); @@ -693,8 +688,8 @@ EXPORT_SYMBOL(seq_puts); * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ -int seq_put_decimal_ull(struct seq_file *m, char delimiter, - unsigned long long num) +void seq_put_decimal_ull(struct seq_file *m, char delimiter, + unsigned long long num) { int len; @@ -706,35 +701,33 @@ int seq_put_decimal_ull(struct seq_file *m, char delimiter, if (num < 10) { m->buf[m->count++] = num + '0'; - return 0; + return; } len = num_to_str(m->buf + m->count, m->size - m->count, num); if (!len) goto overflow; m->count += len; - return 0; + return; + overflow: seq_set_overflow(m); - return -1; } EXPORT_SYMBOL(seq_put_decimal_ull); -int seq_put_decimal_ll(struct seq_file *m, char delimiter, - long long num) +void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num) { if (num < 0) { if (m->count + 3 >= m->size) { seq_set_overflow(m); - return -1; + return; } if (delimiter) m->buf[m->count++] = delimiter; num = -num; delimiter = '-'; } - return seq_put_decimal_ull(m, delimiter, num); - + seq_put_decimal_ull(m, delimiter, num); } EXPORT_SYMBOL(seq_put_decimal_ll); @@ -773,6 +766,47 @@ void seq_pad(struct seq_file *m, char c) } EXPORT_SYMBOL(seq_pad); +/* A complete analogue of print_hex_dump() */ +void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, + int rowsize, int groupsize, const void *buf, size_t len, + bool ascii) +{ + const u8 *ptr = buf; + int i, linelen, remaining = len; + int ret; + + if (rowsize != 16 && rowsize != 32) + rowsize = 16; + + for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) { + linelen = min(remaining, rowsize); + remaining -= rowsize; + + switch (prefix_type) { + case DUMP_PREFIX_ADDRESS: + seq_printf(m, "%s%p: ", prefix_str, ptr + i); + break; + case DUMP_PREFIX_OFFSET: + seq_printf(m, "%s%.8x: ", prefix_str, i); + break; + default: + seq_printf(m, "%s", prefix_str); + break; + } + + ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, + m->buf + m->count, m->size - m->count, + ascii); + if (ret >= m->size - m->count) { + seq_set_overflow(m); + } else { + m->count += ret; + seq_putc(m, '\n'); + } + } +} +EXPORT_SYMBOL(seq_hex_dump); + struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index fb8b54eb77c5..dc5fae601c24 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -417,14 +417,14 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, if (oldcount == 0) { result = ufs_alloc_fragments (inode, cgno, goal, count, err); if (result) { + ufs_clear_frags(inode, result + oldcount, + newcount - oldcount, locked_page != NULL); write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); write_sequnlock(&UFS_I(inode)->meta_lock); *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); - ufs_clear_frags(inode, result + oldcount, - newcount - oldcount, locked_page != NULL); } mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 331c1ccf8264..c79b717d9b88 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -23,6 +23,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/fs.h> +#include <linux/dax.h> #include <linux/buffer_head.h> #include <linux/uio.h> #include <linux/list_lru.h> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index de2c2376242b..e78feb400e22 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1546,8 +1546,36 @@ xfs_filemap_fault( return ret; } +STATIC int +xfs_filemap_pmd_fault( + struct vm_area_struct *vma, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct inode *inode = file_inode(vma->vm_file); + struct xfs_inode *ip = XFS_I(inode); + int ret; + + if (!IS_DAX(inode)) + return VM_FAULT_FALLBACK; + + trace_xfs_filemap_pmd_fault(ip); + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct, + xfs_end_io_dax_write); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + sb_end_pagefault(inode->i_sb); + + return ret; +} + static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, + .pmd_fault = xfs_filemap_pmd_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, }; @@ -1560,7 +1588,7 @@ xfs_file_mmap( file_accessed(filp); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(file_inode(filp))) - vma->vm_flags |= VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; return 0; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 9aeeb21bc3d0..5ed36b1e04c1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -687,6 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DEFINE_INODE_EVENT(xfs_filemap_fault); +DEFINE_INODE_EVENT(xfs_filemap_pmd_fault); DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); DECLARE_EVENT_CLASS(xfs_iref_class, |