Diffstat (limited to 'fs')
-rw-r--r--  fs/affs/super.c | 8
-rw-r--r--  fs/block_dev.c | 5
-rw-r--r--  fs/btrfs/async-thread.c | 57
-rw-r--r--  fs/btrfs/async-thread.h | 2
-rw-r--r--  fs/btrfs/dev-replace.c | 3
-rw-r--r--  fs/btrfs/disk-io.c | 76
-rw-r--r--  fs/btrfs/disk-io.h | 1
-rw-r--r--  fs/btrfs/inode.c | 3
-rw-r--r--  fs/btrfs/scrub.c | 12
-rw-r--r--  fs/btrfs/tree-defrag.c | 3
-rw-r--r--  fs/btrfs/volumes.c | 21
-rw-r--r--  fs/ceph/addr.c | 8
-rw-r--r--  fs/ceph/caps.c | 8
-rw-r--r--  fs/ceph/file.c | 14
-rw-r--r--  fs/ceph/mds_client.c | 59
-rw-r--r--  fs/ceph/mds_client.h | 1
-rw-r--r--  fs/ceph/snap.c | 7
-rw-r--r--  fs/ceph/super.c | 1
-rw-r--r--  fs/cifs/cifs_ioctl.h | 42
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifspdu.h | 14
-rw-r--r--  fs/cifs/cifssmb.c | 7
-rw-r--r--  fs/cifs/file.c | 2
-rw-r--r--  fs/cifs/ioctl.c | 48
-rw-r--r--  fs/cifs/smb2pdu.c | 7
-rw-r--r--  fs/cifs/transport.c | 2
-rw-r--r--  fs/coda/upcall.c | 6
-rw-r--r--  fs/coredump.c | 46
-rw-r--r--  fs/dax.c | 261
-rw-r--r--  fs/debugfs/file.c | 14
-rw-r--r--  fs/ecryptfs/crypto.c | 3
-rw-r--r--  fs/ecryptfs/dentry.c | 16
-rw-r--r--  fs/ext2/file.c | 10
-rw-r--r--  fs/ext2/inode.c | 1
-rw-r--r--  fs/ext4/ext4.h | 2
-rw-r--r--  fs/ext4/file.c | 68
-rw-r--r--  fs/ext4/indirect.c | 1
-rw-r--r--  fs/ext4/inode.c | 12
-rw-r--r--  fs/fs-writeback.c | 152
-rw-r--r--  fs/gfs2/glock.c | 348
-rw-r--r--  fs/gfs2/glops.c | 38
-rw-r--r--  fs/gfs2/incore.h | 15
-rw-r--r--  fs/gfs2/lock_dlm.c | 12
-rw-r--r--  fs/gfs2/lops.c | 6
-rw-r--r--  fs/gfs2/meta_io.c | 6
-rw-r--r--  fs/gfs2/meta_io.h | 2
-rw-r--r--  fs/gfs2/quota.c | 22
-rw-r--r--  fs/gfs2/rgrp.c | 10
-rw-r--r--  fs/gfs2/trace_gfs2.h | 34
-rw-r--r--  fs/gfs2/trans.c | 4
-rw-r--r--  fs/hfs/bnode.c | 9
-rw-r--r--  fs/hfs/brec.c | 20
-rw-r--r--  fs/hfsplus/bnode.c | 3
-rw-r--r--  fs/hugetlbfs/inode.c | 302
-rw-r--r--  fs/kernfs/dir.c | 23
-rw-r--r--  fs/namei.c | 2
-rw-r--r--  fs/nsfs.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 6
-rw-r--r--  fs/proc/base.c | 113
-rw-r--r--  fs/proc/generic.c | 44
-rw-r--r--  fs/proc/page.c | 65
-rw-r--r--  fs/proc/task_mmu.c | 290
-rw-r--r--  fs/seq_file.c | 112
-rw-r--r--  fs/ufs/balloc.c | 4
-rw-r--r--  fs/xfs/xfs_buf.h | 1
-rw-r--r--  fs/xfs/xfs_file.c | 30
-rw-r--r--  fs/xfs/xfs_trace.h | 1
67 files changed, 1613 insertions, 917 deletions
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3f89c9e05b40..5b50c4ca43a7 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/writeback.h>
+#include <linux/blkdev.h>
#include "affs.h"
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* blocks, we will have to change it.
*/
- size = sb->s_bdev->bd_inode->i_size >> 9;
+ size = i_size_read(sb->s_bdev->bd_inode) >> 9;
pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
/* Try to find root block. Its location depends on the block size. */
- i = 512;
- j = 4096;
+ i = bdev_logical_block_size(sb->s_bdev);
+ j = PAGE_SIZE;
if (blocksize > 0) {
i = j = blocksize;
size = size / (blocksize / 512);
}
+
for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
sbi->s_root_block = root_block;
if (root_block < 0)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 33b813e04f79..22ea424ee741 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -28,6 +28,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
* accessible at this address.
*/
long bdev_direct_access(struct block_device *bdev, sector_t sector,
- void **addr, unsigned long *pfn, long size)
+ void __pmem **addr, unsigned long *pfn, long size)
{
long avail;
const struct block_device_operations *ops = bdev->bd_disk->fops;
@@ -462,7 +463,7 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
sector += get_start_sect(bdev);
if (sector % (PAGE_SIZE / 512))
return -EINVAL;
- avail = ops->direct_access(bdev, sector, addr, pfn, size);
+ avail = ops->direct_access(bdev, sector, addr, pfn);
if (!avail)
return -ERANGE;
return min(avail, size);
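
For context, the contract otherwise stays the same: bdev_direct_access() still returns the number of bytes usable at the translated address (capped at the requested size), so callers loop until their extent is covered. A minimal sketch of such a caller, modelled on dax_clear_blocks() from this series -- zero_extent() is a hypothetical name and error handling is trimmed:

static long zero_extent(struct block_device *bdev, sector_t sector, long size)
{
	while (size > 0) {
		void __pmem *addr;
		unsigned long pfn;
		long count = bdev_direct_access(bdev, sector, &addr, &pfn,
						size);

		if (count < 0)
			return count;		/* e.g. -EINVAL, -ERANGE */
		clear_pmem(addr, count);	/* non-temporal stores */
		size -= count;
		sector += count >> 9;
	}
	wmb_pmem();		/* make the cleared range durable */
	return 0;
}
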
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 1ce06c849a86..3e36e4adc4a3 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -42,8 +42,14 @@ struct __btrfs_workqueue {
/* Thresholding related variants */
atomic_t pending;
- int max_active;
- int current_max;
+
+ /* Upper limit of concurrent workers */
+ int limit_active;
+
+ /* Current number of concurrent workers */
+ int current_active;
+
+ /* Threshold to change current_active */
int thresh;
unsigned int count;
spinlock_t thres_lock;
@@ -88,7 +94,7 @@ BTRFS_WORK_HELPER(scrubnc_helper);
BTRFS_WORK_HELPER(scrubparity_helper);
static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
+__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
int thresh)
{
struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -96,26 +102,31 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
if (!ret)
return NULL;
- ret->max_active = max_active;
+ ret->limit_active = limit_active;
atomic_set(&ret->pending, 0);
if (thresh == 0)
thresh = DFT_THRESHOLD;
/* For low threshold, disabling threshold is a better choice */
if (thresh < DFT_THRESHOLD) {
- ret->current_max = max_active;
+ ret->current_active = limit_active;
ret->thresh = NO_THRESHOLD;
} else {
- ret->current_max = 1;
+ /*
+ * For threshold-able wq, let its concurrency grow on demand.
+ * Use minimal max_active at alloc time to reduce resource
+ * usage.
+ */
+ ret->current_active = 1;
ret->thresh = thresh;
}
if (flags & WQ_HIGHPRI)
ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
- ret->max_active,
- "btrfs", name);
+ ret->current_active, "btrfs",
+ name);
else
ret->normal_wq = alloc_workqueue("%s-%s", flags,
- ret->max_active, "btrfs",
+ ret->current_active, "btrfs",
name);
if (!ret->normal_wq) {
kfree(ret);
@@ -134,7 +145,7 @@ __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
unsigned int flags,
- int max_active,
+ int limit_active,
int thresh)
{
struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -143,14 +154,14 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
return NULL;
ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
- max_active, thresh);
+ limit_active, thresh);
if (!ret->normal) {
kfree(ret);
return NULL;
}
if (flags & WQ_HIGHPRI) {
- ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+ ret->high = __btrfs_alloc_workqueue(name, flags, limit_active,
thresh);
if (!ret->high) {
__btrfs_destroy_workqueue(ret->normal);
@@ -180,7 +191,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
*/
static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
{
- int new_max_active;
+ int new_current_active;
long pending;
int need_change = 0;
@@ -197,7 +208,7 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
wq->count %= (wq->thresh / 4);
if (!wq->count)
goto out;
- new_max_active = wq->current_max;
+ new_current_active = wq->current_active;
/*
* pending may be changed later, but it's OK since we really
@@ -205,19 +216,19 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
*/
pending = atomic_read(&wq->pending);
if (pending > wq->thresh)
- new_max_active++;
+ new_current_active++;
if (pending < wq->thresh / 2)
- new_max_active--;
- new_max_active = clamp_val(new_max_active, 1, wq->max_active);
- if (new_max_active != wq->current_max) {
+ new_current_active--;
+ new_current_active = clamp_val(new_current_active, 1, wq->limit_active);
+ if (new_current_active != wq->current_active) {
need_change = 1;
- wq->current_max = new_max_active;
+ wq->current_active = new_current_active;
}
out:
spin_unlock(&wq->thres_lock);
if (need_change) {
- workqueue_set_max_active(wq->normal_wq, wq->current_max);
+ workqueue_set_max_active(wq->normal_wq, wq->current_active);
}
}
@@ -351,13 +362,13 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
kfree(wq);
}
-void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
{
if (!wq)
return;
- wq->normal->max_active = max;
+ wq->normal->limit_active = limit_active;
if (wq->high)
- wq->high->max_active = max;
+ wq->high->limit_active = limit_active;
}
void btrfs_set_work_high_priority(struct btrfs_work *work)
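
With the renaming, the thresholding heuristic in thresh_exec_hook() reads naturally: current_active grows by one when the pending backlog exceeds thresh, shrinks by one when the backlog falls below thresh / 2, and is always clamped to [1, limit_active]. A standalone illustration of just that arithmetic (plain userspace C, not kernel code; names hypothetical):

#include <stdio.h>

static int next_active(int current_active, int limit_active,
		       long pending, int thresh)
{
	int n = current_active;

	if (pending > thresh)
		n++;
	if (pending < thresh / 2)
		n--;
	if (n < 1)			/* clamp_val(n, 1, limit_active) */
		n = 1;
	if (n > limit_active)
		n = limit_active;
	return n;
}

int main(void)
{
	/* thresh = 64, limit_active = 8: a backlog of 100 grows the pool */
	printf("%d\n", next_active(4, 8, 100, 64));	/* prints 5 */
	/* ...while a backlog of 10 shrinks it again */
	printf("%d\n", next_active(4, 8, 10, 64));	/* prints 3 */
	return 0;
}
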
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index b0b093b6afec..ad4d0647d1a6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -69,7 +69,7 @@ BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
unsigned int flags,
- int max_active,
+ int limit_active,
int thresh);
void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
btrfs_func_t func,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564a7de17d99..e54dd5905cee 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -183,8 +183,7 @@ no_valid_dev_replace_entry_found:
}
out:
- if (path)
- btrfs_free_path(path);
+ btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9ebd34f1c677..0d98aee34fee 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3443,6 +3443,26 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
return 0;
}
+int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
+{
+ if ((flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_AVAIL_ALLOC_BIT_SINGLE)) ||
+ ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
+ return 0;
+
+ if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ return 1;
+
+ if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ return 2;
+
+ pr_warn("BTRFS: unknown raid type: %llu\n", flags);
+ return 0;
+}
+
int btrfs_calc_num_tolerated_disk_barrier_failures(
struct btrfs_fs_info *fs_info)
{
@@ -3452,13 +3472,12 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
BTRFS_BLOCK_GROUP_SYSTEM,
BTRFS_BLOCK_GROUP_METADATA,
BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
- int num_types = 4;
int i;
int c;
int num_tolerated_disk_barrier_failures =
(int)fs_info->fs_devices->num_devices;
- for (i = 0; i < num_types; i++) {
+ for (i = 0; i < ARRAY_SIZE(types); i++) {
struct btrfs_space_info *tmp;
sinfo = NULL;
@@ -3476,44 +3495,21 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
down_read(&sinfo->groups_sem);
for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
- if (!list_empty(&sinfo->block_groups[c])) {
- u64 flags;
-
- btrfs_get_block_group_info(
- &sinfo->block_groups[c], &space);
- if (space.total_bytes == 0 ||
- space.used_bytes == 0)
- continue;
- flags = space.flags;
- /*
- * return
- * 0: if dup, single or RAID0 is configured for
- * any of metadata, system or data, else
- * 1: if RAID5 is configured, or if RAID1 or
- * RAID10 is configured and only two mirrors
- * are used, else
- * 2: if RAID6 is configured, else
- * num_mirrors - 1: if RAID1 or RAID10 is
- * configured and more than
- * 2 mirrors are used.
- */
- if (num_tolerated_disk_barrier_failures > 0 &&
- ((flags & (BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID0)) ||
- ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
- == 0)))
- num_tolerated_disk_barrier_failures = 0;
- else if (num_tolerated_disk_barrier_failures > 1) {
- if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID10)) {
- num_tolerated_disk_barrier_failures = 1;
- } else if (flags &
- BTRFS_BLOCK_GROUP_RAID6) {
- num_tolerated_disk_barrier_failures = 2;
- }
- }
- }
+ u64 flags;
+
+ if (list_empty(&sinfo->block_groups[c]))
+ continue;
+
+ btrfs_get_block_group_info(&sinfo->block_groups[c],
+ &space);
+ if (space.total_bytes == 0 || space.used_bytes == 0)
+ continue;
+ flags = space.flags;
+
+ num_tolerated_disk_barrier_failures = min(
+ num_tolerated_disk_barrier_failures,
+ btrfs_get_num_tolerated_disk_barrier_failures(
+ flags));
}
up_read(&sinfo->groups_sem);
}
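
A worked example of the refactored logic: per the profile bits tested above, DUP/RAID0/single yield 0 tolerated failures, RAID1/RAID5/RAID10 yield 1, and RAID6 yields 2, and the loop keeps the minimum over every occupied block-group type. A hypothetical sketch (not from the patch):

/* Hypothetical mix: metadata on RAID1, data on RAID0.  The filesystem
 * as a whole then tolerates min(1, 0) = 0 failed devices. */
static int tolerated_for_mix(void)
{
	int meta = btrfs_get_num_tolerated_disk_barrier_failures(
					BTRFS_BLOCK_GROUP_RAID1);
	int data = btrfs_get_num_tolerated_disk_barrier_failures(
					BTRFS_BLOCK_GROUP_RAID0);

	return min(meta, data);		/* 0 */
}
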
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index d4cbfeeeedd4..bdfb479ea859 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -139,6 +139,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
+int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_calc_num_tolerated_disk_barrier_failures(
struct btrfs_fs_info *fs_info);
int __init btrfs_end_io_wq_init(void);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 237da012f7d0..a0fa7253a2d7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6909,8 +6909,7 @@ out:
trace_btrfs_get_extent(root, em);
- if (path)
- btrfs_free_path(path);
+ btrfs_free_path(path);
if (trans) {
ret = btrfs_end_transaction(trans, root);
if (!err)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 9a11db0c47ee..a39f5d1144e8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3267,13 +3267,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
scrub_blocked_if_needed(fs_info);
}
- /* for raid56, we skip parity stripe */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = get_raid56_logic_offset(physical, num, map,
&logical,
&stripe_logical);
logical += base;
if (ret) {
+ /* it is a parity stripe */
stripe_logical += base;
stripe_end = stripe_logical + increment;
ret = scrub_raid56_parity(sctx, map, scrub_dev,
@@ -3480,7 +3480,6 @@ out:
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
- u64 chunk_tree, u64 chunk_objectid,
u64 chunk_offset, u64 length,
u64 dev_offset, int is_dev_replace)
{
@@ -3531,8 +3530,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_root *root = sctx->dev_root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 length;
- u64 chunk_tree;
- u64 chunk_objectid;
u64 chunk_offset;
int ret = 0;
int slot;
@@ -3596,8 +3593,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (found_key.offset + length <= start)
goto skip;
- chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
- chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
/*
@@ -3630,9 +3625,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
- ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
- chunk_offset, length, found_key.offset,
- is_dev_replace);
+ ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
+ found_key.offset, is_dev_replace);
/*
* flush, submit all pending read and write bios, afterwards
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a4b9c8b2d35a..f31db4325339 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -115,8 +115,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
}
out:
- if (path)
- btrfs_free_path(path);
+ btrfs_free_path(path);
if (ret == -EAGAIN) {
if (root->defrag_max.objectid > root->defrag_progress.objectid)
goto done;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 76201d6f6ce4..6fc735869c18 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3585,23 +3585,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
} while (read_seqretry(&fs_info->profiles_lock, seq));
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
- int num_tolerated_disk_barrier_failures;
- u64 target = bctl->sys.target;
-
- num_tolerated_disk_barrier_failures =
- btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
- if (num_tolerated_disk_barrier_failures > 0 &&
- (target &
- (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
- num_tolerated_disk_barrier_failures = 0;
- else if (num_tolerated_disk_barrier_failures > 1 &&
- (target &
- (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
- num_tolerated_disk_barrier_failures = 1;
-
- fs_info->num_tolerated_disk_barrier_failures =
- num_tolerated_disk_barrier_failures;
+ fs_info->num_tolerated_disk_barrier_failures = min(
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),
+ btrfs_get_num_tolerated_disk_barrier_failures(
+ bctl->sys.target));
}
ret = insert_balance_item(fs_info->tree_root, bctl);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 890c50971a69..9d23e788d1df 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
for (i = 0; i < num_pages; i++) {
struct page *page = osd_data->pages[i];
- if (rc < 0)
+ if (rc < 0 && rc != -ENOENT)
goto unlock;
if (bytes < (int)PAGE_CACHE_SIZE) {
/* zero (remainder of) page */
@@ -717,8 +717,10 @@ static int ceph_writepages_start(struct address_space *mapping,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
pr_warn("writepage_start %p on forced umount\n", inode);
+ truncate_pagecache(inode, 0);
+ mapping_set_error(mapping, -EIO);
return -EIO; /* we're in a forced umount, don't write! */
}
if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
@@ -1593,7 +1595,7 @@ out:
return err;
}
-static struct vm_operations_struct ceph_vmops = {
+static const struct vm_operations_struct ceph_vmops = {
.fault = ceph_filemap_fault,
.page_mkwrite = ceph_page_mkwrite,
};
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ddd5e9471290..27b566874bc1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2413,6 +2413,14 @@ again:
goto out_unlock;
}
+ if (!__ceph_is_any_caps(ci) &&
+ ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout("get_cap_refs %p forced umount\n", inode);
+ *err = -EIO;
+ ret = 1;
+ goto out_unlock;
+ }
+
dout("get_cap_refs %p have %s needed %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need));
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8b79d87eaf46..0c62868b5c56 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -136,7 +136,6 @@ int ceph_open(struct inode *inode, struct file *file)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct ceph_file_info *cf = file->private_data;
- struct inode *parent_inode = NULL;
int err;
int flags, fmode, wanted;
@@ -210,10 +209,7 @@ int ceph_open(struct inode *inode, struct file *file)
ihold(inode);
req->r_num_caps = 1;
- if (flags & O_CREAT)
- parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
if (!err)
err = ceph_init_file(inode, file, req->r_fmode);
ceph_mdsc_put_request(req);
@@ -279,7 +275,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (err)
goto out_req;
- if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+ if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
if (d_unhashed(dentry)) {
@@ -956,6 +952,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
+ if (iocb->ki_flags & IOCB_APPEND) {
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+ if (err < 0)
+ goto out;
+ }
+
err = generic_write_checks(iocb, from);
if (err <= 0)
goto out;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6aa07af67603..51cb02da75d9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2107,7 +2107,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
msg = create_request_message(mdsc, req, mds, drop_cap_releases);
if (IS_ERR(msg)) {
req->r_err = PTR_ERR(msg);
- complete_request(mdsc, req);
return PTR_ERR(msg);
}
req->r_request = msg;
@@ -2135,7 +2134,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
{
struct ceph_mds_session *session = NULL;
int mds = -1;
- int err = -EAGAIN;
+ int err = 0;
if (req->r_err || req->r_got_result) {
if (req->r_aborted)
@@ -2149,6 +2148,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
err = -EIO;
goto finish;
}
+ if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout("do_request forced umount\n");
+ err = -EIO;
+ goto finish;
+ }
put_request_session(req);
@@ -2196,13 +2200,15 @@ static int __do_request(struct ceph_mds_client *mdsc,
out_session:
ceph_put_mds_session(session);
+finish:
+ if (err) {
+ dout("__do_request early error %d\n", err);
+ req->r_err = err;
+ complete_request(mdsc, req);
+ __unregister_request(mdsc, req);
+ }
out:
return err;
-
-finish:
- req->r_err = err;
- complete_request(mdsc, req);
- goto out;
}
/*
@@ -2289,8 +2295,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
if (req->r_err) {
err = req->r_err;
- __unregister_request(mdsc, req);
- dout("do_request early error %d\n", err);
goto out;
}
@@ -2411,7 +2415,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_unlock(&mdsc->mutex);
goto out;
}
- if (req->r_got_safe && !head->safe) {
+ if (req->r_got_safe) {
pr_warn("got unsafe after safe on %llu from mds%d\n",
tid, mds);
mutex_unlock(&mdsc->mutex);
@@ -2520,8 +2524,7 @@ out_err:
if (err) {
req->r_err = err;
} else {
- req->r_reply = msg;
- ceph_msg_get(msg);
+ req->r_reply = ceph_msg_get(msg);
req->r_got_result = true;
}
} else {
@@ -3555,7 +3558,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
u64 want_tid, want_flush, want_snap;
- if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return;
dout("sync\n");
@@ -3584,7 +3587,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
*/
static bool done_closing_sessions(struct ceph_mds_client *mdsc)
{
- if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return true;
return atomic_read(&mdsc->num_sessions) == 0;
}
@@ -3643,6 +3646,34 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
dout("stopped\n");
}
+void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_session *session;
+ int mds;
+
+ dout("force umount\n");
+
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; mds < mdsc->max_sessions; mds++) {
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ if (!session)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&session->s_mutex);
+ __close_session(mdsc, session);
+ if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
+ cleanup_session_requests(mdsc, session);
+ remove_session_caps(session);
+ }
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ kick_requests(mdsc, mds);
+ }
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+ mutex_unlock(&mdsc->mutex);
+}
+
static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
dout("stop\n");
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 762757e6cebf..f575eafe2261 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -366,6 +366,7 @@ extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 233d906aec02..4aa7122a8d38 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -338,12 +338,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
return 0;
}
- if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
- ceph_get_snap_context(ceph_empty_snapc);
- snapc = ceph_empty_snapc;
- goto done;
- }
-
/* alloc new snap context */
err = -ENOMEM;
if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
@@ -381,7 +375,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
realm->ino, realm, snapc, snapc->seq,
(unsigned int) snapc->num_snaps);
-done:
ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
return 0;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7b6bfcbf801c..f446afada328 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -708,6 +708,7 @@ static void ceph_umount_begin(struct super_block *sb)
if (!fsc)
return;
fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+ ceph_mdsc_force_umount(fsc->mdsc);
return;
}
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
new file mode 100644
index 000000000000..0065256881d8
--- /dev/null
+++ b/fs/cifs/cifs_ioctl.h
@@ -0,0 +1,42 @@
+/*
+ * fs/cifs/cifs_ioctl.h
+ *
+ * Structure definitions for io control for cifs/smb3
+ *
+ * Copyright (c) 2015 Steve French <steve.french@primarydata.com>
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ */
+
+struct smb_mnt_fs_info {
+ __u32 version; /* 0001 */
+ __u16 protocol_id;
+ __u16 tcon_flags;
+ __u32 vol_serial_number;
+ __u32 vol_create_time;
+ __u32 share_caps;
+ __u32 share_flags;
+ __u32 sector_flags;
+ __u32 optimal_sector_size;
+ __u32 max_bytes_chunk;
+ __u32 fs_attributes;
+ __u32 max_path_component;
+ __u32 device_type;
+ __u32 device_characteristics;
+ __u32 maximal_access;
+ __u64 cifs_posix_caps;
+} __packed;
+
+#define CIFS_IOCTL_MAGIC 0xCF
+#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
+#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
+#define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info)
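
Because this header is consumable from userspace, the new CIFS_IOC_GET_MNT_INFO ioctl can be driven by a small test program. A hedged sketch, assuming the header is on the include path and that __packed is mapped to the usual GCC attribute (the __u16/__u32/__u64 types come from <linux/types.h>):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#define __packed __attribute__((packed))
#include "cifs_ioctl.h"	/* struct smb_mnt_fs_info, CIFS_IOC_GET_MNT_INFO */

int main(int argc, char **argv)
{
	struct smb_mnt_fs_info info = { 0 };
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0 || ioctl(fd, CIFS_IOC_GET_MNT_INFO, &info) < 0) {
		perror("CIFS_IOC_GET_MNT_INFO");
		return 1;
	}
	printf("version %u, protocol id 0x%x, share caps 0x%x\n",
	       info.version, info.protocol_id, info.share_caps);
	close(fd);
	return 0;
}
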
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a782b22904e4..27aea110e923 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.06"
+#define CIFS_VERSION "2.07"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 47b030da0781..f5b87303ce46 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2245,6 +2245,20 @@ typedef struct {
#define FILE_DEVICE_VIRTUAL_DISK 0x00000024
#define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028
+/* Device Characteristics */
+#define FILE_REMOVABLE_MEDIA 0x00000001
+#define FILE_READ_ONLY_DEVICE 0x00000002
+#define FILE_FLOPPY_DISKETTE 0x00000004
+#define FILE_WRITE_ONCE_MEDIA 0x00000008
+#define FILE_REMOTE_DEVICE 0x00000010
+#define FILE_DEVICE_IS_MOUNTED 0x00000020
+#define FILE_VIRTUAL_VOLUME 0x00000040
+#define FILE_DEVICE_SECURE_OPEN 0x00000100
+#define FILE_CHARACTERISTIC_TS_DEVICE 0x00001000
+#define FILE_CHARACTERISTIC_WEBDAV_DEVICE 0x00002000
+#define FILE_PORTABLE_DEVICE 0x00004000
+#define FILE_DEVICE_ALLOW_APPCONTAINER_TRAVERSAL 0x00020000
+
typedef struct {
__le32 DeviceType;
__le32 DeviceCharacteristics;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 672ef35c9f73..90b4f9f7de66 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -696,7 +696,9 @@ cifs_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(server, 1, CIFS_ECHO_OP);
}
@@ -1572,7 +1574,9 @@ cifs_readv_callback(struct mid_q_entry *mid)
}
queue_work(cifsiod_wq, &rdata->work);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(server, 1, 0);
}
@@ -2032,6 +2036,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
{
struct cifs_writedata *wdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
unsigned int written;
WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
@@ -2068,7 +2073,9 @@ cifs_writev_callback(struct mid_q_entry *mid)
}
queue_work(cifsiod_wq, &wdata->work);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(tcon->ses->server, 1, 0);
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3f50cee79df9..e2a6af1508af 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3216,7 +3216,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
return VM_FAULT_LOCKED;
}
-static struct vm_operations_struct cifs_file_vm_ops = {
+static const struct vm_operations_struct cifs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = cifs_page_mkwrite,
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 49b8b6e41a18..c63f5227b681 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -31,12 +31,9 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifsfs.h"
+#include "cifs_ioctl.h"
#include <linux/btrfs.h>
-#define CIFS_IOCTL_MAGIC 0xCF
-#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
-#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
-
static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
unsigned long srcfd, u64 off, u64 len, u64 destoff,
bool dup_extents)
@@ -135,6 +132,43 @@ out_drop_write:
return rc;
}
+static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
+ void __user *arg)
+{
+ int rc = 0;
+ struct smb_mnt_fs_info *fsinf;
+
+ fsinf = kzalloc(sizeof(struct smb_mnt_fs_info), GFP_KERNEL);
+ if (fsinf == NULL)
+ return -ENOMEM;
+
+ fsinf->version = 1;
+ fsinf->protocol_id = tcon->ses->server->vals->protocol_id;
+ fsinf->device_characteristics =
+ le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics);
+ fsinf->device_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
+ fsinf->fs_attributes = le32_to_cpu(tcon->fsAttrInfo.Attributes);
+ fsinf->max_path_component =
+ le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength);
+#ifdef CONFIG_CIFS_SMB2
+ fsinf->vol_serial_number = tcon->vol_serial_number;
+ fsinf->vol_create_time = le64_to_cpu(tcon->vol_create_time);
+ fsinf->share_flags = tcon->share_flags;
+ fsinf->share_caps = le32_to_cpu(tcon->capabilities);
+ fsinf->sector_flags = tcon->ss_flags;
+ fsinf->optimal_sector_size = tcon->perf_sector_size;
+ fsinf->max_bytes_chunk = tcon->max_bytes_chunk;
+ fsinf->maximal_access = tcon->maximal_access;
+#endif /* SMB2 */
+ fsinf->cifs_posix_caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+
+ if (copy_to_user(arg, fsinf, sizeof(struct smb_mnt_fs_info)))
+ rc = -EFAULT;
+
+ kfree(fsinf);
+ return rc;
+}
+
long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
{
struct inode *inode = file_inode(filep);
@@ -148,8 +182,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
xid = get_xid();
- cifs_dbg(FYI, "ioctl file %p cmd %u arg %lu\n", filep, command, arg);
-
cifs_sb = CIFS_SB(inode->i_sb);
switch (command) {
@@ -228,6 +260,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
else
rc = -EOPNOTSUPP;
break;
+ case CIFS_IOC_GET_MNT_INFO:
+ tcon = tlink_tcon(pSMBFile->tlink);
+ rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg);
+ break;
default:
cifs_dbg(FYI, "unsupported ioctl\n");
break;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index b8b4f08ee094..070fb2ad85ce 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1626,7 +1626,9 @@ smb2_echo_callback(struct mid_q_entry *mid)
if (mid->mid_state == MID_RESPONSE_RECEIVED)
credits_received = le16_to_cpu(smb2->hdr.CreditRequest);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(server, credits_received, CIFS_ECHO_OP);
}
@@ -1810,7 +1812,9 @@ smb2_readv_callback(struct mid_q_entry *mid)
cifs_stats_fail_inc(tcon, SMB2_READ_HE);
queue_work(cifsiod_wq, &rdata->work);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(server, credits_received, 0);
}
@@ -1938,6 +1942,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
{
struct cifs_writedata *wdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
unsigned int written;
struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
unsigned int credits_received = 1;
@@ -1977,7 +1982,9 @@ smb2_writev_callback(struct mid_q_entry *mid)
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
queue_work(cifsiod_wq, &wdata->work);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(tcon->ses->server, credits_received, 0);
}
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 126f46b887cc..2a24c524fb9a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -644,7 +644,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
}
spin_unlock(&GlobalMid_Lock);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
return rc;
}
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 9b1ffaa0572e..f6c6c8adbc01 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -353,7 +353,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
char *result;
insize = max_t(unsigned int,
- INSIZE(readlink), OUTSIZE(readlink)+ *length + 1);
+ INSIZE(readlink), OUTSIZE(readlink)+ *length);
UPARG(CODA_READLINK);
inp->coda_readlink.VFid = *fid;
@@ -361,8 +361,8 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
if (!error) {
retlen = outp->coda_readlink.count;
- if ( retlen > *length )
- retlen = *length;
+ if (retlen >= *length)
+ retlen = *length - 1;
*length = retlen;
result = (char *)outp + (long)outp->coda_readlink.data;
memcpy(buffer, result, retlen);
diff --git a/fs/coredump.c b/fs/coredump.c
index c5ecde6f3eed..a8f75640ac86 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -513,10 +513,10 @@ void do_coredump(const siginfo_t *siginfo)
const struct cred *old_cred;
struct cred *cred;
int retval = 0;
- int flag = 0;
int ispipe;
struct files_struct *displaced;
- bool need_nonrelative = false;
+ /* require nonrelative corefile path and be extra careful */
+ bool need_suid_safe = false;
bool core_dumped = false;
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
@@ -550,9 +550,8 @@ void do_coredump(const siginfo_t *siginfo)
*/
if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
/* Setuid core dump mode */
- flag = O_EXCL; /* Stop rewrite attacks */
cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
- need_nonrelative = true;
+ need_suid_safe = true;
}
retval = coredump_wait(siginfo->si_signo, &core_state);
@@ -633,7 +632,7 @@ void do_coredump(const siginfo_t *siginfo)
if (cprm.limit < binfmt->min_coredump)
goto fail_unlock;
- if (need_nonrelative && cn.corename[0] != '/') {
+ if (need_suid_safe && cn.corename[0] != '/') {
printk(KERN_WARNING "Pid %d(%s) can only dump core "\
"to fully qualified path!\n",
task_tgid_vnr(current), current->comm);
@@ -641,8 +640,35 @@ void do_coredump(const siginfo_t *siginfo)
goto fail_unlock;
}
+ /*
+ * Unlink the file if it exists unless this is a SUID
+ * binary - in that case, we're running around with root
+ * privs and don't want to unlink another user's coredump.
+ */
+ if (!need_suid_safe) {
+ mm_segment_t old_fs;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ /*
+ * If it doesn't exist, that's fine. If there's some
+ * other problem, we'll catch it at the filp_open().
+ */
+ (void) sys_unlink((const char __user *)cn.corename);
+ set_fs(old_fs);
+ }
+
+ /*
+ * There is a race between unlinking and creating the
+ * file, but if that causes an EEXIST here, that's
+ * fine - another process raced with us while creating
+ * the corefile, and the other process won. To userspace,
+ * what matters is that at least one of the two processes
+ * writes its coredump successfully, not which one.
+ */
cprm.file = filp_open(cn.corename,
- O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+ O_CREAT | 2 | O_NOFOLLOW |
+ O_LARGEFILE | O_EXCL,
0600);
if (IS_ERR(cprm.file))
goto fail_unlock;
@@ -659,11 +685,15 @@ void do_coredump(const siginfo_t *siginfo)
if (!S_ISREG(inode->i_mode))
goto close_fail;
/*
- * Dont allow local users get cute and trick others to coredump
- * into their pre-created files.
+ * Don't dump core if the filesystem changed owner or mode
+ * of the file during file creation. This is an issue when
+ * a process dumps core while its cwd is e.g. on a vfat
+ * filesystem.
*/
if (!uid_eq(inode->i_uid, current_fsuid()))
goto close_fail;
+ if ((inode->i_mode & 0677) != 0600)
+ goto close_fail;
if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
goto close_fail;
if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
diff --git a/fs/dax.c b/fs/dax.c
index a7f77e1fa18c..93bf2f990ace 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -17,12 +17,14 @@
#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
+#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
+#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
@@ -34,7 +36,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size)
might_sleep();
do {
- void *addr;
+ void __pmem *addr;
unsigned long pfn;
long count;
@@ -46,10 +48,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size)
unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
if (pgsz > count)
pgsz = count;
- if (pgsz < PAGE_SIZE)
- memset(addr, 0, pgsz);
- else
- clear_page(addr);
+ clear_pmem(addr, pgsz);
addr += pgsz;
size -= pgsz;
count -= pgsz;
@@ -59,26 +58,29 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size)
}
} while (size);
+ wmb_pmem();
return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);
-static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
+static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
+ unsigned blkbits)
{
unsigned long pfn;
sector_t sector = bh->b_blocknr << (blkbits - 9);
return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
}
-static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
- loff_t end)
+/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
+static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
+ loff_t pos, loff_t end)
{
loff_t final = end - pos + first; /* The final byte of the buffer */
if (first > 0)
- memset(addr, 0, first);
+ clear_pmem(addr, first);
if (final < size)
- memset(addr + final, 0, size - final);
+ clear_pmem(addr + final, size - final);
}
static bool buffer_written(struct buffer_head *bh)
@@ -106,14 +108,15 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
loff_t pos = start;
loff_t max = start;
loff_t bh_max = start;
- void *addr;
+ void __pmem *addr;
bool hole = false;
+ bool need_wmb = false;
if (iov_iter_rw(iter) != WRITE)
end = min(end, i_size_read(inode));
while (pos < end) {
- unsigned len;
+ size_t len;
if (pos == max) {
unsigned blkbits = inode->i_blkbits;
sector_t block = pos >> blkbits;
@@ -145,19 +148,23 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
retval = dax_get_addr(bh, &addr, blkbits);
if (retval < 0)
break;
- if (buffer_unwritten(bh) || buffer_new(bh))
+ if (buffer_unwritten(bh) || buffer_new(bh)) {
dax_new_buf(addr, retval, first, pos,
end);
+ need_wmb = true;
+ }
addr += first;
size = retval - first;
}
max = min(pos + size, end);
}
- if (iov_iter_rw(iter) == WRITE)
- len = copy_from_iter_nocache(addr, max - pos, iter);
- else if (!hole)
- len = copy_to_iter(addr, max - pos, iter);
+ if (iov_iter_rw(iter) == WRITE) {
+ len = copy_from_iter_pmem(addr, max - pos, iter);
+ need_wmb = true;
+ } else if (!hole)
+ len = copy_to_iter((void __force *)addr, max - pos,
+ iter);
else
len = iov_iter_zero(max - pos, iter);
@@ -168,6 +175,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
addr += len;
}
+ if (need_wmb)
+ wmb_pmem();
+
return (pos == start) ? retval : pos - start;
}
@@ -260,11 +270,13 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
static int copy_user_bh(struct page *to, struct buffer_head *bh,
unsigned blkbits, unsigned long vaddr)
{
- void *vfrom, *vto;
+ void __pmem *vfrom;
+ void *vto;
+
if (dax_get_addr(bh, &vfrom, blkbits) < 0)
return -EIO;
vto = kmap_atomic(to);
- copy_user_page(vto, vfrom, vaddr, to);
+ copy_user_page(vto, (void __force *)vfrom, vaddr, to);
kunmap_atomic(vto);
return 0;
}
@@ -272,16 +284,13 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct address_space *mapping = inode->i_mapping;
sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
unsigned long vaddr = (unsigned long)vmf->virtual_address;
- void *addr;
+ void __pmem *addr;
unsigned long pfn;
pgoff_t size;
int error;
- i_mmap_lock_read(mapping);
-
/*
* Check truncate didn't happen while we were allocating a block.
* If it did, this block may or may not be still allocated to the
@@ -303,14 +312,14 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
goto out;
}
- if (buffer_unwritten(bh) || buffer_new(bh))
- clear_page(addr);
+ if (buffer_unwritten(bh) || buffer_new(bh)) {
+ clear_pmem(addr, PAGE_SIZE);
+ wmb_pmem();
+ }
error = vm_insert_mixed(vma, vaddr, pfn);
out:
- i_mmap_unlock_read(mapping);
-
return error;
}
@@ -372,15 +381,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
* from a read fault and we've raced with a truncate
*/
error = -EIO;
- goto unlock_page;
+ goto unlock;
}
+ } else {
+ i_mmap_lock_write(mapping);
}
error = get_block(inode, block, &bh, 0);
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO; /* fs corruption? */
if (error)
- goto unlock_page;
+ goto unlock;
if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -391,8 +402,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO;
if (error)
- goto unlock_page;
+ goto unlock;
} else {
+ i_mmap_unlock_write(mapping);
return dax_load_hole(mapping, page, vmf);
}
}
@@ -404,17 +416,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
else
clear_user_highpage(new_page, vaddr);
if (error)
- goto unlock_page;
+ goto unlock;
vmf->page = page;
if (!page) {
- i_mmap_lock_read(mapping);
/* Check we didn't race with truncate */
size = (i_size_read(inode) + PAGE_SIZE - 1) >>
PAGE_SHIFT;
if (vmf->pgoff >= size) {
- i_mmap_unlock_read(mapping);
error = -EIO;
- goto out;
+ goto unlock;
}
}
return VM_FAULT_LOCKED;
@@ -450,6 +460,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
}
+ if (!page)
+ i_mmap_unlock_write(mapping);
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
@@ -458,11 +470,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
return VM_FAULT_SIGBUS | major;
return VM_FAULT_NOPAGE | major;
- unlock_page:
+ unlock:
if (page) {
unlock_page(page);
page_cache_release(page);
+ } else {
+ i_mmap_unlock_write(mapping);
}
+
goto out;
}
EXPORT_SYMBOL(__dax_fault);
@@ -494,6 +509,177 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
EXPORT_SYMBOL_GPL(dax_fault);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
+ * more often than one might expect in the below function.
+ */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head bh;
+ unsigned blkbits = inode->i_blkbits;
+ unsigned long pmd_addr = address & PMD_MASK;
+ bool write = flags & FAULT_FLAG_WRITE;
+ long length;
+ void __pmem *kaddr;
+ pgoff_t size, pgoff;
+ sector_t block, sector;
+ unsigned long pfn;
+ int result = 0;
+
+ /* Fall back to PTEs if we're going to COW */
+ if (write && !(vma->vm_flags & VM_SHARED))
+ return VM_FAULT_FALLBACK;
+ /* If the PMD would extend outside the VMA */
+ if (pmd_addr < vma->vm_start)
+ return VM_FAULT_FALLBACK;
+ if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+ return VM_FAULT_FALLBACK;
+
+ pgoff = linear_page_index(vma, pmd_addr);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size)
+ return VM_FAULT_SIGBUS;
+ /* If the PMD would cover blocks out of the file */
+ if ((pgoff | PG_PMD_COLOUR) >= size)
+ return VM_FAULT_FALLBACK;
+
+ memset(&bh, 0, sizeof(bh));
+ block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
+
+ bh.b_size = PMD_SIZE;
+ i_mmap_lock_write(mapping);
+ length = get_block(inode, block, &bh, write);
+ if (length)
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * If the filesystem isn't willing to tell us the length of a hole,
+ * just fall back to PTEs. Calling get_block 512 times in a loop
+ * would be silly.
+ */
+ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
+ goto fallback;
+
+ if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+ int i;
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+ wmb_pmem();
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ result |= VM_FAULT_MAJOR;
+ }
+
+ /*
+ * If we allocated new storage, make sure no process has any
+ * zero pages covering this hole
+ */
+ if (buffer_new(&bh)) {
+ i_mmap_unlock_write(mapping);
+ unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
+ i_mmap_lock_write(mapping);
+ }
+
+ /*
+ * If a truncate happened while we were allocating blocks, we may
+ * leave blocks allocated to the file that are beyond EOF. We can't
+ * take i_mutex here, so just leave them hanging; they'll be freed
+ * when the file is deleted.
+ */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((pgoff | PG_PMD_COLOUR) >= size)
+ goto fallback;
+
+ if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+ spinlock_t *ptl;
+ pmd_t entry;
+ struct page *zero_page = get_huge_zero_page();
+
+ if (unlikely(!zero_page))
+ goto fallback;
+
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_none(*pmd)) {
+ spin_unlock(ptl);
+ goto fallback;
+ }
+
+ entry = mk_pmd(zero_page, vma->vm_page_prot);
+ entry = pmd_mkhuge(entry);
+ set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
+ result = VM_FAULT_NOPAGE;
+ spin_unlock(ptl);
+ } else {
+ sector = bh.b_blocknr << (blkbits - 9);
+ length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
+ bh.b_size);
+ if (length < 0) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+ goto fallback;
+
+ result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+ }
+
+ out:
+ if (buffer_unwritten(&bh))
+ complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
+
+ i_mmap_unlock_write(mapping);
+
+ return result;
+
+ fallback:
+ count_vm_event(THP_FAULT_FALLBACK);
+ result = VM_FAULT_FALLBACK;
+ goto out;
+}
+EXPORT_SYMBOL_GPL(__dax_pmd_fault);
+
+/**
+ * dax_pmd_fault - handle a PMD fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * pmd_fault handler for DAX files.
+ */
+int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ int result;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ }
+ result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
+ complete_unwritten);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(sb);
+
+ return result;
+}
+EXPORT_SYMBOL_GPL(dax_pmd_fault);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
/**
* dax_pfn_mkwrite - handle first write to DAX page
* @vma: The virtual memory area where the fault occurred
@@ -548,11 +734,12 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
if (err < 0)
return err;
if (buffer_written(&bh)) {
- void *addr;
+ void __pmem *addr;
err = dax_get_addr(&bh, &addr, inode->i_blkbits);
if (err < 0)
return err;
- memset(addr + offset, 0, length);
+ clear_pmem(addr + offset, length);
+ wmb_pmem();
}
return 0;
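
To make the PG_PMD_COLOUR checks concrete: with 4 KiB pages and 2 MiB PMDs the colour mask is 511, a PMD mapping must start on a 512-page boundary, and pgoff | PG_PMD_COLOUR is the last page the PMD would cover. A standalone illustration of the fallback test (userspace sketch; the sizes are only an example):

#include <stdio.h>

#define PAGE_SHIFT	12			/* 4 KiB pages */
#define PMD_SIZE	(2UL << 20)		/* 2 MiB PMDs */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)	/* 511 */

int main(void)
{
	unsigned long size = 1000;	/* file size, in pages */
	unsigned long pgoff = 512;	/* PMD-aligned page offset */

	/* The PMD would cover pages 512..1023, but the file ends at
	 * page 999, so the fault must fall back to PTEs. */
	printf("fallback: %d\n", (pgoff | PG_PMD_COLOUR) >= size);
	return 0;
}
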
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 284f9aa0028b..6c55ade071c3 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -435,8 +435,8 @@ struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
-static ssize_t read_file_bool(struct file *file, char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[3];
u32 *val = file->private_data;
@@ -449,9 +449,10 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf,
buf[2] = 0x00;
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}
+EXPORT_SYMBOL_GPL(debugfs_read_file_bool);
-static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[32];
size_t buf_size;
@@ -468,10 +469,11 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
return count;
}
+EXPORT_SYMBOL_GPL(debugfs_write_file_bool);
static const struct file_operations fops_bool = {
- .read = read_file_bool,
- .write = write_file_bool,
+ .read = debugfs_read_file_bool,
+ .write = debugfs_write_file_bool,
.open = simple_open,
.llseek = default_llseek,
};
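
Exporting the helpers lets other kernel code reuse the Y/N/1/0 parsing while layering its own behaviour on top. A hedged sketch of such reuse (driver-side names are hypothetical; as in the code above, the value is a u32 reached through file->private_data, which simple_open() fills from the pointer passed to debugfs_create_file()):

#include <linux/debugfs.h>
#include <linux/fs.h>

static u32 my_flag;	/* created with debugfs_create_file(..., &my_flag, &my_flag_fops) */

static ssize_t my_flag_write(struct file *file, const char __user *user_buf,
			     size_t count, loff_t *ppos)
{
	ssize_t ret = debugfs_write_file_bool(file, user_buf, count, ppos);

	/* react to the updated value of my_flag here */
	return ret;
}

static const struct file_operations my_flag_fops = {
	.read	= debugfs_read_file_bool,
	.write	= my_flag_write,
	.open	= simple_open,
	.llseek	= default_llseek,
};
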
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 97315f2f6816..80d6901493cf 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -258,8 +258,7 @@ void ecryptfs_destroy_mount_crypt_stat(
&mount_crypt_stat->global_auth_tok_list,
mount_crypt_stat_list) {
list_del(&auth_tok->mount_crypt_stat_list);
- if (auth_tok->global_auth_tok_key
- && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID))
+ if (!(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID))
key_put(auth_tok->global_auth_tok_key);
kmem_cache_free(ecryptfs_global_auth_tok_cache, auth_tok);
}
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8db0b464483f..63cd2c147221 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -45,20 +45,20 @@
static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- int rc;
-
- if (!(lower_dentry->d_flags & DCACHE_OP_REVALIDATE))
- return 1;
+ int rc = 1;
if (flags & LOOKUP_RCU)
return -ECHILD;
- rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+ if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE)
+ rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+
if (d_really_is_positive(dentry)) {
- struct inode *lower_inode =
- ecryptfs_inode_to_lower(d_inode(dentry));
+ struct inode *inode = d_inode(dentry);
- fsstack_copy_attr_all(d_inode(dentry), lower_inode);
+ fsstack_copy_attr_all(inode, ecryptfs_inode_to_lower(inode));
+ if (!inode->i_nlink)
+ return 0;
}
return rc;
}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3b57c9f83c9b..1982c3f11aec 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,6 +20,7 @@
#include <linux/time.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include "ext2.h"
#include "xattr.h"
@@ -31,6 +32,12 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return dax_fault(vma, vmf, ext2_get_block, NULL);
}
+static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+}
+
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
@@ -38,6 +45,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
+ .pmd_fault = ext2_dax_pmd_fault,
.page_mkwrite = ext2_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
@@ -49,7 +57,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &ext2_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
#else
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a3a404c5df2e..c60a248c640c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -25,6 +25,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 32071f5c1c26..fd1f28be5296 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2272,6 +2272,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bc313ac5d3fa..113837e7ba98 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
@@ -195,7 +196,7 @@ out:
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
struct inode *inode = bh->b_assoc_map->host;
- /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+ /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
int err;
if (!uptodate)
@@ -206,17 +207,74 @@ static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
- /* Is this the right get_block? */
+ int result;
+ handle_t *handle = NULL;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ EXT4_DATA_TRANS_BLOCKS(sb));
+ }
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_fault(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ sb_end_pagefault(sb);
+ }
+
+ return result;
+}
+
+static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ int result;
+ handle_t *handle = NULL;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ bool write = flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ ext4_chunk_trans_blocks(inode,
+ PMD_SIZE / PAGE_SIZE));
+ }
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_pmd_fault(vma, addr, pmd, flags,
+ ext4_get_block_dax, ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ sb_end_pagefault(sb);
+ }
+
+ return result;
}
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
+ return dax_mkwrite(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
+ .pmd_fault = ext4_dax_pmd_fault,
.page_mkwrite = ext4_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
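Both handlers above share one skeleton: take the pagefault/freeze reference and open a journal transaction only for write faults, run the DAX entry point, then tear down in reverse order. A minimal sketch of that shared shape, with hypothetical names (ext4_dax_wrap_fault() and the do_fault callback are illustrative, not part of ext4); only the credit estimate and the entry point differ between the PTE and PMD cases:

static int ext4_dax_wrap_fault(struct super_block *sb, struct file *file,
			       bool write, int credits,
			       int (*do_fault)(void *arg), void *arg)
{
	handle_t *handle = NULL;
	int result;

	if (write) {
		sb_start_pagefault(sb);		/* hold off fs freezing */
		file_update_time(file);		/* times before first write */
		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
					       credits);
	}

	if (IS_ERR(handle))
		result = VM_FAULT_SIGBUS;	/* no transaction, fail fault */
	else
		result = do_fault(arg);	/* __dax_fault()/__dax_pmd_fault() */

	if (write) {
		if (!IS_ERR(handle))
			ext4_journal_stop(handle);
		sb_end_pagefault(sb);
	}
	return result;
}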
@@ -244,7 +302,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 4f6ac499f09e..2468261748b2 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
#include "ext4_jbd2.h"
#include "truncate.h"
+#include <linux/dax.h>
#include <linux/uio.h>
#include <trace/events/ext4.h>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 29f1af7c2cab..612fbcf76b5c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -22,6 +22,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
@@ -3020,6 +3021,17 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_NO_LOCK);
}
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
+ if (create)
+ flags |= EXT4_GET_BLOCKS_CREATE;
+ ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result, flags);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
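ext4_get_block_dax() above is one instance of the generic get_block_t contract: map iblock of inode into bh_result, allocate new blocks only when create is set, and flag freshly allocated blocks so the caller knows to zero them. A skeletal, hypothetical implementation of that contract for reference; demo_get_block() and demo_lookup_or_alloc() are made-up names, not kernel API:

static int demo_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create)
{
	bool new = false;
	sector_t phys;

	/* look up the mapping; allocate only if @create is set */
	phys = demo_lookup_or_alloc(inode, iblock, create, &new);
	if (!phys)
		return create ? -ENOSPC : 0;	/* unmapped hole if !create */

	map_bh(bh_result, inode->i_sb, phys);	/* sets mapped, bdev, blocknr */
	if (new)
		set_buffer_new(bh_result);	/* caller must zero the block */
	return 0;
}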
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ae0f438c2ee6..587ac08eabb6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -53,8 +53,6 @@ struct wb_writeback_work {
unsigned int for_background:1;
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
unsigned int auto_free:1; /* free on completion */
- unsigned int single_wait:1;
- unsigned int single_done:1;
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
@@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb)
static void wb_queue_work(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
- trace_writeback_queue(wb->bdi, work);
+ trace_writeback_queue(wb, work);
spin_lock_bh(&wb->work_lock);
- if (!test_bit(WB_registered, &wb->state)) {
- if (work->single_wait)
- work->single_done = 1;
+ if (!test_bit(WB_registered, &wb->state))
goto out_unlock;
- }
if (work->done)
atomic_inc(&work->done->cnt);
list_add_tail(&work->list, &wb->work_list);
@@ -706,7 +701,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
/**
* inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion
+ * @inode: inode to test for congestion (may be NULL)
* @cong_bits: mask of WB_[a]sync_congested bits to test
*
* Tests whether @inode is congested. @cong_bits is the mask of congestion
@@ -716,6 +711,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
* determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
* associated with @inode is congested; otherwise, the root wb's congestion
* state is used.
+ *
+ * @inode is allowed to be NULL as this function is often called on
+ * mapping->host which is NULL for the swapper space.
*/
int inode_congested(struct inode *inode, int cong_bits)
{
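A sketch of the caller pattern the NULL allowance exists for: mapping->host is handed straight through, NULL or not. The helper name here is illustrative; the kernel's inode_read_congested()/inode_write_congested() helpers wrap this same call.

/* Illustrative only: @mapping->host may be NULL (swapper space), in
 * which case inode_congested() falls back to the root wb's state. */
static bool demo_readahead_congested(struct address_space *mapping)
{
	return inode_congested(mapping->host, 1 << WB_sync_congested);
}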
@@ -738,32 +736,6 @@ int inode_congested(struct inode *inode, int cong_bits)
EXPORT_SYMBOL_GPL(inode_congested);
/**
- * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
- * @bdi: bdi the work item was issued to
- * @work: work item to wait for
- *
- * Wait for the completion of @work which was issued to one of @bdi's
- * bdi_writeback's. The caller must have set @work->single_wait before
- * issuing it. This wait operates independently of
- * wb_wait_for_completion() and also disables automatic freeing of @work.
- */
-static void wb_wait_for_single_work(struct backing_dev_info *bdi,
- struct wb_writeback_work *work)
-{
- if (WARN_ON_ONCE(!work->single_wait))
- return;
-
- wait_event(bdi->wb_waitq, work->single_done);
-
- /*
- * Paired with smp_wmb() in wb_do_writeback() and ensures that all
- * modifications to @work prior to assertion of ->single_done are
- * visible to the caller once this function returns.
- */
- smp_rmb();
-}
-
-/**
* wb_split_bdi_pages - split nr_pages to write according to bandwidth
* @wb: target bdi_writeback to split @nr_pages to
* @nr_pages: number of pages to write for the whole bdi
@@ -792,38 +764,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
}
/**
- * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
- * @wb: target bdi_writeback
- * @base_work: source wb_writeback_work
- *
- * Try to make a clone of @base_work and issue it to @wb. If cloning
- * succeeds, %true is returned; otherwise, @base_work is issued directly
- * and %false is returned. In the latter case, the caller is required to
- * wait for @base_work's completion using wb_wait_for_single_work().
- *
- * A clone is auto-freed on completion. @base_work never is.
- */
-static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
- struct wb_writeback_work *base_work)
-{
- struct wb_writeback_work *work;
-
- work = kmalloc(sizeof(*work), GFP_ATOMIC);
- if (work) {
- *work = *base_work;
- work->auto_free = 1;
- work->single_wait = 0;
- } else {
- work = base_work;
- work->auto_free = 0;
- work->single_wait = 1;
- }
- work->single_done = 0;
- wb_queue_work(wb, work);
- return work != base_work;
-}
-
-/**
* bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
* @bdi: target backing_dev_info
* @base_work: wb_writeback_work to issue
@@ -838,15 +778,19 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
struct wb_writeback_work *base_work,
bool skip_if_busy)
{
- long nr_pages = base_work->nr_pages;
- int next_blkcg_id = 0;
+ int next_memcg_id = 0;
struct bdi_writeback *wb;
struct wb_iter iter;
might_sleep();
restart:
rcu_read_lock();
- bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+ bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+ DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+ struct wb_writeback_work fallback_work;
+ struct wb_writeback_work *work;
+ long nr_pages;
+
/* SYNC_ALL writes out I_DIRTY_TIME too */
if (!wb_has_dirty_io(wb) &&
(base_work->sync_mode == WB_SYNC_NONE ||
@@ -855,13 +799,30 @@ restart:
if (skip_if_busy && writeback_in_progress(wb))
continue;
- base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
- if (!wb_clone_and_queue_work(wb, base_work)) {
- next_blkcg_id = wb->blkcg_css->id + 1;
- rcu_read_unlock();
- wb_wait_for_single_work(bdi, base_work);
- goto restart;
+ nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ *work = *base_work;
+ work->nr_pages = nr_pages;
+ work->auto_free = 1;
+ wb_queue_work(wb, work);
+ continue;
}
+
+ /* alloc failed, execute synchronously using on-stack fallback */
+ work = &fallback_work;
+ *work = *base_work;
+ work->nr_pages = nr_pages;
+ work->auto_free = 0;
+ work->done = &fallback_work_done;
+
+ wb_queue_work(wb, work);
+
+ next_memcg_id = wb->memcg_css->id + 1;
+ rcu_read_unlock();
+ wb_wait_for_completion(bdi, &fallback_work_done);
+ goto restart;
}
rcu_read_unlock();
}
@@ -902,8 +863,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
base_work->auto_free = 0;
- base_work->single_wait = 0;
- base_work->single_done = 0;
wb_queue_work(&bdi->wb, base_work);
}
}
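The loop above is what replaced the single_wait/single_done machinery: each wb normally gets an auto-freed heap copy of the work, and only if that allocation fails does the code fall back to an on-stack copy, which must then be waited on before the stack frame can be reused for the next wb. Reduced to its essentials (a sketch, not the verbatim kernel code; the iteration and restart bookkeeping are elided):

struct wb_writeback_work fallback, *work;
DEFINE_WB_COMPLETION_ONSTACK(done);

work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (work) {
	*work = *base_work;
	work->auto_free = 1;			/* worker kfree()s it */
	wb_queue_work(wb, work);
} else {
	fallback = *base_work;
	fallback.auto_free = 0;			/* lives on this stack */
	fallback.done = &done;
	wb_queue_work(wb, &fallback);
	wb_wait_for_completion(bdi, &done);	/* can't return before this */
}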
@@ -924,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
*/
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
- trace_writeback_nowork(wb->bdi);
+ trace_writeback_nowork(wb);
wb_wakeup(wb);
return;
}
@@ -954,7 +913,7 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
* We just wake up the flusher thread. It will perform background
* writeback as soon as there is no other work to do.
*/
- trace_writeback_wake_background(wb->bdi);
+ trace_writeback_wake_background(wb);
wb_wakeup(wb);
}
@@ -1421,6 +1380,10 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
* Write a portion of b_io inodes which belong to @sb.
*
* Return the number of pages and/or inodes written.
+ *
+ * NOTE! This is called with wb->list_lock held, and will
+ * unlock and relock that for each inode it ends up doing
+ * IO for.
*/
static long writeback_sb_inodes(struct super_block *sb,
struct bdi_writeback *wb,
@@ -1439,9 +1402,7 @@ static long writeback_sb_inodes(struct super_block *sb,
unsigned long start_time = jiffies;
long write_chunk;
long wrote = 0; /* count both pages and inodes */
- struct blk_plug plug;
- blk_start_plug(&plug);
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
@@ -1539,7 +1500,6 @@ static long writeback_sb_inodes(struct super_block *sb,
break;
}
}
- blk_finish_plug(&plug);
return wrote;
}
@@ -1586,12 +1546,15 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
.range_cyclic = 1,
.reason = reason,
};
+ struct blk_plug plug;
+ blk_start_plug(&plug);
spin_lock(&wb->list_lock);
if (list_empty(&wb->b_io))
queue_io(wb, &work);
__writeback_inodes_wb(wb, &work);
spin_unlock(&wb->list_lock);
+ blk_finish_plug(&plug);
return nr_pages - work.nr_pages;
}
@@ -1619,10 +1582,12 @@ static long wb_writeback(struct bdi_writeback *wb,
unsigned long oldest_jif;
struct inode *inode;
long progress;
+ struct blk_plug plug;
oldest_jif = jiffies;
work->older_than_this = &oldest_jif;
+ blk_start_plug(&plug);
spin_lock(&wb->list_lock);
for (;;) {
/*
@@ -1660,14 +1625,14 @@ static long wb_writeback(struct bdi_writeback *wb,
} else if (work->for_background)
oldest_jif = jiffies;
- trace_writeback_start(wb->bdi, work);
+ trace_writeback_start(wb, work);
if (list_empty(&wb->b_io))
queue_io(wb, work);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
progress = __writeback_inodes_wb(wb, work);
- trace_writeback_written(wb->bdi, work);
+ trace_writeback_written(wb, work);
wb_update_bandwidth(wb, wb_start);
@@ -1692,7 +1657,7 @@ static long wb_writeback(struct bdi_writeback *wb,
* we'll just busyloop.
*/
if (!list_empty(&wb->b_more_io)) {
- trace_writeback_wait(wb->bdi, work);
+ trace_writeback_wait(wb, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
@@ -1702,6 +1667,7 @@ static long wb_writeback(struct bdi_writeback *wb,
}
}
spin_unlock(&wb->list_lock);
+ blk_finish_plug(&plug);
return nr_pages - work->nr_pages;
}
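Plugging now brackets the whole writeback pass in wb_writeback() and writeback_inodes_wb() instead of a single writeback_sb_inodes() call, so bios issued across many inodes batch up on the task's plug list and can be merged before release. The idiom in isolation (submit_many_bios() is a stand-in name):

struct blk_plug plug;

blk_start_plug(&plug);	/* start batching this task's bio submissions */
submit_many_bios();	/* e.g. write back a whole list of inodes */
blk_finish_plug(&plug);	/* flush the batch; adjacent bios can merge */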
@@ -1797,26 +1763,14 @@ static long wb_do_writeback(struct bdi_writeback *wb)
set_bit(WB_writeback_running, &wb->state);
while ((work = get_next_work_item(wb)) != NULL) {
struct wb_completion *done = work->done;
- bool need_wake_up = false;
- trace_writeback_exec(wb->bdi, work);
+ trace_writeback_exec(wb, work);
wrote += wb_writeback(wb, work);
- if (work->single_wait) {
- WARN_ON_ONCE(work->auto_free);
- /* paired w/ rmb in wb_wait_for_single_work() */
- smp_wmb();
- work->single_done = 1;
- need_wake_up = true;
- } else if (work->auto_free) {
+ if (work->auto_free)
kfree(work);
- }
-
if (done && atomic_dec_and_test(&done->cnt))
- need_wake_up = true;
-
- if (need_wake_up)
wake_up_all(&wb->bdi->wb_waitq);
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a38e38f7b6fc..9bd1244caf38 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -34,6 +34,7 @@
#include <linux/percpu.h>
#include <linux/list_sort.h>
#include <linux/lockref.h>
+#include <linux/rhashtable.h>
#include "gfs2.h"
#include "incore.h"
@@ -50,9 +51,8 @@
#include "trace_gfs2.h"
struct gfs2_glock_iter {
- int hash; /* hash bucket index */
- unsigned nhash; /* Index within current bucket */
struct gfs2_sbd *sdp; /* incore superblock */
+ struct rhashtable_iter hti; /* rhashtable iterator */
struct gfs2_glock *gl; /* current glock struct */
loff_t last_pos; /* last position */
};
@@ -70,44 +70,19 @@ static DEFINE_SPINLOCK(lru_lock);
#define GFS2_GL_HASH_SHIFT 15
#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
-#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
-static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
-static struct dentry *gfs2_root;
-
-/**
- * gl_hash() - Turn glock number into hash bucket number
- * @lock: The glock number
- *
- * Returns: The number of the corresponding hash bucket
- */
-
-static unsigned int gl_hash(const struct gfs2_sbd *sdp,
- const struct lm_lockname *name)
-{
- unsigned int h;
-
- h = jhash(&name->ln_number, sizeof(u64), 0);
- h = jhash(&name->ln_type, sizeof(unsigned int), h);
- h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
- h &= GFS2_GL_HASH_MASK;
-
- return h;
-}
-
-static inline void spin_lock_bucket(unsigned int hash)
-{
- hlist_bl_lock(&gl_hash_table[hash]);
-}
+static struct rhashtable_params ht_parms = {
+ .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
+ .key_len = sizeof(struct lm_lockname),
+ .key_offset = offsetof(struct gfs2_glock, gl_name),
+ .head_offset = offsetof(struct gfs2_glock, gl_node),
+};
-static inline void spin_unlock_bucket(unsigned int hash)
-{
- hlist_bl_unlock(&gl_hash_table[hash]);
-}
+static struct rhashtable gl_hash_table;
-static void gfs2_glock_dealloc(struct rcu_head *rcu)
+void gfs2_glock_free(struct gfs2_glock *gl)
{
- struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (gl->gl_ops->go_flags & GLOF_ASPACE) {
kmem_cache_free(gfs2_glock_aspace_cachep, gl);
@@ -115,13 +90,6 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
kfree(gl->gl_lksb.sb_lvbptr);
kmem_cache_free(gfs2_glock_cachep, gl);
}
-}
-
-void gfs2_glock_free(struct gfs2_glock *gl)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
-
- call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
if (atomic_dec_and_test(&sdp->sd_glock_disposal))
wake_up(&sdp->sd_glock_wait);
}
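With ln_sbd folded into struct lm_lockname (see the incore.h hunk later in this patch), the whole name is the hash key, so a lookup needs only a stack-local key and both the per-bucket locks and the explicit sdp comparison disappear. The two calls the conversion rests on, in outline (a sketch with error handling elided; new stands for a freshly allocated glock):

struct lm_lockname key = {
	.ln_sbd    = sdp,		/* superblock is now part of the key */
	.ln_number = number,
	.ln_type   = glops->go_type,
};
struct gfs2_glock *gl;
int err;

/* the _fast variants take the RCU read lock themselves */
gl = rhashtable_lookup_fast(&gl_hash_table, &key, ht_parms);
if (gl && !lockref_get_not_dead(&gl->gl_lockref))
	gl = NULL;			/* found, but already being freed */

/* returns -EEXIST if another CPU inserted the same name first */
err = rhashtable_lookup_insert_fast(&gl_hash_table, &new->gl_node, ht_parms);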
@@ -192,7 +160,7 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
void gfs2_glock_put(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = gfs2_glock2aspace(gl);
if (lockref_put_or_lock(&gl->gl_lockref))
@@ -202,9 +170,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
gfs2_glock_remove_from_lru(gl);
spin_unlock(&gl->gl_lockref.lock);
- spin_lock_bucket(gl->gl_hash);
- hlist_bl_del_rcu(&gl->gl_list);
- spin_unlock_bucket(gl->gl_hash);
+ rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
trace_gfs2_glock_put(gl);
@@ -212,33 +178,6 @@ void gfs2_glock_put(struct gfs2_glock *gl)
}
/**
- * search_bucket() - Find struct gfs2_glock by lock number
- * @bucket: the bucket to search
- * @name: The lock name
- *
- * Returns: NULL, or the struct gfs2_glock with the requested number
- */
-
-static struct gfs2_glock *search_bucket(unsigned int hash,
- const struct gfs2_sbd *sdp,
- const struct lm_lockname *name)
-{
- struct gfs2_glock *gl;
- struct hlist_bl_node *h;
-
- hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
- if (!lm_name_equal(&gl->gl_name, name))
- continue;
- if (gl->gl_sbd != sdp)
- continue;
- if (lockref_get_not_dead(&gl->gl_lockref))
- return gl;
- }
-
- return NULL;
-}
-
-/**
* may_grant - check if it's ok to grant a new lock
* @gl: The glock
* @gh: The lock request which we wish to grant
@@ -506,7 +445,7 @@ __releases(&gl->gl_spin)
__acquires(&gl->gl_spin)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned int lck_flags = gh ? gh->gh_flags : 0;
int ret;
@@ -628,7 +567,7 @@ out_unlock:
static void delete_work_func(struct work_struct *work)
{
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip;
struct inode *inode;
u64 no_addr = gl->gl_name.ln_number;
@@ -704,15 +643,17 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
struct gfs2_glock **glp)
{
struct super_block *s = sdp->sd_vfs;
- struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
- struct gfs2_glock *gl, *tmp;
- unsigned int hash = gl_hash(sdp, &name);
+ struct lm_lockname name = { .ln_number = number,
+ .ln_type = glops->go_type,
+ .ln_sbd = sdp };
+ struct gfs2_glock *gl, *tmp = NULL;
struct address_space *mapping;
struct kmem_cache *cachep;
+ int ret, tries = 0;
- rcu_read_lock();
- gl = search_bucket(hash, sdp, &name);
- rcu_read_unlock();
+ gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
+ if (gl && !lockref_get_not_dead(&gl->gl_lockref))
+ gl = NULL;
*glp = gl;
if (gl)
@@ -739,14 +680,13 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
}
atomic_inc(&sdp->sd_glock_disposal);
- gl->gl_sbd = sdp;
+ gl->gl_node.next = NULL;
gl->gl_flags = 0;
gl->gl_name = name;
gl->gl_lockref.count = 1;
gl->gl_state = LM_ST_UNLOCKED;
gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
- gl->gl_hash = hash;
gl->gl_ops = glops;
gl->gl_dstamp = ktime_set(0, 0);
preempt_disable();
@@ -771,22 +711,34 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
mapping->writeback_index = 0;
}
- spin_lock_bucket(hash);
- tmp = search_bucket(hash, sdp, &name);
- if (tmp) {
- spin_unlock_bucket(hash);
- kfree(gl->gl_lksb.sb_lvbptr);
- kmem_cache_free(cachep, gl);
- atomic_dec(&sdp->sd_glock_disposal);
- gl = tmp;
- } else {
- hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
- spin_unlock_bucket(hash);
+again:
+ ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node,
+ ht_parms);
+ if (ret == 0) {
+ *glp = gl;
+ return 0;
}
- *glp = gl;
+ if (ret == -EEXIST) {
+ ret = 0;
+ tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
+ if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) {
+ if (++tries < 100) {
+ cond_resched();
+ goto again;
+ }
+ tmp = NULL;
+ ret = -ENOMEM;
+ }
+ } else {
+ WARN_ON_ONCE(ret);
+ }
+ kfree(gl->gl_lksb.sb_lvbptr);
+ kmem_cache_free(cachep, gl);
+ atomic_dec(&sdp->sd_glock_disposal);
+ *glp = tmp;
- return 0;
+ return ret;
}
/**
@@ -928,7 +880,7 @@ __releases(&gl->gl_spin)
__acquires(&gl->gl_spin)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct list_head *insert_pt = NULL;
struct gfs2_holder *gh2;
int try_futile = 0;
@@ -1006,7 +958,7 @@ trap_recursive:
int gfs2_glock_nq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
int error = 0;
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
@@ -1313,7 +1265,7 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
{
- struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+ struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
spin_lock(&gl->gl_spin);
gl->gl_reply = ret;
@@ -1462,31 +1414,26 @@ static struct shrinker glock_shrinker = {
*
*/
-static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
- unsigned int hash)
+static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
{
struct gfs2_glock *gl;
- struct hlist_bl_head *head = &gl_hash_table[hash];
- struct hlist_bl_node *pos;
+ struct rhash_head *pos, *next;
+ const struct bucket_table *tbl;
+ int i;
rcu_read_lock();
- hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
- if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref))
- examiner(gl);
+ tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table);
+ for (i = 0; i < tbl->size; i++) {
+ rht_for_each_entry_safe(gl, pos, next, tbl, i, gl_node) {
+ if ((gl->gl_name.ln_sbd == sdp) &&
+ lockref_get_not_dead(&gl->gl_lockref))
+ examiner(gl);
+ }
}
rcu_read_unlock();
cond_resched();
}
-static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
-{
- unsigned x;
-
- for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
- examine_bucket(examiner, sdp, x);
-}
-
-
/**
* thaw_glock - thaw out a glock which has an unprocessed reply waiting
* @gl: The glock to thaw
@@ -1569,7 +1516,7 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
int ret;
ret = gfs2_truncatei_resume(ip);
- gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
+ gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0);
spin_lock(&gl->gl_spin);
clear_bit(GLF_LOCK, &gl->gl_flags);
@@ -1733,17 +1680,17 @@ static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
{
struct gfs2_glock *gl = iter_ptr;
- seq_printf(seq, "G: n:%u/%llx rtt:%lld/%lld rttb:%lld/%lld irt:%lld/%lld dcnt: %lld qcnt: %lld\n",
+ seq_printf(seq, "G: n:%u/%llx rtt:%llu/%llu rttb:%llu/%llu irt:%llu/%llu dcnt: %llu qcnt: %llu\n",
gl->gl_name.ln_type,
(unsigned long long)gl->gl_name.ln_number,
- (long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
- (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
- (long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
- (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
- (long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
- (long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
- (long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
- (long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
return 0;
}
@@ -1776,11 +1723,10 @@ static const char *gfs2_stype[] = {
static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
{
- struct gfs2_glock_iter *gi = seq->private;
- struct gfs2_sbd *sdp = gi->sdp;
- unsigned index = gi->hash >> 3;
- unsigned subindex = gi->hash & 0x07;
- s64 value;
+ struct gfs2_sbd *sdp = seq->private;
+ loff_t pos = *(loff_t *)iter_ptr;
+ unsigned index = pos >> 3;
+ unsigned subindex = pos & 0x07;
int i;
if (index == 0 && subindex != 0)
@@ -1791,12 +1737,12 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
for_each_possible_cpu(i) {
const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i);
- if (index == 0) {
- value = i;
- } else {
- value = lkstats->lkstats[index - 1].stats[subindex];
- }
- seq_printf(seq, " %15lld", (long long)value);
+
+ if (index == 0)
+ seq_printf(seq, " %15u", i);
+ else
+ seq_printf(seq, " %15llu", (unsigned long long)lkstats->
+ lkstats[index - 1].stats[subindex]);
}
seq_putc(seq, '\n');
return 0;
@@ -1804,20 +1750,24 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
int __init gfs2_glock_init(void)
{
- unsigned i;
- for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
- INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
- }
+ int ret;
+
+ ret = rhashtable_init(&gl_hash_table, &ht_parms);
+ if (ret < 0)
+ return ret;
glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
WQ_HIGHPRI | WQ_FREEZABLE, 0);
- if (!glock_workqueue)
+ if (!glock_workqueue) {
+ rhashtable_destroy(&gl_hash_table);
return -ENOMEM;
+ }
gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
WQ_MEM_RECLAIM | WQ_FREEZABLE,
0);
if (!gfs2_delete_workqueue) {
destroy_workqueue(glock_workqueue);
+ rhashtable_destroy(&gl_hash_table);
return -ENOMEM;
}
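Each failure path now unwinds everything initialized before it. The same init written in the kernel's usual goto-unwind style, as an equivalent sketch rather than a proposed change:

int __init gfs2_glock_init(void)
{
	int ret;

	ret = rhashtable_init(&gl_hash_table, &ht_parms);
	if (ret < 0)
		return ret;

	glock_workqueue = alloc_workqueue("glock_workqueue",
			WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0);
	if (!glock_workqueue) {
		ret = -ENOMEM;
		goto out_table;
	}

	gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
			WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
	if (!gfs2_delete_workqueue) {
		ret = -ENOMEM;
		goto out_wq;
	}
	return 0;	/* shrinker registration etc. follows in the original */

out_wq:
	destroy_workqueue(glock_workqueue);
out_table:
	rhashtable_destroy(&gl_hash_table);
	return ret;
}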
@@ -1829,72 +1779,41 @@ int __init gfs2_glock_init(void)
void gfs2_glock_exit(void)
{
unregister_shrinker(&glock_shrinker);
+ rhashtable_destroy(&gl_hash_table);
destroy_workqueue(glock_workqueue);
destroy_workqueue(gfs2_delete_workqueue);
}
-static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
+static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
{
- return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
- struct gfs2_glock, gl_list);
-}
-
-static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
-{
- return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
- struct gfs2_glock, gl_list);
-}
-
-static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
-{
- struct gfs2_glock *gl;
-
do {
- gl = gi->gl;
- if (gl) {
- gi->gl = glock_hash_next(gl);
- gi->nhash++;
- } else {
- if (gi->hash >= GFS2_GL_HASH_SIZE) {
- rcu_read_unlock();
- return 1;
- }
- gi->gl = glock_hash_chain(gi->hash);
- gi->nhash = 0;
- }
- while (gi->gl == NULL) {
- gi->hash++;
- if (gi->hash >= GFS2_GL_HASH_SIZE) {
- rcu_read_unlock();
- return 1;
- }
- gi->gl = glock_hash_chain(gi->hash);
- gi->nhash = 0;
+ gi->gl = rhashtable_walk_next(&gi->hti);
+ if (IS_ERR(gi->gl)) {
+ if (PTR_ERR(gi->gl) == -EAGAIN)
+ continue;
+ gi->gl = NULL;
}
/* Skip entries for other sb and dead entries */
- } while (gi->sdp != gi->gl->gl_sbd ||
- __lockref_is_dead(&gi->gl->gl_lockref));
-
- return 0;
+ } while ((gi->gl) && ((gi->sdp != gi->gl->gl_name.ln_sbd) ||
+ __lockref_is_dead(&gi->gl->gl_lockref)));
}
static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
{
struct gfs2_glock_iter *gi = seq->private;
loff_t n = *pos;
+ int ret;
if (gi->last_pos <= *pos)
- n = gi->nhash + (*pos - gi->last_pos);
- else
- gi->hash = 0;
+ n = (*pos - gi->last_pos);
- gi->nhash = 0;
- rcu_read_lock();
+ ret = rhashtable_walk_start(&gi->hti);
+ if (ret)
+ return NULL;
do {
- if (gfs2_glock_iter_next(gi))
- return NULL;
- } while (n--);
+ gfs2_glock_iter_next(gi);
+ } while (gi->gl && n--);
gi->last_pos = *pos;
return gi->gl;
@@ -1907,9 +1826,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
(*pos)++;
gi->last_pos = *pos;
- if (gfs2_glock_iter_next(gi))
- return NULL;
-
+ gfs2_glock_iter_next(gi);
return gi->gl;
}
@@ -1917,9 +1834,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
{
struct gfs2_glock_iter *gi = seq->private;
- if (gi->gl)
- rcu_read_unlock();
gi->gl = NULL;
+ rhashtable_walk_stop(&gi->hti);
}
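The seq_file hooks here (including gfs2_glocks_open()/gfs2_glocks_release() further down) map onto the rhashtable walker's strict lifecycle: init at open, start/stop around every read batch, exit at release, with ERR_PTR(-EAGAIN) from rhashtable_walk_next() signalling a concurrent resize that the walker recovers from on its own. In outline (illustrative; return values only partially checked):

struct rhashtable_iter hti;
struct gfs2_glock *gl;

rhashtable_walk_init(&gl_hash_table, &hti);	/* open():  set up walker  */

if (rhashtable_walk_start(&hti) == 0) {		/* ->start(): enter table  */
	while ((gl = rhashtable_walk_next(&hti)) != NULL) {
		if (IS_ERR(gl)) {
			if (PTR_ERR(gl) == -EAGAIN)
				continue;	/* resized under us; go on */
			break;
		}
		/* ... emit one entry ... */
	}
	rhashtable_walk_stop(&hti);		/* ->stop(): leave table   */
}

rhashtable_walk_exit(&hti);			/* release(): free state   */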
static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -1930,26 +1846,19 @@ static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct gfs2_glock_iter *gi = seq->private;
-
- gi->hash = *pos;
+ preempt_disable();
if (*pos >= GFS2_NR_SBSTATS)
return NULL;
- preempt_disable();
- return SEQ_START_TOKEN;
+ return pos;
}
static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr,
loff_t *pos)
{
- struct gfs2_glock_iter *gi = seq->private;
(*pos)++;
- gi->hash++;
- if (gi->hash >= GFS2_NR_SBSTATS) {
- preempt_enable();
+ if (*pos >= GFS2_NR_SBSTATS)
return NULL;
- }
- return SEQ_START_TOKEN;
+ return pos;
}
static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr)
@@ -1987,14 +1896,28 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
if (ret == 0) {
struct seq_file *seq = file->private_data;
struct gfs2_glock_iter *gi = seq->private;
+
gi->sdp = inode->i_private;
+ gi->last_pos = 0;
seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
+ gi->gl = NULL;
+ ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
}
return ret;
}
+static int gfs2_glocks_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct gfs2_glock_iter *gi = seq->private;
+
+ gi->gl = NULL;
+ rhashtable_walk_exit(&gi->hti);
+ return seq_release_private(inode, file);
+}
+
static int gfs2_glstats_open(struct inode *inode, struct file *file)
{
int ret = seq_open_private(file, &gfs2_glstats_seq_ops,
@@ -2003,21 +1926,22 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct gfs2_glock_iter *gi = seq->private;
gi->sdp = inode->i_private;
+ gi->last_pos = 0;
seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
+ gi->gl = NULL;
+ ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
}
return ret;
}
static int gfs2_sbstats_open(struct inode *inode, struct file *file)
{
- int ret = seq_open_private(file, &gfs2_sbstats_seq_ops,
- sizeof(struct gfs2_glock_iter));
+ int ret = seq_open(file, &gfs2_sbstats_seq_ops);
if (ret == 0) {
struct seq_file *seq = file->private_data;
- struct gfs2_glock_iter *gi = seq->private;
- gi->sdp = inode->i_private;
+ seq->private = inode->i_private; /* sdp */
}
return ret;
}
@@ -2027,7 +1951,7 @@ static const struct file_operations gfs2_glocks_fops = {
.open = gfs2_glocks_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = gfs2_glocks_release,
};
static const struct file_operations gfs2_glstats_fops = {
@@ -2035,7 +1959,7 @@ static const struct file_operations gfs2_glstats_fops = {
.open = gfs2_glstats_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = gfs2_glocks_release,
};
static const struct file_operations gfs2_sbstats_fops = {
@@ -2043,7 +1967,7 @@ static const struct file_operations gfs2_sbstats_fops = {
.open = gfs2_sbstats_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release,
};
int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index fa3fa5e94553..1f6c9c3fe5cb 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -32,13 +32,15 @@ struct workqueue_struct *gfs2_freeze_wq;
static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
{
- fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n",
+ fs_err(gl->gl_name.ln_sbd,
+ "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page "
+ "state 0x%lx\n",
bh, (unsigned long long)bh->b_blocknr, bh->b_state,
bh->b_page->mapping, bh->b_page->flags);
- fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n",
+ fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n",
gl->gl_name.ln_type, gl->gl_name.ln_number,
gfs2_glock2aspace(gl));
- gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n");
+ gfs2_lm_withdraw(gl->gl_name.ln_sbd, "AIL error\n");
}
/**
@@ -52,7 +54,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
unsigned int nr_revokes)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct list_head *head = &gl->gl_ail_list;
struct gfs2_bufdata *bd, *tmp;
struct buffer_head *bh;
@@ -80,7 +82,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_trans tr;
memset(&tr, 0, sizeof(tr));
@@ -109,7 +111,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned int revokes = atomic_read(&gl->gl_ail_count);
unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
int ret;
@@ -139,7 +141,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
static void rgrp_go_sync(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = &sdp->sd_aspace;
struct gfs2_rgrpd *rgd;
int error;
@@ -179,7 +181,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = &sdp->sd_aspace;
struct gfs2_rgrpd *rgd = gl->gl_object;
@@ -218,7 +220,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
- gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH);
+ gfs2_log_flush(gl->gl_name.ln_sbd, gl, NORMAL_FLUSH);
filemap_fdatawrite(metamapping);
if (ip) {
struct address_space *mapping = ip->i_inode.i_mapping;
@@ -252,7 +254,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
{
struct gfs2_inode *ip = gl->gl_object;
- gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
+ gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count));
if (flags & DIO_METADATA) {
struct address_space *mapping = gfs2_glock2aspace(gl);
@@ -264,9 +266,9 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
}
}
- if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) {
- gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH);
- gl->gl_sbd->sd_rindex_uptodate = 0;
+ if (ip == GFS2_I(gl->gl_name.ln_sbd->sd_rindex)) {
+ gfs2_log_flush(gl->gl_name.ln_sbd, NULL, NORMAL_FLUSH);
+ gl->gl_name.ln_sbd->sd_rindex_uptodate = 0;
}
if (ip && S_ISREG(ip->i_inode.i_mode))
truncate_inode_pages(ip->i_inode.i_mapping, 0);
@@ -281,7 +283,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
static int inode_go_demote_ok(const struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_holder *gh;
if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
@@ -416,7 +418,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
static int inode_go_lock(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip = gl->gl_object;
int error = 0;
@@ -477,7 +479,7 @@ static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
static void freeze_go_sync(struct gfs2_glock *gl)
{
int error = 0;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (gl->gl_state == LM_ST_SHARED &&
test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
@@ -500,7 +502,7 @@ static void freeze_go_sync(struct gfs2_glock *gl)
static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
struct gfs2_glock *j_gl = ip->i_gl;
struct gfs2_log_header_host head;
@@ -545,7 +547,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl)
static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
{
struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY))
return;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a1ec7c20e498..121ed08d9d9f 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -22,6 +22,7 @@
#include <linux/ktime.h>
#include <linux/percpu.h>
#include <linux/lockref.h>
+#include <linux/rhashtable.h>
#define DIO_WAIT 0x00000010
#define DIO_METADATA 0x00000020
@@ -203,13 +204,15 @@ enum {
};
struct lm_lockname {
+ struct gfs2_sbd *ln_sbd;
u64 ln_number;
unsigned int ln_type;
};
#define lm_name_equal(name1, name2) \
- (((name1)->ln_number == (name2)->ln_number) && \
- ((name1)->ln_type == (name2)->ln_type))
+ (((name1)->ln_number == (name2)->ln_number) && \
+ ((name1)->ln_type == (name2)->ln_type) && \
+ ((name1)->ln_sbd == (name2)->ln_sbd))
struct gfs2_glock_operations {
@@ -241,7 +244,7 @@ enum {
};
struct gfs2_lkstats {
- s64 stats[GFS2_NR_LKSTATS];
+ u64 stats[GFS2_NR_LKSTATS];
};
enum {
@@ -327,7 +330,6 @@ enum {
struct gfs2_glock {
struct hlist_bl_node gl_list;
- struct gfs2_sbd *gl_sbd;
unsigned long gl_flags; /* GLF_... */
struct lm_lockname gl_name;
@@ -341,7 +343,6 @@ struct gfs2_glock {
gl_req:2, /* State in last dlm request */
gl_reply:8; /* Last reply from the dlm */
- unsigned int gl_hash;
unsigned long gl_demote_time; /* time of first demote request */
long gl_hold_time;
struct list_head gl_holders;
@@ -367,7 +368,7 @@ struct gfs2_glock {
loff_t end;
} gl_vm;
};
- struct rcu_head gl_rcu;
+ struct rhash_head gl_node;
};
#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
@@ -835,7 +836,7 @@ static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which)
{
- const struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
preempt_disable();
this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++;
preempt_enable();
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 641383a9c1bb..284c1542783e 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -31,7 +31,7 @@ extern struct workqueue_struct *gfs2_control_wq;
*
* @delta is the difference between the current rtt sample and the
* running average srtt. We add 1/8 of that to the srtt in order to
- * update the current srtt estimate. The varience estimate is a bit
+ * update the current srtt estimate. The variance estimate is a bit
* more complicated. We subtract the abs value of the @delta from
* the current variance estimate and add 1/4 of that to the running
* total.
@@ -80,7 +80,7 @@ static inline void gfs2_update_reply_times(struct gfs2_glock *gl)
preempt_disable();
rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp));
- lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+ lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats);
gfs2_update_stats(&gl->gl_stats, index, rtt); /* Local */
gfs2_update_stats(&lks->lkstats[gltype], index, rtt); /* Global */
preempt_enable();
@@ -108,7 +108,7 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl)
dstamp = gl->gl_dstamp;
gl->gl_dstamp = ktime_get_real();
irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp));
- lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+ lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats);
gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt); /* Local */
gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt); /* Global */
preempt_enable();
@@ -253,7 +253,7 @@ static void gfs2_reverse_hex(char *c, u64 value)
static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
unsigned int flags)
{
- struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+ struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
int req;
u32 lkf;
char strname[GDLM_STRNAME_BYTES] = "";
@@ -281,7 +281,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
static void gdlm_put_lock(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
int lvb_needs_unlock = 0;
int error;
@@ -319,7 +319,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
static void gdlm_cancel(struct gfs2_glock *gl)
{
- struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+ struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
}
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 92324ac58290..d5369a109781 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -70,7 +70,7 @@ static bool buffer_is_rgrp(const struct gfs2_bufdata *bd)
static void maybe_release_space(struct gfs2_bufdata *bd)
{
struct gfs2_glock *gl = bd->bd_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_rgrpd *rgd = gl->gl_object;
unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
struct gfs2_bitmap *bi = rgd->rd_bits + index;
@@ -578,7 +578,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
static void gfs2_meta_sync(struct gfs2_glock *gl)
{
struct address_space *mapping = gfs2_glock2aspace(gl);
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
int error;
if (mapping == NULL)
@@ -588,7 +588,7 @@ static void gfs2_meta_sync(struct gfs2_glock *gl)
error = filemap_fdatawait(mapping);
if (error)
- gfs2_io_error(gl->gl_sbd);
+ gfs2_io_error(gl->gl_name.ln_sbd);
}
static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index b984a6e190bc..0e1d4be5865a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -114,7 +114,7 @@ const struct address_space_operations gfs2_rgrp_aops = {
struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
{
struct address_space *mapping = gfs2_glock2aspace(gl);
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct page *page;
struct buffer_head *bh;
unsigned int shift;
@@ -200,7 +200,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
struct buffer_head **bhp)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct buffer_head *bh;
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
@@ -362,7 +362,7 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct buffer_head *first_bh, *bh;
u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
sdp->sd_sb.sb_bsize_shift;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index ac5d8027d335..8ca161567a93 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -44,7 +44,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
{
struct inode *inode = mapping->host;
if (mapping->a_ops == &gfs2_meta_aops)
- return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
+ return (((struct gfs2_glock *)mapping) - 1)->gl_name.ln_sbd;
else if (mapping->a_ops == &gfs2_rgrp_aops)
return container_of(mapping, struct gfs2_sbd, sd_aspace);
else
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 9b61f92fcfdf..3a31226531ea 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -119,7 +119,7 @@ static void gfs2_qd_dispose(struct list_head *list)
while (!list_empty(list)) {
qd = list_entry(list->next, struct gfs2_quota_data, qd_lru);
- sdp = qd->qd_gl->gl_sbd;
+ sdp = qd->qd_gl->gl_name.ln_sbd;
list_del(&qd->qd_lru);
@@ -302,7 +302,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
static void qd_hold(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref));
lockref_get(&qd->qd_lockref);
}
@@ -367,7 +367,7 @@ static void slot_put(struct gfs2_quota_data *qd)
static int bh_get(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
unsigned int block, offset;
struct buffer_head *bh;
@@ -414,7 +414,7 @@ fail:
static void bh_put(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
mutex_lock(&sdp->sd_quota_mutex);
gfs2_assert(sdp, qd->qd_bh_count);
@@ -486,7 +486,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
static void qd_unlock(struct gfs2_quota_data *qd)
{
- gfs2_assert_warn(qd->qd_gl->gl_sbd,
+ gfs2_assert_warn(qd->qd_gl->gl_name.ln_sbd,
test_bit(QDF_LOCKED, &qd->qd_flags));
clear_bit(QDF_LOCKED, &qd->qd_flags);
bh_put(qd);
@@ -614,7 +614,7 @@ static int sort_qd(const void *a, const void *b)
static void do_qc(struct gfs2_quota_data *qd, s64 change)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
struct gfs2_quota_change *qc = qd->qd_bh_qc;
s64 x;
@@ -831,7 +831,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
{
- struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct gfs2_alloc_parms ap = { .aflags = 0, };
unsigned int data_blocks, ind_blocks;
@@ -922,7 +922,7 @@ out:
gfs2_glock_dq_uninit(&ghs[qx]);
mutex_unlock(&ip->i_inode.i_mutex);
kfree(ghs);
- gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH);
+ gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH);
return error;
}
@@ -954,7 +954,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
struct gfs2_holder *q_gh)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct gfs2_holder i_gh;
int error;
@@ -1037,7 +1037,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
static int need_sync(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
struct gfs2_tune *gt = &sdp->sd_tune;
s64 value;
unsigned int num, den;
@@ -1125,7 +1125,7 @@ out:
static int print_message(struct gfs2_quota_data *qd, char *type)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
fs_info(sdp, "quota %s for %s %u\n",
type,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c6c62321dfd6..475985d14758 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1860,13 +1860,13 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
{
const struct gfs2_glock *gl = rgd->rd_gl;
- const struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_lkstats *st;
- s64 r_dcount, l_dcount;
- s64 l_srttb, a_srttb = 0;
+ u64 r_dcount, l_dcount;
+ u64 l_srttb, a_srttb = 0;
s64 srttb_diff;
- s64 sqr_diff;
- s64 var;
+ u64 sqr_diff;
+ u64 var;
int cpu, nonzero = 0;
preempt_disable();
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 20c007d747ab..49ac55da4e33 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -104,7 +104,7 @@ TRACE_EVENT(gfs2_glock_state_change,
),
TP_fast_assign(
- __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gl->gl_name.ln_number;
__entry->gltype = gl->gl_name.ln_type;
__entry->cur_state = glock_trace_state(gl->gl_state);
@@ -140,7 +140,7 @@ TRACE_EVENT(gfs2_glock_put,
),
TP_fast_assign(
- __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->gltype = gl->gl_name.ln_type;
__entry->glnum = gl->gl_name.ln_number;
__entry->cur_state = glock_trace_state(gl->gl_state);
@@ -174,7 +174,7 @@ TRACE_EVENT(gfs2_demote_rq,
),
TP_fast_assign(
- __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->gltype = gl->gl_name.ln_type;
__entry->glnum = gl->gl_name.ln_number;
__entry->cur_state = glock_trace_state(gl->gl_state);
@@ -209,7 +209,7 @@ TRACE_EVENT(gfs2_promote,
),
TP_fast_assign(
- __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gh->gh_gl->gl_name.ln_number;
__entry->gltype = gh->gh_gl->gl_name.ln_type;
__entry->first = first;
@@ -239,7 +239,7 @@ TRACE_EVENT(gfs2_glock_queue,
),
TP_fast_assign(
- __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gh->gh_gl->gl_name.ln_number;
__entry->gltype = gh->gh_gl->gl_name.ln_type;
__entry->queue = queue;
@@ -267,18 +267,18 @@ TRACE_EVENT(gfs2_glock_lock_time,
__field( int, status )
__field( char, flags )
__field( s64, tdiff )
- __field( s64, srtt )
- __field( s64, srttvar )
- __field( s64, srttb )
- __field( s64, srttvarb )
- __field( s64, sirt )
- __field( s64, sirtvar )
- __field( s64, dcount )
- __field( s64, qcount )
+ __field( u64, srtt )
+ __field( u64, srttvar )
+ __field( u64, srttb )
+ __field( u64, srttvarb )
+ __field( u64, sirt )
+ __field( u64, sirtvar )
+ __field( u64, dcount )
+ __field( u64, qcount )
),
TP_fast_assign(
- __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gl->gl_name.ln_number;
__entry->gltype = gl->gl_name.ln_type;
__entry->status = gl->gl_lksb.sb_status;
@@ -333,7 +333,7 @@ TRACE_EVENT(gfs2_pin,
),
TP_fast_assign(
- __entry->dev = bd->bd_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = bd->bd_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->pin = pin;
__entry->len = bd->bd_bh->b_size;
__entry->block = bd->bd_bh->b_blocknr;
@@ -449,7 +449,7 @@ TRACE_EVENT(gfs2_bmap,
),
TP_fast_assign(
- __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->lblock = lblock;
__entry->pblock = buffer_mapped(bh) ? bh->b_blocknr : 0;
__entry->inum = ip->i_no_addr;
@@ -489,7 +489,7 @@ TRACE_EVENT(gfs2_block_alloc,
),
TP_fast_assign(
- __entry->dev = rgd->rd_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = rgd->rd_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->start = block;
__entry->inum = ip->i_no_addr;
__entry->len = len;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 88bff2430669..b95d0d625f32 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -158,7 +158,7 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
{
struct gfs2_trans *tr = current->journal_info;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = bh->b_page->mapping;
struct gfs2_inode *ip = GFS2_I(mapping->host);
struct gfs2_bufdata *bd;
@@ -224,7 +224,7 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_bufdata *bd;
lock_buffer(bh);
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index d3fa6bd9503e..221719eac5de 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page_cache_release(page);
goto fail;
}
- page_cache_release(page);
node->page[i] = page;
}
@@ -398,11 +397,11 @@ node_error:
void hfs_bnode_free(struct hfs_bnode *node)
{
- //int i;
+ int i;
- //for (i = 0; i < node->tree->pages_per_bnode; i++)
- // if (node->page[i])
- // page_cache_release(node->page[i]);
+ for (i = 0; i < node->tree->pages_per_bnode; i++)
+ if (node->page[i])
+ page_cache_release(node->page[i]);
kfree(node);
}
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 9f4ee7f52026..6fc766df0461 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -131,13 +131,16 @@ skip:
hfs_bnode_write(node, entry, data_off + key_len, entry_len);
hfs_bnode_dump(node);
- if (new_node) {
- /* update parent key if we inserted a key
- * at the start of the first node
- */
- if (!rec && new_node != node)
- hfs_brec_update_parent(fd);
+ /*
+ * update parent key if we inserted a key
+ * at the start of the node and it is not the new node
+ */
+ if (!rec && new_node != node) {
+ hfs_bnode_read_key(node, fd->search_key, data_off + size);
+ hfs_brec_update_parent(fd);
+ }
+ if (new_node) {
hfs_bnode_put(fd->bnode);
if (!new_node->parent) {
hfs_btree_inc_height(tree);
@@ -166,9 +169,6 @@ skip:
goto again;
}
- if (!rec)
- hfs_brec_update_parent(fd);
-
return 0;
}
@@ -366,6 +366,8 @@ again:
if (IS_ERR(parent))
return PTR_ERR(parent);
__hfs_brec_find(parent, fd);
+ if (fd->record < 0)
+ return -ENOENT;
hfs_bnode_dump(parent);
rec = fd->record;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 759708fd9331..63924662aaf3 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page_cache_release(page);
goto fail;
}
- page_cache_release(page);
node->page[i] = page;
}
@@ -566,13 +565,11 @@ node_error:
void hfs_bnode_free(struct hfs_bnode *node)
{
-#if 0
int i;
for (i = 0; i < node->tree->pages_per_bnode; i++)
if (node->page[i])
page_cache_release(node->page[i]);
-#endif
kfree(node);
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 973c24ce59ad..316adb968b65 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -12,6 +12,7 @@
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h> /* remove ASAP */
+#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
@@ -84,6 +85,29 @@ static const match_table_t tokens = {
{Opt_err, NULL},
};
+#ifdef CONFIG_NUMA
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+ vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+ index);
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+ mpol_cond_put(vma->vm_policy);
+}
+#else
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+}
+#endif
+
static void huge_pagevec_release(struct pagevec *pvec)
{
int i;
@@ -293,26 +317,61 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
return -EINVAL;
}
-static void truncate_huge_page(struct page *page)
+static void remove_huge_page(struct page *page)
{
ClearPageDirty(page);
ClearPageUptodate(page);
delete_from_page_cache(page);
}
-static void truncate_hugepages(struct inode *inode, loff_t lstart)
+
+/*
+ * remove_inode_hugepages handles two distinct cases: truncation and hole
+ * punch. There are subtle differences in operation for each case.
+ *
+ * truncation is indicated by end of range being LLONG_MAX
+ * In this case, we first scan the range and release found pages.
+ * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+ * maps and global counts.
+ * hole punch is indicated if end is not LLONG_MAX
+ * In the hole punch case we scan the range and release found pages.
+ * Only when releasing a page is the associated region/reserv map
+ * deleted. The region/reserv maps for ranges without associated
+ * pages are not modified.
+ * Note: If the passed end of range value is beyond the end of file, but
+ * not LLONG_MAX, this routine still performs a hole punch operation.
+ */
+static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
+ loff_t lend)
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
+ const pgoff_t end = lend >> huge_page_shift(h);
+ struct vm_area_struct pseudo_vma;
struct pagevec pvec;
pgoff_t next;
int i, freed = 0;
+ long lookup_nr = PAGEVEC_SIZE;
+ bool truncate_op = (lend == LLONG_MAX);
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pagevec_init(&pvec, 0);
next = start;
- while (1) {
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ while (next < end) {
+ /*
+ * Make sure to never grab more pages than we
+ * might possibly need.
+ */
+ if (end - next < lookup_nr)
+ lookup_nr = end - next;
+
+ /*
+ * This pagevec_lookup() may return pages past 'end',
+ * so we must check for page->index >= end.
+ */
+ if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
if (next == start)
break;
next = start;
@@ -321,26 +380,69 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
+ u32 hash;
+
+ hash = hugetlb_fault_mutex_hash(h, current->mm,
+ &pseudo_vma,
+ mapping, next, 0);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
lock_page(page);
+ if (page->index >= end) {
+ unlock_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ next = end; /* we are done */
+ break;
+ }
+
+ /*
+ * If page is mapped, it was faulted in after being
+ * unmapped. Do nothing in this race case. In the
+ * normal case page is not mapped.
+ */
+ if (!page_mapped(page)) {
+ bool rsv_on_error = !PagePrivate(page);
+ /*
+ * We must free the huge page and remove
+ * from page cache (remove_huge_page) BEFORE
+ * removing the region/reserve map
+ * (hugetlb_unreserve_pages). In rare out
+ * of memory conditions, removal of the
+ * region/reserve map could fail. Before
+ * freeing the page, note PagePrivate which
+ * is used in case of error.
+ */
+ remove_huge_page(page);
+ freed++;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(
+ inode, next,
+ next + 1, 1)))
+ hugetlb_fix_reserve_counts(
+ inode, rsv_on_error);
+ }
+ }
+
if (page->index > next)
next = page->index;
+
++next;
- truncate_huge_page(page);
unlock_page(page);
- freed++;
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
huge_pagevec_release(&pvec);
}
- BUG_ON(!lstart && mapping->nrpages);
- hugetlb_unreserve_pages(inode, start, freed);
+
+ if (truncate_op)
+ (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
- truncate_hugepages(inode, 0);
+ remove_inode_hugepages(inode, 0, LLONG_MAX);
resv_map = (struct resv_map *)inode->i_mapping->private_data;
/* root inode doesn't have the resv_map, so we should check it */
if (resv_map)
@@ -349,11 +451,15 @@ static void hugetlbfs_evict_inode(struct inode *inode)
}
static inline void
-hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
{
struct vm_area_struct *vma;
- vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
+ /*
+ * end == 0 indicates that the entire range after
+ * start should be unmapped.
+ */
+ vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
unsigned long v_offset;
/*
@@ -362,13 +468,20 @@ hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
* which overlap the truncated area starting at pgoff,
* and no vma on a 32-bit arch can span beyond the 4GB.
*/
- if (vma->vm_pgoff < pgoff)
- v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
+ if (vma->vm_pgoff < start)
+ v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
else
v_offset = 0;
- unmap_hugepage_range(vma, vma->vm_start + v_offset,
- vma->vm_end, NULL);
+ if (end) {
+ end = ((end - start) << PAGE_SHIFT) +
+ vma->vm_start + v_offset;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ } else
+ end = vma->vm_end;
+
+ unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
}
}
@@ -384,12 +497,164 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_size_write(inode, offset);
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
- hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+ hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_unlock_write(mapping);
- truncate_hugepages(inode, offset);
+ remove_inode_hugepages(inode, offset, LLONG_MAX);
return 0;
}
+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct hstate *h = hstate_inode(inode);
+ loff_t hpage_size = huge_page_size(h);
+ loff_t hole_start, hole_end;
+
+ /*
+ * For hole punch round up the beginning offset of the hole and
+ * round down the end.
+ */
+ hole_start = round_up(offset, hpage_size);
+ hole_end = round_down(offset + len, hpage_size);
+
+ if (hole_end > hole_start) {
+ struct address_space *mapping = inode->i_mapping;
+
+ mutex_lock(&inode->i_mutex);
+ i_mmap_lock_write(mapping);
+ if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ hole_start >> PAGE_SHIFT,
+ hole_end >> PAGE_SHIFT);
+ i_mmap_unlock_write(mapping);
+ remove_inode_hugepages(inode, hole_start, hole_end);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ return 0;
+}
+
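To make the rounding concrete, a small sketch with illustrative numbers (the 2 MiB huge page size and offsets are assumptions, not from the patch):

	loff_t hpage_size = 2 << 20;		/* assumed: 2 MiB huge pages */
	loff_t offset = 1 << 20, len = 4 << 20;	/* punch 4 MiB starting at 1 MiB */

	loff_t hole_start = round_up(offset, hpage_size);	 /* = 2 MiB */
	loff_t hole_end = round_down(offset + len, hpage_size);	 /* = 4 MiB */
	/*
	 * Only the single huge page covering [2 MiB, 4 MiB) is removed;
	 * the partially covered pages at either edge are left intact.
	 */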
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct hstate *h = hstate_inode(inode);
+ struct vm_area_struct pseudo_vma;
+ struct mm_struct *mm = current->mm;
+ loff_t hpage_size = huge_page_size(h);
+ unsigned long hpage_shift = huge_page_shift(h);
+ pgoff_t start, index, end;
+ int error;
+ u32 hash;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return hugetlbfs_punch_hole(inode, offset, len);
+
+ /*
+ * Default preallocate case.
+ * For this range, start is rounded down and end is rounded up,
+ * and both are converted to huge page offsets.
+ */
+ start = offset >> hpage_shift;
+ end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+ mutex_lock(&inode->i_mutex);
+
+ /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+ error = inode_newsize_ok(inode, offset + len);
+ if (error)
+ goto out;
+
+ /*
+ * Initialize a pseudo vma as this is required by the huge page
+ * allocation routines. If NUMA is configured, use page index
+ * as input to create an allocation policy.
+ */
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+ pseudo_vma.vm_file = file;
+
+ for (index = start; index < end; index++) {
+ /*
+ * addr below stands in for the vaddr at which the page would
+ * be faulted in; there is no real user vaddr here.
+ */
+ struct page *page;
+ unsigned long addr;
+ int avoid_reserve = 0;
+
+ cond_resched();
+
+ /*
+ * fallocate(2) manpage permits EINTR; we may have been
+ * interrupted because we are using up too much memory.
+ */
+ if (signal_pending(current)) {
+ error = -EINTR;
+ break;
+ }
+
+ /* Set numa allocation policy based on index */
+ hugetlb_set_vma_policy(&pseudo_vma, inode, index);
+
+ /* addr is the offset within the file (zero based) */
+ addr = index * hpage_size;
+
+ /* The fault mutex serializes us against the fault path and hole punch */
+ hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
+ index, addr);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ /* See if already present in mapping to avoid alloc/free */
+ page = find_get_page(mapping, index);
+ if (page) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_drop_vma_policy(&pseudo_vma);
+ continue;
+ }
+
+ /* Allocate page and add to page cache */
+ page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+ hugetlb_drop_vma_policy(&pseudo_vma);
+ if (IS_ERR(page)) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ error = PTR_ERR(page);
+ goto out;
+ }
+ clear_huge_page(page, addr, pages_per_huge_page(h));
+ __SetPageUptodate(page);
+ error = huge_add_to_page_cache(page, mapping, index);
+ if (unlikely(error)) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out;
+ }
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+ /*
+ * unlock_page because the page was locked by
+ * huge_add_to_page_cache(); put_page drops the
+ * reference taken by alloc_huge_page(). Unlock
+ * before dropping what may be the last reference.
+ */
+ unlock_page(page);
+ put_page(page);
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ i_size_write(inode, offset + len);
+ inode->i_ctime = CURRENT_TIME;
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ spin_unlock(&inode->i_lock);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return error;
+}
+
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
@@ -701,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = {
.mmap = hugetlbfs_file_mmap,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
- .llseek = default_llseek,
+ .llseek = default_llseek,
+ .fallocate = hugetlbfs_fallocate,
};
static const struct inode_operations hugetlbfs_dir_inode_operations = {
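A hedged userspace sketch of the interface this hunk enables; the mount point, file name, and 2 MiB huge page size are assumptions:

	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <unistd.h>

	int demo(void)
	{
		int fd = open("/mnt/huge/file", O_CREAT | O_RDWR, 0600);

		if (fd < 0)
			return -1;
		/* preallocate four huge pages, extending i_size */
		if (fallocate(fd, 0, 0, 8 << 20))
			return -1;
		/*
		 * Punch the middle two back out; KEEP_SIZE is mandatory
		 * with PUNCH_HOLE at the VFS level.
		 */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      2 << 20, 4 << 20))
			return -1;
		close(fd);
		return 0;
	}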
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 2d48d28e1640..91e004518237 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -92,6 +92,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
}
/**
+ * kernfs_path_len - determine the length of the full path of a given node
+ * @kn: kernfs_node of interest
+ *
+ * The returned length doesn't include the space for the terminating '\0'.
+ */
+size_t kernfs_path_len(struct kernfs_node *kn)
+{
+ size_t len = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kernfs_rename_lock, flags);
+
+ do {
+ len += strlen(kn->name) + 1;
+ kn = kn->parent;
+ } while (kn && kn->parent);
+
+ spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+
+ return len;
+}
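A minimal sketch of the intended pairing with kernfs_path(); note that kernfs_rename_lock is dropped between the two calls, so the returned length is only a sizing hint:

	size_t len = kernfs_path_len(kn);
	char *buf = kmalloc(len + 1, GFP_KERNEL);

	if (buf)
		kernfs_path(kn, buf, len + 1);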
+
+/**
* kernfs_path - build full path of a given node
* @kn: kernfs_node of interest
* @buf: buffer to copy @kn's name into
diff --git a/fs/namei.c b/fs/namei.c
index 29b927938b8c..726d211db484 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2438,7 +2438,7 @@ done:
/**
* path_mountpoint - look up a path to be umounted
- * @nameidata: lookup context
+ * @nd: lookup context
* @flags: lookup flags
* @path: pointer to container for result
*
diff --git a/fs/nsfs.c b/fs/nsfs.c
index e4905fbf3396..8f20d6016e20 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -142,7 +142,8 @@ static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
- return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
+ seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
+ return 0;
}
static const struct super_operations nsfs_ops = {
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index d0e436dc6437..ce12e0b1a31f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1776,7 +1776,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
struct dlm_migratable_lockres *mres)
{
struct dlm_migratable_lock *ml;
- struct list_head *queue;
+ struct list_head *queue, *iter;
struct list_head *tmpq = NULL;
struct dlm_lock *newlock = NULL;
struct dlm_lockstatus *lksb = NULL;
@@ -1821,7 +1821,9 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
tmpq = dlm_list_idx_to_ptr(res, j);
- list_for_each_entry(lock, tmpq, list) {
+ list_for_each(iter, tmpq) {
+ lock = list_entry(iter,
+ struct dlm_lock, list);
if (lock->ml.cookie == ml->cookie)
break;
lock = NULL;
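For context, a sketch of the hazard this iterator change avoids: list_for_each_entry() advances through the cursor variable itself, so resetting the cursor to NULL in the body makes the next advance dereference NULL, and an exhausted loop leaves the cursor at container_of() of the list head rather than NULL:

	/* unsafe pattern being removed (illustrative): */
	list_for_each_entry(lock, tmpq, list) {
		if (lock->ml.cookie == ml->cookie)
			break;
		lock = NULL;	/* next advance dereferences lock: oops */
	}

	/* safe form, as in the hunk above: iter drives the loop */
	list_for_each(iter, tmpq) {
		lock = list_entry(iter, struct dlm_lock, list);
		if (lock->ml.cookie == ml->cookie)
			break;
		lock = NULL;	/* a miss now really leaves lock == NULL */
	}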
diff --git a/fs/proc/base.c b/fs/proc/base.c
index aa50d1ac28fc..b25eee4cead5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1230,10 +1230,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
struct inode * inode = file_inode(file);
- char *page, *tmp;
- ssize_t length;
uid_t loginuid;
kuid_t kloginuid;
+ int rv;
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1242,46 +1241,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
}
rcu_read_unlock();
- if (count >= PAGE_SIZE)
- count = PAGE_SIZE - 1;
-
if (*ppos != 0) {
/* No partial writes. */
return -EINVAL;
}
- page = (char*)__get_free_page(GFP_TEMPORARY);
- if (!page)
- return -ENOMEM;
- length = -EFAULT;
- if (copy_from_user(page, buf, count))
- goto out_free_page;
-
- page[count] = '\0';
- loginuid = simple_strtoul(page, &tmp, 10);
- if (tmp == page) {
- length = -EINVAL;
- goto out_free_page;
- }
+ rv = kstrtou32_from_user(buf, count, 10, &loginuid);
+ if (rv < 0)
+ return rv;
/* is userspace trying to explicitly UNSET the loginuid? */
if (loginuid == AUDIT_UID_UNSET) {
kloginuid = INVALID_UID;
} else {
kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
- if (!uid_valid(kloginuid)) {
- length = -EINVAL;
- goto out_free_page;
- }
+ if (!uid_valid(kloginuid))
+ return -EINVAL;
}
- length = audit_set_loginuid(kloginuid);
- if (likely(length == 0))
- length = count;
-
-out_free_page:
- free_page((unsigned long) page);
- return length;
+ rv = audit_set_loginuid(kloginuid);
+ if (rv < 0)
+ return rv;
+ return count;
}
static const struct file_operations proc_loginuid_operations = {
@@ -1335,8 +1316,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
const char __user * buf, size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF], *end;
+ char buffer[PROC_NUMBUF];
int make_it_fail;
+ int rv;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
@@ -1345,9 +1327,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
- if (*end)
- return -EINVAL;
+ rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
+ if (rv < 0)
+ return rv;
if (make_it_fail < 0 || make_it_fail > 1)
return -EINVAL;
@@ -1836,8 +1818,6 @@ end_instantiate:
return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
-
/*
* dname_to_vma_addr - maps a dentry name into two unsigned longs
* which represent vma start and end addresses.
@@ -1864,11 +1844,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- if (!capable(CAP_SYS_ADMIN)) {
- status = -EPERM;
- goto out_notask;
- }
-
inode = d_inode(dentry);
task = get_proc_task(inode);
if (!task)
@@ -1957,6 +1932,29 @@ struct map_files_info {
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};
+/*
+ * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+ * symlinks may be used to bypass permissions on ancestor directories in the
+ * path to the file in question.
+ */
+static const char *
+proc_map_files_follow_link(struct dentry *dentry, void **cookie)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ return proc_pid_follow_link(dentry, NULL);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_map_files_follow_link,
+ .setattr = proc_setattr,
+};
+
static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
@@ -1972,7 +1970,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
ei = PROC_I(inode);
ei->op.proc_get_link = proc_map_files_get_link;
- inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
inode->i_mode = S_IFLNK;
@@ -1996,10 +1994,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
int result;
struct mm_struct *mm;
- result = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
result = -ENOENT;
task = get_proc_task(dir);
if (!task)
@@ -2053,10 +2047,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct map_files_info *p;
int ret;
- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2245,7 +2235,6 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */
static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2481,32 +2470,20 @@ static ssize_t proc_coredump_filter_write(struct file *file,
{
struct task_struct *task;
struct mm_struct *mm;
- char buffer[PROC_NUMBUF], *end;
unsigned int val;
int ret;
int i;
unsigned long mask;
- ret = -EFAULT;
- memset(buffer, 0, sizeof(buffer));
- if (count > sizeof(buffer) - 1)
- count = sizeof(buffer) - 1;
- if (copy_from_user(buffer, buf, count))
- goto out_no_task;
-
- ret = -EINVAL;
- val = (unsigned int)simple_strtoul(buffer, &end, 0);
- if (*end == '\n')
- end++;
- if (end - buffer == 0)
- goto out_no_task;
+ ret = kstrtouint_from_user(buf, count, 0, &val);
+ if (ret < 0)
+ return ret;
ret = -ESRCH;
task = get_proc_task(file_inode(file));
if (!task)
goto out_no_task;
- ret = end - buffer;
mm = get_task_mm(task);
if (!mm)
goto out_no_mm;
@@ -2522,7 +2499,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
out_no_mm:
put_task_struct(task);
out_no_task:
- return ret;
+ if (ret < 0)
+ return ret;
+ return count;
}
static const struct file_operations proc_coredump_filter_operations = {
@@ -2744,9 +2723,7 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e5dee5c3188e..ff3ffc76a937 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -26,7 +26,7 @@
#include "internal.h"
-static DEFINE_SPINLOCK(proc_subdir_lock);
+static DEFINE_RWLOCK(proc_subdir_lock);
static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
{
@@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
{
int rv;
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
rv = __xlate_proc_name(name, ret, residual);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return rv;
}
@@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
{
struct inode *inode;
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
if (de) {
pde_get(de);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
d_add(dentry, inode);
return NULL;
}
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
}
@@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
if (!dir_emit_dots(file, ctx))
return 0;
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
de = pde_subdir_first(de);
i = ctx->pos - 2;
for (;;) {
if (!de) {
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return 0;
}
if (!i)
@@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
do {
struct proc_dir_entry *next;
pde_get(de);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
if (!dir_emit(ctx, de->name, de->namelen,
de->low_ino, de->mode >> 12)) {
pde_put(de);
return 0;
}
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
ctx->pos++;
next = pde_subdir_next(de);
pde_put(de);
de = next;
} while (de);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return 1;
}
@@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
if (ret)
return ret;
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
dp->parent = dir;
if (pde_subdir_insert(dir, dp) == false) {
WARN(1, "proc_dir_entry '%s/%s' already registered\n",
dir->name, dp->name);
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
proc_free_inum(dp->low_ino);
return -EEXIST;
}
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return 0;
}
@@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
const char *fn = name;
unsigned int len;
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
if (__xlate_proc_name(name, &parent, &fn) != 0) {
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return;
}
len = strlen(fn);
@@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
de = pde_subdir_find(parent, fn, len);
if (de)
rb_erase(&de->subdir_node, &parent->subdir);
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
return;
@@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
const char *fn = name;
unsigned int len;
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
if (__xlate_proc_name(name, &parent, &fn) != 0) {
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return -ENOENT;
}
len = strlen(fn);
root = pde_subdir_find(parent, fn, len);
if (!root) {
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return -ENOENT;
}
rb_erase(&root->subdir_node, &parent->subdir);
@@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
de = next;
continue;
}
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
proc_entry_rundown(de);
next = de->parent;
@@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
break;
pde_put(de);
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
de = next;
}
pde_put(root);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7eee2d8b97d9..93484034a03d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -9,12 +9,16 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
#include <linux/kernel-page-flags.h>
#include <asm/uaccess.h>
#include "internal.h"
#define KPMSIZE sizeof(u64)
#define KPMMASK (KPMSIZE - 1)
+#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
/* /proc/kpagecount - an array exposing page counts
*
@@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
pfn++;
out++;
count -= KPMSIZE;
+
+ cond_resched();
}
*ppos += (char __user *)out - buf;
@@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page)
if (PageBalloon(page))
u |= 1 << KPF_BALLOON;
+ if (page_is_idle(page))
+ u |= 1 << KPF_IDLE;
+
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
@@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
pfn++;
out++;
count -= KPMSIZE;
+
+ cond_resched();
}
*ppos += (char __user *)out - buf;
@@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = {
.read = kpageflags_read,
};
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 __user *out = (u64 __user *)buf;
+ struct page *ppage;
+ unsigned long src = *ppos;
+ unsigned long pfn;
+ ssize_t ret = 0;
+ u64 ino;
+
+ pfn = src / KPMSIZE;
+ count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+ if (src & KPMMASK || count & KPMMASK)
+ return -EINVAL;
+
+ while (count > 0) {
+ if (pfn_valid(pfn))
+ ppage = pfn_to_page(pfn);
+ else
+ ppage = NULL;
+
+ if (ppage)
+ ino = page_cgroup_ino(ppage);
+ else
+ ino = 0;
+
+ if (put_user(ino, out)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ pfn++;
+ out++;
+ count -= KPMSIZE;
+
+ cond_resched();
+ }
+
+ *ppos += (char __user *)out - buf;
+ if (!ret)
+ ret = (char __user *)out - buf;
+ return ret;
+}
+
+static const struct file_operations proc_kpagecgroup_operations = {
+ .llseek = mem_lseek,
+ .read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
+
static int __init proc_page_init(void)
{
proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+#ifdef CONFIG_MEMCG
+ proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+#endif
return 0;
}
fs_initcall(proc_page_init);
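A hedged userspace sketch of consuming the new file; each 8-byte entry holds the inode number of the memory cgroup the page frame is charged to, or 0:

	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>

	static uint64_t memcg_ino_of_pfn(uint64_t pfn)
	{
		uint64_t ino = 0;
		int fd = open("/proc/kpagecgroup", O_RDONLY);

		if (fd >= 0) {
			pread(fd, &ino, sizeof(ino), pfn * sizeof(ino));
			close(fd);
		}
		return ino;
	}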
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3b4d8255e806..e2d46adb54b4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -13,6 +13,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -446,6 +447,7 @@ struct mem_size_stats {
unsigned long anonymous_thp;
unsigned long swap;
u64 pss;
+ u64 swap_pss;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
@@ -458,7 +460,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
- if (young || PageReferenced(page))
+ if (young || page_is_young(page) || PageReferenced(page))
mss->referenced += size;
mapcount = page_mapcount(page);
if (mapcount >= 2) {
@@ -492,9 +494,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
- if (!non_swap_entry(swpent))
+ if (!non_swap_entry(swpent)) {
+ int mapcount;
+
mss->swap += PAGE_SIZE;
- else if (is_migration_entry(swpent))
+ mapcount = swp_swapcount(swpent);
+ if (mapcount >= 2) {
+ u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
+
+ do_div(pss_delta, mapcount);
+ mss->swap_pss += pss_delta;
+ } else {
+ mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
+ }
+ } else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
}
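A worked example of the accounting just added, with illustrative numbers: for a 4 KiB page in swap whose swp_swapcount() is 3, each mapping process accrues (4096 << PSS_SHIFT) / 3 of fixed-point SwapPss, roughly 1365 bytes once shifted back down, while the plain Swap counter still charges the full 4096 bytes to every process.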
@@ -640,6 +653,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Anonymous: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
"Swap: %8lu kB\n"
+ "SwapPss: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
"MMUPageSize: %8lu kB\n"
"Locked: %8lu kB\n",
@@ -654,6 +668,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
mss.swap >> 10,
+ (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
vma_kernel_pagesize(vma) >> 10,
vma_mmu_pagesize(vma) >> 10,
(vma->vm_flags & VM_LOCKED) ?
@@ -712,23 +727,6 @@ const struct file_operations proc_tid_smaps_operations = {
.release = proc_map_release,
};
-/*
- * We do not want to have constant page-shift bits sitting in
- * pagemap entries and are about to reuse them some time soon.
- *
- * Here's the "migration strategy":
- * 1. when the system boots these bits remain what they are,
- * but a warning about future change is printed in log;
- * 2. once anyone clears soft-dirty bits via clear_refs file,
- * these flag is set to denote, that user is aware of the
- * new API and those page-shift bits change their meaning.
- * The respective warning is printed in dmesg;
- * 3. In a couple of releases we will remove all the mentions
- * of page-shift in pagemap entries.
- */
-
-static bool soft_dirty_cleared __read_mostly;
-
enum clear_refs_types {
CLEAR_REFS_ALL = 1,
CLEAR_REFS_ANON,
@@ -810,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
/* Clear accessed and referenced bits. */
pmdp_test_and_clear_young(vma, addr, pmd);
+ test_and_clear_page_young(page);
ClearPageReferenced(page);
out:
spin_unlock(ptl);
@@ -837,6 +836,7 @@ out:
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
+ test_and_clear_page_young(page);
ClearPageReferenced(page);
}
pte_unmap_unlock(pte - 1, ptl);
@@ -889,13 +889,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
return -EINVAL;
- if (type == CLEAR_REFS_SOFT_DIRTY) {
- soft_dirty_cleared = true;
- pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
- " See the linux/Documentation/vm/pagemap.txt for "
- "details.\n");
- }
-
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
@@ -963,36 +956,26 @@ typedef struct {
struct pagemapread {
int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
pagemap_entry_t *buffer;
- bool v2;
+ bool show_pfn;
};
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
#define PAGEMAP_WALK_MASK (PMD_MASK)
-#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
-#define PM_STATUS_BITS 3
-#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS 6
-#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
-/* in "new" pagemap pshift bits are occupied with more status bits */
-#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
-
-#define __PM_SOFT_DIRTY (1LL)
-#define PM_PRESENT PM_STATUS(4LL)
-#define PM_SWAP PM_STATUS(2LL)
-#define PM_FILE PM_STATUS(1LL)
-#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
+#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
+#define PM_SOFT_DIRTY BIT_ULL(55)
+#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_FILE BIT_ULL(61)
+#define PM_SWAP BIT_ULL(62)
+#define PM_PRESENT BIT_ULL(63)
+
#define PM_END_OF_BUFFER 1
-static inline pagemap_entry_t make_pme(u64 val)
+static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
{
- return (pagemap_entry_t) { .pme = val };
+ return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}
static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
@@ -1013,7 +996,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
while (addr < end) {
struct vm_area_struct *vma = find_vma(walk->mm, addr);
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ pagemap_entry_t pme = make_pme(0, 0);
/* End of address space hole, which we mark as non-present. */
unsigned long hole_end;
@@ -1033,7 +1016,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
/* Addresses in the VMA. */
if (vma->vm_flags & VM_SOFTDIRTY)
- pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+ pme = make_pme(0, PM_SOFT_DIRTY);
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
err = add_to_pagemap(addr, &pme, pm);
if (err)
@@ -1044,67 +1027,42 @@ out:
return err;
}
-static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
- u64 frame, flags;
+ u64 frame = 0, flags = 0;
struct page *page = NULL;
- int flags2 = 0;
if (pte_present(pte)) {
- frame = pte_pfn(pte);
- flags = PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte);
+ flags |= PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
- flags2 |= __PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
if (pte_swp_soft_dirty(pte))
- flags2 |= __PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY;
entry = pte_to_swp_entry(pte);
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
- flags = PM_SWAP;
+ flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
- } else {
- if (vma->vm_flags & VM_SOFTDIRTY)
- flags2 |= __PM_SOFT_DIRTY;
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
- return;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
- if ((vma->vm_flags & VM_SOFTDIRTY))
- flags2 |= __PM_SOFT_DIRTY;
-
- *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
-}
+ if (page && page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pmd_t pmd, int offset, int pmd_flags2)
-{
- /*
- * Currently pmd for thp is always present because thp can not be
- * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
- * This if-check is just to prepare for future implementation.
- */
- if (pmd_present(pmd))
- *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
- | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
- else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
+ return make_pme(frame, flags);
}
-#else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pmd_t pmd, int offset, int pmd_flags2)
-{
-}
-#endif
-static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
@@ -1113,41 +1071,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte, *orig_pte;
int err = 0;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- int pmd_flags2;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+ u64 flags = 0, frame = 0;
+ pmd_t pmd = *pmdp;
- if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
- pmd_flags2 = __PM_SOFT_DIRTY;
- else
- pmd_flags2 = 0;
+ if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+
+ /*
+ * Currently a pmd for THP is always present because THP
+ * cannot be swapped out, migrated, or HWPOISONed
+ * (it is split in such cases instead).
+ * This check just prepares for a future implementation.
+ */
+ if (pmd_present(pmd)) {
+ struct page *page = pmd_page(pmd);
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pmd_pfn(pmd) +
+ ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ }
for (; addr != end; addr += PAGE_SIZE) {
- unsigned long offset;
- pagemap_entry_t pme;
+ pagemap_entry_t pme = make_pme(frame, flags);
- offset = (addr & ~PAGEMAP_WALK_MASK) >>
- PAGE_SHIFT;
- thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
}
spin_unlock(ptl);
return err;
}
- if (pmd_trans_unstable(pmd))
+ if (pmd_trans_unstable(pmdp))
return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* We can assume that @vma always points to a valid one and @end never
* goes beyond vma->vm_end.
*/
- orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
for (; addr < end; pte++, addr += PAGE_SIZE) {
pagemap_entry_t pme;
- pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
+ pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
@@ -1160,40 +1135,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
#ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pte_t pte, int offset, int flags2)
-{
- if (pte_present(pte))
- *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
- PM_STATUS2(pm->v2, flags2) |
- PM_PRESENT);
- else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
- PM_STATUS2(pm->v2, flags2));
-}
-
/* This function walks within one hugetlb entry in the single call */
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
struct vm_area_struct *vma = walk->vma;
+ u64 flags = 0, frame = 0;
int err = 0;
- int flags2;
- pagemap_entry_t pme;
+ pte_t pte;
if (vma->vm_flags & VM_SOFTDIRTY)
- flags2 = __PM_SOFT_DIRTY;
- else
- flags2 = 0;
+ flags |= PM_SOFT_DIRTY;
+
+ pte = huge_ptep_get(ptep);
+ if (pte_present(pte)) {
+ struct page *page = pte_page(pte);
+
+ if (!PageAnon(page))
+ flags |= PM_FILE;
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte) +
+ ((addr & ~hmask) >> PAGE_SHIFT);
+ }
for (; addr != end; addr += PAGE_SIZE) {
- int offset = (addr & ~hmask) >> PAGE_SHIFT;
- huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
+ pagemap_entry_t pme = make_pme(frame, flags);
+
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
}
cond_resched();
@@ -1211,7 +1190,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
* Bits 0-54 page frame number (PFN) if present
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
- * Bits 55-60 page shift (page size = 1<<page shift)
+ * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+ * Bit 56 page exclusively mapped
+ * Bits 57-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
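A hedged userspace sketch of decoding one entry under the new layout (vaddr is an assumed input; the PFN field reads back as zero without CAP_SYS_ADMIN, per the read-side check below):

	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>

	static uint64_t pagemap_entry_of(uintptr_t vaddr)
	{
		uint64_t ent = 0;
		long psz = sysconf(_SC_PAGESIZE);
		int fd = open("/proc/self/pagemap", O_RDONLY);

		if (fd >= 0) {
			pread(fd, &ent, sizeof(ent), (vaddr / psz) * 8);
			close(fd);
		}
		/*
		 * bit 63 present, bit 62 swapped, bit 56 exclusive,
		 * bit 55 soft-dirty, bits 0-54 PFN
		 */
		return ent;
	}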
@@ -1229,42 +1210,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file_inode(file));
- struct mm_struct *mm;
+ struct mm_struct *mm = file->private_data;
struct pagemapread pm;
- int ret = -ESRCH;
struct mm_walk pagemap_walk = {};
unsigned long src;
unsigned long svpfn;
unsigned long start_vaddr;
unsigned long end_vaddr;
- int copied = 0;
+ int ret = 0, copied = 0;
- if (!task)
+ if (!mm || !atomic_inc_not_zero(&mm->mm_users))
goto out;
ret = -EINVAL;
/* file position must be aligned */
if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
- goto out_task;
+ goto out_mm;
ret = 0;
if (!count)
- goto out_task;
+ goto out_mm;
+
+ /* do not disclose physical addresses: attack vector */
+ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
- pm.v2 = soft_dirty_cleared;
pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
ret = -ENOMEM;
if (!pm.buffer)
- goto out_task;
-
- mm = mm_access(task, PTRACE_MODE_READ);
- ret = PTR_ERR(mm);
- if (!mm || IS_ERR(mm))
- goto out_free;
+ goto out_mm;
- pagemap_walk.pmd_entry = pagemap_pte_range;
+ pagemap_walk.pmd_entry = pagemap_pmd_range;
pagemap_walk.pte_hole = pagemap_pte_hole;
#ifdef CONFIG_HUGETLB_PAGE
pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -1275,10 +1251,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
src = *ppos;
svpfn = src / PM_ENTRY_BYTES;
start_vaddr = svpfn << PAGE_SHIFT;
- end_vaddr = TASK_SIZE_OF(task);
+ end_vaddr = mm->task_size;
/* watch out for wraparound */
- if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+ if (svpfn > mm->task_size >> PAGE_SHIFT)
start_vaddr = end_vaddr;
/*
@@ -1305,7 +1281,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
len = min(count, PM_ENTRY_BYTES * pm.pos);
if (copy_to_user(buf, pm.buffer, len)) {
ret = -EFAULT;
- goto out_mm;
+ goto out_free;
}
copied += len;
buf += len;
@@ -1315,24 +1291,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!ret || ret == PM_END_OF_BUFFER)
ret = copied;
-out_mm:
- mmput(mm);
out_free:
kfree(pm.buffer);
-out_task:
- put_task_struct(task);
+out_mm:
+ mmput(mm);
out:
return ret;
}
static int pagemap_open(struct inode *inode, struct file *file)
{
- /* do not disclose physical addresses: attack vector */
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
- "to stop being page-shift some time soon. See the "
- "linux/Documentation/vm/pagemap.txt for details.\n");
+ struct mm_struct *mm;
+
+ mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+ file->private_data = mm;
+ return 0;
+}
+
+static int pagemap_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+
+ if (mm)
+ mmdrop(mm);
return 0;
}
@@ -1340,6 +1323,7 @@ const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
+ .release = pagemap_release,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index ce9e39fd5daf..225586e141ca 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mm.h>
+#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -371,16 +372,16 @@ EXPORT_SYMBOL(seq_release);
* @esc: set of characters that need escaping
*
* Puts string into buffer, replacing each occurrence of character from
- * @esc with usual octal escape. Returns 0 in case of success, -1 - in
- * case of overflow.
+ * @esc with usual octal escape.
+ * Use seq_has_overflowed() to check for errors.
*/
-int seq_escape(struct seq_file *m, const char *s, const char *esc)
+void seq_escape(struct seq_file *m, const char *s, const char *esc)
{
char *end = m->buf + m->size;
- char *p;
+ char *p;
char c;
- for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
+ for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
if (!strchr(esc, c)) {
*p++ = c;
continue;
@@ -393,14 +394,13 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc)
continue;
}
seq_set_overflow(m);
- return -1;
- }
+ return;
+ }
m->count = p - m->buf;
- return 0;
}
EXPORT_SYMBOL(seq_escape);
-int seq_vprintf(struct seq_file *m, const char *f, va_list args)
+void seq_vprintf(struct seq_file *m, const char *f, va_list args)
{
int len;
@@ -408,24 +408,20 @@ int seq_vprintf(struct seq_file *m, const char *f, va_list args)
len = vsnprintf(m->buf + m->count, m->size - m->count, f, args);
if (m->count + len < m->size) {
m->count += len;
- return 0;
+ return;
}
}
seq_set_overflow(m);
- return -1;
}
EXPORT_SYMBOL(seq_vprintf);
-int seq_printf(struct seq_file *m, const char *f, ...)
+void seq_printf(struct seq_file *m, const char *f, ...)
{
- int ret;
va_list args;
va_start(args, f);
- ret = seq_vprintf(m, f, args);
+ seq_vprintf(m, f, args);
va_end(args);
-
- return ret;
}
EXPORT_SYMBOL(seq_printf);
@@ -663,26 +659,25 @@ int seq_open_private(struct file *filp, const struct seq_operations *ops,
}
EXPORT_SYMBOL(seq_open_private);
-int seq_putc(struct seq_file *m, char c)
+void seq_putc(struct seq_file *m, char c)
{
- if (m->count < m->size) {
- m->buf[m->count++] = c;
- return 0;
- }
- return -1;
+ if (m->count >= m->size)
+ return;
+
+ m->buf[m->count++] = c;
}
EXPORT_SYMBOL(seq_putc);
-int seq_puts(struct seq_file *m, const char *s)
+void seq_puts(struct seq_file *m, const char *s)
{
int len = strlen(s);
- if (m->count + len < m->size) {
- memcpy(m->buf + m->count, s, len);
- m->count += len;
- return 0;
+
+ if (m->count + len >= m->size) {
+ seq_set_overflow(m);
+ return;
}
- seq_set_overflow(m);
- return -1;
+ memcpy(m->buf + m->count, s, len);
+ m->count += len;
}
EXPORT_SYMBOL(seq_puts);
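A hypothetical show() routine under the void-returning API: callers stop checking per-call results, since the seq_file core detects a full buffer via seq_has_overflowed() and retries the whole show with a larger buffer:

	static int foo_show(struct seq_file *m, void *v)
	{
		seq_puts(m, "state: ");
		seq_printf(m, "%d\n", 42);
		return 0;
	}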
@@ -693,8 +688,8 @@ EXPORT_SYMBOL(seq_puts);
* This routine is very quick when you show lots of numbers.
* In usual cases, it will be better to use seq_printf(). It's easier to read.
*/
-int seq_put_decimal_ull(struct seq_file *m, char delimiter,
- unsigned long long num)
+void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+ unsigned long long num)
{
int len;
@@ -706,35 +701,33 @@ int seq_put_decimal_ull(struct seq_file *m, char delimiter,
if (num < 10) {
m->buf[m->count++] = num + '0';
- return 0;
+ return;
}
len = num_to_str(m->buf + m->count, m->size - m->count, num);
if (!len)
goto overflow;
m->count += len;
- return 0;
+ return;
+
overflow:
seq_set_overflow(m);
- return -1;
}
EXPORT_SYMBOL(seq_put_decimal_ull);
-int seq_put_decimal_ll(struct seq_file *m, char delimiter,
- long long num)
+void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num)
{
if (num < 0) {
if (m->count + 3 >= m->size) {
seq_set_overflow(m);
- return -1;
+ return;
}
if (delimiter)
m->buf[m->count++] = delimiter;
num = -num;
delimiter = '-';
}
- return seq_put_decimal_ull(m, delimiter, num);
-
+ seq_put_decimal_ull(m, delimiter, num);
}
EXPORT_SYMBOL(seq_put_decimal_ll);
@@ -773,6 +766,47 @@ void seq_pad(struct seq_file *m, char c)
}
EXPORT_SYMBOL(seq_pad);
+/* A complete analogue of print_hex_dump() */
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+ int rowsize, int groupsize, const void *buf, size_t len,
+ bool ascii)
+{
+ const u8 *ptr = buf;
+ int i, linelen, remaining = len;
+ int ret;
+
+ if (rowsize != 16 && rowsize != 32)
+ rowsize = 16;
+
+ for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) {
+ linelen = min(remaining, rowsize);
+ remaining -= rowsize;
+
+ switch (prefix_type) {
+ case DUMP_PREFIX_ADDRESS:
+ seq_printf(m, "%s%p: ", prefix_str, ptr + i);
+ break;
+ case DUMP_PREFIX_OFFSET:
+ seq_printf(m, "%s%.8x: ", prefix_str, i);
+ break;
+ default:
+ seq_printf(m, "%s", prefix_str);
+ break;
+ }
+
+ ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
+ m->buf + m->count, m->size - m->count,
+ ascii);
+ if (ret >= m->size - m->count) {
+ seq_set_overflow(m);
+ } else {
+ m->count += ret;
+ seq_putc(m, '\n');
+ }
+ }
+}
+EXPORT_SYMBOL(seq_hex_dump);
+
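A short usage sketch; the surrounding show() function and the blob fields are assumptions, not from this patch:

	static int blob_show(struct seq_file *m, void *v)
	{
		struct foo_dev *dev = m->private;	/* hypothetical */

		seq_hex_dump(m, "blob: ", DUMP_PREFIX_OFFSET, 16, 1,
			     dev->blob, dev->blob_len, true);
		return 0;
	}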
struct list_head *seq_list_start(struct list_head *head, loff_t pos)
{
struct list_head *lh;
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index fb8b54eb77c5..dc5fae601c24 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -417,14 +417,14 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
if (oldcount == 0) {
result = ufs_alloc_fragments (inode, cgno, goal, count, err);
if (result) {
+ ufs_clear_frags(inode, result + oldcount,
+ newcount - oldcount, locked_page != NULL);
write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
write_sequnlock(&UFS_I(inode)->meta_lock);
*err = 0;
UFS_I(inode)->i_lastfrag =
max(UFS_I(inode)->i_lastfrag, fragment + count);
- ufs_clear_frags(inode, result + oldcount,
- newcount - oldcount, locked_page != NULL);
}
mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 331c1ccf8264..c79b717d9b88 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -23,6 +23,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/dax.h>
#include <linux/buffer_head.h>
#include <linux/uio.h>
#include <linux/list_lru.h>
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index de2c2376242b..e78feb400e22 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1546,8 +1546,36 @@ xfs_filemap_fault(
return ret;
}
+STATIC int
+xfs_filemap_pmd_fault(
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ int ret;
+
+ if (!IS_DAX(inode))
+ return VM_FAULT_FALLBACK;
+
+ trace_xfs_filemap_pmd_fault(ip);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
+ xfs_end_io_dax_write);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
+}
+
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
+ .pmd_fault = xfs_filemap_pmd_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
};
@@ -1560,7 +1588,7 @@ xfs_file_mmap(
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 9aeeb21bc3d0..5ed36b1e04c1 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -687,6 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
DECLARE_EVENT_CLASS(xfs_iref_class,