aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/compression.c17
-rw-r--r--fs/btrfs/extent-tree.c2
-rw-r--r--fs/btrfs/file-item.c108
-rw-r--r--fs/btrfs/inode.c19
-rw-r--r--fs/btrfs/reflink.c38
-rw-r--r--fs/btrfs/tree-log.c21
-rw-r--r--fs/gfs2/file.c5
-rw-r--r--fs/gfs2/glock.c28
-rw-r--r--fs/gfs2/glops.c2
-rw-r--r--fs/gfs2/log.c6
-rw-r--r--fs/gfs2/log.h1
-rw-r--r--fs/gfs2/lops.c7
-rw-r--r--fs/gfs2/lops.h1
-rw-r--r--fs/gfs2/util.c1
-rw-r--r--fs/io_uring.c1
-rw-r--r--fs/notify/fanotify/fanotify_user.c30
-rw-r--r--fs/notify/fdinfo.c2
17 files changed, 214 insertions, 75 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d17ac301032e..1346d698463a 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -457,7 +457,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
bytes_left = compressed_len;
for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
int submit = 0;
- int len;
+ int len = 0;
page = compressed_pages[pg_index];
page->mapping = inode->vfs_inode.i_mapping;
@@ -465,10 +465,17 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
0);
- if (pg_index == 0 && use_append)
- len = bio_add_zone_append_page(bio, page, PAGE_SIZE, 0);
- else
- len = bio_add_page(bio, page, PAGE_SIZE, 0);
+ /*
+ * Page can only be added to bio if the current bio fits in
+ * stripe.
+ */
+ if (!submit) {
+ if (pg_index == 0 && use_append)
+ len = bio_add_zone_append_page(bio, page,
+ PAGE_SIZE, 0);
+ else
+ len = bio_add_page(bio, page, PAGE_SIZE, 0);
+ }
page->mapping = NULL;
if (submit || len < PAGE_SIZE) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f1d15b68994a..3d5c35e4cb76 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1868,7 +1868,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
btrfs_put_delayed_ref_head(head);
- return 0;
+ return ret;
}
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 294602f139ef..441cee7fbb62 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -788,7 +788,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 end_byte = bytenr + len;
u64 csum_end;
struct extent_buffer *leaf;
- int ret;
+ int ret = 0;
const u32 csum_size = fs_info->csum_size;
u32 blocksize_bits = fs_info->sectorsize_bits;
@@ -806,6 +806,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -862,7 +863,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, root, path,
path->slots[0], del_nr);
if (ret)
- goto out;
+ break;
if (key.offset == bytenr)
break;
} else if (key.offset < bytenr && csum_end > end_byte) {
@@ -906,8 +907,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_split_item(trans, root, path, &key, offset);
if (ret && ret != -EAGAIN) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ break;
}
+ ret = 0;
key.offset = end_byte - 1;
} else {
@@ -917,12 +919,41 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
}
- ret = 0;
-out:
btrfs_free_path(path);
return ret;
}
+static int find_next_csum_offset(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 *next_offset)
+{
+ const u32 nritems = btrfs_header_nritems(path->nodes[0]);
+ struct btrfs_key found_key;
+ int slot = path->slots[0] + 1;
+ int ret;
+
+ if (nritems == 0 || slot >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *next_offset = (u64)-1;
+ return 0;
+ }
+ slot = path->slots[0];
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+
+ if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ found_key.type != BTRFS_EXTENT_CSUM_KEY)
+ *next_offset = (u64)-1;
+ else
+ *next_offset = found_key.offset;
+
+ return 0;
+}
+
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
@@ -938,7 +969,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
u64 total_bytes = 0;
u64 csum_offset;
u64 bytenr;
- u32 nritems;
u32 ins_size;
int index = 0;
int found_next;
@@ -981,26 +1011,10 @@ again:
goto insert;
}
} else {
- int slot = path->slots[0] + 1;
- /* we didn't find a csum item, insert one */
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (!nritems || (path->slots[0] >= nritems - 1)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- found_next = 1;
- goto insert;
- }
- slot = path->slots[0];
- }
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
- if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
- found_key.type != BTRFS_EXTENT_CSUM_KEY) {
- found_next = 1;
- goto insert;
- }
- next_offset = found_key.offset;
+ /* We didn't find a csum item, insert one. */
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
found_next = 1;
goto insert;
}
@@ -1056,8 +1070,48 @@ extend_csum:
tmp = sums->len - total_bytes;
tmp >>= fs_info->sectorsize_bits;
WARN_ON(tmp < 1);
+ extend_nr = max_t(int, 1, tmp);
+
+ /*
+ * A log tree can already have checksum items with a subset of
+ * the checksums we are trying to log. This can happen after
+ * doing a sequence of partial writes into prealloc extents and
+ * fsyncs in between, with a full fsync logging a larger subrange
+ * of an extent for which a previous fast fsync logged a smaller
+ * subrange. And this happens in particular due to merging file
+ * extent items when we complete an ordered extent for a range
+ * covered by a prealloc extent - this is done at
+ * btrfs_mark_extent_written().
+ *
+ * So if we try to extend the previous checksum item, which has
+ * a range that ends at the start of the range we want to insert,
+ * make sure we don't extend beyond the start offset of the next
+ * checksum item. If we are at the last item in the leaf, then
+ * forget the optimization of extending and add a new checksum
+ * item - it is not worth the complexity of releasing the path,
+ * getting the first key for the next leaf, repeat the btree
+ * search, etc, because log trees are temporary anyway and it
+ * would only save a few bytes of leaf space.
+ */
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (path->slots[0] + 1 >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+ found_next = 1;
+ goto insert;
+ }
+
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+
+ tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits;
+ if (tmp <= INT_MAX)
+ extend_nr = min_t(int, extend_nr, tmp);
+ }
- extend_nr = max_t(int, 1, (int)tmp);
diff = (csum_offset + extend_nr) * csum_size;
diff = min(diff,
MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 33f14573f2ec..46f392943f4d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3000,6 +3000,18 @@ out:
if (ret || truncated) {
u64 unwritten_start = start;
+ /*
+ * If we failed to finish this ordered extent for any reason we
+ * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
+ * extent, and mark the inode with the error if it wasn't
+ * already set. Any error during writeback would have already
+ * set the mapping error, so we need to set it if we're the ones
+ * marking this ordered extent as failed.
+ */
+ if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
+ &ordered_extent->flags))
+ mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
+
if (truncated)
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
@@ -9076,6 +9088,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
+ bool need_abort = false;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
@@ -9135,6 +9148,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
old_idx);
if (ret)
goto out_fail;
+ need_abort = true;
}
/* And now for the dest. */
@@ -9150,8 +9164,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
new_ino,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
- if (ret)
+ if (ret) {
+ if (need_abort)
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
+ }
}
/* Update inode version and ctime/mtime. */
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index d434dc78dadf..9178da07cc9c 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -203,10 +203,7 @@ static int clone_copy_inline_extent(struct inode *dst,
* inline extent's data to the page.
*/
ASSERT(key.offset > 0);
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal,
- comp_type);
- goto out;
+ goto copy_to_page;
}
} else if (i_size_read(dst) <= datal) {
struct btrfs_file_extent_item *ei;
@@ -222,13 +219,10 @@ static int clone_copy_inline_extent(struct inode *dst,
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal, comp_type);
- goto out;
+ goto copy_to_page;
}
copy_inline_extent:
- ret = 0;
/*
* We have no extent items, or we have an extent at offset 0 which may
* or may not be inlined. All these cases are dealt the same way.
@@ -240,11 +234,13 @@ copy_inline_extent:
* clone. Deal with all these cases by copying the inline extent
* data into the respective page at the destination inode.
*/
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal, comp_type);
- goto out;
+ goto copy_to_page;
}
+ /*
+ * Release path before starting a new transaction so we don't hold locks
+ * that would confuse lockdep.
+ */
btrfs_release_path(path);
/*
* If we end up here it means were copy the inline extent into a leaf
@@ -282,11 +278,6 @@ copy_inline_extent:
out:
if (!ret && !trans) {
/*
- * Release path before starting a new transaction so we don't
- * hold locks that would confuse lockdep.
- */
- btrfs_release_path(path);
- /*
* No transaction here means we copied the inline extent into a
* page of the destination inode.
*
@@ -306,6 +297,21 @@ out:
*trans_out = trans;
return ret;
+
+copy_to_page:
+ /*
+ * Release our path because we don't need it anymore and also because
+ * copy_inline_to_page() needs to reserve data and metadata, which may
+ * need to flush delalloc when we are low on available space and
+ * therefore cause a deadlock if writeback of an inline extent needs to
+ * write to the same leaf or an ordered extent completion needs to write
+ * to the same leaf.
+ */
+ btrfs_release_path(path);
+
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
+ goto out;
}
/**
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 326be57f2828..362d14db1e38 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1574,7 +1574,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
}
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
@@ -1749,7 +1751,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
}
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1787,6 +1791,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
if (ret == 1) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -1799,17 +1804,19 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ break;
btrfs_release_path(path);
inode = read_one_inode(root, key.offset);
- if (!inode)
- return -EIO;
+ if (!inode) {
+ ret = -EIO;
+ break;
+ }
ret = fixup_inode_link_count(trans, root, inode);
iput(inode);
if (ret)
- goto out;
+ break;
/*
* fixup on a directory may create new entries,
@@ -1818,8 +1825,6 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
*/
key.offset = (u64)-1;
}
- ret = 0;
-out:
btrfs_release_path(path);
return ret;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a0b542d84cd9..493a83e3f590 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -911,8 +911,11 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
current->backing_dev_info = inode_to_bdi(inode);
buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
current->backing_dev_info = NULL;
- if (unlikely(buffered <= 0))
+ if (unlikely(buffered <= 0)) {
+ if (!ret)
+ ret = buffered;
goto out_unlock;
+ }
/*
* We need to ensure that the page cache pages are written to
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ea7fc5c641c7..d9cb261f55b0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -582,6 +582,16 @@ out_locked:
spin_unlock(&gl->gl_lockref.lock);
}
+static bool is_system_glock(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+ if (gl == m_ip->i_gl)
+ return true;
+ return false;
+}
+
/**
* do_xmote - Calls the DLM to change the state of a lock
* @gl: The lock state
@@ -671,17 +681,25 @@ skip_inval:
* to see sd_log_error and withdraw, and in the meantime, requeue the
* work for later.
*
+ * We make a special exception for some system glocks, such as the
+ * system statfs inode glock, which needs to be granted before the
+ * gfs2_quotad daemon can exit, and that exit needs to finish before
+ * we can unmount the withdrawn file system.
+ *
* However, if we're just unlocking the lock (say, for unmount, when
* gfs2_gl_hash_clear calls clear_glock) and recovery is complete
* then it's okay to tell dlm to unlock it.
*/
if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp)))
gfs2_withdraw_delayed(sdp);
- if (glock_blocked_by_withdraw(gl)) {
- if (target != LM_ST_UNLOCKED ||
- test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags)) {
+ if (glock_blocked_by_withdraw(gl) &&
+ (target != LM_ST_UNLOCKED ||
+ test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) {
+ if (!is_system_glock(gl)) {
gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD);
goto out;
+ } else {
+ clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
}
}
@@ -1466,9 +1484,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
glock_blocked_by_withdraw(gl) &&
gh->gh_gl != sdp->sd_jinode_gl) {
sdp->sd_glock_dqs_held++;
+ spin_unlock(&gl->gl_lockref.lock);
might_sleep();
wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
TASK_UNINTERRUPTIBLE);
+ spin_lock(&gl->gl_lockref.lock);
}
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
@@ -1775,6 +1795,7 @@ __acquires(&lru_lock)
while(!list_empty(list)) {
gl = list_first_entry(list, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
+ clear_bit(GLF_LRU, &gl->gl_flags);
if (!spin_trylock(&gl->gl_lockref.lock)) {
add_back_to_lru:
list_add(&gl->gl_lru, &lru_list);
@@ -1820,7 +1841,6 @@ static long gfs2_scan_glock_lru(int nr)
if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
list_move(&gl->gl_lru, &dispose);
atomic_dec(&lru_count);
- clear_bit(GLF_LRU, &gl->gl_flags);
freed++;
continue;
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 454095e9fedf..54d3fbeb3002 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -396,7 +396,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
struct timespec64 atime;
u16 height, depth;
umode_t mode = be32_to_cpu(str->di_mode);
- bool is_new = ip->i_inode.i_flags & I_NEW;
+ bool is_new = ip->i_inode.i_state & I_NEW;
if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
goto corrupt;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 97d54e581a7b..42c15cfc0821 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -926,10 +926,10 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
}
/**
- * ail_drain - drain the ail lists after a withdraw
+ * gfs2_ail_drain - drain the ail lists after a withdraw
* @sdp: Pointer to GFS2 superblock
*/
-static void ail_drain(struct gfs2_sbd *sdp)
+void gfs2_ail_drain(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr;
@@ -956,6 +956,7 @@ static void ail_drain(struct gfs2_sbd *sdp)
list_del(&tr->tr_list);
gfs2_trans_free(sdp, tr);
}
+ gfs2_drain_revokes(sdp);
spin_unlock(&sdp->sd_ail_lock);
}
@@ -1162,7 +1163,6 @@ out_withdraw:
if (tr && list_empty(&tr->tr_list))
list_add(&tr->tr_list, &sdp->sd_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
- ail_drain(sdp); /* frees all transactions */
tr = NULL;
goto out_end;
}
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index eea58015710e..fc905c2af53c 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -93,5 +93,6 @@ extern int gfs2_logd(void *data);
extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl);
extern void gfs2_flush_revokes(struct gfs2_sbd *sdp);
+extern void gfs2_ail_drain(struct gfs2_sbd *sdp);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 221e7118cc3b..8ee05d25dfa6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -885,7 +885,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
gfs2_log_write_page(sdp, page);
}
-static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+void gfs2_drain_revokes(struct gfs2_sbd *sdp)
{
struct list_head *head = &sdp->sd_log_revokes;
struct gfs2_bufdata *bd;
@@ -900,6 +900,11 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
}
}
+static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+ gfs2_drain_revokes(sdp);
+}
+
static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, int pass)
{
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 31b6dd0d2e5d..f707601597dc 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -20,6 +20,7 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf);
extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, bool keep_cache);
+extern void gfs2_drain_revokes(struct gfs2_sbd *sdp);
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
return sdp->sd_ldptrs;
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 3e08027a6c81..f4325b44956d 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -131,6 +131,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) || !sdp->sd_jdesc)
return;
+ gfs2_ail_drain(sdp); /* frees all transactions */
inode = sdp->sd_jdesc->jd_inode;
ip = GFS2_I(inode);
i_gl = ip->i_gl;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 903458afd56c..42380ed563c4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8228,6 +8228,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
{
int i, ret;
+ imu->acct_pages = 0;
for (i = 0; i < nr_pages; i++) {
if (!PageCompound(pages[i])) {
imu->acct_pages++;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 71fefb30e015..be5b6d2c01e7 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -424,11 +424,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
* events generated by the listener process itself, without disclosing
* the pids of other processes.
*/
- if (!capable(CAP_SYS_ADMIN) &&
+ if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
task_tgid(current) != event->pid)
metadata.pid = 0;
- if (path && path->mnt && path->dentry) {
+ /*
+ * For now, fid mode is required for an unprivileged listener and
+ * fid mode does not report fd in events. Keep this check anyway
+ * for safety in case fid mode requirement is relaxed in the future
+ * to allow unprivileged listener to get events with no fd and no fid.
+ */
+ if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
+ path && path->mnt && path->dentry) {
fd = create_fd(group, path, &f);
if (fd < 0)
return fd;
@@ -1040,6 +1047,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
int f_flags, fd;
unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
unsigned int class = flags & FANOTIFY_CLASS_BITS;
+ unsigned int internal_flags = 0;
pr_debug("%s: flags=%x event_f_flags=%x\n",
__func__, flags, event_f_flags);
@@ -1053,6 +1061,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
*/
if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
return -EPERM;
+
+ /*
+ * Setting the internal flag FANOTIFY_UNPRIV on the group
+ * prevents setting mount/filesystem marks on this group and
+ * prevents reporting pid and open fd in events.
+ */
+ internal_flags |= FANOTIFY_UNPRIV;
}
#ifdef CONFIG_AUDITSYSCALL
@@ -1105,7 +1120,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
goto out_destroy_group;
}
- group->fanotify_data.flags = flags;
+ group->fanotify_data.flags = flags | internal_flags;
group->memcg = get_mem_cgroup_from_mm(current->mm);
group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
@@ -1305,11 +1320,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
group = f.file->private_data;
/*
- * An unprivileged user is not allowed to watch a mount point nor
- * a filesystem.
+ * An unprivileged user is not allowed to setup mount nor filesystem
+ * marks. This also includes setting up such marks by a group that
+ * was initialized by an unprivileged user.
*/
ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN) &&
+ if ((!capable(CAP_SYS_ADMIN) ||
+ FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
mark_type != FAN_MARK_INODE)
goto fput_and_out;
@@ -1460,6 +1477,7 @@ static int __init fanotify_user_setup(void)
max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
FANOTIFY_DEFAULT_MAX_USER_MARKS);
+ BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index a712b2aaa9ac..57f0d5d9f934 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -144,7 +144,7 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
struct fsnotify_group *group = f->private_data;
seq_printf(m, "fanotify flags:%x event-flags:%x\n",
- group->fanotify_data.flags,
+ group->fanotify_data.flags & FANOTIFY_INIT_FLAGS,
group->fanotify_data.f_flags);
show_fdinfo(m, f, fanotify_fdinfo);