Diffstat (limited to 'fs')
479 files changed, 13042 insertions, 10590 deletions
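Two API migrations recur throughout the hunks below. The first is the netfs read path: 9p and afs now record progress in subreq->transferred and complete with netfs_read_subreq_terminated(), which takes an error rather than a byte count, signalling EOF via NETFS_SREQ_HIT_EOF. A minimal sketch of the new completion shape, modeled on the fs/9p/vfs_addr.c hunk; do_backend_read() is a hypothetical stand-in for the filesystem's real read call (9p uses p9_client_read()):

#include <linux/netfs.h>

/* Hypothetical transport call standing in for p9_client_read() et al. */
static int do_backend_read(void *priv, unsigned long long pos,
			   struct iov_iter *iter, int *err);

static void example_issue_read(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	unsigned long long pos = subreq->start + subreq->transferred;
	int total, err = 0;

	total = do_backend_read(rreq->netfs_priv, pos, &subreq->io_iter, &err);

	/* Direct-I/O reads must not have their tails cleared to zeroes. */
	if (rreq->origin != NETFS_DIO_READ)
		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);

	/* Tell netfs the read reached (or ran past) the end of file. */
	if (pos + total >= i_size_read(rreq->inode))
		__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);

	/* Progress lives in subreq->transferred; the second argument of
	 * the termination call is now the error, not a byte count. */
	if (!err)
		subreq->transferred += total;
	netfs_read_subreq_terminated(subreq, err, false);
}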
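The second is the ->write_begin()/->write_end() conversion from struct page ** to struct folio **, visible in the adfs and affs hunks: the old "*pagep = NULL" priming disappears and helpers such as cont_write_begin() and generic_write_end() take the folio parameter directly. The shape after conversion, modeled on adfs_write_begin(); example_get_block() and example_mmu_private are hypothetical stand-ins:

#include <linux/buffer_head.h>	/* cont_write_begin() */

/* Hypothetical get_block callback for this sketch. */
static int example_get_block(struct inode *inode, sector_t block,
			     struct buffer_head *bh, int create);

/* Per-inode in a real fs, e.g. &ADFS_I(inode)->mmu_private. */
static loff_t example_mmu_private;

static int example_write_begin(struct file *file, struct address_space *mapping,
			       loff_t pos, unsigned len,
			       struct folio **foliop, void **fsdata)
{
	/* No "*foliop = NULL" priming, unlike the old page-based code;
	 * cont_write_begin() fills *foliop on success. */
	return cont_write_begin(file, mapping, pos, len, foliop, fsdata,
				example_get_block, &example_mmu_private);
}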
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index a97ceb105cd8..819c75233235 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -68,16 +68,22 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct p9_fid *fid = rreq->netfs_priv; + unsigned long long pos = subreq->start + subreq->transferred; int total, err; - total = p9_client_read(fid, subreq->start + subreq->transferred, - &subreq->io_iter, &err); + total = p9_client_read(fid, pos, &subreq->io_iter, &err); /* if we just extended the file size, any portion not in * cache won't be on server and is zeroes */ - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (subreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (pos + total >= i_size_read(rreq->inode)) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - netfs_subreq_terminated(subreq, err ?: total, false); + if (!err) + subreq->transferred += total; + + netfs_read_subreq_terminated(subreq, err, false); } /** diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index a183e213a4a5..21527189e430 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -55,12 +55,11 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) static int adfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, adfs_get_block, &ADFS_I(mapping->host)->mmu_private); if (unlikely(ret)) diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 2e612834329a..e8c2c4535cb3 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -14,8 +14,6 @@ /* Ugly macros make the code more pretty. */ -#define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st)))) -#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey]) #define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)]) #define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data) diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h index 1b973a669d23..da3217ab6adb 100644 --- a/fs/affs/amigaffs.h +++ b/fs/affs/amigaffs.h @@ -49,12 +49,13 @@ struct affs_short_date { struct affs_root_head { __be32 ptype; + /* The following fields are not used, but kept as documentation. 
*/ __be32 spare1; __be32 spare2; __be32 hash_size; __be32 spare3; __be32 checksum; - __be32 hashtable[1]; + __be32 hashtable[]; }; struct affs_root_tail { diff --git a/fs/affs/dir.c b/fs/affs/dir.c index b2bf7016e1b3..bd40d5f08810 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -17,13 +17,44 @@ #include <linux/iversion.h> #include "affs.h" +struct affs_dir_data { + unsigned long ino; + u64 cookie; +}; + static int affs_readdir(struct file *, struct dir_context *); +static loff_t affs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct affs_dir_data *data = file->private_data; + + return generic_llseek_cookie(file, offset, whence, &data->cookie); +} + +static int affs_dir_open(struct inode *inode, struct file *file) +{ + struct affs_dir_data *data; + + data = kzalloc(sizeof(struct affs_dir_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + file->private_data = data; + return 0; +} + +static int affs_dir_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + const struct file_operations affs_dir_operations = { + .open = affs_dir_open, .read = generic_read_dir, - .llseek = generic_file_llseek, + .llseek = affs_dir_llseek, .iterate_shared = affs_readdir, .fsync = affs_file_fsync, + .release = affs_dir_release, }; /* @@ -45,6 +76,7 @@ static int affs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); + struct affs_dir_data *data = file->private_data; struct super_block *sb = inode->i_sb; struct buffer_head *dir_bh = NULL; struct buffer_head *fh_bh = NULL; @@ -59,7 +91,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos); if (ctx->pos < 2) { - file->private_data = (void *)0; + data->ino = 0; if (!dir_emit_dots(file, ctx)) return 0; } @@ -80,8 +112,8 @@ affs_readdir(struct file *file, struct dir_context *ctx) /* If the directory hasn't changed since the last call to readdir(), * we can jump directly to where we left off. 
*/ - ino = (u32)(long)file->private_data; - if (ino && inode_eq_iversion(inode, file->f_version)) { + ino = data->ino; + if (ino && inode_eq_iversion(inode, data->cookie)) { pr_debug("readdir() left off=%d\n", ino); goto inside; } @@ -131,8 +163,8 @@ inside: } while (ino); } done: - file->f_version = inode_query_iversion(inode); - file->private_data = (void *)(long)ino; + data->cookie = inode_query_iversion(inode); + data->ino = ino; affs_brelse(fh_bh); out_brelse_dir: diff --git a/fs/affs/file.c b/fs/affs/file.c index 04c018e19602..a5a861dd5223 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -417,12 +417,11 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int affs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, affs_get_block, &AFFS_I(mapping->host)->mmu_private); if (unlikely(ret)) @@ -433,12 +432,12 @@ static int affs_write_begin(struct file *file, struct address_space *mapping, static int affs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int ret; - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); /* Clear Archived bit on file writes, as AmigaOS would do */ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) { @@ -648,7 +647,7 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio) static int affs_write_begin_ofs(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct folio *folio; @@ -671,7 +670,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; if (folio_test_uptodate(folio)) return 0; @@ -687,9 +686,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping static int affs_write_end_ofs(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh, *prev_bh; @@ -882,14 +880,14 @@ affs_truncate(struct inode *inode) if (inode->i_size > AFFS_I(inode)->mmu_private) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; void *fsdata = NULL; loff_t isize = inode->i_size; int res; - res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &page, &fsdata); + res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &folio, &fsdata); if (!res) - res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata); + res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, folio, fsdata); else inode->i_size = AFFS_I(inode)->mmu_private; mark_inode_dirty(inode); diff --git a/fs/afs/file.c b/fs/afs/file.c index c3f0c45ae9a9..492d857a3fa0 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -16,6 +16,7 @@ #include 
<linux/mm.h> #include <linux/swap.h> #include <linux/netfs.h> +#include <trace/events/netfs.h> #include "internal.h" static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); @@ -242,8 +243,10 @@ static void afs_fetch_data_notify(struct afs_operation *op) req->error = error; if (subreq) { - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - netfs_subreq_terminated(subreq, error ?: req->actual_len, false); + subreq->rreq->i_size = req->file_size; + if (req->pos + req->actual_len >= req->file_size) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); + netfs_read_subreq_terminated(subreq, error, false); req->subreq = NULL; } else if (req->done) { req->done(req); @@ -261,6 +264,12 @@ static void afs_fetch_data_success(struct afs_operation *op) afs_fetch_data_notify(op); } +static void afs_fetch_data_aborted(struct afs_operation *op) +{ + afs_check_for_remote_deletion(op); + afs_fetch_data_notify(op); +} + static void afs_fetch_data_put(struct afs_operation *op) { op->fetch.req->error = afs_op_error(op); @@ -271,7 +280,7 @@ static const struct afs_operation_ops afs_fetch_data_operation = { .issue_afs_rpc = afs_fs_fetch_data, .issue_yfs_rpc = yfs_fs_fetch_data, .success = afs_fetch_data_success, - .aborted = afs_check_for_remote_deletion, + .aborted = afs_fetch_data_aborted, .failed = afs_fetch_data_notify, .put = afs_fetch_data_put, }; @@ -293,7 +302,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) op = afs_alloc_operation(req->key, vnode->volume); if (IS_ERR(op)) { if (req->subreq) - netfs_subreq_terminated(req->subreq, PTR_ERR(op), false); + netfs_read_subreq_terminated(req->subreq, PTR_ERR(op), false); return PTR_ERR(op); } @@ -304,14 +313,15 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) return afs_do_sync_operation(op); } -static void afs_issue_read(struct netfs_io_subrequest *subreq) +static void afs_read_worker(struct work_struct *work) { + struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work); struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); struct afs_read *fsreq; fsreq = afs_alloc_read(GFP_NOFS); if (!fsreq) - return netfs_subreq_terminated(subreq, -ENOMEM, false); + return netfs_read_subreq_terminated(subreq, -ENOMEM, false); fsreq->subreq = subreq; fsreq->pos = subreq->start + subreq->transferred; @@ -320,10 +330,17 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->vnode = vnode; fsreq->iter = &subreq->io_iter; + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); afs_fetch_data(fsreq->vnode, fsreq); afs_put_read(fsreq); } +static void afs_issue_read(struct netfs_io_subrequest *subreq) +{ + INIT_WORK(&subreq->work, afs_read_worker); + queue_work(system_long_wq, &subreq->work); +} + static int afs_symlink_read_folio(struct file *file, struct folio *folio) { struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 79cd30775b7a..098fa034a1cc 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -304,6 +304,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) struct afs_vnode_param *vp = &op->file[0]; struct afs_read *req = op->fetch.req; const __be32 *bp; + size_t count_before; int ret; _enter("{%u,%zu,%zu/%llu}", @@ -345,10 +346,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) /* extract the returned data */ case 2: - _debug("extract data %zu/%llu", - iov_iter_count(call->iter), req->actual_len); + count_before = call->iov_len; + _debug("extract data %zu/%llu", count_before, 
req->actual_len); ret = afs_extract_data(call, true); + if (req->subreq) { + req->subreq->transferred += count_before - call->iov_len; + netfs_read_subreq_progress(req->subreq, false); + } if (ret < 0) return ret; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 3acf5e050072..a95e77670b49 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -695,13 +695,18 @@ static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_vnode *vnode = vp->vnode; + struct inode *inode = &vnode->netfs.inode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; - loff_t i_size = op->setattr.old_i_size; + loff_t old = op->setattr.old_i_size; + + /* Note: inode->i_size was updated by afs_apply_status() inside + * the I/O and callback locks. + */ - if (size != i_size) { - truncate_setsize(&vnode->netfs.inode, size); + if (size != old) { + truncate_pagecache(inode, size); netfs_resize_file(&vnode->netfs, size, true); fscache_resize_cookie(afs_vnode_cache(vnode), size); } diff --git a/fs/afs/write.c b/fs/afs/write.c index e959640694c2..34107b55f834 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -89,10 +89,12 @@ static const struct afs_operation_ops afs_store_data_operation = { */ void afs_prepare_write(struct netfs_io_subrequest *subreq) { + struct netfs_io_stream *stream = &subreq->rreq->io_streams[subreq->stream_nr]; + //if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) // subreq->max_len = 512 * 1024; //else - subreq->max_len = 256 * 1024 * 1024; + stream->sreq_max_len = 256 * 1024 * 1024; } /* diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index f521e66d3bf6..024227aba4cd 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -355,6 +355,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) struct afs_vnode_param *vp = &op->file[0]; struct afs_read *req = op->fetch.req; const __be32 *bp; + size_t count_before; int ret; _enter("{%u,%zu, %zu/%llu}", @@ -391,10 +392,14 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) /* extract the returned data */ case 2: - _debug("extract data %zu/%llu", - iov_iter_count(call->iter), req->actual_len); + count_before = call->iov_len; + _debug("extract data %zu/%llu", count_before, req->actual_len); ret = afs_extract_data(call, true); + if (req->subreq) { + req->subreq->transferred += count_before - call->iov_len; + netfs_read_subreq_progress(req->subreq, false); + } if (ret < 0) return ret; @@ -100,7 +100,7 @@ struct kioctx { unsigned long user_id; - struct __percpu kioctx_cpu *cpu; + struct kioctx_cpu __percpu *cpu; /* * For percpu reqs_available, number of slots we move to/from global diff --git a/fs/attr.c b/fs/attr.c index 825007d5cda4..c04d19b58f12 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -487,9 +487,17 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, error = security_inode_setattr(idmap, dentry, attr); if (error) return error; - error = try_break_deleg(inode, delegated_inode); - if (error) - return error; + + /* + * If ATTR_DELEG is set, then these attributes are being set on + * behalf of the holder of a write delegation. We want to avoid + * breaking the delegation in this case. 
+ */ + if (!(ia_valid & ATTR_DELEG)) { + error = try_break_deleg(inode, delegated_inode); + if (error) + return error; + } if (inode->i_op->setattr) error = inode->i_op->setattr(idmap, dentry, attr); diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 8c1d587b3eef..77c7991d89aa 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -62,6 +62,7 @@ struct autofs_info { struct list_head expiring; struct autofs_sb_info *sbi; + unsigned long exp_timeout; unsigned long last_used; int count; @@ -81,6 +82,9 @@ struct autofs_info { */ #define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ +#define AUTOFS_INF_EXPIRE_SET (1<<3) /* per-dentry expire timeout set for + this mount point. + */ struct autofs_wait_queue { wait_queue_head_t queue; struct autofs_wait_queue *next; diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 5bf781ea6d67..f011e026358e 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -128,7 +128,13 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) goto out; } + /* Setting the per-dentry expire timeout requires a trailing + * path component, i.e. no '/', so invert the logic of the + * check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD. + */ err = check_name(param->path); + if (cmd == AUTOFS_DEV_IOCTL_TIMEOUT_CMD) + err = err ? 0 : -EINVAL; if (err) { pr_warn("invalid path supplied for cmd(0x%08x)\n", cmd); @@ -396,16 +402,97 @@ static int autofs_dev_ioctl_catatonic(struct file *fp, return 0; } -/* Set the autofs mount timeout */ +/* + * Set the autofs mount expire timeout. + * + * There are two places an expire timeout can be set: in the autofs + * super block info (this is all that's needed for direct and offset + * mounts because there's a distinct mount corresponding to each of + * these) and per-dentry within the dentry info. If a per-dentry + * timeout is set it will override the expire timeout set in the parent + * autofs super block info. + * + * If setting the autofs super block expire timeout, the autofs_dev_ioctl + * size field will be equal to the autofs_dev_ioctl structure size. If + * setting the per-dentry expire timeout, the mount point name is passed + * in the autofs_dev_ioctl path field and the size field is updated to + * reflect this. + * + * Setting the autofs mount expire timeout sets the timeout in the super + * block info struct. Setting the per-dentry timeout does a little more. + * If the timeout is equal to -1 the per-dentry timeout (and flag) is + * cleared, which reverts to using the super block timeout; otherwise, if + * the timeout is 0 the timeout is set to this value and the flag is set, + * which disables expiration for the mount point; lastly, for any other + * value the flag and the timeout are set, enabling the dentry to use + * this timeout. + */ static int autofs_dev_ioctl_timeout(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - unsigned long timeout; + unsigned long timeout = param->timeout.timeout; + + /* If setting the expire timeout for an individual indirect + * mount point dentry, the mount's trailing component path is + * placed in param->path and param->size adjusted to account + * for it; otherwise param->size is set to the structure + * size.
+ */ + if (param->size == AUTOFS_DEV_IOCTL_SIZE) { + param->timeout.timeout = sbi->exp_timeout / HZ; + sbi->exp_timeout = timeout * HZ; + } else { + struct dentry *base = fp->f_path.dentry; + struct inode *inode = base->d_inode; + int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1; + struct dentry *dentry; + struct autofs_info *ino; + + if (!autofs_type_indirect(sbi->type)) + return -EINVAL; + + /* An expire timeout greater than the superblock timeout + * could be a problem at shutdown but the super block + * timeout itself can change so all we can really do is + * warn the user. + */ + if (timeout >= sbi->exp_timeout) + pr_warn("per-mount expire timeout is greater than " + "the parent autofs mount timeout which could " + "prevent shutdown\n"); + + inode_lock_shared(inode); + dentry = try_lookup_one_len(param->path, base, path_len); + inode_unlock_shared(inode); + if (IS_ERR_OR_NULL(dentry)) + return dentry ? PTR_ERR(dentry) : -ENOENT; + ino = autofs_dentry_ino(dentry); + if (!ino) { + dput(dentry); + return -ENOENT; + } + + if (ino->exp_timeout && ino->flags & AUTOFS_INF_EXPIRE_SET) + param->timeout.timeout = ino->exp_timeout / HZ; + else + param->timeout.timeout = sbi->exp_timeout / HZ; + + if (timeout == -1) { + /* Revert to using the super block timeout */ + ino->flags &= ~AUTOFS_INF_EXPIRE_SET; + ino->exp_timeout = 0; + } else { + /* Set the dentry expire flag and timeout. + * + * If timeout is 0 it will prevent the expire + * of this particular automount. + */ + ino->flags |= AUTOFS_INF_EXPIRE_SET; + ino->exp_timeout = timeout * HZ; + } + dput(dentry); + } - timeout = param->timeout.timeout; - param->timeout.timeout = sbi->exp_timeout / HZ; - sbi->exp_timeout = timeout * HZ; return 0; } diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 39d8c84c16f4..5c2d459e1e48 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -429,8 +429,6 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb, if (!root) return NULL; - timeout = sbi->exp_timeout; - dentry = NULL; while ((dentry = get_next_positive_subdir(dentry, root))) { spin_lock(&sbi->fs_lock); @@ -441,6 +439,11 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb, } spin_unlock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRE_SET) + timeout = ino->exp_timeout; + else + timeout = sbi->exp_timeout; + expired = should_expire(dentry, mnt, timeout, how); if (!expired) continue; diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index cf792d4de4f1..ee2edccaef70 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -19,6 +19,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) INIT_LIST_HEAD(&ino->expiring); ino->last_used = jiffies; ino->sbi = sbi; + ino->exp_timeout = -1; ino->count = 1; } return ino; @@ -28,6 +29,7 @@ void autofs_clean_ino(struct autofs_info *ino) { ino->uid = GLOBAL_ROOT_UID; ino->gid = GLOBAL_ROOT_GID; + ino->exp_timeout = -1; ino->last_used = jiffies; } @@ -172,8 +174,7 @@ static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi, ret = autofs_check_pipe(pipe); if (ret < 0) { errorf(fc, "Invalid/unusable pipe"); - if (param->type != fs_value_is_file) - fput(pipe); + fput(pipe); return -EBADF; } diff --git a/fs/backing-file.c b/fs/backing-file.c index afb557446c27..8860dac58c37 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -303,13 +303,16 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) return -EIO; + if (!out->f_op->splice_write) + return -EINVAL; + 
ret = file_remove_privs(ctx->user_file); if (ret) return ret; old_cred = override_creds(ctx->cred); file_start_write(out); - ret = iter_file_splice_write(pipe, out, ppos, len, flags); + ret = out->f_op->splice_write(pipe, out, ppos, len, flags); file_end_write(out); revert_creds(old_cred); diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index a7b425d3c8a0..331a17f3f113 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -272,16 +272,19 @@ bch2_acl_to_xattr(struct btree_trans *trans, return xattr; } -struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, - struct dentry *dentry, int type) +struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) { - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct posix_acl *acl = NULL; + + if (rcu) + return ERR_PTR(-ECHILD); + + struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h index 27e7eec0f278..fe730a6bf0c1 100644 --- a/fs/bcachefs/acl.h +++ b/fs/bcachefs/acl.h @@ -28,7 +28,7 @@ void bch2_acl_to_text(struct printbuf *, const void *, size_t); #ifdef CONFIG_BCACHEFS_POSIX_ACL -struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); +struct posix_acl *bch2_get_acl(struct inode *, int, bool); int bch2_set_acl_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index d9c5a92fa708..dc3a4024aab6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -196,121 +196,119 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) return DIV_ROUND_UP(bytes, sizeof(u64)); } -int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); int ret = 0; /* allow for unknown fields */ - bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err, - alloc_v1_val_size_bad, + bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), + c, alloc_v1_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); fsck_err: return ret; } -int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_alloc_unpacked u; int ret = 0; - bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err, - alloc_v2_unpack_error, + bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), + c, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } -int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_alloc_unpacked u; int ret = 0; - bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err, - alloc_v2_unpack_error, + bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), + c, alloc_v2_unpack_error, "unpack error"); fsck_err: return 
ret; } -int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + struct bch_alloc_v4 a; int ret = 0; - bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err, - alloc_v4_val_size_bad, + bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k)); + + bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k), + c, alloc_v4_val_size_bad, "bad val size (%u > %zu)", - alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k)); + alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k)); - bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err, - alloc_v4_backpointers_start_bad, + bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) && + BCH_ALLOC_V4_NR_BACKPOINTERS(&a), + c, alloc_v4_backpointers_start_bad, "invalid backpointers_start"); - bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err, - alloc_key_data_type_bad, + bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type, + c, alloc_key_data_type_bad, "invalid data type (got %u should be %u)", - a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + a.data_type, alloc_data_type(a, a.data_type)); for (unsigned i = 0; i < 2; i++) - bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, - c, err, - alloc_key_io_time_bad, + bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX, + c, alloc_key_io_time_bad, "invalid io_time[%s]: %llu, max %llu", i == READ ? "read" : "write", - a.v->io_time[i], LRU_TIME_MAX); + a.io_time[i], LRU_TIME_MAX); - unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) > + unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) > offsetof(struct bch_alloc_v4, stripe_sectors) - ? a.v->stripe_sectors + ? 
a.stripe_sectors : 0; - switch (a.v->data_type) { + switch (a.data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: bkey_fsck_err_on(stripe_sectors || - a.v->dirty_sectors || - a.v->cached_sectors || - a.v->stripe, - c, err, alloc_key_empty_but_have_data, + a.dirty_sectors || + a.cached_sectors || + a.stripe, + c, alloc_key_empty_but_have_data, "empty data type free but have data %u.%u.%u %u", stripe_sectors, - a.v->dirty_sectors, - a.v->cached_sectors, - a.v->stripe); + a.dirty_sectors, + a.cached_sectors, + a.stripe); break; case BCH_DATA_sb: case BCH_DATA_journal: case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: - bkey_fsck_err_on(!a.v->dirty_sectors && + bkey_fsck_err_on(!a.dirty_sectors && !stripe_sectors, - c, err, alloc_key_dirty_sectors_0, + c, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", - bch2_data_type_str(a.v->data_type)); + bch2_data_type_str(a.data_type)); break; case BCH_DATA_cached: - bkey_fsck_err_on(!a.v->cached_sectors || - a.v->dirty_sectors || + bkey_fsck_err_on(!a.cached_sectors || + a.dirty_sectors || stripe_sectors || - a.v->stripe, - c, err, alloc_key_cached_inconsistency, + a.stripe, + c, alloc_key_cached_inconsistency, "data type inconsistency"); - bkey_fsck_err_on(!a.v->io_time[READ] && + bkey_fsck_err_on(!a.io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, - c, err, alloc_key_cached_but_read_time_zero, + c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; case BCH_DATA_stripe: @@ -513,14 +511,13 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) : 0; } -int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err, - bucket_gens_val_size_bad, + bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), + c, bucket_gens_val_size_bad, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); fsck_err: @@ -561,7 +558,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) struct bpos pos = alloc_gens_pos(iter.pos, &offset); int ret2 = 0; - if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { + if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) { ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret2) @@ -829,7 +826,19 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; + + struct bch_alloc_v4 *new_a; + if (likely(new.k->type == KEY_TYPE_alloc_v4)) { + new_a = bkey_s_to_alloc_v4(new).v; + } else { + BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair))); + + struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); + ret = PTR_ERR_OR_ZERO(new_ka); + if (unlikely(ret)) + goto err; + new_a = &new_ka->v; + } if (flags & BTREE_TRIGGER_transactional) { alloc_data_type_set(new_a, new_a->data_type); @@ -1865,26 +1874,26 @@ static void bch2_do_discards_work(struct work_struct *work) trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - bch2_write_ref_put(c, BCH_WRITE_REF_discard); percpu_ref_put(&ca->io_ref); + 
bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_dev_do_discards(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) - goto put_ioref; + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_write_ref; if (queue_work(c->write_ref_wq, &ca->discard_work)) return; - bch2_write_ref_put(c, BCH_WRITE_REF_discard); -put_ioref: percpu_ref_put(&ca->io_ref); +put_write_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) @@ -1959,8 +1968,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) @@ -1970,18 +1979,18 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) if (discard_in_flight_add(ca, bucket, false)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) - goto put_ioref; + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_ref; if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) return; - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); -put_ioref: percpu_ref_put(&ca->io_ref); +put_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } static int invalidate_one_bucket(struct btree_trans *trans, @@ -2123,26 +2132,26 @@ static void bch2_do_invalidates_work(struct work_struct *work) bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_dev_do_invalidates(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) - goto put_ioref; + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_ref; if (queue_work(c->write_ref_wq, &ca->invalidate_work)) return; - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); -put_ioref: percpu_ref_put(&ca->io_ref); +put_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 8d2b62c9588e..fd790b03fbe1 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -82,6 +82,14 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, bucket_data_type(bucket) != bucket_data_type(ptr); } +/* + * It is my general preference to use unsigned types for unsigned quantities - + * however, these helpers are used in disk accounting calculations run by + * triggers where the output will be negated and added to an s64. unsigned is + * right out even though all these quantities will fit in 32 bits, since it + * won't be sign extended correctly; u64 will negate "correctly", but s64 is the + * simpler option here. 
+ */ static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a) { return a.stripe_sectors + a.dirty_sectors + a.cached_sectors; @@ -142,7 +150,9 @@ static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_typ static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { - return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; + return a.data_type == BCH_DATA_cached + ? a.io_time[READ] & LRU_TIME_MAX + : 0; } #define DATA_TYPES_MOVABLE \ @@ -166,8 +176,8 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, * avoid overflowing LRU_TIME_BITS on a corrupted fs, when * bucket_sectors_dirty is (much) bigger than bucket_size */ - u64 d = min(bch2_bucket_sectors_dirty(a), - ca->mi.bucket_size); + u64 d = min_t(s64, bch2_bucket_sectors_dirty(a), + ca->mi.bucket_size); return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } @@ -232,52 +242,48 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v1_invalid, \ + .key_validate = bch2_alloc_v1_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v2_invalid, \ + .key_validate = bch2_alloc_v2_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v3_invalid, \ + .key_validate = bch2_alloc_v3_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v4_invalid, \ + .key_validate = bch2_alloc_v4_validate, \ .val_to_text = bch2_alloc_to_text, \ .swab = bch2_alloc_v4_swab, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ }) -int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ - .key_invalid = bch2_bucket_gens_invalid, \ + .key_validate = bch2_bucket_gens_validate, \ .val_to_text = bch2_bucket_gens_to_text, \ }) diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h index 47d9d006502c..f754a2951d8a 100644 --- 
a/fs/bcachefs/alloc_background_format.h +++ b/fs/bcachefs/alloc_background_format.h @@ -69,6 +69,7 @@ struct bch_alloc_v4 { __u64 io_time[2]; __u32 stripe; __u32 nr_external_backpointers; + /* end of fields in original version of alloc_v4 */ __u64 fragmentation_lru; __u32 stripe_sectors; __u32 pad; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 618d2ff0292e..8563c2d26847 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1603,7 +1603,8 @@ void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct ope prt_newline(out); } -void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca) { struct open_bucket *ob; @@ -1613,7 +1614,8 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list) + if (ob->valid && !ob->on_partial_list && + (!ca || ob->dev == ca->dev_idx)) bch2_open_bucket_to_text(out, c, ob); spin_unlock(&ob->lock); } @@ -1738,7 +1740,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); - bch2_dev_usage_to_text(out, &stats); + bch2_dev_usage_to_text(out, ca, &stats); prt_newline(out); @@ -1756,11 +1758,12 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); } -void bch2_print_allocator_stuck(struct bch_fs *c) +static noinline void bch2_print_allocator_stuck(struct bch_fs *c) { struct printbuf buf = PRINTBUF; - prt_printf(&buf, "Allocator stuck? Waited for 10 seconds\n"); + prt_printf(&buf, "Allocator stuck? 
Waited for %u seconds\n", + c->opts.allocator_stuck_timeout); prt_printf(&buf, "Allocator debug:\n"); printbuf_indent_add(&buf, 2); @@ -1790,3 +1793,24 @@ void bch2_print_allocator_stuck(struct bch_fs *c) bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } + +static inline unsigned allocator_wait_timeout(struct bch_fs *c) +{ + if (c->allocator_last_stuck && + time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies)) + return 0; + + return c->opts.allocator_stuck_timeout * HZ; +} + +void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) +{ + unsigned t = allocator_wait_timeout(c); + + if (t && closure_sync_timeout(cl, t)) { + c->allocator_last_stuck = jiffies; + bch2_print_allocator_stuck(c); + } + + closure_sync(cl); +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 6da9e7e29026..386d231ceca3 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -223,7 +223,7 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp void bch2_fs_allocator_foreground_init(struct bch_fs *); void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *); -void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *); void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); @@ -231,6 +231,11 @@ void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *); void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *); -void bch2_print_allocator_stuck(struct bch_fs *); +void __bch2_wait_on_allocator(struct bch_fs *, struct closure *); +static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) +{ + if (cl->closure_get_happened) + __bch2_wait_on_allocator(c, cl); +} #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 3cc02479a982..d4da6343efa9 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -47,9 +47,8 @@ static bool extent_matches_bp(struct bch_fs *c, return false; } -int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); @@ -68,8 +67,7 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || !bpos_eq(bp.k->p, bp_pos), - c, err, - backpointer_bucket_offset_wrong, + c, backpointer_bucket_offset_wrong, "backpointer bucket_offset wrong"); fsck_err: return ret; @@ -763,27 +761,22 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, btree < BTREE_ID_NR && !ret; btree++) { unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1; - struct btree_iter iter; - struct btree *b; if (!(BIT_ULL(btree) & btree_leaf_mask) && !(BIT_ULL(btree) & btree_interior_mask)) continue; - bch2_trans_begin(trans); - - __for_each_btree_node(trans, iter, btree, + ret = __for_each_btree_node(trans, iter, btree, btree == start.btree ? 
start.pos : POS_MIN, - 0, depth, BTREE_ITER_prefetch, b, ret) { + 0, depth, BTREE_ITER_prefetch, b, ({ mem_may_pin -= btree_buf_bytes(b); if (mem_may_pin <= 0) { c->btree_cache.pinned_nodes_end = *end = BBPOS(btree, b->key.k.p); - bch2_trans_iter_exit(trans, &iter); - return 0; + break; } - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } return ret; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 6021de1c5e98..7daecadb764e 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -18,14 +18,13 @@ static inline u64 swab40(u64 x) ((x & 0xff00000000ULL) >> 32)); } -int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k, - enum bch_validate_flags, struct printbuf *); +int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); #define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ - .key_invalid = bch2_backpointer_invalid, \ + .key_validate = bch2_backpointer_validate, \ .val_to_text = bch2_backpointer_k_to_text, \ .swab = bch2_backpointer_swab, \ .min_val_size = 32, \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 91361a167dcd..0c7086e00d18 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -447,6 +447,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(blocked_journal_low_on_space) \ x(blocked_journal_low_on_pin) \ x(blocked_journal_max_in_flight) \ + x(blocked_key_cache_flush) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ x(blocked_write_buffer_full) \ @@ -893,6 +894,8 @@ struct bch_fs { struct bch_fs_usage_base __percpu *usage; u64 __percpu *online_reserved; + unsigned long allocator_last_stuck; + struct io_clock io_clock[2]; /* JOURNAL SEQ BLACKLIST */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 74a60b1a4ddf..14ce726bf5a3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -675,7 +675,10 @@ struct bch_sb_field_ext { x(btree_subvolume_children, BCH_VERSION(1, 6)) \ x(mi_btree_bitmap, BCH_VERSION(1, 7)) \ x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ - x(disk_accounting_v2, BCH_VERSION(1, 9)) + x(disk_accounting_v2, BCH_VERSION(1, 9)) \ + x(disk_accounting_v3, BCH_VERSION(1, 10)) \ + x(disk_accounting_inum, BCH_VERSION(1, 11)) \ + x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -836,6 +839,8 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, struct bch_sb, flags[5], 0, 16); +LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, + struct bch_sb, flags[5], 16, 32); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 936357149cf0..e34cb2bf329c 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -10,9 +10,10 @@ #include "vstructs.h" enum bch_validate_flags { - BCH_VALIDATE_write = (1U << 0), - BCH_VALIDATE_commit = (1U << 1), - BCH_VALIDATE_journal = (1U << 2), + BCH_VALIDATE_write = BIT(0), + BCH_VALIDATE_commit = BIT(1), + BCH_VALIDATE_journal = BIT(2), + BCH_VALIDATE_silent = BIT(3), }; #if 0 diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 5f07cf853d0c..88d8958281e8 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -27,27 +27,27 @@ const char * const 
bch2_bkey_types[] = { NULL }; -static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } #define bch2_bkey_ops_deleted ((struct bkey_ops) { \ - .key_invalid = deleted_key_invalid, \ + .key_validate = deleted_key_validate, \ }) #define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ - .key_invalid = deleted_key_invalid, \ + .key_validate = deleted_key_validate, \ }) -static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k), c, err, - bkey_val_size_nonzero, + bkey_fsck_err_on(bkey_val_bytes(k.k), + c, bkey_val_size_nonzero, "incorrect value size (%zu != 0)", bkey_val_bytes(k.k)); fsck_err: @@ -55,11 +55,11 @@ fsck_err: } #define bch2_bkey_ops_error ((struct bkey_ops) { \ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ }) -static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } @@ -73,17 +73,17 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, } #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ - .key_invalid = key_type_cookie_invalid, \ + .key_validate = key_type_cookie_validate, \ .val_to_text = key_type_cookie_to_text, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ }) -static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } @@ -98,9 +98,9 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, datalen, min(datalen, 32U), d.v->data); } -#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ - .key_invalid = key_type_inline_data_invalid, \ - .val_to_text = key_type_inline_data_to_text, \ +#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ + .key_validate = key_type_inline_data_validate, \ + .val_to_text = key_type_inline_data_to_text, \ }) static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) @@ -110,7 +110,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_ } #define bch2_bkey_ops_set ((struct bkey_ops) { \ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ .key_merge = key_type_set_merge, \ }) @@ -123,9 +123,8 @@ const struct bkey_ops bch2_bkey_ops[] = { const struct bkey_ops bch2_bkey_null_ops = { }; -int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; @@ -133,15 +132,15 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k) < 
ops->min_val_size, c, err, - bkey_val_size_too_small, + bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, + c, bkey_val_size_too_small, "bad val size (%zu < %u)", bkey_val_bytes(k.k), ops->min_val_size); - if (!ops->key_invalid) + if (!ops->key_validate) return 0; - ret = ops->key_invalid(c, k, flags, err); + ret = ops->key_validate(c, k, flags); fsck_err: return ret; } @@ -161,18 +160,17 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); } -int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type, - enum bch_validate_flags flags, - struct printbuf *err) +int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + enum bch_validate_flags flags) { if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; int ret = 0; - bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err, - bkey_u64s_too_small, + bkey_fsck_err_on(k.k->u64s < BKEY_U64s, + c, bkey_u64s_too_small, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); if (type >= BKEY_TYPE_NR) @@ -180,8 +178,8 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) && - !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, - bkey_invalid_type_for_btree, + !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), + c, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", bch2_btree_node_type_str(type), k.k->type < KEY_TYPE_MAX @@ -189,17 +187,17 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, : "(unknown)"); if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { - bkey_fsck_err_on(k.k->size == 0, c, err, - bkey_extent_size_zero, + bkey_fsck_err_on(k.k->size == 0, + c, bkey_extent_size_zero, "size == 0"); - bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err, - bkey_extent_size_greater_than_offset, + bkey_fsck_err_on(k.k->size > k.k->p.offset, + c, bkey_extent_size_greater_than_offset, "size greater than offset (%u > %llu)", k.k->size, k.k->p.offset); } else { - bkey_fsck_err_on(k.k->size, c, err, - bkey_size_nonzero, + bkey_fsck_err_on(k.k->size, + c, bkey_size_nonzero, "size != 0"); } @@ -207,12 +205,12 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_id btree = type - 1; if (btree_type_has_snapshots(btree)) { - bkey_fsck_err_on(!k.k->p.snapshot, c, err, - bkey_snapshot_zero, + bkey_fsck_err_on(!k.k->p.snapshot, + c, bkey_snapshot_zero, "snapshot == 0"); } else if (!btree_type_has_snapshot_field(btree)) { - bkey_fsck_err_on(k.k->p.snapshot, c, err, - bkey_snapshot_nonzero, + bkey_fsck_err_on(k.k->p.snapshot, + c, bkey_snapshot_nonzero, "nonzero snapshot"); } else { /* @@ -221,34 +219,33 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, */ } - bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err, - bkey_at_pos_max, + bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), + c, bkey_at_pos_max, "key at POS_MAX"); } fsck_err: return ret; } -int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, +int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - enum bch_validate_flags flags, - struct printbuf *err) + enum bch_validate_flags flags) { - return __bch2_bkey_invalid(c, k, type, flags, err) ?: - bch2_bkey_val_invalid(c, k, flags, err); + return __bch2_bkey_validate(c, k, type, flags) ?: + bch2_bkey_val_validate(c, k, flags); } int bch2_bkey_in_btree_node(struct bch_fs *c, 
struct btree *b, - struct bkey_s_c k, struct printbuf *err) + struct bkey_s_c k, enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err, - bkey_before_start_of_btree_node, + bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), + c, bkey_before_start_of_btree_node, "key before start of btree node"); - bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err, - bkey_after_end_of_btree_node, + bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), + c, bkey_after_end_of_btree_node, "key past end of btree node"); fsck_err: return ret; diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index baef0722f5fb..3df3dd2723a1 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -14,15 +14,15 @@ extern const char * const bch2_bkey_types[]; extern const struct bkey_ops bch2_bkey_null_ops; /* - * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If + * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad. If * invalid, entire key will be deleted. * * When invalid, error string is returned via @err. @rw indicates whether key is * being read or written; more aggressive checks can be enabled when rw == WRITE. */ struct bkey_ops { - int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err); + int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -48,14 +48,13 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) : &bch2_bkey_null_ops; } -int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags, struct printbuf *); -int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags, struct printbuf *); -int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, - struct bkey_s_c, struct printbuf *); +int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bch_validate_flags); +int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bch_validate_flags); +int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, + enum bch_validate_flags); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index f5d85b50b6f2..e52a06d3418c 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -159,6 +159,16 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return b; } +void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) +{ + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +} + /* Btree in memory cache - hash table */ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) @@ -736,6 +746,13 @@ out: start_time); memalloc_nofs_restore(flags); + + int ret = bch2_trans_relock(trans); + if (unlikely(ret)) { + bch2_btree_node_to_freelist(c, b); + return ERR_PTR(ret); + } + return b; err: 
mutex_lock(&bc->lock); @@ -856,6 +873,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, bch2_btree_node_read(trans, b, sync); + int ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + if (!sync) return NULL; @@ -974,6 +995,10 @@ retry: bch2_btree_node_wait_on_read(b); + ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + /* * should_be_locked is not set on this path yet, so we need to * relock it specifically: diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index c0eb87a057cc..f82064007127 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -12,6 +12,8 @@ struct btree_iter; void bch2_recalc_btree_reserve(struct bch_fs *); +void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *); + void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6cbf2aa6a947..eb3002c4eae7 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -741,12 +741,9 @@ fsck_err: static int bch2_mark_superblocks(struct bch_fs *c) { - mutex_lock(&c->sb_lock); gc_pos_set(c, gc_phase(GC_PHASE_sb)); - int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); - mutex_unlock(&c->sb_lock); - return ret; + return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); } static void bch2_gc_free(struct bch_fs *c) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2c424435ca4a..56ea9a77cd4a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -836,14 +836,13 @@ fsck_err: return ret; } -static int bset_key_invalid(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - bool updated_range, int rw, - struct printbuf *err) +static int bset_key_validate(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + bool updated_range, int rw) { - return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: - (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?: - (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); + return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?: + (!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?: + (rw == WRITE ? 
bch2_bkey_val_validate(c, k, 0) : 0); } static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, @@ -858,12 +857,9 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, if (!bkeyp_u64s_valid(&b->format, k)) return false; - struct printbuf buf = PRINTBUF; struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf); - printbuf_exit(&buf); - return ret; + return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); } static int validate_bset_keys(struct bch_fs *c, struct btree *b, @@ -915,19 +911,11 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, u = __bkey_disassemble(b, k, &tmp); - printbuf_reset(&buf); - if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { - printbuf_reset(&buf); - bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - - btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bad_bkey, - "invalid bkey: %s", buf.buf); + ret = bset_key_validate(c, b, u.s_c, updated_range, write); + if (ret == -BCH_ERR_fsck_delete_bkey) goto drop_this_key; - } + if (ret) + goto fsck_err; if (write) bch2_bkey_compat(b->c.level, b->c.btree_id, version, @@ -1228,23 +1216,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - printbuf_reset(&buf); - - if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || + ret = bch2_bkey_val_validate(c, u.s_c, READ); + if (ret == -BCH_ERR_fsck_delete_bkey || (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { - printbuf_reset(&buf); - - prt_printf(&buf, "invalid bkey: "); - bch2_bkey_val_invalid(c, u.s_c, READ, &buf); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - - btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bad_bkey, - "%s", buf.buf); - btree_keys_account_key_drop(&b->nr, 0, k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); @@ -1253,6 +1228,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_bset_end(b, b->set); continue; } + if (ret) + goto fsck_err; if (u.k->type == KEY_TYPE_btree_ptr_v2) { struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); @@ -1767,6 +1744,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, set_btree_node_read_in_flight(b); + /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */ + bch2_trans_unlock(trans); bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { @@ -1952,18 +1931,14 @@ static void btree_node_write_endio(struct bio *bio) static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - struct printbuf buf = PRINTBUF; bool saw_error; - int ret; - - ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), - BKEY_TYPE_btree, WRITE, &buf); - if (ret) - bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); - printbuf_exit(&buf); - if (ret) + int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), + BKEY_TYPE_btree, WRITE); + if (ret) { + bch2_fs_inconsistent(c, "invalid btree node key before write"); return ret; + } ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 
36872207f09b..2e84d22e17bd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1900,6 +1900,7 @@ err: goto out; } +/* Only kept for -tools */ struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) { struct btree *b; @@ -1921,6 +1922,11 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bch2_trans_verify_not_in_restart(trans); bch2_btree_iter_verify(iter); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (ret) + goto err; + + struct btree_path *path = btree_iter_path(trans, iter); /* already at end? */ diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index c7725865309c..222b7ce8a901 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -569,6 +569,15 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ _btree_id, _pos, _flags, KEY_TYPE_##_type)) +#define bkey_val_copy(_dst_v, _src_k) \ +do { \ + unsigned b = min_t(unsigned, sizeof(*_dst_v), \ + bkey_val_bytes(_src_k.k)); \ + memcpy(_dst_v, _src_k.v, b); \ + if (b < sizeof(*_dst_v)) \ + memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b); \ +} while (0) + static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned flags, unsigned type, @@ -600,23 +609,35 @@ void bch2_trans_srcu_unlock(struct btree_trans *); u32 bch2_trans_begin(struct btree_trans *); -/* - * XXX - * this does not handle transaction restarts from bch2_btree_iter_next_node() - * correctly - */ -#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _locks_want, _depth, _flags, _b, _ret) \ - for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ - _start, _locks_want, _depth, _flags); \ - (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \ - !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ - (_b) = bch2_btree_iter_next_node(&(_iter))) +#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _locks_want, _depth, _flags, _b, _do) \ +({ \ + bch2_trans_begin((_trans)); \ + \ + struct btree_iter _iter; \ + bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \ + _start, _locks_want, _depth, _flags); \ + int _ret3 = 0; \ + do { \ + _ret3 = lockrestart_do((_trans), ({ \ + struct btree *_b = bch2_btree_iter_peek_node(&_iter); \ + if (!_b) \ + break; \ + \ + PTR_ERR_OR_ZERO(_b) ?: (_do); \ + })) ?: \ + lockrestart_do((_trans), \ + PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \ + } while (!_ret3); \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ +}) #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _flags, _b, _ret) \ - __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - 0, 0, _flags, _b, _ret) + _flags, _b, _do) \ + __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + 0, 0, _flags, _b, _do) static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 74933490aaba..c1657182c275 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -530,6 +530,8 @@ static void __journal_keys_sort(struct journal_keys *keys) { sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); + cond_resched(); + struct journal_key *dst = keys->data; darray_for_each(*keys, src) { diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index f2f2e525460b..fda7998734cb 100644 
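/*
 * Two notes on the btree_iter.h hunks above, before the next file:
 *
 * bkey_val_copy() reads a (possibly short, older-format) value into a
 * fixed-size struct, zero-filling whatever the key's value doesn't
 * cover. Hypothetical usage, assuming a typed key wrapper:
 *
 *	struct bch_subvolume subvol;
 *	bkey_val_copy(&subvol, s);	// s: a struct bkey_s_c_subvolume
 *
 * The __for_each_btree_node() rewrite turns the macro into a statement
 * expression whose body runs under lockrestart_do(), so the transaction
 * restarts the deleted XXX comment warned about are retried inside the
 * macro; the caller supplies a body expression and collects one error
 * code. Roughly (hypothetical body; the fs/bcachefs/debug.c hunk
 * further down is a real conversion):
 *
 *	int ret = bch2_trans_run(c,
 *		for_each_btree_node(trans, iter, BTREE_ID_extents, POS_MIN,
 *				    0, b, ({
 *			pr_info("node %llu:%llu\n",
 *				b->key.k.p.inode, b->key.k.p.offset);
 *			0;	// body evaluates to an error code
 *		})));
 */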
--- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -497,11 +497,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path path->l[1].b = NULL; - if (bch2_btree_node_relock_notrace(trans, path, 0)) { - path->uptodate = BTREE_ITER_UPTODATE; - return 0; - } - int ret; do { ret = btree_path_traverse_cached_fast(trans, path); @@ -731,6 +726,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + path->should_be_locked = false; } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, @@ -782,6 +778,20 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, rcu_read_lock(); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + + /* + * Scanning is expensive while a rehash is in progress - most elements + * will be on the new hashtable, if it's in progress + * + * A rehash could still start while we're scanning - that's ok, we'll + * still see most elements. + */ + if (unlikely(tbl->nest)) { + rcu_read_unlock(); + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + return SHRINK_STOP; + } + if (bc->shrink_iter >= tbl->size) bc->shrink_iter = 0; start = bc->shrink_iter; @@ -789,7 +799,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, do { struct rhash_head *pos, *next; - pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); + pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]); while (!rht_is_a_nulls(pos)) { next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); @@ -870,12 +880,22 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) while (atomic_long_read(&bc->nr_keys)) { rcu_read_lock(); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (tbl) + if (tbl) { + if (tbl->nest) { + /* wait for in progress rehash */ + rcu_read_unlock(); + mutex_lock(&bc->table.mutex); + mutex_unlock(&bc->table.mutex); + rcu_read_lock(); + continue; + } for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { + ck = container_of(pos, struct bkey_cached, hash); bkey_cached_evict(bc, ck); list_add(&ck->list, &items); } + } rcu_read_unlock(); } diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index e6b2cd0dd2c1..51d6289b8dee 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -11,13 +11,27 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) return max_t(ssize_t, 0, nr_dirty - max_dirty); } -static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c) { size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; - return nr_dirty > max_dirty; + return nr_dirty - max_dirty; +} + +static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +{ + return __bch2_btree_key_cache_must_wait(c) > 0; +} + +static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 2048 + (nr_keys * 5) / 8; + + return nr_dirty <= max_dirty; } int bch2_btree_key_cache_journal_flush(struct journal *, diff --git a/fs/bcachefs/btree_node_scan.c 
b/fs/bcachefs/btree_node_scan.c index 001107226377..b28c649c6838 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -530,7 +530,7 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); printbuf_exit(&buf); - BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL)); + BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0)); ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); if (ret) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index cca336fe46e9..a0101d9c5d83 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -712,7 +712,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, a->k.version = journal_pos_to_bversion(&trans->journal_res, (u64 *) entry - (u64 *) trans->journal_entries); BUG_ON(bversion_zero(a->k.version)); - ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false); + ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false, false); if (ret) goto revert_fs_usage; } @@ -798,7 +798,7 @@ revert_fs_usage: struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, false); + bch2_accounting_mem_mod_locked(trans, a.c, false, false); bch2_accounting_neg(a); } percpu_up_read(&c->mark_lock); @@ -818,50 +818,6 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); } -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, - enum bch_validate_flags flags, - struct btree_insert_entry *i, - struct printbuf *err) -{ - struct bch_fs *c = trans->c; - - printbuf_reset(err); - prt_printf(err, "invalid bkey on insert from %s -> %ps\n", - trans->fn, (void *) i->ip_allocated); - printbuf_indent_add(err, 2); - - bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); - prt_newline(err); - - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err); - bch2_print_string_as_lines(KERN_ERR, err->buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - - return -EINVAL; -} - -static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans, - struct jset_entry *i) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "invalid bkey on insert from %s\n", trans->fn); - printbuf_indent_add(&buf, 2); - - bch2_journal_entry_to_text(&buf, c, i); - prt_newline(&buf); - - bch2_print_string_as_lines(KERN_ERR, buf.buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - - return -EINVAL; -} - static int bch2_trans_commit_journal_pin_flush(struct journal *j, struct journal_entry_pin *_pin, u64 seq) { @@ -927,7 +883,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags static int journal_reclaim_wait_done(struct bch_fs *c) { int ret = bch2_journal_error(&c->journal) ?: - !bch2_btree_key_cache_must_wait(c); + bch2_btree_key_cache_wait_done(c); if (!ret) journal_reclaim_kick(&c->journal); @@ -973,9 +929,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, bch2_trans_unlock(trans); trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); + track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true); wait_event_freezable(c->journal.reclaim_wait, (ret = 
journal_reclaim_wait_done(c))); + + track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false); + if (ret < 0) break; @@ -1060,20 +1020,19 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; trans_for_each_update(trans, i) { - struct printbuf buf = PRINTBUF; enum bch_validate_flags invalid_flags = 0; if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags, &buf))) - ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf); - btree_insert_entry_checks(trans, i); - printbuf_exit(&buf); - - if (ret) + ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags); + if (unlikely(ret)){ + bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", + trans->fn, (void *) i->ip_allocated); return ret; + } + btree_insert_entry_checks(trans, i); } for (struct jset_entry *i = trans->journal_entries; @@ -1084,13 +1043,14 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - if (unlikely(bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, invalid_flags))) - ret = bch2_trans_commit_journal_entry_invalid(trans, i); - - if (ret) + ret = bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, invalid_flags); + if (unlikely(ret)) { + bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", + trans->fn); return ret; + } } if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 31ee50184be2..8fd112026e7a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -317,6 +317,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, : 0; int ret; + b = bch2_btree_node_mem_alloc(trans, interior_node); + if (IS_ERR(b)) + return b; + + BUG_ON(b->ob.nr); + mutex_lock(&c->btree_reserve_cache_lock); if (c->btree_reserve_cache_nr > nr_reserve) { struct btree_alloc *a = @@ -325,10 +331,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, obs = a->ob; bkey_copy(&tmp.k, &a->k); mutex_unlock(&c->btree_reserve_cache_lock); - goto mem_alloc; + goto out; } mutex_unlock(&c->btree_reserve_cache_lock); - retry: ret = bch2_alloc_sectors_start_trans(trans, c->opts.metadata_target ?: @@ -341,7 +346,7 @@ retry: c->opts.metadata_replicas_required), watermark, 0, cl, &wp); if (unlikely(ret)) - return ERR_PTR(ret); + goto err; if (wp->sectors_free < btree_sectors(c)) { struct open_bucket *ob; @@ -360,19 +365,16 @@ retry: bch2_open_bucket_get(c, wp, &obs); bch2_alloc_sectors_done(c, wp); -mem_alloc: - b = bch2_btree_node_mem_alloc(trans, interior_node); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - /* we hold cannibalize_lock: */ - BUG_ON(IS_ERR(b)); - BUG_ON(b->ob.nr); - +out: bkey_copy(&b->key, &tmp.k); b->ob = obs; + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); return b; +err: + bch2_btree_node_to_freelist(c, b); + return ERR_PTR(ret); } static struct btree *bch2_btree_node_alloc(struct btree_update *as, @@ -1264,7 +1266,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); 
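/*
 * Aside on journal_reclaim_wait_done() and the new
 * bch2_btree_key_cache_wait_done() a few hunks up: blocking and waking
 * now use different watermarks, giving the wait hysteresis. With
 * nr_keys = 100000 cached keys:
 *
 *	block: nr_dirty >  4096 + 3*nr_keys/4 = 79096	(must_wait)
 *	wake:  nr_dirty <= 2048 + 5*nr_keys/8 = 64548	(wait_done)
 *
 * so reclaim must flush roughly 14.5k dirty keys before blocked
 * committers are released, rather than waking them at the same
 * threshold that blocked them, only for them to block again
 * immediately.
 */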
bch2_trans_unlock(trans); - closure_sync(&cl); + bch2_wait_on_allocator(c, &cl); } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); } @@ -1364,18 +1366,10 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), - btree_node_type(b), WRITE, &buf) ?: - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) { - printbuf_reset(&buf); - prt_printf(&buf, "inserting invalid bkey\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_printf(&buf, "\n "); - bch2_bkey_invalid(c, bkey_i_to_s_c(insert), - btree_node_type(b), WRITE, &buf); - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf); - - bch2_fs_inconsistent(c, "%s", buf.buf); + if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), + btree_node_type(b), BCH_VALIDATE_write) ?: + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) { + bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__); dump_stack(); } @@ -2447,6 +2441,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite } new_hash = bch2_btree_node_mem_alloc(trans, false); + ret = PTR_ERR_OR_ZERO(new_hash); + if (ret) + goto err; } path->intent_ref++; @@ -2454,14 +2451,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite commit_flags, skip_triggers); --path->intent_ref; - if (new_hash) { - mutex_lock(&c->btree_cache.lock); - list_move(&new_hash->list, &c->btree_cache.freeable); - mutex_unlock(&c->btree_cache.lock); - - six_unlock_write(&new_hash->c.lock); - six_unlock_intent(&new_hash->c.lock); - } + if (new_hash) + bch2_btree_node_to_freelist(c, new_hash); +err: closure_sync(&cl); bch2_btree_cache_cannibalize_unlock(trans); return ret; @@ -2530,6 +2522,10 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id b = bch2_btree_node_mem_alloc(trans, false); bch2_btree_cache_cannibalize_unlock(trans); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret; + set_btree_node_fake(b); set_btree_node_need_rewrite(b); b->c.level = level; @@ -2561,7 +2557,7 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level)); + bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level))); } static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2650a0d24663..721bbe1dffc1 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -71,17 +71,21 @@ bch2_fs_usage_read_short(struct bch_fs *c) return ret; } -void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) +void bch2_dev_usage_to_text(struct printbuf *out, + struct bch_dev *ca, + struct bch_dev_usage *usage) { prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); for (unsigned i = 0; i < BCH_DATA_NR; i++) { bch2_prt_data_type(out, i); prt_printf(out, "\t%llu\r%llu\r%llu\r\n", - usage->d[i].buckets, - usage->d[i].sectors, - usage->d[i].fragmented); + usage->d[i].buckets, + usage->d[i].sectors, + usage->d[i].fragmented); } + + prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets); } static int bch2_check_fix_ptr(struct btree_trans *trans, @@ -96,12 +100,13 @@ static int 
bch2_check_fix_ptr(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (!ca) { - if (fsck_err(trans, ptr_to_invalid_device, - "pointer to missing device %u\n" - "while marking %s", - p.ptr.dev, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID, + trans, ptr_to_invalid_device, + "pointer to missing device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) *do_update = true; return 0; } @@ -558,7 +563,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { - if (insert) + if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) ret = -EIO; goto err; } @@ -695,7 +700,8 @@ err: static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) + enum btree_iter_update_trigger_flags flags, + s64 *replicas_sectors) { bool gc = flags & BTREE_TRIGGER_gc; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -704,7 +710,6 @@ static int __trigger_extent(struct btree_trans *trans, enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? BCH_DATA_btree : BCH_DATA_user; - s64 replicas_sectors = 0; int ret = 0; struct disk_accounting_pos acc_replicas_key = { @@ -735,7 +740,7 @@ static int __trigger_extent(struct btree_trans *trans, if (ret) return ret; } else if (!p.has_ec) { - replicas_sectors += disk_sectors; + *replicas_sectors += disk_sectors; acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev; } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); @@ -773,7 +778,7 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs) { - ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc); if (ret) return ret; } @@ -783,7 +788,7 @@ static int __trigger_extent(struct btree_trans *trans, .type = BCH_DISK_ACCOUNTING_snapshot, .snapshot.id = k.k->p.snapshot, }; - ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, &replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); if (ret) return ret; } @@ -803,16 +808,21 @@ static int __trigger_extent(struct btree_trans *trans, .type = BCH_DISK_ACCOUNTING_btree, .btree.id = btree_id, }; - ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); if (ret) return ret; - } - - if (bch2_bkey_rebalance_opts(k)) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_rebalance_work, + } else { + bool insert = !(flags & BTREE_TRIGGER_overwrite); + struct disk_accounting_pos acc_inum_key = { + .type = BCH_DISK_ACCOUNTING_inum, + .inum.inum = k.k->p.inode, }; - ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc); + s64 v[3] = { + insert ? 1 : -1, + insert ? 
k.k->size : -((s64) k.k->size), + *replicas_sectors, + }; + ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); if (ret) return ret; } @@ -825,6 +835,7 @@ int bch2_trigger_extent(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; @@ -840,21 +851,53 @@ int bch2_trigger_extent(struct btree_trans *trans, new_ptrs_bytes)) return 0; - if (flags & BTREE_TRIGGER_transactional) { - struct bch_fs *c = trans->c; - int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) - - (int) bch2_bkey_needs_rebalance(c, old); + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { + s64 old_replicas_sectors = 0, new_replicas_sectors = 0; - if (mod) { + if (old.k->type) { + int ret = __trigger_extent(trans, btree, level, old, + flags & ~BTREE_TRIGGER_insert, + &old_replicas_sectors); + if (ret) + return ret; + } + + if (new.k->type) { + int ret = __trigger_extent(trans, btree, level, new.s_c, + flags & ~BTREE_TRIGGER_overwrite, + &new_replicas_sectors); + if (ret) + return ret; + } + + int need_rebalance_delta = 0; + s64 need_rebalance_sectors_delta = 0; + + s64 s = bch2_bkey_sectors_need_rebalance(c, old); + need_rebalance_delta -= s != 0; + need_rebalance_sectors_delta -= s; + + s = bch2_bkey_sectors_need_rebalance(c, new.s_c); + need_rebalance_delta += s != 0; + need_rebalance_sectors_delta += s; + + if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, mod > 0); + new.k->p, need_rebalance_delta > 0); if (ret) return ret; } - } - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) - return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags); + if (need_rebalance_sectors_delta) { + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_rebalance_work, + }; + int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, + flags & BTREE_TRIGGER_gc); + if (ret) + return ret; + } + } return 0; } @@ -897,7 +940,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, enum bch_data_type type, unsigned sectors) { - struct bch_fs *c = trans->c; struct btree_iter iter; int ret = 0; @@ -907,7 +949,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, bucket_metadata_type_mismatch, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", @@ -1028,13 +1070,18 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca, enum btree_iter_update_trigger_flags flags) { - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + struct bch_fs *c = trans->c; + + mutex_lock(&c->sb_lock); + struct bch_sb_layout layout = ca->disk_sb.sb->layout; + mutex_unlock(&c->sb_lock); + u64 bucket = 0; unsigned i, bucket_sectors = 0; int ret; - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); + for (i = 0; i < layout.nr_superblocks; i++) { + u64 offset = 
le64_to_cpu(layout.sb_offset[i]); if (offset == BCH_SB_SECTOR) { ret = bch2_trans_mark_metadata_sectors(trans, ca, @@ -1045,7 +1092,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *c } ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, - offset + (1 << layout->sb_max_size_bits), + offset + (1 << layout.sb_max_size_bits), BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 2d35eeb24a2d..edbdffd508fc 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -212,7 +212,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) return ret; } -void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *); +void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *); static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) { diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index ec1b636ef78d..f9fb150eda70 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -93,7 +93,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, .dev_bucket = (u64) dev << 56 | bucket, .journal_seq = journal_seq, }; - size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0; + size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0; int ret = 0; mutex_lock(&b->lock); @@ -106,8 +106,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, for (i = 0; i < size; i++) nr_elements += t->d[i].journal_seq > flushed_seq; - new_bits = t->bits + (nr_elements * 3 > size); - + new_bits = ilog2(roundup_pow_of_two(nr_elements * 3)); +realloc: n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; @@ -115,7 +115,16 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, } retry_rehash: + if (nr_rehashes_this_size == 3) { + new_bits++; + nr_rehashes_this_size = 0; + kvfree(n); + goto realloc; + } + nr_rehashes++; + nr_rehashes_this_size++; + bucket_table_init(n, new_bits); tmp = new; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0087b8555ead..004894ad4147 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -20,6 +20,76 @@ #include "subvolume.h" #include "trace.h" +static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) + bch2_dev_put(bch2_dev_have_ref(c, ptr->dev)); +} + +static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + if (!bch2_dev_tryget(c, ptr->dev)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); + } + return false; + } + } + return true; +} + +static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } +} + +static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = 
bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + + if (ctxt) { + bool locked; + + move_ctxt_wait_event(ctxt, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || + list_empty(&ctxt->ios)); + + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); + } else { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + + bucket = PTR_BUCKET_POS(ca, ptr2); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } + return false; + } + } + } + return true; +} + static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) { if (trace_move_extent_finish_enabled()) { @@ -250,10 +320,8 @@ restart_drop_extra_replicas: * it's been hard to reproduce, so this should give us some more * information when it does occur: */ - struct printbuf err = PRINTBUF; - int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); - printbuf_exit(&err); - + int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), + BCH_VALIDATE_commit); if (invalid) { struct printbuf buf = PRINTBUF; @@ -269,6 +337,7 @@ restart_drop_extra_replicas: printbuf_exit(&buf); bch2_fatal_error(c); + ret = -EIO; goto out; } @@ -357,17 +426,11 @@ void bch2_data_update_read_done(struct data_update *m, void bch2_data_update_exit(struct data_update *update) { struct bch_fs *c = update->op.c; - struct bkey_ptrs_c ptrs = - bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); - - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - if (c->opts.nocow_enabled) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(ca, ptr), 0); - bch2_dev_put(ca); - } + struct bkey_s_c k = bkey_i_to_s_c(update->k.k); + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); + bkey_put_dev_refs(c, k); bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); bch2_bio_free_pages_pool(c, &update->op.wbio.bio); @@ -477,6 +540,9 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, bch2_compression_opt_to_text(out, background_compression(*io_opts)); prt_newline(out); + prt_str(out, "opts.replicas:\t"); + prt_u64(out, io_opts->data_replicas); + prt_str(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); } @@ -545,7 +611,6 @@ int bch2_data_update_init(struct btree_trans *trans, const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; - unsigned ptrs_locked = 0; int ret = 0; /* @@ -556,6 +621,15 @@ int bch2_data_update_init(struct btree_trans *trans, if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) return -BCH_ERR_data_update_done; + if (!bkey_get_dev_refs(c, k)) + return -BCH_ERR_data_update_done; + + if (c->opts.nocow_enabled && + !bkey_nocow_lock(c, ctxt, k)) { + bkey_put_dev_refs(c, k); + return -BCH_ERR_nocow_lock_blocked; + } + bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); m->btree_id = btree_id; @@ -577,40 +651,24 @@ int bch2_data_update_init(struct btree_trans *trans, m->op.compression_opt = background_compression(io_opts); m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - bkey_for_each_ptr(ptrs, ptr) { - if (!bch2_dev_tryget(c, ptr->dev)) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; - 
bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); - } - return -BCH_ERR_data_update_done; - } - } - unsigned durability_have = 0, durability_removing = 0; i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); - struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); - bool locked; - - rcu_read_lock(); - if (((1U << i) & m->data_opts.rewrite_ptrs)) { - BUG_ON(p.ptr.cached); - - if (crc_is_compressed(p.crc)) - reserve_sectors += k.k->size; - - m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); - durability_removing += bch2_extent_ptr_desired_durability(c, &p); - } else if (!p.ptr.cached && - !((1U << i) & m->data_opts.kill_ptrs)) { - bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); - durability_have += bch2_extent_ptr_durability(c, &p); + if (!p.ptr.cached) { + rcu_read_lock(); + if (BIT(i) & m->data_opts.rewrite_ptrs) { + if (crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; + + m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); + durability_removing += bch2_extent_ptr_desired_durability(c, &p); + } else if (!(BIT(i) & m->data_opts.kill_ptrs)) { + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + durability_have += bch2_extent_ptr_durability(c, &p); + } + rcu_read_unlock(); } - rcu_read_unlock(); /* * op->csum_type is normally initialized from the fs/file's @@ -625,24 +683,6 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - if (c->opts.nocow_enabled) { - if (ctxt) { - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, - bucket, 0)) || - list_empty(&ctxt->ios)); - - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); - } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { - ret = -BCH_ERR_nocow_lock_blocked; - goto err; - } - } - ptrs_locked |= (1U << i); - } - i++; } @@ -656,16 +696,6 @@ int bch2_data_update_init(struct btree_trans *trans, * Increasing replication is an explicit operation triggered by * rereplicate, currently, so that users don't get an unexpected -ENOSPC */ - if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) && - !durability_required) { - m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; - m->data_opts.rewrite_ptrs = 0; - /* if iter == NULL, it's just a promote */ - if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); - goto done; - } - m->op.nr_replicas = min(durability_removing, durability_required) + m->data_opts.extra_replicas; @@ -677,48 +707,38 @@ int bch2_data_update_init(struct btree_trans *trans, if (!(durability_have + durability_removing)) m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); - if (!m->op.nr_replicas) { - struct printbuf buf = PRINTBUF; + m->op.nr_replicas_required = m->op.nr_replicas; - bch2_data_update_to_text(&buf, m); - WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf); - printbuf_exit(&buf); - ret = -BCH_ERR_data_update_done; - goto done; + /* + * It might turn out that we don't need any new replicas, if the + * replicas or durability settings have been changed since the extent + * was written: + */ + if (!m->op.nr_replicas) { + m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; + m->data_opts.rewrite_ptrs = 0; + /* if iter == NULL, it's just a promote */ + if (iter) + ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); + goto out; } - m->op.nr_replicas_required = m->op.nr_replicas; - if (reserve_sectors) { ret = 
bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, m->data_opts.extra_replicas ? 0 : BCH_DISK_RESERVATION_NOFAIL); if (ret) - goto err; + goto out; } if (bkey_extent_is_unwritten(k)) { bch2_update_unwritten_extent(trans, m); - goto done; + goto out; } return 0; -err: - i = 0; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); - struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); - if ((1U << i) & ptrs_locked) - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - bch2_dev_put(ca); - i++; - } - - bch2_bkey_buf_exit(&m->k, c); - bch2_bio_free_pages_pool(c, &m->op.wbio.bio); - return ret; -done: +out: bch2_data_update_exit(m); return ret ?: -BCH_ERR_data_update_done; } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index ebabab171fe5..45aec1afdb0e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -397,47 +397,27 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans *trans; - struct btree_iter iter; - struct btree *b; - ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - ret = flush_buf(i); + ssize_t ret = flush_buf(i); if (ret) return ret; if (bpos_eq(SPOS_MAX, i->from)) return i->ret; - trans = bch2_trans_get(i->c); -retry: - bch2_trans_begin(trans); - - for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) { - bch2_btree_node_to_text(&i->buf, i->c, b); - i->from = !bpos_eq(SPOS_MAX, b->key.k.p) - ? bpos_successor(b->key.k.p) - : b->key.k.p; - - ret = drop_locks_do(trans, flush_buf(i)); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - - if (!ret) - ret = flush_buf(i); + return bch2_trans_run(i->c, + for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ + bch2_btree_node_to_text(&i->buf, i->c, b); + i->from = !bpos_eq(SPOS_MAX, b->key.k.p) + ? bpos_successor(b->key.k.p) + : b->key.k.p; - return ret ?: i->ret; + drop_locks_do(trans, flush_buf(i)); + }))) ?: i->ret; } static const struct file_operations btree_format_debug_ops = { diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d743da89308e..32bfdf19289a 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -100,20 +100,19 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .is_visible = dirent_is_visible, }; -int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); int ret = 0; - bkey_fsck_err_on(!d_name.len, c, err, - dirent_empty_name, + bkey_fsck_err_on(!d_name.len, + c, dirent_empty_name, "empty name"); - bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err, - dirent_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), + c, dirent_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); @@ -121,27 +120,27 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, * Check new keys don't exceed the max length * (older keys may be larger.) 
*/ - bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, err, - dirent_name_too_long, + bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, + c, dirent_name_too_long, "dirent name too big (%u > %u)", d_name.len, BCH_NAME_MAX); - bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err, - dirent_name_embedded_nul, + bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), + c, dirent_name_embedded_nul, "dirent has stray data after name's NUL"); bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) || - (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err, - dirent_name_dot_or_dotdot, + (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), + c, dirent_name_dot_or_dotdot, "invalid name"); - bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err, - dirent_name_has_slash, + bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), + c, dirent_name_has_slash, "name with /"); bkey_fsck_err_on(d.v->d_type != DT_SUBVOL && - le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err, - dirent_to_itself, + le64_to_cpu(d.v->d_inum) == d.k->p.inode, + c, dirent_to_itself, "dirent points to own directory"); fsck_err: return ret; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 24037e6e0a09..8945145865c5 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -7,12 +7,11 @@ enum bch_validate_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ - .key_invalid = bch2_dirent_invalid, \ + .key_validate = bch2_dirent_validate, \ .val_to_text = bch2_dirent_to_text, \ .min_val_size = 16, \ }) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index dcdd59249c23..e972e2bca546 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -114,11 +114,73 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *trans, return bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); } -int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +static inline bool is_zero(char *start, char *end) { - return 0; + BUG_ON(start > end); + + for (; start < end; start++) + if (*start) + return false; + return true; +} + +#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) + +int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) +{ + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + void *end = &acc_k + 1; + int ret = 0; + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_nr_inodes: + end = field_end(acc_k, nr_inodes); + break; + case BCH_DISK_ACCOUNTING_persistent_reserved: + end = field_end(acc_k, persistent_reserved); + break; + case BCH_DISK_ACCOUNTING_replicas: + bkey_fsck_err_on(!acc_k.replicas.nr_devs, + c, accounting_key_replicas_nr_devs_0, + "accounting key replicas entry with nr_devs=0"); + + bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs || + (acc_k.replicas.nr_required > 1 && + acc_k.replicas.nr_required == acc_k.replicas.nr_devs), + c, accounting_key_replicas_nr_required_bad, + "accounting key replicas entry with bad nr_required"); + + for (unsigned i = 0; i + 
1 < acc_k.replicas.nr_devs; i++) + bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], + c, accounting_key_replicas_devs_unsorted, + "accounting key replicas entry with unsorted devs"); + + end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + end = field_end(acc_k, dev_data_type); + break; + case BCH_DISK_ACCOUNTING_compression: + end = field_end(acc_k, compression); + break; + case BCH_DISK_ACCOUNTING_snapshot: + end = field_end(acc_k, snapshot); + break; + case BCH_DISK_ACCOUNTING_btree: + end = field_end(acc_k, btree); + break; + case BCH_DISK_ACCOUNTING_rebalance_work: + end = field_end(acc_k, rebalance_work); + break; + } + + bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), + c, accounting_key_junk_at_end, + "junk at end of accounting key"); +fsck_err: + return ret; } void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) @@ -465,6 +527,9 @@ int bch2_gc_accounting_done(struct bch_fs *c) struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, e->pos); + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + continue; + u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; @@ -501,7 +566,7 @@ int bch2_gc_accounting_done(struct bch_fs *c) struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; accounting_key_init(&k_i.k, &acc_k, src_v, nr); - bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false); + bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false, false); preempt_disable(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); @@ -530,7 +595,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) return 0; percpu_down_read(&c->mark_lock); - int ret = __bch2_accounting_mem_mod(c, bkey_s_c_to_accounting(k), false); + int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), false, true); percpu_up_read(&c->mark_lock); if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) && @@ -697,6 +762,15 @@ void bch2_verify_accounting_clean(struct bch_fs *c) struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); unsigned nr = bch2_accounting_counters(k.k); + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + continue; + + if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + continue; + bch2_accounting_mem_read(c, k.k->p, v, nr); if (memcmp(a.v->d, v, nr * sizeof(u64))) { @@ -712,9 +786,6 @@ void bch2_verify_accounting_clean(struct bch_fs *c) mismatch = true; } - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, a.k->p); - switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 3d3f25e08b69..f29fd0dd9581 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -82,14 +82,13 @@ int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); -int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); void 
bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_accounting_swab(struct bkey_s); #define bch2_bkey_ops_accounting ((struct bkey_ops) { \ - .key_invalid = bch2_accounting_invalid, \ + .key_validate = bch2_accounting_validate, \ .val_to_text = bch2_accounting_to_text, \ .swab = bch2_accounting_swab, \ .min_val_size = 8, \ @@ -107,41 +106,20 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r) int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool); void bch2_accounting_mem_gc(struct bch_fs *); -static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) -{ - struct bch_accounting_mem *acc = &c->accounting; - unsigned idx; - - EBUG_ON(gc && !acc->gc_running); - - while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { - int ret = bch2_accounting_mem_insert(c, a, gc); - if (ret) - return ret; - } - - struct accounting_mem_entry *e = &acc->k.data[idx]; - - EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); - - for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) - this_cpu_add(e->v[gc][i], a.v->d[i]); - return 0; -} - /* * Update in memory counters so they match the btree update we're doing; called * from transaction commit path */ -static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) +static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc, bool read) { struct bch_fs *c = trans->c; + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, a.k->p); - if (!gc) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, a.k->p); + if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + return 0; + if (!gc && !read) { switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; @@ -162,13 +140,31 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru } } - return __bch2_accounting_mem_mod(c, a, gc); + struct bch_accounting_mem *acc = &c->accounting; + unsigned idx; + + EBUG_ON(gc && !acc->gc_running); + + while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { + int ret = bch2_accounting_mem_insert(c, a, gc); + if (ret) + return ret; + } + + struct accounting_mem_entry *e = &acc->k.data[idx]; + + EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); + + for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) + this_cpu_add(e->v[gc][i], a.v->d[i]); + return 0; } static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) { percpu_down_read(&trans->c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, a, gc); + int ret = bch2_accounting_mem_mod_locked(trans, a, gc, false); percpu_up_read(&trans->c->mark_lock); return ret; } diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index cba417060b33..7b6e6c97e6aa 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -103,7 +103,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(compression, 4) \ x(snapshot, 5) \ x(btree, 6) \ - x(rebalance_work, 7) + x(rebalance_work, 7) \ + x(inum, 8) enum disk_accounting_type { #define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, 
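/*
 * Context for the BCH_DISK_ACCOUNTING_inum additions above and below:
 * extent triggers now also account per inode number. For each non-btree
 * extent, the __trigger_extent() hunk earlier builds roughly this
 * (a sketch using the names from the patch; the deltas are negative on
 * overwrite, and replicas_sectors is accumulated with its sign already
 * applied by the pointer triggers):
 *
 *	struct disk_accounting_pos acc = {
 *		.type		= BCH_DISK_ACCOUNTING_inum,
 *		.inum.inum	= k.k->p.inode,
 *	};
 *	s64 v[3] = {
 *		insert ? 1 : -1,			 // nr of extents
 *		insert ? k.k->size : -((s64) k.k->size), // size, in sectors
 *		replicas_sectors,			 // on-disk sectors
 *	};
 *	bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc);
 *
 * The __packed annotations in the hunk below matter for validation:
 * bch2_accounting_validate() checks that everything past the typed
 * payload is zero ("junk at end of accounting key"), which only works
 * if each payload ends exactly at its field_end() with no padding
 * bytes in between.
 */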
@@ -124,20 +125,23 @@ struct bch_dev_data_type { __u8 data_type; }; -struct bch_dev_stripe_buckets { - __u8 dev; -}; - struct bch_acct_compression { __u8 type; }; struct bch_acct_snapshot { __u32 id; -}; +} __packed; struct bch_acct_btree { __u32 id; +} __packed; + +struct bch_acct_inum { + __u64 inum; +} __packed; + +struct bch_acct_rebalance_work { }; struct disk_accounting_pos { @@ -149,12 +153,13 @@ struct disk_accounting_pos { struct bch_persistent_reserved persistent_reserved; struct bch_replicas_entry_v1 replicas; struct bch_dev_data_type dev_data_type; - struct bch_dev_stripe_buckets dev_stripe_buckets; struct bch_acct_compression compression; struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; - }; - }; + struct bch_acct_rebalance_work rebalance_work; + struct bch_acct_inum inum; + } __packed; + } __packed; struct bpos _pad; }; }; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 9b5b5c9a6c63..141a4c63142f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -107,24 +107,23 @@ struct ec_bio { /* Stripes btree keys: */ -int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; int ret = 0; bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || - bpos_gt(k.k->p, POS(0, U32_MAX)), c, err, - stripe_pos_bad, + bpos_gt(k.k->p, POS(0, U32_MAX)), + c, stripe_pos_bad, "stripe at bad pos"); - bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err, - stripe_val_size_bad, + bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), + c, stripe_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; } @@ -1809,6 +1808,9 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); BUG_ON(v->nr_redundant != h->s->nr_parity); + /* * We bypass the sector allocator which normally does this: */ + bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { __clear_bit(v->ptrs[i].dev, devs.d); if (i < h->s->nr_data) @@ -2235,6 +2237,23 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) mutex_unlock(&c->ec_stripes_heap_lock); } +static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, + struct ec_stripe_new *s) +{ + prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs", + s->idx, s->nr_data, s->nr_parity, + bitmap_weight(s->blocks_allocated, s->nr_data), + atomic_read(&s->ref[STRIPE_REF_io]), + atomic_read(&s->ref[STRIPE_REF_stripe]), + bch2_watermarks[s->h->watermark]); + + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + unsigned i; + for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) + prt_printf(out, " %u", s->blocks[i]); + prt_newline(out); +} + void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) { struct ec_stripe_head *h; @@ -2247,23 +2266,15 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) bch2_watermarks[h->watermark]); if (h->s) - prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", - h->s->idx, h->s->nr_data, h->s->nr_parity, - bitmap_weight(h->s->blocks_allocated, - h->s->nr_data)); + bch2_new_stripe_to_text(out, c, h->s); } 
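/*
 * One more cross-cutting detail in this series: BCH_SB_MEMBER_INVALID,
 * a sentinel device index apparently left behind when a member device
 * is removed. Several hunks teach existing code to tolerate it — the
 * buckets.c changes earlier stop flagging such pointers as fsck errors
 * or -EIO on insert, and the __bch2_ptr_matches_stripe() change just
 * below accepts it on either side of the stripe/data pointer match.
 * A check for it then looks roughly like (hypothetical helper):
 *
 *	static bool ptr_dev_removed(const struct bch_extent_ptr *ptr)
 *	{
 *		return ptr->dev == BCH_SB_MEMBER_INVALID;
 *	}
 */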
 	mutex_unlock(&c->ec_stripe_head_lock);
 
 	prt_printf(out, "in flight:\n");
 
 	mutex_lock(&c->ec_stripe_new_lock);
-	list_for_each_entry(s, &c->ec_stripe_new_list, list) {
-		prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
-			   s->idx, s->nr_data, s->nr_parity,
-			   atomic_read(&s->ref[STRIPE_REF_io]),
-			   atomic_read(&s->ref[STRIPE_REF_stripe]),
-			   bch2_watermarks[s->h->watermark]);
-	}
+	list_for_each_entry(s, &c->ec_stripe_new_list, list)
+		bch2_new_stripe_to_text(out, c, s);
 	mutex_unlock(&c->ec_stripe_new_lock);
 }
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 84a23eeb6249..9baf3411a8f9 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -8,8 +8,7 @@
 
 enum bch_validate_flags;
 
-int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
-			enum bch_validate_flags, struct printbuf *);
+int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
 void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
@@ -17,7 +16,7 @@ int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
 			enum btree_iter_update_trigger_flags);
 
 #define bch2_bkey_ops_stripe ((struct bkey_ops) {	\
-	.key_invalid	= bch2_stripe_invalid,		\
+	.key_validate	= bch2_stripe_validate,		\
 	.val_to_text	= bch2_stripe_to_text,		\
 	.swab		= bch2_ptr_swab,		\
 	.trigger	= bch2_trigger_stripe,		\
@@ -98,7 +97,9 @@ static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe
 					     const struct bch_extent_ptr *data_ptr,
 					     unsigned sectors)
 {
-	return  data_ptr->dev == stripe_ptr->dev &&
+	return (data_ptr->dev == stripe_ptr->dev ||
+		data_ptr->dev == BCH_SB_MEMBER_INVALID ||
+		stripe_ptr->dev == BCH_SB_MEMBER_INVALID) &&
 		data_ptr->gen == stripe_ptr->gen &&
 		data_ptr->offset >= stripe_ptr->offset &&
 		data_ptr->offset < stripe_ptr->offset + sectors;
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index a268af3e52bf..742dcdd3e5d7 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -166,6 +166,7 @@
 	x(0,			journal_reclaim_would_deadlock)	\
 	x(EINVAL,		fsck)				\
 	x(BCH_ERR_fsck,		fsck_fix)			\
+	x(BCH_ERR_fsck,		fsck_delete_bkey)		\
 	x(BCH_ERR_fsck,		fsck_ignore)			\
 	x(BCH_ERR_fsck,		fsck_errors_not_fixed)		\
 	x(BCH_ERR_fsck,		fsck_repair_unimplemented)	\
@@ -256,7 +257,6 @@
 	x(BCH_ERR_nopromote,	nopromote_in_flight)		\
 	x(BCH_ERR_nopromote,	nopromote_no_writes)		\
 	x(BCH_ERR_nopromote,	nopromote_enomem)		\
-	x(0,			need_inode_lock)		\
 	x(0,			invalid_snapshot_node)		\
 	x(0,			option_needs_open_fs)
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index a62b63108820..95afa7bf2020 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -416,6 +416,28 @@ err:
 	return ret;
 }
 
+int __bch2_bkey_fsck_err(struct bch_fs *c,
+			 struct bkey_s_c k,
+			 enum bch_fsck_flags flags,
+			 enum bch_sb_error_id err,
+			 const char *fmt, ...)
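/*
 * Annotation, not part of the patch: the helper declared above renders the
 * offending key with bch2_bkey_val_to_text(), appends the caller's message,
 * tacks on ": delete?", and routes the result through __bch2_fsck_err(), so
 * every bkey validation failure shares one fix/ignore/delete decision path.
 * A sketch of a direct call, mirroring what the reworked bkey_fsck_err()
 * macro in error.h (further down in this series) expands to:
 *
 *	int ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX,
 *				       BCH_FSCK_ERR_stripe_pos_bad,
 *				       "stripe at bad pos");
 *	if (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore)
 *		return ret;	// hard error
 *	// otherwise the caller deletes the key (-BCH_ERR_fsck_delete_bkey)
 */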
+{ + struct printbuf buf = PRINTBUF; + va_list args; + + prt_str(&buf, "invalid bkey "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\n "); + va_start(args, fmt); + prt_vprintf(&buf, fmt, args); + va_end(args); + prt_str(&buf, ": delete?"); + + int ret = __bch2_fsck_err(c, NULL, flags, err, "%s", buf.buf); + printbuf_exit(&buf); + return ret; +} + void bch2_flush_fsck_errs(struct bch_fs *c) { struct fsck_err_state *s, *n; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 995e6bba9bad..2f1b86978f36 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -4,6 +4,7 @@ #include <linux/list.h> #include <linux/printk.h> +#include "bkey_types.h" #include "sb-errors.h" struct bch_dev; @@ -166,24 +167,30 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define fsck_err_on(cond, c, _err_type, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) -__printf(4, 0) -static inline void bch2_bkey_fsck_err(struct bch_fs *c, - struct printbuf *err_msg, - enum bch_sb_error_id err_type, - const char *fmt, ...) -{ - va_list args; +__printf(5, 6) +int __bch2_bkey_fsck_err(struct bch_fs *, + struct bkey_s_c, + enum bch_fsck_flags, + enum bch_sb_error_id, + const char *, ...); - va_start(args, fmt); - prt_vprintf(err_msg, fmt, args); - va_end(args); -} - -#define bkey_fsck_err(c, _err_msg, _err_type, ...) \ +/* + * for now, bkey fsck errors are always handled by deleting the entire key - + * this will change at some point + */ +#define bkey_fsck_err(c, _err_type, _err_msg, ...) \ do { \ - prt_printf(_err_msg, __VA_ARGS__); \ - bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \ - ret = -BCH_ERR_invalid_bkey; \ + if ((flags & BCH_VALIDATE_silent)) { \ + ret = -BCH_ERR_fsck_delete_bkey; \ + goto fsck_err; \ + } \ + int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX, \ + BCH_FSCK_ERR_##_err_type, \ + _err_msg, ##__VA_ARGS__); \ + if (_ret != -BCH_ERR_fsck_fix && \ + _ret != -BCH_ERR_fsck_ignore) \ + ret = _ret; \ + ret = -BCH_ERR_fsck_delete_bkey; \ goto fsck_err; \ } while (0) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 07973198e35f..324303bf4353 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -171,17 +171,16 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err, - btree_ptr_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, + c, btree_ptr_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; } @@ -192,28 +191,27 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, - c, err, btree_ptr_v2_val_too_big, + c, btree_ptr_v2_val_too_big, "value too big (%zu > %zu)", bkey_val_u64s(k.k), 
BKEY_BTREE_PTR_VAL_U64s_MAX); bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), - c, err, btree_ptr_v2_min_key_bad, + c, btree_ptr_v2_min_key_bad, "min_key > key"); if (flags & BCH_VALIDATE_write) bkey_fsck_err_on(!bp.v->sectors_written, - c, err, btree_ptr_v2_written_0, + c, btree_ptr_v2_written_0, "sectors_written == 0"); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; } @@ -399,15 +397,14 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); int ret = 0; - bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err, - reservation_key_nr_replicas_invalid, + bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, + c, reservation_key_nr_replicas_invalid, "invalid nr_replicas (%u)", r.v->nr_replicas); fsck_err: return ret; @@ -784,14 +781,17 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, /* * Returns pointer to the next entry after the one being dropped: */ -union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, - struct bch_extent_ptr *ptr) +void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry = to_entry(ptr), *next; - union bch_extent_entry *ret = entry; bool drop_crc = true; + if (k.k->type == KEY_TYPE_stripe) { + ptr->dev = BCH_SB_MEMBER_INVALID; + return; + } + EBUG_ON(ptr < &ptrs.start->ptr || ptr >= &ptrs.end->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); @@ -814,21 +814,16 @@ union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, break; if ((extent_entry_is_crc(entry) && drop_crc) || - extent_entry_is_stripe_ptr(entry)) { - ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_is_stripe_ptr(entry)) extent_entry_drop(k, entry); - } } - - return ret; } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr) { bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; - union bch_extent_entry *ret = - bch2_bkey_drop_ptr_noerror(k, ptr); + + bch2_bkey_drop_ptr_noerror(k, ptr); /* * If we deleted all the dirty pointers and there's still cached @@ -840,14 +835,10 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, !bch2_bkey_dirty_devs(k.s_c).nr) { k.k->type = KEY_TYPE_error; set_bkey_val_u64s(k.k, 0); - ret = NULL; } else if (!bch2_bkey_nr_ptrs(k.s_c)) { k.k->type = KEY_TYPE_deleted; set_bkey_val_u64s(k.k, 0); - ret = NULL; } - - return ret; } void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) @@ -932,8 +923,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) if (p1.ptr.dev == p2.ptr.dev && p1.ptr.gen == p2.ptr.gen && + + /* + * This checks that the two pointers point + * to the same region on disk - adjusting + * for the difference in where the extents + * start, since one may have been trimmed: + */ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && + 
+ /* + * This additionally checks that the + * extents overlap on disk, since the + * previous check may trigger spuriously + * when one extent is immediately partially + * overwritten with another extent (so that + * on disk they are adjacent) and + * compression is in use: + */ + ((p1.ptr.offset >= p2.ptr.offset && + p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || + (p2.ptr.offset >= p1.ptr.offset && + p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) return true; return false; @@ -1020,6 +1032,8 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_printf(out, "ptr: %u:%llu:%u gen %u", ptr->dev, b, offset, ptr->gen); + if (ca->mi.durability != 1) + prt_printf(out, " d=%u", ca->mi.durability); if (ptr->cached) prt_str(out, " cached"); if (ptr->unwritten) @@ -1102,14 +1116,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } - -static int extent_ptr_invalid(struct bch_fs *c, - struct bkey_s_c k, - enum bch_validate_flags flags, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata, - struct printbuf *err) +static int extent_ptr_validate(struct bch_fs *c, + struct bkey_s_c k, + enum bch_validate_flags flags, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata) { int ret = 0; @@ -1128,28 +1140,27 @@ static int extent_ptr_invalid(struct bch_fs *c, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr2) - bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err, - ptr_to_duplicate_device, + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, + c, ptr_to_duplicate_device, "multiple pointers to same device (%u)", ptr->dev); - bkey_fsck_err_on(bucket >= nbuckets, c, err, - ptr_after_last_bucket, + bkey_fsck_err_on(bucket >= nbuckets, + c, ptr_after_last_bucket, "pointer past last bucket (%llu > %llu)", bucket, nbuckets); - bkey_fsck_err_on(bucket < first_bucket, c, err, - ptr_before_first_bucket, + bkey_fsck_err_on(bucket < first_bucket, + c, ptr_before_first_bucket, "pointer before first bucket (%llu < %u)", bucket, first_bucket); - bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, c, err, - ptr_spans_multiple_buckets, + bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, + c, ptr_spans_multiple_buckets, "pointer spans multiple buckets (%u + %u > %u)", bucket_offset, size_ondisk, bucket_size); fsck_err: return ret; } -int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1164,25 +1175,24 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { - bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err, - extent_ptrs_invalid_entry, - "invalid extent entry type (got %u, max %u)", - __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, + c, extent_ptrs_invalid_entry, + "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && - !extent_entry_is_ptr(entry), c, err, - btree_ptr_has_non_ptr, + !extent_entry_is_ptr(entry), + c, btree_ptr_has_non_ptr, "has non ptr field"); switch (extent_entry_type(entry)) { case 
BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_invalid(c, k, flags, &entry->ptr, - size_ondisk, false, err); + ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false); if (ret) return ret; - bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err, - ptr_cached_and_erasure_coded, + bkey_fsck_err_on(entry->ptr.cached && have_ec, + c, ptr_cached_and_erasure_coded, "cached, erasure coded ptr"); if (!entry->ptr.unwritten) @@ -1199,44 +1209,50 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err, - ptr_crc_uncompressed_size_too_small, + bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, + c, ptr_crc_uncompressed_size_too_small, "checksum offset + key size > uncompressed size"); - bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err, - ptr_crc_csum_type_unknown, + bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), + c, ptr_crc_csum_type_unknown, "invalid checksum type"); - bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err, - ptr_crc_compression_type_unknown, + bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, + c, ptr_crc_compression_type_unknown, "invalid compression type"); if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; else if (nonce != crc.offset + crc.nonce) - bkey_fsck_err(c, err, ptr_crc_nonce_mismatch, + bkey_fsck_err(c, ptr_crc_nonce_mismatch, "incorrect nonce"); } - bkey_fsck_err_on(crc_since_last_ptr, c, err, - ptr_crc_redundant, + bkey_fsck_err_on(crc_since_last_ptr, + c, ptr_crc_redundant, "redundant crc entry"); crc_since_last_ptr = true; bkey_fsck_err_on(crc_is_encoded(crc) && (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, err, - ptr_crc_uncompressed_size_too_big, + (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), + c, ptr_crc_uncompressed_size_too_big, "too large encoded extent"); size_ondisk = crc.compressed_size; break; case BCH_EXTENT_ENTRY_stripe_ptr: - bkey_fsck_err_on(have_ec, c, err, - ptr_stripe_redundant, + bkey_fsck_err_on(have_ec, + c, ptr_stripe_redundant, "redundant stripe entry"); have_ec = true; break; case BCH_EXTENT_ENTRY_rebalance: { + /* + * this shouldn't be a fsck error, for forward + * compatibility; the rebalance code should just refetch + * the compression opt if it's unknown + */ +#if 0 const struct bch_extent_rebalance *r = &entry->rebalance; if (!bch2_compression_opt_valid(r->compression)) { @@ -1245,28 +1261,29 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, opt.type, opt.level); return -BCH_ERR_invalid_bkey; } +#endif break; } } } - bkey_fsck_err_on(!nr_ptrs, c, err, - extent_ptrs_no_ptrs, + bkey_fsck_err_on(!nr_ptrs, + c, extent_ptrs_no_ptrs, "no ptrs"); - bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err, - extent_ptrs_too_many_ptrs, + bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, + c, extent_ptrs_too_many_ptrs, "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); - bkey_fsck_err_on(have_written && have_unwritten, c, err, - extent_ptrs_written_and_unwritten, + bkey_fsck_err_on(have_written && have_unwritten, + c, extent_ptrs_written_and_unwritten, "extent with unwritten and written ptrs"); - bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err, - extent_ptrs_unwritten, + bkey_fsck_err_on(k.k->type != 
KEY_TYPE_extent && have_unwritten, + c, extent_ptrs_unwritten, "has unwritten ptrs"); - bkey_fsck_err_on(crc_since_last_ptr, c, err, - extent_ptrs_redundant_crc, + bkey_fsck_err_on(crc_since_last_ptr, + c, extent_ptrs_redundant_crc, "redundant crc entry"); - bkey_fsck_err_on(have_ec, c, err, - extent_ptrs_redundant_stripe, + bkey_fsck_err_on(have_ec, + c, extent_ptrs_redundant_stripe, "redundant stripe entry"); fsck_err: return ret; @@ -1377,6 +1394,45 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) return r != NULL; } +static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k, + unsigned target, unsigned compression) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 sectors = 0; + + if (compression) { + unsigned compression_type = bch2_compression_opt_to_type(compression); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) { + sectors = 0; + goto incompressible; + } + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + sectors += p.crc.compressed_size; + } + } +incompressible: + if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target)) + sectors += p.crc.compressed_size; + } + + return sectors; +} + +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + + return r ? __bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0; +} + int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, struct bch_io_opts *opts) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index facdb8a86eec..42a7c6d820a0 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -409,26 +409,26 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); #define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_invalid, \ + .key_validate = bch2_btree_ptr_validate, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_extent, \ }) #define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_v2_invalid, \ + .key_validate = bch2_btree_ptr_v2_validate, \ .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ @@ -441,7 +441,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_extent ((struct bkey_ops) { \ - .key_invalid = bch2_bkey_ptrs_invalid, \ + .key_validate = bch2_bkey_ptrs_validate, \ 
.val_to_text = bch2_bkey_ptrs_to_text, \ .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ @@ -451,13 +451,13 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reservation ((struct bkey_ops) { \ - .key_invalid = bch2_reservation_invalid, \ + .key_validate = bch2_reservation_validate, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ .trigger = bch2_trigger_reservation, \ @@ -649,26 +649,21 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); -union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s, - struct bch_extent_ptr *); -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, - struct bch_extent_ptr *); +void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *); +void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); #define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ do { \ - struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ + __label__ _again; \ + struct bkey_ptrs _ptrs; \ +_again: \ + _ptrs = bch2_bkey_ptrs(_k); \ \ - struct bch_extent_ptr *_ptr = &_ptrs.start->ptr; \ - \ - while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ + bkey_for_each_ptr(_ptrs, _ptr) \ if (_cond) { \ - _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ - _ptrs = bch2_bkey_ptrs(_k); \ - continue; \ + bch2_bkey_drop_ptr(_k, _ptr); \ + goto _again; \ } \ - \ - (_ptr)++; \ - } \ } while (0) bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, @@ -683,8 +678,8 @@ bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_ptr_swab(struct bkey_s); @@ -692,6 +687,7 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, unsigned, unsigned); bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, struct bch_io_opts *); diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index cc33d763f722..ff60c041abe5 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -534,7 +534,7 @@ do_io: if (f_sectors > w->tmp_sectors) { kfree(w->tmp); - w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); + w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL); w->tmp_sectors = f_sectors; } @@ -659,7 +659,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc int bch2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - 
struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -728,12 +728,11 @@ out: goto err; } - *pagep = &folio->page; + *foliop = folio; return 0; err: folio_unlock(folio); folio_put(folio); - *pagep = NULL; err_unlock: bch2_pagecache_add_put(inode); kfree(res); @@ -743,12 +742,11 @@ err_unlock: int bch2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation *res = fsdata; - struct folio *folio = page_folio(page); unsigned offset = pos - folio_pos(folio); lockdep_assert_held(&inode->v.i_rwsem); @@ -802,8 +800,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi) static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, struct iov_iter *iter, - loff_t pos, unsigned len, - bool inode_locked) + loff_t pos, unsigned len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; @@ -827,15 +824,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, BUG_ON(!fs.nr); - /* - * If we're not using the inode lock, we need to lock all the folios for - * atomiticity of writes vs. other writes: - */ - if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { - ret = -BCH_ERR_need_inode_lock; - goto out; - } - f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); @@ -932,10 +920,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, end = pos + copied; spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) { - BUG_ON(!inode_locked); + if (end > inode->v.i_size) i_size_write(&inode->v, end); - } spin_unlock(&inode->v.i_lock); f_pos = pos; @@ -979,68 +965,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos; - bool inode_locked = false; - ssize_t written = 0, written2 = 0, ret = 0; - - /* - * We don't take the inode lock unless i_size will be changing. Folio - * locks provide exclusion with other writes, and the pagecache add lock - * provides exclusion with truncate and hole punching. - * - * There is one nasty corner case where atomicity would be broken - * without great care: when copying data from userspace to the page - * cache, we do that with faults disable - a page fault would recurse - * back into the filesystem, taking filesystem locks again, and - * deadlock; so it's done with faults disabled, and we fault in the user - * buffer when we aren't holding locks. - * - * If we do part of the write, but we then race and in the userspace - * buffer have been evicted and are no longer resident, then we have to - * drop our folio locks to re-fault them in, breaking write atomicity. - * - * To fix this, we restart the write from the start, if we weren't - * holding the inode lock. - * - * There is another wrinkle after that; if we restart the write from the - * start, and then get an unrecoverable error, we _cannot_ claim to - * userspace that we did not write data we actually did - so we must - * track (written2) the most we ever wrote. 
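The long rationale being deleted here documents why the lockless-append scheme needed the written2 bookkeeping; the series drops all of it in favor of taking the inode lock for every buffered write in bch2_write_iter() (the new version appears further down). A minimal sketch of that simpler shape, with hypothetical sketch_* names and only the core steps:

#include <linux/fs.h>
#include <linux/uio.h>

/* hypothetical: the folio-by-folio copy loop, like __bch2_buffered_write() */
static ssize_t sketch_buffered_write(struct kiocb *iocb, struct iov_iter *from);

static ssize_t sketch_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);		/* one lock serializes all buffered writes */

	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = sketch_buffered_write(iocb, from);
	if (ret > 0)
		iocb->ki_pos += ret;

	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);	/* O_SYNC handling, outside the lock */
	return ret;
}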
- */ - - if ((iocb->ki_flags & IOCB_APPEND) || - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { - inode_lock(&inode->v); - inode_locked = true; - } - - ret = generic_write_checks(iocb, iter); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0); - if (ret) { - if (!inode_locked) { - inode_lock(&inode->v); - inode_locked = true; - ret = file_remove_privs_flags(file, 0); - } - if (ret) - goto unlock; - } - - ret = file_update_time(file); - if (ret) - goto unlock; - - pos = iocb->ki_pos; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; bch2_pagecache_add_get(inode); - if (!inode_locked && - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) - goto get_inode_lock; - do { unsigned offset = pos & (PAGE_SIZE - 1); unsigned bytes = iov_iter_count(iter); @@ -1065,17 +995,12 @@ again: } } - if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) - goto get_inode_lock; - if (unlikely(fatal_signal_pending(current))) { ret = -EINTR; break; } - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); - if (ret == -BCH_ERR_need_inode_lock) - goto get_inode_lock; + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); if (unlikely(ret < 0)) break; @@ -1096,46 +1021,50 @@ again: } pos += ret; written += ret; - written2 = max(written, written2); - - if (ret != bytes && !inode_locked) - goto get_inode_lock; ret = 0; balance_dirty_pages_ratelimited(mapping); - - if (0) { -get_inode_lock: - bch2_pagecache_add_put(inode); - inode_lock(&inode->v); - inode_locked = true; - bch2_pagecache_add_get(inode); - - iov_iter_revert(iter, written); - pos -= written; - written = 0; - ret = 0; - } } while (iov_iter_count(iter)); - bch2_pagecache_add_put(inode); -unlock: - if (inode_locked) - inode_unlock(&inode->v); - iocb->ki_pos += written; + bch2_pagecache_add_put(inode); - ret = max(written, written2) ?: ret; - if (ret > 0) - ret = generic_write_sync(iocb, ret); - return ret; + return written ? written : ret; } -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) { - ssize_t ret = iocb->ki_flags & IOCB_DIRECT - ? 
bch2_direct_write(iocb, iter) - : bch2_buffered_write(iocb, iter); + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } + + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs(file); + if (ret) + goto unlock; + + ret = file_update_time(file); + if (ret) + goto unlock; + + ret = bch2_buffered_write(iocb, from); + if (likely(ret > 0)) + iocb->ki_pos += ret; +unlock: + inode_unlock(&inode->v); + if (ret > 0) + ret = generic_write_sync(iocb, ret); +out: return bch2_err_class(ret); } diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h index a6126ff790e6..3207ebbb4ab4 100644 --- a/fs/bcachefs/fs-io-buffered.h +++ b/fs/bcachefs/fs-io-buffered.h @@ -10,10 +10,10 @@ int bch2_read_folio(struct file *, struct folio *); int bch2_writepages(struct address_space *, struct writeback_control *); void bch2_readahead(struct readahead_control *); -int bch2_write_begin(struct file *, struct address_space *, loff_t, - unsigned, struct page **, void **); +int bch2_write_begin(struct file *, struct address_space *, loff_t pos, + unsigned len, struct folio **, void **); int bch2_write_end(struct file *, struct address_space *, loff_t, - unsigned, unsigned, struct page *, void *); + unsigned len, unsigned copied, struct folio *, void *); ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index aea8132d2c40..99c7fe987c74 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -328,9 +328,8 @@ static int bch2_ioc_setlabel(struct bch_fs *c, mutex_lock(&c->sb_lock); strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); - mutex_unlock(&c->sb_lock); - ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); mnt_drop_write_file(file); return ret; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3a5f49affa0a..011817afc3ad 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -177,6 +177,14 @@ static unsigned bch2_inode_hash(subvol_inum inum) return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); } +struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) +{ + return to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); +} + static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) { subvol_inum inum = inode_inum(inode); @@ -193,7 +201,7 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino * only insert fully created inodes in the inode hash table. But * discard_new_inode() expects it to be set... */ - inode->v.i_flags |= I_NEW; + inode->v.i_state |= I_NEW; /* * We don't want bch2_evict_inode() to delete the inode on disk, * we just raced and had another inode in cache. 
Normally new @@ -1199,7 +1207,7 @@ static const struct inode_operations bch_file_inode_operations = { .fiemap = bch2_fiemap, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1219,7 +1227,7 @@ static const struct inode_operations bch_dir_inode_operations = { .tmpfile = bch2_tmpfile, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1241,7 +1249,7 @@ static const struct inode_operations bch_symlink_inode_operations = { .setattr = bch2_setattr, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1251,7 +1259,7 @@ static const struct inode_operations bch_special_inode_operations = { .setattr = bch2_setattr, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1644,14 +1652,16 @@ again: break; } } else if (clean_pass && this_pass_clean) { - wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW); - DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW); + prepare_to_wait_event(wq_head, &wqe.wq_entry, + TASK_UNINTERRUPTIBLE); mutex_unlock(&c->vfs_inodes_lock); schedule(); - finish_wait(wq, &wait.wq_entry); + finish_wait(wq_head, &wqe.wq_entry); goto again; } } diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index c3af7225ff69..990ec43e0365 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -56,6 +56,8 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) }; } +struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum); + /* * Set if we've gotten a btree error for this inode, and thus the vfs inode and * btree inode may be inconsistent: @@ -194,6 +196,11 @@ int bch2_vfs_init(void); #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) +static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) +{ + return NULL; +} + static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9138944c5ae6..9b3470a97546 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -8,6 +8,7 @@ #include "darray.h" #include "dirent.h" #include "error.h" +#include "fs.h" #include "fs-common.h" #include "fsck.h" #include "inode.h" @@ -962,6 +963,22 @@ fsck_err: return ret; } +static bool bch2_inode_open(struct bch_fs *c, struct bpos p) +{ + subvol_inum inum = { + .subvol = snapshot_t(c, p.snapshot)->subvol, + .inum = p.offset, + }; + + /* snapshot tree corruption, can't safely delete */ + if (!inum.subvol) { + bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot); + return true; + } + + return __bch2_inode_hash_find(c, inum) != NULL; +} + static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1040,6 +1057,7 @@ static int check_inode(struct btree_trans *trans, } if (u.bi_flags & BCH_INODE_unlinked && + !bch2_inode_open(c, k.k->p) && (!c->sb.clean || fsck_err(trans, 
inode_unlinked_but_clean, "filesystem marked clean, but inode %llu unlinked", @@ -2006,7 +2024,6 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret) { bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); ret = -BCH_ERR_fsck_repair_unimplemented; - ret = 0; goto err; } @@ -2216,6 +2233,8 @@ int bch2_check_xattrs(struct bch_fs *c) NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_xattr(trans, &iter, k, &hash_info, &inode))); + + inode_walker_exit(&inode); bch_err_fn(c, ret); return ret; } @@ -2469,8 +2488,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino : bch2_inode_unpack(inode_k, &inode); if (ret) { /* Should have been caught in dirents pass */ - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error looking up parent directory: %i", ret); + bch_err_msg(c, ret, "error looking up parent directory"); break; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 1e20020eadd1..2be6be33afa3 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -434,100 +434,98 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) return &inode_p->inode.k_i; } -static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) +static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bch_inode_unpacked unpacked; int ret = 0; - bkey_fsck_err_on(k.k->p.inode, c, err, - inode_pos_inode_nonzero, + bkey_fsck_err_on(k.k->p.inode, + c, inode_pos_inode_nonzero, "nonzero k.p.inode"); - bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err, - inode_pos_blockdev_range, + bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, + c, inode_pos_blockdev_range, "fs inode in blockdev range"); - bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err, - inode_unpack_error, + bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), + c, inode_unpack_error, "invalid variable length fields"); - bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err, - inode_checksum_type_invalid, + bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, + c, inode_checksum_type_invalid, "invalid data checksum type (%u >= %u", unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); bkey_fsck_err_on(unpacked.bi_compression && - !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err, - inode_compression_type_invalid, + !bch2_compression_opt_valid(unpacked.bi_compression - 1), + c, inode_compression_type_invalid, "invalid compression opt %u", unpacked.bi_compression - 1); bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && - unpacked.bi_nlink != 0, c, err, - inode_unlinked_but_nlink_nonzero, + unpacked.bi_nlink != 0, + c, inode_unlinked_but_nlink_nonzero, "flagged as unlinked but bi_nlink != 0"); - bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err, - inode_subvol_root_but_not_dir, + bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), + c, inode_subvol_root_but_not_dir, "subvolume root but not a directory"); fsck_err: return ret; } -int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); int ret = 0; - bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + 
bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, flags); fsck_err: return ret; } -int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); int ret = 0; - bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, flags); fsck_err: return ret; } -int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); int ret = 0; bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || - INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err, - inode_v3_fields_start_bad, + INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), + c, inode_v3_fields_start_bad, "invalid fields_start (got %llu, min %u max %zu)", INODEv3_FIELDS_START(inode.v), INODEv3_FIELDS_START_INITIAL, bkey_val_u64s(inode.k)); - bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, flags); fsck_err: return ret; } @@ -625,14 +623,13 @@ int bch2_trigger_inode(struct btree_trans *trans, return 0; } -int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(k.k->p.inode, c, err, - inode_pos_inode_nonzero, + bkey_fsck_err_on(k.k->p.inode, + c, inode_pos_inode_nonzero, "nonzero k.p.inode"); fsck_err: return ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index da0e4a745099..f1fcb4c58039 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -9,12 +9,12 @@ enum bch_validate_flags; extern const char * const bch2_inode_opts[]; -int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_inode_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); +int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); +int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, @@ -22,21 +22,21 @@ int 
bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ - .key_invalid = bch2_inode_invalid, \ + .key_validate = bch2_inode_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_inode_v2_invalid, \ + .key_validate = bch2_inode_v2_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 32, \ }) #define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ - .key_invalid = bch2_inode_v3_invalid, \ + .key_validate = bch2_inode_v3_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 48, \ @@ -49,12 +49,12 @@ static inline bool bkey_is_inode(const struct bkey *k) k->type == KEY_TYPE_inode_v3; } -int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ - .key_invalid = bch2_inode_generation_invalid, \ + .key_validate = bch2_inode_generation_validate, \ .val_to_text = bch2_inode_generation_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 2cf6297756f8..177ed331c00b 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -126,11 +126,7 @@ err_noprint: if (closure_nr_remaining(&cl) != 1) { bch2_trans_unlock_long(trans); - - if (closure_sync_timeout(&cl, HZ * 10)) { - bch2_print_allocator_stuck(c); - closure_sync(&cl); - } + bch2_wait_on_allocator(c, &cl); } return ret; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 4531c9ab3e12..7ee3b75480df 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -406,6 +406,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->read_pos, BTREE_ITER_slots); retry: + bch2_trans_begin(trans); rbio->bio.bi_status = 0; k = bch2_btree_iter_peek_slot(&iter); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index d31c8d006d97..1d4761d15002 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1503,10 +1503,7 @@ err: if ((op->flags & BCH_WRITE_SYNC) || (!(op->flags & BCH_WRITE_SUBMITTED) && !(op->flags & BCH_WRITE_IN_WORKER))) { - if (closure_sync_timeout(&op->cl, HZ * 10)) { - bch2_print_allocator_stuck(c); - closure_sync(&op->cl); - } + bch2_wait_on_allocator(c, &op->cl); __bch2_write_index(op); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 649e3a01608a..f5f7db50ca31 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1260,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) } if (!had_entries) - j->last_empty_seq = cur_seq; + j->last_empty_seq = cur_seq - 1; /* to match j->seq */ spin_lock(&j->lock); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7a833a3f1c63..7664b68e6a15 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -332,7 +332,6 @@ static int journal_validate_key(struct bch_fs *c, { int write = flags & BCH_VALIDATE_write; void *next = vstruct_next(entry); - struct printbuf buf = PRINTBUF; int ret = 0; if (journal_entry_err_on(!k->k.u64s, @@ -368,34 +367,21 @@ 
static int journal_validate_key(struct bch_fs *c, bch2_bkey_compat(level, btree_id, version, big_endian, write, NULL, bkey_to_packed(k)); - if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write, &buf)) { - printbuf_reset(&buf); - journal_entry_err_msg(&buf, version, jset, entry); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - prt_newline(&buf); - bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write, &buf); - - mustfix_fsck_err(c, journal_entry_bkey_invalid, - "%s", buf.buf); - + ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write); + if (ret == -BCH_ERR_fsck_delete_bkey) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); - - printbuf_exit(&buf); return FSCK_DELETED_KEY; } + if (ret) + goto fsck_err; if (write) bch2_bkey_compat(level, btree_id, version, big_endian, write, NULL, bkey_to_packed(k)); fsck_err: - printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index db80e506e3ab..62b910f2fb27 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -104,6 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); int ret = -BCH_ERR_invalid_sb_journal; + u64 sum = 0; unsigned nr; unsigned i; struct u64_range *b; @@ -119,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f for (i = 0; i < nr; i++) { b[i].start = le64_to_cpu(journal->d[i].start); b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); + + if (b[i].end <= b[i].start) { + prt_printf(err, "journal buckets entry with bad nr: %llu+%llu", + le64_to_cpu(journal->d[i].start), + le64_to_cpu(journal->d[i].nr)); + goto err; + } + + sum += le64_to_cpu(journal->d[i].nr); } sort(b, nr, sizeof(*b), u64_range_cmp, NULL); @@ -148,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f } } + if (sum > UINT_MAX) { + prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX); + goto err; + } + ret = 0; err: kfree(b); diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 83b1586cb371..96f2f4f8c397 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -10,14 +10,13 @@ #include "recovery.h" /* KEY_TYPE_lru is obsolete: */ -int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err, - lru_entry_at_time_0, + bkey_fsck_err_on(!lru_pos_time(k.k->p), + c, lru_entry_at_time_0, "lru entry at time=0"); fsck_err: return ret; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 5bd8974a7f11..e6a7d8241bb8 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -33,14 +33,13 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) return BCH_LRU_read; } -int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, 
struct bpos); #define bch2_bkey_ops_lru ((struct bkey_ops) { \ - .key_invalid = bch2_lru_invalid, \ + .key_validate = bch2_lru_validate, \ .val_to_text = bch2_lru_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index deef4f024d20..d86565bf07c8 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -383,7 +383,7 @@ static int bch2_copygc_thread(void *arg) if (min_member_capacity == U64_MAX) min_member_capacity = 128 * 2048; - bch2_trans_unlock_long(ctxt.trans); + move_buckets_wait(&ctxt, buckets, true); bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), MAX_SCHEDULE_TIMEOUT); } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 60b93018501f..cda1725702ea 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -391,6 +391,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ NULL, "Log transaction function names in journal") \ + x(allocator_stuck_timeout, u16, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, U16_MAX), \ + BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \ + NULL, "Default timeout in seconds for stuck allocator messages")\ x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index a0cca8b70e0a..c32a05e252e2 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -59,13 +59,13 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { .to_text = bch2_sb_quota_to_text, }; -int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err, - quota_type_invalid, + bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, + c, quota_type_invalid, "invalid quota type (%llu >= %u)", k.k->p.inode, QTYP_NR); fsck_err: diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 02d37a332218..a62abcc5332a 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -8,12 +8,11 @@ enum bch_validate_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ - .key_invalid = bch2_quota_invalid, \ + .key_validate = bch2_quota_validate, \ .val_to_text = bch2_quota_to_text, \ .min_val_size = 32, \ }) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d89eb43c5ce9..36de1c6fe8c3 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -241,7 +241,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) const struct journal_key *l = *((const struct journal_key **)_l); const struct journal_key *r = *((const struct journal_key **)_r); - return cmp_int(l->journal_seq, r->journal_seq); + /* + * Map 0 to U64_MAX, so that keys with journal_seq === 0 come last + * + * journal_seq == 0 means that the key comes from early repair, and + * should be inserted last so as to avoid overflowing the journal + */ + return cmp_int(l->journal_seq - 1, r->journal_seq - 1); } int bch2_journal_replay(struct bch_fs *c) @@ -322,6 +328,7 @@ int bch2_journal_replay(struct bch_fs *c) } } + bch2_trans_unlock_long(trans); /* * Now, replay any remaining keys in the order in which they appear in * 
the journal, unpinning those journal entries as we go: diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 5f92715e1525..e59c0abb4772 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -29,15 +29,14 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ -int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), - c, err, reflink_p_front_pad_bad, + c, reflink_p_front_pad_bad, "idx < front_pad (%llu < %u)", le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); fsck_err: @@ -256,11 +255,10 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, /* indirect extents */ -int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { - return bch2_bkey_ptrs_invalid(c, k, flags, err); + return bch2_bkey_ptrs_validate(c, k, flags); } void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, @@ -311,9 +309,8 @@ int bch2_trigger_reflink_v(struct btree_trans *trans, /* indirect inline data */ -int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index e894f3a2c67a..51afe11d8ed6 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -4,41 +4,37 @@ enum bch_validate_flags; -int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ - .key_invalid = bch2_reflink_p_invalid, \ + .key_validate = bch2_reflink_p_validate, \ .val_to_text = bch2_reflink_p_to_text, \ .key_merge = bch2_reflink_p_merge, \ .trigger = bch2_trigger_reflink_p, \ .min_val_size = 16, \ }) -int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ - .key_invalid = bch2_reflink_v_invalid, \ + .key_validate = bch2_reflink_v_validate, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_reflink_v, \ .min_val_size = 8, \ }) -int 
bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_indirect_inline_data(struct btree_trans *, @@ -47,7 +43,7 @@ int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ - .key_invalid = bch2_indirect_inline_data_invalid, \ + .key_validate = bch2_indirect_inline_data_validate, \ .val_to_text = bch2_indirect_inline_data_to_text, \ .trigger = bch2_trigger_indirect_inline_data, \ .min_val_size = 8, \ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 10c96cb2047a..1f34c92a6d11 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -24,7 +24,6 @@ static int bch2_memcmp(const void *l, const void *r, const void *priv) static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) { #ifdef CONFIG_BCACHEFS_DEBUG - BUG_ON(e->data_type >= BCH_DATA_NR); BUG_ON(!e->nr_devs); BUG_ON(e->nr_required > 1 && e->nr_required >= e->nr_devs); @@ -83,7 +82,8 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, } for (unsigned i = 0; i < r->nr_devs; i++) - if (!bch2_member_exists(sb, r->devs[i])) { + if (r->devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_member_exists(sb, r->devs[i])) { prt_printf(err, "invalid device %u in entry ", r->devs[i]); goto bad; } @@ -452,7 +452,8 @@ retry: .type = BCH_DISK_ACCOUNTING_replicas, }; - memcpy(&k.replicas, e, replicas_entry_bytes(e)); + unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e), + "embedded variable length struct"); struct bpos p = disk_accounting_pos_to_bpos(&k); @@ -795,7 +796,7 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, nr_online += test_bit(e->devs[i], devs.d); struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]); - nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed; + nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; } rcu_read_unlock(); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index dfbbd33c8731..c7e4cdd3f6a5 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -61,6 +61,23 @@ BCH_FSCK_ERR_dev_usage_buckets_wrong, \ BCH_FSCK_ERR_dev_usage_sectors_wrong, \ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_accounting_mismatch) \ + x(disk_accounting_v3, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_bkey_version_in_future, \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \ + BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(disk_accounting_inum, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) \ + x(rebalance_work_acct_fix, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_accounting_mismatch) #define DOWNGRADE_TABLE() \ @@ -79,7 +96,25 @@ BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ BCH_FSCK_ERR_fs_usage_replicas_wrong, \ - BCH_FSCK_ERR_bkey_version_in_future) + BCH_FSCK_ERR_bkey_version_in_future) \ + x(disk_accounting_v3, \ + 
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_fs_usage_hidden_wrong, \ + BCH_FSCK_ERR_fs_usage_btree_wrong, \ + BCH_FSCK_ERR_fs_usage_data_wrong, \ + BCH_FSCK_ERR_fs_usage_cached_wrong, \ + BCH_FSCK_ERR_fs_usage_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ + BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_replicas_wrong, \ + BCH_FSCK_ERR_accounting_replicas_not_marked, \ + BCH_FSCK_ERR_bkey_version_in_future) \ + x(rebalance_work_acct_fix, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) struct upgrade_downgrade_entry { u64 recovery_passes; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d1b2f2aa397a..f0c14702f9e6 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -23,7 +23,7 @@ enum bch_fsck_flags { x(jset_past_bucket_end, 9, 0) \ x(jset_seq_blacklisted, 10, 0) \ x(journal_entries_missing, 11, 0) \ - x(journal_entry_replicas_not_marked, 12, 0) \ + x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \ x(journal_entry_past_jset_end, 13, 0) \ x(journal_entry_replicas_data_mismatch, 14, 0) \ x(journal_entry_bkey_u64s_0, 15, 0) \ @@ -287,7 +287,11 @@ enum bch_fsck_flags { x(accounting_replicas_not_marked, 273, 0) \ x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ - x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) + x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ + x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ + x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 39196f2a4197..4b765422dd77 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -11,7 +11,8 @@ void bch2_dev_missing(struct bch_fs *c, unsigned dev) { - bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); + if (dev != BCH_SB_MEMBER_INVALID) + bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); } void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket) diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index e2630548c0f6..d727d2dfda08 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -8,6 +8,11 @@ */ #define BCH_SB_MEMBERS_MAX 64 +/* + * Sentinel value - indicates a device that does not exist + */ +#define BCH_SB_MEMBER_INVALID 255 + #define BCH_MIN_NR_NBUCKETS (1 << 6) #define BCH_IOPS_MEASUREMENTS() \ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 96744b1a76f5..8b18a9b483a4 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -31,15 +31,14 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(t.v->root_snapshot)); } -int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), c, err, - snapshot_tree_pos_bad, + bkey_lt(k.k->p, POS(0, 1)), + c, snapshot_tree_pos_bad, "bad pos"); fsck_err: return ret; @@ -225,55 +224,54 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(s.v->skip[2])); } -int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_snapshot s; u32 i, id; int ret = 0; bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), c, err, - snapshot_pos_bad, + bkey_lt(k.k->p, POS(0, 1)), + c, snapshot_pos_bad, "bad pos"); s = bkey_s_c_to_snapshot(k); id = le32_to_cpu(s.v->parent); - bkey_fsck_err_on(id && id <= k.k->p.offset, c, err, - snapshot_parent_bad, + bkey_fsck_err_on(id && id <= k.k->p.offset, + c, snapshot_parent_bad, "bad parent node (%u <= %llu)", id, k.k->p.offset); - bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err, - snapshot_children_not_normalized, + bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), + c, snapshot_children_not_normalized, "children not normalized"); - bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err, - snapshot_child_duplicate, + bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], + c, snapshot_child_duplicate, "duplicate child nodes"); for (i = 0; i < 2; i++) { id = le32_to_cpu(s.v->children[i]); - bkey_fsck_err_on(id >= k.k->p.offset, c, err, - snapshot_child_bad, + bkey_fsck_err_on(id >= k.k->p.offset, + c, snapshot_child_bad, "bad child node (%u >= %llu)", id, k.k->p.offset); } if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || - le32_to_cpu(s.v->skip[1]) >
le32_to_cpu(s.v->skip[2]), + c, snapshot_skiplist_not_normalized, "skiplist not normalized"); for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { id = le32_to_cpu(s.v->skip[i]); - bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err, - snapshot_skiplist_bad, + bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), + c, snapshot_skiplist_bad, "bad skiplist node %u", id); } } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 31b0ee03e962..eb5ef64221d6 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -5,11 +5,11 @@ enum bch_validate_flags; void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_tree_invalid, \ + .key_validate = bch2_snapshot_tree_validate, \ .val_to_text = bch2_snapshot_tree_to_text, \ .min_val_size = 8, \ }) @@ -19,14 +19,13 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_invalid, \ + .key_validate = bch2_snapshot_validate, \ .val_to_text = bch2_snapshot_to_text, \ .trigger = bch2_mark_snapshot, \ .min_val_size = 24, \ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index f56720b55862..dbe834cb349f 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -207,23 +207,23 @@ int bch2_check_subvol_children(struct bch_fs *c) /* Subvolumes: */ -int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); int ret = 0; bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || - bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err, - subvol_pos_bad, + bkey_gt(k.k->p, SUBVOL_POS_MAX), + c, subvol_pos_bad, "invalid pos"); - bkey_fsck_err_on(!subvol.v->snapshot, c, err, - subvol_snapshot_bad, + bkey_fsck_err_on(!subvol.v->snapshot, + c, subvol_snapshot_bad, "invalid snapshot"); - bkey_fsck_err_on(!subvol.v->inode, c, err, - subvol_inode_bad, + bkey_fsck_err_on(!subvol.v->inode, + c, subvol_inode_bad, "invalid inode"); fsck_err: return ret; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index afa5e871efb2..a8299ba2cab2 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -10,15 +10,14 @@ enum bch_validate_flags; int bch2_check_subvols(struct bch_fs *); int bch2_check_subvol_children(struct bch_fs *); -int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, 
struct bkey_s_c); int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ - .key_invalid = bch2_subvolume_invalid, \ + .key_validate = bch2_subvolume_validate, \ .val_to_text = bch2_subvolume_to_text, \ .trigger = bch2_subvolume_trigger, \ .min_val_size = 16, \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 8bc819832790..c8c2ccbdfbb5 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -414,6 +414,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 && + !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb)) + SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30); } for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 0455a1001fec..e7fa2de35014 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1193,7 +1193,6 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); - kfree(ca->buckets_nouse); bch2_free_super(&ca->disk_sb); bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 1c0d1fb20276..33f2a64c14c9 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -367,7 +367,7 @@ SHOW(bch2_fs) bch2_stripes_heap_to_text(out, c); if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c); + bch2_open_buckets_to_text(out, c, NULL); if (attr == &sysfs_open_buckets_partial) bch2_open_buckets_partial_to_text(out, c); @@ -461,7 +461,7 @@ STORE(bch2_fs) sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->btree_key_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); + c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc); } if (attr == &sysfs_trigger_gc) @@ -811,6 +811,9 @@ SHOW(bch2_dev) if (attr == &sysfs_alloc_debug) bch2_dev_alloc_debug_to_text(out, ca); + if (attr == &sysfs_open_buckets) + bch2_open_buckets_to_text(out, c, ca); + return 0; } @@ -892,6 +895,7 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, + &sysfs_open_buckets, NULL }; diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c index dc48b52b01b4..dfad1d06633d 100644 --- a/fs/bcachefs/trace.c +++ b/fs/bcachefs/trace.c @@ -4,6 +4,7 @@ #include "buckets.h" #include "btree_cache.h" #include "btree_iter.h" +#include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "keylist.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index d0e6b9deb6cb..c62f00322d1e 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -988,10 +988,33 @@ TRACE_EVENT(trans_restart_split_race, __entry->u64s_remaining) ); -DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, +TRACE_EVENT(trans_blocked_journal_reclaim, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + TP_ARGS(trans, caller_ip), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + + __field(unsigned long, key_cache_nr_keys ) + __field(unsigned long, key_cache_nr_dirty ) + __field(long, must_wait ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + 
__entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys); + __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty); + __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c); + ), + + TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li", + __entry->trans_fn, (void *) __entry->caller_ip, + __entry->key_cache_nr_keys, + __entry->key_cache_nr_dirty, + __entry->must_wait) ); TRACE_EVENT(trans_restart_journal_preres_get, diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 138320eaa2ad..1b8554460af4 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -416,7 +416,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstop_push(out, TABSTOP_SIZE + 2); prt_printf(out, "\tsince mount\r\trecent\r\n"); - prt_printf(out, "recent"); printbuf_tabstops_reset(out); printbuf_tabstop_push(out, out->indent + 20); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index c11bf6dacc2c..331f944d73dc 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -70,17 +70,16 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { .cmp_bkey = xattr_cmp_bkey, }; -int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len)); int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err, - xattr_val_size_too_small, + bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, + c, xattr_val_size_too_small, "value too small (%zu < %u)", bkey_val_u64s(k.k), val_u64s); @@ -88,17 +87,17 @@ int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, val_u64s = xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len) + 4); - bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err, - xattr_val_size_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, + c, xattr_val_size_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), val_u64s); - bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err, - xattr_invalid_type, + bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), + c, xattr_invalid_type, "invalid type (%u)", xattr.v->x_type); - bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err, - xattr_name_invalid_chars, + bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), + c, xattr_name_invalid_chars, "xattr name has invalid characters"); fsck_err: return ret; @@ -613,10 +612,20 @@ static int bch2_xattr_bcachefs_get_effective( name, buffer, size, true); } +/* Noop - xattrs in the bcachefs_effective namespace are inherited */ +static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *vinode, + const char *name, const void *value, + size_t size, int flags) +{ + return 0; +} + static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { .prefix = "bcachefs_effective.", .get = bch2_xattr_bcachefs_get_effective, - .set = bch2_xattr_bcachefs_set, + .set = bch2_xattr_bcachefs_set_effective, }; #endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 1574b9eb4c85..c188a5ad64ce 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,12 +6,11 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; 
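The xattr.h hunk that continues below completes, for xattrs, the same mechanical conversion applied to the lru, quota, snapshot, and subvolume key types elsewhere in this diff: each per-type .key_invalid hook becomes .key_validate, and the trailing struct printbuf *err parameter is dropped, because bkey_fsck_err_on() now reports through the struct bch_fs handle and a superblock error code rather than formatting into a caller-supplied buffer. A minimal sketch of the resulting hook shape, in kernel context; the "example" names below are hypothetical stand-ins for a real key type and a real entry in sb-errors_format.h:

    int bch2_example_validate(struct bch_fs *c, struct bkey_s_c k,
                              enum bch_validate_flags flags)
    {
            int ret = 0;

            /* No printbuf argument: errors are routed through @c. */
            bkey_fsck_err_on(!bkey_val_bytes(k.k),
                             c, example_val_empty,
                             "empty value at %llu", k.k->p.offset);
    fsck_err:
            return ret;
    }

The fsck_err label and the ret convention are unchanged; only the reporting plumbing inside the macro moved.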
-int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ - .key_invalid = bch2_xattr_invalid, \ + .key_validate = bch2_xattr_validate, \ .val_to_text = bch2_xattr_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bfs/file.c b/fs/bfs/file.c index a778411574a9..fa66a09e496a 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -172,11 +172,11 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) static int bfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, bfs_get_block); + ret = block_write_begin(mapping, pos, len, foliop, bfs_get_block); if (unlikely(ret)) bfs_write_failed(mapping, pos + len); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 19fa49cd9907..34d0d1e43f36 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1314,6 +1314,11 @@ out_free_interp: emulate the SVr4 behavior. Sigh. */ error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE, 0); + + retval = do_mseal(0, PAGE_SIZE, 0); + if (retval) + pr_warn_ratelimited("pid=%d, couldn't seal address 0, ret=%d.\n", + task_pid_nr(current), retval); } regs = current_pt_regs(); @@ -2027,8 +2032,10 @@ static int elf_core_dump(struct coredump_params *cprm) * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(&elf, e_phnum, &info, cprm)) + if (!fill_note_info(&elf, e_phnum, &info, cprm)) { + coredump_report_failure("Error collecting note info"); goto end_coredump; + } has_dumped = 1; @@ -2039,12 +2046,14 @@ static int elf_core_dump(struct coredump_params *cprm) { size_t sz = info.size; - /* For cell spufs */ + /* For cell spufs and x86 xstate */ sz += elf_coredump_extra_notes_size(); phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); - if (!phdr4note) + if (!phdr4note) { + coredump_report_failure("Error allocating program headers note entry"); goto end_coredump; + } fill_elf_note_phdr(phdr4note, sz, offset); offset += sz; @@ -2058,18 +2067,24 @@ static int elf_core_dump(struct coredump_params *cprm) if (e_phnum == PN_XNUM) { shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); - if (!shdr4extnum) + if (!shdr4extnum) { + coredump_report_failure("Error allocating extra program headers"); goto end_coredump; + } fill_extnum_info(&elf, shdr4extnum, e_shoff, segs); } offset = dataoff; - if (!dump_emit(cprm, &elf, sizeof(elf))) + if (!dump_emit(cprm, &elf, sizeof(elf))) { + coredump_report_failure("Error emitting the ELF headers"); goto end_coredump; + } - if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) + if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) { + coredump_report_failure("Error emitting the program header for notes"); goto end_coredump; + } /* Write program headers for segments dump */ for (i = 0; i < cprm->vma_count; i++) { @@ -2092,20 +2107,28 @@ static int elf_core_dump(struct coredump_params *cprm) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; - if (!dump_emit(cprm, &phdr, sizeof(phdr))) + if (!dump_emit(cprm, &phdr, sizeof(phdr))) { + coredump_report_failure("Error emitting program headers"); goto end_coredump; + } } - if 
(!elf_core_write_extra_phdrs(cprm, offset)) + if (!elf_core_write_extra_phdrs(cprm, offset)) { + coredump_report_failure("Error writing out extra program headers"); goto end_coredump; + } /* write out the notes section */ - if (!write_note_info(&info, cprm)) + if (!write_note_info(&info, cprm)) { + coredump_report_failure("Error writing out notes"); goto end_coredump; + } - /* For cell spufs */ - if (elf_coredump_extra_notes_write(cprm)) + /* For cell spufs and x86 xstate */ + if (elf_coredump_extra_notes_write(cprm)) { + coredump_report_failure("Error writing out extra notes"); goto end_coredump; + } /* Align to page */ dump_skip_to(cprm, dataoff); @@ -2113,16 +2136,22 @@ static int elf_core_dump(struct coredump_params *cprm) for (i = 0; i < cprm->vma_count; i++) { struct core_vma_metadata *meta = cprm->vma_meta + i; - if (!dump_user_range(cprm, meta->start, meta->dump_size)) + if (!dump_user_range(cprm, meta->start, meta->dump_size)) { + coredump_report_failure("Error writing out the process memory"); goto end_coredump; + } } - if (!elf_core_write_extra_data(cprm)) + if (!elf_core_write_extra_data(cprm)) { + coredump_report_failure("Error writing out extra data"); goto end_coredump; + } if (e_phnum == PN_XNUM) { - if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum))) + if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum))) { + coredump_report_failure("Error emitting extra program headers"); goto end_coredump; + } } end_coredump: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 28a3439f163a..4fe5bb9f1b1f 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -589,6 +589,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, if (bprm->have_execfd) nitems++; +#ifdef ELF_HWCAP2 + nitems++; +#endif csp = sp; sp -= nitems * 2 * sizeof(unsigned long); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index c26545d71d39..cd6d5bbb4b9d 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -72,8 +72,10 @@ #ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET #define DATA_START_OFFSET_WORDS (0) +#define MAX_SHARED_LIBS_UPDATE (0) #else #define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS) +#define MAX_SHARED_LIBS_UPDATE (MAX_SHARED_LIBS) #endif struct lib_info { @@ -880,7 +882,7 @@ static int load_flat_binary(struct linux_binprm *bprm) return res; /* Update data segment pointers for all libraries */ - for (i = 0; i < MAX_SHARED_LIBS; i++) { + for (i = 0; i < MAX_SHARED_LIBS_UPDATE; i++) { if (!libinfo.lib_list[i].loaded) continue; for (j = 0; j < MAX_SHARED_LIBS; j++) { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a2de5c05f97c..e2f478ecd7fd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -219,8 +219,8 @@ static void free_pref(struct prelim_ref *ref) * A -1 return indicates ref1 is a 'lower' block than ref2, while 1 * indicates a 'higher' block. 
*/ -static int prelim_ref_compare(struct prelim_ref *ref1, - struct prelim_ref *ref2) +static int prelim_ref_compare(const struct prelim_ref *ref1, + const struct prelim_ref *ref2) { if (ref1->level < ref2->level) return -1; @@ -251,7 +251,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1, } static void update_share_count(struct share_check *sc, int oldcount, - int newcount, struct prelim_ref *newref) + int newcount, const struct prelim_ref *newref) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index f04d93109960..fec5c6cde0a7 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -53,7 +53,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, /* * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for - * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * btrfs, and is used for all I/O submitted through btrfs_submit_bbio(). * * Just like the underlying bio_alloc_bioset it will not fail as it is backed by * a mempool. @@ -73,20 +73,13 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, struct btrfs_bio *orig_bbio, - u64 map_length, bool use_append) + u64 map_length) { struct btrfs_bio *bbio; struct bio *bio; - if (use_append) { - unsigned int nr_segs; - - bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, - &btrfs_clone_bioset, map_length); - } else { - bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, - GFP_NOFS, &btrfs_clone_bioset); - } + bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); bbio = btrfs_bio(bio); btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; @@ -120,12 +113,6 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) } } -void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) -{ - bbio->bio.bi_status = status; - __btrfs_bio_end_io(bbio); -} - static void btrfs_orig_write_end_io(struct bio *bio); static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, @@ -147,8 +134,9 @@ static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, } } -static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { + bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; @@ -179,7 +167,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) static void btrfs_repair_done(struct btrfs_failed_bio *fbio) { if (atomic_dec_and_test(&fbio->repair_count)) { - btrfs_orig_bbio_end_io(fbio->bbio); + btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status); mempool_free(fbio, &btrfs_failed_bio_pool); } } @@ -211,7 +199,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, goto done; } - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return; } @@ -280,7 +268,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return fbio; } @@ -326,7 +314,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de if (fbio) btrfs_repair_done(fbio); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, 
bbio->bio.bi_status); } static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) @@ -360,7 +348,7 @@ static void btrfs_end_bio_work(struct work_struct *work) if (is_data_bbio(bbio)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_simple_end_io(struct bio *bio) @@ -380,7 +368,7 @@ static void btrfs_simple_end_io(struct bio *bio) } else { if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) btrfs_record_physical_zoned(bbio); - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } } @@ -394,7 +382,7 @@ static void btrfs_raid56_end_io(struct bio *bio) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) btrfs_check_read_bio(bbio, NULL); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -424,7 +412,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -502,8 +490,8 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, - struct btrfs_io_stripe *smap, int mirror_num) +static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { if (!bioc) { /* Single mirror read/write fast path. */ @@ -593,7 +581,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) /* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { - btrfs_orig_bbio_end_io(async->bbio); + btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); return; } @@ -603,7 +591,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) * context. This changes nothing when cgroups aren't in use. 
*/ bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; - __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); + btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); } static bool should_async_write(struct btrfs_bio *bbio) @@ -664,11 +652,23 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, return true; } +static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) +{ + unsigned int nr_segs; + int sector_offset; + + map_length = min(map_length, bbio->fs_info->max_zone_append_size); + sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, + &nr_segs, map_length); + if (sector_offset) + return sector_offset << SECTOR_SHIFT; + return map_length; +} + static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = bbio->fs_info; - struct btrfs_bio *orig_bbio = bbio; struct bio *bio = &bbio->bio; u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; @@ -679,7 +679,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t ret; int error; - smap.is_scrub = !bbio->inode; + if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) + smap.rst_search_commit_root = true; + else + smap.rst_search_commit_root = false; btrfs_bio_counter_inc_blocked(fs_info); error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, @@ -691,10 +694,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) map_length = min(map_length, length); if (use_append) - map_length = min(map_length, fs_info->max_zone_append_size); + map_length = btrfs_append_map_length(bbio, map_length); if (map_length < length) { - bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); + bbio = btrfs_split_bio(fs_info, bbio, map_length); bio = &bbio->bio; } @@ -706,7 +709,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); if (ret) - goto fail_put_bio; + goto fail; } if (btrfs_op(bio) == BTRFS_MAP_WRITE) { @@ -740,31 +743,40 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ret = btrfs_bio_csum(bbio); if (ret) - goto fail_put_bio; + goto fail; } else if (use_append || (btrfs_is_zoned(fs_info) && inode && inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); if (ret) - goto fail_put_bio; + goto fail; } } - __btrfs_submit_bio(bio, bioc, &smap, mirror_num); + btrfs_submit_bio(bio, bioc, &smap, mirror_num); done: return map_length == length; -fail_put_bio: - if (map_length < length) - btrfs_cleanup_bio(bbio); fail: btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(orig_bbio, ret); + /* + * We have split the original bbio, now we have to end both the current + * @bbio and remaining one, as the remaining one will never be submitted. + */ + if (map_length < length) { + struct btrfs_bio *remaining = bbio->private; + + ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); + ASSERT(remaining); + + btrfs_bio_end_io(remaining, ret); + } + btrfs_bio_end_io(bbio, ret); /* Do not submit another chunk */ return true; } -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) { /* If bbio->inode is not populated, its file_offset must be 0. */ ASSERT(bbio->inode || bbio->file_offset == 0); @@ -776,7 +788,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) /* * Submit a repair write. 
* - * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a + * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. * @@ -865,7 +877,7 @@ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_ ASSERT(smap.dev == fs_info->dev_replace.srcdev); smap.dev = fs_info->dev_replace.tgtdev; } - __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); + btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); return; fail: diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index d9dd5276093d..e48612340745 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -29,7 +29,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and - * passed to btrfs_submit_bio for mapping to the physical devices. + * passed to btrfs_submit_bbio() for mapping to the physical devices. */ struct btrfs_bio { /* @@ -42,7 +42,7 @@ struct btrfs_bio { union { /* * For data reads: checksumming and original I/O information. - * (for internal use in the btrfs_submit_bio machinery only) + * (for internal use in the btrfs_submit_bbio() machinery only) */ struct { u8 *csum; @@ -104,7 +104,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); /* Submit using blkcg_punt_bio_submit. */ #define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct folio *folio, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 2e49d978f504..7980b2e33a92 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -23,7 +23,7 @@ #include "extent-tree.h" #ifdef CONFIG_BTRFS_DEBUG -int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) +int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -40,9 +40,9 @@ int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) * * Should be called with balance_lock held */ -static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags) { - struct btrfs_balance_control *bctl = fs_info->balance_ctl; + const struct btrfs_balance_control *bctl = fs_info->balance_ctl; u64 target = 0; if (!bctl) @@ -1415,9 +1415,9 @@ out: } static bool clean_pinned_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group *bg) + const struct btrfs_block_group *bg) { - struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *prev_trans = NULL; const u64 start = bg->start; const u64 end = start + bg->length - 1; @@ -1756,14 +1756,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, return bg1->used > bg2->used; } -static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) +static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) { if (btrfs_is_zoned(fs_info)) return btrfs_zoned_should_reclaim(fs_info); return true; } -static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 
bytes_freed) +static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed) { const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info); u64 thresh_bytes = mult_perc(bg->length, thresh_pct); @@ -2006,8 +2006,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } -static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, - struct btrfs_path *path) +static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key, + const struct btrfs_path *path) { struct btrfs_chunk_map *map; struct btrfs_block_group_item bg; @@ -2055,7 +2055,7 @@ out_free_map: static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; @@ -2640,8 +2640,8 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, } static int insert_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 chunk_offset, - u64 start, u64 num_bytes) + const struct btrfs_device *device, u64 chunk_offset, + u64 start, u64 num_bytes) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; @@ -2817,7 +2817,7 @@ next: * For extent tree v2 we use the block_group_item->chunk_offset to point at our * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. */ -static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) +static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset) { u64 div = SZ_1G; u64 index; @@ -3842,8 +3842,8 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } } -static int should_alloc_chunk(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *sinfo, int force) +static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, int force) { u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; @@ -4218,7 +4218,7 @@ out: return ret; } -static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) +static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type) { u64 num_dev; @@ -4622,7 +4622,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, return 0; } -bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg) { if (btrfs_is_zoned(bg->fs_info)) return false; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 915111338fc0..36937eeab9b8 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -266,7 +266,7 @@ struct btrfs_block_group { u64 reclaim_mark; }; -static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group) { return (block_group->start + block_group->length); } @@ -278,8 +278,7 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); } -static inline bool btrfs_is_block_group_data_only( - struct btrfs_block_group *block_group) +static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) { /* * In mixed mode the fragmentation is expected to be high, lowering the @@ -290,7 +289,7 @@ static inline bool btrfs_is_block_group_data_only( } 
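Most of the block-group changes above are pure constification: helpers that only read a block group or fs_info (prelim_ref_compare(), should_reclaim_block_group(), btrfs_block_group_end(), and friends) now take const pointers, so any accidental store through them becomes a compile-time error. A hypothetical helper in the same style, reading the same fields as btrfs_is_block_group_used() in this hunk:

    static inline bool example_bg_is_idle(const struct btrfs_block_group *bg)
    {
            /* With the const qualifier, "bg->pinned = 0;" here would fail
             * to compile: assignment to a read-only location. */
            return bg->used == 0 && bg->reserved == 0 && bg->pinned == 0;
    }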
#ifdef CONFIG_BTRFS_DEBUG -int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group); +int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( @@ -370,7 +369,7 @@ static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } -static inline int btrfs_block_group_done(struct btrfs_block_group *cache) +static inline int btrfs_block_group_done(const struct btrfs_block_group *cache) { smp_mb(); return cache->cached == BTRFS_CACHE_FINISHED || @@ -387,6 +386,6 @@ enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class); -bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); +bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg); #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index b299b82d676e..a07b9594dc70 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -553,7 +553,7 @@ try_reserve: return ERR_PTR(ret); } -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, +int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv) { u64 needed_bytes; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 1f53b967d069..d12b1fac5c74 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -89,7 +89,7 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize); -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, +int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 3056c8aed8ef..9a4b7c119318 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -350,10 +350,12 @@ static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, WRITE_ONCE(inode->first_dir_index_to_log, index); } -static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) -{ - return container_of(inode, struct btrfs_inode, vfs_inode); -} +/* Type checked and const-preserving VFS inode -> btrfs inode. */ +#define BTRFS_I(_inode) \ + _Generic(_inode, \ + struct inode *: container_of(_inode, struct btrfs_inode, vfs_inode), \ + const struct inode *: (const struct btrfs_inode *)container_of( \ + _inode, const struct btrfs_inode, vfs_inode)) static inline unsigned long btrfs_inode_hash(u64 objectid, const struct btrfs_root *root) @@ -505,6 +507,14 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) return true; } +static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) +{ + /* Immediately trigger a crash if the inode is not locked. */ + ASSERT(inode_is_locked(&inode->vfs_inode)); + /* Trigger a splat in dmesg if this task is not holding the lock. 
*/ + lockdep_assert_held(&inode->vfs_inode.i_rwsem); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes @@ -578,7 +588,7 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path); struct inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, u64 start, u64 len); + struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, @@ -596,9 +606,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page); +int btrfs_writepage_cow_fixup(struct folio *folio); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index a8e2c461aff7..90aef2627ca2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -138,15 +138,15 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, + const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { - case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: @@ -395,7 +395,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, cb->bbio.ordered = ordered; btrfs_add_compressed_bio_folios(cb); - btrfs_submit_bio(&cb->bbio, 0); + btrfs_submit_bbio(&cb->bbio, 0); } /* @@ -420,7 +420,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; u64 isize = i_size_read(inode); int ret; - struct page *page; + struct folio *folio; struct extent_map *em; struct address_space *mapping = inode->i_mapping; struct extent_map_tree *em_tree; @@ -453,9 +453,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - page = xa_load(&mapping->i_pages, pg_index); - if (page && !xa_is_value(page)) { - sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >> + folio = __filemap_get_folio(mapping, pg_index, 0, 0); + if (!IS_ERR(folio)) { + u64 folio_sz = folio_size(folio); + u64 offset = offset_in_folio(folio, cur); + + folio_put(folio); + sectors_missed += (folio_sz - offset) >> fs_info->sectorsize_bits; /* Beyond threshold, no need to continue */ @@ -466,35 +470,35 @@ static 
noinline int add_ra_bio_pages(struct inode *inode, * Jump to next page start as we already have page for * current offset. */ - cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + cur += (folio_sz - offset); continue; } - page = __page_cache_alloc(mapping_gfp_constraint(mapping, - ~__GFP_FS)); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, + ~__GFP_FS), 0); + if (!folio) break; - if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { - put_page(page); + if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) { /* There is already a page, skip to page end */ - cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + cur += folio_size(folio); + folio_put(folio); continue; } - if (!*memstall && PageWorkingset(page)) { + if (!*memstall && folio_test_workingset(folio)) { psi_memstall_enter(pflags); *memstall = 1; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } - page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; + page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1; lock_extent(tree, cur, page_end, NULL); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); @@ -511,28 +515,28 @@ static noinline int add_ra_bio_pages(struct inode *inode, orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, cur, page_end, NULL); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } add_size = min(em->start + em->len, page_end + 1) - cur; free_extent_map(em); + unlock_extent(tree, cur, page_end, NULL); - if (page->index == end_index) { - size_t zero_offset = offset_in_page(isize); + if (folio->index == end_index) { + size_t zero_offset = offset_in_folio(folio, isize); if (zero_offset) { int zeros; - zeros = PAGE_SIZE - zero_offset; - memzero_page(page, zero_offset, zeros); + zeros = folio_size(folio) - zero_offset; + folio_zero_range(folio, zero_offset, zeros); } } - ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur)); - if (ret != add_size) { - unlock_extent(tree, cur, page_end, NULL); - unlock_page(page); - put_page(page); + if (!bio_add_folio(orig_bio, folio, add_size, + offset_in_folio(folio, cur))) { + folio_unlock(folio); + folio_put(folio); break; } /* @@ -541,9 +545,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. */ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, page_folio(page), - cur, add_size); - put_page(page); + btrfs_subpage_start_reader(fs_info, folio, cur, + add_size); + folio_put(folio); cur += add_size; } return 0; @@ -626,7 +630,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) if (memstall) psi_memstall_leave(&pflags); - btrfs_submit_bio(&cb->bbio, 0); + btrfs_submit_bbio(&cb->bbio, 0); return; out_free_compressed_pages: @@ -1057,10 +1061,10 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * single page, and we want to read a single page out of it. 
* start_byte tells us the offset into the compressed data we're interested in */ -int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { - struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); struct list_head *workspace; const u32 sectorsize = fs_info->sectorsize; int ret; @@ -1073,7 +1077,7 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); workspace = get_workspace(type, 0); - ret = compression_decompress(type, workspace, data_in, dest_page, + ret = compression_decompress(type, workspace, data_in, dest_folio, dest_pgoff, srclen, destlen); put_workspace(type, workspace); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index cfdc64319186..b6563b6a333e 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -82,13 +82,21 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) return ((type_level & 0xF0) >> 4); } +/* @range_end must be exclusive. */ +static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) +{ + u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE; + + return min(range_end, page_end) - cur; +} + int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); -int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); @@ -154,7 +162,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); @@ -165,7 +173,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); @@ -175,7 +183,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); 
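The new btrfs_calc_input_length() in the compression.h hunk above clamps a compression input span so it never crosses a page boundary: the result is min(range_end, end of the page containing cur) minus cur. A standalone worked example, assuming 4 KiB pages and a userspace stand-in for the kernel's round_down():

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096u
    /* Stand-in for the kernel's round_down() for power-of-two alignment. */
    #define round_down(x, y) ((x) & ~((uint64_t)(y) - 1))

    static uint32_t calc_input_length(uint64_t range_end, uint64_t cur)
    {
            uint64_t page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE;
            uint64_t end = range_end < page_end ? range_end : page_end;

            return (uint32_t)(end - cur);
    }

    int main(void)
    {
            /* cur is mid-page: clamped to the page boundary at 0x2000. */
            printf("%u\n", calc_input_length(0x5000, 0x1800)); /* 2048 */
            /* The range ends before the page does: clamped to range_end. */
            printf("%u\n", calc_input_length(0x1c00, 0x1800)); /* 1024 */
            return 0;
    }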
void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 451203055bbf..0cc919d15b14 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2564,8 +2564,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, * */ static void fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, int level) + const struct btrfs_path *path, + const struct btrfs_disk_key *key, int level) { int i; struct extent_buffer *t; @@ -2594,7 +2594,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, * that the new key won't break the order */ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_path *path, + const struct btrfs_path *path, const struct btrfs_key *new_key) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2660,8 +2660,8 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, * is correct, we only need to bother the last key of @left and the first * key of @right. */ -static bool check_sibling_keys(struct extent_buffer *left, - struct extent_buffer *right) +static bool check_sibling_keys(const struct extent_buffer *left, + const struct extent_buffer *right) { struct btrfs_key left_last; struct btrfs_key right_first; @@ -2928,8 +2928,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * blocknr is the block the key points to. */ static int insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, u64 bytenr, + const struct btrfs_path *path, + const struct btrfs_disk_key *key, u64 bytenr, int slot, int level) { struct extent_buffer *lower; @@ -4019,7 +4019,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, * the front. */ void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 new_size, int from_end) + const struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -4111,7 +4111,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, * make the item pointed to by the path bigger, data_size is the added size. 
*/ void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 data_size) + const struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 75fa563e4cac..1a44fb9845e3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,6 +6,7 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H +#include <linux/cleanup.h> #include <linux/pagemap.h> #include <linux/spinlock.h> #include <linux/rbtree.h> @@ -84,6 +85,9 @@ struct btrfs_path { unsigned int nowait:1; }; +#define BTRFS_PATH_AUTO_FREE(path_name) \ + struct btrfs_path *path_name __free(btrfs_free_path) = NULL + /* * The state of btrfs root */ @@ -459,7 +463,6 @@ struct btrfs_file_private { void *filldir_buf; u64 last_index; struct extent_state *llseek_cached_state; - bool fsync_skip_inode_lock; }; static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) @@ -539,7 +542,7 @@ int btrfs_previous_item(struct btrfs_root *root, int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_path *path, + const struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -573,9 +576,9 @@ bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 data_size); + const struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 new_size, int from_end); + const struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -599,6 +602,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root, void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); +DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T)) int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index f6dbda37a361..acf1f39e45d0 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -45,8 +45,8 @@ struct inode_defrag { u32 extent_thresh; }; -static int __compare_inode_defrag(struct inode_defrag *defrag1, - struct inode_defrag *defrag2) +static int compare_inode_defrag(const struct inode_defrag *defrag1, + const struct inode_defrag *defrag2) { if (defrag1->root > defrag2->root) return 1; @@ -61,16 +61,14 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, } /* - * Pop a record for an inode into the defrag tree. The lock must be held + * Insert a record for an inode into the defrag tree. The lock must be held * already. * * If you're inserting a record for an older transid than an existing record, * the transid already in the tree is lowered. - * - * If an existing record is found the defrag item you pass in is freed.
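For readers new to the cleanup helpers pulled in above: DEFINE_FREE() registers a destructor and __free() binds it to a local variable, so the object is released on every exit path, which is exactly what BTRFS_PATH_AUTO_FREE() packages for btrfs_path. A minimal sketch of the idiom with illustrative names (not part of the patch):

	#include <linux/cleanup.h>
	#include <linux/slab.h>

	struct demo { int x; };

	/* Teach the cleanup machinery how to dispose of a struct demo. */
	DEFINE_FREE(demo_free, struct demo *, kfree(_T))

	static int demo_user(void)
	{
		struct demo *d __free(demo_free) = kzalloc(sizeof(*d), GFP_KERNEL);

		if (!d)
			return -ENOMEM;	/* kfree(NULL) in the cleanup is harmless */
		/* Every return from here on frees @d automatically. */
		return 0;
	}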
*/ -static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) +static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, + struct inode_defrag *defrag) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct inode_defrag *entry; @@ -83,7 +81,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, parent = *p; entry = rb_entry(parent, struct inode_defrag, rb_node); - ret = __compare_inode_defrag(defrag, entry); + ret = compare_inode_defrag(defrag, entry); if (ret < 0) p = &parent->rb_left; else if (ret > 0) @@ -107,7 +105,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, return 0; } -static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) +static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) return 0; @@ -119,34 +117,28 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) } /* - * Insert a defrag record for this inode if auto defrag is enabled. + * Insert a defrag record for this inode if auto defrag is enabled. No errors + * returned as they're not considered fatal. */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) +void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct inode_defrag *defrag; - u64 transid; int ret; - if (!__need_auto_defrag(fs_info)) - return 0; + if (!need_auto_defrag(fs_info)) + return; if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) - return 0; - - if (trans) - transid = trans->transid; - else - transid = btrfs_get_root_last_trans(root); + return; defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) - return -ENOMEM; + return; defrag->ino = btrfs_ino(inode); - defrag->transid = transid; + defrag->transid = btrfs_get_root_last_trans(root); defrag->root = btrfs_root_id(root); defrag->extent_thresh = extent_thresh; @@ -157,14 +149,13 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, * and then re-read this inode, this new inode doesn't have * IN_DEFRAG flag. In that case, we may find the existing defrag.
*/ - ret = __btrfs_add_inode_defrag(inode, defrag); + ret = btrfs_insert_inode_defrag(inode, defrag); if (ret) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } else { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } spin_unlock(&fs_info->defrag_inodes_lock); - return 0; } /* @@ -189,7 +180,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); - ret = __compare_inode_defrag(&tmp, entry); + ret = compare_inode_defrag(&tmp, entry); if (ret < 0) p = parent->rb_left; else if (ret > 0) @@ -198,7 +189,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( goto out; } - if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + if (parent && compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); if (parent) entry = rb_entry(parent, struct inode_defrag, rb_node); @@ -214,27 +205,22 @@ out: void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) { - struct inode_defrag *defrag; - struct rb_node *node; + struct inode_defrag *defrag, *next; spin_lock(&fs_info->defrag_inodes_lock); - node = rb_first(&fs_info->defrag_inodes); - while (node) { - rb_erase(node, &fs_info->defrag_inodes); - defrag = rb_entry(node, struct inode_defrag, rb_node); - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - cond_resched_lock(&fs_info->defrag_inodes_lock); + rbtree_postorder_for_each_entry_safe(defrag, next, + &fs_info->defrag_inodes, rb_node) + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - node = rb_first(&fs_info->defrag_inodes); - } spin_unlock(&fs_info->defrag_inodes_lock); } #define BTRFS_DEFRAG_BATCH 1024 -static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) +static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag, + struct file_ra_state *ra) { struct btrfs_root *inode_root; struct inode *inode; @@ -245,7 +231,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, again: if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) goto cleanup; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) goto cleanup; /* Get the inode */ @@ -273,9 +259,10 @@ again: range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; + file_ra_state_init(ra, inode->i_mapping); sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); iput(inode); @@ -302,11 +289,13 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) atomic_inc(&fs_info->defrag_running); while (1) { + struct file_ra_state ra = { 0 }; + /* Pause the auto defragger. 
*/ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) break; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) break; /* find an inode to defrag */ @@ -324,7 +313,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) first_ino = defrag->ino + 1; root_objectid = defrag->root; - __btrfs_run_defrag_inode(fs_info, defrag); + btrfs_run_defrag_inode(fs_info, defrag, &ra); } atomic_dec(&fs_info->defrag_running); @@ -1317,8 +1306,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode, if (entry->start + range_len <= *last_scanned_ret) continue; - if (ra) - page_cache_sync_readahead(inode->vfs_inode.i_mapping, + page_cache_sync_readahead(inode->vfs_inode.i_mapping, ra, NULL, entry->start >> PAGE_SHIFT, ((entry->start + range_len - 1) >> PAGE_SHIFT) - (entry->start >> PAGE_SHIFT) + 1); @@ -1350,7 +1338,7 @@ out: * Entry point to file defragmentation. * * @inode: inode to be defragged - * @ra: readahead state (can be NUL) + * @ra: readahead state * @range: defrag options including range and flags * @newer_than: minimum transid to defrag * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode @@ -1372,12 +1360,13 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); - bool ra_allocated = false; int compress_type = BTRFS_COMPRESS_ZLIB; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index; + ASSERT(ra); + if (isize == 0) return 0; @@ -1407,18 +1396,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, last_byte = round_up(last_byte, fs_info->sectorsize) - 1; /* - * If we were not given a ra, allocate a readahead context. As - * readahead is just an optimization, defrag will work without it so - * we don't error out. - */ - if (!ra) { - ra_allocated = true; - ra = kzalloc(sizeof(*ra), GFP_KERNEL); - if (ra) - file_ra_state_init(ra, inode->i_mapping); - } - - /* * Make writeback start from the beginning of the range, so that the * defrag range can be written sequentially. */ @@ -1472,8 +1449,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, cond_resched(); } - if (ra_allocated) - kfree(ra); /* * Update range.start for autodefrag, this will indicate where to start * in next run. 
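One note before the defrag.h hunk: the btrfs_cleanup_defrag_inodes() rewrite above leans on rbtree_postorder_for_each_entry_safe(), which visits children before parents so each node can be freed with no rb_erase() and no rebalancing. The idiom in isolation, as a hedged sketch with illustrative names:

	#include <linux/rbtree.h>
	#include <linux/slab.h>

	struct demo_item {
		struct rb_node rb_node;
		int key;
	};

	/* Tear down a whole rbtree that no one else can touch anymore. */
	static void demo_drop_all(struct rb_root *root)
	{
		struct demo_item *cur, *next;

		/*
		 * Post-order guarantees @cur's children were already visited,
		 * so freeing @cur never invalidates the iterator's next step.
		 */
		rbtree_postorder_for_each_entry_safe(cur, next, root, rb_node)
			kfree(cur);
		*root = RB_ROOT;
	}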
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 878528e086fb..6b7596c4f0dc 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -18,8 +18,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); +void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_defrag_root(struct btrfs_root *root); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 2ac9296edccb..ad9ef8312e41 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -855,11 +855,17 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, /* Record qgroup extent info if provided */ if (qrecord) { - if (btrfs_qgroup_trace_extent_nolock(trans->fs_info, - delayed_refs, qrecord)) + int ret; + + ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info, + delayed_refs, qrecord); + if (ret) { + /* Clean up if insertion fails or item exists. */ + xa_release(&delayed_refs->dirty_extents, qrecord->bytenr); kfree(qrecord); - else + } else { qrecord_inserted = true; + } } trace_add_delayed_ref_head(trans->fs_info, head_ref, action); @@ -1005,18 +1011,16 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_ref_node_cachep, node); - return -ENOMEM; - } + if (!head_ref) + goto free_node; if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); - if (!record) { - kmem_cache_free(btrfs_delayed_ref_node_cachep, node); - kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); - return -ENOMEM; - } + if (!record) + goto free_head_ref; + if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, + generic_ref->bytenr, GFP_NOFS)) + goto free_record; } init_delayed_ref_common(fs_info, node, generic_ref); @@ -1052,6 +1056,14 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, if (qrecord_inserted) return btrfs_qgroup_trace_extent_post(trans, record); return 0; + +free_record: + kfree(record); +free_head_ref: + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); +free_node: + kmem_cache_free(btrfs_delayed_ref_node_cachep, node); + return -ENOMEM; } /* @@ -1134,6 +1146,73 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt return find_ref_head(delayed_refs, bytenr, false); } +static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) +{ + int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY; + + if (type < entry->type) + return -1; + if (type > entry->type) + return 1; + + if (type == BTRFS_TREE_BLOCK_REF_KEY) { + if (root < entry->ref_root) + return -1; + if (root > entry->ref_root) + return 1; + } else { + if (parent < entry->parent) + return -1; + if (parent > entry->parent) + return 1; + } + return 0; +} + +/* + * Check to see if a given root/parent reference is attached to the head. This + * only checks for BTRFS_ADD_DELAYED_REF references that match, as that + * indicates the reference exists for the given root or parent. This is for + * tree blocks only. + * + * @head: the head of the bytenr we're searching. 
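The xa_reserve()/xa_release() pair threaded through add_delayed_ref() and add_delayed_ref_head() above is the standard xarray two-phase insert: reserve the slot while sleeping allocations are still allowed, then fill it (or hand it back) from a context that must not fail. Reduced to a self-contained sketch with illustrative names:

	#include <linux/xarray.h>

	static int demo_two_phase_insert(struct xarray *xa, unsigned long index,
					 void *entry)
	{
		int ret;

		/* Phase 1: may allocate, so do it before taking spinlocks. */
		ret = xa_reserve(xa, index, GFP_KERNEL);
		if (ret)
			return ret;

		/* Phase 2: cannot fail with -ENOMEM, the slot already exists. */
		if (entry)
			xa_store(xa, index, entry, GFP_KERNEL);
		else
			xa_release(xa, index);	/* error path: undo the reservation */
		return 0;
	}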
+ * @root: the root objectid of the reference if it is a normal reference. + * @parent: the parent if this is a shared backref. + */ +bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, + u64 root, u64 parent) +{ + struct rb_node *node; + bool found = false; + + lockdep_assert_held(&head->mutex); + + spin_lock(&head->lock); + node = head->ref_tree.rb_root.rb_node; + while (node) { + struct btrfs_delayed_ref_node *entry; + int ret; + + entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + ret = find_comp(entry, root, parent); + if (ret < 0) { + node = node->rb_left; + } else if (ret > 0) { + node = node->rb_right; + } else { + /* + * We only want to count ADD actions, as drops mean the + * ref doesn't exist. + */ + if (entry->action == BTRFS_ADD_DELAYED_REF) + found = true; + break; + } + } + spin_unlock(&head->lock); + return found; +} + void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index ef15e998be03..085f30968aba 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -202,8 +202,8 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root_cached href_root; - /* dirty extent records */ - struct rb_root dirty_extent_root; + /* Track dirty extent records. */ + struct xarray dirty_extents; /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; @@ -389,6 +389,8 @@ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); +bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, + u64 root, u64 parent); static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f638c458d285..83d5cdd77f29 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -824,22 +824,45 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev) { - u64 start = 0; - int i; + struct rb_node *node; + + /* + * The chunk mutex must be held so that no new chunks can be created + * while we are updating existing chunks. This guarantees we don't miss + * any new chunk that gets created for a range that falls before the + * range of the last chunk we processed. + */ + lockdep_assert_held(&fs_info->chunk_mutex); write_lock(&fs_info->mapping_tree_lock); - do { + node = rb_first_cached(&fs_info->mapping_tree); + while (node) { + struct rb_node *next = rb_next(node); struct btrfs_chunk_map *map; + u64 next_start; - map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX); - if (!map) - break; - for (i = 0; i < map->num_stripes; i++) + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + next_start = map->start + map->chunk_len; + + for (int i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; - start = map->start + map->chunk_len; - btrfs_free_chunk_map(map); - } while (start); + + if (cond_resched_rwlock_write(&fs_info->mapping_tree_lock)) { + map = btrfs_find_chunk_map_nolock(fs_info, next_start, U64_MAX); + if (!map) + break; + node = &map->rb_node; + /* + * Drop the lookup reference since we are holding the + * lock in write mode and no one can remove the chunk + * map from the tree and drop its tree reference. 
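find_comp() above gives the delayed-ref tree a total order (type first, then ref_root or parent), and btrfs_find_delayed_tree_ref() walks it with a plain binary descent. The generic shape of such a comparator-driven rbtree lookup, as a sketch with illustrative names:

	#include <linux/rbtree.h>
	#include <linux/types.h>

	struct demo_node {
		struct rb_node rb;
		u64 key;
	};

	static struct demo_node *demo_lookup(const struct rb_root *root, u64 key)
	{
		struct rb_node *n = root->rb_node;

		while (n) {
			struct demo_node *e = rb_entry(n, struct demo_node, rb);

			if (key < e->key)
				n = n->rb_left;		/* comparator < 0 */
			else if (key > e->key)
				n = n->rb_right;	/* comparator > 0 */
			else
				return e;		/* exact match */
		}
		return NULL;
	}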
+ */ + btrfs_free_chunk_map(map); + } else { + node = next; + } + } write_unlock(&fs_info->mapping_tree_lock); } diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 67adbe9d294a..bd38df5647e3 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -40,11 +40,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct btrfs_ordered_extent *ordered; int ret = 0; + /* Direct lock must be taken before the extent lock. */ + if (nowait) { + if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) + return -EAGAIN; + } else { + lock_dio_extent(io_tree, lockstart, lockend, cached_state); + } + while (1) { if (nowait) { if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) - return -EAGAIN; + cached_state)) { + ret = -EAGAIN; + break; + } } else { lock_extent(io_tree, lockstart, lockend, cached_state); } @@ -120,6 +130,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, cond_resched(); } + if (ret) + unlock_dio_extent(io_tree, lockstart, lockend, cached_state); return ret; } @@ -353,7 +365,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, int ret = 0; u64 len = length; const u64 data_alloc_len = length; - bool unlock_extents = false; + u32 unlock_bits = EXTENT_LOCKED; /* * We could potentially fault if we have a buffer > PAGE_SIZE, and if @@ -514,7 +526,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, start, &len, flags); if (ret < 0) goto unlock_err; - unlock_extents = true; /* Recalc len in case the new em is smaller than requested */ len = min(len, em->len - (start - em->start)); if (dio_data->data_space_reserved) { @@ -535,22 +546,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, release_offset, release_len); } - } else { - /* - * We need to unlock only the end area that we aren't using. - * The rest is going to be unlocked by the endio routine. - */ - lockstart = start + len; - if (lockstart < lockend) - unlock_extents = true; } - if (unlock_extents) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); - else - free_extent_state(cached_state); - /* * Translate extent map information to iomap. * We trim the extents (and move the addr) even though iomap code does @@ -569,11 +566,33 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->length = len; free_extent_map(em); + /* + * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed, + * writes only hold it for this part. We hold the extent lock until + * we're completely done with the extent map to make sure it remains + * valid. + */ + if (write) + unlock_bits |= EXTENT_DIO_LOCKED; + + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, &cached_state); + + /* We didn't use everything, unlock the dio extent for the remainder. */ + if (!write && (start + len) < lockend) + unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, + lockend, NULL); + return 0; unlock_err: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + /* + * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget + * to update this, be explicit that we expect EXTENT_LOCKED and + * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing. 
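The lock_extent_direct() changes above establish a strict two-level order: the new EXTENT_DIO_LOCKED range lock is taken first, and any failure to get the inner extent lock has to unwind it before returning. A condensed sketch of that control flow, using the helpers this series adds and omitting the ordered-extent retry loop:

	/* Sketch only; assumes the fs/btrfs extent-io-tree helpers. */
	static int demo_lock_range(struct extent_io_tree *tree, u64 start, u64 end,
				   struct extent_state **cached, bool nowait)
	{
		int ret = 0;

		/* Outer lock: the DIO range lock always comes first. */
		if (nowait) {
			if (!try_lock_dio_extent(tree, start, end, cached))
				return -EAGAIN;
		} else {
			lock_dio_extent(tree, start, end, cached);
		}

		/* Inner lock: the regular extent lock. */
		if (nowait) {
			if (!try_lock_extent(tree, start, end, cached))
				ret = -EAGAIN;
		} else {
			lock_extent(tree, start, end, cached);
		}

		/* Never leave with only the outer lock held. */
		if (ret)
			unlock_dio_extent(tree, start, end, cached);
		return ret;
	}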
+ */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); err: if (dio_data->data_space_reserved) { btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -596,8 +615,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, - NULL); + unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); return 0; } @@ -608,8 +627,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, btrfs_finish_ordered_extent(dio_data->ordered, NULL, pos, length, false); else - unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); + unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); ret = -ENOTBLK; } if (write) { @@ -641,8 +660,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, !bio->bi_status); } else { - unlock_extent(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); + unlock_dio_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); } bbio->bio.bi_private = bbio->private; @@ -726,7 +745,7 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, } } - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { @@ -864,13 +883,6 @@ again: if (IS_ERR_OR_NULL(dio)) { ret = PTR_ERR_OR_ZERO(dio); } else { - struct btrfs_file_private stack_private = { 0 }; - struct btrfs_file_private *private; - const bool have_private = (file->private_data != NULL); - - if (!have_private) - file->private_data = &stack_private; - /* * If we have a synchronous write, we must make sure the fsync * triggered by the iomap_dio_complete() call below doesn't @@ -879,13 +891,10 @@ again: * partial writes due to the input buffer (or parts of it) not * being already faulted in. */ - private = file->private_data; - private->fsync_skip_inode_lock = true; + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB; ret = iomap_dio_complete(dio); - private->fsync_skip_inode_lock = false; - - if (!have_private) - file->private_data = NULL; + current->journal_info = NULL; } /* No increment (+=) because iomap returns a cumulative value. */ diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 944a7340f6a4..e815d165cccc 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -68,7 +68,7 @@ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { }; static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, - struct btrfs_block_group *block_group) + const struct btrfs_block_group *block_group) { return &discard_ctl->discard_list[block_group->discard_index]; } @@ -80,7 +80,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, * * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. 
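The BTRFS_TRANS_DIO_WRITE_STUB hunk above replaces the deleted per-file fsync_skip_inode_lock flag with a per-task marker in current->journal_info, which any code nested under the call can test without extra locking or allocation. The bare idiom, sketched with an illustrative marker value:

	#include <linux/bug.h>
	#include <linux/sched.h>

	#define DEMO_MARKER ((void *)1)	/* illustrative stub value */

	static void demo_marked_call(void (*nested)(void))
	{
		/* journal_info must be free for this task before we borrow it. */
		WARN_ON_ONCE(current->journal_info != NULL);
		current->journal_info = DEMO_MARKER;
		nested();	/* callee can test current->journal_info */
		current->journal_info = NULL;
	}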
*/ -static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl) { struct btrfs_fs_info *fs_info = container_of(discard_ctl, struct btrfs_fs_info, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a6f5441e62d1..25d768e67e37 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -525,7 +525,7 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) if (folio_test_writeback(folio) || folio_test_dirty(folio)) return false; - return try_release_extent_buffer(&folio->page); + return try_release_extent_buffer(folio); } static void btree_invalidate_folio(struct folio *folio, size_t offset, @@ -1285,7 +1285,6 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); - kfree(fs_info->subpage_info); kvfree(fs_info); } @@ -3322,6 +3321,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; @@ -3346,20 +3346,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < PAGE_SIZE) { - struct btrfs_subpage_info *subpage_info; - + if (sectorsize < PAGE_SIZE) btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); - if (!subpage_info) { - ret = -ENOMEM; - goto fail_alloc; - } - btrfs_init_subpage_info(subpage_info, sectorsize); - fs_info->subpage_info = subpage_info; - } ret = btrfs_init_workqueues(fs_info); if (ret) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index c54c5d7a5cd5..6d08c100b01d 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -126,7 +126,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, * Empty an io tree, removing and freeing every extent state record from the * tree. This should be called once we are sure no other task can access the * tree anymore, so no tree updates happen after we empty the tree and there - * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never + * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never * set on any extent state when calling this function). */ void extent_io_tree_release(struct extent_io_tree *tree) @@ -141,7 +141,7 @@ void extent_io_tree_release(struct extent_io_tree *tree) rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) { /* Clear node to keep free_extent_state() happy. 
*/ RB_CLEAR_NODE(&state->rb_node); - ASSERT(!(state->state & EXTENT_LOCKED)); + ASSERT(!(state->state & EXTENT_LOCK_BITS)); /* * No need for a memory barrier here, as we are holding the tree * lock and we only change the waitqueue while holding that lock @@ -399,7 +399,7 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s */ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) { - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + if (state->state & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) return; merge_prev_state(tree, state); @@ -445,7 +445,7 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, struct rb_node *parent = NULL; const u64 start = state->start - 1; const u64 end = state->end + 1; - const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)); + const bool try_merge = !(bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)); set_state_bits(tree, state, bits, changeset); @@ -616,9 +616,6 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) * inserting elements in the tree, so the gfp mask is used to indicate which * allocations or sleeping are allowed. * - * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given - * range from the tree regardless of state (ie for truncate). - * * The range [start, end] is inclusive. * * This takes the tree lock, and returns 0 on success and < 0 on error. @@ -647,8 +644,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; - wake = (bits & EXTENT_LOCKED) ? 1 : 0; - if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0); + if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) clear = 1; again: if (!prealloc) { @@ -861,8 +858,7 @@ static void cache_state_if_flags(struct extent_state *state, static void cache_state(struct extent_state *state, struct extent_state **cached_ptr) { - return cache_state_if_flags(state, cached_ptr, - EXTENT_LOCKED | EXTENT_BOUNDARY); + return cache_state_if_flags(state, cached_ptr, EXTENT_LOCK_BITS | EXTENT_BOUNDARY); } /* @@ -1063,7 +1059,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int ret = 0; u64 last_start; u64 last_end; - u32 exclusive_bits = (bits & EXTENT_LOCKED); + u32 exclusive_bits = (bits & EXTENT_LOCK_BITS); gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); @@ -1812,12 +1808,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset) { /* - * We don't support EXTENT_LOCKED yet, as current changeset will - * record any bits changed, so for EXTENT_LOCKED case, it will - * either fail with -EEXIST or changeset will record the whole - * range. + * We don't support EXTENT_LOCK_BITS yet, as current changeset will + * record any bits changed, so for EXTENT_LOCK_BITS case, it will either + * fail with -EEXIST or changeset will record the whole range. */ - ASSERT(!(bits & EXTENT_LOCKED)); + ASSERT(!(bits & EXTENT_LOCK_BITS)); return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } @@ -1826,26 +1821,25 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset) { /* - * Don't support EXTENT_LOCKED case, same reason as + * Don't support EXTENT_LOCK_BITS case, same reason as * set_record_extent_bits(). 
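A note on the EXTENT_LOCK_BITS conversions running through this file: every site that used to test, wake, or clear EXTENT_LOCKED alone now operates on the composite mask, so a range held by either the buffered or the DIO lock behaves as locked. In miniature (bit values illustrative; the real ones come from the ENUM_BIT() enum):

	#define DEMO_EXTENT_LOCKED	(1U << 2)
	#define DEMO_EXTENT_DIO_LOCKED	(1U << 3)
	#define DEMO_EXTENT_LOCK_BITS	(DEMO_EXTENT_LOCKED | DEMO_EXTENT_DIO_LOCKED)

	/* True if either lock flavour is held on the range's state. */
	static inline int demo_range_is_locked(unsigned int state)
	{
		return (state & DEMO_EXTENT_LOCK_BITS) != 0;
	}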
*/ - ASSERT(!(bits & EXTENT_LOCKED)); + ASSERT(!(bits & EXTENT_LOCK_BITS)); return __clear_extent_bit(tree, start, end, bits, NULL, changeset); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached) { int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + err = __set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL); if (err == -EEXIST) { if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, cached); + clear_extent_bit(tree, start, failed_start - 1, bits, cached); return 0; } return 1; @@ -1855,23 +1849,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, * Either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state) +int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *failed_state = NULL; int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + err = __set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, cached_state); + bits, cached_state); - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, - &failed_state); - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, + wait_extent_bit(tree, failed_start, end, bits, &failed_state); + err = __set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 9d3a52d8f59a..6ffef1cd37c1 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -19,6 +19,7 @@ enum { ENUM_BIT(EXTENT_DIRTY), ENUM_BIT(EXTENT_UPTODATE), ENUM_BIT(EXTENT_LOCKED), + ENUM_BIT(EXTENT_DIO_LOCKED), ENUM_BIT(EXTENT_NEW), ENUM_BIT(EXTENT_DELALLOC), ENUM_BIT(EXTENT_DEFRAG), @@ -67,6 +68,8 @@ enum { EXTENT_ADD_INODE_BYTES | \ EXTENT_CLEAR_ALL_BITS) +#define EXTENT_LOCK_BITS (EXTENT_LOCKED | EXTENT_DIO_LOCKED) + /* * Redefined bits above which are used only in the device allocation tree, * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV @@ -134,12 +137,22 @@ const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tre void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); +int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); +bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); +static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) +{ + return __lock_extent(tree, start, end, EXTENT_LOCKED, cached); +} -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); +static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + 
return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached); +} int __init extent_state_init_cachep(void); void __cold extent_state_free_cachep(void); @@ -212,5 +225,22 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); +static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); +} + +static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); +} + +static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL); +} #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ff9f0d41987e..a5966324607d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5472,23 +5472,62 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 parent, int level) { + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *head; struct btrfs_path *path; struct btrfs_extent_inline_ref *iref; int ret; + bool exists = false; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - +again: ret = lookup_extent_backref(trans, path, &iref, bytenr, root->fs_info->nodesize, parent, btrfs_root_id(root), level, 0); + if (ret != -ENOENT) { + /* + * If we get 0 then we found our reference, return 1; otherwise + * return the error if it's not -ENOENT. + */ + btrfs_free_path(path); + return (ret < 0) ? ret : 1; + } + + /* + * We could have a delayed ref with this reference, so look it up while + * we're holding the path open to make sure we don't race with the + * delayed ref running. + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + if (!head) + goto out; + if (!mutex_trylock(&head->mutex)) { + /* + * We're contended, which means the delayed ref is running. Get a + * reference and wait for the ref head to be complete and then + * try again. + */ + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref_head(head); + goto again; + } + + exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent); + mutex_unlock(&head->mutex); +out: + spin_unlock(&delayed_refs->lock); btrfs_free_path(path); - if (ret == -ENOENT) - return 0; - if (ret < 0) - return ret; - return 1; + return exists ?
1 : 0; } /* @@ -6512,13 +6551,13 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) continue; ret = btrfs_trim_free_extents(device, &group_trimmed); + + trimmed += group_trimmed; if (ret) { dev_failed++; dev_ret = ret; break; } - - trimmed += group_trimmed; } mutex_unlock(&fs_devices->device_list_mutex); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aa7f8148cd0d..39c9677c47d5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -101,6 +101,13 @@ struct btrfs_bio_ctrl { blk_opf_t opf; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; + + /* + * The sectors of the page which are going to be submitted by + * extent_writepage_io(). + * This is to avoid touching ranges covered by compression/inline. + */ + unsigned long submit_bitmap; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -117,7 +124,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) btrfs_submit_compressed_read(bbio); else - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; @@ -164,11 +171,10 @@ void __cold extent_buffer_free_cachep(void) kmem_cache_destroy(extent_buffer_cache); } -static void process_one_page(struct btrfs_fs_info *fs_info, - struct page *page, const struct page *locked_page, - unsigned long page_ops, u64 start, u64 end) +static void process_one_folio(struct btrfs_fs_info *fs_info, + struct folio *folio, const struct folio *locked_folio, + unsigned long page_ops, u64 start, u64 end) { - struct folio *folio = page_folio(page); u32 len; ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); @@ -183,13 +189,13 @@ static void process_one_page(struct btrfs_fs_info *fs_info, if (page_ops & PAGE_END_WRITEBACK) btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); - if (page != locked_page && (page_ops & PAGE_UNLOCK)) + if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) btrfs_folio_end_writer_lock(fs_info, folio, start, len); } -static void __process_pages_contig(struct address_space *mapping, - const struct page *locked_page, u64 start, u64 end, - unsigned long page_ops) +static void __process_folios_contig(struct address_space *mapping, + const struct folio *locked_folio, u64 start, + u64 end, unsigned long page_ops) { struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); pgoff_t start_index = start >> PAGE_SHIFT; @@ -207,8 +213,8 @@ static void __process_pages_contig(struct address_space *mapping, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - process_one_page(fs_info, &folio->page, locked_page, - page_ops, start, end); + process_one_folio(fs_info, folio, locked_folio, + page_ops, start, end); } folio_batch_release(&fbatch); cond_resched(); @@ -216,24 +222,23 @@ static void __process_pages_contig(struct address_space *mapping, } static noinline void __unlock_for_delalloc(const struct inode *inode, - const struct page *locked_page, + const struct folio *locked_folio, u64 start, u64 end) { unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - ASSERT(locked_page); - if (index == locked_page->index && end_index == index) + ASSERT(locked_folio); + if (index == locked_folio->index && end_index == index) return; - __process_pages_contig(inode->i_mapping, locked_page, start, end, - PAGE_UNLOCK); + __process_folios_contig(inode->i_mapping, locked_folio, start, end, + PAGE_UNLOCK); } -static noinline int 
lock_delalloc_pages(struct inode *inode, - const struct page *locked_page, - u64 start, - u64 end) +static noinline int lock_delalloc_folios(struct inode *inode, + const struct folio *locked_folio, + u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; @@ -243,7 +248,7 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 processed_end = start; struct folio_batch fbatch; - if (index == locked_page->index && index == end_index) + if (index == locked_folio->index && index == end_index) return 0; folio_batch_init(&fbatch); @@ -257,23 +262,22 @@ static noinline int lock_delalloc_pages(struct inode *inode, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - struct page *page = folio_page(folio, 0); u32 len = end + 1 - start; - if (page == locked_page) + if (folio == locked_folio) continue; if (btrfs_folio_start_writer_lock(fs_info, folio, start, len)) goto out; - if (!PageDirty(page) || page->mapping != mapping) { + if (!folio_test_dirty(folio) || folio->mapping != mapping) { btrfs_folio_end_writer_lock(fs_info, folio, start, len); goto out; } - processed_end = page_offset(page) + PAGE_SIZE - 1; + processed_end = folio_pos(folio) + folio_size(folio) - 1; } folio_batch_release(&fbatch); cond_resched(); @@ -283,7 +287,8 @@ static noinline int lock_delalloc_pages(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, locked_page, start, processed_end); + __unlock_for_delalloc(inode, locked_folio, start, + processed_end); return -EAGAIN; } @@ -304,8 +309,8 @@ out: */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, - u64 *end) + struct folio *locked_folio, + u64 *start, u64 *end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -323,9 +328,9 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, /* Caller should pass a valid @end to indicate the search range end */ ASSERT(orig_end > orig_start); - /* The range should at least cover part of the page */ - ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || - orig_end <= page_offset(locked_page))); + /* The range should at least cover part of the folio */ + ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) || + orig_end <= folio_pos(locked_folio))); again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; @@ -342,25 +347,25 @@ again: } /* - * start comes from the offset of locked_page. We have to lock - * pages in order, so we can't process delalloc bytes before - * locked_page + * start comes from the offset of locked_folio. 
We have to lock + * folios in order, so we can't process delalloc bytes before + * locked_folio */ if (delalloc_start < *start) delalloc_start = *start; /* - * make sure to limit the number of pages we try to lock down + * make sure to limit the number of folios we try to lock down */ if (delalloc_end + 1 - delalloc_start > max_bytes) delalloc_end = delalloc_start + max_bytes - 1; - /* step two, lock all the pages after the page that has start */ - ret = lock_delalloc_pages(inode, locked_page, - delalloc_start, delalloc_end); + /* step two, lock all the folios after the folio that has start */ + ret = lock_delalloc_folios(inode, locked_folio, delalloc_start, + delalloc_end); ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { - /* some of the pages are gone, lets avoid looping by + /* some of the folios are gone, let's avoid looping by * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); @@ -384,8 +389,8 @@ again: unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - __unlock_for_delalloc(inode, locked_page, - delalloc_start, delalloc_end); + __unlock_for_delalloc(inode, locked_folio, delalloc_start, + delalloc_end); cond_resched(); goto again; } @@ -396,40 +401,41 @@ out_failed: } void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - const struct page *locked_page, + const struct folio *locked_folio, struct extent_state **cached, u32 clear_bits, unsigned long page_ops) { clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); - __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start, end, page_ops); + __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, + end, page_ops); } -static bool btrfs_verify_page(struct page *page, u64 start) +static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len) { - if (!fsverity_active(page->mapping->host) || - PageUptodate(page) || - start >= i_size_read(page->mapping->host)) + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + + if (!fsverity_active(folio->mapping->host) || + btrfs_folio_test_uptodate(fs_info, folio, start, len) || + start >= i_size_read(folio->mapping->host)) return true; - return fsverity_verify_page(page); + return fsverity_verify_folio(folio); } -static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) +static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); + ASSERT(folio_pos(folio) <= start && + start + len <= folio_pos(folio) + PAGE_SIZE); - if (uptodate && btrfs_verify_page(page, start)) + if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); else btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, page->mapping)) - unlock_page(page); + if (!btrfs_is_subpage(fs_info, folio->mapping)) + folio_unlock(folio); else btrfs_subpage_end_reader(fs_info, folio, start, len); } @@ -471,8 +477,8 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) "incomplete page write with offset %zu and length %zu", fi.offset, fi.length); - btrfs_finish_ordered_extent(bbio->ordered, - folio_page(folio, 0), start, len, !error); + btrfs_finish_ordered_extent(bbio->ordered, folio, start, len, + !error); if
(error) mapping_set_error(folio->mapping, error); btrfs_folio_clear_writeback(fs_info, folio, start, len); @@ -481,85 +487,14 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) bio_put(bio); } -/* - * Record previously processed extent range - * - * For endio_readpage_release_extent() to handle a full extent range, reducing - * the extent io operations. - */ -struct processed_extent { - struct btrfs_inode *inode; - /* Start of the range in @inode */ - u64 start; - /* End of the range in @inode */ - u64 end; - bool uptodate; -}; - -/* - * Try to release processed extent range - * - * May not release the extent range right now if the current range is - * contiguous to processed extent. - * - * Will release processed extent when any of @inode, @uptodate, the range is - * no longer contiguous to the processed range. - * - * Passing @inode == NULL will force processed extent to be released. - */ -static void endio_readpage_release_extent(struct processed_extent *processed, - struct btrfs_inode *inode, u64 start, u64 end, - bool uptodate) -{ - struct extent_state *cached = NULL; - struct extent_io_tree *tree; - - /* The first extent, initialize @processed */ - if (!processed->inode) - goto update; - - /* - * Contiguous to processed extent, just uptodate the end. - * - * Several things to notice: - * - * - bio can be merged as long as on-disk bytenr is contiguous - * This means we can have page belonging to other inodes, thus need to - * check if the inode still matches. - * - bvec can contain range beyond current page for multi-page bvec - * Thus we need to do processed->end + 1 >= start check - */ - if (processed->inode == inode && processed->uptodate == uptodate && - processed->end + 1 >= start && end >= processed->end) { - processed->end = end; - return; - } - - tree = &processed->inode->io_tree; - /* - * Now we don't have range contiguous to the processed range, release - * the processed range now. - */ - unlock_extent(tree, processed->start, processed->end, &cached); - -update: - /* Update processed to current range */ - processed->inode = inode; - processed->start = start; - processed->end = end; - processed->uptodate = uptodate; -} - -static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) +static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { - struct folio *folio = page_folio(page); - ASSERT(folio_test_locked(folio)); if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio)); - btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE); + btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -578,7 +513,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) { struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; - struct processed_extent processed = { 0 }; struct folio_iter fi; const u32 sectorsize = fs_info->sectorsize; @@ -642,12 +576,8 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) } /* Update page status and unlock. 
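With the processed_extent tracking deleted above, the read end-io no longer batches extent unlocks (the extent lock is now dropped in __get_extent_map() instead) and reduces to a straight bio_for_each_folio_all() walk. A condensed sketch of that shape, with subpage and fs-verity handling omitted:

	#include <linux/bio.h>
	#include <linux/pagemap.h>

	static void demo_end_read(struct bio *bio)
	{
		const bool uptodate = (bio->bi_status == BLK_STS_OK);
		struct folio_iter fi;

		bio_for_each_folio_all(fi, bio) {
			/* fi.offset and fi.length bound the completed bytes. */
			if (uptodate)
				folio_mark_uptodate(fi.folio);
			folio_unlock(fi.folio);
		}
		bio_put(bio);
	}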
- end_page_read(folio_page(folio, 0), uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, uptodate); + end_folio_read(folio, uptodate, start, len); } - /* Release the last extent */ - endio_readpage_release_extent(&processed, NULL, 0, 0, false); bio_put(bio); } @@ -737,12 +667,13 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) } static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, u64 disk_bytenr, + struct folio *folio, u64 disk_bytenr, unsigned int pg_offset) { struct bio *bio = &bio_ctrl->bbio->bio; struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + struct folio *bv_folio = page_folio(bvec->bv_page); if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* @@ -755,7 +686,7 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, /* * The contig check requires the following conditions to be met: * - * 1) The pages are belonging to the same inode + * 1) The folios belong to the same inode * This is implied by the call chain. * * 2) The range has adjacent logical bytenr @@ -764,8 +695,8 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, * This is required for the usage of btrfs_bio->file_offset. */ return bio_end_sector(bio) == sector && - page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len == - page_offset(page) + pg_offset; + folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len == + folio_pos(folio) + pg_offset; } static void alloc_new_bio(struct btrfs_inode *inode, @@ -818,17 +749,17 @@ static void alloc_new_bio(struct btrfs_inode *inode, * The mirror number for this IO should already be initialized in * @bio_ctrl->mirror_num. */ -static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, - u64 disk_bytenr, struct page *page, +static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, + u64 disk_bytenr, struct folio *folio, size_t size, unsigned long pg_offset) { - struct btrfs_inode *inode = page_to_inode(page); + struct btrfs_inode *inode = folio_to_inode(folio); ASSERT(pg_offset + size <= PAGE_SIZE); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && - !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset)) + !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset)) submit_one_bio(bio_ctrl); do { @@ -837,7 +768,7 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, /* Allocate new bio if needed */ if (!bio_ctrl->bbio) { alloc_new_bio(inode, bio_ctrl, disk_bytenr, - page_offset(page) + pg_offset); + folio_pos(folio) + pg_offset); } /* Cap to the current ordered extent boundary if there is one. */ @@ -847,21 +778,22 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, len = bio_ctrl->len_to_oe_boundary; } - if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) { + if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) { /* bio full: move on to a new one */ submit_one_bio(bio_ctrl); continue; } if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, page, len); + wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page, + len); size -= len; pg_offset += len; disk_bytenr += len; /* - * len_to_oe_boundary defaults to U32_MAX, which isn't page or + * len_to_oe_boundary defaults to U32_MAX, which isn't folio or * sector aligned. alloc_new_bio() then sets it to the end of * our ordered extent for writes into zoned devices.
* @@ -871,15 +803,15 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, * boundary is correct. * * When len_to_oe_boundary is U32_MAX, the cap above would - * result in a 4095 byte IO for the last page right before - * we hit the bio limit of UINT_MAX. bio_add_page() has all + * result in a 4095 byte IO for the last folio right before + * we hit the bio limit of UINT_MAX. bio_add_folio() has all * the checks required to make sure we don't overflow the bio, * and we should just ignore len_to_oe_boundary completely * unless we're using it to track an ordered extent. * * It's pretty hard to make a bio sized U32_MAX, but it can * happen when the page cache is able to feed us contiguous - * pages for large extents. + * folios for large extents. */ if (bio_ctrl->len_to_oe_boundary != U32_MAX) bio_ctrl->len_to_oe_boundary -= len; @@ -952,27 +884,28 @@ int set_folio_extent_mapped(struct folio *folio) return 0; } -void clear_page_extent_mapped(struct page *page) +void clear_folio_extent_mapped(struct folio *folio) { - struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info; - ASSERT(page->mapping); + ASSERT(folio->mapping); if (!folio_test_private(folio)) return; - fs_info = page_to_fs_info(page); - if (btrfs_is_subpage(fs_info, page->mapping)) + fs_info = folio_to_fs_info(folio); + if (btrfs_is_subpage(fs_info, folio->mapping)) return btrfs_detach_subpage(fs_info, folio); folio_detach_private(folio); } -static struct extent_map *__get_extent_map(struct inode *inode, struct page *page, - u64 start, u64 len, struct extent_map **em_cached) +static struct extent_map *__get_extent_map(struct inode *inode, + struct folio *folio, u64 start, + u64 len, struct extent_map **em_cached) { struct extent_map *em; + struct extent_state *cached_state = NULL; ASSERT(em_cached); @@ -988,12 +921,15 @@ static struct extent_map *__get_extent_map(struct inode *inode, struct page *pag *em_cached = NULL; } - em = btrfs_get_extent(BTRFS_I(inode), page, start, len); + btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state); + em = btrfs_get_extent(BTRFS_I(inode), folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); + return em; } /* @@ -1003,12 +939,12 @@ static struct extent_map *__get_extent_map(struct inode *inode, struct page *pag * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - u64 start = page_offset(page); + u64 start = folio_pos(folio); const u64 end = start + PAGE_SIZE - 1; u64 cur = start; u64 extent_offset; @@ -1019,25 +955,23 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, size_t pg_offset = 0; size_t iosize; size_t blocksize = fs_info->sectorsize; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_extent(tree, start, end, NULL); - unlock_page(page); + folio_unlock(folio); return ret; } - if (page->index == last_byte >> PAGE_SHIFT) { - size_t zero_offset = 
offset_in_page(last_byte); + if (folio->index == last_byte >> folio_shift(folio)) { + size_t zero_offset = offset_in_folio(folio, last_byte); if (zero_offset) { - iosize = PAGE_SIZE - zero_offset; - memzero_page(page, zero_offset, iosize); + iosize = folio_size(folio) - zero_offset; + folio_zero_range(folio, zero_offset, iosize); } } bio_ctrl->end_io_func = end_bbio_data_read; - begin_page_read(fs_info, page); + begin_folio_read(fs_info, folio); while (cur <= end) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; bool force_bio_submit = false; @@ -1045,16 +979,15 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = PAGE_SIZE - pg_offset; - memzero_page(page, pg_offset, iosize); - unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + iosize = folio_size(folio) - pg_offset; + folio_zero_range(folio, pg_offset, iosize); + end_folio_read(folio, true, cur, iosize); break; } - em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached); + em = __get_extent_map(inode, folio, cur, end - cur + 1, + em_cached); if (IS_ERR(em)) { - unlock_extent(tree, cur, end, NULL); - end_page_read(page, false, cur, end + 1 - cur); + end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); } extent_offset = cur - em->start; @@ -1079,8 +1012,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a - * single bio to populate the pages for the 2 ranges because - * this makes the compressed extent read zero out the pages + * single bio to populate the folios for the 2 ranges because + * this makes the compressed extent read zero out the folios * belonging to the 2nd range. Imagine the following scenario: * * File layout @@ -1093,13 +1026,13 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * [extent X, compressed length = 4K uncompressed length = 16K] * * If the bio to read the compressed extent covers both ranges, - * it will decompress extent X into the pages belonging to the + * it will decompress extent X into the folios belonging to the * first range and then it will stop, zeroing out the remaining - * pages that belong to the other range that points to extent X. + * folios that belong to the other range that points to extent X. * So here we make sure we submit 2 bios, one for the first * range and another one for the third range. 
Both will target * the same physical extent from disk, but we can't currently - * make the compressed bio endio callback populate the pages + * make the compressed bio endio callback populate the folios * for both ranges because each compressed bio is tightly * coupled with a single extent map, and each range can have * an extent map with a different offset value relative to the @@ -1120,18 +1053,16 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - memzero_page(page, pg_offset, iosize); + folio_zero_range(folio, pg_offset, iosize); - unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } - /* the get_extent function already copied into the page */ + /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_page_read(page, true, cur, iosize); + end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -1144,8 +1075,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, - pg_offset); + submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + pg_offset); cur = cur + iosize; pg_offset += iosize; } @@ -1155,17 +1086,11 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct btrfs_inode *inode = page_to_inode(page); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - ret = btrfs_do_readpage(page, &em_cached, &bio_ctrl, NULL); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); free_extent_map(em_cached); /* @@ -1176,28 +1101,8 @@ int btrfs_read_folio(struct file *file, struct folio *folio) return ret; } -static inline void contiguous_readpages(struct page *pages[], int nr_pages, - u64 start, u64 end, - struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - u64 *prev_em_start) -{ - struct btrfs_inode *inode = page_to_inode(pages[0]); - int index; - - ASSERT(em_cached); - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - for (index = 0; index < nr_pages; index++) { - btrfs_do_readpage(pages[index], em_cached, bio_ctrl, - prev_em_start); - put_page(pages[index]); - } -} - /* - * helper for __extent_writepage, doing all of the delayed allocation setup. + * helper for extent_writepage(), doing all of the delayed allocation setup. * * This returns 1 if btrfs_run_delalloc_range function did all the work required * to write the page (copy into inline extent). 
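The two-bio rule described in the comment above reduces to remembering which extent map the bio under construction was started from. A minimal userspace C sketch of that decision; the helper name and the PREV_EM_NONE sentinel are invented here, standing in for the force_bio_submit/prev_em_start handling in the read loop:

#include <stdbool.h>
#include <stdint.h>

#define PREV_EM_NONE ((uint64_t)-1)

/*
 * Two file ranges that point at the same compressed extent must not
 * share one bio, or decompression for the first range zeroes the folios
 * of the second.  Remember which extent map the current bio started
 * from; a different compressed extent forces a submit first.
 */
static bool must_force_submit(bool compressed, uint64_t em_start,
			      uint64_t *prev_em_start)
{
	bool force = compressed &&
		     *prev_em_start != PREV_EM_NONE &&
		     *prev_em_start != em_start;

	*prev_em_start = em_start;
	return force;
}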
In this case the IO has @@ -1207,13 +1112,14 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, * This returns < 0 if there were errors (page still locked) */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, - struct page *page, struct writeback_control *wbc) + struct folio *folio, + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); - struct folio *folio = page_folio(page); - const bool is_subpage = btrfs_is_subpage(fs_info, page->mapping); - const u64 page_start = page_offset(page); - const u64 page_end = page_start + PAGE_SIZE - 1; + struct writeback_control *wbc = bio_ctrl->wbc; + const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const u64 page_start = folio_pos(folio); + const u64 page_end = page_start + folio_size(folio) - 1; /* * Save the last found delalloc end. As the delalloc end can go beyond * page boundary, thus we cannot rely on subpage bitmap to locate the @@ -1225,10 +1131,18 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_to_write = 0; int ret = 0; - /* Lock all (subpage) delalloc ranges inside the page first. */ + /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ + if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { + ASSERT(fs_info->sectors_per_page > 1); + btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); + } else { + bio_ctrl->submit_bitmap = 1; + } + + /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; - if (!find_lock_delalloc_range(&inode->vfs_inode, page, + if (!find_lock_delalloc_range(&inode->vfs_inode, folio, &delalloc_start, &delalloc_end)) { delalloc_start = delalloc_end + 1; continue; @@ -1253,7 +1167,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, if (!is_subpage) { /* * For non-subpage case, the found delalloc range must - * cover this page and there must be only one locked + * cover this folio and there must be only one locked * delalloc range. */ found_start = page_start; @@ -1267,7 +1181,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, break; /* * The subpage range covers the last sector, the delalloc range may - * end beyond the page boundary, use the saved delalloc_end + * end beyond the folio boundary, use the saved delalloc_end * instead. */ if (found_start + found_len >= page_end) @@ -1275,7 +1189,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, if (ret >= 0) { /* No errors hit so far, run the current delalloc range. */ - ret = btrfs_run_delalloc_range(inode, page, found_start, + ret = btrfs_run_delalloc_range(inode, folio, + found_start, found_start + found_len - 1, wbc); } else { @@ -1285,30 +1200,27 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, */ unlock_extent(&inode->io_tree, found_start, found_start + found_len - 1, NULL); - __unlock_for_delalloc(&inode->vfs_inode, page, found_start, + __unlock_for_delalloc(&inode->vfs_inode, folio, + found_start, found_start + found_len - 1); } /* - * We can hit btrfs_run_delalloc_range() with >0 return value. - * - * This happens when either the IO is already done and page - * unlocked (inline) or the IO submission and page unlock would - * be handled as async (compression). - * - * Inline is only possible for regular sectorsize for now. 
- * - * Compression is possible for both subpage and regular cases, - * but even for subpage compression only happens for page aligned - * range, thus the found delalloc range must go beyond current - * page. + * We have some ranges that are going to be submitted asynchronously + * (compression or inline). These ranges have their own control + * on when to unlock the pages. We should not touch them + * anymore, so clear the range from the submission bitmap. */ - if (ret > 0) - ASSERT(!is_subpage || found_start + found_len >= page_end); - + if (ret > 0) { + unsigned int start_bit = (found_start - page_start) >> + fs_info->sectorsize_bits; + unsigned int end_bit = (min(page_end + 1, found_start + found_len) - + page_start) >> fs_info->sectorsize_bits; + bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit); + } /* - * Above btrfs_run_delalloc_range() may have unlocked the page, - * thus for the last range, we cannot touch the page anymore. + * Above btrfs_run_delalloc_range() may have unlocked the folio, + * thus for the last range, we cannot touch the folio anymore. */ if (found_start + found_len >= last_delalloc_end + 1) break; @@ -1330,10 +1242,10 @@ out: DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); /* - * If btrfs_run_dealloc_range() already started I/O and unlocked - * the pages, we just need to account for them here. + * If all ranges are submitted asynchronously, we just need to account + * for them here. */ - if (ret == 1) { + if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) { wbc->nr_to_write -= delalloc_to_write; return 1; } @@ -1351,182 +1263,148 @@ out: } /* - * Find the first byte we need to write. + * Return 0 if we have submitted or queued the sector for submission. + * Return <0 for critical errors. * - * For subpage, one page can contain several sectors, and - * __extent_writepage_io() will just grab all extent maps in the page - * range and try to submit all non-inline/non-compressed extents. - * - * This is a big problem for subpage, we shouldn't re-submit already written - * data at all. - * This function will lookup subpage dirty bit to find which range we really - * need to submit. - * - * Return the next dirty range in [@start, @end). - * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. + * Caller should make sure filepos < i_size and handle filepos >= i_size case. */ -static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info, - struct page *page, u64 *start, u64 *end) +static int submit_one_sector(struct btrfs_inode *inode, + struct folio *folio, + u64 filepos, struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size) { - struct folio *folio = page_folio(page); - struct btrfs_subpage *subpage = folio_get_private(folio); - struct btrfs_subpage_info *spi = fs_info->subpage_info; - u64 orig_start = *start; - /* Declare as unsigned long so we can use bitmap ops */ - unsigned long flags; - int range_start_bit; - int range_end_bit; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map *em; + u64 block_start; + u64 disk_bytenr; + u64 extent_offset; + u64 em_end; + const u32 sectorsize = fs_info->sectorsize; - ASSERT(IS_ALIGNED(filepos, sectorsize)); + + /* @filepos >= i_size case should be handled by the caller. 
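Stepping back to the bitmap_clear() above: the delalloc range can end past the folio, so its end has to be clamped before the byte offsets become bit indices. A self-contained userspace sketch of the same arithmetic, assuming a single-word bitmap and an invented helper name:

#include <stdint.h>

/*
 * Drop the submission bits of a delalloc range that was handed off to
 * async submission (compression/inline).  The range may extend past the
 * folio, so its end is clamped to the folio boundary first.
 */
static void clear_async_bits(unsigned long *submit_bitmap,
			     uint64_t folio_start, uint64_t folio_end,
			     uint64_t found_start, uint64_t found_len,
			     unsigned int sectorsize_bits)
{
	uint64_t clamped_end = found_start + found_len;
	unsigned int start_bit, end_bit;

	if (clamped_end > folio_end + 1)
		clamped_end = folio_end + 1;
	start_bit = (found_start - folio_start) >> sectorsize_bits;
	end_bit = (clamped_end - folio_start) >> sectorsize_bits;
	while (start_bit < end_bit)
		*submit_bitmap &= ~(1UL << start_bit++);
}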
*/ + ASSERT(filepos < i_size); - range_start_bit = spi->dirty_offset + - (offset_in_page(orig_start) >> fs_info->sectorsize_bits); + em = btrfs_get_extent(inode, NULL, filepos, sectorsize); + if (IS_ERR(em)) + return PTR_ERR_OR_ZERO(em); - /* We should have the page locked, but just in case */ - spin_lock_irqsave(&subpage->lock, flags); - bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit, - spi->dirty_offset + spi->bitmap_nr_bits); - spin_unlock_irqrestore(&subpage->lock, flags); + extent_offset = filepos - em->start; + em_end = extent_map_end(em); + ASSERT(filepos <= em_end); + ASSERT(IS_ALIGNED(em->start, sectorsize)); + ASSERT(IS_ALIGNED(em->len, sectorsize)); - range_start_bit -= spi->dirty_offset; - range_end_bit -= spi->dirty_offset; + block_start = extent_map_block_start(em); + disk_bytenr = extent_map_block_start(em) + extent_offset; - *start = page_offset(page) + range_start_bit * fs_info->sectorsize; - *end = page_offset(page) + range_end_bit * fs_info->sectorsize; + ASSERT(!extent_map_is_compressed(em)); + ASSERT(block_start != EXTENT_MAP_HOLE); + ASSERT(block_start != EXTENT_MAP_INLINE); + + free_extent_map(em); + em = NULL; + + btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1); + /* + * Above call should set the whole folio with writeback flag, even + * just for a single subpage sector. + * As long as the folio is properly locked and the range is correct, + * we should always get the folio with writeback flag. + */ + ASSERT(folio_test_writeback(folio)); + + /* + * Although the PageDirty bit is cleared before entering this + * function, subpage dirty bit is not cleared. + * So clear subpage dirty bit here so next time we won't submit + * folio for range already written to disk. + */ + btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); + submit_extent_folio(bio_ctrl, disk_bytenr, folio, + sectorsize, filepos - folio_pos(folio)); + return 0; } /* - * helper for __extent_writepage. This calls the writepage start hooks, + * Helper for extent_writepage(). This calls the writepage start hooks, * and does the loop to map the page into extents and bios. 
* * We return 1 if the IO is started and the page is unlocked, * 0 if all went well (page still locked) * < 0 if there were errors (page still locked) */ -static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, - struct page *page, u64 start, u32 len, - struct btrfs_bio_ctrl *bio_ctrl, - loff_t i_size, - int *nr_ret) +static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, + struct folio *folio, + u64 start, u32 len, + struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 cur = start; - u64 end = start + len - 1; - u64 extent_offset; - u64 block_start; - struct extent_map *em; + unsigned long range_bitmap = 0; + bool submitted_io = false; + const u64 folio_start = folio_pos(folio); + u64 cur; + int bit; int ret = 0; - int nr = 0; - ASSERT(start >= page_offset(page) && - start + len <= page_offset(page) + PAGE_SIZE); + ASSERT(start >= folio_start && + start + len <= folio_start + folio_size(folio)); - ret = btrfs_writepage_cow_fixup(page); + ret = btrfs_writepage_cow_fixup(folio); if (ret) { /* Fixup worker will requeue */ - redirty_page_for_writepage(bio_ctrl->wbc, page); - unlock_page(page); + folio_redirty_for_writepage(bio_ctrl->wbc, folio); + folio_unlock(folio); return 1; } + for (cur = start; cur < start + len; cur += fs_info->sectorsize) + set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); + bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, + fs_info->sectors_per_page); + bio_ctrl->end_io_func = end_bbio_data_write; - while (cur <= end) { - u32 len = end - cur + 1; - u64 disk_bytenr; - u64 em_end; - u64 dirty_range_start = cur; - u64 dirty_range_end; - u32 iosize; + + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { - btrfs_mark_ordered_io_finished(inode, page, cur, len, - true); + btrfs_mark_ordered_io_finished(inode, folio, cur, + start + len - cur, true); /* * This range is beyond i_size, thus we don't need to * bother writing back. * But we still need to clear the dirty subpage bit, or - * the next time the page gets dirtied, we will try to + * the next time the folio gets dirtied, we will try to * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. 
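For reference, the bitmap_and() of the saved dirty bitmap against the requested byte range can be restated in a few lines of userspace C. The single-word bitmap and the helper name are assumptions of this sketch, not kernel API:

#include <stdint.h>

/*
 * Narrow a folio's dirty-sector bitmap to the [start, start + len)
 * byte range that the writepage_io loop was asked to write.
 */
static unsigned long restrict_submit_bitmap(unsigned long submit_bitmap,
					    uint64_t folio_start,
					    uint64_t start, uint32_t len,
					    unsigned int sectorsize_bits)
{
	unsigned long range_bitmap = 0;
	uint64_t cur;

	for (cur = start; cur < start + len; cur += 1ULL << sectorsize_bits)
		range_bitmap |= 1UL << ((cur - folio_start) >> sectorsize_bits);
	return submit_bitmap & range_bitmap;
}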
*/ - btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len); + btrfs_folio_clear_dirty(fs_info, folio, cur, + start + len - cur); break; } - - find_next_dirty_byte(fs_info, page, &dirty_range_start, - &dirty_range_end); - if (cur < dirty_range_start) { - cur = dirty_range_start; - continue; - } - - em = btrfs_get_extent(inode, NULL, cur, len); - if (IS_ERR(em)) { - ret = PTR_ERR_OR_ZERO(em); - goto out_error; - } - - extent_offset = cur - em->start; - em_end = extent_map_end(em); - ASSERT(cur <= em_end); - ASSERT(cur < end); - ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); - ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); - - block_start = extent_map_block_start(em); - disk_bytenr = extent_map_block_start(em) + extent_offset; - - ASSERT(!extent_map_is_compressed(em)); - ASSERT(block_start != EXTENT_MAP_HOLE); - ASSERT(block_start != EXTENT_MAP_INLINE); - - /* - * Note that em_end from extent_map_end() and dirty_range_end from - * find_next_dirty_byte() are all exclusive - */ - iosize = min(min(em_end, end + 1), dirty_range_end) - cur; - free_extent_map(em); - em = NULL; - - btrfs_set_range_writeback(inode, cur, cur + iosize - 1); - if (!PageWriteback(page)) { - btrfs_err(inode->root->fs_info, - "page %lu not writeback, cur %llu end %llu", - page->index, cur, end); - } - - /* - * Although the PageDirty bit is cleared before entering this - * function, subpage dirty bit is not cleared. - * So clear subpage dirty bit here so next time we won't submit - * page for range already written to disk. - */ - btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize); - - submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, - cur - page_offset(page)); - cur += iosize; - nr++; + ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); + if (ret < 0) + goto out; + submitted_io = true; } - btrfs_folio_assert_not_dirty(fs_info, page_folio(page), start, len); - *nr_ret = nr; - return 0; - -out_error: + btrfs_folio_assert_not_dirty(fs_info, folio, start, len); +out: /* - * If we finish without problem, we should not only clear page dirty, - * but also empty subpage dirty bits + * If we didn't submit any sector (>= i_size), the folio dirty flag gets + * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared + * by folio_start_writeback() if the folio is not dirty). + * + * Here we set and then clear writeback for the range. If the full folio + * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. */ - *nr_ret = nr; + if (!submitted_io) { + btrfs_folio_set_writeback(fs_info, folio, start, len); + btrfs_folio_clear_writeback(fs_info, folio, start, len); + } return ret; } @@ -1539,62 +1417,65 @@ out_error: * Return 0 if everything goes well. * Return <0 for error. 
*/ -static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) +static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { - struct folio *folio = page_folio(page); - struct inode *inode = page->mapping->host; - const u64 page_start = page_offset(page); + struct inode *inode = folio->mapping->host; + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + const u64 page_start = folio_pos(folio); int ret; - int nr = 0; size_t pg_offset; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - trace___extent_writepage(page, inode, bio_ctrl->wbc); + trace_extent_writepage(folio, inode, bio_ctrl->wbc); - WARN_ON(!PageLocked(page)); + WARN_ON(!folio_test_locked(folio)); - pg_offset = offset_in_page(i_size); - if (page->index > end_index || - (page->index == end_index && !pg_offset)) { + pg_offset = offset_in_folio(folio, i_size); + if (folio->index > end_index || - (folio->index == end_index && !pg_offset)) { folio_invalidate(folio, 0, folio_size(folio)); folio_unlock(folio); return 0; } - if (page->index == end_index) - memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); + if (folio->index == end_index) + folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); - ret = set_page_extent_mapped(page); + /* + * Default to unlock the whole folio. + * The proper bitmap can only be initialized by writepage_delalloc(). + */ + bio_ctrl->submit_bitmap = (unsigned long)-1; + ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); + ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl); if (ret == 1) return 0; if (ret) goto done; - ret = __extent_writepage_io(BTRFS_I(inode), page, page_offset(page), - PAGE_SIZE, bio_ctrl, i_size, &nr); + ret = extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), + PAGE_SIZE, bio_ctrl, i_size); if (ret == 1) return 0; bio_ctrl->wbc->nr_to_write--; done: - if (nr == 0) { - /* make sure the mapping tag for page dirty gets cleared */ - set_page_writeback(page); - end_page_writeback(page); - } if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start, - PAGE_SIZE, !ret); - mapping_set_error(page->mapping, ret); + btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, + page_start, PAGE_SIZE, !ret); + mapping_set_error(folio->mapping, ret); } - btrfs_folio_end_all_writers(inode_to_fs_info(inode), folio); + /* + * Only unlock ranges that are submitted, as there can be some async- + * submitted ranges inside the folio. + */ + btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); ASSERT(ret <= 0); return ret; } @@ -1846,7 +1727,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_unlock(folio); } } - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); } /* @@ -1863,17 +1744,16 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * Return >=0 for the number of submitted extent buffers. * Return <0 for fatal error. 
*/ -static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) +static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); int submitted = 0; - u64 page_start = page_offset(page); + u64 folio_start = folio_pos(folio); int bit_start = 0; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; /* Lock and write each dirty extent buffers in the range */ - while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { + while (bit_start < fs_info->sectors_per_page) { struct btrfs_subpage *subpage = folio_get_private(folio); struct extent_buffer *eb; unsigned long flags; @@ -1883,21 +1763,21 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * Take private lock to ensure the subpage won't be detached * in the meantime. */ - spin_lock(&page->mapping->i_private_lock); + spin_lock(&folio->mapping->i_private_lock); if (!folio_test_private(folio)) { - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); break; } spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, + if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); bit_start++; continue; } - start = page_start + bit_start * fs_info->sectorsize; + start = folio_start + bit_start * fs_info->sectorsize; bit_start += sectors_per_node; /* @@ -1906,7 +1786,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) */ eb = find_extent_buffer_nolock(fs_info, start); spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); /* * The eb has already reached 0 refs thus find_extent_buffer() @@ -1945,19 +1825,18 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) +static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx) { struct writeback_control *wbc = ctx->wbc; - struct address_space *mapping = page->mapping; - struct folio *folio = page_folio(page); + struct address_space *mapping = folio->mapping; struct extent_buffer *eb; int ret; if (!folio_test_private(folio)) return 0; - if (page_to_fs_info(page)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc); + if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + return submit_eb_subpage(folio, wbc); spin_lock(&mapping->i_private_lock); if (!folio_test_private(folio)) { @@ -2055,7 +1934,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, &ctx); + ret = submit_eb_page(folio, &ctx); if (ret == 0) continue; if (ret < 0) { @@ -2109,7 +1988,7 @@ retry: * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. * - * We can get ret > 0 from submit_extent_page() indicating how many ebs + * We can get ret > 0 from submit_extent_folio() indicating how many ebs * were submitted. Reset it to 0 to avoid false alerts for the caller. 
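The new bit index above (bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page) implies a packed per-type layout for the subpage bitmap now that btrfs_subpage_info and its per-type offsets are gone. A sketch of that addressing; the enum values here are illustrative stand-ins, not the kernel's definitions:

/* Illustrative packed layout: each bitmap type owns sectors_per_page
 * consecutive bits, so a type's bit for sector i sits at
 * type_index * sectors_per_page + i. */
enum bitmap_type { BITMAP_NR_UPTODATE, BITMAP_NR_DIRTY, BITMAP_NR_WRITEBACK };

static unsigned int subpage_bit_nr(enum bitmap_type type,
				   unsigned int sectors_per_page,
				   unsigned int sector_in_page)
{
	return (unsigned int)type * sectors_per_page + sector_in_page;
}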
*/ if (ret > 0) @@ -2248,7 +2127,7 @@ retry: continue; } - ret = __extent_writepage(&folio->page, bio_ctrl); + ret = extent_writepage(folio, bio_ctrl); if (ret < 0) { done = 1; break; @@ -2295,7 +2174,7 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. */ -void extent_write_locked_range(struct inode *inode, const struct page *locked_page, +void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { @@ -2319,37 +2198,46 @@ void extent_write_locked_range(struct inode *inode, const struct page *locked_pa while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); u32 cur_len = cur_end + 1 - cur; - struct page *page; - int nr = 0; + struct folio *folio; - page = find_get_page(mapping, cur >> PAGE_SHIFT); - ASSERT(PageLocked(page)); - if (pages_dirty && page != locked_page) - ASSERT(PageDirty(page)); + folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); - ret = __extent_writepage_io(BTRFS_I(inode), page, cur, cur_len, - &bio_ctrl, i_size, &nr); + /* + * This shouldn't happen, the pages are pinned and locked, this + * code is just in case, but shouldn't actually be run. + */ + if (IS_ERR(folio)) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, + cur, cur_len, false); + mapping_set_error(mapping, PTR_ERR(folio)); + cur = cur_end + 1; + continue; + } + + ASSERT(folio_test_locked(folio)); + if (pages_dirty && folio != locked_folio) + ASSERT(folio_test_dirty(folio)); + + /* + * Set the submission bitmap to submit all sectors. + * extent_writepage_io() will do the truncation correctly. + */ + bio_ctrl.submit_bitmap = (unsigned long)-1; + ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, + &bio_ctrl, i_size); if (ret == 1) goto next_page; - /* Make sure the mapping tag for page dirty gets cleared. */ - if (nr == 0) { - struct folio *folio; - - folio = page_folio(page); - btrfs_folio_set_writeback(fs_info, folio, cur, cur_len); - btrfs_folio_clear_writeback(fs_info, folio, cur, cur_len); - } if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, + btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, cur, cur_len, !ret); - mapping_set_error(page->mapping, ret); + mapping_set_error(mapping, ret); } - btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len); + btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: - put_page(page); + folio_put(folio); cur = cur_end + 1; } @@ -2379,18 +2267,12 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; - struct page *pagepool[16]; + struct folio *folio; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - int nr; - - while ((nr = readahead_page_batch(rac, pagepool))) { - u64 contig_start = readahead_pos(rac); - u64 contig_end = contig_start + readahead_batch_length(rac) - 1; - contiguous_readpages(pagepool, nr, contig_start, contig_end, - &em_cached, &bio_ctrl, &prev_em_start); - } + while ((folio = readahead_folio(rac)) != NULL) + btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); if (em_cached) free_extent_map(em_cached); @@ -2435,9 +2317,9 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * to drop the page. 
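The cur/cur_end stepping in extent_write_locked_range() is a standard clamp-to-page walk over a byte range with inclusive end. A hedged userspace restatement, assuming fixed 4 KiB pages and an invented callback name:

#include <stdint.h>

#define SKETCH_PAGE_SIZE 4096ULL	/* assumption: fixed 4 KiB pages */

/* Visit [start, end] one page-sized chunk at a time, clamping the last
 * chunk to the end of the range. */
static void for_each_page_chunk(uint64_t start, uint64_t end,
				void (*fn)(uint64_t cur, uint32_t cur_len))
{
	uint64_t cur = start;

	while (cur <= end) {
		uint64_t cur_end = (cur / SKETCH_PAGE_SIZE) * SKETCH_PAGE_SIZE +
				   SKETCH_PAGE_SIZE - 1;

		if (cur_end > end)
			cur_end = end;
		fn(cur, (uint32_t)(cur_end + 1 - cur));
		cur = cur_end + 1;
	}
}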
*/ static bool try_release_extent_state(struct extent_io_tree *tree, - struct page *page, gfp_t mask) + struct folio *folio, gfp_t mask) { - u64 start = page_offset(page); + u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; bool ret; @@ -2473,11 +2355,11 @@ static bool try_release_extent_state(struct extent_io_tree *tree, * in the range corresponding to the page, both state records and extent * map records are removed */ -bool try_release_extent_mapping(struct page *page, gfp_t mask) +bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { - u64 start = page_offset(page); + u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; - struct btrfs_inode *inode = page_to_inode(page); + struct btrfs_inode *inode = folio_to_inode(folio); struct extent_io_tree *io_tree = &inode->io_tree; while (start <= end) { @@ -2546,7 +2428,7 @@ next: cond_resched(); } } - return try_release_extent_state(io_tree, page, mask); + return try_release_extent_state(io_tree, folio, mask); } static void __free_extent_buffer(struct extent_buffer *eb) @@ -2572,7 +2454,7 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli return true; /* * Even there is no eb refs here, we may still have - * end_page_read() call relying on page::private. + * end_folio_read() call relying on page::private. */ if (atomic_read(&subpage->readers)) return true; @@ -3615,7 +3497,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, ASSERT(ret); } } - btrfs_submit_bio(bbio, mirror_num); + btrfs_submit_bbio(bbio, mirror_num); done: if (wait == WAIT_COMPLETE) { @@ -4171,17 +4053,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst, #define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( - const struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) + const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) { struct extent_buffer *gang[GANG_LOOKUP_SIZE]; struct extent_buffer *found = NULL; - u64 page_start = page_offset(page); - u64 cur = page_start; + u64 folio_start = folio_pos(folio); + u64 cur = folio_start; - ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); + ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - while (cur < page_start + PAGE_SIZE) { + while (cur < folio_start + PAGE_SIZE) { int ret; int i; @@ -4193,7 +4075,7 @@ static struct extent_buffer *get_next_extent_buffer( goto out; for (i = 0; i < ret; i++) { /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) + if (gang[i]->start >= folio_start + PAGE_SIZE) goto out; /* Found one */ if (gang[i]->start >= bytenr) { @@ -4207,11 +4089,11 @@ out: return found; } -static int try_release_subpage_extent_buffer(struct page *page) +static int try_release_subpage_extent_buffer(struct folio *folio) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - u64 cur = page_offset(page); - const u64 end = page_offset(page) + PAGE_SIZE; + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + u64 cur = folio_pos(folio); + const u64 end = cur + PAGE_SIZE; int ret; while (cur < end) { @@ -4226,7 +4108,7 @@ static int try_release_subpage_extent_buffer(struct page *page) * with spinlock rather than RCU. 
*/ spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, page, cur); + eb = get_next_extent_buffer(fs_info, folio, cur); if (!eb) { /* No more eb in the page range after or at cur */ spin_unlock(&fs_info->buffer_lock); @@ -4267,31 +4149,30 @@ static int try_release_subpage_extent_buffer(struct page *page) * Finally to check if we have cleared folio private, as if we have * released all ebs in the page, the folio private should be cleared now. */ - spin_lock(&page->mapping->i_private_lock); - if (!folio_test_private(page_folio(page))) + spin_lock(&folio->mapping->i_private_lock); + if (!folio_test_private(folio)) ret = 1; else ret = 0; - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return ret; } -int try_release_extent_buffer(struct page *page) +int try_release_extent_buffer(struct folio *folio) { - struct folio *folio = page_folio(page); struct extent_buffer *eb; - if (page_to_fs_info(page)->nodesize < PAGE_SIZE) - return try_release_subpage_extent_buffer(page); + if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + return try_release_subpage_extent_buffer(folio); /* * We need to make sure nobody is changing folio private, as we rely on * folio private as the pointer to extent buffer. */ - spin_lock(&page->mapping->i_private_lock); + spin_lock(&folio->mapping->i_private_lock); if (!folio_test_private(folio)) { - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return 1; } @@ -4306,10 +4187,10 @@ int try_release_extent_buffer(struct page *page) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return 0; } - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); /* * If tree ref isn't set then we know the ref on this eb is a real ref, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index dceebd76c7d1..8a36117ed453 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -236,11 +236,11 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -bool try_release_extent_mapping(struct page *page, gfp_t mask); -int try_release_extent_buffer(struct page *page); +bool try_release_extent_mapping(struct folio *folio, gfp_t mask); +int try_release_extent_buffer(struct folio *folio); int btrfs_read_folio(struct file *file, struct folio *folio); -void extent_write_locked_range(struct inode *inode, const struct page *locked_page, +void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); @@ -249,7 +249,7 @@ int btree_write_cache_pages(struct address_space *mapping, void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); int set_page_extent_mapped(struct page *page); -void clear_page_extent_mapped(struct page *page); +void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); @@ -354,7 +354,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); void extent_clear_unlock_delalloc(struct 
btrfs_inode *inode, u64 start, u64 end, - const struct page *locked_page, + const struct folio *locked_folio, struct extent_state **cached, u32 bits_to_clear, unsigned long page_ops); int extent_invalidate_folio(struct extent_io_tree *tree, @@ -368,7 +368,7 @@ int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, + struct folio *locked_folio, u64 *start, u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 23b65dc73c00..25d191f1ac10 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -192,10 +192,13 @@ static inline u64 extent_map_block_len(const struct extent_map *em) static inline u64 extent_map_block_end(const struct extent_map *em) { - if (extent_map_block_start(em) + extent_map_block_len(em) < - extent_map_block_start(em)) + const u64 block_start = extent_map_block_start(em); + const u64 block_end = block_start + extent_map_block_len(em); + + if (block_end < block_start) return (u64)-1; - return extent_map_block_start(em) + extent_map_block_len(em); + + return block_end; } static bool can_merge_extent_map(const struct extent_map *em) @@ -1147,8 +1150,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c return 0; /* - * We want to be fast because we can be called from any path trying to - * allocate memory, so if the lock is busy we don't want to spend time + * We want to be fast so if the lock is busy we don't want to spend time * waiting for it - either some task is about to do IO for the inode or * we may have another task shrinking extent maps, here in this code, so * skip this inode. @@ -1191,9 +1193,7 @@ next: /* * Stop if we need to reschedule or there's contention on the * lock. This is to avoid slowing other tasks trying to take the - * lock and because the shrinker might be called during a memory - * allocation path and we want to avoid taking a very long time - * and slowing down all sorts of tasks. + * lock. */ if (need_resched() || rwlock_needbreak(&tree->lock)) break; @@ -1222,12 +1222,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx if (ctx->scanned >= ctx->nr_to_scan) break; - /* - * We may be called from memory allocation paths, so we don't - * want to take too much time and slowdown tasks. - */ - if (need_resched()) - break; + cond_resched(); inode = btrfs_find_first_inode(root, min_ino); } @@ -1285,14 +1280,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) ctx.last_ino); } - /* - * We may be called from memory allocation paths, so we don't want to - * take too much time and slowdown tasks, so stop if we need reschedule. 
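The rewritten extent_map_block_end() above is a classic unsigned-overflow clamp; the same check in isolation, as a userspace sketch:

#include <stdint.h>

/* Overflow-safe end of extent: clamp to (u64)-1 when start + len wraps. */
static uint64_t sketch_block_end(uint64_t block_start, uint64_t block_len)
{
	const uint64_t end = block_start + block_len;

	return (end < block_start) ? (uint64_t)-1 : end;
}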
- */ - while (ctx.scanned < ctx.nr_to_scan && !need_resched()) { + while (ctx.scanned < ctx.nr_to_scan) { struct btrfs_root *root; unsigned long count; + cond_resched(); + spin_lock(&fs_info->fs_roots_radix_lock); count = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)&root, diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index 8f95f3e44e99..df7f09f3b02e 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -637,7 +637,7 @@ static int extent_fiemap(struct btrfs_inode *inode, struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; - u64 last_extent_end; + u64 last_extent_end = 0; u64 prev_extent_end; u64 range_start; u64 range_end; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 5c342fe1af61..886749b39672 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -151,7 +151,7 @@ static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info) * Calculate the total size needed to allocate for an ordered sum structure * spanning @bytes in the file. */ -static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) +static int btrfs_ordered_sum_size(const struct btrfs_fs_info *fs_info, unsigned long bytes) { return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); } @@ -1272,7 +1272,7 @@ out: void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, + const struct btrfs_file_extent_item *fi, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 557dc43d7142..0e13661a71f3 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -74,7 +74,7 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, + const struct btrfs_file_extent_item *fi, struct extent_map *em); int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9f10a9f23fcc..c5e36f58eb07 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1603,7 +1603,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct btrfs_file_private *private = file->private_data; struct dentry *dentry = file_dentry(file); struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_root *root = inode->root; @@ -1613,7 +1612,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; u64 len; bool full_sync; - const bool skip_ilock = (private ? 
private->fsync_skip_inode_lock : false); + bool skip_ilock = false; + + if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { + skip_ilock = true; + current->journal_info = NULL; + btrfs_assert_inode_locked(inode); + } trace_btrfs_sync_file(file, datasync); @@ -1868,7 +1873,10 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } @@ -1912,8 +1920,8 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; + page_start = folio_pos(folio); + page_end = page_start + folio_size(folio) - 1; end = page_end; /* @@ -1941,18 +1949,18 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; again: down_read(&BTRFS_I(inode)->i_mmap_lock); - lock_page(page); + folio_lock(folio); size = i_size_read(inode); - if ((page->mapping != inode->i_mapping) || + if ((folio->mapping != inode->i_mapping) || (page_start >= size)) { /* Page got truncated out from underneath us. */ goto out_unlock; } - wait_on_page_writeback(page); + folio_wait_writeback(folio); lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_page_extent_mapped(page); + ret2 = set_folio_extent_mapped(folio); if (ret2 < 0) { ret = vmf_error(ret2); unlock_extent(io_tree, page_start, page_end, &cached_state); @@ -1966,14 +1974,14 @@ again: ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); if (ordered) { unlock_extent(io_tree, page_start, page_end, &cached_state); - unlock_page(page); + folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } - if (page->index == ((size - 1) >> PAGE_SHIFT)) { + if (folio->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; @@ -2003,13 +2011,13 @@ again: } /* Page is wholly or partially inside EOF. 
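The skip_ilock handshake earlier in btrfs_sync_file() uses current->journal_info as a one-shot flag set by the DIO write path. A userspace sketch of the idea; BTRFS_TRANS_DIO_WRITE_STUB is the sentinel named in the diff, while the thread-local and the helper here are stand-ins:

#include <stdbool.h>
#include <stddef.h>

#define DIO_WRITE_STUB ((void *)1)	/* mirrors BTRFS_TRANS_DIO_WRITE_STUB */

static _Thread_local void *journal_info;	/* stand-in for current->journal_info */

/* One-shot: the flag must be consumed so it cannot leak into a later
 * transaction started by the same task. */
static bool consume_dio_write_stub(void)
{
	if (journal_info == DIO_WRITE_STUB) {
		journal_info = NULL;
		return true;
	}
	return false;
}

Note the matching unlock in the error path: when the flag was consumed, only the mmap lock is released (up_write(&inode->i_mmap_lock)), since the caller still owns the full inode lock.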
*/ - if (page_start + PAGE_SIZE > size) - zero_start = offset_in_page(size); + if (page_start + folio_size(folio) > size) + zero_start = offset_in_folio(folio, size); else zero_start = PAGE_SIZE; if (zero_start != PAGE_SIZE) - memzero_page(page, zero_start, PAGE_SIZE - zero_start); + folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); @@ -2026,7 +2034,7 @@ again: return VM_FAULT_LOCKED; out_unlock: - unlock_page(page); + folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f5996a43db24..eaa1dbd31352 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2697,15 +2697,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; int bg_reclaim_threshold = 0; - bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); + bool initial; u64 reclaimable_unusable; - WARN_ON(!initial && offset + size > block_group->zone_capacity); + spin_lock(&block_group->lock); + initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); + WARN_ON(!initial && offset + size > block_group->zone_capacity); if (!initial) bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); - spin_lock(&ctl->tree_lock); if (!used) to_free = size; else if (initial) @@ -2718,7 +2719,9 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, to_free = offset + size - block_group->alloc_offset; to_unusable = size - to_free; + spin_lock(&ctl->tree_lock); ctl->free_space += to_free; + spin_unlock(&ctl->tree_lock); /* * If the block group is read-only, we should account freed space into * bytes_readonly. @@ -2727,11 +2730,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, block_group->zone_unusable += to_unusable; WARN_ON(block_group->zone_unusable > block_group->length); } - spin_unlock(&ctl->tree_lock); if (!used) { - spin_lock(&block_group->lock); block_group->alloc_offset -= size; - spin_unlock(&block_group->lock); } reclaimable_unusable = block_group->zone_unusable - @@ -2745,6 +2745,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, btrfs_mark_bg_to_reclaim(block_group); } + spin_unlock(&block_group->lock); + return 0; } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 3d6d4b503220..79f64e383edd 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -703,8 +703,8 @@ struct btrfs_fs_info { * running. 
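The reordered locking in __btrfs_add_free_space_zoned() above keeps the whole computation under block_group->lock and nests ctl->tree_lock inside it only for the single free_space update. A userspace sketch of that nesting, with pthread mutexes and file-scope fields standing in for the kernel spinlocks and structures:

#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t bg_lock = PTHREAD_MUTEX_INITIALIZER;   /* block_group->lock */
static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; /* ctl->tree_lock */

static uint64_t free_space;	/* protected by tree_lock */
static uint64_t zone_unusable;	/* protected by bg_lock */

/* bg_lock is held for the whole computation so the block group fields
 * are read consistently; tree_lock nests inside it for one update. */
static void account_zoned_free(uint64_t to_free, uint64_t to_unusable)
{
	pthread_mutex_lock(&bg_lock);
	pthread_mutex_lock(&tree_lock);
	free_space += to_free;
	pthread_mutex_unlock(&tree_lock);
	zone_unusable += to_unusable;
	pthread_mutex_unlock(&bg_lock);
}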
*/ refcount_t scrub_workers_refcnt; + u32 sectors_per_page; struct workqueue_struct *scrub_workers; - struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 316756ff08ac..29572dfaf878 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -14,7 +14,7 @@ #include "extent-tree.h" #include "file-item.h" -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, +struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name) { @@ -42,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, } struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name) { struct btrfs_inode_extref *extref; @@ -423,9 +423,9 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } -static inline void btrfs_trace_truncate(struct btrfs_inode *inode, - struct extent_buffer *leaf, - struct btrfs_file_extent_item *fi, +static inline void btrfs_trace_truncate(const struct btrfs_inode *inode, + const struct extent_buffer *leaf, + const struct btrfs_file_extent_item *fi, u64 offset, int extent_type, int slot) { if (!inode) diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index c4aded82709b..c11b97fdccc4 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -109,11 +109,11 @@ struct btrfs_inode_extref *btrfs_lookup_inode_extref( u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, +struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 19d05a4c5c33..edac499fd83d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -116,7 +116,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); @@ -393,17 +393,17 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). 
*/ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; u64 page_start = 0, page_end = 0; - struct page *page; + struct folio *folio; - if (locked_page) { - page_start = page_offset(locked_page); - page_end = page_start + PAGE_SIZE - 1; + if (locked_folio) { + page_start = folio_pos(locked_folio); + page_end = page_start + folio_size(locked_folio) - 1; } while (index <= end_index) { @@ -417,13 +417,13 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. */ - if (locked_page && index == (page_start >> PAGE_SHIFT)) { + if (locked_folio && index == (page_start >> PAGE_SHIFT)) { index++; continue; } - page = find_get_page(inode->vfs_inode.i_mapping, index); + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); index++; - if (!page) + if (IS_ERR(folio)) continue; /* @@ -431,14 +431,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ - btrfs_folio_clamp_clear_ordered(inode->root->fs_info, - page_folio(page), offset, bytes); - put_page(page); + btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio, + offset, bytes); + folio_put(folio); } - if (locked_page) { + if (locked_folio) { /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + PAGE_SIZE) + if (bytes + offset <= page_start + folio_size(locked_folio)) return; /* * In case this page belongs to the delalloc range being @@ -447,8 +447,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; + bytes = offset + bytes - folio_pos(locked_folio) - + folio_size(locked_folio); + offset = folio_pos(locked_folio) + folio_size(locked_folio); } } @@ -494,7 +495,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct page *page = NULL; const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; @@ -554,12 +554,16 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { - page = find_get_page(inode->vfs_inode.i_mapping, 0); + struct folio *folio; + + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, + 0, 0, 0); + ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); write_extent_buffer(leaf, kaddr, ptr, size); kunmap_local(kaddr); - put_page(page); + folio_put(folio); } btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); @@ -715,7 +719,7 @@ out: } static noinline int cow_file_range_inline(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, u64 offset, u64 end, size_t compressed_size, int compress_type, @@ -740,13 +744,26 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, return ret; } + /* + * In the successful case 
(ret == 0 here), cow_file_range will return 1. + * + * Quite a bit further up the callstack in extent_writepage(), ret == 1 + * is treated as a short circuited success and does not unlock the folio, + * so we must do it here. + * + * In the failure case, the locked_folio does get unlocked by + * btrfs_folio_end_all_writers, which asserts that it is still locked + * at that point, so we must *not* unlock it here. + * + * The other two callsites in compress_file_range do not have a + * locked_folio, so they are not relevant to this logic. + */ if (ret == 0) - locked_page = NULL; + locked_folio = NULL; - extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached, - clear_flags, - PAGE_UNLOCK | PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); + extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached, + clear_flags, PAGE_UNLOCK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return ret; } @@ -762,7 +779,7 @@ struct async_extent { struct async_chunk { struct btrfs_inode *inode; - struct page *locked_page; + struct folio *locked_folio; u64 start; u64 end; blk_opf_t write_flags; @@ -868,25 +885,25 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode, small_write); + btrfs_add_inode_defrag(inode, small_write); } static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; int ret = 0; for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - page = find_get_page(inode->i_mapping, index); - if (unlikely(!page)) { + folio = __filemap_get_folio(inode->i_mapping, index, 0, 0); + if (IS_ERR(folio)) { if (!ret) - ret = -ENOENT; + ret = PTR_ERR(folio); continue; } - clear_page_dirty_for_io(page); - put_page(page); + folio_clear_dirty_for_io(folio); + folio_put(folio); } return ret; } @@ -1122,7 +1139,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) static void submit_uncompressed_range(struct btrfs_inode *inode, struct async_extent *async_extent, - struct page *locked_page) + struct folio *locked_folio) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1135,20 +1152,22 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + ret = run_delalloc_cow(inode, locked_folio, start, end, + &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); - if (locked_page) { - const u64 page_start = page_offset(locked_page); - - set_page_writeback(locked_page); - end_page_writeback(locked_page); - btrfs_mark_ordered_io_finished(inode, locked_page, + btrfs_cleanup_ordered_extents(inode, locked_folio, + start, end - start + 1); + if (locked_folio) { + const u64 page_start = folio_pos(locked_folio); + + folio_start_writeback(locked_folio); + folio_end_writeback(locked_folio); + btrfs_mark_ordered_io_finished(inode, locked_folio, page_start, PAGE_SIZE, !ret); - mapping_set_error(locked_page->mapping, ret); - unlock_page(locked_page); + mapping_set_error(locked_folio->mapping, ret); + folio_unlock(locked_folio); } } } @@ -1164,7 +1183,7 @@ static void submit_one_async_extent(struct async_chunk 
*async_chunk, struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; struct btrfs_key ins; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; @@ -1175,19 +1194,20 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, kthread_associate_blkcg(async_chunk->blkcg_css); /* - * If async_chunk->locked_page is in the async_extent range, we need to + * If async_chunk->locked_folio is in the async_extent range, we need to * handle it. */ - if (async_chunk->locked_page) { - u64 locked_page_start = page_offset(async_chunk->locked_page); - u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; + if (async_chunk->locked_folio) { + u64 locked_folio_start = folio_pos(async_chunk->locked_folio); + u64 locked_folio_end = locked_folio_start + + folio_size(async_chunk->locked_folio) - 1; - if (!(start >= locked_page_end || end <= locked_page_start)) - locked_page = async_chunk->locked_page; + if (!(start >= locked_folio_end || end <= locked_folio_start)) + locked_folio = async_chunk->locked_folio; } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, locked_folio); goto done; } @@ -1202,7 +1222,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed. */ - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, locked_folio); goto done; } @@ -1306,21 +1326,21 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. * - * locked_page is the page that writepage had locked already. We use + * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * When this function fails, it unlocks all pages except @locked_page. + * When this function fails, it unlocks all pages except @locked_folio. * * When this function successfully creates an inline extent, it returns 1 and - * unlocks all pages including locked_page and starts I/O on them. - * (In reality inline extents are limited to a single page, so locked_page is + * unlocks all pages including locked_folio and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_folio is * the only page handled anyway). * * When this function succeed and creates a normal extent, the page locking * status depends on the passed in flags: * * - If @keep_locked is set, all pages are kept locked. - * - Else all pages except for @locked_page are unlocked. + * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept @@ -1329,8 +1349,8 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * example. 
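
A minimal sketch of how a caller might act on the contract above (hypothetical caller; only the documented return values and locking rules are assumed):

    static int example_run_cow(struct btrfs_inode *inode, struct folio *folio,
                               u64 start, u64 end)
    {
        /* normal writeback: don't keep folios locked, allow inline extents */
        int ret = cow_file_range(inode, folio, start, end, NULL, false, false);

        if (ret == 1)
            return 0;    /* inline extent: every folio unlocked, IO started */
        if (ret < 0)
            return ret;  /* all folios except @folio already unlocked */
        /* ret == 0: ordered extents created, @folio still locked */
        return 0;
    }
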
*/ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, u64 start, u64 end, - u64 *done_offset, + struct folio *locked_folio, u64 start, + u64 end, u64 *done_offset, bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; @@ -1363,7 +1383,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, locked_page, start, end, 0, + ret = cow_file_range_inline(inode, locked_folio, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); if (ret <= 0) { /* @@ -1500,7 +1520,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - locked_page, &cached, + locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) @@ -1553,13 +1573,13 @@ out_unlock: * function. * * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_page) to ensure all the pages are unlocked. + * (except @locked_folio) to ensure all the pages are unlocked. */ if (keep_locked && orig_start < start) { - if (!locked_page) + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_page, NULL, 0, page_ops); + locked_folio, NULL, 0, page_ops); } /* @@ -1582,9 +1602,9 @@ out_unlock: if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_page, &cached, - clear_bits, + locked_folio, &cached, clear_bits, page_ops); + btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); start += cur_alloc_size; } @@ -1596,8 +1616,9 @@ out_unlock: */ if (start < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_folio, &cached, clear_bits, page_ops); + btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } return ret; } @@ -1649,7 +1670,7 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ } static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1689,15 +1710,16 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, INIT_LIST_HEAD(&async_chunk[i].extents); /* - * The locked_page comes all the way from writepage and its - * the original page we were actually given. As we spread + * The locked_folio comes all the way from writepage and its + * the original folio we were actually given. As we spread * this large delalloc region across multiple async_chunk - * structs, only the first struct needs a pointer to locked_page + * structs, only the first struct needs a pointer to + * locked_folio. * * This way we don't need racey decisions about who is supposed * to unlock it. */ - if (locked_page) { + if (locked_folio) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to @@ -1707,12 +1729,12 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. 
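
The loop around this comment carves one delalloc region into fixed-size async chunks, and only the first chunk inherits the folio that writepage handed in. A condensed sketch of that hand-off (illustrative; the 512K chunk size is an assumption based on the surrounding code):

    u64 cur = start;

    for (int i = 0; cur <= end; i++) {
        u64 cur_end = min(end, cur + SZ_512K - 1);

        async_chunk[i].start = cur;
        async_chunk[i].end = cur_end;
        /* only the first chunk may own (and later unlock) the folio */
        async_chunk[i].locked_folio = (i == 0) ? locked_folio : NULL;
        cur = cur_end + 1;
    }
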
*/ - wbc_account_cgroup_owner(wbc, locked_page, + wbc_account_cgroup_owner(wbc, &locked_folio->page, cur_end - start); - async_chunk[i].locked_page = locked_page; - locked_page = NULL; + async_chunk[i].locked_folio = locked_folio; + locked_folio = NULL; } else { - async_chunk[i].locked_page = NULL; + async_chunk[i].locked_folio = NULL; } if (blkcg_css != blkcg_root_css) { @@ -1741,7 +1763,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * covered by the range. */ static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { @@ -1749,20 +1771,21 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, int ret; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, &done_offset, - true, false); + ret = cow_file_range(inode, locked_folio, start, end, + &done_offset, true, false); if (ret) return ret; - extent_write_locked_range(&inode->vfs_inode, locked_page, start, - done_offset, wbc, pages_dirty); + extent_write_locked_range(&inode->vfs_inode, locked_folio, + start, done_offset, wbc, pages_dirty); start = done_offset + 1; } return 1; } -static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end) +static int fallback_to_cow(struct btrfs_inode *inode, + struct folio *locked_folio, const u64 start, + const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); @@ -1831,7 +1854,8 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * is written out and unlocked directly and a normal NOCOW extent * doesn't work. 
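
run_delalloc_nocow() below batches exactly this: it walks the file extents, remembers where a COW-only stretch began, and falls back once per stretch rather than once per extent. The pattern, compressed (predicate and iterator names are hypothetical):

    u64 cow_start = (u64)-1;
    u64 cur = start;
    int ret = 0;

    while (cur <= end) {
        if (!extent_is_nocow(inode, cur)) {        /* hypothetical check */
            if (cow_start == (u64)-1)
                cow_start = cur;                   /* open a COW run */
            cur = next_extent_start(inode, cur);   /* hypothetical */
            continue;
        }
        if (cow_start != (u64)-1) {
            /* flush the accumulated COW run before the NOCOW extent */
            ret = fallback_to_cow(inode, locked_folio, cow_start, cur - 1);
            cow_start = (u64)-1;
            if (ret)
                break;
        }
        cur = nocow_one_extent(inode, cur);        /* hypothetical */
    }
    if (!ret && cow_start != (u64)-1)
        ret = fallback_to_cow(inode, locked_folio, cow_start, end);
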
*/ - ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, + true); ASSERT(ret != 1); return ret; } @@ -1985,7 +2009,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, * blocks on disk */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -2148,8 +2172,8 @@ must_cow: * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(inode, locked_page, - cow_start, found_key.offset - 1); + ret = fallback_to_cow(inode, locked_folio, cow_start, + found_key.offset - 1); cow_start = (u64)-1; if (ret) { btrfs_dec_nocow_writers(nocow_bg); @@ -2204,7 +2228,7 @@ must_cow: btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_page, &cached_state, + locked_folio, &cached_state, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); @@ -2226,7 +2250,7 @@ must_cow: if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end); + ret = fallback_to_cow(inode, locked_folio, cow_start, end); cow_start = (u64)-1; if (ret) goto error; @@ -2253,12 +2277,13 @@ error: lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, - locked_page, &cached, + locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); } btrfs_free_path(path); return ret; @@ -2279,39 +2304,39 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); int ret; /* - * The range must cover part of the @locked_page, or a return of 1 + * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. 
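
The assertion that follows is the standard interval-overlap test: the delalloc range and the folio overlap exactly when neither one ends before the other starts. As a standalone predicate (the name is illustrative, the logic mirrors the ASSERT):

    static inline bool range_touches_folio(struct folio *folio, u64 start, u64 end)
    {
        /* no overlap iff the range ends by the folio start, or starts
         * at or past the folio end */
        return !(end <= folio_pos(folio) ||
                 start >= folio_pos(folio) + folio_size(folio));
    }
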
*/ - ASSERT(!(end <= page_offset(locked_page) || - start >= page_offset(locked_page) + PAGE_SIZE)); + ASSERT(!(end <= folio_pos(locked_folio) || + start >= folio_pos(locked_folio) + folio_size(locked_folio))); if (should_nocow(inode, start, end)) { - ret = run_delalloc_nocow(inode, locked_page, start, end); + ret = run_delalloc_nocow(inode, locked_folio, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, locked_page, start, end, wbc)) + run_delalloc_compressed(inode, locked_folio, start, end, wbc)) return 1; if (zoned) - ret = run_delalloc_cow(inode, locked_page, start, end, wbc, + ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else - ret = cow_file_range(inode, locked_page, start, end, NULL, + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, false); out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_page, start, + btrfs_cleanup_ordered_extents(inode, locked_folio, start, end - start + 1); return ret; } @@ -2687,7 +2712,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { - struct page *page; + struct folio *folio; struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2699,50 +2724,51 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page = fixup->page; + struct folio *folio = fixup->folio; struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 page_start = page_offset(page); - u64 page_end = page_offset(page) + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = folio_pos(folio) + folio_size(folio) - 1; int ret = 0; bool free_delalloc_space = true; /* * This is similar to page_mkwrite, we need to reserve the space before - * we take the page lock. + * we take the folio lock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); + folio_size(folio)); again: - lock_page(page); + folio_lock(folio); /* - * Before we queued this fixup, we took a reference on the page. - * page->mapping may go NULL, but it shouldn't be moved to a different + * Before we queued this fixup, we took a reference on the folio. + * folio->mapping may go NULL, but it shouldn't be moved to a different * address space. */ - if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + if (!folio->mapping || !folio_test_dirty(folio) || + !folio_test_checked(folio)) { /* * Unfortunately this is a little tricky, either * - * 1) We got here and our page had already been dealt with and + * 1) We got here and our folio had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. - * 2) Our page was already dealt with, but we happened to get an + * 2) Our folio was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. 
In * this case we obviously don't have anything to release, but - * because the page was already dealt with we don't want to - * mark the page with an error, so make sure we're resetting + * because the folio was already dealt with we don't want to + * mark the folio with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC - * when the page was already properly dealt with. + * when the folio was already properly dealt with. */ if (!ret) { - btrfs_delalloc_release_extents(inode, PAGE_SIZE); + btrfs_delalloc_release_extents(inode, folio_size(folio)); btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE, + page_start, folio_size(folio), true); } ret = 0; @@ -2750,7 +2776,7 @@ again: } /* - * We can't mess with the page state unless it is locked, so now that + * We can't mess with the folio state unless it is locked, so now that * it is locked bail if we failed to make our space reservation. */ if (ret) @@ -2759,14 +2785,14 @@ again: lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ - if (PageOrdered(page)) + if (folio_test_ordered(folio)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); - unlock_page(page); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -2784,7 +2810,7 @@ again: * * The page was dirty when we started, nothing should have cleaned it. */ - BUG_ON(!PageDirty(page)); + BUG_ON(!folio_test_dirty(folio)); free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(inode, PAGE_SIZE); @@ -2798,14 +2824,14 @@ out_page: * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page. */ - mapping_set_error(page->mapping, ret); - btrfs_mark_ordered_io_finished(inode, page, page_start, - PAGE_SIZE, !ret); - clear_page_dirty_for_io(page); - } - btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); - unlock_page(page); - put_page(page); + mapping_set_error(folio->mapping, ret); + btrfs_mark_ordered_io_finished(inode, folio, page_start, + folio_size(folio), !ret); + folio_clear_dirty_for_io(folio); + } + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + folio_unlock(folio); + folio_put(folio); kfree(fixup); extent_changeset_free(data_reserved); /* @@ -2818,33 +2844,34 @@ out_page: /* * There are a few paths in the higher layers of the kernel that directly - * set the page dirty bit without asking the filesystem if it is a + * set the folio dirty bit without asking the filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set - * the delalloc bit and make it safe to write the page. + * the delalloc bit and make it safe to write the folio. 
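
Boiled down, the worker's sequencing is the point: reserve space first (it can block and must not nest inside the folio lock), then lock and revalidate the folio, and retry from the top whenever an ordered extent is still draining. A skeleton with hypothetical helper names:

    ret = reserve_delalloc_space(inode, pos, len);  /* before folio_lock() */
    if (ret)
        return ret;
again:
    folio_lock(folio);
    if (!folio->mapping || !folio_test_dirty(folio) ||
        !folio_test_checked(folio)) {
        /* already handled elsewhere: drop the reservation and bail */
        release_reservation(inode, pos, len);
        folio_unlock(folio);
        return 0;
    }
    ordered = lookup_ordered(inode, pos, len);      /* hypothetical */
    if (ordered) {
        folio_unlock(folio);
        wait_ordered(ordered);                      /* drain it, then retry */
        goto again;
    }
    mark_delalloc(inode, pos, len);                 /* safe to write now */
    folio_unlock(folio);
    return 0;
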
*/ -int btrfs_writepage_cow_fixup(struct page *page) +int btrfs_writepage_cow_fixup(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; - /* This page has ordered extent covering it already */ - if (PageOrdered(page)) + /* This folio has ordered extent covering it already */ + if (folio_test_ordered(folio)) return 0; /* - * PageChecked is set below when we create a fixup worker for this page, - * don't try to create another one if we're already PageChecked() + * folio_checked is set below when we create a fixup worker for this + * folio, don't try to create another one if we're already + * folio_test_checked. * - * The extent_io writepage code will redirty the page if we send back + * The extent_io writepage code will redirty the foio if we send back * EAGAIN. */ - if (PageChecked(page)) + if (folio_test_checked(folio)) return -EAGAIN; fixup = kzalloc(sizeof(*fixup), GFP_NOFS); @@ -2854,14 +2881,14 @@ int btrfs_writepage_cow_fixup(struct page *page) /* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation - * takes place outside of the page lock, and we can't trust - * page->mapping outside of the page lock. + * takes place outside of the folio lock, and we can't trust + * page->mapping outside of the folio lock. */ ihold(inode); - btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE); - get_page(page); + btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); + folio_get(folio); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); - fixup->page = page; + fixup->folio = folio; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); @@ -4192,6 +4219,7 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); + inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); ret = btrfs_update_inode(trans, dir); @@ -6696,7 +6724,7 @@ static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, } static noinline int uncompress_inline(struct btrfs_path *path, - struct page *page, + struct folio *folio, struct btrfs_file_extent_item *item) { int ret; @@ -6718,7 +6746,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, + max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6729,36 +6758,36 @@ static noinline int uncompress_inline(struct btrfs_path *path, */ if (max_size < PAGE_SIZE) - memzero_page(page, max_size, PAGE_SIZE - max_size); + folio_zero_range(folio, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct page *page) + struct folio *folio) { struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; - if (!page || PageUptodate(page)) + if (!folio || folio_test_uptodate(folio)) return 0; - ASSERT(page_offset(page) == 0); + ASSERT(folio_pos(folio) == 0); fi = btrfs_item_ptr(path->nodes[0], path->slots[0], struct 
btrfs_file_extent_item); if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) - return uncompress_inline(path, page, fi); + return uncompress_inline(path, folio, fi); copy_size = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); if (copy_size < PAGE_SIZE) - memzero_page(page, copy_size, PAGE_SIZE - copy_size); + folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); return 0; } @@ -6780,7 +6809,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, u64 start, u64 len) + struct folio *folio, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6803,7 +6832,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (em) { if (em->start > start || em->start + em->len <= start) free_extent_map(em); - else if (em->disk_bytenr == EXTENT_MAP_INLINE && page) + else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) free_extent_map(em); else goto out; @@ -6933,7 +6962,7 @@ next: ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, page); + ret = read_inline_extent(inode, path, folio); if (ret < 0) goto out; goto insert; @@ -7175,13 +7204,12 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. */ -static void wait_subpage_spinlock(struct page *page) +static void wait_subpage_spinlock(struct folio *folio) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page->mapping)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -7202,11 +7230,17 @@ static void wait_subpage_spinlock(struct page *page) spin_unlock_irq(&subpage->lock); } +static int btrfs_launder_folio(struct folio *folio) +{ + return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), + PAGE_SIZE, NULL); +} + static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { - if (try_release_extent_mapping(&folio->page, gfp_flags)) { - wait_subpage_spinlock(&folio->page); - clear_page_extent_mapped(&folio->page); + if (try_release_extent_mapping(folio, gfp_flags)) { + wait_subpage_spinlock(folio); + clear_folio_extent_mapped(folio); return true; } return false; @@ -7266,7 +7300,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * do double ordered extent accounting on the same folio. 
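
read_inline_extent() above relies on inline extents never spanning past the first page of the folio; its copy-and-zero idiom in isolation (the source buffer and size names are hypothetical):

    size_t copy_size = min_t(u64, PAGE_SIZE, ram_bytes); /* ram_bytes: hypothetical */
    void *kaddr = kmap_local_folio(folio, 0);      /* maps the folio's first page */

    memcpy(kaddr, inline_data, copy_size);         /* inline_data: hypothetical */
    kunmap_local(kaddr);
    if (copy_size < PAGE_SIZE)                     /* the tail must read as zeroes */
        folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
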
*/ folio_wait_writeback(folio); - wait_subpage_spinlock(&folio->page); + wait_subpage_spinlock(folio); /* * For subpage case, we have call sites like @@ -7404,7 +7438,7 @@ next: btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); - clear_page_extent_mapped(&folio->page); + clear_folio_extent_mapped(folio); } static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) @@ -8941,19 +8975,19 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; u32 len; ASSERT(end + 1 - start <= U32_MAX); len = end + 1 - start; while (index <= end_index) { - page = find_get_page(inode->vfs_inode.i_mapping, index); - ASSERT(page); /* Pages should be in the extent_io_tree */ + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); + ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */ /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(page_folio(page)) == 0); - btrfs_folio_set_writeback(fs_info, page_folio(page), start, len); - put_page(page); + ASSERT(folio_order(folio) == 0); + btrfs_folio_set_writeback(fs_info, folio, start, len); + folio_put(folio); index++; } } @@ -9118,7 +9152,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, btrfs_encoded_read_endio, &priv); @@ -9133,7 +9167,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, } while (disk_io_size); atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); @@ -10137,6 +10171,7 @@ static const struct address_space_operations btrfs_aops = { .writepages = btrfs_writepages, .readahead = btrfs_readahead, .invalidate_folio = btrfs_invalidate_folio, + .launder_folio = btrfs_launder_folio, .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e0a664b8a46a..8537eb9b5531 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -543,13 +543,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(fs_info, &range); - if (ret < 0) - return ret; if (copy_to_user(arg, &range, sizeof(range))) return -EFAULT; - return 0; + return ret; } int __pure btrfs_is_empty_uuid(const u8 *uuid) @@ -4765,11 +4763,10 @@ long btrfs_ioctl(struct file *file, unsigned int return ret; ret = btrfs_sync_fs(inode->i_sb, 1); /* - * The transaction thread may want to do more work, - * namely it pokes the cleaner kthread that will start - * processing uncleaned subvols. + * There may be work for the cleaner kthread to do (subvolume + * deletion, delayed iputs, defrag inodes, etc), so wake it up. 
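
In the fitrim hunk above, btrfs_trim_fs() keeps updating range.len with the bytes it actually discarded, so the range is now copied back to userspace even when trimming stopped early, and only then is the trim status surfaced. From the caller's side the effect looks like this (illustrative userspace fragment):

    #include <fcntl.h>
    #include <limits.h>
    #include <linux/fs.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    int trim_fs(int fd)
    {
        struct fstrim_range range = { .start = 0, .len = ULLONG_MAX };

        if (ioctl(fd, FITRIM, &range) < 0) {
            /* range.len still reports how much was trimmed before the error */
            fprintf(stderr, "fitrim: %m (trimmed %llu bytes)\n",
                    (unsigned long long)range.len);
            return -1;
        }
        return 0;
    }
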
*/ - wake_up_process(fs_info->transaction_kthread); + wake_up_process(fs_info->cleaner_kthread); return ret; } case BTRFS_IOC_START_SYNC: diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1e2a68b8f62d..72856f6775f7 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -438,11 +438,11 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; @@ -467,22 +467,22 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (unlikely(ret != LZO_E_OK)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(fs_info, "lzo decompression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page)); + folio_pos(dest_folio)); ret = -EIO; goto out; } ASSERT(out_len <= sectorsize); - memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len); /* Early end, considered as an error. */ if (unlikely(out_len < destlen)) { ret = -EIO; - memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); + folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len); } out: return ret; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 82a68394a89c..2104d60c2161 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -332,7 +332,7 @@ static void finish_ordered_fn(struct btrfs_work *work) } static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { struct btrfs_inode *inode = ordered->inode; @@ -340,10 +340,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, lockdep_assert_held(&inode->ordered_tree_lock); - if (page) { - ASSERT(page->mapping); - ASSERT(page_offset(page) <= file_offset); - ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE); + if (folio) { + ASSERT(folio->mapping); + ASSERT(folio_pos(folio) <= file_offset); + ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); /* * Ordered (Private2) bit indicates whether we still have @@ -351,10 +351,9 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * * If there's no such bit, we need to skip to next range. */ - if (!btrfs_folio_test_ordered(fs_info, page_folio(page), - file_offset, len)) + if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len)) return false; - btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len); + btrfs_folio_clear_ordered(fs_info, folio, file_offset, len); } /* Now we're fine to update the accounting. 
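
For subpage filesystems the ordered flag is tracked per sector, which is what makes the test-and-clear above the single point of accounting. Worked numbers, assuming 4K sectors inside a 16K folio at file offset 0 (the folio is the one whose range is being finished):

    /* finishing the range [4K, 12K) of this folio */
    u64 file_offset = SZ_4K;
    u32 len = SZ_8K;
    unsigned int first_sector = (file_offset - folio_pos(folio)) >> 12; /* 1 */
    unsigned int nr_sectors   = len >> 12;                              /* 2 */
    /* sectors 1 and 2 are tested and cleared in one go; a duplicate
     * completion for the same range now fails the test and returns
     * false instead of being accounted twice */
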
*/ @@ -398,7 +397,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) } void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, u64 len, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { struct btrfs_inode *inode = ordered->inode; @@ -408,7 +407,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); spin_lock_irqsave(&inode->ordered_tree_lock, flags); - ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); + ret = can_finish_ordered_extent(ordered, folio, file_offset, len, + uptodate); spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); /* @@ -449,8 +449,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, /* * Mark all ordered extents io inside the specified range finished. * - * @page: The involved page for the operation. - * For uncompressed buffered IO, the page status also needs to be + * @folio: The involved folio for the operation. + * For uncompressed buffered IO, the folio status also needs to be * updated to indicate whether the pending ordered io is finished. * Can be NULL for direct IO and compressed write. * For these cases, callers are ensured they won't execute the @@ -460,7 +460,7 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * extent(s) covering it. */ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 num_bytes, bool uptodate) { struct rb_node *node; @@ -524,7 +524,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, ASSERT(end + 1 - cur < U32_MAX); len = end + 1 - cur; - if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { + if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); btrfs_queue_ordered_fn(entry); spin_lock_irqsave(&inode->ordered_tree_lock, flags); @@ -1015,7 +1015,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, { struct rb_node *n; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); spin_lock_irq(&inode->ordered_tree_lock); for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 51b9e81726e2..4e152736d06c 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -163,11 +163,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, u64 len, + struct folio *folio, u64 file_offset, u64 len, bool uptodate); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, - u64 num_bytes, bool uptodate); + struct folio *folio, u64 file_offset, + u64 num_bytes, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 6195a2215b8f..9f3ad124104f 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -9,9 +9,8 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); 
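
BTRFS_PATH_AUTO_FREE swaps the manual btrfs_free_path() calls for scope-based cleanup, which is what lets both orphan helpers below return directly from any point. The macro's definition is not part of this hunk; a plausible shape, built on the kernel's <linux/cleanup.h> helpers (an assumption, shown for orientation only):

    #include <linux/cleanup.h>

    DEFINE_FREE(btrfs_free_path, struct btrfs_path *,
                if (_T) btrfs_free_path(_T))

    #define BTRFS_PATH_AUTO_FREE(path_name) \
        struct btrfs_path *path_name __free(btrfs_free_path) = NULL

    /* the compiler emits the btrfs_free_path() call on every exit from
     * the enclosing scope, so early returns cannot leak the path */
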
struct btrfs_key key; - int ret = 0; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; @@ -21,16 +20,13 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - - btrfs_free_path(path); - return ret; + return btrfs_insert_empty_item(trans, root, path, &key, 0); } int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -44,15 +40,9 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; - if (ret) { /* JDM: Really? */ - ret = -ENOENT; - goto out; - } - - ret = btrfs_del_item(trans, root, path); + return ret; + if (ret) + return -ENOENT; -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 32dcea662da3..fc821aa446f0 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -14,7 +14,7 @@ struct root_name_map { u64 id; - char name[16]; + const char *name; }; static const struct root_name_map root_map[] = { diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5d57a285d59b..c297909f1506 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1998,16 +1998,14 @@ out: * * Return 0 for success insert * Return >0 for existing record, caller can free @record safely. - * Error is not possible + * Return <0 for insertion failure, caller can free @record safely. */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) { - struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_qgroup_extent_record *entry; - u64 bytenr = record->bytenr; + struct btrfs_qgroup_extent_record *existing, *ret; + unsigned long bytenr = record->bytenr; if (!btrfs_qgroup_full_accounting(fs_info)) return 1; @@ -2015,26 +2013,24 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, - node); - if (bytenr < entry->bytenr) { - p = &(*p)->rb_left; - } else if (bytenr > entry->bytenr) { - p = &(*p)->rb_right; - } else { - if (record->data_rsv && !entry->data_rsv) { - entry->data_rsv = record->data_rsv; - entry->data_rsv_refroot = - record->data_rsv_refroot; - } - return 1; + xa_lock(&delayed_refs->dirty_extents); + existing = xa_load(&delayed_refs->dirty_extents, bytenr); + if (existing) { + if (record->data_rsv && !existing->data_rsv) { + existing->data_rsv = record->data_rsv; + existing->data_rsv_refroot = record->data_rsv_refroot; } + xa_unlock(&delayed_refs->dirty_extents); + return 1; + } + + ret = __xa_store(&delayed_refs->dirty_extents, record->bytenr, record, GFP_ATOMIC); + xa_unlock(&delayed_refs->dirty_extents); + if (xa_is_err(ret)) { + qgroup_mark_inconsistent(fs_info); + return xa_err(ret); } - rb_link_node(&record->node, parent_node, p); - rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); return 0; } @@ -2141,6 +2137,11 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!record) return -ENOMEM; + if 
(xa_reserve(&trans->transaction->delayed_refs.dirty_extents, bytenr, GFP_NOFS)) { + kfree(record); + return -ENOMEM; + } + delayed_refs = &trans->transaction->delayed_refs; record->bytenr = bytenr; record->num_bytes = num_bytes; @@ -2149,7 +2150,9 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, spin_lock(&delayed_refs->lock); ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); spin_unlock(&delayed_refs->lock); - if (ret > 0) { + if (ret) { + /* Clean up if insertion fails or item exists. */ + xa_release(&delayed_refs->dirty_extents, record->bytenr); kfree(record); return 0; } @@ -3018,7 +3021,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; - struct rb_node *node; + unsigned long index; u64 num_dirty_extents = 0; u64 qgroup_to_skip; int ret = 0; @@ -3028,10 +3031,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; - while ((node = rb_first(&delayed_refs->dirty_extent_root))) { - record = rb_entry(node, struct btrfs_qgroup_extent_record, - node); - + xa_for_each(&delayed_refs->dirty_extents, index, record) { num_dirty_extents++; trace_btrfs_qgroup_account_extents(fs_info, record); @@ -3097,7 +3097,7 @@ cleanup: ulist_free(record->old_roots); ulist_free(new_roots); new_roots = NULL; - rb_erase(node, &delayed_refs->dirty_extent_root); + xa_erase(&delayed_refs->dirty_extents, index); kfree(record); } @@ -4185,6 +4185,8 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } + btrfs_run_delayed_iputs(root->fs_info); + btrfs_wait_on_delayed_iputs(root->fs_info); ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; @@ -4344,10 +4346,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { - extent_changeset_init(&changeset); return clear_record_extent_bits(&inode->io_tree, start, start + len - 1, - EXTENT_QGROUP_RESERVED, &changeset); + EXTENT_QGROUP_RESERVED, NULL); } /* In release case, we shouldn't have @reserved */ @@ -4873,15 +4874,13 @@ out: void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) { struct btrfs_qgroup_extent_record *entry; - struct btrfs_qgroup_extent_record *next; - struct rb_root *root; + unsigned long index; - root = &trans->delayed_refs.dirty_extent_root; - rbtree_postorder_for_each_entry_safe(entry, next, root, node) { + xa_for_each(&trans->delayed_refs.dirty_extents, index, entry) { ulist_free(entry->old_roots); kfree(entry); } - *root = RB_ROOT; + xa_destroy(&trans->delayed_refs.dirty_extents); } void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index deb479d176a9..98adf4ec7b01 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -125,7 +125,6 @@ struct btrfs_inode; * Record a dirty extent, and info qgroup to update quota on it */ struct btrfs_qgroup_extent_record { - struct rb_node node; u64 bytenr; u64 num_bytes; diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index e6f7a234b8f6..4c859b550f6c 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -66,6 +66,11 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le if (ret) break; + start += key.offset; + 
length -= key.offset; + if (length == 0) + break; + btrfs_release_path(path); } @@ -73,6 +78,36 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le return ret; } +static int update_raid_extent_item(struct btrfs_trans_handle *trans, + struct btrfs_key *key, + struct btrfs_stripe_extent *stripe_extent, + const size_t item_size) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + int slot; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path, + 0, 1); + if (ret) + return (ret == 1 ? ret : -EINVAL); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), + item_size); + btrfs_mark_buffer_dirty(trans, leaf); + btrfs_free_path(path); + + return ret; +} + static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_io_context *bioc) { @@ -112,6 +147,9 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent, item_size); + if (ret == -EEXIST) + ret = update_raid_extent_item(trans, &stripe_key, stripe_extent, + item_size); if (ret) btrfs_abort_transaction(trans, ret); @@ -172,7 +210,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, if (!path) return -ENOMEM; - if (stripe->is_scrub) { + if (stripe->rst_search_commit_root) { path->skip_locking = 1; path->search_commit_root = 1; } @@ -245,10 +283,8 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, out: if (ret > 0) ret = -ENOENT; - if (ret && ret != -EIO && !stripe->is_scrub) { - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) - btrfs_print_tree(leaf, 1); - btrfs_err(fs_info, + if (ret && ret != -EIO && !stripe->rst_search_commit_root) { + btrfs_debug(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", logical, logical + *length, stripe->dev->devid, btrfs_bg_type_to_raid_name(map_type)); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index df6b93b927cd..f0824c948cb7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -66,7 +66,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; - struct page *page = NULL; + struct folio *folio = NULL; struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; @@ -83,14 +83,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret) goto out; - page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, - btrfs_alloc_write_mask(mapping)); - if (!page) { + folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + btrfs_alloc_write_mask(mapping)); + if (IS_ERR(folio)) { ret = -ENOMEM; goto out_unlock; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; @@ -115,15 +116,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { - memcpy_to_page(page, offset_in_page(file_offset), data_start, - datal); + memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start, + datal); } else { - ret = btrfs_decompress(comp_type, data_start, page, - offset_in_page(file_offset), + ret = 
btrfs_decompress(comp_type, data_start, folio, + offset_in_folio(folio, file_offset), inline_size, datal); if (ret) goto out_unlock; - flush_dcache_page(page); + flush_dcache_folio(folio); } /* @@ -139,15 +140,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) - memzero_page(page, datal, block_size - datal); + folio_zero_range(folio, datal, block_size - datal); - btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size); - btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size); - btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size); + btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size); + btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size); + btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size); out_unlock: - if (page) { - unlock_page(page); - put_page(page); + if (!IS_ERR(folio)) { + folio_unlock(folio); + folio_put(folio); } if (ret) btrfs_delalloc_release_space(inode, data_reserved, file_offset, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0533d0f82dc9..ea4ed85919ec 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -36,6 +36,7 @@ #include "relocation.h" #include "super.h" #include "tree-checker.h" +#include "raid-stripe-tree.h" /* * Relocation overview @@ -2965,21 +2966,34 @@ static int relocate_one_folio(struct reloc_control *rc, u64 folio_end; u64 cur; int ret; + const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); ASSERT(index <= last_index); folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { - page_cache_sync_readahead(inode->i_mapping, ra, NULL, - index, last_index + 1 - index); + + /* + * On relocation we're doing readahead on the relocation inode, + * but if the filesystem is backed by a RAID stripe tree we can + * get ENOENT (e.g. due to preallocated extents not being + * mapped in the RST) from the lookup. + * + * But readahead doesn't handle the error and submits invalid + * reads to the device, causing a assertion failures. 
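
copy_inline_to_page() above shows the general conversion recipe used throughout this series: a find_or_create_page() call becomes __filemap_get_folio() with FGP flags spelling out the old helper's implicit behavior, and the failure check flips from NULL to IS_ERR(). Side by side (sketch):

    /* before: NULL on failure */
    page = find_or_create_page(mapping, index, gfp_mask);

    /* after: ERR_PTR on failure; FGP_LOCK | FGP_ACCESSED | FGP_CREAT
     * reproduce find_or_create_page()'s locking, LRU-touch and creation
     * semantics explicitly */
    folio = __filemap_get_folio(mapping, index,
                                FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask);
    if (IS_ERR(folio))
        return PTR_ERR(folio);
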
+ */ + if (!use_rst) + page_cache_sync_readahead(inode->i_mapping, ra, NULL, + index, last_index + 1 - index); folio = __filemap_get_folio(inode->i_mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); if (IS_ERR(folio)) return PTR_ERR(folio); } WARN_ON(folio_order(folio)); - if (folio_test_readahead(folio)) + if (folio_test_readahead(folio) && !use_rst) page_cache_async_readahead(inode->i_mapping, ra, NULL, folio, last_index + 1 - index); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 14a8d7100018..3a3427428074 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -838,7 +838,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, bbio->bio.bi_iter.bi_size >= blocksize)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); bbio = NULL; @@ -857,7 +857,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); } @@ -1648,14 +1648,20 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe) } } +static u32 stripe_length(const struct scrub_stripe *stripe) +{ + ASSERT(stripe->bg); + + return min(BTRFS_STRIPE_LEN, + stripe->bg->start + stripe->bg->length - stripe->logical); +} + static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; - unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + - stripe->bg->length - stripe->logical) >> - fs_info->sectorsize_bits; + unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; @@ -1677,7 +1683,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, bbio->bio.bi_iter.bi_size >= stripe_len)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); bbio = NULL; } @@ -1688,7 +1694,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, (i << fs_info->sectorsize_bits); int err; - io_stripe.is_scrub = true; + io_stripe.rst_search_commit_root = true; stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; /* * For RST cases, we need to manually split the bbio to @@ -1714,7 +1720,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); } if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1729,9 +1735,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; - unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + - stripe->bg->length - stripe->logical) >> - fs_info->sectorsize_bits; + unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); @@ -1772,7 +1776,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, mirror = calc_next_mirror(mirror, num_copies); } - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); } static bool 
stripe_has_metadata_error(struct scrub_stripe *stripe) @@ -1871,6 +1875,9 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) stripe = &sctx->stripes[i]; wait_scrub_stripe_io(stripe); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = stripe->physical + stripe_length(stripe); + spin_unlock(&sctx->stat_lock); scrub_reset_stripe(stripe); } out: @@ -2139,7 +2146,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, cur_physical, &found_logical); if (ret > 0) { /* No more extent, just update the accounting */ + spin_lock(&sctx->stat_lock); sctx->stat.last_physical = physical + logical_length; + spin_unlock(&sctx->stat_lock); ret = 0; break; } @@ -2336,6 +2345,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, stripe_logical += chunk_logical; ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, map, stripe_logical); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN, + physical_end); + spin_unlock(&sctx->stat_lock); if (ret) goto out; goto next; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4ca711a773ef..7f48ba6c1c77 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -62,7 +62,7 @@ struct fs_path { /* * Average path length does not exceed 200 bytes, we'll have * better packing in the slab and higher chance to satisfy - * a allocation later during send. + * an allocation later during send. */ char pad[256]; }; @@ -347,7 +347,7 @@ struct name_cache_entry { int ret; int need_later_update; int name_len; - char name[]; + char name[] __counted_by(name_len); }; /* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ @@ -1136,7 +1136,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Start with a small buffer (1 page). If later we end up needing more * space, which can happen for xattrs on a fs with a leaf size greater - * then the page size, attempt to increase the buffer. Typically xattr + * than the page size, attempt to increase the buffer. Typically xattr * values are small. */ buf_len = PATH_MAX; @@ -6157,25 +6157,51 @@ static int send_write_or_clone(struct send_ctx *sctx, u64 offset = key->offset; u64 end; u64 bs = sctx->send_root->fs_info->sectorsize; + struct btrfs_file_extent_item *ei; + u64 disk_byte; + u64 data_offset; + u64 num_bytes; + struct btrfs_inode_info info = { 0 }; end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); if (offset >= end) return 0; - if (clone_root && IS_ALIGNED(end, bs)) { - struct btrfs_file_extent_item *ei; - u64 disk_byte; - u64 data_offset; + num_bytes = end - offset; - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); - data_offset = btrfs_file_extent_offset(path->nodes[0], ei); - ret = clone_range(sctx, path, clone_root, disk_byte, - data_offset, offset, end - offset); - } else { - ret = send_extent_data(sctx, path, offset, end - offset); - } + if (!clone_root) + goto write_data; + + if (IS_ALIGNED(end, bs)) + goto clone_data; + + /* + * If the extent end is not aligned, we can clone if the extent ends at + * the i_size of the inode and the clone range ends at the i_size of the + * source inode, otherwise the clone operation fails with -EINVAL. 
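
As a predicate, the clone rule above reads (hypothetical helper mirroring the checks in send_write_or_clone()):

    /* Cloning is allowed when the extent end is block aligned, or when
     * both the destination range and the source range end exactly at
     * their inode's i_size. */
    static bool can_clone_extent(u64 end, u64 bs, u64 dst_i_size,
                                 u64 clone_offset, u64 num_bytes, u64 src_i_size)
    {
        if (IS_ALIGNED(end, bs))
            return true;
        return end == dst_i_size && clone_offset + num_bytes == src_i_size;
    }
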
+ */ + if (end != sctx->cur_inode_size) + goto write_data; + + ret = get_inode_info(clone_root->root, clone_root->ino, &info); + if (ret < 0) + return ret; + + if (clone_root->offset + num_bytes == info.size) + goto clone_data; + +write_data: + ret = send_extent_data(sctx, path, offset, num_bytes); + sctx->cur_inode_next_write_offset = end; + return ret; + +clone_data: + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); + data_offset = btrfs_file_extent_offset(path->nodes[0], ei); + ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset, + num_bytes); sctx->cur_inode_next_write_offset = end; return ret; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 68e14fd48638..d5a9cd8a4fd8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -163,7 +163,7 @@ * thing with or without extra unallocated space. */ -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included) { ASSERT(s_info); @@ -368,7 +368,7 @@ static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) } static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, + const struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { u64 profile; @@ -437,7 +437,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, } int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { u64 avail; @@ -542,8 +542,8 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } -static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info) +static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *info) { const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); @@ -844,9 +844,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, return; } -static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *space_info) { u64 used; u64 avail; @@ -871,7 +870,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, } static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) + const struct btrfs_space_info *space_info) { const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; @@ -1943,7 +1942,7 @@ static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info) * Typically with 10 block groups as the target, the discrete values this comes * out to are 0, 10, 20, ... , 80, 90, and 99. 
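
The stepping comes from integer percent arithmetic. Under the assumption that calc_pct_ratio(want, target) is essentially want * 100 / target with the result capped at 99, and that want itself is clamped to the target:

    u64 target = 10;                       /* block groups' worth, illustrative */
    u64 want   = min(needed_bgs, target);  /* hypothetical demand estimate */
    int pct    = min_t(int, div64_u64(want * 100, target), 99);
    /* want = 0..10 yields 0, 10, 20, ..., 90, and 99 (the cap) */
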
*/ -static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info) +static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info) { struct btrfs_fs_info *fs_info = space_info->fs_info; u64 unalloc = atomic64_read(&fs_info->free_chunk_space); @@ -1962,7 +1961,7 @@ static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info) return calc_pct_ratio(want, target); } -int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info) +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info) { lockdep_assert_held(&space_info->lock); @@ -1985,8 +1984,8 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static int do_reclaim_sweep(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, int raid) +static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; @@ -2031,7 +2030,6 @@ again: } up_read(&space_info->groups_sem); - return 0; } void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes) @@ -2074,21 +2072,15 @@ bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) return ret; } -int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) { - int ret; int raid; struct btrfs_space_info *space_info; list_for_each_entry(space_info, &fs_info->space_info, list) { if (!btrfs_should_periodic_reclaim(space_info)) continue; - for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) { - ret = do_reclaim_sweep(fs_info, space_info, raid); - if (ret) - return ret; - } + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) + do_reclaim_sweep(fs_info, space_info, raid); } - - return ret; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 88b44221ce97..efbecc0c5258 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -217,7 +217,7 @@ struct reserve_ticket { wait_queue_head_t wait; }; -static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); @@ -258,7 +258,7 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, @@ -271,7 +271,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( @@ -293,7 +293,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes); void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready); bool 
btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); -int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info); -int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info); +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 8ddd5fcbeb93..fe4d719d506b 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -64,6 +64,7 @@ * This means a slightly higher tree locking latency. */ +#if PAGE_SIZE > SZ_4K bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) { if (fs_info->sectorsize >= PAGE_SIZE) @@ -85,37 +86,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space return true; return false; } - -void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) -{ - unsigned int cur = 0; - unsigned int nr_bits; - - ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize)); - - nr_bits = PAGE_SIZE / sectorsize; - subpage_info->bitmap_nr_bits = nr_bits; - - subpage_info->uptodate_offset = cur; - cur += nr_bits; - - subpage_info->dirty_offset = cur; - cur += nr_bits; - - subpage_info->writeback_offset = cur; - cur += nr_bits; - - subpage_info->ordered_offset = cur; - cur += nr_bits; - - subpage_info->checked_offset = cur; - cur += nr_bits; - - subpage_info->locked_offset = cur; - cur += nr_bits; - - subpage_info->total_nr_bits = cur; -} +#endif int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type) @@ -163,7 +134,7 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, ASSERT(fs_info->sectorsize < PAGE_SIZE); real_size = struct_size(ret, bitmaps, - BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits)); + BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page)); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); @@ -246,7 +217,7 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, \ btrfs_subpage_assert(fs_info, folio, start, len); \ __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - __start_bit += fs_info->subpage_info->name##_offset; \ + __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -351,6 +322,8 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); unsigned long flags; + unsigned int cleared = 0; + int bit = start_bit; bool last; btrfs_subpage_assert(fs_info, folio, start, len); @@ -368,11 +341,12 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf return true; } - ASSERT(atomic_read(&subpage->writers) >= nbits); - /* The target range should have been locked. 
*/ - ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); - bitmap_clear(subpage->bitmaps, start_bit, nbits); - last = atomic_sub_and_test(nbits, &subpage->writers); + for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) { + clear_bit(bit, subpage->bitmaps); + cleared++; + } + ASSERT(atomic_read(&subpage->writers) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->writers); spin_unlock_irqrestore(&subpage->lock, flags); return last; } @@ -404,27 +378,94 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, return 0; } +/* + * Handle different locked folios: + * + * - Non-subpage folio + * Just unlock it. + * + * - folio locked but without any subpage range locked + * This happens either before writepage_delalloc() or when the delalloc + * range was already handled by the previous folio. + * We can simply unlock it. + * + * - folio locked with subpage range locked + * We go through the locked sectors inside the range, clear their bits in + * the locked bitmap, decrease the writer count, and unlock the folio if + * that was the last locked range. + */ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { + struct btrfs_subpage *subpage = folio_get_private(folio); + + ASSERT(folio_test_locked(folio)); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { folio_unlock(folio); return; } + + /* + * For the subpage case, there are two types of locked folios: with or + * without a writers count. + * + * Since we own the folio lock, no one else can touch subpage::writers, + * so we are safe to do several atomic operations without taking the + * spinlock. + */ + if (atomic_read(&subpage->writers) == 0) { + /* No writers, locked by plain lock_page(). */ + folio_unlock(folio); + return; + } + btrfs_subpage_clamp_range(folio, &start, &len); if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) folio_unlock(folio); } +void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap) +{ + struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; + unsigned long flags; + bool last = false; + int cleared = 0; + int bit; + + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + folio_unlock(folio); + return; + } + + if (atomic_read(&subpage->writers) == 0) { + /* No writers, locked by plain lock_page(). 
*/ + folio_unlock(folio); + return; + } + + spin_lock_irqsave(&subpage->lock, flags); + for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) { + if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) + cleared++; + } + ASSERT(atomic_read(&subpage->writers) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->writers); + spin_unlock_irqrestore(&subpage->lock, flags); + if (last) + folio_unlock(folio); +} + #define subpage_test_bitmap_all_set(fs_info, subpage, name) \ bitmap_test_range_all_set(subpage->bitmaps, \ - fs_info->subpage_info->name##_offset, \ - fs_info->subpage_info->bitmap_nr_bits) + fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ + fs_info->sectors_per_page) #define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ bitmap_test_range_all_zero(subpage->bitmaps, \ - fs_info->subpage_info->name##_offset, \ - fs_info->subpage_info->bitmap_nr_bits) + fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ + fs_info->sectors_per_page) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) @@ -729,53 +770,6 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, } /* - * Handle different locked pages with different page sizes: - * - * - Page locked by plain lock_page() - * It should not have any subpage::writers count. - * Can be unlocked by unlock_page(). - * This is the most common locked page for __extent_writepage() called - * inside extent_write_cache_pages(). - * Rarer cases include the @locked_page from extent_write_locked_range(). - * - * - Page locked by lock_delalloc_pages() - * There is only one caller, all pages except @locked_page for - * extent_write_locked_range(). - * In this case, we have to call subpage helper to handle the case. - */ -void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage; - - ASSERT(folio_test_locked(folio)); - /* For non-subpage case, we just unlock the page */ - if (!btrfs_is_subpage(fs_info, folio->mapping)) { - folio_unlock(folio); - return; - } - - ASSERT(folio_test_private(folio) && folio_get_private(folio)); - subpage = folio_get_private(folio); - - /* - * For subpage case, there are two types of locked page. With or - * without writers number. - * - * Since we own the page lock, no one else could touch subpage::writers - * and we are safe to do several atomic operations without spinlock. - */ - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page() */ - folio_unlock(folio); - return; - } - - /* Have writers, use proper subpage helper to end it */ - btrfs_folio_end_writer_lock(fs_info, folio, start, len); -} - -/* * This is for folio already locked by plain lock_page()/folio_lock(), which * doesn't have any subpage awareness. 
* @@ -803,7 +797,7 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->writers); - ASSERT(ret <= fs_info->subpage_info->bitmap_nr_bits); + ASSERT(ret <= fs_info->sectors_per_page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -819,14 +813,13 @@ bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 search_start, u64 *found_start_ret, u32 *found_len_ret) { - struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage = folio_get_private(folio); + const u32 sectors_per_page = fs_info->sectors_per_page; const unsigned int len = PAGE_SIZE - offset_in_page(search_start); const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, locked, search_start, len); - const unsigned int locked_bitmap_start = subpage_info->locked_offset; - const unsigned int locked_bitmap_end = locked_bitmap_start + - subpage_info->bitmap_nr_bits; + const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked; + const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page; unsigned long flags; int first_zero; int first_set; @@ -855,59 +848,21 @@ out: return found; } -/* - * Unlike btrfs_folio_end_writer_lock() which unlocks a specified subpage range, - * this ends all writer locked ranges of a page. - * - * This is for the locked page of __extent_writepage(), as the locked page - * can contain several locked subpage ranges. - */ -void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - u64 folio_start = folio_pos(folio); - u64 cur = folio_start; - - ASSERT(folio_test_locked(folio)); - if (!btrfs_is_subpage(fs_info, folio->mapping)) { - folio_unlock(folio); - return; - } - - /* The page has no new delalloc range locked on it. Just plain unlock. 
*/ - if (atomic_read(&subpage->writers) == 0) { - folio_unlock(folio); - return; - } - while (cur < folio_start + PAGE_SIZE) { - u64 found_start; - u32 found_len; - bool found; - bool last; - - found = btrfs_subpage_find_writer_locked(fs_info, folio, cur, - &found_start, &found_len); - if (!found) - break; - last = btrfs_subpage_end_and_test_writer(fs_info, folio, - found_start, found_len); - if (last) { - folio_unlock(folio); - break; - } - cur = found_start + found_len; - } +#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ +{ \ + const int sectors_per_page = fs_info->sectors_per_page; \ + \ + ASSERT(sectors_per_page < BITS_PER_LONG); \ + *dst = bitmap_read(subpage->bitmaps, \ + sectors_per_page * btrfs_bitmap_nr_##name, \ + sectors_per_page); \ } -#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ - bitmap_cut(dst, subpage->bitmaps, 0, \ - subpage_info->name##_offset, subpage_info->bitmap_nr_bits) - void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage; + const u32 sectors_per_page = fs_info->sectors_per_page; unsigned long uptodate_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; @@ -916,25 +871,41 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, + unsigned long locked_bitmap; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(subpage_info); + ASSERT(sectors_per_page > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, locked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl locked=%*pbl", start, len, folio_pos(folio), - subpage_info->bitmap_nr_bits, &uptodate_bitmap, - subpage_info->bitmap_nr_bits, &dirty_bitmap, - subpage_info->bitmap_nr_bits, &writeback_bitmap, - subpage_info->bitmap_nr_bits, &ordered_bitmap, - subpage_info->bitmap_nr_bits, &checked_bitmap, + sectors_per_page, &uptodate_bitmap, + sectors_per_page, &dirty_bitmap, + sectors_per_page, &writeback_bitmap, + sectors_per_page, &ordered_bitmap, + sectors_per_page, &checked_bitmap, + sectors_per_page, &locked_bitmap); +} + +void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, + struct folio *folio, + unsigned long *ret_bitmap) +{ + struct btrfs_subpage *subpage; + unsigned long flags; + + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + ASSERT(fs_info->sectors_per_page > 1); + subpage = folio_get_private(folio); + + spin_lock_irqsave(&subpage->lock, flags); + GET_SUBPAGE_BITMAP(subpage, fs_info, 
dirty, ret_bitmap); + spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 249396e118d0..4b85d91d0e18 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -5,6 +5,7 @@ #include <linux/spinlock.h> #include <linux/atomic.h> +#include <linux/sizes.h> struct address_space; struct folio; @@ -18,39 +19,23 @@ struct btrfs_fs_info; * * This structure records how they are organized in the bitmap: * - * /- uptodate_offset /- dirty_offset /- ordered_offset + * /- uptodate /- dirty /- ordered * | | | * v v v * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o| - * |<- bitmap_nr_bits ->| - * |<----------------- total_nr_bits ------------------>| + * |< sectors_per_page >| + * + * Unlike regular macro-like enums, we do not use upper-case names here, as + * these names are used in various macros to define function names. */ -struct btrfs_subpage_info { - /* Number of bits for each bitmap */ - unsigned int bitmap_nr_bits; - - /* Total number of bits for the whole bitmap */ - unsigned int total_nr_bits; - - /* - * *_offset indicates where the bitmap starts, the length is always - * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. - */ - unsigned int uptodate_offset; - unsigned int dirty_offset; - unsigned int writeback_offset; - unsigned int ordered_offset; - unsigned int checked_offset; - - /* - * For locked bitmaps, normally it's subpage representation for folio - * Locked flag, but metadata is different: - * - * - Metadata doesn't really lock the folio - * It's just to prevent page::private get cleared before the last - * end_page_read(). - */ - unsigned int locked_offset; +enum { + btrfs_bitmap_nr_uptodate = 0, + btrfs_bitmap_nr_dirty, + btrfs_bitmap_nr_writeback, + btrfs_bitmap_nr_ordered, + btrfs_bitmap_nr_checked, + btrfs_bitmap_nr_locked, + btrfs_bitmap_nr_max }; /* @@ -88,9 +73,16 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; +#if PAGE_SIZE > SZ_4K bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); +#else +static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, + struct address_space *mapping) +{ + return false; +} +#endif -void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type); void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); @@ -114,10 +106,11 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap); bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 search_start, u64 *found_start_ret, u32 *found_len_ret); -void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio); /* * Template for subpage related operations. 
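 *
 * A sketch of how a per-type range is addressed in the shared bitmap
 * (sizes assumed for illustration): with a 16K page and a 4K sector size,
 * sectors_per_page is 4, so e.g. the dirty range starts at bit
 *
 *   sectors_per_page * btrfs_bitmap_nr_dirty == 4 * 1 == 4
 *
 * and the bit for byte offset @start inside the folio is that base plus
 * (offset_in_page(start) >> sectorsize_bits).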
@@ -164,8 +157,9 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); -void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); +void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, + struct folio *folio, + unsigned long *ret_bitmap); void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 08d33cb372fb..98fa0f382480 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -28,6 +28,7 @@ #include <linux/btrfs.h> #include <linux/security.h> #include <linux/fs_parser.h> +#include <linux/swap.h> #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -683,8 +684,11 @@ bool btrfs_check_options(const struct btrfs_fs_info *info, ret = false; if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { - if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { btrfs_info(info, "disk space caching is enabled"); + btrfs_warn(info, +"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2"); + } if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) btrfs_info(info, "using free-space-tree"); } @@ -2398,7 +2402,13 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro trace_btrfs_extent_map_shrinker_count(fs_info, nr); - return nr; + /* + * Only report the real number for DEBUG builds, as there are reports of + * serious performance degradation caused by too frequent shrinks. + */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) + return nr; + return 0; } static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) @@ -2406,6 +2416,15 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); struct btrfs_fs_info *fs_info = btrfs_sb(sb); + /* + * We may be called from any task trying to allocate memory and we don't + * want to slow it down with scanning and dropping extent maps. It would + * also cause heavy lock contention if many tasks concurrently enter + * here. Therefore only allow kswapd tasks to scan and drop extent maps. 
+ */ + if (!current_is_kswapd()) + return 0; + return btrfs_free_extent_maps(fs_info, nr_to_scan); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 865d4af4b303..0a2dbfaaf49e 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -180,7 +180,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("should have found at least one delalloc"); @@ -211,7 +211,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("couldn't find delalloc in our range"); @@ -245,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) } start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (found) { test_err("found range when we shouldn't have"); @@ -266,7 +266,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("didn't find our range"); @@ -307,7 +307,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("didn't find our range"); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5e6fff8e1003..0fc873af891f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -143,8 +143,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT( &transaction->delayed_refs.href_root.rb_root)); - WARN_ON(!RB_EMPTY_ROOT( - &transaction->delayed_refs.dirty_extent_root)); + WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", @@ -351,7 +350,7 @@ loop: memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; - cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; + xa_init(&cur_trans->delayed_refs.dirty_extents); atomic_set(&cur_trans->delayed_refs.num_entries, 0); /* diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 98c03ddc760b..dd9ce9b9f69e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -27,6 +27,12 @@ struct btrfs_root_item; struct btrfs_root; struct btrfs_path; +/* + * Signal that a direct IO write is in progress, to avoid deadlock for sync + * direct IO writes when fsync is called during the direct IO write path. 
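+ *
+ * A sketch of the intended use (an assumption based on this definition;
+ * the writer side is not shown in this hunk): the direct IO write path
+ * publishes the stub in current->journal_info around the write, e.g.
+ *
+ *   current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
+ *   (do the direct IO write)
+ *   current->journal_info = NULL;
+ *
+ * so that fsync can detect an in-flight DIO write and back off instead
+ * of deadlocking.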
+ */ +#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) + /* Radix-tree tag for roots that are part of the transaction. */ #define BTRFS_ROOT_TRANS_TAG 0 diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a825fa598e3c..634d69964fe4 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -569,9 +569,10 @@ static int check_dir_item(struct extent_buffer *leaf, /* dir type check */ dir_type = btrfs_dir_ftype(leaf, di); - if (unlikely(dir_type >= BTRFS_FT_MAX)) { + if (unlikely(dir_type <= BTRFS_FT_UNKNOWN || + dir_type >= BTRFS_FT_MAX)) { dir_item_err(leaf, slot, - "invalid dir item type, have %u expect [0, %u)", + "invalid dir item type, have %u expect (0, %u)", dir_type, BTRFS_FT_MAX); return -EUCLEAN; } @@ -1763,6 +1764,72 @@ static int check_raid_stripe_extent(const struct extent_buffer *leaf, return 0; } +static int check_dev_extent_item(const struct extent_buffer *leaf, + const struct btrfs_key *key, + int slot, + struct btrfs_key *prev_key) +{ + struct btrfs_dev_extent *de; + const u32 sectorsize = leaf->fs_info->sectorsize; + + de = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + /* Basic fixed member checks. */ + if (unlikely(btrfs_dev_extent_chunk_tree(leaf, de) != + BTRFS_CHUNK_TREE_OBJECTID)) { + generic_err(leaf, slot, + "invalid dev extent chunk tree id, has %llu expect %llu", + btrfs_dev_extent_chunk_tree(leaf, de), + BTRFS_CHUNK_TREE_OBJECTID); + return -EUCLEAN; + } + if (unlikely(btrfs_dev_extent_chunk_objectid(leaf, de) != + BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { + generic_err(leaf, slot, + "invalid dev extent chunk objectid, has %llu expect %llu", + btrfs_dev_extent_chunk_objectid(leaf, de), + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + return -EUCLEAN; + } + /* Alignment check. */ + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { + generic_err(leaf, slot, + "invalid dev extent key.offset, has %llu not aligned to %u", + key->offset, sectorsize); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(btrfs_dev_extent_chunk_offset(leaf, de), + sectorsize))) { + generic_err(leaf, slot, + "invalid dev extent chunk offset, has %llu not aligned to %u", + btrfs_dev_extent_chunk_offset(leaf, de), + sectorsize); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(btrfs_dev_extent_length(leaf, de), + sectorsize))) { + generic_err(leaf, slot, + "invalid dev extent length, has %llu not aligned to %u", + btrfs_dev_extent_length(leaf, de), sectorsize); + return -EUCLEAN; + } + /* Overlap check with previous dev extent. */ + if (slot && prev_key->objectid == key->objectid && + prev_key->type == key->type) { + struct btrfs_dev_extent *prev_de; + u64 prev_len; + + prev_de = btrfs_item_ptr(leaf, slot - 1, struct btrfs_dev_extent); + prev_len = btrfs_dev_extent_length(leaf, prev_de); + if (unlikely(prev_key->offset + prev_len > key->offset)) { + generic_err(leaf, slot, + "dev extent overlap, prev offset %llu len %llu current offset %llu", + prev_key->offset, prev_len, key->offset); + return -EUCLEAN; + } + } + return 0; +} + /* * Common point to switch the item-specific validation. 
*/ @@ -1799,6 +1866,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_DEV_ITEM_KEY: ret = check_dev_item(leaf, key, slot); break; + case BTRFS_DEV_EXTENT_KEY: + ret = check_dev_extent_item(leaf, key, slot, prev_key); + break; case BTRFS_INODE_ITEM_KEY: ret = check_inode_item(leaf, key, slot); break; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f0cf8ce26f01..e2ed2a791f8f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2877,7 +2877,7 @@ void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *tmp; - ASSERT(inode_is_locked(&ctx->inode->vfs_inode)); + btrfs_assert_inode_locked(ctx->inode); list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { list_del_init(&ordered->log_list); diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index fa45b5fb9683..b382a4c443d4 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -170,7 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, * this until all tree mod log insertions are recorded in the rb tree and then * write unlock fs_info::tree_mod_log_lock. */ -static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return true; @@ -188,7 +188,7 @@ static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffe /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) + const struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return false; @@ -198,7 +198,7 @@ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, return true; } -static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, +static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { @@ -221,7 +221,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, return tm; } -int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, +int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; @@ -258,7 +258,7 @@ out_unlock: return ret; } -static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, +static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) { @@ -278,7 +278,7 @@ static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, return tm; } -int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, +int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) { @@ -535,7 +535,7 @@ static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, } int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index ff00c8e8a393..6308c577a4a4 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -37,7 +37,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, int 
btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, struct extent_buffer *new_root, bool log_removal); -int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, +int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, @@ -47,11 +47,11 @@ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items); -int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, +int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items); u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index eae75bb572b9..c6399513c66f 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -3,6 +3,7 @@ * Copyright (C) STRATO AG 2013. All rights reserved. */ +#include <linux/kthread.h> #include <linux/uuid.h> #include <asm/unaligned.h> #include "messages.h" @@ -12,6 +13,7 @@ #include "fs.h" #include "accessors.h" #include "uuid-tree.h" +#include "ioctl.h" static void btrfs_uuid_to_key(const u8 *uuid, u8 type, struct btrfs_key *key) { @@ -390,3 +392,180 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_uuid_scan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_key key; + struct btrfs_path *path = NULL; + int ret = 0; + struct extent_buffer *eb; + int slot; + struct btrfs_root_item root_item; + u32 item_size; + struct btrfs_trans_handle *trans = NULL; + bool closing = false; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + + while (1) { + if (btrfs_fs_closing(fs_info)) { + closing = true; + break; + } + ret = btrfs_search_forward(root, &key, path, + BTRFS_OLDEST_GENERATION); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + + if (key.type != BTRFS_ROOT_ITEM_KEY || + (key.objectid < BTRFS_FIRST_FREE_OBJECTID && + key.objectid != BTRFS_FS_TREE_OBJECTID) || + key.objectid > BTRFS_LAST_FREE_OBJECTID) + goto skip; + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + + read_extent_buffer(eb, &root_item, + btrfs_item_ptr_offset(eb, slot), + (int)sizeof(root_item)); + if (btrfs_root_refs(&root_item) == 0) + goto skip; + + if (!btrfs_is_empty_uuid(root_item.uuid) || + !btrfs_is_empty_uuid(root_item.received_uuid)) { + if (trans) + goto update_tree; + + btrfs_release_path(path); + /* + * 1 - subvol uuid item + * 1 - received_subvol uuid item + */ + trans = btrfs_start_transaction(fs_info->uuid_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + continue; + } else { + goto skip; + } +update_tree: + btrfs_release_path(path); + if (!btrfs_is_empty_uuid(root_item.uuid)) { + ret = btrfs_uuid_tree_add(trans, root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + + if 
(!btrfs_is_empty_uuid(root_item.received_uuid)) { + ret = btrfs_uuid_tree_add(trans, + root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + +skip: + btrfs_release_path(path); + if (trans) { + ret = btrfs_end_transaction(trans); + trans = NULL; + if (ret) + break; + } + + if (key.offset < (u64)-1) { + key.offset++; + } else if (key.type < BTRFS_ROOT_ITEM_KEY) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + } else if (key.objectid < (u64)-1) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.objectid++; + } else { + break; + } + cond_resched(); + } + +out: + btrfs_free_path(path); + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans); + if (ret) + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); + else if (!closing) + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + up(&fs_info->uuid_tree_rescan_sem); + return 0; +} + +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *uuid_root; + struct task_struct *task; + int ret; + + /* + * 1 - root node + * 1 - root item + */ + trans = btrfs_start_transaction(tree_root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); + if (IS_ERR(uuid_root)) { + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + fs_info->uuid_root = uuid_root; + + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error cases */ + btrfs_warn(fs_info, "failed to start uuid_scan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h index a3f5757cc7cf..c60ad20325cc 100644 --- a/fs/btrfs/uuid-tree.h +++ b/fs/btrfs/uuid-tree.h @@ -13,5 +13,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_uuid_scan_kthread(void *data); #endif diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 4042dd6437ae..e97ad824ae16 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -284,7 +284,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, * page and ignore dest, but it must still be non-NULL to avoid the * counting-only behavior. * @len: length in bytes to read - * @dest_page: copy into this page instead of the dest buffer + * @dest_folio: copy into this folio instead of the dest buffer * * Helper function to read items from the btree. This returns the number of * bytes read or < 0 for errors. We can return short reads if the items don't * @@ -294,7 +294,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, * Returns number of bytes read or a negative error code on failure. 
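 *
 * For example, the verity reader below fills a Merkle tree page by passing
 * both the folio's mapped address and the folio itself, so the copy goes
 * through a local kmap of that folio:
 *
 *   read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
 *                  folio_address(folio), PAGE_SIZE, folio);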
*/ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, - char *dest, u64 len, struct page *dest_page) + char *dest, u64 len, struct folio *dest_folio) { struct btrfs_path *path; struct btrfs_root *root = inode->root; @@ -314,7 +314,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, if (!path) return -ENOMEM; - if (dest_page) + if (dest_folio) path->reada = READA_FORWARD; key.objectid = btrfs_ino(inode); @@ -371,15 +371,15 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, copy_offset = offset - key.offset; if (dest) { - if (dest_page) - kaddr = kmap_local_page(dest_page); + if (dest_folio) + kaddr = kmap_local_folio(dest_folio, 0); data = btrfs_item_ptr(leaf, path->slots[0], void); read_extent_buffer(leaf, kaddr + dest_offset, (unsigned long)data + copy_offset, copy_bytes); - if (dest_page) + if (dest_folio) kunmap_local(kaddr); } @@ -460,7 +460,7 @@ static int rollback_verity(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; int ret; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size); clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); ret = btrfs_drop_verity_items(inode); @@ -585,7 +585,7 @@ static int btrfs_begin_enable_verity(struct file *filp) struct btrfs_trans_handle *trans; int ret; - ASSERT(inode_is_locked(file_inode(filp))); + btrfs_assert_inode_locked(inode); if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) return -EBUSY; @@ -633,7 +633,7 @@ static int btrfs_end_enable_verity(struct file *filp, const void *desc, int ret = 0; int rollback_ret; - ASSERT(inode_is_locked(file_inode(filp))); + btrfs_assert_inode_locked(inode); if (desc == NULL) goto rollback; @@ -762,7 +762,7 @@ again: * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] */ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, - folio_address(folio), PAGE_SIZE, &folio->page); + folio_address(folio), PAGE_SIZE, folio); if (ret < 0) { folio_put(folio); return ERR_PTR(ret); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fcedc43ef291..8f340ad1d938 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -476,6 +476,8 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, if (IS_ERR(*bdev_file)) { ret = PTR_ERR(*bdev_file); + btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d", + device_path, flags, ret); goto error; } bdev = file_bdev(*bdev_file); @@ -4784,183 +4786,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_uuid_scan_kthread(void *data) -{ - struct btrfs_fs_info *fs_info = data; - struct btrfs_root *root = fs_info->tree_root; - struct btrfs_key key; - struct btrfs_path *path = NULL; - int ret = 0; - struct extent_buffer *eb; - int slot; - struct btrfs_root_item root_item; - u32 item_size; - struct btrfs_trans_handle *trans = NULL; - bool closing = false; - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - key.objectid = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = 0; - - while (1) { - if (btrfs_fs_closing(fs_info)) { - closing = true; - break; - } - ret = btrfs_search_forward(root, &key, path, - BTRFS_OLDEST_GENERATION); - if (ret) { - if (ret > 0) - ret = 0; - break; - } - - if (key.type != BTRFS_ROOT_ITEM_KEY || - (key.objectid < BTRFS_FIRST_FREE_OBJECTID && - key.objectid != BTRFS_FS_TREE_OBJECTID) || - 
key.objectid > BTRFS_LAST_FREE_OBJECTID) - goto skip; - - eb = path->nodes[0]; - slot = path->slots[0]; - item_size = btrfs_item_size(eb, slot); - if (item_size < sizeof(root_item)) - goto skip; - - read_extent_buffer(eb, &root_item, - btrfs_item_ptr_offset(eb, slot), - (int)sizeof(root_item)); - if (btrfs_root_refs(&root_item) == 0) - goto skip; - - if (!btrfs_is_empty_uuid(root_item.uuid) || - !btrfs_is_empty_uuid(root_item.received_uuid)) { - if (trans) - goto update_tree; - - btrfs_release_path(path); - /* - * 1 - subvol uuid item - * 1 - received_subvol uuid item - */ - trans = btrfs_start_transaction(fs_info->uuid_root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - continue; - } else { - goto skip; - } -update_tree: - btrfs_release_path(path); - if (!btrfs_is_empty_uuid(root_item.uuid)) { - ret = btrfs_uuid_tree_add(trans, root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - key.objectid); - if (ret < 0) { - btrfs_warn(fs_info, "uuid_tree_add failed %d", - ret); - break; - } - } - - if (!btrfs_is_empty_uuid(root_item.received_uuid)) { - ret = btrfs_uuid_tree_add(trans, - root_item.received_uuid, - BTRFS_UUID_KEY_RECEIVED_SUBVOL, - key.objectid); - if (ret < 0) { - btrfs_warn(fs_info, "uuid_tree_add failed %d", - ret); - break; - } - } - -skip: - btrfs_release_path(path); - if (trans) { - ret = btrfs_end_transaction(trans); - trans = NULL; - if (ret) - break; - } - - if (key.offset < (u64)-1) { - key.offset++; - } else if (key.type < BTRFS_ROOT_ITEM_KEY) { - key.offset = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - } else if (key.objectid < (u64)-1) { - key.offset = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - key.objectid++; - } else { - break; - } - cond_resched(); - } - -out: - btrfs_free_path(path); - if (trans && !IS_ERR(trans)) - btrfs_end_transaction(trans); - if (ret) - btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); - else if (!closing) - set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); - up(&fs_info->uuid_tree_rescan_sem); - return 0; -} - -int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root *uuid_root; - struct task_struct *task; - int ret; - - /* - * 1 - root node - * 1 - root item - */ - trans = btrfs_start_transaction(tree_root, 2); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); - if (IS_ERR(uuid_root)) { - ret = PTR_ERR(uuid_root); - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; - } - - fs_info->uuid_root = uuid_root; - - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - down(&fs_info->uuid_tree_rescan_sem); - task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); - if (IS_ERR(task)) { - /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - btrfs_warn(fs_info, "failed to start uuid_scan task"); - up(&fs_info->uuid_tree_rescan_sem); - return PTR_ERR(task); - } - - return 0; -} - /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. 
@@ -5956,11 +5781,31 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) write_unlock(&fs_info->mapping_tree_lock); } +static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type); + + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 2; + + /* + * There could be two corrupted data stripes, we need to loop retry in + * order to rebuild the correct data. + * + * Fail a stripe at a time on every retry except the stripe under + * reconstruction. + */ + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return map->num_stripes; + + /* Non-RAID56, use their ncopies from btrfs_raid_array. */ + return btrfs_raid_array[index].ncopies; +} + int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct btrfs_chunk_map *map; - enum btrfs_raid_types index; - int ret = 1; + int ret; map = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(map)) @@ -5972,22 +5817,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) */ return 1; - index = btrfs_bg_flags_to_raid_index(map->type); - - /* Non-RAID56, use their ncopies from btrfs_raid_array. */ - if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) - ret = btrfs_raid_array[index].ncopies; - else if (map->type & BTRFS_BLOCK_GROUP_RAID5) - ret = 2; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - /* - * There could be two corrupted data stripes, we need - * to loop retry in order to rebuild the correct data. - * - * Fail a stripe at a time on every retry except the - * stripe under reconstruction. - */ - ret = map->num_stripes; + ret = btrfs_chunk_map_num_copies(map); btrfs_free_chunk_map(map); return ret; } @@ -6637,14 +6467,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom.stripe_index = 0; io_geom.op = op; - num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); - if (io_geom.mirror_num > num_copies) - return -EINVAL; - map = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(map)) return PTR_ERR(map); + num_copies = btrfs_chunk_map_num_copies(map); + if (io_geom.mirror_num > num_copies) + return -EINVAL; + map_offset = logical - map->start; io_geom.raid56_full_stripe_start = (u64)-1; max_len = btrfs_max_io_len(map, map_offset, &io_geom); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 37a09ebb34dd..03d2d60afe0c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -444,7 +444,7 @@ struct btrfs_io_stripe { /* Block mapping. */ u64 physical; u64 length; - bool is_scrub; + bool rst_search_commit_root; /* For the endio handler. */ struct btrfs_io_context *bioc; }; @@ -725,8 +725,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); -int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); -int btrfs_uuid_scan_kthread(void *data); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 738c7bb8ea7c..ce464cd8e0ac 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -120,7 +120,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, * locks the inode's i_mutex before calling setxattr or removexattr. 
*/ if (flags & XATTR_REPLACE) { - ASSERT(inode_is_locked(inode)); + btrfs_assert_inode_locked(BTRFS_I(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, 0); if (!di) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 30971dd741e2..100abc00b794 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -20,6 +20,8 @@ #include <linux/refcount.h> #include "btrfs_inode.h" #include "compression.h" +#include "fs.h" +#include "subpage.h" /* workspace buffer size for s390 zlib hardware support */ #define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE) @@ -108,6 +110,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; const unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const u64 orig_end = start + len; *out_folios = 0; *total_out = 0; @@ -153,6 +156,10 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, if (in_buf_folios > 1) { int i; + /* S390 hardware acceleration path, not subpage. */ + ASSERT(!btrfs_is_subpage( + inode_to_fs_info(mapping->host), + mapping)); for (i = 0; i < in_buf_folios; i++) { if (data_in) { kunmap_local(data_in); @@ -167,9 +174,14 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; + workspace->strm.avail_in = + (in_buf_folios << PAGE_SHIFT); } workspace->strm.next_in = workspace->buf; } else { + unsigned int pg_off; + unsigned int cur_len; + if (data_in) { kunmap_local(data_in); folio_put(in_folio); @@ -179,12 +191,13 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, start, &in_folio); if (ret < 0) goto out; - data_in = kmap_local_folio(in_folio, 0); + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + data_in = kmap_local_folio(in_folio, pg_off); start += PAGE_SIZE; workspace->strm.next_in = data_in; + workspace->strm.avail_in = cur_len; } - workspace->strm.avail_in = min(bytes_left, - (unsigned long) workspace->buf_size); } ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); @@ -380,7 +393,7 @@ done: } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -408,12 +421,12 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, ret = zlib_inflateInit2(&workspace->strm, wbits); if (unlikely(ret != Z_OK)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zlib decompression init failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page)); + folio_pos(dest_folio)); return -EIO; } @@ -426,16 +439,16 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, if (ret != Z_STREAM_END) goto out; - memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, to_copy); out: if (unlikely(to_copy != destlen)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zlib decompression failed, error %d root %llu inode %llu offset %llu decompressed %lu expected %zu", ret, 
btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page), to_copy, destlen); + folio_pos(dest_folio), to_copy, destlen); ret = -EIO; } else { ret = 0; @@ -444,7 +457,7 @@ out: zlib_inflateEnd(&workspace->strm); if (unlikely(to_copy < destlen)) - memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); + folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 66f63e82af79..7fa2920632ba 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -287,7 +287,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* The emulated zone size is determined from the size of device extent */ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct extent_buffer *leaf; @@ -304,28 +304,21 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; + return ret; /* No dev extents at all? Not good */ - if (ret > 0) { - ret = -EUCLEAN; - goto out; - } + if (ret > 0) + return -EUCLEAN; } leaf = path->nodes[0]; dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); - ret = 0; - -out: - btrfs_free_path(path); - - return ret; + return 0; } int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -1211,7 +1204,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; int ret; @@ -1246,7 +1239,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!ret) ret = -EUCLEAN; if (ret < 0) - goto out; + return ret; ret = btrfs_previous_extent_item(root, path, cache->start); if (ret) { @@ -1254,7 +1247,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, ret = 0; *offset_ret = 0; } - goto out; + return ret; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -1266,15 +1259,10 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!(found_key.objectid >= cache->start && found_key.objectid + length <= cache->start + cache->length)) { - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } *offset_ret = found_key.objectid + length - cache->start; - ret = 0; - -out: - btrfs_free_path(path); - return ret; + return 0; } struct zone_info { @@ -1406,6 +1394,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, return -EINVAL; } + bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); + if (zone_info[0].alloc_offset == WP_MISSING_DEV) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", @@ -1432,7 +1422,6 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, } bg->alloc_offset = zone_info[0].alloc_offset; - bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity); return 0; } @@ -1450,6 +1439,9 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, return -EINVAL; } + /* In case a device is missing we have a cap of 0, so don't use it. 
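+ * min_not_zero() picks the smaller of the two capacities while skipping
+ * zeroes, e.g. min_not_zero(0, SZ_256M) == SZ_256M, so a missing device
+ * (capacity 0) cannot clamp the block group's capacity to zero.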
*/ + bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); + for (i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV || zone_info[i].alloc_offset == WP_CONVENTIONAL) @@ -1471,9 +1463,6 @@ if (test_bit(0, active)) set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } - /* In case a device is missing we have a cap of 0, so don't use it. */ - bg->zone_capacity = min_not_zero(zone_info[0].capacity, - zone_info[1].capacity); } if (zone_info[0].alloc_offset != WP_MISSING_DEV) @@ -1563,6 +1552,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; + u64 profile; if (!btrfs_is_zoned(fs_info)) return 0; @@ -1623,7 +1613,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } } - switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + switch (profile) { case 0: /* single */ ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; @@ -1650,6 +1641,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && + profile != BTRFS_BLOCK_GROUP_RAID10) { + /* + * Detected broken write pointer. Make this block group + * unallocatable by setting the allocation pointer at the end of + * the allocatable region. Relocating this block group will fix + * the mismatch. + * + * Currently, we cannot handle the RAID0 or RAID10 cases like + * this because we don't have a proper zone_capacity value. But + * reading from this block group won't work anyway due to the + * missing stripe. 
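+ *
+ * (Setting the allocation pointer to the zone capacity below leaves
+ * zero allocatable bytes, which is what makes the block group
+ * unallocatable.)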
+ */ + cache->alloc_offset = cache->zone_capacity; + ret = 0; + } + out: /* Reject non SINGLE data profiles without RST */ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && @@ -2439,7 +2447,7 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_devices->device_list_mutex); } -bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 30b2e48a1cec..7612e6572605 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -89,7 +89,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); -bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length); int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); @@ -242,7 +242,7 @@ static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } -static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { return false; } diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 2a079561b2b1..866607fd3e58 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -389,7 +389,10 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long tot_out = 0; unsigned long len = *total_out; const unsigned long nr_dest_folios = *out_folios; + const u64 orig_end = start + len; unsigned long max_out = nr_dest_folios * PAGE_SIZE; + unsigned int pg_off; + unsigned int cur_len; zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, len); @@ -415,9 +418,11 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - workspace->in_buf.src = kmap_local_folio(in_folio, 0); + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.size = cur_len; /* Allocate and map in the output buffer */ out_folio = btrfs_alloc_compr_folio(); @@ -494,14 +499,16 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); - start += PAGE_SIZE; - len -= PAGE_SIZE; + start += cur_len; + len -= cur_len; ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - workspace->in_buf.src = kmap_local_folio(in_folio, 0); + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.size = cur_len; } } while (1) { @@ -649,11 +656,11 @@ done: } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct 
folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(folio_inode(dest_folio)->i_sb); const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; @@ -662,12 +669,12 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (unlikely(!stream)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zstd decompression init failed, root %llu inode %llu offset %llu", btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page)); + folio_pos(dest_folio)); ret = -EIO; goto finish; } @@ -686,21 +693,21 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, */ ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret))) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zstd decompression failed, error %d root %llu inode %llu offset %llu", zstd_get_error_code(ret), btrfs_root_id(inode->root), - btrfs_ino(inode), page_offset(dest_page)); + btrfs_ino(inode), folio_pos(dest_folio)); goto finish; } to_copy = workspace->out_buf.pos; - memcpy_to_page(dest_page, dest_pgoff, workspace->out_buf.dst, to_copy); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->out_buf.dst, to_copy); finish: /* Error or early end. */ if (unlikely(to_copy < destlen)) { ret = -EIO; - memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); + folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy); } return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index e55ad471c530..1fc9a50def0b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -774,12 +774,11 @@ EXPORT_SYMBOL(block_dirty_folio); static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) { struct buffer_head *bh; - struct list_head tmp; struct address_space *mapping; int err = 0, err2; struct blk_plug plug; + LIST_HEAD(tmp); - INIT_LIST_HEAD(&tmp); blk_start_plug(&plug); spin_lock(lock); @@ -958,12 +957,9 @@ no_grow: } EXPORT_SYMBOL_GPL(folio_alloc_buffers); -struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - bool retry) +struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size) { gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; - if (retry) - gfp |= __GFP_NOFAIL; return folio_alloc_buffers(page_folio(page), size, gfp); } @@ -2168,11 +2164,10 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, return err; } -int __block_write_begin(struct page *page, loff_t pos, unsigned len, +int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { - return __block_write_begin_int(page_folio(page), pos, len, get_block, - NULL); + return __block_write_begin_int(folio, pos, len, get_block, NULL); } EXPORT_SYMBOL(__block_write_begin); @@ -2222,33 +2217,33 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to) * The filesystem needs to handle block truncation upon failure. 
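 *
 * A minimal caller sketch under the new folio-based prototype; the
 * "foofs" name and its get_block callback are hypothetical:
 *
 *	static int foofs_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, struct folio **foliop, void **fsdata)
 *	{
 *		return block_write_begin(mapping, pos, len, foliop,
 *					 foofs_get_block);
 *	}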
*/ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, get_block_t *get_block) + struct folio **foliop, get_block_t *get_block) { pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; + struct folio *folio; int status; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - status = __block_write_begin(page, pos, len, get_block); + status = __block_write_begin_int(folio, pos, len, get_block, NULL); if (unlikely(status)) { - unlock_page(page); - put_page(page); - page = NULL; + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - *pagep = page; + *foliop = folio; return status; } EXPORT_SYMBOL(block_write_begin); int block_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); size_t start = pos - folio_pos(folio); if (unlikely(copied < len)) { @@ -2280,19 +2275,19 @@ EXPORT_SYMBOL(block_write_end); int generic_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool i_size_changed = false; - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); /* * No need to use i_size_read() here, the i_size cannot change under us * because we hold i_rwsem. * - * But it's important to update i_size while still holding page lock: + * But it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. 
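 *
 * Sketch of the ordering relied on here (an editorial summary, not part
 * of the patch):
 *
 *	folio_lock() ... i_size_write(inode, pos + copied) ... folio_unlock()
 *
 * Writeout must lock the folio itself, so by the time it can touch the
 * folio the enlarged i_size is already visible and freshly written data
 * beyond the old size cannot be zeroed.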
*/ if (pos + copied > inode->i_size) { @@ -2300,8 +2295,8 @@ int generic_write_end(struct file *file, struct address_space *mapping, i_size_changed = true; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); @@ -2467,7 +2462,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; - struct page *page; + struct folio *folio; void *fsdata = NULL; int err; @@ -2475,11 +2470,11 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) if (err) goto out; - err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata); + err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata); if (err) goto out; - err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata); + err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata); BUG_ON(err > 0); out: @@ -2493,7 +2488,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; const struct address_space_operations *aops = mapping->a_ops; unsigned int blocksize = i_blocksize(inode); - struct page *page; + struct folio *folio; void *fsdata = NULL; pgoff_t index, curidx; loff_t curpos; @@ -2512,12 +2507,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, len = PAGE_SIZE - zerofrom; err = aops->write_begin(file, mapping, curpos, len, - &page, &fsdata); + &folio, &fsdata); if (err) goto out; - zero_user(page, zerofrom, len); + folio_zero_range(folio, offset_in_folio(folio, curpos), len); err = aops->write_end(file, mapping, curpos, len, len, - page, fsdata); + folio, fsdata); if (err < 0) goto out; BUG_ON(err != len); @@ -2545,12 +2540,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, len = offset - zerofrom; err = aops->write_begin(file, mapping, curpos, len, - &page, &fsdata); + &folio, &fsdata); if (err) goto out; - zero_user(page, zerofrom, len); + folio_zero_range(folio, offset_in_folio(folio, curpos), len); err = aops->write_end(file, mapping, curpos, len, len, - page, fsdata); + folio, fsdata); if (err < 0) goto out; BUG_ON(err != len); @@ -2566,7 +2561,7 @@ out: */ int cont_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata, + struct folio **foliop, void **fsdata, get_block_t *get_block, loff_t *bytes) { struct inode *inode = mapping->host; @@ -2584,7 +2579,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping, (*bytes)++; } - return block_write_begin(mapping, pos, len, pagep, get_block); + return block_write_begin(mapping, pos, len, foliop, get_block); } EXPORT_SYMBOL(cont_write_begin); diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index a91acd03ee12..6a821a959b59 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -627,11 +627,12 @@ static void cachefiles_prepare_write_subreq(struct netfs_io_subrequest *subreq) { struct netfs_io_request *wreq = subreq->rreq; struct netfs_cache_resources *cres = &wreq->cache_resources; + struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; _enter("W=%x[%x] %llx", wreq->debug_id, subreq->debug_index, subreq->start); - subreq->max_len = MAX_RW_COUNT; - subreq->max_nr_segs = BIO_MAX_VECS; + stream->sreq_max_len = MAX_RW_COUNT; + stream->sreq_max_segs = BIO_MAX_VECS; if (!cachefiles_cres_file(cres)) { if 
(!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) @@ -647,6 +648,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) struct netfs_cache_resources *cres = &wreq->cache_resources; struct cachefiles_object *object = cachefiles_cres_object(cres); struct cachefiles_cache *cache = object->volume->cache; + struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; const struct cred *saved_cred; size_t off, pre, post, len = subreq->len; loff_t start = subreq->start; @@ -660,6 +662,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) if (off) { pre = CACHEFILES_DIO_BLOCK_SIZE - off; if (pre >= len) { + fscache_count_dio_misfit(); netfs_write_subrequest_terminated(subreq, len, false); return; } @@ -670,10 +673,22 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) } /* We also need to end on the cache granularity boundary */ + if (start + len == wreq->i_size) { + size_t part = len % CACHEFILES_DIO_BLOCK_SIZE; + size_t need = CACHEFILES_DIO_BLOCK_SIZE - part; + + if (part && stream->submit_extendable_to >= need) { + len += need; + subreq->len += need; + subreq->io_iter.count += need; + } + } + post = len & (CACHEFILES_DIO_BLOCK_SIZE - 1); if (post) { len -= post; if (len == 0) { + fscache_count_dio_misfit(); netfs_write_subrequest_terminated(subreq, post, false); return; } diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 4dd8a993c60a..7c6f260a3be5 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -64,9 +64,15 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object) memcpy(buf->data, fscache_get_aux(object->cookie), len); ret = cachefiles_inject_write_error(); - if (ret == 0) - ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, - buf, sizeof(struct cachefiles_xattr) + len, 0); + if (ret == 0) { + ret = mnt_want_write_file(file); + if (ret == 0) { + ret = vfs_setxattr(&nop_mnt_idmap, dentry, + cachefiles_xattr_cache, buf, + sizeof(struct cachefiles_xattr) + len, 0); + mnt_drop_write_file(file); + } + } if (ret < 0) { trace_cachefiles_vfs_error(object, file_inode(file), ret, cachefiles_trace_setxattr_error); @@ -151,8 +157,14 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, int ret; ret = cachefiles_inject_remove_error(); - if (ret == 0) - ret = vfs_removexattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache); + if (ret == 0) { + ret = mnt_want_write(cache->mnt); + if (ret == 0) { + ret = vfs_removexattr(&nop_mnt_idmap, dentry, + cachefiles_xattr_cache); + mnt_drop_write(cache->mnt); + } + } if (ret < 0) { trace_cachefiles_vfs_error(object, d_inode(dentry), ret, cachefiles_trace_remxattr_error); @@ -208,9 +220,15 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) memcpy(buf->data, p, volume->vcookie->coherency_len); ret = cachefiles_inject_write_error(); - if (ret == 0) - ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, - buf, len, 0); + if (ret == 0) { + ret = mnt_want_write(volume->cache->mnt); + if (ret == 0) { + ret = vfs_setxattr(&nop_mnt_idmap, dentry, + cachefiles_xattr_cache, + buf, len, 0); + mnt_drop_write(volume->cache->mnt); + } + } if (ret < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret, cachefiles_trace_setxattr_error); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c16bc5250ef..5d9ccda098cc 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -13,6 +13,7 @@ #include <linux/iversion.h> #include <linux/ktime.h> #include <linux/netfs.h> +#include <trace/events/netfs.h> 
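For reference, a standalone sketch of the block rounding that the
cachefiles_issue_write() hunk above performs; DIO_BLOCK_SIZE and the helper
name are local to this example and stand in for the real
CACHEFILES_DIO_BLOCK_SIZE handling:

	#include <stddef.h>

	#define DIO_BLOCK_SIZE 4096	/* stand-in for CACHEFILES_DIO_BLOCK_SIZE */

	/* Trim [start, start + len) down to its block-aligned middle. */
	static void dio_aligned_range(unsigned long long start, size_t len,
				      unsigned long long *astart, size_t *alen)
	{
		unsigned long long off = start & (DIO_BLOCK_SIZE - 1);
		size_t pre = off ? DIO_BLOCK_SIZE - off : 0;

		*astart = start;
		*alen = 0;
		if (pre >= len)		/* all leading slack: a DIO misfit */
			return;
		start += pre;		/* drop the misaligned "pre" bytes */
		len -= pre;
		len &= ~((size_t)DIO_BLOCK_SIZE - 1);	/* drop the "post" tail */
		*astart = start;
		*alen = len;
	}

(The hunk above additionally extends a tail that ends exactly at i_size out
to the next block boundary when stream->submit_extendable_to permits,
rather than trimming it.)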
#include "super.h" #include "mds_client.h" @@ -205,21 +206,6 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) } } -static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) -{ - struct inode *inode = subreq->rreq->inode; - struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); - struct ceph_inode_info *ci = ceph_inode(inode); - u64 objno, objoff; - u32 xlen; - - /* Truncate the extent at the end of the current block */ - ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, - &objno, &objoff, &xlen); - subreq->len = min(xlen, fsc->mount_options->rsize); - return true; -} - static void finish_netfs_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; @@ -246,7 +232,8 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (err >= 0) { if (sparse && err > 0) err = ceph_sparse_ext_map_end(op); - if (err < subreq->len) + if (err < subreq->len && + subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (IS_ENCRYPTED(inode) && err > 0) { err = ceph_fscrypt_decrypt_extents(inode, @@ -263,7 +250,12 @@ static void finish_netfs_read(struct ceph_osd_request *req) calc_pages_for(osd_data->alignment, osd_data->length), false); } - netfs_subreq_terminated(subreq, err, false); + if (err > 0) { + subreq->transferred = err; + err = 0; + } + trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress); + netfs_read_subreq_terminated(subreq, err, false); iput(req->r_inode); ceph_dec_osd_stopping_blocker(fsc->mdsc); } @@ -277,12 +269,12 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci = ceph_inode(inode); - struct iov_iter iter; ssize_t err = 0; size_t len; int mode; - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); if (subreq->start >= inode->i_size) @@ -299,6 +291,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA); req->r_num_caps = 2; + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) goto out; @@ -312,17 +305,36 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) } len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); - iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); - err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); - if (err == 0) + err = copy_to_iter(iinfo->inline_data + subreq->start, len, &subreq->io_iter); + if (err == 0) { err = -EFAULT; + } else { + subreq->transferred += err; + err = 0; + } ceph_mdsc_put_request(req); out: - netfs_subreq_terminated(subreq, err, false); + netfs_read_subreq_terminated(subreq, err, false); return true; } +static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct inode *inode = rreq->inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + u64 objno, objoff; + u32 xlen; + + /* Truncate the extent at the end of the current block */ + ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, + &objno, &objoff, &xlen); + rreq->io_streams[0].sreq_max_len = umin(xlen, 
fsc->mount_options->rsize); + return 0; +} + static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; @@ -332,9 +344,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct ceph_client *cl = fsc->client; struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); - struct iov_iter iter; - int err = 0; - u64 len = subreq->len; + int err; + u64 len; bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); u64 off = subreq->start; int extent_cnt; @@ -347,6 +358,12 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; + // TODO: This rounding here is slightly dodgy. It *should* work, for + // now, as the cache only deals in blocks that are a multiple of + // PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE. What needs to + // happen is for the fscrypt driving to be moved into netfslib and the + // data in the cache also to be stored encrypted. + len = subreq->len; ceph_fscrypt_adjust_off_and_len(inode, &off, &len); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, @@ -369,8 +386,6 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n", ceph_vinop(inode), subreq->start, subreq->len, len); - iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); - /* * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for * encrypted inodes. We'd need infrastructure that handles an iov_iter @@ -382,7 +397,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct page **pages; size_t page_off; - err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); + err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off); if (err < 0) { doutc(cl, "%llx.%llx failed to allocate pages, %d\n", ceph_vinop(inode), err); @@ -397,7 +412,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); } else { - osd_req_op_extent_osd_iter(req, 0, &iter); + osd_req_op_extent_osd_iter(req, 0, &subreq->io_iter); } if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { err = -EIO; @@ -408,22 +423,27 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) req->r_inode = inode; ihold(inode); + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); ceph_osdc_start_request(req->r_osdc, req); out: ceph_osdc_put_request(req); if (err) - netfs_subreq_terminated(subreq, err, false); + netfs_read_subreq_terminated(subreq, err, false); doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err); } static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) { struct inode *inode = rreq->inode; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = ceph_inode_to_client(inode); int got = 0, want = CEPH_CAP_FILE_CACHE; struct ceph_netfs_request_data *priv; int ret = 0; + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. 
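 * This replaces the per-inode flag that ceph_alloc_inode() used to set;
 * the matching removal appears in fs/ceph/inode.c further down:
 *
 *	before:	__set_bit(NETFS_ICTX_USE_PGPRIV2, &ci->netfs.flags);
 *	after:	__set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);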
*/ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + if (rreq->origin != NETFS_READAHEAD) return 0; @@ -467,6 +487,7 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) priv->caps = got; rreq->netfs_priv = priv; + rreq->io_streams[0].sreq_max_len = fsc->mount_options->rsize; out: if (ret < 0) @@ -491,13 +512,18 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq) const struct netfs_request_ops ceph_netfs_ops = { .init_request = ceph_init_request, .free_request = ceph_netfs_free_request, + .prepare_read = ceph_netfs_prepare_read, .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, - .clamp_length = ceph_netfs_clamp_length, .check_write_begin = ceph_netfs_check_write_begin, }; #ifdef CONFIG_CEPH_FSCACHE +static void ceph_set_page_fscache(struct page *page) +{ + folio_start_private_2(page_folio(page)); /* [DEPRECATED] */ +} + static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) { struct inode *inode = priv; @@ -515,6 +541,10 @@ static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, b ceph_fscache_write_terminated, inode, true, caching); } #else +static inline void ceph_set_page_fscache(struct page *page) +{ +} + static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) { } @@ -706,6 +736,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) len = wlen; set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); ceph_fscache_write_to_cache(inode, page_off, len, caching); if (IS_ENCRYPTED(inode)) { @@ -789,6 +821,8 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) return AOP_WRITEPAGE_ACTIVATE; } + folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ + err = writepage_nounlock(page, wbc); if (err == -ERESTARTSYS) { /* direct memory reclaimer was killed by SIGKILL. 
return 0 @@ -1062,7 +1096,8 @@ get_more_pages: unlock_page(page); break; } - if (PageWriteback(page)) { + if (PageWriteback(page) || + PagePrivate2(page) /* [DEPRECATED] */) { if (wbc->sync_mode == WB_SYNC_NONE) { doutc(cl, "%p under writeback\n", page); unlock_page(page); @@ -1070,6 +1105,7 @@ get_more_pages: } doutc(cl, "waiting on writeback %p\n", page); wait_on_page_writeback(page); + folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ } if (!clear_page_dirty_for_io(page)) { @@ -1254,6 +1290,8 @@ new_request: } set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); len += thp_size(page); } ceph_fscache_write_to_cache(inode, offset, len, caching); @@ -1486,20 +1524,18 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned */ static int ceph_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct folio *folio = NULL; int r; - r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); + r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, foliop, NULL); if (r < 0) return r; - folio_wait_private_2(folio); /* [DEPRECATED] */ - WARN_ON_ONCE(!folio_test_locked(folio)); - *pagep = &folio->page; + folio_wait_private_2(*foliop); /* [DEPRECATED] */ + WARN_ON_ONCE(!folio_test_locked(*foliop)); return 0; } @@ -1509,9 +1545,8 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, */ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *subpage, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(subpage); struct inode *inode = file_inode(file); struct ceph_client *cl = ceph_inode_to_client(inode); bool check_cap = false; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 18c72b305858..ddec8c9244ee 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -707,7 +707,6 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) if (offset != file->f_pos) { file->f_pos = offset; - file->f_version = 0; dfi->file_info.flags &= ~CEPH_F_ATEND; } retval = offset; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8f8de8f33abb..4a8eec46254b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -577,8 +577,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) /* Set parameters for the netfs library */ netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. 
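 * (This is the removal matching the per-request NETFS_RREQ_USE_PGPRIV2
 * flag now set in ceph_init_request(); see fs/ceph/addr.c above.)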
*/ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &ci->netfs.flags); spin_lock_init(&ci->i_ceph_lock); @@ -697,6 +695,7 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_state & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 6898dc621011..6896fce122e1 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -119,31 +119,43 @@ static const struct fs_parameter_spec coda_param_specs[] = { {} }; -static int coda_parse_fd(struct fs_context *fc, int fd) +static int coda_set_idx(struct fs_context *fc, struct file *file) { struct coda_fs_context *ctx = fc->fs_private; - struct fd f; struct inode *inode; int idx; - f = fdget(fd); - if (!f.file) - return -EBADF; - inode = file_inode(f.file); + inode = file_inode(file); if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) { - fdput(f); - return invalf(fc, "code: Not coda psdev"); + return invalf(fc, "coda: Not coda psdev"); } - idx = iminor(inode); - fdput(f); - if (idx < 0 || idx >= MAX_CODADEVS) return invalf(fc, "coda: Bad minor number"); ctx->idx = idx; return 0; } +static int coda_parse_fd(struct fs_context *fc, struct fs_parameter *param, + struct fs_parse_result *result) +{ + struct file *file; + int err; + + if (param->type == fs_value_is_file) { + file = param->file; + param->file = NULL; + } else { + file = fget(result->uint_32); + } + if (!file) + return -EBADF; + + err = coda_set_idx(fc, file); + fput(file); + return err; +} + static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; @@ -155,7 +167,7 @@ static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param) switch (opt) { case Opt_fd: - return coda_parse_fd(fc, result.uint_32); + return coda_parse_fd(fc, param, &result); } return 0; @@ -167,6 +179,7 @@ static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param) */ static int coda_parse_monolithic(struct fs_context *fc, void *_data) { + struct file *file; struct coda_mount_data *data = _data; if (!data) @@ -175,7 +188,11 @@ static int coda_parse_monolithic(struct fs_context *fc, void *_data) if (data->version != CODA_MOUNT_VERSION) return invalf(fc, "coda: Bad mount version"); - coda_parse_fd(fc, data->fd); + file = fget(data->fd); + if (file) { + coda_set_idx(fc, file); + fput(file); + } return 0; } diff --git a/fs/coredump.c b/fs/coredump.c index 7f12ff6ad1d3..53a78b6bbb5b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -18,6 +18,7 @@ #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/coredump.h> +#include <linux/sort.h> #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> @@ -464,7 +465,17 @@ static bool dump_interrupted(void) * but then we need to teach dump_write() to restart and clear * TIF_SIGPENDING. 
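 *
 * The expanded form below keeps the same predicate but reports which of
 * the two conditions fired. Callers still treat any true result as
 * "stop writing", as __dump_emit() further down does:
 *
 *	if (dump_interrupted())
 *		return 0;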
*/ - return fatal_signal_pending(current) || freezing(current); + if (fatal_signal_pending(current)) { + coredump_report_failure("interrupted: fatal signal pending"); + return true; + } + + if (freezing(current)) { + coredump_report_failure("interrupted: freezing"); + return true; + } + + return false; } static void wait_for_dump_helpers(struct file *file) @@ -519,7 +530,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) return err; } -void do_coredump(const kernel_siginfo_t *siginfo) +int do_coredump(const kernel_siginfo_t *siginfo) { struct core_state core_state; struct core_name cn; @@ -527,7 +538,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) struct linux_binfmt * binfmt; const struct cred *old_cred; struct cred *cred; - int retval = 0; + int retval; int ispipe; size_t *argv = NULL; int argc = 0; @@ -551,14 +562,20 @@ void do_coredump(const kernel_siginfo_t *siginfo) audit_core_dumps(siginfo->si_signo); binfmt = mm->binfmt; - if (!binfmt || !binfmt->core_dump) + if (!binfmt || !binfmt->core_dump) { + retval = -ENOEXEC; goto fail; - if (!__get_dumpable(cprm.mm_flags)) + } + if (!__get_dumpable(cprm.mm_flags)) { + retval = -EACCES; goto fail; + } cred = prepare_creds(); - if (!cred) + if (!cred) { + retval = -EPERM; goto fail; + } /* * We cannot trust fsuid as being the "true" uid of the process * nor do we know its entire history. We only know it was tainted @@ -586,8 +603,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) struct subprocess_info *sub_info; if (ispipe < 0) { - printk(KERN_WARNING "format_corename failed\n"); - printk(KERN_WARNING "Aborting core\n"); + coredump_report_failure("format_corename failed, aborting core"); + retval = ispipe; goto fail_unlock; } @@ -607,27 +624,24 @@ void do_coredump(const kernel_siginfo_t *siginfo) * right pid if a thread in a multi-threaded * core_pattern process dies. 
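 *
 * With do_coredump() now returning an int, each bail-out below pairs
 * its coredump_report_failure() with a distinct errno-style code
 * (-EPERM here, -E2BIG for the pipe limit, -ENOMEM on allocation
 * failure, and so on). A hypothetical caller could then do:
 *
 *	int err = do_coredump(siginfo);
 *	if (err)
 *		pr_debug("core dump not produced: %d\n", err);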
*/ - printk(KERN_WARNING - "Process %d(%s) has RLIMIT_CORE set to 1\n", - task_tgid_vnr(current), current->comm); - printk(KERN_WARNING "Aborting core\n"); + coredump_report_failure("RLIMIT_CORE is set to 1, aborting core"); + retval = -EPERM; goto fail_unlock; } cprm.limit = RLIM_INFINITY; dump_count = atomic_inc_return(&core_dump_count); if (core_pipe_limit && (core_pipe_limit < dump_count)) { - printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n", - task_tgid_vnr(current), current->comm); - printk(KERN_WARNING "Skipping core dump\n"); + coredump_report_failure("over core_pipe_limit, skipping core dump"); + retval = -E2BIG; goto fail_dropcount; } helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), GFP_KERNEL); if (!helper_argv) { - printk(KERN_WARNING "%s failed to allocate memory\n", - __func__); + coredump_report_failure("%s failed to allocate memory", __func__); + retval = -ENOMEM; goto fail_dropcount; } for (argi = 0; argi < argc; argi++) @@ -644,8 +658,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) kfree(helper_argv); if (retval) { - printk(KERN_INFO "Core dump to |%s pipe failed\n", - cn.corename); + coredump_report_failure("|%s pipe failed", cn.corename); goto close_fail; } } else { @@ -654,14 +667,16 @@ void do_coredump(const kernel_siginfo_t *siginfo) int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | O_LARGEFILE | O_EXCL; - if (cprm.limit < binfmt->min_coredump) + if (cprm.limit < binfmt->min_coredump) { + coredump_report_failure("over coredump resource limit, skipping core dump"); + retval = -E2BIG; goto fail_unlock; + } if (need_suid_safe && cn.corename[0] != '/') { - printk(KERN_WARNING "Pid %d(%s) can only dump core "\ - "to fully qualified path!\n", - task_tgid_vnr(current), current->comm); - printk(KERN_WARNING "Skipping core dump\n"); + coredump_report_failure( + "this process can only dump core to a fully qualified path, skipping core dump"); + retval = -EPERM; goto fail_unlock; } @@ -707,20 +722,28 @@ void do_coredump(const kernel_siginfo_t *siginfo) } else { cprm.file = filp_open(cn.corename, open_flags, 0600); } - if (IS_ERR(cprm.file)) + if (IS_ERR(cprm.file)) { + retval = PTR_ERR(cprm.file); goto fail_unlock; + } inode = file_inode(cprm.file); - if (inode->i_nlink > 1) + if (inode->i_nlink > 1) { + retval = -EMLINK; goto close_fail; - if (d_unhashed(cprm.file->f_path.dentry)) + } + if (d_unhashed(cprm.file->f_path.dentry)) { + retval = -EEXIST; goto close_fail; + } /* * AK: actually i see no reason to not allow this for named * pipes etc, but keep the previous behaviour for now. */ - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode)) { + retval = -EISDIR; goto close_fail; + } /* * Don't dump core if the filesystem changed owner or mode * of the file during file creation. 
This is an issue when @@ -730,19 +753,24 @@ void do_coredump(const kernel_siginfo_t *siginfo) idmap = file_mnt_idmap(cprm.file); if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) { - pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n", - cn.corename); + coredump_report_failure("Core dump to %s aborted: " + "cannot preserve file owner", cn.corename); + retval = -EPERM; goto close_fail; } if ((inode->i_mode & 0677) != 0600) { - pr_info_ratelimited("Core dump to %s aborted: cannot preserve file permissions\n", - cn.corename); + coredump_report_failure("Core dump to %s aborted: " + "cannot preserve file permissions", cn.corename); + retval = -EPERM; goto close_fail; } - if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) + if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) { + retval = -EACCES; goto close_fail; - if (do_truncate(idmap, cprm.file->f_path.dentry, - 0, 0, cprm.file)) + } + retval = do_truncate(idmap, cprm.file->f_path.dentry, + 0, 0, cprm.file); + if (retval) goto close_fail; } @@ -757,11 +785,16 @@ void do_coredump(const kernel_siginfo_t *siginfo) * have this set to NULL. */ if (!cprm.file) { - pr_info("Core dump to |%s disabled\n", cn.corename); + coredump_report_failure("Core dump to |%s disabled", cn.corename); + retval = -EPERM; goto close_fail; } - if (!dump_vma_snapshot(&cprm)) + if (!dump_vma_snapshot(&cprm)) { + coredump_report_failure("Can't get VMA snapshot for core dump |%s", + cn.corename); + retval = -EACCES; goto close_fail; + } file_start_write(cprm.file); core_dumped = binfmt->core_dump(&cprm); @@ -777,9 +810,21 @@ void do_coredump(const kernel_siginfo_t *siginfo) } file_end_write(cprm.file); free_vma_snapshot(&cprm); + } else { + coredump_report_failure("Core dump to %s%s has been interrupted", + ispipe ? "|" : "", cn.corename); + retval = -EAGAIN; + goto fail; } + coredump_report( + "written to %s%s: VMAs: %d, size %zu; core: %lld bytes, pos %lld", + ispipe ? 
"|" : "", cn.corename, + cprm.vma_count, cprm.vma_data_size, cprm.written, cprm.pos); if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); + + retval = 0; + close_fail: if (cprm.file) filp_close(cprm.file, NULL); @@ -794,7 +839,7 @@ fail_unlock: fail_creds: put_cred(cred); fail: - return; + return retval; } /* @@ -814,8 +859,16 @@ static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr) if (dump_interrupted()) return 0; n = __kernel_write(file, addr, nr, &pos); - if (n != nr) + if (n != nr) { + if (n < 0) + coredump_report_failure("failed when writing out, error %zd", n); + else + coredump_report_failure( + "partially written out, only %zd(of %d) bytes written", + n, nr); + return 0; + } file->f_pos = pos; cprm->written += n; cprm->pos += n; @@ -828,9 +881,16 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) static char zeroes[PAGE_SIZE]; struct file *file = cprm->file; if (file->f_mode & FMODE_LSEEK) { - if (dump_interrupted() || - vfs_llseek(file, nr, SEEK_CUR) < 0) + int ret; + + if (dump_interrupted()) + return 0; + + ret = vfs_llseek(file, nr, SEEK_CUR); + if (ret < 0) { + coredump_report_failure("failed when seeking, error %d", ret); return 0; + } cprm->pos += nr; return 1; } else { @@ -983,11 +1043,10 @@ void validate_coredump_safety(void) { if (suid_dumpable == SUID_DUMP_ROOT && core_pattern[0] != '/' && core_pattern[0] != '|') { - pr_warn( -"Unsafe core_pattern used with fs.suid_dumpable=2.\n" -"Pipe handler or fully qualified core dump path required.\n" -"Set kernel.core_pattern before fs.suid_dumpable.\n" - ); + + coredump_report_failure("Unsafe core_pattern used with fs.suid_dumpable=2: " + "pipe handler or fully qualified core dump path required. " + "Set kernel.core_pattern before fs.suid_dumpable."); } } @@ -1191,6 +1250,18 @@ static void free_vma_snapshot(struct coredump_params *cprm) } } +static int cmp_vma_size(const void *vma_meta_lhs_ptr, const void *vma_meta_rhs_ptr) +{ + const struct core_vma_metadata *vma_meta_lhs = vma_meta_lhs_ptr; + const struct core_vma_metadata *vma_meta_rhs = vma_meta_rhs_ptr; + + if (vma_meta_lhs->dump_size < vma_meta_rhs->dump_size) + return -1; + if (vma_meta_lhs->dump_size > vma_meta_rhs->dump_size) + return 1; + return 0; +} + /* * Under the mmap_lock, take a snapshot of relevant information about the task's * VMAs. @@ -1253,5 +1324,8 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) cprm->vma_data_size += m->dump_size; } + sort(cprm->vma_meta, cprm->vma_count, sizeof(*cprm->vma_meta), + cmp_vma_size, NULL); + return true; } diff --git a/fs/dcache.c b/fs/dcache.c index 3d8daaecb6d1..0f6b16ba30d0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -96,11 +96,16 @@ EXPORT_SYMBOL(dotdot_name); * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. + * + * Marking the variables "used" ensures that the compiler doesn't + * optimize them away completely on architectures with runtime + * constant infrastructure, this allows debuggers to see their + * values. But updating these values has no effect on those arches. 
*/ -static unsigned int d_hash_shift __ro_after_init; +static unsigned int d_hash_shift __ro_after_init __used; -static struct hlist_bl_head *dentry_hashtable __ro_after_init; +static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { @@ -1908,8 +1913,13 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) __d_instantiate(entry, inode); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees the bit cleared or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); + inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_instantiate_new); @@ -2163,9 +2173,6 @@ seqretry: * without taking d_lock and checking d_seq sequence count against @seq * returned here. * - * A refcount may be taken on the found dentry with the d_rcu_to_refcount - * function. - * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 91521576f500..66d9b3b4c588 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -89,12 +89,14 @@ enum { Opt_uid, Opt_gid, Opt_mode, + Opt_source, }; static const struct fs_parameter_spec debugfs_param_specs[] = { fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("mode", Opt_mode), fsparam_uid ("uid", Opt_uid), + fsparam_string ("source", Opt_source), {} }; @@ -126,6 +128,12 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_mode: opts->mode = result.uint_32 & S_IALLUGO; break; + case Opt_source: + if (fc->source) + return invalfc(fc, "Multiple sources specified"); + fc->source = param->string; + param->string = NULL; + break; /* * We might like to report bad mount options here; * but traditionally debugfs has ignored all mount options diff --git a/fs/direct-io.c b/fs/direct-io.c index b0aafe640fa4..bbd05f1a2145 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,7 +37,6 @@ #include <linux/rwsem.h> #include <linux/uio.h> #include <linux/atomic.h> -#include <linux/prefetch.h> #include "internal.h" @@ -1121,11 +1120,6 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct blk_plug plug; unsigned long align = offset | iov_iter_alignment(iter); - /* - * Avoid references to bdev if not absolutely needed to give - * the early prefetch in the caller enough time. - */ - /* watch out for a 0 len io from a tricksy fs */ if (iov_iter_rw(iter) == READ && !count) return 0; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index e2483acc4366..287e5d407f08 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -234,17 +234,17 @@ out: /* * Called with lower inode mutex held. 
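 *
 * The conversion below follows the usual page-to-folio replacements,
 * e.g.:
 *
 *	zero_user_segment(page, from, to)  ->  folio_zero_segment(folio, from, to)
 *	SetPageUptodate(page)              ->  folio_mark_uptodate(folio)
 *	ClearPageUptodate(page)            ->  folio_clear_uptodate(folio)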
*/ -static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) +static int fill_zeros_to_end_of_page(struct folio *folio, unsigned int to) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; int end_byte_in_page; - if ((i_size_read(inode) / PAGE_SIZE) != page->index) + if ((i_size_read(inode) / PAGE_SIZE) != folio->index) goto out; end_byte_in_page = i_size_read(inode) % PAGE_SIZE; if (to > end_byte_in_page) end_byte_in_page = to; - zero_user_segment(page, end_byte_in_page, PAGE_SIZE); + folio_zero_segment(folio, end_byte_in_page, PAGE_SIZE); out: return 0; } @@ -255,7 +255,7 @@ out: * @mapping: The eCryptfs object * @pos: The file offset at which to start writing * @len: Length of the write - * @pagep: Pointer to return the page + * @foliop: Pointer to return the folio * @fsdata: Pointer to return fs data (unused) * * This function must zero any hole we create @@ -265,38 +265,39 @@ out: static int ecryptfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; + struct folio *folio; loff_t prev_page_end_size; int rc = 0; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + *foliop = folio; prev_page_end_size = ((loff_t)index << PAGE_SHIFT); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(mapping->host)->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_read_lower_page_segment( - page, index, 0, PAGE_SIZE, mapping->host); + &folio->page, index, 0, PAGE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error attempting to read " "lower page segment; rc = [%d]\n", __func__, rc); - ClearPageUptodate(page); + folio_clear_uptodate(folio); goto out; } else - SetPageUptodate(page); + folio_mark_uptodate(folio); } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { rc = ecryptfs_copy_up_encrypted_with_header( - page, crypt_stat); + &folio->page, crypt_stat); if (rc) { printk(KERN_ERR "%s: Error attempting " "to copy the encrypted content " @@ -304,46 +305,46 @@ static int ecryptfs_write_begin(struct file *file, "inserting the metadata from " "the xattr into the header; rc " "= [%d]\n", __func__, rc); - ClearPageUptodate(page); + folio_clear_uptodate(folio); goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } else { rc = ecryptfs_read_lower_page_segment( - page, index, 0, PAGE_SIZE, + &folio->page, index, 0, PAGE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error reading " "page; rc = [%d]\n", __func__, rc); - ClearPageUptodate(page); + folio_clear_uptodate(folio); goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } } else { if (prev_page_end_size - >= i_size_read(page->mapping->host)) { - zero_user(page, 0, PAGE_SIZE); - SetPageUptodate(page); + >= i_size_read(mapping->host)) { + folio_zero_range(folio, 0, PAGE_SIZE); + folio_mark_uptodate(folio); } else if (len < PAGE_SIZE) { - rc = ecryptfs_decrypt_page(page); + rc = ecryptfs_decrypt_page(&folio->page); if (rc) { printk(KERN_ERR "%s: Error decrypting " "page at index [%ld]; " "rc = [%d]\n", - __func__, page->index, rc); - 
ClearPageUptodate(page); + __func__, folio->index, rc); + folio_clear_uptodate(folio); goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } } } /* If creating a page or more of holes, zero them out via truncate. * Note, this will increase i_size. */ if (index != 0) { - if (prev_page_end_size > i_size_read(page->mapping->host)) { + if (prev_page_end_size > i_size_read(mapping->host)) { rc = ecryptfs_truncate(file->f_path.dentry, prev_page_end_size); if (rc) { @@ -359,12 +360,11 @@ static int ecryptfs_write_begin(struct file *file, * of page? Zero it out. */ if ((i_size_read(mapping->host) == prev_page_end_size) && (pos != 0)) - zero_user(page, 0, PAGE_SIZE); + folio_zero_range(folio, 0, PAGE_SIZE); out: if (unlikely(rc)) { - unlock_page(page); - put_page(page); - *pagep = NULL; + folio_unlock(folio); + folio_put(folio); } return rc; } @@ -457,13 +457,13 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) * @pos: The file position * @len: The length of the data (unused) * @copied: The amount of data copied - * @page: The eCryptfs page + * @folio: The eCryptfs folio * @fsdata: The fsdata (unused) */ static int ecryptfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { pgoff_t index = pos >> PAGE_SHIFT; unsigned from = pos & (PAGE_SIZE - 1); @@ -476,8 +476,8 @@ static int ecryptfs_write_end(struct file *file, ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, - to); + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, + &folio->page, 0, to); if (!rc) { rc = copied; fsstack_copy_inode_size(ecryptfs_inode, @@ -485,21 +485,21 @@ static int ecryptfs_write_end(struct file *file, } goto out; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (copied < PAGE_SIZE) { rc = 0; goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } /* Fills in zeros if 'to' goes beyond inode size */ - rc = fill_zeros_to_end_of_page(page, to); + rc = fill_zeros_to_end_of_page(folio, to); if (rc) { ecryptfs_printk(KERN_WARNING, "Error attempting to fill " "zeros in page with index = [0x%.16lx]\n", index); goto out; } - rc = ecryptfs_encrypt_page(page); + rc = ecryptfs_encrypt_page(&folio->page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " "index [0x%.16lx])\n", index); @@ -518,8 +518,8 @@ static int ecryptfs_write_end(struct file *file, else rc = copied; out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return rc; } diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 7dcdce660cac..6ea60661fa55 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -74,6 +74,23 @@ config EROFS_FS_SECURITY If you are not using a security module, say N. +config EROFS_FS_BACKED_BY_FILE + bool "File-backed EROFS filesystem support" + depends on EROFS_FS + default y + help + This allows EROFS to use filesystem image files directly, without + the intercession of loopback block devices or likewise. It is + particularly useful for container images with numerous blobs and + other sandboxes, where loop devices behave intricately. It can also + be used to simplify error-prone lifetime management of unnecessary + virtual block devices. 
+ + Note that this feature, along with ongoing fanotify pre-content + hooks, will eventually replace "EROFS over fscache." + + If you don't want to enable this feature, say N. + config EROFS_FS_ZIP bool "EROFS Data Compression Support" depends on EROFS_FS @@ -128,7 +145,7 @@ config EROFS_FS_ZIP_ZSTD If unsure, say N. config EROFS_FS_ONDEMAND - bool "EROFS fscache-based on-demand read support" + bool "EROFS fscache-based on-demand read support (deprecated)" depends on EROFS_FS select NETFS_SUPPORT select FSCACHE @@ -138,6 +155,9 @@ config EROFS_FS_ONDEMAND This permits EROFS to use fscache-backed data blobs with on-demand read support. + It is now deprecated and scheduled to be removed from the kernel + after fanotify pre-content hooks are landed. + If unsure, say N. config EROFS_FS_PCPU_KTHREAD diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 097d672e6b14..4331d53c7109 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -7,4 +7,5 @@ erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o +erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 1b7eba38ba1e..61debd799cf9 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -59,8 +59,12 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { - if (erofs_is_fscache_mode(sb)) - buf->mapping = EROFS_SB(sb)->s_fscache->inode->i_mapping; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + if (erofs_is_fileio_mode(sbi)) + buf->mapping = file_inode(sbi->fdev)->i_mapping; + else if (erofs_is_fscache_mode(sb)) + buf->mapping = sbi->s_fscache->inode->i_mapping; else buf->mapping = sb->s_bdev->bd_mapping; } @@ -75,38 +79,28 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, static int erofs_map_blocks_flatmode(struct inode *inode, struct erofs_map_blocks *map) { - erofs_blk_t nblocks, lastblk; - u64 offset = map->m_la; struct erofs_inode *vi = EROFS_I(inode); struct super_block *sb = inode->i_sb; bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); + erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking; - nblocks = erofs_iblks(inode); - lastblk = nblocks - tailendpacking; - - /* there is no hole in flatmode */ - map->m_flags = EROFS_MAP_MAPPED; - if (offset < erofs_pos(sb, lastblk)) { + map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */ + if (map->m_la < erofs_pos(sb, lastblk)) { map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la; - map->m_plen = erofs_pos(sb, lastblk) - offset; - } else if (tailendpacking) { + map->m_plen = erofs_pos(sb, lastblk) - map->m_la; + } else { + DBG_BUGON(!tailendpacking); map->m_pa = erofs_iloc(inode) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(sb, offset); - map->m_plen = inode->i_size - offset; + vi->xattr_isize + erofs_blkoff(sb, map->m_la); + map->m_plen = inode->i_size - map->m_la; /* inline data should be located in the same meta block */ if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { - erofs_err(sb, "inline data cross block boundary @ nid %llu", - vi->nid); + erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } map->m_flags |= EROFS_MAP_META; - } else { - erofs_err(sb, "internal error @ nid: %llu (size %llu), 
m_la 0x%llx", - vi->nid, inode->i_size, map->m_la); - DBG_BUGON(1); - return -EIO; } return 0; } @@ -128,7 +122,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ map->m_flags = 0; - map->m_plen = 0; + map->m_plen = map->m_llen; goto out; } @@ -189,16 +183,34 @@ out: return err; } +static void erofs_fill_from_devinfo(struct erofs_map_dev *map, + struct erofs_device_info *dif) +{ + map->m_bdev = NULL; + map->m_fp = NULL; + if (dif->file) { + if (S_ISBLK(file_inode(dif->file)->i_mode)) + map->m_bdev = file_bdev(dif->file); + else + map->m_fp = dif->file; + } + map->m_daxdev = dif->dax_dev; + map->m_dax_part_off = dif->dax_part_off; + map->m_fscache = dif->fscache; +} + int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_device_info *dif; + erofs_off_t startoff, length; int id; map->m_bdev = sb->s_bdev; map->m_daxdev = EROFS_SB(sb)->dax_dev; map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; map->m_fscache = EROFS_SB(sb)->s_fscache; + map->m_fp = EROFS_SB(sb)->fdev; if (map->m_deviceid) { down_read(&devs->rwsem); @@ -212,29 +224,20 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - map->m_bdev = dif->bdev_file ? file_bdev(dif->bdev_file) : NULL; - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { - erofs_off_t startoff, length; - if (!dif->mapped_blkaddr) continue; + startoff = erofs_pos(sb, dif->mapped_blkaddr); length = erofs_pos(sb, dif->blocks); - if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - map->m_bdev = dif->bdev_file ? - file_bdev(dif->bdev_file) : NULL; - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, dif); break; } } @@ -243,6 +246,42 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) return 0; } +/* + * bit 30: I/O error occurred on this folio + * bit 0 - 29: remaining parts to complete this folio + */ +#define EROFS_ONLINEFOLIO_EIO (1 << 30) + +void erofs_onlinefolio_init(struct folio *folio) +{ + union { + atomic_t o; + void *v; + } u = { .o = ATOMIC_INIT(1) }; + + folio->private = u.v; /* valid only if file-backed folio is locked */ +} + +void erofs_onlinefolio_split(struct folio *folio) +{ + atomic_inc((atomic_t *)&folio->private); +} + +void erofs_onlinefolio_end(struct folio *folio, int err) +{ + int orig, v; + + do { + orig = atomic_read((atomic_t *)&folio->private); + v = (orig - 1) | (err ? 
EROFS_ONLINEFOLIO_EIO : 0); + } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); + + if (v & ~EROFS_ONLINEFOLIO_EIO) + return; + folio->private = 0; + folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO)); +} + static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { @@ -392,7 +431,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /* for uncompressed (aligned) files and raw access for other files */ -const struct address_space_operations erofs_raw_access_aops = { +const struct address_space_operations erofs_aops = { .read_folio = erofs_read_folio, .readahead = erofs_readahead, .bmap = erofs_bmap, diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index c2253b6a5416..eb318c7ddd80 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -539,7 +539,7 @@ int __init z_erofs_init_decompressor(void) for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) { err = z_erofs_decomp[i] ? z_erofs_decomp[i]->init() : 0; if (err) { - while (--i) + while (i--) if (z_erofs_decomp[i]) z_erofs_decomp[i]->exit(); return err; diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 2193a6710c8f..c3b90abdee37 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -8,19 +8,15 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, void *dentry_blk, struct erofs_dirent *de, - unsigned int nameoff, unsigned int maxsize) + unsigned int nameoff0, unsigned int maxsize) { - const struct erofs_dirent *end = dentry_blk + nameoff; + const struct erofs_dirent *end = dentry_blk + nameoff0; while (de < end) { - const char *de_name; + unsigned char d_type = fs_ftype_to_dtype(de->file_type); + unsigned int nameoff = le16_to_cpu(de->nameoff); + const char *de_name = (char *)dentry_blk + nameoff; unsigned int de_namelen; - unsigned char d_type; - - d_type = fs_ftype_to_dtype(de->file_type); - - nameoff = le16_to_cpu(de->nameoff); - de_name = (char *)dentry_blk + nameoff; /* the last dirent in the block? 
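 *
 * nameoff of the first dirent doubles as the start of the name area,
 * so "de + 1 >= end" identifies the final entry, whose name length is
 * bounded by the block rather than by a following entry's nameoff.
 * Roughly (a reconstruction; the computation itself is not shown in
 * this hunk):
 *
 *	de_namelen = (de + 1 >= end)
 *		? strnlen(de_name, maxsize - nameoff)
 *		: le16_to_cpu(de[1].nameoff) - nameoff;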
*/ if (de + 1 >= end) @@ -52,21 +48,20 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct super_block *sb = dir->i_sb; unsigned long bsz = sb->s_blocksize; - const size_t dirsize = i_size_read(dir); - unsigned int i = erofs_blknr(sb, ctx->pos); unsigned int ofs = erofs_blkoff(sb, ctx->pos); int err = 0; bool initial = true; buf.mapping = dir->i_mapping; - while (ctx->pos < dirsize) { + while (ctx->pos < dir->i_size) { + erofs_off_t dbstart = ctx->pos - ofs; struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, erofs_pos(sb, i), EROFS_KMAP); + de = erofs_bread(&buf, dbstart, EROFS_KMAP); if (IS_ERR(de)) { erofs_err(sb, "fail to readdir of logical block %u of nid %llu", - i, EROFS_I(dir)->nid); + erofs_blknr(sb, dbstart), EROFS_I(dir)->nid); err = PTR_ERR(de); break; } @@ -79,25 +74,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) break; } - maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz); - + maxsize = min_t(unsigned int, dir->i_size - dbstart, bsz); /* search dirents at the arbitrary position */ if (initial) { initial = false; - ofs = roundup(ofs, sizeof(struct erofs_dirent)); - ctx->pos = erofs_pos(sb, i) + ofs; - if (ofs >= nameoff) - goto skip_this; + ctx->pos = dbstart + ofs; } err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); if (err) break; -skip_this: - ctx->pos = erofs_pos(sb, i) + maxsize; - ++i; + ctx->pos = dbstart + maxsize; ofs = 0; } erofs_put_metabuf(&buf); diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 6c0c270c42e1..c8f2ae845bd2 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -288,9 +288,12 @@ struct erofs_dirent { #define EROFS_NAME_LEN 255 -/* maximum supported size of a physical compression cluster */ +/* maximum supported encoded size of a physical compressed cluster */ #define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) +/* maximum supported decoded size of a physical compressed cluster */ +#define Z_EROFS_PCLUSTER_MAX_DSIZE (12 * 1024 * 1024) + /* available compression algorithm types (for h_algorithmtype) */ enum { Z_EROFS_COMPRESSION_LZ4 = 0, diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c new file mode 100644 index 000000000000..3af96b1e2c2a --- /dev/null +++ b/fs/erofs/fileio.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024, Alibaba Cloud + */ +#include "internal.h" +#include <trace/events/erofs.h> + +struct erofs_fileio_rq { + struct bio_vec bvecs[BIO_MAX_VECS]; + struct bio bio; + struct kiocb iocb; +}; + +struct erofs_fileio { + struct erofs_map_blocks map; + struct erofs_map_dev dev; + struct erofs_fileio_rq *rq; +}; + +static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) +{ + struct erofs_fileio_rq *rq = + container_of(iocb, struct erofs_fileio_rq, iocb); + struct folio_iter fi; + + if (ret > 0) { + if (ret != rq->bio.bi_iter.bi_size) { + bio_advance(&rq->bio, ret); + zero_fill_bio(&rq->bio); + } + ret = 0; + } + if (rq->bio.bi_end_io) { + rq->bio.bi_end_io(&rq->bio); + } else { + bio_for_each_folio_all(fi, &rq->bio) { + DBG_BUGON(folio_test_uptodate(fi.folio)); + erofs_onlinefolio_end(fi.folio, ret); + } + } + bio_uninit(&rq->bio); + kfree(rq); +} + +static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) +{ + struct iov_iter iter; + int ret; + + if (!rq) + return; + rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; + rq->iocb.ki_ioprio = get_current_ioprio(); + rq->iocb.ki_complete = 
erofs_fileio_ki_complete; + rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) ? + IOCB_DIRECT : 0; + iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, + rq->bio.bi_iter.bi_size); + ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); + if (ret != -EIOCBQUEUED) + erofs_fileio_ki_complete(&rq->iocb, ret); +} + +static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) +{ + struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq), + GFP_KERNEL | __GFP_NOFAIL); + + bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); + rq->iocb.ki_filp = mdev->m_fp; + return rq; +} + +struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) +{ + return &erofs_fileio_rq_alloc(mdev)->bio; +} + +void erofs_fileio_submit_bio(struct bio *bio) +{ + return erofs_fileio_rq_submit(container_of(bio, struct erofs_fileio_rq, + bio)); +} + +static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) +{ + struct inode *inode = folio_inode(folio); + struct erofs_map_blocks *map = &io->map; + unsigned int cur = 0, end = folio_size(folio), len, attached = 0; + loff_t pos = folio_pos(folio), ofs; + struct iov_iter iter; + struct bio_vec bv; + int err = 0; + + erofs_onlinefolio_init(folio); + while (cur < end) { + if (!in_range(pos + cur, map->m_la, map->m_llen)) { + map->m_la = pos + cur; + map->m_llen = end - cur; + err = erofs_map_blocks(inode, map); + if (err) + break; + } + + ofs = folio_pos(folio) + cur - map->m_la; + len = min_t(loff_t, map->m_llen - ofs, end - cur); + if (map->m_flags & EROFS_MAP_META) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *src; + + src = erofs_read_metabuf(&buf, inode->i_sb, + map->m_pa + ofs, EROFS_KMAP); + if (IS_ERR(src)) { + err = PTR_ERR(src); + break; + } + bvec_set_folio(&bv, folio, len, cur); + iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len); + if (copy_to_iter(src, len, &iter) != len) { + erofs_put_metabuf(&buf); + err = -EIO; + break; + } + erofs_put_metabuf(&buf); + } else if (!(map->m_flags & EROFS_MAP_MAPPED)) { + folio_zero_segment(folio, cur, cur + len); + attached = 0; + } else { + if (io->rq && (map->m_pa + ofs != io->dev.m_pa || + map->m_deviceid != io->dev.m_deviceid)) { +io_retry: + erofs_fileio_rq_submit(io->rq); + io->rq = NULL; + } + + if (!io->rq) { + io->dev = (struct erofs_map_dev) { + .m_pa = io->map.m_pa + ofs, + .m_deviceid = io->map.m_deviceid, + }; + err = erofs_map_dev(inode->i_sb, &io->dev); + if (err) + break; + io->rq = erofs_fileio_rq_alloc(&io->dev); + io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9; + attached = 0; + } + if (!attached++) + erofs_onlinefolio_split(folio); + if (!bio_add_folio(&io->rq->bio, folio, len, cur)) + goto io_retry; + io->dev.m_pa += len; + } + cur += len; + } + erofs_onlinefolio_end(folio, err); + return err; +} + +static int erofs_fileio_read_folio(struct file *file, struct folio *folio) +{ + struct erofs_fileio io = {}; + int err; + + trace_erofs_read_folio(folio, true); + err = erofs_fileio_scan_folio(&io, folio); + erofs_fileio_rq_submit(io.rq); + return err; +} + +static void erofs_fileio_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct erofs_fileio io = {}; + struct folio *folio; + int err; + + trace_erofs_readpages(inode, readahead_index(rac), + readahead_count(rac), true); + while ((folio = readahead_folio(rac))) { + err = erofs_fileio_scan_folio(&io, folio); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, 
EROFS_I(inode)->nid); + } + erofs_fileio_rq_submit(io.rq); +} + +const struct address_space_operations erofs_fileio_aops = { + .read_folio = erofs_fileio_read_folio, + .readahead = erofs_fileio_readahead, +}; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 43c09aae2afc..db29190656eb 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -5,11 +5,26 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" - #include <trace/events/erofs.h> -static void *erofs_read_inode(struct erofs_buf *buf, - struct inode *inode, unsigned int *ofs) +static int erofs_fill_symlink(struct inode *inode, void *kaddr, + unsigned int m_pofs) +{ + struct erofs_inode *vi = EROFS_I(inode); + loff_t off; + + m_pofs += vi->xattr_isize; + /* check if it cannot be handled with fast symlink scheme */ + if (vi->datalayout != EROFS_INODE_FLAT_INLINE || + check_add_overflow(m_pofs, inode->i_size, &off) || + off > i_blocksize(inode)) + return 0; + + inode->i_link = kmemdup_nul(kaddr + m_pofs, inode->i_size, GFP_KERNEL); + return inode->i_link ? 0 : -ENOMEM; +} + +static int erofs_read_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -20,20 +35,21 @@ static void *erofs_read_inode(struct erofs_buf *buf, struct erofs_inode_compact *dic; struct erofs_inode_extended *die, *copied = NULL; union erofs_inode_i_u iu; - unsigned int ifmt; - int err; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + unsigned int ifmt, ofs; + int err = 0; blkaddr = erofs_blknr(sb, inode_loc); - *ofs = erofs_blkoff(sb, inode_loc); + ofs = erofs_blkoff(sb, inode_loc); - kaddr = erofs_read_metabuf(buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP); + kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP); if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", vi->nid, PTR_ERR(kaddr)); - return kaddr; + return PTR_ERR(kaddr); } - dic = kaddr + *ofs; + dic = kaddr + ofs; ifmt = le16_to_cpu(dic->i_format); if (ifmt & ~EROFS_I_ALL) { erofs_err(sb, "unsupported i_format %u of nid %llu", @@ -54,11 +70,11 @@ static void *erofs_read_inode(struct erofs_buf *buf, case EROFS_INODE_LAYOUT_EXTENDED: vi->inode_isize = sizeof(struct erofs_inode_extended); /* check if the extended inode acrosses block boundary */ - if (*ofs + vi->inode_isize <= sb->s_blocksize) { - *ofs += vi->inode_isize; + if (ofs + vi->inode_isize <= sb->s_blocksize) { + ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; } else { - const unsigned int gotten = sb->s_blocksize - *ofs; + const unsigned int gotten = sb->s_blocksize - ofs; copied = kmalloc(vi->inode_isize, GFP_KERNEL); if (!copied) { @@ -66,16 +82,16 @@ static void *erofs_read_inode(struct erofs_buf *buf, goto err_out; } memcpy(copied, dic, gotten); - kaddr = erofs_read_metabuf(buf, sb, erofs_pos(sb, blkaddr + 1), + kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1), EROFS_KMAP); if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", vi->nid, PTR_ERR(kaddr)); kfree(copied); - return kaddr; + return PTR_ERR(kaddr); } - *ofs = vi->inode_isize - gotten; - memcpy((u8 *)copied + gotten, kaddr, *ofs); + ofs = vi->inode_isize - gotten; + memcpy((u8 *)copied + gotten, kaddr, ofs); die = copied; } vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); @@ -91,11 +107,10 @@ static void *erofs_read_inode(struct erofs_buf *buf, inode->i_size = le64_to_cpu(die->i_size); kfree(copied); - copied = NULL; break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = 
sizeof(struct erofs_inode_compact); - *ofs += vi->inode_isize; + ofs += vi->inode_isize; vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); inode->i_mode = le16_to_cpu(dic->i_mode); @@ -115,11 +130,21 @@ static void *erofs_read_inode(struct erofs_buf *buf, goto err_out; } + if (unlikely(inode->i_size < 0)) { + erofs_err(sb, "negative i_size @ nid %llu", vi->nid); + err = -EFSCORRUPTED; + goto err_out; + } switch (inode->i_mode & S_IFMT) { case S_IFREG: case S_IFDIR: case S_IFLNK: vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr); + if(S_ISLNK(inode->i_mode)) { + err = erofs_fill_symlink(inode, kaddr, ofs); + if (err) + goto err_out; + } break; case S_IFCHR: case S_IFBLK: @@ -165,65 +190,23 @@ static void *erofs_read_inode(struct erofs_buf *buf, inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; else inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); - return kaddr; - err_out: - DBG_BUGON(1); - kfree(copied); - erofs_put_metabuf(buf); - return ERR_PTR(err); -} - -static int erofs_fill_symlink(struct inode *inode, void *kaddr, - unsigned int m_pofs) -{ - struct erofs_inode *vi = EROFS_I(inode); - unsigned int bsz = i_blocksize(inode); - char *lnk; - - /* if it cannot be handled with fast symlink scheme */ - if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= bsz || inode->i_size < 0) { - inode->i_op = &erofs_symlink_iops; - return 0; - } - - lnk = kmalloc(inode->i_size + 1, GFP_KERNEL); - if (!lnk) - return -ENOMEM; - - m_pofs += vi->xattr_isize; - /* inline symlink data shouldn't cross block boundary */ - if (m_pofs + inode->i_size > bsz) { - kfree(lnk); - erofs_err(inode->i_sb, - "inline data cross block boundary @ nid %llu", - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } - memcpy(lnk, kaddr + m_pofs, inode->i_size); - lnk[inode->i_size] = '\0'; - - inode->i_link = lnk; - inode->i_op = &erofs_fast_symlink_iops; - return 0; + DBG_BUGON(err); + erofs_put_metabuf(&buf); + return err; } static int erofs_fill_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - void *kaddr; - unsigned int ofs; - int err = 0; + int err; trace_erofs_fill_inode(inode); /* read inode base data from disk */ - kaddr = erofs_read_inode(&buf, inode, &ofs); - if (IS_ERR(kaddr)) - return PTR_ERR(kaddr); + err = erofs_read_inode(inode); + if (err) + return err; /* setup the new inode */ switch (inode->i_mode & S_IFMT) { @@ -240,9 +223,10 @@ static int erofs_fill_inode(struct inode *inode) inode_nohighmem(inode); break; case S_IFLNK: - err = erofs_fill_symlink(inode, kaddr, ofs); - if (err) - goto out_unlock; + if (inode->i_link) + inode->i_op = &erofs_fast_symlink_iops; + else + inode->i_op = &erofs_symlink_iops; inode_nohighmem(inode); break; case S_IFCHR: @@ -251,33 +235,33 @@ static int erofs_fill_inode(struct inode *inode) case S_IFSOCK: inode->i_op = &erofs_generic_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); - goto out_unlock; + return 0; default: - err = -EFSCORRUPTED; - goto out_unlock; + return -EFSCORRUPTED; } + mapping_set_large_folios(inode->i_mapping); if (erofs_inode_is_data_compressed(vi->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, erofs_info, inode->i_sb, "EXPERIMENTAL EROFS subpage compressed block support in use. 
Use at your own risk!"); inode->i_mapping->a_ops = &z_erofs_aops; - err = 0; - goto out_unlock; -#endif +#else err = -EOPNOTSUPP; - goto out_unlock; - } - inode->i_mapping->a_ops = &erofs_raw_access_aops; - mapping_set_large_folios(inode->i_mapping); +#endif + } else { + inode->i_mapping->a_ops = &erofs_aops; #ifdef CONFIG_EROFS_FS_ONDEMAND - if (erofs_is_fscache_mode(inode->i_sb)) - inode->i_mapping->a_ops = &erofs_fscache_access_aops; + if (erofs_is_fscache_mode(inode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; #endif +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) + inode->i_mapping->a_ops = &erofs_fileio_aops; +#endif + } -out_unlock: - erofs_put_metabuf(&buf); return err; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 736607675396..4efd578d7c62 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -49,7 +49,7 @@ typedef u32 erofs_blk_t; struct erofs_device_info { char *path; struct erofs_fscache *fscache; - struct file *bdev_file; + struct file *file; struct dax_device *dax_dev; u64 dax_part_off; @@ -130,6 +130,7 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ + struct file *fdev; struct inode *packed_inode; struct erofs_dev_context *devs; struct dax_device *dax_dev; @@ -190,9 +191,15 @@ struct erofs_sb_info { #define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) #define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) +static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) +{ + return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev; +} + static inline bool erofs_is_fscache_mode(struct super_block *sb) { - return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev; + return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && + !erofs_is_fileio_mode(EROFS_SB(sb)) && !sb->s_bdev; } enum { @@ -220,7 +227,7 @@ struct erofs_buf { }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) -#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits) +#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits)) #define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) #define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) #define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) @@ -365,6 +372,7 @@ struct erofs_map_dev { struct erofs_fscache *m_fscache; struct block_device *m_bdev; struct dax_device *m_daxdev; + struct file *m_fp; u64 m_dax_part_off; erofs_off_t m_pa; @@ -373,7 +381,8 @@ struct erofs_map_dev { extern const struct super_operations erofs_sops; -extern const struct address_space_operations erofs_raw_access_aops; +extern const struct address_space_operations erofs_aops; +extern const struct address_space_operations erofs_fileio_aops; extern const struct address_space_operations z_erofs_aops; extern const struct address_space_operations erofs_fscache_access_aops; @@ -404,6 +413,9 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); +void erofs_onlinefolio_init(struct folio *folio); +void erofs_onlinefolio_split(struct folio *folio); +void erofs_onlinefolio_end(struct folio *folio, int err); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 
request_mask, @@ -477,6 +489,14 @@ static inline void z_erofs_exit_subsystem(void) {} static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE +struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev); +void erofs_fileio_submit_bio(struct bio *bio); +#else +static inline struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) { return NULL; } +static inline void erofs_fileio_submit_bio(struct bio *bio) {} +#endif + #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 32ce5b35e1df..666873f745da 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -10,6 +10,7 @@ #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/exportfs.h> +#include <linux/backing-dev.h> #include "xattr.h" #define CREATE_TRACE_POINTS @@ -108,22 +109,6 @@ static void erofs_free_inode(struct inode *inode) kmem_cache_free(erofs_inode_cachep, vi); } -static bool check_layout_compatibility(struct super_block *sb, - struct erofs_super_block *dsb) -{ - const unsigned int feature = le32_to_cpu(dsb->feature_incompat); - - EROFS_SB(sb)->feature_incompat = feature; - - /* check if current kernel meets all mandatory requirements */ - if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { - erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", - feature & ~EROFS_ALL_FEATURE_INCOMPAT); - return false; - } - return true; -} - /* read variable-sized metadata, offset will be aligned by 4-byte */ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp) @@ -177,7 +162,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; - struct file *bdev_file; + struct file *file; dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP); if (IS_ERR(dis)) @@ -199,13 +184,17 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev_file = bdev_file_open_by_path(dif->path, BLK_OPEN_READ, - sb->s_type, NULL); - if (IS_ERR(bdev_file)) - return PTR_ERR(bdev_file); - dif->bdev_file = bdev_file; - dif->dax_dev = fs_dax_get_by_bdev(file_bdev(bdev_file), - &dif->dax_part_off, NULL, NULL); + file = erofs_is_fileio_mode(sbi) ? 
+			filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) :
+			bdev_file_open_by_path(dif->path,
+				BLK_OPEN_READ, sb->s_type, NULL);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+
+		dif->file = file;
+		if (!erofs_is_fileio_mode(sbi))
+			dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file),
+					&dif->dax_part_off, NULL, NULL);
 	}
 
 	dif->blocks = le32_to_cpu(dis->blocks);
@@ -279,7 +268,7 @@ static int erofs_scan_devices(struct super_block *sb,
 
 static int erofs_read_superblock(struct super_block *sb)
 {
-	struct erofs_sb_info *sbi;
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
 	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
 	struct erofs_super_block *dsb;
 	void *data;
@@ -291,9 +280,7 @@ static int erofs_read_superblock(struct super_block *sb)
 		return PTR_ERR(data);
 	}
 
-	sbi = EROFS_SB(sb);
 	dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
 
-	ret = -EINVAL;
 	if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) {
 		erofs_err(sb, "cannot find valid erofs superblock");
@@ -318,8 +305,12 @@
 	}
 
 	ret = -EINVAL;
-	if (!check_layout_compatibility(sb, dsb))
+	sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat);
+	if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) {
+		erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel",
+			  sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT);
 		goto out;
+	}
 
 	sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE;
 	if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) {
@@ -362,7 +353,7 @@
 	ret = erofs_scan_devices(sb, dsb);
 
 	if (erofs_is_fscache_mode(sb))
-		erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
+		erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. 
Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; @@ -580,15 +571,16 @@ static void erofs_set_sysfs_name(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if (erofs_is_fscache_mode(sb)) { - if (sbi->domain_id) - super_set_sysfs_name_generic(sb, "%s,%s",sbi->domain_id, - sbi->fsid); - else - super_set_sysfs_name_generic(sb, "%s", sbi->fsid); - return; - } - super_set_sysfs_name_id(sb); + if (sbi->domain_id) + super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id, + sbi->fsid); + else if (sbi->fsid) + super_set_sysfs_name_generic(sb, "%s", sbi->fsid); + else if (erofs_is_fileio_mode(sbi)) + super_set_sysfs_name_generic(sb, "%s", + bdi_dev_name(sb->s_bdi)); + else + super_set_sysfs_name_id(sb); } static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) @@ -603,14 +595,15 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &erofs_sops; sbi->blkszbits = PAGE_SHIFT; - if (erofs_is_fscache_mode(sb)) { + if (!sb->s_bdev) { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; - err = erofs_fscache_register_fs(sb); - if (err) - return err; - + if (erofs_is_fscache_mode(sb)) { + err = erofs_fscache_register_fs(sb); + if (err) + return err; + } err = super_setup_bdi(sb); if (err) return err; @@ -658,7 +651,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; - erofs_set_sysfs_name(sb); #ifdef CONFIG_EROFS_FS_ZIP xa_init(&sbi->managed_pslots); @@ -696,6 +688,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; + erofs_set_sysfs_name(sb); err = erofs_register_sysfs(sb); if (err) return err; @@ -707,11 +700,24 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; + int ret; if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); - return get_tree_bdev(fc, erofs_fc_fill_super); + ret = get_tree_bdev(fc, erofs_fc_fill_super); +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (ret == -ENOTBLK) { + if (!fc->source) + return invalf(fc, "No source specified"); + sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(sbi->fdev)) + return PTR_ERR(sbi->fdev); + + return get_tree_nodev(fc, erofs_fc_fill_super); + } +#endif + return ret; } static int erofs_fc_reconfigure(struct fs_context *fc) @@ -741,8 +747,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); - if (dif->bdev_file) - fput(dif->bdev_file); + if (dif->file) + fput(dif->file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); @@ -805,7 +811,7 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) + if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev) kill_anon_super(sb); else kill_block_super(sb); @@ -815,6 +821,8 @@ static void erofs_kill_sb(struct super_block *sb) erofs_fscache_unregister_fs(sb); kfree(sbi->fsid); kfree(sbi->domain_id); + if (sbi->fdev) + fput(sbi->fdev); kfree(sbi); sb->s_fs_info = NULL; } @@ -917,7 +925,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EROFS_NAME_LEN; if (uuid_is_null(&sb->s_uuid)) - buf->f_fsid = 
u64_to_fsid(erofs_is_fscache_mode(sb) ? 0 :
+		buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 :
 				huge_encode_dev(sb->s_bdev->bd_dev));
 	else
 		buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index 435e515c0792..63cffd0fd261 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -205,34 +205,16 @@ static struct kobject erofs_feat = {
 int erofs_register_sysfs(struct super_block *sb)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
-	char *name;
-	char *str = NULL;
 	int err;
 
-	if (erofs_is_fscache_mode(sb)) {
-		if (sbi->domain_id) {
-			str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id,
-					sbi->fsid);
-			if (!str)
-				return -ENOMEM;
-			name = str;
-		} else {
-			name = sbi->fsid;
-		}
-	} else {
-		name = sb->s_id;
-	}
 	sbi->s_kobj.kset = &erofs_root;
 	init_completion(&sbi->s_kobj_unregister);
-	err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name);
-	kfree(str);
-	if (err)
-		goto put_sb_kobj;
-	return 0;
-
-put_sb_kobj:
-	kobject_put(&sbi->s_kobj);
-	wait_for_completion(&sbi->s_kobj_unregister);
+	err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s",
+				   sb->s_sysfs_name);
+	if (err) {
+		kobject_put(&sbi->s_kobj);
+		wait_for_completion(&sbi->s_kobj_unregister);
+	}
 	return err;
 }
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 424f656cd765..8936790618c6 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -122,42 +122,6 @@ static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
 	return fo->mapping == MNGD_MAPPING(sbi);
 }
 
-/*
- * bit 30: I/O error occurred on this folio
- * bit 0 - 29: remaining parts to complete this folio
- */
-#define Z_EROFS_FOLIO_EIO	(1 << 30)
-
-static void z_erofs_onlinefolio_init(struct folio *folio)
-{
-	union {
-		atomic_t o;
-		void *v;
-	} u = { .o = ATOMIC_INIT(1) };
-
-	folio->private = u.v;  /* valid only if file-backed folio is locked */
-}
-
-static void z_erofs_onlinefolio_split(struct folio *folio)
-{
-	atomic_inc((atomic_t *)&folio->private);
-}
-
-static void z_erofs_onlinefolio_end(struct folio *folio, int err)
-{
-	int orig, v;
-
-	do {
-		orig = atomic_read((atomic_t *)&folio->private);
-		v = (orig - 1) | (err ? 
Z_EROFS_FOLIO_EIO : 0); - } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); - - if (v & ~Z_EROFS_FOLIO_EIO) - return; - folio->private = 0; - folio_end_read(folio, !(v & Z_EROFS_FOLIO_EIO)); -} - #define Z_EROFS_ONSTACK_PAGES 32 /* @@ -232,7 +196,8 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct page *nextpage = *candidate_bvpage; if (!nextpage) { - nextpage = erofs_allocpage(pagepool, GFP_KERNEL); + nextpage = __erofs_allocpage(pagepool, GFP_KERNEL, + true); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); @@ -965,7 +930,7 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f, int err = 0; tight = (bs == PAGE_SIZE); - z_erofs_onlinefolio_init(folio); + erofs_onlinefolio_init(folio); do { if (offset + end - 1 < map->m_la || offset + end - 1 >= map->m_la + map->m_llen) { @@ -1024,7 +989,7 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f, if (err) break; - z_erofs_onlinefolio_split(folio); + erofs_onlinefolio_split(folio); if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) f->pcl->multibases = true; if (f->pcl->length < offset + end - map->m_la) { @@ -1044,7 +1009,7 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f, tight = (bs == PAGE_SIZE); } } while ((end = cur) > 0); - z_erofs_onlinefolio_end(folio, err); + erofs_onlinefolio_end(folio, err); return err; } @@ -1147,7 +1112,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, cur += len; } kunmap_local(dst); - z_erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); + erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); list_del(p); kfree(bvi); } @@ -1190,9 +1155,10 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; struct page *page = bvec->page; - /* compressed data ought to be valid before decompressing */ - if (!page) { - err = -EIO; + /* compressed data ought to be valid when decompressing */ + if (IS_ERR(page) || !page) { + bvec->page = NULL; /* clear the failure reason */ + err = page ? PTR_ERR(page) : -EIO; continue; } be->compressed_pages[i] = page; @@ -1268,8 +1234,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .inplace_io = overlapped, .partial_decoding = pcl->partial, .fillgaps = pcl->multibases, - .gfp = pcl->besteffort ? - GFP_KERNEL | __GFP_NOFAIL : + .gfp = pcl->besteffort ? GFP_KERNEL : GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); @@ -1302,7 +1267,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { - z_erofs_onlinefolio_end(page_folio(page), err); + erofs_onlinefolio_end(page_folio(page), err); continue; } if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) { @@ -1333,8 +1298,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, return err; } -static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, - struct page **pagepool) +static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, + struct page **pagepool) { struct z_erofs_decompress_backend be = { .sb = io->sb, @@ -1343,6 +1308,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, LIST_HEAD_INIT(be.decompressed_secondary_bvecs), }; z_erofs_next_pcluster_t owned = io->head; + int err = io->eio ? 
-EIO : 0; while (owned != Z_EROFS_PCLUSTER_TAIL) { DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); @@ -1350,12 +1316,13 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, be.pcl = container_of(owned, struct z_erofs_pcluster, next); owned = READ_ONCE(be.pcl->next); - z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); + err = z_erofs_decompress_pcluster(&be, err) ?: err; if (z_erofs_is_inline_pcluster(be.pcl)) z_erofs_free_pcluster(be.pcl); else erofs_workgroup_put(&be.pcl->obj); } + return err; } static void z_erofs_decompressqueue_work(struct work_struct *work) @@ -1428,6 +1395,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, struct z_erofs_bvec zbv; struct address_space *mapping; struct folio *folio; + struct page *page; int bs = i_blocksize(f->inode); /* Except for inplace folios, the entire folio can be used for I/Os */ @@ -1450,7 +1418,6 @@ repeat: * file-backed folios will be used instead. */ if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) { - folio->private = 0; tocache = true; goto out_tocache; } @@ -1468,7 +1435,7 @@ repeat: } folio_lock(folio); - if (folio->mapping == mc) { + if (likely(folio->mapping == mc)) { /* * The cached folio is still in managed cache but without * a valid `->private` pcluster hint. Let's reconnect them. @@ -1478,41 +1445,48 @@ repeat: /* compressed_bvecs[] already takes a ref before */ folio_put(folio); } - - /* no need to submit if it is already up-to-date */ - if (folio_test_uptodate(folio)) { - folio_unlock(folio); - bvec->bv_page = NULL; + if (likely(folio->private == pcl)) { + /* don't submit cache I/Os again if already uptodate */ + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + bvec->bv_page = NULL; + } + return; } - return; + /* + * Already linked with another pcluster, which only appears in + * crafted images by fuzzers for now. But handle this anyway. + */ + tocache = false; /* use temporary short-lived pages */ + } else { + DBG_BUGON(1); /* referenced managed folios can't be truncated */ + tocache = true; } - - /* - * It has been truncated, so it's unsafe to reuse this one. Let's - * allocate a new page for compressed data. - */ - DBG_BUGON(folio->mapping); - tocache = true; folio_unlock(folio); folio_put(folio); out_allocfolio: - zbv.page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); + page = __erofs_allocpage(&f->pagepool, gfp, true); spin_lock(&pcl->obj.lockref.lock); - if (pcl->compressed_bvecs[nr].page) { - erofs_pagepool_add(&f->pagepool, zbv.page); + if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) { + if (page) + erofs_pagepool_add(&f->pagepool, page); spin_unlock(&pcl->obj.lockref.lock); cond_resched(); goto repeat; } - bvec->bv_page = pcl->compressed_bvecs[nr].page = zbv.page; - folio = page_folio(zbv.page); - /* first mark it as a temporary shortlived folio (now 1 ref) */ - folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; + pcl->compressed_bvecs[nr].page = page ? 
page : ERR_PTR(-ENOMEM); spin_unlock(&pcl->obj.lockref.lock); + bvec->bv_page = page; + if (!page) + return; + folio = page_folio(page); out_tocache: if (!tocache || bs != PAGE_SIZE || - filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) + filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) { + /* turn into a temporary shortlived folio (1 ref) */ + folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; return; + } folio_attach_private(folio, pcl); /* drop a refcount added by allocpage (then 2 refs in total here) */ folio_put(folio); @@ -1647,17 +1621,16 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, cur = mdev.m_pa; end = cur + pcl->pclustersize; do { - z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); - if (!bvec.bv_page) - continue; - + bvec.bv_page = NULL; if (bio && (cur != last_pa || bio->bi_bdev != mdev.m_bdev)) { -io_retry: - if (!erofs_is_fscache_mode(sb)) - submit_bio(bio); - else +drain_io: + if (erofs_is_fileio_mode(EROFS_SB(sb))) + erofs_fileio_submit_bio(bio); + else if (erofs_is_fscache_mode(sb)) erofs_fscache_submit_bio(bio); + else + submit_bio(bio); if (memstall) { psi_memstall_leave(&pflags); @@ -1666,6 +1639,15 @@ io_retry: bio = NULL; } + if (!bvec.bv_page) { + z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); + if (!bvec.bv_page) + continue; + if (cur + bvec.bv_len > end) + bvec.bv_len = end - cur; + DBG_BUGON(bvec.bv_len < sb->s_blocksize); + } + if (unlikely(PageWorkingset(bvec.bv_page)) && !memstall) { psi_memstall_enter(&pflags); @@ -1673,10 +1655,13 @@ io_retry: } if (!bio) { - bio = erofs_is_fscache_mode(sb) ? - erofs_fscache_bio_alloc(&mdev) : - bio_alloc(mdev.m_bdev, BIO_MAX_VECS, - REQ_OP_READ, GFP_NOIO); + if (erofs_is_fileio_mode(EROFS_SB(sb))) + bio = erofs_fileio_bio_alloc(&mdev); + else if (erofs_is_fscache_mode(sb)) + bio = erofs_fscache_bio_alloc(&mdev); + else + bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, + REQ_OP_READ, GFP_NOIO); bio->bi_end_io = z_erofs_endio; bio->bi_iter.bi_sector = cur >> 9; bio->bi_private = q[JQ_SUBMIT]; @@ -1685,13 +1670,9 @@ io_retry: ++nr_bios; } - if (cur + bvec.bv_len > end) - bvec.bv_len = end - cur; - DBG_BUGON(bvec.bv_len < sb->s_blocksize); if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, bvec.bv_offset)) - goto io_retry; - + goto drain_io; last_pa = cur + bvec.bv_len; bypass = false; } while ((cur += bvec.bv_len) < end); @@ -1703,10 +1684,12 @@ io_retry: } while (owned_head != Z_EROFS_PCLUSTER_TAIL); if (bio) { - if (!erofs_is_fscache_mode(sb)) - submit_bio(bio); - else + if (erofs_is_fileio_mode(EROFS_SB(sb))) + erofs_fileio_submit_bio(bio); + else if (erofs_is_fscache_mode(sb)) erofs_fscache_submit_bio(bio); + else + submit_bio(bio); if (memstall) psi_memstall_leave(&pflags); } @@ -1722,26 +1705,28 @@ io_retry: z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } -static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - bool force_fg, bool ra) +static int z_erofs_runqueue(struct z_erofs_decompress_frontend *f, + unsigned int ra_folios) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; + struct erofs_sb_info *sbi = EROFS_I_SB(f->inode); + bool force_fg = z_erofs_is_sync_decompress(sbi, ra_folios); + int err; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) - return; - z_erofs_submit_queue(f, io, &force_fg, ra); + return 0; + z_erofs_submit_queue(f, io, &force_fg, !!ra_folios); /* handle bypass queue (no i/o pclusters) immediately */ - z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); - + err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) - return; + 
return err; /* wait until all bios are completed */ wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ - z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); + return z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool) ?: err; } /* @@ -1803,7 +1788,6 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, static int z_erofs_read_folio(struct file *file, struct folio *folio) { struct inode *const inode = folio->mapping->host; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); int err; @@ -1815,9 +1799,8 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) z_erofs_pcluster_readmore(&f, NULL, false); z_erofs_pcluster_end(&f); - /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); - + /* if some pclusters are ready, need submit them anyway */ + err = z_erofs_runqueue(&f, 0) ?: err; if (err && err != -EINTR) erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", err, folio->index, EROFS_I(inode)->nid); @@ -1830,7 +1813,6 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct folio *head = NULL, *folio; unsigned int nr_folios; @@ -1860,7 +1842,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, false); z_erofs_pcluster_end(&f); - z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true); + (void)z_erofs_runqueue(&f, nr_folios); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 403af6e31d5b..e980e29873a5 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -687,32 +687,30 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int err = 0; trace_erofs_map_blocks_enter(inode, map, flags); - - /* when trying to read beyond EOF, leave it unmapped */ - if (map->m_la >= inode->i_size) { + if (map->m_la >= inode->i_size) { /* post-EOF unmapped extent */ map->m_llen = map->m_la + 1 - inode->i_size; map->m_la = inode->i_size; map->m_flags = 0; - goto out; - } - - err = z_erofs_fill_inode_lazy(inode); - if (err) - goto out; - - if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && - !vi->z_tailextent_headlcn) { - map->m_la = 0; - map->m_llen = inode->i_size; - map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED | - EROFS_MAP_FRAGMENT; - goto out; + } else { + err = z_erofs_fill_inode_lazy(inode); + if (!err) { + if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && + !vi->z_tailextent_headlcn) { + map->m_la = 0; + map->m_llen = inode->i_size; + map->m_flags = EROFS_MAP_MAPPED | + EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; + } else { + err = z_erofs_do_map_blocks(inode, map, flags); + } + } + if (!err && (map->m_flags & EROFS_MAP_ENCODED) && + unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || + map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) + err = -EOPNOTSUPP; + if (err) + map->m_llen = 0; } - - err = z_erofs_do_map_blocks(inode, map, flags); -out: - if (err) - map->m_llen = 0; trace_erofs_map_blocks_exit(inode, map, flags, err); return err; } diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 
9b53883e5caf..37afe2024840 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -111,7 +111,8 @@ int z_erofs_gbuf_growsize(unsigned int nrpages) out: if (i < z_erofs_gbuf_count && tmp_pages) { for (j = 0; j < nrpages; ++j) - if (tmp_pages[j] && tmp_pages[j] != gbuf->pages[j]) + if (tmp_pages[j] && (j >= gbuf->nrpages || + tmp_pages[j] != gbuf->pages[j])) __free_page(tmp_pages[j]); kfree(tmp_pages); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index f53ca4f7fced..145f5349c612 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -420,7 +420,7 @@ static bool busy_loop_ep_timeout(unsigned long start_time, static bool ep_busy_loop_on(struct eventpoll *ep) { - return !!ep->busy_poll_usecs || net_busy_loop_on(); + return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on(); } static bool ep_busy_loop_end(void *p, unsigned long start_time) @@ -2200,11 +2200,6 @@ static int do_epoll_create(int flags) error = PTR_ERR(file); goto out_free_fd; } -#ifdef CONFIG_NET_RX_BUSY_POLL - ep->busy_poll_usecs = 0; - ep->busy_poll_budget = 0; - ep->prefer_busy_poll = false; -#endif ep->file = file; fd_install(fd, file); return fd; diff --git a/fs/exec.c b/fs/exec.c index a126e3d1cacb..caae051c5a95 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -145,13 +145,11 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) goto out; /* - * may_open() has already checked for this, so it should be - * impossible to trip now. But we need to be extra cautious - * and check again at the very end too. + * Check do_open_execat() for an explanation. */ error = -EACCES; - if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || - path_noexec(&file->f_path))) + if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || + path_noexec(&file->f_path)) goto exit; error = -ENOEXEC; @@ -954,7 +952,6 @@ EXPORT_SYMBOL(transfer_args_to_stack); static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; - int err; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, @@ -971,24 +968,20 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) - goto out; + return file; /* - * may_open() has already checked for this, so it should be - * impossible to trip now. But we need to be extra cautious - * and check again at the very end too. + * In the past the regular type check was here. It moved to may_open() in + * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is + * an invariant that all non-regular files error out before we get here. */ - err = -EACCES; - if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || - path_noexec(&file->f_path))) - goto exit; + if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || + path_noexec(&file->f_path)) { + fput(file); + return ERR_PTR(-EACCES); + } -out: return file; - -exit: - fput(file); - return ERR_PTR(err); } /** @@ -1692,6 +1685,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) unsigned int mode; vfsuid_t vfsuid; vfsgid_t vfsgid; + int err; if (!mnt_may_suid(file->f_path.mnt)) return; @@ -1708,12 +1702,17 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) /* Be careful if suid/sgid is set */ inode_lock(inode); - /* reload atomically mode/uid/gid now that lock held */ + /* Atomically reload and check mode/uid/gid now that lock held. 
*/ mode = inode->i_mode; vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); + err = inode_permission(idmap, inode, MAY_EXEC); inode_unlock(inode); + /* Did the exec bit vanish out from under us? Give up. */ + if (err) + return; + /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 64c31867bc76..e19469e88000 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -535,20 +535,20 @@ static int exfat_file_zeroed_range(struct file *file, loff_t start, loff_t end) while (start < end) { u32 zerofrom, len; - struct page *page = NULL; + struct folio *folio; zerofrom = start & (PAGE_SIZE - 1); len = PAGE_SIZE - zerofrom; if (start + len > end) len = end - start; - err = ops->write_begin(file, mapping, start, len, &page, NULL); + err = ops->write_begin(file, mapping, start, len, &folio, NULL); if (err) goto out; - zero_user_segment(page, zerofrom, zerofrom + len); + folio_zero_range(folio, offset_in_folio(folio, start), len); - err = ops->write_end(file, mapping, start, len, len, page, NULL); + err = ops->write_end(file, mapping, start, len, len, folio, NULL); if (err < 0) goto out; start += len; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index dd894e558c91..05f0e07b01d0 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -448,12 +448,11 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) static int exfat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = block_write_begin(mapping, pos, len, pagep, exfat_get_block); + ret = block_write_begin(mapping, pos, len, foliop, exfat_get_block); if (ret < 0) exfat_write_failed(mapping, pos+len); @@ -463,13 +462,13 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping, static int exfat_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, - struct page *pagep, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); int err; - err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (ei->i_size_aligned < i_size_read(inode)) { exfat_fs_error(inode->i_sb, diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 087457061c6e..402fecf90a44 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -87,7 +87,7 @@ static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); + block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); @@ -263,7 +263,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); - bool need_revalidate = !inode_eq_iversion(inode, file->f_version); + bool need_revalidate = !inode_eq_iversion(inode, *(u64 *)file->private_data); bool has_filetype; if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) @@ -290,7 +290,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) offset = 
ext2_validate_entry(kaddr, offset, chunk_mask); ctx->pos = (n<<PAGE_SHIFT) + offset; } - file->f_version = inode_query_iversion(inode); + *(u64 *)file->private_data = inode_query_iversion(inode); need_revalidate = false; } de = (ext2_dirent *)(kaddr+offset); @@ -434,7 +434,7 @@ int ext2_inode_by_name(struct inode *dir, const struct qstr *child, ino_t *ino) static int ext2_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(&folio->page, pos, len, ext2_get_block); + return __block_write_begin(folio, pos, len, ext2_get_block); } static int ext2_handle_dirsync(struct inode *dir) @@ -703,8 +703,30 @@ not_empty: return 0; } +static int ext2_dir_open(struct inode *inode, struct file *file) +{ + file->private_data = kzalloc(sizeof(u64), GFP_KERNEL); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +static int ext2_dir_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static loff_t ext2_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return generic_llseek_cookie(file, offset, whence, + (u64 *)file->private_data); +} + const struct file_operations ext2_dir_operations = { - .llseek = generic_file_llseek, + .open = ext2_dir_open, + .release = ext2_dir_release, + .llseek = ext2_dir_llseek, .read = generic_read_dir, .iterate_shared = ext2_readdir, .unlocked_ioctl = ext2_ioctl, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0caa1650cee8..30f8201c155f 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -916,11 +916,11 @@ static void ext2_readahead(struct readahead_control *rac) static int ext2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, ext2_get_block); + ret = block_write_begin(mapping, pos, len, foliop, ext2_get_block); if (ret < 0) ext2_write_failed(mapping, pos + len); return ret; @@ -928,11 +928,11 @@ ext2_write_begin(struct file *file, struct address_space *mapping, static int ext2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (ret < len) ext2_write_failed(mapping, pos + len); return ret; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ff4514e4626b..13196afe55ce 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -133,6 +133,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); + struct dir_private_info *info = file->private_data; err = fscrypt_prepare_readdir(inode); if (err) @@ -229,7 +230,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. 
*/ - if (!inode_eq_iversion(inode, file->f_version)) { + if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); @@ -249,7 +250,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); } while (ctx->pos < inode->i_size @@ -384,6 +385,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp) static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; + struct dir_private_info *info = file->private_data; int dx_dir = is_dx_dir(inode); loff_t ret, htree_max = ext4_get_htree_eof(file); @@ -392,7 +394,7 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) htree_max, htree_max); else ret = ext4_llseek(file, offset, whence); - file->f_version = inode_peek_iversion(inode) - 1; + info->cookie = inode_peek_iversion(inode) - 1; return ret; } @@ -429,18 +431,15 @@ static void free_rb_tree_fname(struct rb_root *root) *root = RB_ROOT; } - -static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, - loff_t pos) +static void ext4_htree_init_dir_info(struct file *filp, loff_t pos) { - struct dir_private_info *p; - - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return NULL; - p->curr_hash = pos2maj_hash(filp, pos); - p->curr_minor_hash = pos2min_hash(filp, pos); - return p; + struct dir_private_info *p = filp->private_data; + + if (is_dx_dir(file_inode(filp)) && !p->initialized) { + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); + p->initialized = true; + } } void ext4_htree_free_dir_info(struct dir_private_info *p) @@ -552,12 +551,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) struct fname *fname; int ret = 0; - if (!info) { - info = ext4_htree_create_dir_info(file, ctx->pos); - if (!info) - return -ENOMEM; - file->private_data = info; - } + ext4_htree_init_dir_info(file, ctx->pos); if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ @@ -590,10 +584,10 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) * cached entries. 
*/ if ((!info->curr_node) || - !inode_eq_iversion(inode, file->f_version)) { + !inode_eq_iversion(inode, info->cookie)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); @@ -664,7 +658,19 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, return 0; } +static int ext4_dir_open(struct inode *inode, struct file *file) +{ + struct dir_private_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + file->private_data = info; + return 0; +} + const struct file_operations ext4_dir_operations = { + .open = ext4_dir_open, .llseek = ext4_dir_llseek, .read = generic_read_dir, .iterate_shared = ext4_readdir, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 08acd152261e..ecc15e5f1eba 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2553,6 +2553,8 @@ struct dir_private_info { __u32 curr_hash; __u32 curr_minor_hash; __u32 next_hash; + u64 cookie; + bool initialized; }; /* calculate the first block number of the group */ @@ -3563,13 +3565,13 @@ int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep); + struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); extern int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep, + struct folio **foliop, void **fsdata); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index e7a09a99837b..edf4aa99a974 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -601,10 +601,10 @@ retry: goto out; if (ext4_should_dioread_nolock(inode)) { - ret = __block_write_begin(&folio->page, from, to, + ret = __block_write_begin(folio, from, to, ext4_get_block_unwritten); } else - ret = __block_write_begin(&folio->page, from, to, ext4_get_block); + ret = __block_write_begin(folio, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -660,7 +660,7 @@ out_nofolio: int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep) + struct folio **foliop) { int ret; handle_t *handle; @@ -708,7 +708,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, goto out; } - *pagep = &folio->page; + *foliop = folio; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; @@ -856,7 +856,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, goto out; } - ret = __block_write_begin(&folio->page, 0, inline_size, + ret = __block_write_begin(folio, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); @@ -891,7 +891,7 @@ out: int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep, + struct folio **foliop, void **fsdata) { int ret; @@ -954,7 +954,7 @@ retry_journal: goto out_release_page; up_read(&EXT4_I(inode)->xattr_sem); - *pagep = &folio->page; + *foliop = folio; brelse(iloc.bh); return 1; out_release_page: @@ -1460,6 +1460,7 
@@ int ext4_read_inline_dir(struct file *file, struct ext4_iloc iloc; void *dir_buf = NULL; int dotdot_offset, dotdot_size, extra_offset, extra_size; + struct dir_private_info *info = file->private_data; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1503,12 +1504,12 @@ int ext4_read_inline_dir(struct file *file, extra_size = extra_offset + inline_size; /* - * If the version has changed since the last call to + * If the cookie has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the inline * dir to make sure. */ - if (!inode_eq_iversion(inode, file->f_version)) { + if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and @@ -1540,7 +1541,7 @@ int ext4_read_inline_dir(struct file *file, } offset = i; ctx->pos = offset; - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); } while (ctx->pos < extra_size) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 941c1c0d5c6e..03374dc215d1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1145,7 +1145,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; @@ -1170,7 +1170,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, - pagep); + foliop); if (ret < 0) return ret; if (ret == 1) @@ -1224,10 +1224,10 @@ retry_journal: ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(&folio->page, pos, len, + ret = __block_write_begin(folio, pos, len, ext4_get_block_unwritten); else - ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); + ret = __block_write_begin(folio, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -1270,7 +1270,7 @@ retry_journal: folio_put(folio); return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } @@ -1298,9 +1298,8 @@ static int write_end_fn(handle_t *handle, struct inode *inode, static int ext4_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1315,7 +1314,7 @@ static int ext4_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. 
@@ -1402,9 +1401,8 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -2926,7 +2924,7 @@ static int ext4_nonda_switch(struct super_block *sb) static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; @@ -2941,14 +2939,14 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, - len, pagep, fsdata); + len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, - pagep, fsdata); + foliop, fsdata); if (ret < 0) return ret; if (ret == 1) @@ -2964,7 +2962,7 @@ retry: #ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); #else - ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); + ret = __block_write_begin(folio, pos, len, ext4_da_get_block_prep); #endif if (ret < 0) { folio_unlock(folio); @@ -2983,7 +2981,7 @@ retry: return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } @@ -3029,7 +3027,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, * flag, which is all that's needed to trigger page writeback. 
*/ copied = block_write_end(NULL, mapping, pos, len, copied, - &folio->page, NULL); + folio, NULL); new_i_size = pos + copied; /* @@ -3080,15 +3078,14 @@ static int ext4_da_do_write_end(struct address_space *mapping, static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; - struct folio *folio = page_folio(page); if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, - len, copied, &folio->page, fsdata); + len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); @@ -6219,7 +6216,7 @@ retry_alloc: if (folio_pos(folio) + len > size) len = size - folio_pos(folio); - err = __block_write_begin(&folio->page, 0, len, ext4_get_block); + err = __block_write_begin(folio, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_journal_folio_buffers(handle, folio, len)) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 2f37e1ea3955..d9203228ce97 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -76,17 +76,17 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, while (count) { size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); - struct page *page; + struct folio *folio; void *fsdata = NULL; int res; - res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); + res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata); if (res) return res; - memcpy_to_page(page, offset_in_page(pos), buf, n); + memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n); - res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); + res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata); if (res < 0) return res; if (res != n) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6457e5bca9c9..5dfa0207ad8f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3552,12 +3552,12 @@ reserve_block: } static int f2fs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *page = NULL; - pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; + struct folio *folio; + pgoff_t index = pos >> PAGE_SHIFT; bool need_balance = false; bool use_cow = false; block_t blkaddr = NULL_ADDR; @@ -3573,7 +3573,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. 
The locking rule for inline_data conversion should be: - * lock_page(page #0) -> lock_page(inode_page) + * folio_lock(folio #0) -> folio_lock(inode_page) */ if (index != 0) { err = f2fs_convert_inline_inode(inode); if (err) goto fail; } @@ -3584,18 +3584,20 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret; + struct page *page; *fsdata = NULL; if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode))) goto repeat; - ret = f2fs_prepare_compress_overwrite(inode, pagep, + ret = f2fs_prepare_compress_overwrite(inode, &page, index, fsdata); if (ret < 0) { err = ret; goto fail; } else if (ret) { + *foliop = page_folio(page); return 0; } } @@ -3603,81 +3605,85 @@ repeat: /* - * Do not use grab_cache_page_write_begin() to avoid deadlock due to - * wait_for_stable_page. Will wait that below with our IO control. + * Do not use FGP_STABLE to avoid deadlock. + * We will wait on that below with our IO control. */ - page = f2fs_pagecache_get_page(mapping, index, + folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); - if (!page) { - err = -ENOMEM; + if (IS_ERR(folio)) { + err = PTR_ERR(folio); goto fail; } /* TODO: cluster can be compressed due to race with .writepage */ - *pagep = page; + *foliop = folio; if (f2fs_is_atomic_file(inode)) - err = prepare_atomic_write_begin(sbi, page, pos, len, + err = prepare_atomic_write_begin(sbi, &folio->page, pos, len, &blkaddr, &need_balance, &use_cow); else - err = prepare_write_begin(sbi, page, pos, len, + err = prepare_write_begin(sbi, &folio->page, pos, len, &blkaddr, &need_balance); if (err) - goto fail; + goto put_folio; if (need_balance && !IS_NOQUOTA(inode) && has_not_enough_free_secs(sbi, 0, 0)) { - unlock_page(page); + folio_unlock(folio); f2fs_balance_fs(sbi, true); - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - f2fs_put_page(page, 1); + folio_lock(folio); + if (folio->mapping != mapping) { + /* The folio got truncated from under us */ + folio_unlock(folio); + folio_put(folio); goto repeat; } } - f2fs_wait_on_page_writeback(page, DATA, false, true); + f2fs_wait_on_page_writeback(&folio->page, DATA, false, true); - if (len == PAGE_SIZE || PageUptodate(page)) + if (len == folio_size(folio) || folio_test_uptodate(folio)) return 0; if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) && !f2fs_verity_in_progress(inode)) { - zero_user_segment(page, len, PAGE_SIZE); + folio_zero_segment(folio, len, PAGE_SIZE); return 0; } if (blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); } else { if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; - goto fail; + goto put_folio; } err = f2fs_submit_page_read(use_cow ? 
- F2FS_I(inode)->cow_inode : inode, page, + F2FS_I(inode)->cow_inode : inode, &folio->page, blkaddr, 0, true); if (err) - goto fail; + goto put_folio; - lock_page(page); - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); + folio_lock(folio); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + folio_put(folio); goto repeat; } - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; - goto fail; + goto put_folio; } } return 0; +put_folio: + folio_unlock(folio); + folio_put(folio); fail: - f2fs_put_page(page, 1); f2fs_write_failed(inode, pos + len); return err; } @@ -3685,9 +3691,9 @@ fail: static int f2fs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; trace_f2fs_write_end(inode, pos, len, copied); @@ -3696,17 +3702,17 @@ static int f2fs_write_end(struct file *file, * should be PAGE_SIZE. Otherwise, we treat it with zero copied and * let generic_perform_write() try to copy data again through copied=0. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (unlikely(copied != len)) copied = 0; else - SetPageUptodate(page); + folio_mark_uptodate(folio); } #ifdef CONFIG_F2FS_FS_COMPRESSION /* overwrite compressed file */ if (f2fs_compressed_file(inode) && fsdata) { - f2fs_compress_write_end(inode, fsdata, page->index, copied); + f2fs_compress_write_end(inode, fsdata, folio->index, copied); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (pos + copied > i_size_read(inode) && @@ -3719,7 +3725,7 @@ static int f2fs_write_end(struct file *file, if (!copied) goto unlock_out; - set_page_dirty(page); + folio_mark_dirty(folio); if (pos + copied > i_size_read(inode) && !f2fs_verity_in_progress(inode)) { @@ -3729,7 +3735,8 @@ static int f2fs_write_end(struct file *file, pos + copied); } unlock_out: - f2fs_put_page(page, 1); + folio_unlock(folio); + folio_put(folio); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3959fd137cc9..176b5177c89d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2677,7 +2677,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, const struct address_space_operations *a_ops = mapping->a_ops; int offset = off & (sb->s_blocksize - 1); size_t towrite = len; - struct page *page; + struct folio *folio; void *fsdata = NULL; int err = 0; int tocopy; @@ -2687,7 +2687,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, towrite); retry: err = a_ops->write_begin(NULL, mapping, off, tocopy, - &page, &fsdata); + &folio, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); @@ -2697,10 +2697,10 @@ retry: break; } - memcpy_to_page(page, offset, data, tocopy); + memcpy_to_folio(folio, offset_in_folio(folio, off), data, tocopy); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, - page, fsdata); + folio, fsdata); offset = 0; towrite -= tocopy; off += tocopy; diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index f7bb0c54502c..84a33fe49bed 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -80,17 +80,17 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, while (count) { size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); - struct page *page; + struct folio *folio; void *fsdata = NULL; 
int res; - res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); + res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata); if (res) return res; - memcpy_to_page(page, offset_in_page(pos), buf, n); + memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n); - res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); + res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata); if (res < 0) return res; if (res != n) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 19115fd2d2a4..75722bbd6b5f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -221,13 +221,12 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) static int fat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int err; - *pagep = NULL; err = cont_write_begin(file, mapping, pos, len, - pagep, fsdata, fat_get_block, + foliop, fsdata, fat_get_block, &MSDOS_I(mapping->host)->mmu_private); if (err < 0) fat_write_failed(mapping, pos + len); @@ -236,11 +235,11 @@ static int fat_write_begin(struct file *file, struct address_space *mapping, static int fat_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *pagep, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; - err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (err < len) fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { diff --git a/fs/fcntl.c b/fs/fcntl.c index 300e5d9ad913..081e5e3d89ea 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -33,6 +33,8 @@ #include <asm/siginfo.h> #include <linux/uaccess.h> +#include "internal.h" + #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) static int setfl(int fd, struct file * filp, unsigned int arg) @@ -87,29 +89,65 @@ static int setfl(int fd, struct file * filp, unsigned int arg) return error; } -static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, - int force) +/* + * Allocate a file->f_owner struct if it doesn't exist, handling racing + * allocations correctly. + */ +int file_f_owner_allocate(struct file *file) { - write_lock_irq(&filp->f_owner.lock); - if (force || !filp->f_owner.pid) { - put_pid(filp->f_owner.pid); - filp->f_owner.pid = get_pid(pid); - filp->f_owner.pid_type = type; + struct fown_struct *f_owner; - if (pid) { - const struct cred *cred = current_cred(); - filp->f_owner.uid = cred->uid; - filp->f_owner.euid = cred->euid; - } + f_owner = file_f_owner(file); + if (f_owner) + return 0; + + f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL); + if (!f_owner) + return -ENOMEM; + + rwlock_init(&f_owner->lock); + f_owner->file = file; + /* If someone else raced us, drop our allocation. 
*/ + if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) + kfree(f_owner); + return 0; +} +EXPORT_SYMBOL(file_f_owner_allocate); + +void file_f_owner_release(struct file *file) +{ + struct fown_struct *f_owner; + + f_owner = file_f_owner(file); + if (f_owner) { + put_pid(f_owner->pid); + kfree(f_owner); } - write_unlock_irq(&filp->f_owner.lock); } void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - security_file_set_fowner(filp); - f_modown(filp, pid, type, force); + struct fown_struct *f_owner; + + f_owner = file_f_owner(filp); + if (WARN_ON_ONCE(!f_owner)) + return; + + write_lock_irq(&f_owner->lock); + if (force || !f_owner->pid) { + put_pid(f_owner->pid); + f_owner->pid = get_pid(pid); + f_owner->pid_type = type; + + if (pid) { + const struct cred *cred = current_cred(); + security_file_set_fowner(filp); + f_owner->uid = cred->uid; + f_owner->euid = cred->euid; + } + } + write_unlock_irq(&f_owner->lock); } EXPORT_SYMBOL(__f_setown); @@ -119,6 +157,8 @@ int f_setown(struct file *filp, int who, int force) struct pid *pid = NULL; int ret = 0; + might_sleep(); + type = PIDTYPE_TGID; if (who < 0) { /* avoid overflow below */ @@ -129,6 +169,10 @@ int f_setown(struct file *filp, int who, int force) who = -who; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); if (who) { pid = find_vpid(who); @@ -146,22 +190,27 @@ EXPORT_SYMBOL(f_setown); void f_delown(struct file *filp) { - f_modown(filp, NULL, PIDTYPE_TGID, 1); + __f_setown(filp, NULL, PIDTYPE_TGID, 1); } pid_t f_getown(struct file *filp) { pid_t pid = 0; + struct fown_struct *f_owner; - read_lock_irq(&filp->f_owner.lock); + f_owner = file_f_owner(filp); + if (!f_owner) + return pid; + + read_lock_irq(&f_owner->lock); rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { - pid = pid_vnr(filp->f_owner.pid); - if (filp->f_owner.pid_type == PIDTYPE_PGID) + if (pid_task(f_owner->pid, f_owner->pid_type)) { + pid = pid_vnr(f_owner->pid); + if (f_owner->pid_type == PIDTYPE_PGID) pid = -pid; } rcu_read_unlock(); - read_unlock_irq(&filp->f_owner.lock); + read_unlock_irq(&f_owner->lock); return pid; } @@ -194,6 +243,10 @@ static int f_setown_ex(struct file *filp, unsigned long arg) return -EINVAL; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); pid = find_vpid(owner.pid); if (owner.pid && !pid) @@ -210,13 +263,20 @@ static int f_getown_ex(struct file *filp, unsigned long arg) struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner = {}; int ret = 0; + struct fown_struct *f_owner; + enum pid_type pid_type = PIDTYPE_PID; - read_lock_irq(&filp->f_owner.lock); - rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) - owner.pid = pid_vnr(filp->f_owner.pid); - rcu_read_unlock(); - switch (filp->f_owner.pid_type) { + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + rcu_read_lock(); + if (pid_task(f_owner->pid, f_owner->pid_type)) + owner.pid = pid_vnr(f_owner->pid); + rcu_read_unlock(); + pid_type = f_owner->pid_type; + } + + switch (pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; break; @@ -234,7 +294,8 @@ static int f_getown_ex(struct file *filp, unsigned long arg) ret = -EINVAL; break; } - read_unlock_irq(&filp->f_owner.lock); + if (f_owner) + read_unlock_irq(&f_owner->lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); @@ -248,14 +309,18 @@ static int f_getown_ex(struct file *filp, unsigned long arg) static int 
f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); + struct fown_struct *f_owner; uid_t __user *dst = (void __user *)arg; - uid_t src[2]; + uid_t src[2] = {0, 0}; int err; - read_lock_irq(&filp->f_owner.lock); - src[0] = from_kuid(user_ns, filp->f_owner.uid); - src[1] = from_kuid(user_ns, filp->f_owner.euid); - read_unlock_irq(&filp->f_owner.lock); + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + src[0] = from_kuid(user_ns, f_owner->uid); + src[1] = from_kuid(user_ns, f_owner->euid); + read_unlock_irq(&f_owner->lock); + } err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); @@ -343,6 +408,36 @@ static long f_dupfd_query(int fd, struct file *filp) return f.file == filp; } +/* Let the caller figure out whether a given file was just created. */ +static long f_created_query(const struct file *filp) +{ + return !!(filp->f_mode & FMODE_CREATED); +} + +static int f_owner_sig(struct file *filp, int signum, bool setsig) +{ + int ret = 0; + struct fown_struct *f_owner; + + might_sleep(); + + if (setsig) { + if (!valid_signal(signum)) + return -EINVAL; + + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + } + + f_owner = file_f_owner(filp); + if (setsig) + f_owner->signum = signum; + else if (f_owner) + ret = f_owner->signum; + return ret; +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { @@ -352,6 +447,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, long err = -EINVAL; switch (cmd) { + case F_CREATED_QUERY: + err = f_created_query(filp); + break; case F_DUPFD: err = f_dupfd(argi, filp, 0); break; @@ -421,15 +519,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = f_getowner_uids(filp, arg); break; case F_GETSIG: - err = filp->f_owner.signum; + err = f_owner_sig(filp, 0, false); break; case F_SETSIG: - /* arg == 0 restores default behaviour. */ - if (!valid_signal(argi)) { - break; - } - err = 0; - filp->f_owner.signum = argi; + err = f_owner_sig(filp, argi, true); break; case F_GETLEASE: err = fcntl_getlease(filp); @@ -463,6 +556,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, static int check_fcntl_cmd(unsigned cmd) { switch (cmd) { + case F_CREATED_QUERY: case F_DUPFD: case F_DUPFD_CLOEXEC: case F_DUPFD_QUERY: @@ -844,14 +938,19 @@ static void send_sigurg_to_task(struct task_struct *p, do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } -int send_sigurg(struct fown_struct *fown) +int send_sigurg(struct file *file) { + struct fown_struct *fown; struct task_struct *p; enum pid_type type; struct pid *pid; unsigned long flags; int ret = 0; + fown = file_f_owner(file); + if (!fown) + return 0; + read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; @@ -1027,13 +1126,16 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) } read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { - fown = &fa->fa_file->f_owner; + fown = file_f_owner(fa->fa_file); + if (!fown) + goto next; /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. 
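Aside: file_f_owner_allocate() above attaches ->f_owner lazily with kzalloc() plus cmpxchg(), so two racing callers can both succeed while only one allocation is ever published. A generic sketch of that once-only publication idiom, with hypothetical names throughout:

/*
 * Sketch only: race-safe lazy allocation as in file_f_owner_allocate().
 * "struct obj", "struct holder" and attach_obj() are illustrative names.
 */
#include <linux/slab.h>
#include <linux/atomic.h>

struct obj {
	int payload;
};

struct holder {
	struct obj *ptr;	/* written at most once, read locklessly */
};

static int attach_obj(struct holder *h)
{
	struct obj *new;

	if (READ_ONCE(h->ptr))		/* fast path: already allocated */
		return 0;

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	/* Publish; if someone else won the race, drop our copy. */
	if (cmpxchg(&h->ptr, NULL, new))
		kfree(new);
	return 0;
}

Either outcome leaves h->ptr non-NULL, which is why the fcntl() paths above only have to check for allocation failure and can treat file_f_owner() as stable once it is set.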
*/ if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } +next: read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } } diff --git a/fs/fhandle.c b/fs/fhandle.c index 6e8cea16790e..8cb665629f4a 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -16,7 +16,8 @@ static long do_sys_name_to_handle(const struct path *path, struct file_handle __user *ufh, - int __user *mnt_id, int fh_flags) + void __user *mnt_id, bool unique_mntid, + int fh_flags) { long retval; struct file_handle f_handle; @@ -69,9 +70,19 @@ static long do_sys_name_to_handle(const struct path *path, } else retval = 0; /* copy the mount id */ - if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) || - copy_to_user(ufh, handle, - struct_size(handle, f_handle, handle_bytes))) + if (unique_mntid) { + if (put_user(real_mount(path->mnt)->mnt_id_unique, + (u64 __user *) mnt_id)) + retval = -EFAULT; + } else { + if (put_user(real_mount(path->mnt)->mnt_id, + (int __user *) mnt_id)) + retval = -EFAULT; + } + /* copy the handle */ + if (retval != -EFAULT && + copy_to_user(ufh, handle, + struct_size(handle, f_handle, handle_bytes))) retval = -EFAULT; kfree(handle); return retval; @@ -83,6 +94,7 @@ static long do_sys_name_to_handle(const struct path *path, * @name: name that should be converted to handle. * @handle: resulting file handle * @mnt_id: mount id of the file system containing the file + * (u64 if AT_HANDLE_MNT_ID_UNIQUE, otherwise int) * @flag: flag value to indicate whether to follow symlink or not * and whether a decodable file handle is required. * @@ -92,7 +104,7 @@ static long do_sys_name_to_handle(const struct path *path, * value required. */ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, - struct file_handle __user *, handle, int __user *, mnt_id, + struct file_handle __user *, handle, void __user *, mnt_id, int, flag) { struct path path; @@ -100,7 +112,8 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, int fh_flags; int err; - if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID)) + if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID | + AT_HANDLE_MNT_ID_UNIQUE)) return -EINVAL; lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; @@ -109,7 +122,9 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, lookup_flags |= LOOKUP_EMPTY; err = user_path_at(dfd, name, lookup_flags, &path); if (!err) { - err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags); + err = do_sys_name_to_handle(&path, handle, mnt_id, + flag & AT_HANDLE_MNT_ID_UNIQUE, + fh_flags); path_put(&path); } return err; } diff --git a/fs/file.c b/fs/file.c index a11e59b5d602..976ecd4ce2c6 100644 --- a/fs/file.c +++ b/fs/file.c @@ -46,27 +46,23 @@ static void free_fdtable_rcu(struct rcu_head *rcu) #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) +#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'copy_words' words of fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. 
*/ -static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, - unsigned int count) +static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, + unsigned int copy_words) { - unsigned int cpy, set; - - cpy = count / BITS_PER_BYTE; - set = (nfdt->max_fds - count) / BITS_PER_BYTE; - memcpy(nfdt->open_fds, ofdt->open_fds, cpy); - memset((char *)nfdt->open_fds + cpy, 0, set); - memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); - memset((char *)nfdt->close_on_exec + cpy, 0, set); - - cpy = BITBIT_SIZE(count); - set = BITBIT_SIZE(nfdt->max_fds) - cpy; - memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); - memset((char *)nfdt->full_fds_bits + cpy, 0, set); + unsigned int nwords = fdt_words(nfdt); + + bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, + copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); + bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, + copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); + bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, + copy_words, nwords); } /* @@ -84,7 +80,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); - copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); + copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* @@ -379,7 +375,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int open_files = sane_fdtable_size(old_fdt, max_fds); } - copy_fd_bitmaps(new_fdt, old_fdt, open_files); + copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; @@ -676,7 +672,7 @@ int close_fd(unsigned fd) return filp_close(file, files); } -EXPORT_SYMBOL(close_fd); /* for ksys_close() */ +EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table diff --git a/fs/file_table.c b/fs/file_table.c index ca7843dde56d..eed5ffad9997 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -136,6 +136,7 @@ static int __init init_fs_stat_sysctls(void) register_sysctl_init("fs", fs_stat_sysctls); if (IS_ENABLED(CONFIG_BINFMT_MISC)) { struct ctl_table_header *hdr; + hdr = register_sysctl_mount_point("fs/binfmt_misc"); kmemleak_not_leak(hdr); } @@ -155,8 +156,14 @@ static int init_file(struct file *f, int flags, const struct cred *cred) return error; } - rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); + /* + * Note that f_pos_lock is only used for files raising + * FMODE_ATOMIC_POS and directories. Other files such as pipes + * don't need it and, since f_pos_lock is in a union, may reuse + * the space for other purposes. They are expected to initialize + * the respective member when opening the file. 
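Aside (see the copy_fd_bitmaps() rewrite above): the three memcpy()/memset() pairs collapse into bitmap_copy_and_extend() calls. A sketch of the semantics that helper is assumed to provide, expressed with the plain bitmap primitives:

/*
 * Rough equivalent of bitmap_copy_and_extend(dst, src, count, size):
 * copy the low 'count' bits of src, zero-fill dst up to 'size' bits.
 * Sketch only; the real helper lives in <linux/bitmap.h>.
 */
#include <linux/bitmap.h>

static inline void copy_and_extend_sketch(unsigned long *dst,
					  const unsigned long *src,
					  unsigned int count,
					  unsigned int size)
{
	bitmap_copy(dst, src, count);			/* low 'count' bits */
	if (size > count)
		bitmap_clear(dst, count, size - count);	/* zero the tail */
}

Passing sizes in bits (or whole words, for full_fds_bits) lets one helper replace the byte arithmetic the old code had to get right three times.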
+ */ mutex_init(&f->f_pos_lock); f->f_flags = flags; f->f_mode = OPEN_FMODE(flags); @@ -383,7 +390,9 @@ EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { - struct file *f = alloc_file(&base->f_path, flags, fops); + struct file *f; + + f = alloc_file(&base->f_path, flags, fops); if (!IS_ERR(f)) { path_get(&f->f_path); f->f_mapping = base->f_mapping; @@ -425,7 +434,7 @@ static void __fput(struct file *file) cdev_put(inode->i_cdev); } fops_put(file->f_op); - put_pid(file->f_owner.pid); + file_f_owner_release(file); put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) @@ -512,9 +521,14 @@ EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN | - SLAB_PANIC | SLAB_ACCOUNT, NULL); + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct file, f_freeptr), + }; + + filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b865a3fa52f3..d8bec3c1bb1f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1132,6 +1132,7 @@ out_bdi_put: /** * cgroup_writeback_umount - flush inode wb switches for umount + * @sb: target super_block * * This function is called when a super_block is about to be destroyed and * flushes in-flight inode wb switches. An inode wb switch goes through @@ -1140,8 +1141,12 @@ out_bdi_put: * rare occurrences and synchronize_rcu() can take a while, perform * flushing iff wb switches are in flight. */ -void cgroup_writeback_umount(void) +void cgroup_writeback_umount(struct super_block *sb) { + + if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK)) + return; + /* * SB_ACTIVE should be reliably cleared before checking * isw_nr_in_flight, see generic_shutdown_super(). @@ -1381,12 +1386,13 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) static void inode_sync_complete(struct inode *inode) { + assert_spin_locked(&inode->i_lock); + inode->i_state &= ~I_SYNC; /* If inode is clean and unused, put it into LRU now... */ inode_add_lru(inode); - /* Waiters must see I_SYNC cleared before being woken up */ - smp_mb(); - wake_up_bit(&inode->i_state, __I_SYNC); + /* Called with inode->i_lock which ensures memory ordering. */ + inode_wake_up_bit(inode, __I_SYNC); } static bool inode_dirtied_after(struct inode *inode, unsigned long t) @@ -1505,30 +1511,27 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) * Wait for writeback on an inode to complete. Called with i_lock held. * Caller must make sure inode cannot go away when we drop i_lock. 
*/ -static void __inode_wait_for_writeback(struct inode *inode) - __releases(inode->i_lock) - __acquires(inode->i_lock) +void inode_wait_for_writeback(struct inode *inode) { - DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); - wait_queue_head_t *wqh; + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; + + assert_spin_locked(&inode->i_lock); - wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - while (inode->i_state & I_SYNC) { + if (!(inode->i_state & I_SYNC)) + return; + + wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); + for (;;) { + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); + /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ + if (!(inode->i_state & I_SYNC)) + break; spin_unlock(&inode->i_lock); - __wait_on_bit(wqh, &wq, bit_wait, - TASK_UNINTERRUPTIBLE); + schedule(); spin_lock(&inode->i_lock); } -} - -/* - * Wait for writeback on an inode to complete. Caller must have inode pinned. - */ -void inode_wait_for_writeback(struct inode *inode) -{ - spin_lock(&inode->i_lock); - __inode_wait_for_writeback(inode); - spin_unlock(&inode->i_lock); + finish_wait(wq_head, &wqe.wq_entry); } /* @@ -1539,16 +1542,20 @@ void inode_wait_for_writeback(struct inode *inode) static void inode_sleep_on_writeback(struct inode *inode) __releases(inode->i_lock) { - DEFINE_WAIT(wait); - wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - int sleep; + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; + bool sleep; + + assert_spin_locked(&inode->i_lock); - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); - sleep = inode->i_state & I_SYNC; + wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); + /* Checking I_SYNC with inode->i_lock guarantees memory ordering. 
*/ + sleep = !!(inode->i_state & I_SYNC); spin_unlock(&inode->i_lock); if (sleep) schedule(); - finish_wait(wqh, &wait); + finish_wait(wq_head, &wqe.wq_entry); } /* @@ -1752,7 +1759,7 @@ static int writeback_single_inode(struct inode *inode, */ if (wbc->sync_mode != WB_SYNC_ALL) goto out; - __inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode); } WARN_ON(inode->i_state & I_SYNC); /* diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9eb191b5c4de..f0c9cd1a0b39 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -31,6 +31,8 @@ MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; +static void end_requests(struct list_head *head); + static struct fuse_dev *fuse_get_dev(struct file *file) { /* @@ -773,7 +775,6 @@ static int fuse_check_folio(struct folio *folio) (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | - 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | 1 << PG_workingset | @@ -818,9 +819,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) newfolio = page_folio(buf->page); - if (!folio_test_uptodate(newfolio)) - folio_mark_uptodate(newfolio); - + folio_clear_uptodate(newfolio); folio_clear_mappedtodisk(newfolio); if (fuse_check_folio(newfolio) != 0) @@ -1618,9 +1617,11 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, this_num = min_t(unsigned, num, PAGE_SIZE - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!err && offset == 0 && - (this_num == PAGE_SIZE || file_size == end)) + if (!PageUptodate(page) && !err && offset == 0 && + (this_num == PAGE_SIZE || file_size == end)) { + zero_user_segment(page, this_num, PAGE_SIZE); SetPageUptodate(page); + } unlock_page(page); put_page(page); @@ -1820,6 +1821,13 @@ static void fuse_resend(struct fuse_conn *fc) } spin_lock(&fiq->lock); + if (!fiq->connected) { + spin_unlock(&fiq->lock); + list_for_each_entry(req, &to_queue, list) + clear_bit(FR_PENDING, &req->flags); + end_requests(&to_queue); + return; + } /* iq and pq requests are both oldest to newest */ list_splice(&to_queue, &fiq->pending); fiq->ops->wake_pending_and_unlock(fiq); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2b0d4781f394..8e96df9fd76c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -670,7 +670,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, err = get_create_ext(&args, dir, entry, mode); if (err) - goto out_put_forget_req; + goto out_free_ff; err = fuse_simple_request(fm, &args); free_ext_value(&args); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f39456c65ed7..ba6df52a823e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1832,10 +1832,16 @@ __acquires(fi->lock) fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); - /* After fuse_writepage_finish() aux request list is private */ + /* After rb_erase() aux request list is private */ for (aux = wpa->next; aux; aux = next) { + struct backing_dev_info *bdi = inode_to_bdi(aux->inode); + next = aux->next; aux->next = NULL; + + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + dec_node_page_state(aux->ia.ap.pages[0], NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); fuse_writepage_free(aux); } @@ -2387,76 +2393,77 @@ out: * but how to implement it without killing performance needs more thinking. 
*/ static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; struct fuse_conn *fc = get_fuse_conn(file_inode(file)); - struct page *page; + struct folio *folio; loff_t fsize; int err = -ENOMEM; WARN_ON(!fc->writeback_cache); - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) goto error; - fuse_wait_on_page_writeback(mapping->host, page->index); + fuse_wait_on_page_writeback(mapping->host, folio->index); - if (PageUptodate(page) || len == PAGE_SIZE) + if (folio_test_uptodate(folio) || len >= folio_size(folio)) goto success; /* - * Check if the start this page comes after the end of file, in which - * case the readpage can be optimized away. + * Check if the start of this folio comes after the end of file, + * in which case the readpage can be optimized away. */ fsize = i_size_read(mapping->host); - if (fsize <= (pos & PAGE_MASK)) { - size_t off = pos & ~PAGE_MASK; + if (fsize <= folio_pos(folio)) { + size_t off = offset_in_folio(folio, pos); if (off) - zero_user_segment(page, 0, off); + folio_zero_segment(folio, 0, off); goto success; } - err = fuse_do_readpage(file, page); + err = fuse_do_readpage(file, &folio->page); if (err) goto cleanup; success: - *pagep = page; + *foliop = folio; return 0; cleanup: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); error: return err; } static int fuse_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; /* Haven't copied anything? Skip zeroing, size extending, dirtying. */ if (!copied) goto unlock; pos += copied; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* Zero any unwritten bytes at the end of the page */ size_t endoff = pos & ~PAGE_MASK; if (endoff) - zero_user_segment(page, endoff, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, endoff, PAGE_SIZE); + folio_mark_uptodate(folio); } if (pos > inode->i_size) i_size_write(inode, pos); - set_page_dirty(page); + folio_mark_dirty(folio); unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return copied; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d8ab4e93916f..bebd89002328 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1332,11 +1332,16 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, * on a stacked fs (e.g. overlayfs) themselves and with * max_stack_depth == 1, FUSE fs can be stacked as the * underlying fs of a stacked fs (e.g. overlayfs). + * + * Also don't allow the combination of FUSE_PASSTHROUGH + * and FUSE_WRITEBACK_CACHE; the current design doesn't handle + * them together. 
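Aside: fuse_write_begin() above skips the read-modify-write read when the folio lies entirely beyond EOF, because the server cannot hold data there; only the bytes before the write start need zeroing. The decision distilled into a sketch (demo_write_begin_fastpath() is an illustrative name, not part of the patch):

/*
 * Sketch of the fast-path test in fuse_write_begin() above: returns
 * true when no read from the server is needed before the write.
 */
#include <linux/pagemap.h>
#include <linux/highmem.h>

static bool demo_write_begin_fastpath(struct folio *folio, loff_t pos,
				      loff_t i_size)
{
	if (folio_test_uptodate(folio))
		return true;			/* cached data is valid */

	if (i_size <= folio_pos(folio)) {
		size_t off = offset_in_folio(folio, pos);

		/* Nothing on the server: just zero the head of the folio. */
		if (off)
			folio_zero_segment(folio, 0, off);
		return true;
	}
	return false;				/* must read the folio first */
}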
*/ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) && (flags & FUSE_PASSTHROUGH) && arg->max_stack_depth > 0 && - arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH) { + arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH && + !(flags & FUSE_WRITEBACK_CACHE)) { fc->passthrough = 1; fc->max_stack_depth = arg->max_stack_depth; fm->sb->s_stack_depth = arg->max_stack_depth; diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 5b423fdbb13f..9f568d345c51 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -81,7 +81,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, } ret = fuse_simple_request(fm, &args); if (!ret && !size) - ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); + ret = min_t(size_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { fm->fc->no_getxattr = 1; ret = -EOPNOTSUPP; @@ -143,7 +143,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) } ret = fuse_simple_request(fm, &args); if (!ret && !size) - ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); + ret = min_t(size_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 6d1878b99b30..4a0ce131e233 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -487,15 +487,15 @@ void hfs_file_truncate(struct inode *inode) if (inode->i_size > HFS_I(inode)->phys_size) { struct address_space *mapping = inode->i_mapping; void *fsdata = NULL; - struct page *page; + struct folio *folio; /* XXX: Can use generic_cont_expand? */ size = inode->i_size - 1; - res = hfs_write_begin(NULL, mapping, size + 1, 0, &page, + res = hfs_write_begin(NULL, mapping, size + 1, 0, &folio, &fsdata); if (!res) { res = generic_write_end(NULL, mapping, size + 1, 0, 0, - page, fsdata); + folio, fsdata); } if (res) inode->i_size = HFS_I(inode)->phys_size; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index b5a6ad5df357..a0c7cb0f79fc 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -202,7 +202,7 @@ extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; int hfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata); + loff_t pos, unsigned len, struct folio **foliop, void **fsdata); extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); extern int hfs_write_inode(struct inode *, struct writeback_control *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 744e10b46904..a81ce7a740b9 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -45,12 +45,11 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to) } int hfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, hfs_get_block, &HFS_I(mapping->host)->phys_size); if (unlikely(ret)) diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 9c51867dddc5..a6d61685ae79 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -554,16 +554,16 @@ void hfsplus_file_truncate(struct inode *inode) if (inode->i_size > hip->phys_size) { struct address_space *mapping = inode->i_mapping; - struct page *page; 
+ struct folio *folio; void *fsdata = NULL; loff_t size = inode->i_size; res = hfsplus_write_begin(NULL, mapping, size, 0, - &page, &fsdata); + &folio, &fsdata); if (res) return; res = generic_write_end(NULL, mapping, size, 0, 0, - page, fsdata); + folio, fsdata); if (res < 0) return; mark_inode_dirty(inode); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 9e78f181c24f..59ce81dca73f 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -472,7 +472,7 @@ extern const struct address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata); + loff_t pos, unsigned len, struct folio **foliop, void **fsdata); struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, umode_t mode); void hfsplus_delete_inode(struct inode *inode); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 3d326926c195..f331e9574217 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -39,12 +39,11 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to) } int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, hfsplus_get_block, &HFSPLUS_I(mapping->host)->phys_size); if (unlikely(ret)) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 22df574ca99e..6d1cf2436ead 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -465,31 +465,32 @@ static int hostfs_read_folio(struct file *file, struct folio *folio) static int hostfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; - *pagep = grab_cache_page_write_begin(mapping, index); - if (!*pagep) + *foliop = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (!*foliop) return -ENOMEM; return 0; } static int hostfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; void *buffer; - unsigned from = pos & (PAGE_SIZE - 1); + size_t from = offset_in_folio(folio, pos); int err; - buffer = kmap_local_page(page); - err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied); + buffer = kmap_local_folio(folio, from); + err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer, copied); kunmap_local(buffer); - if (!PageUptodate(page) && err == PAGE_SIZE) - SetPageUptodate(page); + if (!folio_test_uptodate(folio) && err == folio_size(folio)) + folio_mark_uptodate(folio); /* * If err > 0, write_file has added err to pos, so we are comparing @@ -497,8 +498,8 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, */ if (err > 0 && (pos > inode->i_size)) inode->i_size = pos; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 1bb8d97cd9ae..449a3fc1b8d9 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -190,12 +190,11 @@ 
static void hpfs_write_failed(struct address_space *mapping, loff_t to) static int hpfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, hpfs_get_block, &hpfs_i(mapping->host)->mmu_private); if (unlikely(ret)) @@ -206,11 +205,11 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping, static int hpfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *pagep, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; - err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (err < len) hpfs_write_failed(mapping, pos + len); if (!(err < 0)) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9f6cff356796..5cf327337e22 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -388,14 +388,14 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) static int hugetlbfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { return -EINVAL; } static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { BUG(); return -EINVAL; diff --git a/fs/inode.c b/fs/inode.c index 86670941884b..af78f515403f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -472,6 +472,17 @@ static void __inode_add_lru(struct inode *inode, bool rotate) inode->i_state |= I_REFERENCED; } +struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, + struct inode *inode, u32 bit) +{ + void *bit_address; + + bit_address = inode_state_wait_address(inode, bit); + init_wait_var_entry(wqe, bit_address, 0); + return __var_waitqueue(bit_address); +} +EXPORT_SYMBOL(inode_bit_waitqueue); + /* * Add inode to LRU if needed (inode is unused and clean). * @@ -488,6 +499,49 @@ static void inode_lru_list_del(struct inode *inode) this_cpu_dec(nr_unused); } +static void inode_pin_lru_isolating(struct inode *inode) +{ + lockdep_assert_held(&inode->i_lock); + WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); + inode->i_state |= I_LRU_ISOLATING; +} + +static void inode_unpin_lru_isolating(struct inode *inode) +{ + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); + inode->i_state &= ~I_LRU_ISOLATING; + /* Called with inode->i_lock which ensures memory ordering. */ + inode_wake_up_bit(inode, __I_LRU_ISOLATING); + spin_unlock(&inode->i_lock); +} + +static void inode_wait_for_lru_isolating(struct inode *inode) +{ + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; + + lockdep_assert_held(&inode->i_lock); + if (!(inode->i_state & I_LRU_ISOLATING)) + return; + + wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING); + for (;;) { + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); + /* + * Checking I_LRU_ISOLATING with inode->i_lock guarantees + * memory ordering. 
+ */ + if (!(inode->i_state & I_LRU_ISOLATING)) + break; + spin_unlock(&inode->i_lock); + schedule(); + spin_lock(&inode->i_lock); + } + finish_wait(wq_head, &wqe.wq_entry); + WARN_ON(inode->i_state & I_LRU_ISOLATING); +} + /** * inode_sb_list_add - add inode to the superblock list of inodes * @inode: inode to add @@ -562,6 +616,7 @@ void dump_mapping(const struct address_space *mapping) struct hlist_node *dentry_first; struct dentry *dentry_ptr; struct dentry dentry; + char fname[64] = {}; unsigned long ino; /* @@ -598,11 +653,14 @@ void dump_mapping(const struct address_space *mapping) return; } + if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0) + strscpy(fname, "<invalid>"); /* - * if dentry is corrupted, the %pd handler may still crash, - * but it's unlikely that we reach here with a corrupt mapping + * Even if strncpy_from_kernel_nofault() succeeded, + * the fname could be unreliable */ - pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry); + pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n", + a_ops, ino, fname); } void clear_inode(struct inode *inode) @@ -657,6 +715,9 @@ static void evict(struct inode *inode) inode_sb_list_del(inode); + spin_lock(&inode->i_lock); + inode_wait_for_lru_isolating(inode); + /* * Wait for flusher thread to be done with the inode so that filesystem * does not start destroying it while writeback is still running. Since @@ -664,6 +725,7 @@ static void evict(struct inode *inode) * the inode. We just have to wait for running writeback to finish. */ inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); if (op->evict_inode) { op->evict_inode(inode); @@ -687,7 +749,13 @@ static void evict(struct inode *inode) * used as an indicator whether blocking on it is safe. */ spin_lock(&inode->i_lock); - wake_up_bit(&inode->i_state, __I_NEW); + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees the bit cleared or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ + smp_mb(); + inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); spin_unlock(&inode->i_lock); @@ -735,6 +803,10 @@ again: continue; spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + continue; + } if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; @@ -855,7 +927,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * be under pressure before the cache inside the highmem zone. */ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { - __iget(inode); + inode_pin_lru_isolating(inode); spin_unlock(&inode->i_lock); spin_unlock(lru_lock); if (remove_inode_buffers(inode)) { @@ -867,7 +939,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, __count_vm_events(PGINODESTEAL, reap); mm_account_reclaimed_pages(reap); } - iput(inode); + inode_unpin_lru_isolating(inode); spin_lock(lru_lock); return LRU_RETRY; } @@ -1095,8 +1167,13 @@ void unlock_new_inode(struct inode *inode) spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees the bit cleared or + * waitqueue_active() check in wake_up_var() sees the waiter. 
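Aside: the fs/inode.c and fs/fs-writeback.c hunks above replace open-coded bit_waitqueue() waiting with inode_bit_waitqueue() plus a prepare_to_wait_event() loop whose condition is re-checked under inode->i_lock, so the lock itself provides the memory ordering on the waiter side. The wait side condensed into a sketch (wait_for_state_bit() is an illustrative name; 'bit' is one of the __I_* bit numbers):

/*
 * Sketch of the wait loops above (compare inode_wait_for_writeback()
 * and inode_wait_for_lru_isolating()). Called with i_lock held.
 */
#include <linux/fs.h>
#include <linux/bits.h>
#include <linux/sched.h>
#include <linux/wait_bit.h>

static void wait_for_state_bit(struct inode *inode, u32 bit)
{
	struct wait_bit_queue_entry wqe;
	struct wait_queue_head *wq_head;

	lockdep_assert_held(&inode->i_lock);

	wq_head = inode_bit_waitqueue(&wqe, inode, bit);
	for (;;) {
		prepare_to_wait_event(wq_head, &wqe.wq_entry,
				      TASK_UNINTERRUPTIBLE);
		/* i_lock orders this load against the waker's clearing. */
		if (!(inode->i_state & BIT(bit)))
			break;
		spin_unlock(&inode->i_lock);
		schedule();
		spin_lock(&inode->i_lock);
	}
	finish_wait(wq_head, &wqe.wq_entry);
}

The waker clears the bit and calls inode_wake_up_bit() while holding i_lock, or issues the explicit smp_mb() shown above when it cannot rely on the lock, which is what makes the lock-covered re-check sufficient.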
+ */ smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); + inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); @@ -1107,8 +1184,13 @@ void discard_new_inode(struct inode *inode) spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees the bit cleared or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); + inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); iput(inode); } @@ -1535,9 +1617,7 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; again: - spin_lock(&inode_hash_lock); - inode = find_inode_fast(sb, head, ino, true); - spin_unlock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) @@ -2299,8 +2379,8 @@ EXPORT_SYMBOL(inode_needs_sync); */ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked) { - wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; /* * Handle racing against evict(), see that routine for more details. @@ -2311,14 +2391,14 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock return; } - wq = bit_waitqueue(&inode->i_state, __I_NEW); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); rcu_read_unlock(); if (is_inode_hash_locked) spin_unlock(&inode_hash_lock); schedule(); - finish_wait(wq, &wait.wq_entry); + finish_wait(wq_head, &wqe.wq_entry); if (is_inode_hash_locked) spin_lock(&inode_hash_lock); rcu_read_lock(); @@ -2467,18 +2547,11 @@ EXPORT_SYMBOL(inode_owner_or_capable); /* * Direct i/o helper functions */ -static void __inode_dio_wait(struct inode *inode) +bool inode_dio_finished(const struct inode *inode) { - wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); - DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); - - do { - prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE); - if (atomic_read(&inode->i_dio_count)) - schedule(); - } while (atomic_read(&inode->i_dio_count)); - finish_wait(wq, &q.wq_entry); + return atomic_read(&inode->i_dio_count) == 0; } +EXPORT_SYMBOL(inode_dio_finished); /** * inode_dio_wait - wait for outstanding DIO requests to finish @@ -2492,11 +2565,17 @@ static void __inode_dio_wait(struct inode *inode) */ void inode_dio_wait(struct inode *inode) { - if (atomic_read(&inode->i_dio_count)) - __inode_dio_wait(inode); + wait_var_event(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait); +void inode_dio_wait_interruptible(struct inode *inode) +{ + wait_var_event_interruptible(&inode->i_dio_count, + inode_dio_finished(inode)); +} +EXPORT_SYMBOL(inode_dio_wait_interruptible); + /* * inode_set_flags - atomically set some inode flags * diff --git a/fs/internal.h b/fs/internal.h index cdd73209eecb..8c1b7acbbe8f 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -337,3 +337,4 @@ static inline bool path_mounted(const struct path *path) { return path->mnt->mnt_root == path->dentry; } +void file_f_owner_release(struct file *file); diff --git a/fs/iomap/buffered-io.c 
b/fs/iomap/buffered-io.c index f420c53d86ac..9b4ca3811a24 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -900,7 +900,7 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t bh_written; bh_written = block_write_end(NULL, iter->inode->i_mapping, pos, - len, copied, &folio->page, NULL); + len, copied, folio, NULL); WARN_ON_ONCE(bh_written != copied && bh_written != 0); return bh_written == copied; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index e12cb145147e..13c18ccc13b0 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -23,10 +23,10 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *pg, void *fsdata); + struct folio *folio, void *fsdata); static int jffs2_write_begin(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata); + struct folio **foliop, void **fsdata); static int jffs2_read_folio(struct file *filp, struct folio *folio); int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) @@ -77,29 +77,27 @@ const struct address_space_operations jffs2_file_address_operations = .write_end = jffs2_write_end, }; -static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) +static int jffs2_do_readpage_nolock(struct inode *inode, struct folio *folio) { struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); - unsigned char *pg_buf; + unsigned char *kaddr; int ret; jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n", - __func__, inode->i_ino, pg->index << PAGE_SHIFT); + __func__, inode->i_ino, folio->index << PAGE_SHIFT); - BUG_ON(!PageLocked(pg)); + BUG_ON(!folio_test_locked(folio)); - pg_buf = kmap(pg); - /* FIXME: Can kmap fail? */ - - ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_SHIFT, + kaddr = kmap_local_folio(folio, 0); + ret = jffs2_read_inode_range(c, f, kaddr, folio->index << PAGE_SHIFT, PAGE_SIZE); + kunmap_local(kaddr); if (!ret) - SetPageUptodate(pg); + folio_mark_uptodate(folio); - flush_dcache_page(pg); - kunmap(pg); + flush_dcache_folio(folio); jffs2_dbg(2, "readpage finished\n"); return ret; @@ -107,7 +105,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) int __jffs2_read_folio(struct file *file, struct folio *folio) { - int ret = jffs2_do_readpage_nolock(folio->mapping->host, &folio->page); + int ret = jffs2_do_readpage_nolock(folio->mapping->host, folio); folio_unlock(folio); return ret; } @@ -125,9 +123,9 @@ static int jffs2_read_folio(struct file *file, struct folio *folio) static int jffs2_write_begin(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { - struct page *pg; + struct folio *folio; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); @@ -206,29 +204,30 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, * page in read_cache_page(), which causes a deadlock. 
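Aside: the jffs2 conversion above trades kmap()/kunmap() for kmap_local_folio()/kunmap_local(). A minimal sketch of the local-mapping discipline those helpers impose: map only while touching the bytes, unmap before doing anything that may sleep, and keep each access within one page of the folio (copy_from_folio_sketch() is an illustrative name):

/*
 * Sketch only: short-lived local mapping of folio data, as in the
 * converted jffs2_do_readpage_nolock()/jffs2_write_end() above.
 */
#include <linux/highmem.h>
#include <linux/string.h>

static void copy_from_folio_sketch(struct folio *folio, size_t offset,
				   void *dst, size_t len)
{
	void *kaddr = kmap_local_folio(folio, offset);

	/* 'offset + len' must not cross a page boundary in the folio. */
	memcpy(dst, kaddr, len);
	kunmap_local(kaddr);
}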
*/ mutex_lock(&c->alloc_sem); - pg = grab_cache_page_write_begin(mapping, index); - if (!pg) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto release_sem; } - *pagep = pg; + *foliop = folio; /* - * Read in the page if it wasn't already present. Cannot optimize away - * the whole page write case until jffs2_write_end can handle the + * Read in the folio if it wasn't already present. Cannot optimize away + * the whole folio write case until jffs2_write_end can handle the * case of a short-copy. */ - if (!PageUptodate(pg)) { + if (!folio_test_uptodate(folio)) { mutex_lock(&f->sem); - ret = jffs2_do_readpage_nolock(inode, pg); + ret = jffs2_do_readpage_nolock(inode, folio); mutex_unlock(&f->sem); if (ret) { - unlock_page(pg); - put_page(pg); + folio_unlock(folio); + folio_put(folio); goto release_sem; } } - jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); + jffs2_dbg(1, "end write_begin(). folio->flags %lx\n", folio->flags); release_sem: mutex_unlock(&c->alloc_sem); @@ -238,7 +237,7 @@ out_err: static int jffs2_write_end(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *pg, void *fsdata) + struct folio *folio, void *fsdata) { /* Actually commit the write from the page cache page we're looking at. * For now, we write the full page out each time. It sucks, but it's simple @@ -252,16 +251,17 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, unsigned aligned_start = start & ~3; int ret = 0; uint32_t writtenlen = 0; + void *buf; - jffs2_dbg(1, "%s(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n", - __func__, inode->i_ino, pg->index << PAGE_SHIFT, - start, end, pg->flags); + jffs2_dbg(1, "%s(): ino #%lu, page at 0x%llx, range %d-%d, flags %lx\n", + __func__, inode->i_ino, folio_pos(folio), + start, end, folio->flags); /* We need to avoid deadlock with page_cache_read() in - jffs2_garbage_collect_pass(). So the page must be + jffs2_garbage_collect_pass(). So the folio must be up to date to prevent page_cache_read() from trying to re-lock it. */ - BUG_ON(!PageUptodate(pg)); + BUG_ON(!folio_test_uptodate(folio)); if (end == PAGE_SIZE) { /* When writing out the end of a page, write out the @@ -276,8 +276,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, if (!ri) { jffs2_dbg(1, "%s(): Allocation of raw inode failed\n", __func__); - unlock_page(pg); - put_page(pg); + folio_unlock(folio); + folio_put(folio); return -ENOMEM; } @@ -289,15 +289,11 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, ri->isize = cpu_to_je32((uint32_t)inode->i_size); ri->atime = ri->ctime = ri->mtime = cpu_to_je32(JFFS2_NOW()); - /* In 2.4, it was already kmapped by generic_file_write(). Doesn't - hurt to do it again. The alternative is ifdefs, which are ugly. */ - kmap(pg); - - ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start, - (pg->index << PAGE_SHIFT) + aligned_start, + buf = kmap_local_folio(folio, aligned_start); + ret = jffs2_write_inode_range(c, f, ri, buf, + folio_pos(folio) + aligned_start, end - aligned_start, &writtenlen); - - kunmap(pg); + kunmap_local(buf); if (ret) mapping_set_error(mapping, ret); @@ -323,12 +319,12 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, it gets reread */ jffs2_dbg(1, "%s(): Not all bytes written. 
Marking page !uptodate\n", __func__); - ClearPageUptodate(pg); + folio_clear_uptodate(folio); } jffs2_dbg(1, "%s() returning %d\n", __func__, writtenlen > 0 ? writtenlen : ret); - unlock_page(pg); - put_page(pg); + folio_unlock(folio); + folio_put(folio); return writtenlen > 0 ? writtenlen : ret; } diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 5c6602f3c189..822949d0eb00 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -1171,7 +1171,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era uint32_t alloclen, offset, orig_end, orig_start; int ret = 0; unsigned char *comprbuf = NULL, *writebuf; - struct page *page; + struct folio *folio; unsigned char *pg_ptr; memset(&ri, 0, sizeof(ri)); @@ -1317,25 +1317,25 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era BUG_ON(start > orig_start); } - /* The rules state that we must obtain the page lock *before* f->sem, so + /* The rules state that we must obtain the folio lock *before* f->sem, so * drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's * actually going to *change* so we're safe; we only allow reading. * * It is important to note that jffs2_write_begin() will ensure that its - * page is marked Uptodate before allocating space. That means that if we - * end up here trying to GC the *same* page that jffs2_write_begin() is - * trying to write out, read_cache_page() will not deadlock. */ + * folio is marked uptodate before allocating space. That means that if we + * end up here trying to GC the *same* folio that jffs2_write_begin() is + * trying to write out, read_cache_folio() will not deadlock. */ mutex_unlock(&f->sem); - page = read_cache_page(inode->i_mapping, start >> PAGE_SHIFT, + folio = read_cache_folio(inode->i_mapping, start >> PAGE_SHIFT, __jffs2_read_folio, NULL); - if (IS_ERR(page)) { - pr_warn("read_cache_page() returned error: %ld\n", - PTR_ERR(page)); + if (IS_ERR(folio)) { + pr_warn("read_cache_folio() returned error: %ld\n", + PTR_ERR(folio)); mutex_lock(&f->sem); - return PTR_ERR(page); + return PTR_ERR(folio); } - pg_ptr = kmap(page); + pg_ptr = kmap_local_folio(folio, 0); mutex_lock(&f->sem); offset = start; @@ -1400,7 +1400,6 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era } } - kunmap(page); - put_page(page); + folio_release_kmap(folio, pg_ptr); return ret; } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 1a6b5921d17a..07cfdc440596 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -292,11 +292,11 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to) static int jfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, jfs_get_block); + ret = block_write_begin(mapping, pos, len, foliop, jfs_get_block); if (unlikely(ret)) jfs_write_failed(mapping, pos + len); @@ -304,12 +304,12 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping, } static int jfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct page *page, + loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (ret < len) jfs_write_failed(mapping, pos + len); return ret; diff --git 
a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c index 575cb2ba74fc..5f4b305030ad 100644 --- a/fs/jfs/jfs_discard.c +++ b/fs/jfs/jfs_discard.c @@ -65,7 +65,7 @@ void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks) int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range) { struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; - struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + struct bmap *bmp; struct super_block *sb = ipbmap->i_sb; int agno, agno_end; u64 start, end, minlen; @@ -83,10 +83,15 @@ int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range) if (minlen == 0) minlen = 1; + down_read(&sb->s_umount); + bmp = JFS_SBI(ip->i_sb)->bmap; + if (minlen > bmp->db_agsize || start >= bmp->db_mapsize || - range->len < sb->s_blocksize) + range->len < sb->s_blocksize) { + up_read(&sb->s_umount); return -EINVAL; + } if (end >= bmp->db_mapsize) end = bmp->db_mapsize - 1; @@ -100,6 +105,8 @@ int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range) trimmed += dbDiscardAG(ip, agno, minlen); agno++; } + + up_read(&sb->s_umount); range->len = trimmed << sb->s_blocksize_bits; return 0; diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 5713994328cb..974ecf5e0d95 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -187,7 +187,7 @@ int dbMount(struct inode *ipbmap) } bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); - if (!bmp->db_numag) { + if (!bmp->db_numag || bmp->db_numag >= MAXAG) { err = -EINVAL; goto err_release_metapage; } @@ -652,7 +652,7 @@ int dbNextAG(struct inode *ipbmap) * average free space. */ for (i = 0 ; i < bmp->db_numag; i++, agpref++) { - if (agpref == bmp->db_numag) + if (agpref >= bmp->db_numag) agpref = 0; if (atomic_read(&bmp->db_active[agpref])) @@ -2944,9 +2944,10 @@ static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl) static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl) { int ti, n = 0, k, x = 0; - int max_size; + int max_size, max_idx; max_size = is_ctl ? CTLTREESIZE : TREESIZE; + max_idx = is_ctl ? LPERCTL : LPERDMAP; /* first check the root of the tree to see if there is * sufficient free space. @@ -2978,6 +2979,8 @@ static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl) */ assert(n < 4); } + if (le32_to_cpu(tp->dmt_leafidx) >= max_idx) + return -ENOSPC; /* set the return to the leftmost leaf describing sufficient * free space. @@ -3022,7 +3025,7 @@ static int dbFindBits(u32 word, int l2nb) /* scan the word for nb free bits at nb alignments. 
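 *
 * (Worked example, for illustration only: with nb = 8 on a 32-bit dmap
 *  word the loop below probes the byte-aligned runs at bitno 0, 8, 16
 *  and 24 in turn; mask selects the current nb-bit run and is shifted
 *  right by nb each pass, so the scan terminates once the mask has
 *  been shifted down to zero.)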
*/ - for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) { + for (bitno = 0; mask != 0; bitno += nb, mask = (mask >> nb)) { if ((mask & word) == mask) break; } diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 1407feccbc2d..a360b24ed320 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -1360,7 +1360,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) /* get the ag number of this iag */ agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb)); dn_numag = JFS_SBI(pip->i_sb)->bmap->db_numag; - if (agno < 0 || agno > dn_numag) + if (agno < 0 || agno > dn_numag || agno >= MAXAG) return -EIO; if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) { diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 2999ed5d83f5..0fb05e314edf 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -434,6 +434,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) int rc; int quota_allocation = 0; + memset(&ea_buf->new_ea, 0, sizeof(ea_buf->new_ea)); + /* When fsck.jfs clears a bad ea, it doesn't clear the size */ if (ji->ea.flag == 0) ea_size = 0; diff --git a/fs/libfs.c b/fs/libfs.c index 8aa34870449f..46966fd8bcf9 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -450,6 +450,14 @@ void simple_offset_destroy(struct offset_ctx *octx) mtree_destroy(&octx->mt); } +static int offset_dir_open(struct inode *inode, struct file *file) +{ + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); + + file->private_data = (void *)ctx->next_offset; + return 0; +} + /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated @@ -463,6 +471,9 @@ void simple_offset_destroy(struct offset_ctx *octx) */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { + struct inode *inode = file->f_inode; + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); + switch (whence) { case SEEK_CUR: offset += file->f_pos; @@ -476,7 +487,8 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) } /* In this case, ->private_data is protected by f_pos_lock */ - file->private_data = NULL; + if (!offset) + file->private_data = (void *)ctx->next_offset; return vfs_setpos(file, offset, LONG_MAX); } @@ -507,7 +519,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } -static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index) { struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dentry; @@ -515,17 +527,21 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) while (true) { dentry = offset_find_next(octx, ctx->pos); if (!dentry) - return ERR_PTR(-ENOENT); + return; + + if (dentry2offset(dentry) >= last_index) { + dput(dentry); + return; + } if (!offset_dir_emit(ctx, dentry)) { dput(dentry); - break; + return; } ctx->pos = dentry2offset(dentry) + 1; dput(dentry); } - return NULL; } /** @@ -552,22 +568,19 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; + long last_index = (long)file->private_data; lockdep_assert_held(&d_inode(dir)->i_rwsem); if (!dir_emit_dots(file, ctx)) return 0; - /* In this case, ->private_data is protected by f_pos_lock */ - if (ctx->pos == DIR_OFFSET_MIN) - 
file->private_data = NULL; - else if (file->private_data == ERR_PTR(-ENOENT)) - return 0; - file->private_data = offset_iterate_dir(d_inode(dir), ctx); + offset_iterate_dir(d_inode(dir), ctx, last_index); return 0; } const struct file_operations simple_offset_dir_operations = { + .open = offset_dir_open, .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir, @@ -901,7 +914,7 @@ static int simple_read_folio(struct file *file, struct folio *folio) int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct folio *folio; @@ -910,7 +923,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping, if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { size_t from = offset_in_folio(folio, pos); @@ -929,11 +942,11 @@ EXPORT_SYMBOL(simple_write_begin); * @pos: " * @len: " * @copied: " - * @page: " + * @folio: " * @fsdata: " * - * simple_write_end does the minimum needed for updating a page after writing is - * done. It has the same API signature as the .write_end of + * simple_write_end does the minimum needed for updating a folio after + * writing is done. It has the same API signature as the .write_end of * address_space_operations vector. So it can just be set onto .write_end for * FSes that don't need any other processing. i_mutex is assumed to be held. * Block based filesystems should use generic_write_end(). @@ -946,9 +959,8 @@ EXPORT_SYMBOL(simple_write_begin); */ static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; @@ -1990,13 +2002,19 @@ bool inode_maybe_inc_iversion(struct inode *inode, bool force) * information, but the legacy inode_inc_iversion code used a spinlock * to serialize increments. * - * Here, we add full memory barriers to ensure that any de-facto - * ordering with other info is preserved. + * We add a full memory barrier to ensure that any de facto ordering + * with other state is preserved (either implicitly coming from cmpxchg + * or explicitly from smp_mb if we don't know upfront if we will execute + * the former). * - * This barrier pairs with the barrier in inode_query_iversion() + * These barriers pair with inode_query_iversion(). */ - smp_mb(); cur = inode_peek_iversion_raw(inode); + if (!force && !(cur & I_VERSION_QUERIED)) { + smp_mb(); + cur = inode_peek_iversion_raw(inode); + } + do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) @@ -2025,20 +2043,22 @@ EXPORT_SYMBOL(inode_maybe_inc_iversion); u64 inode_query_iversion(struct inode *inode) { u64 cur, new; + bool fenced = false; + /* + * Memory barriers (implicit in cmpxchg, explicit in smp_mb) pair with + * inode_maybe_inc_iversion(), see that routine for more details. + */ cur = inode_peek_iversion_raw(inode); do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { - /* - * This barrier (and the implicit barrier in the - * cmpxchg below) pairs with the barrier in - * inode_maybe_inc_iversion(). 
- */ - smp_mb(); + if (!fenced) + smp_mb(); break; } + fenced = true; new = cur | I_VERSION_QUERIED; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; @@ -2104,12 +2124,12 @@ struct timespec64 simple_inode_init_ts(struct inode *inode) } EXPORT_SYMBOL(simple_inode_init_ts); -static inline struct dentry *get_stashed_dentry(struct dentry *stashed) +static inline struct dentry *get_stashed_dentry(struct dentry **stashed) { struct dentry *dentry; guard(rcu)(); - dentry = READ_ONCE(stashed); + dentry = rcu_dereference(*stashed); if (!dentry) return NULL; if (!lockref_get_not_dead(&dentry->d_lockref)) @@ -2206,7 +2226,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. */ - path->dentry = get_stashed_dentry(*stashed); + path->dentry = get_stashed_dentry(stashed); if (path->dentry) { sops->put_data(data); goto out_path; diff --git a/fs/locks.c b/fs/locks.c index 9afb16e0683f..b51b1c395ce6 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1451,7 +1451,7 @@ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) struct file *filp = fl->c.flc_file; f_delown(filp); - filp->f_owner.signum = 0; + file_f_owner(filp)->signum = 0; fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync); if (fl->fl_fasync != NULL) { printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); @@ -1783,6 +1783,10 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr lease = *flp; trace_generic_add_lease(inode, lease); + error = file_f_owner_allocate(filp); + if (error) + return error; + /* Note that arg is never F_UNLCK here */ ctx = locks_get_lock_context(inode, arg); if (!ctx) @@ -2984,7 +2988,7 @@ static int __init filelock_init(void) filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); - filelease_cache = kmem_cache_create("file_lock_cache", + filelease_cache = kmem_cache_create("file_lease_cache", sizeof(struct file_lease), 0, SLAB_PANIC, NULL); for_each_possible_cpu(i) { diff --git a/fs/minix/dir.c b/fs/minix/dir.c index a224cf222570..dd2a425b41f0 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -40,18 +40,18 @@ minix_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; - block_write_end(NULL, mapping, pos, len, len, page, NULL); + block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } - unlock_page(page); + folio_unlock(folio); } static int minix_handle_dirsync(struct inode *dir) @@ -64,14 +64,15 @@ static int minix_handle_dirsync(struct inode *dir) return err; } -static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p) +static void *dir_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) { - struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); - *p = page; - return kmap_local_page(page); + struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); + + if (IS_ERR(folio)) + return ERR_CAST(folio); + *foliop 
= folio; + return kmap_local_folio(folio, 0); } static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) @@ -99,9 +100,9 @@ static int minix_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { char *p, *kaddr, *limit; - struct page *page; + struct folio *folio; - kaddr = dir_get_page(inode, n, &page); + kaddr = dir_get_folio(inode, n, &folio); if (IS_ERR(kaddr)) continue; p = kaddr+offset; @@ -122,13 +123,13 @@ static int minix_readdir(struct file *file, struct dir_context *ctx) unsigned l = strnlen(name, sbi->s_namelen); if (!dir_emit(ctx, name, l, inumber, DT_UNKNOWN)) { - unmap_and_put_page(page, p); + folio_release_kmap(folio, p); return 0; } } ctx->pos += chunk_size; } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 0; } @@ -144,12 +145,13 @@ static inline int namecompare(int len, int maxlen, /* * minix_find_entry() * - * finds an entry in the specified directory with the wanted name. It - * returns the cache buffer in which the entry was found, and the entry - * itself (as a parameter - res_dir). It does NOT read the inode of the + * finds an entry in the specified directory with the wanted name. + * It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. + * + * On Success folio_release_kmap() should be called on *foliop. */ -minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) +minix_dirent *minix_find_entry(struct dentry *dentry, struct folio **foliop) { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; @@ -158,17 +160,15 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) struct minix_sb_info * sbi = minix_sb(sb); unsigned long n; unsigned long npages = dir_pages(dir); - struct page *page = NULL; char *p; char *namx; __u32 inumber; - *res_page = NULL; for (n = 0; n < npages; n++) { char *kaddr, *limit; - kaddr = dir_get_page(dir, n, &page); + kaddr = dir_get_folio(dir, n, foliop); if (IS_ERR(kaddr)) continue; @@ -188,12 +188,11 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) if (namecompare(namelen, sbi->s_namelen, name, namx)) goto found; } - unmap_and_put_page(page, kaddr); + folio_release_kmap(*foliop, kaddr); } return NULL; found: - *res_page = page; return (minix_dirent *)p; } @@ -204,7 +203,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) int namelen = dentry->d_name.len; struct super_block * sb = dir->i_sb; struct minix_sb_info * sbi = minix_sb(sb); - struct page *page = NULL; + struct folio *folio = NULL; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr, *p; @@ -223,10 +222,10 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) for (n = 0; n <= npages; n++) { char *limit, *dir_end; - kaddr = dir_get_page(dir, n, &page); + kaddr = dir_get_folio(dir, n, &folio); if (IS_ERR(kaddr)) return PTR_ERR(kaddr); - lock_page(page); + folio_lock(folio); dir_end = kaddr + minix_last_byte(dir, n); limit = kaddr + PAGE_SIZE - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { @@ -253,15 +252,15 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) if (namecompare(namelen, sbi->s_namelen, name, namx)) goto out_unlock; } - unlock_page(page); - unmap_and_put_page(page, kaddr); + folio_unlock(folio); + folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + offset_in_page(p); - err = minix_prepare_chunk(page, pos, sbi->s_dirsize); + 
pos = folio_pos(folio) + offset_in_folio(folio, p); + err = minix_prepare_chunk(folio, pos, sbi->s_dirsize); if (err) goto out_unlock; memcpy (namx, name, namelen); @@ -272,37 +271,37 @@ got_it: memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2); de->inode = inode->i_ino; } - dir_commit_chunk(page, pos, sbi->s_dirsize); + dir_commit_chunk(folio, pos, sbi->s_dirsize); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = minix_handle_dirsync(dir); out_put: - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return err; out_unlock: - unlock_page(page); + folio_unlock(folio); goto out_put; } -int minix_delete_entry(struct minix_dir_entry *de, struct page *page) +int minix_delete_entry(struct minix_dir_entry *de, struct folio *folio) { - struct inode *inode = page->mapping->host; - loff_t pos = page_offset(page) + offset_in_page(de); + struct inode *inode = folio->mapping->host; + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); struct minix_sb_info *sbi = minix_sb(inode->i_sb); unsigned len = sbi->s_dirsize; int err; - lock_page(page); - err = minix_prepare_chunk(page, pos, len); + folio_lock(folio); + err = minix_prepare_chunk(folio, pos, len); if (err) { - unlock_page(page); + folio_unlock(folio); return err; } if (sbi->s_version == MINIX_V3) ((minix3_dirent *)de)->inode = 0; else de->inode = 0; - dir_commit_chunk(page, pos, len); + dir_commit_chunk(folio, pos, len); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return minix_handle_dirsync(inode); @@ -310,21 +309,21 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page) int minix_make_empty(struct inode *inode, struct inode *dir) { - struct page *page = grab_cache_page(inode->i_mapping, 0); + struct folio *folio = filemap_grab_folio(inode->i_mapping, 0); struct minix_sb_info *sbi = minix_sb(inode->i_sb); char *kaddr; int err; - if (!page) - return -ENOMEM; - err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize); + if (IS_ERR(folio)) + return PTR_ERR(folio); + err = minix_prepare_chunk(folio, 0, 2 * sbi->s_dirsize); if (err) { - unlock_page(page); + folio_unlock(folio); goto fail; } - kaddr = kmap_local_page(page); - memset(kaddr, 0, PAGE_SIZE); + kaddr = kmap_local_folio(folio, 0); + memset(kaddr, 0, folio_size(folio)); if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)kaddr; @@ -345,10 +344,10 @@ int minix_make_empty(struct inode *inode, struct inode *dir) } kunmap_local(kaddr); - dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); + dir_commit_chunk(folio, 0, 2 * sbi->s_dirsize); err = minix_handle_dirsync(inode); fail: - put_page(page); + folio_put(folio); return err; } @@ -357,7 +356,7 @@ fail: */ int minix_empty_dir(struct inode * inode) { - struct page *page = NULL; + struct folio *folio = NULL; unsigned long i, npages = dir_pages(inode); struct minix_sb_info *sbi = minix_sb(inode->i_sb); char *name, *kaddr; @@ -366,7 +365,7 @@ int minix_empty_dir(struct inode * inode) for (i = 0; i < npages; i++) { char *p, *limit; - kaddr = dir_get_page(inode, i, &page); + kaddr = dir_get_folio(inode, i, &folio); if (IS_ERR(kaddr)) continue; @@ -395,44 +394,44 @@ int minix_empty_dir(struct inode * inode) goto not_empty; } } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 1; not_empty: - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return 0; } /* Releases the page */ -int minix_set_link(struct minix_dir_entry *de, struct page *page, +int 
minix_set_link(struct minix_dir_entry *de, struct folio *folio, struct inode *inode) { - struct inode *dir = page->mapping->host; + struct inode *dir = folio->mapping->host; struct minix_sb_info *sbi = minix_sb(dir->i_sb); - loff_t pos = page_offset(page) + offset_in_page(de); + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); int err; - lock_page(page); - err = minix_prepare_chunk(page, pos, sbi->s_dirsize); + folio_lock(folio); + err = minix_prepare_chunk(folio, pos, sbi->s_dirsize); if (err) { - unlock_page(page); + folio_unlock(folio); return err; } if (sbi->s_version == MINIX_V3) ((minix3_dirent *)de)->inode = inode->i_ino; else de->inode = inode->i_ino; - dir_commit_chunk(page, pos, sbi->s_dirsize); + dir_commit_chunk(folio, pos, sbi->s_dirsize); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return minix_handle_dirsync(dir); } -struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) +struct minix_dir_entry *minix_dotdot(struct inode *dir, struct folio **foliop) { struct minix_sb_info *sbi = minix_sb(dir->i_sb); - struct minix_dir_entry *de = dir_get_page(dir, 0, p); + struct minix_dir_entry *de = dir_get_folio(dir, 0, foliop); if (!IS_ERR(de)) return minix_next_entry(de, sbi); @@ -441,20 +440,19 @@ struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) ino_t minix_inode_by_name(struct dentry *dentry) { - struct page *page; - struct minix_dir_entry *de = minix_find_entry(dentry, &page); + struct folio *folio; + struct minix_dir_entry *de = minix_find_entry(dentry, &folio); ino_t res = 0; if (de) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; + struct inode *inode = folio->mapping->host; struct minix_sb_info *sbi = minix_sb(inode->i_sb); if (sbi->s_version == MINIX_V3) res = ((minix3_dirent *) de)->inode; else res = de->inode; - unmap_and_put_page(page, de); + folio_release_kmap(folio, de); } return res; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 1c3df63162ef..f007e389d5d2 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -427,9 +427,9 @@ static int minix_read_folio(struct file *file, struct folio *folio) return block_read_full_folio(folio, minix_get_block); } -int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len) +int minix_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(page, pos, len, minix_get_block); + return __block_write_begin(folio, pos, len, minix_get_block); } static void minix_write_failed(struct address_space *mapping, loff_t to) @@ -444,11 +444,11 @@ static void minix_write_failed(struct address_space *mapping, loff_t to) static int minix_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, minix_get_block); + ret = block_write_begin(mapping, pos, len, foliop, minix_get_block); if (unlikely(ret)) minix_write_failed(mapping, pos + len); diff --git a/fs/minix/minix.h b/fs/minix/minix.h index d493507c064f..d54273c3c9ff 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -42,18 +42,18 @@ struct minix_sb_info { unsigned short s_version; }; -extern struct inode *minix_iget(struct super_block *, unsigned long); -extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); -extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct 
buffer_head **); -extern struct inode * minix_new_inode(const struct inode *, umode_t); -extern void minix_free_inode(struct inode * inode); -extern unsigned long minix_count_free_inodes(struct super_block *sb); -extern int minix_new_block(struct inode * inode); -extern void minix_free_block(struct inode *inode, unsigned long block); -extern unsigned long minix_count_free_blocks(struct super_block *sb); -extern int minix_getattr(struct mnt_idmap *, const struct path *, - struct kstat *, u32, unsigned int); -extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); +struct inode *minix_iget(struct super_block *, unsigned long); +struct minix_inode *minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); +struct minix2_inode *minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); +struct inode *minix_new_inode(const struct inode *, umode_t); +void minix_free_inode(struct inode *inode); +unsigned long minix_count_free_inodes(struct super_block *sb); +int minix_new_block(struct inode *inode); +void minix_free_block(struct inode *inode, unsigned long block); +unsigned long minix_count_free_blocks(struct super_block *sb); +int minix_getattr(struct mnt_idmap *, const struct path *, + struct kstat *, u32, unsigned int); +int minix_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); extern void V1_minix_truncate(struct inode *); extern void V2_minix_truncate(struct inode *); @@ -64,15 +64,15 @@ extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int); extern unsigned V1_minix_blocks(loff_t, struct super_block *); extern unsigned V2_minix_blocks(loff_t, struct super_block *); -extern struct minix_dir_entry *minix_find_entry(struct dentry*, struct page**); -extern int minix_add_link(struct dentry*, struct inode*); -extern int minix_delete_entry(struct minix_dir_entry*, struct page*); -extern int minix_make_empty(struct inode*, struct inode*); -extern int minix_empty_dir(struct inode*); -int minix_set_link(struct minix_dir_entry *de, struct page *page, +struct minix_dir_entry *minix_find_entry(struct dentry *, struct folio **); +int minix_add_link(struct dentry*, struct inode*); +int minix_delete_entry(struct minix_dir_entry *, struct folio *); +int minix_make_empty(struct inode*, struct inode*); +int minix_empty_dir(struct inode*); +int minix_set_link(struct minix_dir_entry *de, struct folio *folio, struct inode *inode); -extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); -extern ino_t minix_inode_by_name(struct dentry*); +struct minix_dir_entry *minix_dotdot(struct inode*, struct folio **); +ino_t minix_inode_by_name(struct dentry*); extern const struct inode_operations minix_file_inode_operations; extern const struct inode_operations minix_dir_inode_operations; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index a944a0f17b53..5d9c1406fe27 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -141,15 +141,15 @@ out_fail: static int minix_unlink(struct inode * dir, struct dentry *dentry) { struct inode * inode = d_inode(dentry); - struct page * page; + struct folio *folio; struct minix_dir_entry * de; int err; - de = minix_find_entry(dentry, &page); + de = minix_find_entry(dentry, &folio); if (!de) return -ENOENT; - err = minix_delete_entry(de, page); - unmap_and_put_page(page, de); + err = minix_delete_entry(de, folio); + folio_release_kmap(folio, de); if (err) return err; @@ -180,28 +180,28 @@ static int minix_rename(struct mnt_idmap *idmap, { struct inode * old_inode = 
d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); - struct page * dir_page = NULL; + struct folio * dir_folio = NULL; struct minix_dir_entry * dir_de = NULL; - struct page * old_page; + struct folio *old_folio; struct minix_dir_entry * old_de; int err = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; - old_de = minix_find_entry(old_dentry, &old_page); + old_de = minix_find_entry(old_dentry, &old_folio); if (!old_de) goto out; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = minix_dotdot(old_inode, &dir_page); + dir_de = minix_dotdot(old_inode, &dir_folio); if (!dir_de) goto out_old; } if (new_inode) { - struct page * new_page; + struct folio *new_folio; struct minix_dir_entry * new_de; err = -ENOTEMPTY; @@ -209,11 +209,11 @@ static int minix_rename(struct mnt_idmap *idmap, goto out_dir; err = -ENOENT; - new_de = minix_find_entry(new_dentry, &new_page); + new_de = minix_find_entry(new_dentry, &new_folio); if (!new_de) goto out_dir; - err = minix_set_link(new_de, new_page, old_inode); - unmap_and_put_page(new_page, new_de); + err = minix_set_link(new_de, new_folio, old_inode); + folio_release_kmap(new_folio, new_de); if (err) goto out_dir; inode_set_ctime_current(new_inode); @@ -228,22 +228,22 @@ static int minix_rename(struct mnt_idmap *idmap, inode_inc_link_count(new_dir); } - err = minix_delete_entry(old_de, old_page); + err = minix_delete_entry(old_de, old_folio); if (err) goto out_dir; mark_inode_dirty(old_inode); if (dir_de) { - err = minix_set_link(dir_de, dir_page, new_dir); + err = minix_set_link(dir_de, dir_folio, new_dir); if (!err) inode_dec_link_count(old_dir); } out_dir: if (dir_de) - unmap_and_put_page(dir_page, dir_de); + folio_release_kmap(dir_folio, dir_de); out_old: - unmap_and_put_page(old_page, old_de); + folio_release_kmap(old_folio, old_de); out: return err; } diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 3c60f1eaca61..79491663dbc0 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -228,15 +228,15 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from, return 0; } - forward = kmemdup(map_from->forward, - nr_extents * sizeof(struct uid_gid_extent), - GFP_KERNEL_ACCOUNT); + forward = kmemdup_array(map_from->forward, nr_extents, + sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); if (!forward) return -ENOMEM; - reverse = kmemdup(map_from->reverse, - nr_extents * sizeof(struct uid_gid_extent), - GFP_KERNEL_ACCOUNT); + reverse = kmemdup_array(map_from->reverse, nr_extents, + sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); if (!reverse) { kfree(forward); return -ENOMEM; diff --git a/fs/mount.h b/fs/mount.h index ad4b1ddebb54..185fc56afc13 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -153,5 +153,17 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) list_add_tail(&mnt->mnt_list, dt_list); } -extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); bool has_locked_children(struct mount *mnt, struct dentry *dentry); +struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous); +static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns) +{ + return __lookup_next_mnt_ns(mntns, false); +} +static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns) +{ + return __lookup_next_mnt_ns(mntns, true); +} +static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) +{ + return container_of(ns, struct mnt_namespace, ns); +} diff --git a/fs/namei.c b/fs/namei.c index 
5512cb10fa89..891b169e38c9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1639,6 +1639,20 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, } EXPORT_SYMBOL(lookup_one_qstr_excl); +/** + * lookup_fast - do fast lockless (but racy) lookup of a dentry + * @nd: current nameidata + * + * Do a fast, but racy lookup in the dcache for the given dentry, and + * revalidate it. Returns a valid dentry pointer or NULL if one wasn't + * found. On error, an ERR_PTR will be returned. + * + * If this function returns a valid dentry and the walk is no longer + * lazy, the dentry will carry a reference that must later be put. If + * RCU mode is still in force, then this is not the case and the dentry + * must be legitimized before use. If this returns NULL, then the walk + * will no longer be in RCU mode. + */ static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; @@ -3521,6 +3535,9 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, return dentry; } + if (open_flag & O_CREAT) + audit_inode(nd->name, dir, AUDIT_INODE_PARENT); + /* * Checking write permission is tricky, bacuse we don't know if we are * going to actually need it: O_CREAT opens should work as long as the @@ -3591,6 +3608,42 @@ out_dput: return ERR_PTR(error); } +static inline bool trailing_slashes(struct nameidata *nd) +{ + return (bool)nd->last.name[nd->last.len]; +} + +static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) +{ + struct dentry *dentry; + + if (open_flag & O_CREAT) { + if (trailing_slashes(nd)) + return ERR_PTR(-EISDIR); + + /* Don't bother on an O_EXCL create */ + if (open_flag & O_EXCL) + return NULL; + } + + if (trailing_slashes(nd)) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + + dentry = lookup_fast(nd); + if (IS_ERR_OR_NULL(dentry)) + return dentry; + + if (open_flag & O_CREAT) { + /* Discard negative dentries. Need inode_lock to do the create */ + if (!dentry->d_inode) { + if (!(nd->flags & LOOKUP_RCU)) + dput(dentry); + dentry = NULL; + } + } + return dentry; +} + static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { @@ -3608,28 +3661,22 @@ static const char *open_last_lookups(struct nameidata *nd, return handle_dots(nd, nd->last_type); } - if (!(open_flag & O_CREAT)) { - if (nd->last.name[nd->last.len]) - nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - /* we _can_ be in RCU mode here */ - dentry = lookup_fast(nd); - if (IS_ERR(dentry)) - return ERR_CAST(dentry); - if (likely(dentry)) - goto finish_lookup; + /* We _can_ be in RCU mode here */ + dentry = lookup_fast_for_open(nd, open_flag); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + if (likely(dentry)) + goto finish_lookup; + + if (!(open_flag & O_CREAT)) { if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) return ERR_PTR(-ECHILD); } else { - /* create side of things */ if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } - audit_inode(nd->name, dir, AUDIT_INODE_PARENT); - /* trailing slashes? 
*/ - if (unlikely(nd->last.name[nd->last.len])) - return ERR_PTR(-EISDIR); } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { @@ -5304,7 +5351,7 @@ int page_symlink(struct inode *inode, const char *symname, int len) struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); - struct page *page; + struct folio *folio; void *fsdata = NULL; int err; unsigned int flags; @@ -5312,16 +5359,16 @@ int page_symlink(struct inode *inode, const char *symname, int len) retry: if (nofs) flags = memalloc_nofs_save(); - err = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata); + err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); if (nofs) memalloc_nofs_restore(flags); if (err) goto fail; - memcpy(page_address(page), symname, len-1); + memcpy(folio_address(folio), symname, len - 1); - err = aops->write_end(NULL, mapping, 0, len-1, len-1, - page, fsdata); + err = aops->write_end(NULL, mapping, 0, len - 1, len - 1, + folio, fsdata); if (err < 0) goto fail; if (err < len-1) diff --git a/fs/namespace.c b/fs/namespace.c index 328087a4df8a..e71e4564987b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1774,7 +1774,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) list_del_init(&p->mnt_child); } - /* Add propogated mounts to the tmp_list */ + /* Add propagated mounts to the tmp_list */ if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); @@ -2060,14 +2060,41 @@ static bool is_mnt_ns_file(struct dentry *dentry) dentry->d_fsdata == &mntns_operations; } -static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) +struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) { - return container_of(ns, struct mnt_namespace, ns); + return &mnt->ns; } -struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) +struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous) { - return &mnt->ns; + guard(read_lock)(&mnt_ns_tree_lock); + for (;;) { + struct rb_node *node; + + if (previous) + node = rb_prev(&mntns->mnt_ns_tree_node); + else + node = rb_next(&mntns->mnt_ns_tree_node); + if (!node) + return ERR_PTR(-ENOENT); + + mntns = node_to_mnt_ns(node); + node = &mntns->mnt_ns_tree_node; + + if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN)) + continue; + + /* + * Holding mnt_ns_tree_lock prevents the mount namespace from + * being freed but it may well be on it's deathbed. We want an + * active reference, not just a passive one here as we're + * persisting the mount namespace. + */ + if (!refcount_inc_not_zero(&mntns->ns.count)) + continue; + + return mntns; + } } static bool mnt_ns_loop(struct dentry *dentry) @@ -2921,8 +2948,15 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * if (!__mnt_is_readonly(mnt) && (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { - char *buf = (char *)__get_free_page(GFP_KERNEL); - char *mntpath = buf ? 
d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); + char *buf, *mntpath; + + buf = (char *)__get_free_page(GFP_KERNEL); + if (buf) + mntpath = d_path(mountpoint, buf, PAGE_SIZE); + else + mntpath = ERR_PTR(-ENOMEM); + if (IS_ERR(mntpath)) + mntpath = "(unknown)"; pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n", sb->s_type->name, @@ -2930,8 +2964,9 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * mntpath, &sb->s_time_max, (unsigned long long)sb->s_time_max); - free_page((unsigned long)buf); sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; + if (buf) + free_page((unsigned long)buf); } } @@ -5243,12 +5278,37 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, * that, or if not simply grab a passive reference on our mount namespace and * return that. */ -static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id) +static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq) { - if (mnt_ns_id) - return lookup_mnt_ns(mnt_ns_id); - refcount_inc(¤t->nsproxy->mnt_ns->passive); - return current->nsproxy->mnt_ns; + struct mnt_namespace *mnt_ns; + + if (kreq->mnt_ns_id && kreq->spare) + return ERR_PTR(-EINVAL); + + if (kreq->mnt_ns_id) + return lookup_mnt_ns(kreq->mnt_ns_id); + + if (kreq->spare) { + struct ns_common *ns; + + CLASS(fd, f)(kreq->spare); + if (!f.file) + return ERR_PTR(-EBADF); + + if (!proc_ns_file(f.file)) + return ERR_PTR(-EINVAL); + + ns = get_proc_ns(file_inode(f.file)); + if (ns->ops->type != CLONE_NEWNS) + return ERR_PTR(-EINVAL); + + mnt_ns = to_mnt_ns(ns); + } else { + mnt_ns = current->nsproxy->mnt_ns; + } + + refcount_inc(&mnt_ns->passive); + return mnt_ns; } SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, @@ -5269,7 +5329,7 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, if (ret) return ret; - ns = grab_requested_mnt_ns(kreq.mnt_ns_id); + ns = grab_requested_mnt_ns(&kreq); if (!ns) return -ENOENT; @@ -5396,7 +5456,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, if (!kmnt_ids) return -ENOMEM; - ns = grab_requested_mnt_ns(kreq.mnt_ns_id); + ns = grab_requested_mnt_ns(&kreq); if (!ns) return -ENOENT; @@ -5605,7 +5665,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, /* Only worry about locked mounts */ if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; - /* Is the directory permanetly empty? */ + /* Is the directory permanently empty? */ if (!is_empty_dir_inode(inode)) goto next; } diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index 1b78e8b65ebc..7701c037c328 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -24,7 +24,7 @@ config NETFS_STATS config NETFS_DEBUG bool "Enable dynamic debugging netfslib and FS-Cache" - depends on NETFS + depends on NETFS_SUPPORT help This permits debugging to be dynamically enabled in the local caching management module. 
If this is set, the debugging output may be diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index 8e6781e0b10b..d08b0bfb6756 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -5,12 +5,14 @@ netfs-y := \ buffered_write.o \ direct_read.o \ direct_write.o \ - io.o \ iterator.o \ locking.o \ main.o \ misc.o \ objects.o \ + read_collect.o \ + read_pgpriv2.o \ + read_retry.o \ write_collect.o \ write_issue.o diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a688d4c75d99..c40e226053cc 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -9,126 +9,6 @@ #include <linux/task_io_accounting_ops.h> #include "internal.h" -/* - * Unlock the folios in a read operation. We need to set PG_writeback on any - * folios we're going to write back before we unlock them. - * - * Note that if the deprecated NETFS_RREQ_USE_PGPRIV2 is set then we use - * PG_private_2 and do a direct write to the cache from here instead. - */ -void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - struct netfs_folio *finfo; - struct folio *folio; - pgoff_t start_page = rreq->start / PAGE_SIZE; - pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; - size_t account = 0; - bool subreq_failed = false; - - XA_STATE(xas, &rreq->mapping->i_pages, start_page); - - if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) { - __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags); - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - } - } - - /* Walk through the pagecache and the I/O request lists simultaneously. - * We may have a mixture of cached and uncached sections and we only - * really want to write out the uncached sections. This is slightly - * complicated by the possibility that we might have huge pages with a - * mixture inside. 
- */ - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - subreq_failed = (subreq->error < 0); - - trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); - - rcu_read_lock(); - xas_for_each(&xas, folio, last_page) { - loff_t pg_end; - bool pg_failed = false; - bool wback_to_cache = false; - bool folio_started = false; - - if (xas_retry(&xas, folio)) - continue; - - pg_end = folio_pos(folio) + folio_size(folio) - 1; - - for (;;) { - loff_t sreq_end; - - if (!subreq) { - pg_failed = true; - break; - } - if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { - if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, - &subreq->flags)) { - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_start_private_2(folio); - folio_started = true; - } - } else { - wback_to_cache |= - test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - } - pg_failed |= subreq_failed; - sreq_end = subreq->start + subreq->len - 1; - if (pg_end < sreq_end) - break; - - account += subreq->transferred; - if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { - subreq = list_next_entry(subreq, rreq_link); - subreq_failed = (subreq->error < 0); - } else { - subreq = NULL; - subreq_failed = false; - } - - if (pg_end == sreq_end) - break; - } - - if (!pg_failed) { - flush_dcache_folio(folio); - finfo = netfs_folio_info(folio); - if (finfo) { - trace_netfs_folio(folio, netfs_folio_trace_filled_gaps); - if (finfo->netfs_group) - folio_change_private(folio, finfo->netfs_group); - else - folio_detach_private(folio); - kfree(finfo); - } - folio_mark_uptodate(folio); - if (wback_to_cache && !WARN_ON_ONCE(folio_get_private(folio) != NULL)) { - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE); - filemap_dirty_folio(folio->mapping, folio); - } - } - - if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio->index == rreq->no_unlock_folio && - test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) - _debug("no unlock"); - else - folio_unlock(folio); - } - } - rcu_read_unlock(); - - task_io_account_read(account); - if (rreq->netfs_ops->done) - rreq->netfs_ops->done(rreq); -} - static void netfs_cache_expand_readahead(struct netfs_io_request *rreq, unsigned long long *_start, unsigned long long *_len, @@ -183,6 +63,336 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx)); } +/* + * Decant the list of folios to read into a rolling buffer. + */ +static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq, + struct folio_queue *folioq) +{ + unsigned int order, nr; + size_t size = 0; + + nr = __readahead_batch(rreq->ractl, (struct page **)folioq->vec.folios, + ARRAY_SIZE(folioq->vec.folios)); + folioq->vec.nr = nr; + for (int i = 0; i < nr; i++) { + struct folio *folio = folioq_folio(folioq, i); + + trace_netfs_folio(folio, netfs_folio_trace_read); + order = folio_order(folio); + folioq->orders[i] = order; + size += PAGE_SIZE << order; + } + + for (int i = nr; i < folioq_nr_slots(folioq); i++) + folioq_clear(folioq, i); + + return size; +} + +/* + * netfs_prepare_read_iterator - Prepare the subreq iterator for I/O + * @subreq: The subrequest to be set up + * + * Prepare the I/O iterator representing the read buffer on a subrequest for + * the filesystem to use for I/O (it can be passed directly to a socket). 
This + * is intended to be called from the ->issue_read() method once the filesystem + * has trimmed the request to the size it wants. + * + * Returns the limited size if successful and -ENOMEM if insufficient memory + * available. + * + * [!] NOTE: This must be run in the same thread as ->issue_read() was called + * in as we access the readahead_control struct. + */ +static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + size_t rsize = subreq->len; + + if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER) + rsize = umin(rsize, rreq->io_streams[0].sreq_max_len); + + if (rreq->ractl) { + /* If we don't have sufficient folios in the rolling buffer, + * extract a folioq's worth from the readahead region at a time + * into the buffer. Note that this acquires a ref on each page + * that we will need to release later - but we don't want to do + * that until after we've started the I/O. + */ + while (rreq->submitted < subreq->start + rsize) { + struct folio_queue *tail = rreq->buffer_tail, *new; + size_t added; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return -ENOMEM; + netfs_stat(&netfs_n_folioq); + folioq_init(new); + new->prev = tail; + tail->next = new; + rreq->buffer_tail = new; + added = netfs_load_buffer_from_ra(rreq, new); + rreq->iter.count += added; + rreq->submitted += added; + } + } + + subreq->len = rsize; + if (unlikely(rreq->io_streams[0].sreq_max_segs)) { + size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize, + rreq->io_streams[0].sreq_max_segs); + + if (limit < rsize) { + subreq->len = limit; + trace_netfs_sreq(subreq, netfs_sreq_trace_limited); + } + } + + subreq->io_iter = rreq->iter; + + if (iov_iter_is_folioq(&subreq->io_iter)) { + if (subreq->io_iter.folioq_slot >= folioq_nr_slots(subreq->io_iter.folioq)) { + subreq->io_iter.folioq = subreq->io_iter.folioq->next; + subreq->io_iter.folioq_slot = 0; + } + subreq->curr_folioq = (struct folio_queue *)subreq->io_iter.folioq; + subreq->curr_folioq_slot = subreq->io_iter.folioq_slot; + subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot]; + } + + iov_iter_truncate(&subreq->io_iter, subreq->len); + iov_iter_advance(&rreq->iter, subreq->len); + return subreq->len; +} + +static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq, + loff_t i_size) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + + if (!cres->ops) + return NETFS_DOWNLOAD_FROM_SERVER; + return cres->ops->prepare_read(subreq, i_size); +} + +static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_subrequest *subreq = priv; + + if (transferred_or_error < 0) { + netfs_read_subreq_terminated(subreq, transferred_or_error, was_async); + return; + } + + if (transferred_or_error > 0) + subreq->transferred += transferred_or_error; + netfs_read_subreq_terminated(subreq, 0, was_async); +} + +/* + * Issue a read against the cache. + * - Eats the caller's ref on subreq. 
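+ *
+ * (Illustrative sketch, an aside rather than part of the comment
+ *  above: since the subrequest ref is consumed by the completion
+ *  path, a caller is assumed to treat the pointer as dead once the
+ *  read has been issued:
+ *
+ *	netfs_read_cache_to_pagecache(rreq, subreq);
+ *	subreq = NULL;
+ * )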
+ */ +static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + + netfs_stat(&netfs_n_rh_read); + cres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_IGNORE, + netfs_cache_read_terminated, subreq); +} + +/* + * Perform a read to the pagecache from a series of sources of different types, + * slicing up the region to be read according to available cache blocks and + * network rsize. + */ +static void netfs_read_to_pagecache(struct netfs_io_request *rreq) +{ + struct netfs_inode *ictx = netfs_inode(rreq->inode); + unsigned long long start = rreq->start; + ssize_t size = rreq->len; + int ret = 0; + + atomic_inc(&rreq->nr_outstanding); + + do { + struct netfs_io_subrequest *subreq; + enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; + ssize_t slice; + + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) { + ret = -ENOMEM; + break; + } + + subreq->start = start; + subreq->len = size; + + atomic_inc(&rreq->nr_outstanding); + spin_lock_bh(&rreq->lock); + list_add_tail(&subreq->rreq_link, &rreq->subrequests); + subreq->prev_donated = rreq->prev_donated; + rreq->prev_donated = 0; + trace_netfs_sreq(subreq, netfs_sreq_trace_added); + spin_unlock_bh(&rreq->lock); + + source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size); + subreq->source = source; + if (source == NETFS_DOWNLOAD_FROM_SERVER) { + unsigned long long zp = umin(ictx->zero_point, rreq->i_size); + size_t len = subreq->len; + + if (subreq->start >= zp) { + subreq->source = source = NETFS_FILL_WITH_ZEROES; + goto fill_with_zeroes; + } + + if (len > zp - subreq->start) + len = zp - subreq->start; + if (len == 0) { + pr_err("ZERO-LEN READ: R=%08x[%x] l=%zx/%zx s=%llx z=%llx i=%llx", + rreq->debug_id, subreq->debug_index, + subreq->len, size, + subreq->start, ictx->zero_point, rreq->i_size); + break; + } + subreq->len = len; + + netfs_stat(&netfs_n_rh_download); + if (rreq->netfs_ops->prepare_read) { + ret = rreq->netfs_ops->prepare_read(subreq); + if (ret < 0) { + atomic_dec(&rreq->nr_outstanding); + netfs_put_subrequest(subreq, false, + netfs_sreq_trace_put_cancel); + break; + } + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + } + + slice = netfs_prepare_read_iterator(subreq); + if (slice < 0) { + atomic_dec(&rreq->nr_outstanding); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + ret = slice; + break; + } + + rreq->netfs_ops->issue_read(subreq); + goto done; + } + + fill_with_zeroes: + if (source == NETFS_FILL_WITH_ZEROES) { + subreq->source = NETFS_FILL_WITH_ZEROES; + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + netfs_stat(&netfs_n_rh_zero); + slice = netfs_prepare_read_iterator(subreq); + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + netfs_read_subreq_terminated(subreq, 0, false); + goto done; + } + + if (source == NETFS_READ_FROM_CACHE) { + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + slice = netfs_prepare_read_iterator(subreq); + netfs_read_cache_to_pagecache(rreq, subreq); + goto done; + } + + pr_err("Unexpected read source %u\n", source); + WARN_ON_ONCE(1); + break; + + done: + size -= slice; + start += slice; + cond_resched(); + } while (size > 0); + + if (atomic_dec_and_test(&rreq->nr_outstanding)) + netfs_rreq_terminated(rreq, false); + + /* Defer error return as we may need to wait for outstanding I/O. */ + cmpxchg(&rreq->error, 0, ret); +} + +/* + * Wait for the read operation to complete, successfully or otherwise. 
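+ *
+ * (Usage sketch, for illustration: a read that completes short of
+ *  rreq->len with no recorded error is reported as -EIO; a typical
+ *  caller, as in netfs_read_gaps() below, waits and then drops its
+ *  ref on the request:
+ *
+ *	ret = netfs_wait_for_read(rreq);
+ *	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ *	return ret;
+ * )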
+ */ +static int netfs_wait_for_read(struct netfs_io_request *rreq) +{ + int ret; + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); + ret = rreq->error; + if (ret == 0 && rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } + + return ret; +} + +/* + * Set up the initial folioq of buffer folios in the rolling buffer and set the + * iterator to refer to it. + */ +static int netfs_prime_buffer(struct netfs_io_request *rreq) +{ + struct folio_queue *folioq; + size_t added; + + folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); + if (!folioq) + return -ENOMEM; + netfs_stat(&netfs_n_folioq); + folioq_init(folioq); + rreq->buffer = folioq; + rreq->buffer_tail = folioq; + rreq->submitted = rreq->start; + iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0); + + added = netfs_load_buffer_from_ra(rreq, folioq); + rreq->iter.count += added; + rreq->submitted += added; + return 0; +} + +/* + * Drop the ref on each folio that we inherited from the VM readahead code. We + * still have the folio locks to pin the page until we complete the I/O. + * + * Note that we can't just release the batch in each queue struct as we use the + * occupancy count in other places. + */ +static void netfs_put_ra_refs(struct folio_queue *folioq) +{ + struct folio_batch fbatch; + + folio_batch_init(&fbatch); + while (folioq) { + for (unsigned int slot = 0; slot < folioq_count(folioq); slot++) { + struct folio *folio = folioq_folio(folioq, slot); + if (!folio) + continue; + trace_netfs_folio(folio, netfs_folio_trace_read_put); + if (!folio_batch_add(&fbatch, folio)) + folio_batch_release(&fbatch); + } + folioq = folioq->next; + } + + folio_batch_release(&fbatch); +} + /** * netfs_readahead - Helper to manage a read request * @ractl: The description of the readahead request @@ -201,22 +411,17 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in void netfs_readahead(struct readahead_control *ractl) { struct netfs_io_request *rreq; - struct netfs_inode *ctx = netfs_inode(ractl->mapping->host); + struct netfs_inode *ictx = netfs_inode(ractl->mapping->host); + unsigned long long start = readahead_pos(ractl); + size_t size = readahead_length(ractl); int ret; - _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); - - if (readahead_count(ractl) == 0) - return; - - rreq = netfs_alloc_request(ractl->mapping, ractl->file, - readahead_pos(ractl), - readahead_length(ractl), + rreq = netfs_alloc_request(ractl->mapping, ractl->file, start, size, NETFS_READAHEAD); if (IS_ERR(rreq)) return; - ret = netfs_begin_cache_read(rreq, ctx); + ret = netfs_begin_cache_read(rreq, ictx); if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) goto cleanup_free; @@ -226,18 +431,15 @@ void netfs_readahead(struct readahead_control *ractl) netfs_rreq_expand(rreq, ractl); - /* Set up the output buffer */ - iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages, - rreq->start, rreq->len); + rreq->ractl = ractl; + if (netfs_prime_buffer(rreq) < 0) + goto cleanup_free; + netfs_read_to_pagecache(rreq); - /* Drop the refs on the folios here rather than in the cache or - * filesystem. The locks will be dropped in netfs_rreq_unlock(). - */ - while (readahead_folio(ractl)) - ; + /* Release the folio refs whilst we're waiting for the I/O. 
*/ + netfs_put_ra_refs(rreq->buffer); - netfs_begin_read(rreq, false); - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, true, netfs_rreq_trace_put_return); return; cleanup_free: @@ -246,6 +448,117 @@ cleanup_free: } EXPORT_SYMBOL(netfs_readahead); +/* + * Create a rolling buffer with a single occupying folio. + */ +static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio) +{ + struct folio_queue *folioq; + + folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); + if (!folioq) + return -ENOMEM; + + netfs_stat(&netfs_n_folioq); + folioq_init(folioq); + folioq_append(folioq, folio); + BUG_ON(folioq_folio(folioq, 0) != folio); + BUG_ON(folioq_folio_order(folioq, 0) != folio_order(folio)); + rreq->buffer = folioq; + rreq->buffer_tail = folioq; + rreq->submitted = rreq->start + rreq->len; + iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, rreq->len); + rreq->ractl = (struct readahead_control *)1UL; + return 0; +} + +/* + * Read into gaps in a folio partially filled by a streaming write. + */ +static int netfs_read_gaps(struct file *file, struct folio *folio) +{ + struct netfs_io_request *rreq; + struct address_space *mapping = folio->mapping; + struct netfs_folio *finfo = netfs_folio_info(folio); + struct netfs_inode *ctx = netfs_inode(mapping->host); + struct folio *sink = NULL; + struct bio_vec *bvec; + unsigned int from = finfo->dirty_offset; + unsigned int to = from + finfo->dirty_len; + unsigned int off = 0, i = 0; + size_t flen = folio_size(folio); + size_t nr_bvec = flen / PAGE_SIZE + 2; + size_t part; + int ret; + + _enter("%lx", folio->index); + + rreq = netfs_alloc_request(mapping, file, folio_pos(folio), flen, NETFS_READ_GAPS); + if (IS_ERR(rreq)) { + ret = PTR_ERR(rreq); + goto alloc_error; + } + + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto discard; + + netfs_stat(&netfs_n_rh_read_folio); + trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_read_gaps); + + /* Fiddle the buffer so that a gap at the beginning and/or a gap at the + * end get copied to, but the middle is discarded. + */ + ret = -ENOMEM; + bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL); + if (!bvec) + goto discard; + + sink = folio_alloc(GFP_KERNEL, 0); + if (!sink) { + kfree(bvec); + goto discard; + } + + trace_netfs_folio(folio, netfs_folio_trace_read_gaps); + + rreq->direct_bv = bvec; + rreq->direct_bv_count = nr_bvec; + if (from > 0) { + bvec_set_folio(&bvec[i++], folio, from, 0); + off = from; + } + while (off < to) { + part = min_t(size_t, to - off, PAGE_SIZE); + bvec_set_folio(&bvec[i++], sink, part, 0); + off += part; + } + if (to < flen) + bvec_set_folio(&bvec[i++], folio, flen - to, to); + iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); + rreq->submitted = rreq->start + flen; + + netfs_read_to_pagecache(rreq); + + if (sink) + folio_put(sink); + + ret = netfs_wait_for_read(rreq); + if (ret == 0) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + folio_unlock(folio); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return ret < 0 ? 
ret : 0; + +discard: + netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); +alloc_error: + folio_unlock(folio); + return ret; +} + /** * netfs_read_folio - Helper to manage a read_folio request * @file: The file to read from @@ -265,9 +578,13 @@ int netfs_read_folio(struct file *file, struct folio *folio) struct address_space *mapping = folio->mapping; struct netfs_io_request *rreq; struct netfs_inode *ctx = netfs_inode(mapping->host); - struct folio *sink = NULL; int ret; + if (folio_test_dirty(folio)) { + trace_netfs_folio(folio, netfs_folio_trace_read_gaps); + return netfs_read_gaps(file, folio); + } + _enter("%lx", folio->index); rreq = netfs_alloc_request(mapping, file, @@ -286,54 +603,12 @@ int netfs_read_folio(struct file *file, struct folio *folio) trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); /* Set up the output buffer */ - if (folio_test_dirty(folio)) { - /* Handle someone trying to read from an unflushed streaming - * write. We fiddle the buffer so that a gap at the beginning - * and/or a gap at the end get copied to, but the middle is - * discarded. - */ - struct netfs_folio *finfo = netfs_folio_info(folio); - struct bio_vec *bvec; - unsigned int from = finfo->dirty_offset; - unsigned int to = from + finfo->dirty_len; - unsigned int off = 0, i = 0; - size_t flen = folio_size(folio); - size_t nr_bvec = flen / PAGE_SIZE + 2; - size_t part; - - ret = -ENOMEM; - bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL); - if (!bvec) - goto discard; - - sink = folio_alloc(GFP_KERNEL, 0); - if (!sink) - goto discard; - - trace_netfs_folio(folio, netfs_folio_trace_read_gaps); - - rreq->direct_bv = bvec; - rreq->direct_bv_count = nr_bvec; - if (from > 0) { - bvec_set_folio(&bvec[i++], folio, from, 0); - off = from; - } - while (off < to) { - part = min_t(size_t, to - off, PAGE_SIZE); - bvec_set_folio(&bvec[i++], sink, part, 0); - off += part; - } - if (to < flen) - bvec_set_folio(&bvec[i++], folio, flen - to, to); - iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); - } else { - iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, - rreq->start, rreq->len); - } + ret = netfs_create_singular_buffer(rreq, folio); + if (ret < 0) + goto discard; - ret = netfs_begin_read(rreq, true); - if (sink) - folio_put(sink); + netfs_read_to_pagecache(rreq); + ret = netfs_wait_for_read(rreq); netfs_put_request(rreq, false, netfs_rreq_trace_put_return); return ret < 0 ? ret : 0; @@ -395,7 +670,7 @@ zero_out: } /** - * netfs_write_begin - Helper to prepare for writing + * netfs_write_begin - Helper to prepare for writing [DEPRECATED] * @ctx: The netfs context * @file: The file to read from * @mapping: The mapping to read from @@ -406,13 +681,10 @@ zero_out: * * Pre-read data for a write-begin request by drawing data from the cache if * possible, or the netfs if not. Space beyond the EOF is zero-filled. - * Multiple I/O requests from different sources will get munged together. If - * necessary, the readahead window can be expanded in either direction to a - * more convenient alighment for RPC efficiency or to make storage in the cache - * feasible. + * Multiple I/O requests from different sources will get munged together. * * The calling netfs must provide a table of operations, only one of which, - * issue_op, is mandatory. + * issue_read, is mandatory. * * The check_write_begin() operation can be provided to check for and flush * conflicting writes once the folio is grabbed and locked. 
It is passed a @@ -426,6 +698,9 @@ zero_out: * inode before calling this. * * This is usable whether or not caching is enabled. + * + * Note that this should be considered deprecated and netfs_perform_write() + * used instead. */ int netfs_write_begin(struct netfs_inode *ctx, struct file *file, struct address_space *mapping, @@ -437,8 +712,6 @@ int netfs_write_begin(struct netfs_inode *ctx, pgoff_t index = pos >> PAGE_SHIFT; int ret; - DEFINE_READAHEAD(ractl, file, NULL, mapping, index); - retry: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); @@ -466,7 +739,7 @@ retry: if (!netfs_is_cache_enabled(ctx) && netfs_skip_folio_read(folio, pos, len, false)) { netfs_stat(&netfs_n_rh_write_zskip); - goto have_folio; + goto have_folio_no_wait; } rreq = netfs_alloc_request(mapping, file, @@ -486,27 +759,22 @@ retry: netfs_stat(&netfs_n_rh_write_begin); trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); - /* Expand the request to meet caching requirements and download - * preferences. - */ - ractl._nr_pages = folio_nr_pages(folio); - netfs_rreq_expand(rreq, &ractl); - /* Set up the output buffer */ - iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, - rreq->start, rreq->len); - - /* We hold the folio locks, so we can drop the references */ - folio_get(folio); - while (readahead_folio(&ractl)) - ; + ret = netfs_create_singular_buffer(rreq, folio); + if (ret < 0) + goto error_put; - ret = netfs_begin_read(rreq, true); + netfs_read_to_pagecache(rreq); + ret = netfs_wait_for_read(rreq); if (ret < 0) goto error; netfs_put_request(rreq, false, netfs_rreq_trace_put_return); have_folio: + ret = folio_wait_private_2_killable(folio); + if (ret < 0) + goto error; +have_folio_no_wait: *_folio = folio; _leave(" = 0"); return 0; @@ -557,10 +825,13 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write); /* Set up the output buffer */ - iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, - rreq->start, rreq->len); + ret = netfs_create_singular_buffer(rreq, folio); + if (ret < 0) + goto error_put; - ret = netfs_begin_read(rreq, true); + folioq_mark2(rreq->buffer, 0); + netfs_read_to_pagecache(rreq); + ret = netfs_wait_for_read(rreq); netfs_put_request(rreq, false, netfs_rreq_trace_put_return); return ret; diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 4726c315453c..d7eae597e54d 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -13,91 +13,22 @@ #include <linux/pagevec.h> #include "internal.h" -/* - * Determined write method. Adjust netfs_folio_traces if this is changed. - */ -enum netfs_how_to_modify { - NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */ - NETFS_JUST_PREFETCH, /* We have to read the folio anyway */ - NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */ - NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */ - NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */ - NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */ - NETFS_FLUSH_CONTENT, /* Flush incompatible content. 
*/ -}; - -static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) +static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) { - void *priv = folio_get_private(folio); - - if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE)) + if (netfs_group) folio_attach_private(folio, netfs_get_group(netfs_group)); - else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) - folio_detach_private(folio); } -/* - * Decide how we should modify a folio. We might be attempting to do - * write-streaming, in which case we don't want to a local RMW cycle if we can - * avoid it. If we're doing local caching or content crypto, we award that - * priority over avoiding RMW. If the file is open readably, then we also - * assume that we may want to read what we wrote. - */ -static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, - struct file *file, - struct folio *folio, - void *netfs_group, - size_t flen, - size_t offset, - size_t len, - bool maybe_trouble) +static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) { - struct netfs_folio *finfo = netfs_folio_info(folio); - struct netfs_group *group = netfs_folio_group(folio); - loff_t pos = folio_pos(folio); - - _enter(""); - - if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) - return NETFS_FLUSH_CONTENT; - - if (folio_test_uptodate(folio)) - return NETFS_FOLIO_IS_UPTODATE; - - if (pos >= ctx->zero_point) - return NETFS_MODIFY_AND_CLEAR; - - if (!maybe_trouble && offset == 0 && len >= flen) - return NETFS_WHOLE_FOLIO_MODIFY; - - if (file->f_mode & FMODE_READ) - goto no_write_streaming; - - if (netfs_is_cache_enabled(ctx)) { - /* We don't want to get a streaming write on a file that loses - * caching service temporarily because the backing store got - * culled. - */ - goto no_write_streaming; - } + void *priv = folio_get_private(folio); - if (!finfo) - return NETFS_STREAMING_WRITE; - - /* We can continue a streaming write only if it continues on from the - * previous. If it overlaps, we must flush lest we suffer a partial - * copy and disjoint dirty regions. - */ - if (offset == finfo->dirty_offset + finfo->dirty_len) - return NETFS_STREAMING_WRITE_CONT; - return NETFS_FLUSH_CONTENT; - -no_write_streaming: - if (finfo) { - netfs_stat(&netfs_n_wh_wstream_conflict); - return NETFS_FLUSH_CONTENT; + if (unlikely(priv != netfs_group)) { + if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE)) + folio_attach_private(folio, netfs_get_group(netfs_group)); + else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) + folio_detach_private(folio); } - return NETFS_JUST_PREFETCH; } /* @@ -177,14 +108,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, .range_end = iocb->ki_pos + iter->count, }; struct netfs_io_request *wreq = NULL; - struct netfs_folio *finfo; - struct folio *folio, *writethrough = NULL; - enum netfs_how_to_modify howto; - enum netfs_folio_trace trace; + struct folio *folio = NULL, *writethrough = NULL; unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? 
BDP_ASYNC : 0;
 	ssize_t written = 0, ret, ret2;
-	loff_t i_size, pos = iocb->ki_pos, from, to;
-	size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
+	loff_t i_size, pos = iocb->ki_pos;
+	size_t max_chunk = mapping_max_folio_size(mapping);
 	bool maybe_trouble = false;

 	if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
@@ -213,15 +141,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 	}

 	do {
+		struct netfs_folio *finfo;
+		struct netfs_group *group;
+		unsigned long long fpos;
 		size_t flen;
 		size_t offset;	/* Offset into pagecache folio */
 		size_t part;	/* Bytes to write to folio */
 		size_t copied;	/* Bytes copied from user */

-		ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
-		if (unlikely(ret < 0))
-			break;
-
 		offset = pos & (max_chunk - 1);
 		part = min(max_chunk - offset, iov_iter_count(iter));
@@ -247,7 +174,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 		}

 		flen = folio_size(folio);
-		offset = pos & (flen - 1);
+		fpos = folio_pos(folio);
+		offset = pos - fpos;
 		part = min_t(size_t, flen - offset, part);

 		/* Wait for writeback to complete. The writeback engine owns
@@ -265,71 +193,52 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 			goto error_folio_unlock;
 		}

-		/* See if we need to prefetch the area we're going to modify.
-		 * We need to do this before we get a lock on the folio in case
-		 * there's more than one writer competing for the same cache
-		 * block.
+		/* Decide how we should modify a folio. We might be attempting
+		 * to do write-streaming, in which case we don't want to do a
+		 * local RMW cycle if we can avoid it. If we're doing local
+		 * caching or content crypto, we award that priority over
+		 * avoiding RMW. If the file is open readably, then we also
+		 * assume that we may want to read what we wrote.
*/ - howto = netfs_how_to_modify(ctx, file, folio, netfs_group, - flen, offset, part, maybe_trouble); - _debug("howto %u", howto); - switch (howto) { - case NETFS_JUST_PREFETCH: - ret = netfs_prefetch_for_write(file, folio, offset, part); - if (ret < 0) { - _debug("prefetch = %zd", ret); - goto error_folio_unlock; - } - break; - case NETFS_FOLIO_IS_UPTODATE: - case NETFS_WHOLE_FOLIO_MODIFY: - case NETFS_STREAMING_WRITE_CONT: - break; - case NETFS_MODIFY_AND_CLEAR: - zero_user_segment(&folio->page, 0, offset); - break; - case NETFS_STREAMING_WRITE: - ret = -EIO; - if (WARN_ON(folio_get_private(folio))) - goto error_folio_unlock; - break; - case NETFS_FLUSH_CONTENT: - trace_netfs_folio(folio, netfs_flush_content); - from = folio_pos(folio); - to = from + folio_size(folio) - 1; - folio_unlock(folio); - folio_put(folio); - ret = filemap_write_and_wait_range(mapping, from, to); - if (ret < 0) - goto error_folio_unlock; - continue; - } - - if (mapping_writably_mapped(mapping)) - flush_dcache_folio(folio); - - copied = copy_folio_from_iter_atomic(folio, offset, part, iter); - - flush_dcache_folio(folio); - - /* Deal with a (partially) failed copy */ - if (copied == 0) { - ret = -EFAULT; - goto error_folio_unlock; + finfo = netfs_folio_info(folio); + group = netfs_folio_group(folio); + + if (unlikely(group != netfs_group) && + group != NETFS_FOLIO_COPY_TO_CACHE) + goto flush_content; + + if (folio_test_uptodate(folio)) { + if (mapping_writably_mapped(mapping)) + flush_dcache_folio(folio); + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + if (unlikely(copied == 0)) + goto copy_failed; + netfs_set_group(folio, netfs_group); + trace_netfs_folio(folio, netfs_folio_is_uptodate); + goto copied; } - trace = (enum netfs_folio_trace)howto; - switch (howto) { - case NETFS_FOLIO_IS_UPTODATE: - case NETFS_JUST_PREFETCH: - netfs_set_group(folio, netfs_group); - break; - case NETFS_MODIFY_AND_CLEAR: + /* If the page is above the zero-point then we assume that the + * server would just return a block of zeros or a short read if + * we try to read it. + */ + if (fpos >= ctx->zero_point) { + zero_user_segment(&folio->page, 0, offset); + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + if (unlikely(copied == 0)) + goto copy_failed; zero_user_segment(&folio->page, offset + copied, flen); - netfs_set_group(folio, netfs_group); + __netfs_set_group(folio, netfs_group); folio_mark_uptodate(folio); - break; - case NETFS_WHOLE_FOLIO_MODIFY: + trace_netfs_folio(folio, netfs_modify_and_clear); + goto copied; + } + + /* See if we can write a whole folio in one go. */ + if (!maybe_trouble && offset == 0 && part >= flen) { + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + if (unlikely(copied == 0)) + goto copy_failed; if (unlikely(copied < part)) { maybe_trouble = true; iov_iter_revert(iter, copied); @@ -337,16 +246,53 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, folio_unlock(folio); goto retry; } - netfs_set_group(folio, netfs_group); + __netfs_set_group(folio, netfs_group); folio_mark_uptodate(folio); - break; - case NETFS_STREAMING_WRITE: + trace_netfs_folio(folio, netfs_whole_folio_modify); + goto copied; + } + + /* We don't want to do a streaming write on a file that loses + * caching service temporarily because the backing store got + * culled and we don't really want to get a streaming write on + * a file that's open for reading as ->read_folio() then has to + * be able to flush it. 
+ */ + if ((file->f_mode & FMODE_READ) || + netfs_is_cache_enabled(ctx)) { + if (finfo) { + netfs_stat(&netfs_n_wh_wstream_conflict); + goto flush_content; + } + ret = netfs_prefetch_for_write(file, folio, offset, part); + if (ret < 0) { + _debug("prefetch = %zd", ret); + goto error_folio_unlock; + } + /* Note that copy-to-cache may have been set. */ + + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + if (unlikely(copied == 0)) + goto copy_failed; + netfs_set_group(folio, netfs_group); + trace_netfs_folio(folio, netfs_just_prefetch); + goto copied; + } + + if (!finfo) { + ret = -EIO; + if (WARN_ON(folio_get_private(folio))) + goto error_folio_unlock; + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + if (unlikely(copied == 0)) + goto copy_failed; if (offset == 0 && copied == flen) { - netfs_set_group(folio, netfs_group); + __netfs_set_group(folio, netfs_group); folio_mark_uptodate(folio); - trace = netfs_streaming_filled_page; - break; + trace_netfs_folio(folio, netfs_streaming_filled_page); + goto copied; } + finfo = kzalloc(sizeof(*finfo), GFP_KERNEL); if (!finfo) { iov_iter_revert(iter, copied); @@ -358,9 +304,18 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, finfo->dirty_len = copied; folio_attach_private(folio, (void *)((unsigned long)finfo | NETFS_FOLIO_INFO)); - break; - case NETFS_STREAMING_WRITE_CONT: - finfo = netfs_folio_info(folio); + trace_netfs_folio(folio, netfs_streaming_write); + goto copied; + } + + /* We can continue a streaming write only if it continues on + * from the previous. If it overlaps, we must flush lest we + * suffer a partial copy and disjoint dirty regions. + */ + if (offset == finfo->dirty_offset + finfo->dirty_len) { + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + if (unlikely(copied == 0)) + goto copy_failed; finfo->dirty_len += copied; if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) { if (finfo->netfs_group) @@ -369,17 +324,25 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, folio_detach_private(folio); folio_mark_uptodate(folio); kfree(finfo); - trace = netfs_streaming_cont_filled_page; + trace_netfs_folio(folio, netfs_streaming_cont_filled_page); + } else { + trace_netfs_folio(folio, netfs_streaming_write_cont); } - break; - default: - WARN(true, "Unexpected modify type %u ix=%lx\n", - howto, folio->index); - ret = -EIO; - goto error_folio_unlock; + goto copied; } - trace_netfs_folio(folio, trace); + /* Incompatible write; flush the folio and try again. */ + flush_content: + trace_netfs_folio(folio, netfs_flush_content); + folio_unlock(folio); + folio_put(folio); + ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1); + if (ret < 0) + goto error_folio_unlock; + continue; + + copied: + flush_dcache_folio(folio); /* Update the inode size if we moved the EOF marker */ pos += copied; @@ -401,12 +364,22 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, folio_put(folio); folio = NULL; + ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); + if (unlikely(ret < 0)) + break; + cond_resched(); } while (iov_iter_count(iter)); out: - if (likely(written) && ctx->ops->post_modify) - ctx->ops->post_modify(inode); + if (likely(written)) { + /* Set indication that ctime and mtime got updated in case + * close is deferred. 
+ */ + set_bit(NETFS_ICTX_MODIFIED_ATTR, &ctx->flags); + if (unlikely(ctx->ops->post_modify)) + ctx->ops->post_modify(inode); + } if (unlikely(wreq)) { ret2 = netfs_end_writethrough(wreq, &wbc, writethrough); @@ -421,6 +394,8 @@ out: _leave(" = %zd [%zd]", written, ret); return written ? written : ret; +copy_failed: + ret = -EFAULT; error_folio_unlock: folio_unlock(folio); folio_put(folio); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index 10a1e4da6bda..b1a66a6e6bc2 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -16,6 +16,143 @@ #include <linux/netfs.h> #include "internal.h" +static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + size_t rsize; + + rsize = umin(subreq->len, rreq->io_streams[0].sreq_max_len); + subreq->len = rsize; + + if (unlikely(rreq->io_streams[0].sreq_max_segs)) { + size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize, + rreq->io_streams[0].sreq_max_segs); + + if (limit < rsize) { + subreq->len = limit; + trace_netfs_sreq(subreq, netfs_sreq_trace_limited); + } + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + + subreq->io_iter = rreq->iter; + iov_iter_truncate(&subreq->io_iter, subreq->len); + iov_iter_advance(&rreq->iter, subreq->len); +} + +/* + * Perform a read to a buffer from the server, slicing up the region to be read + * according to the network rsize. + */ +static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) +{ + unsigned long long start = rreq->start; + ssize_t size = rreq->len; + int ret = 0; + + atomic_set(&rreq->nr_outstanding, 1); + + do { + struct netfs_io_subrequest *subreq; + ssize_t slice; + + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) { + ret = -ENOMEM; + break; + } + + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->start = start; + subreq->len = size; + + atomic_inc(&rreq->nr_outstanding); + spin_lock_bh(&rreq->lock); + list_add_tail(&subreq->rreq_link, &rreq->subrequests); + subreq->prev_donated = rreq->prev_donated; + rreq->prev_donated = 0; + trace_netfs_sreq(subreq, netfs_sreq_trace_added); + spin_unlock_bh(&rreq->lock); + + netfs_stat(&netfs_n_rh_download); + if (rreq->netfs_ops->prepare_read) { + ret = rreq->netfs_ops->prepare_read(subreq); + if (ret < 0) { + atomic_dec(&rreq->nr_outstanding); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + break; + } + } + + netfs_prepare_dio_read_iterator(subreq); + slice = subreq->len; + rreq->netfs_ops->issue_read(subreq); + + size -= slice; + start += slice; + rreq->submitted += slice; + + if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && + test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) + break; + cond_resched(); + } while (size > 0); + + if (atomic_dec_and_test(&rreq->nr_outstanding)) + netfs_rreq_terminated(rreq, false); + return ret; +} + +/* + * Perform a read to an application buffer, bypassing the pagecache and the + * local disk cache. 
+ */ +static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) +{ + int ret; + + _enter("R=%x %llx-%llx", + rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); + + if (rreq->len == 0) { + pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); + return -EIO; + } + + // TODO: Use bounce buffer if requested + + inode_dio_begin(rreq->inode); + + ret = netfs_dispatch_unbuffered_reads(rreq); + + if (!rreq->submitted) { + netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); + inode_dio_end(rreq->inode); + ret = 0; + goto out; + } + + if (sync) { + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + + ret = rreq->error; + if (ret == 0 && rreq->submitted < rreq->len && + rreq->origin != NETFS_DIO_READ) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } + } else { + ret = -EIOCBQUEUED; + } + +out: + _leave(" = %d", ret); + return ret; +} + /** * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read * @iocb: The I/O control descriptor describing the read @@ -31,7 +168,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i struct netfs_io_request *rreq; ssize_t ret; size_t orig_count = iov_iter_count(iter); - bool async = !is_sync_kiocb(iocb); + bool sync = is_sync_kiocb(iocb); _enter(""); @@ -78,13 +215,13 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i // TODO: Set up bounce buffer if needed - if (async) + if (!sync) rreq->iocb = iocb; - ret = netfs_begin_read(rreq, is_sync_kiocb(iocb)); + ret = netfs_unbuffered_read(rreq, sync); if (ret < 0) goto out; /* May be -EIOCBQUEUED */ - if (!async) { + if (sync) { // TODO: Copy from bounce buffer iocb->ki_pos += rreq->transferred; ret = rreq->transferred; @@ -94,8 +231,6 @@ out: netfs_put_request(rreq, false, netfs_rreq_trace_put_return); if (ret > 0) orig_count -= ret; - if (ret != -EIOCBQUEUED) - iov_iter_revert(iter, orig_count - iov_iter_count(iter)); return ret; } EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked); diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c index bce2492186d0..d4d4b3a8b106 100644 --- a/fs/netfs/fscache_cookie.c +++ b/fs/netfs/fscache_cookie.c @@ -741,6 +741,10 @@ again_locked: spin_lock(&cookie->lock); } if (test_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) { + if (atomic_read(&cookie->n_accesses) != 0) + /* still being accessed: postpone it */ + break; + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_LRU_DISCARDING); wake = true; diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c index 42e98bb523e3..49849005eb7c 100644 --- a/fs/netfs/fscache_main.c +++ b/fs/netfs/fscache_main.c @@ -103,6 +103,7 @@ void __exit fscache_exit(void) kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); + timer_shutdown_sync(&fscache_cookie_lru_timer); destroy_workqueue(fscache_wq); pr_notice("FS-Cache unloaded\n"); } diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 7773f3d855a9..c9f0ed24cb7b 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/seq_file.h> +#include <linux/folio_queue.h> #include <linux/netfs.h> #include <linux/fscache.h> #include <linux/fscache-cache.h> @@ -22,16 +23,10 @@ /* * buffered_read.c */ -void netfs_rreq_unlock_folios(struct netfs_io_request *rreq); int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len); /* - * io.c - */ -int 
netfs_begin_read(struct netfs_io_request *rreq, bool sync); - -/* * main.c */ extern unsigned int netfs_debug; @@ -63,6 +58,11 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} /* * misc.c */ +int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, + bool needs_put); +struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq); +void netfs_clear_buffer(struct netfs_io_request *rreq); +void netfs_reset_iter(struct netfs_io_subrequest *subreq); /* * objects.c @@ -84,6 +84,28 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, } /* + * read_collect.c + */ +void netfs_read_termination_worker(struct work_struct *work); +void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async); + +/* + * read_pgpriv2.c + */ +void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, + struct netfs_io_request *rreq, + struct folio_queue *folioq, + int slot); +void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq); +bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq); + +/* + * read_retry.c + */ +void netfs_retry_reads(struct netfs_io_request *rreq); +void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq); + +/* * stats.c */ #ifdef CONFIG_NETFS_STATS @@ -110,6 +132,7 @@ extern atomic_t netfs_n_wh_buffered_write; extern atomic_t netfs_n_wh_writethrough; extern atomic_t netfs_n_wh_dio_write; extern atomic_t netfs_n_wh_writepages; +extern atomic_t netfs_n_wh_copy_to_cache; extern atomic_t netfs_n_wh_wstream_conflict; extern atomic_t netfs_n_wh_upload; extern atomic_t netfs_n_wh_upload_done; @@ -117,6 +140,9 @@ extern atomic_t netfs_n_wh_upload_failed; extern atomic_t netfs_n_wh_write; extern atomic_t netfs_n_wh_write_done; extern atomic_t netfs_n_wh_write_failed; +extern atomic_t netfs_n_wb_lock_skip; +extern atomic_t netfs_n_wb_lock_wait; +extern atomic_t netfs_n_folioq; int netfs_stats_show(struct seq_file *m, void *v); @@ -150,7 +176,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, loff_t start, enum netfs_io_origin origin); void netfs_reissue_write(struct netfs_io_stream *stream, - struct netfs_io_subrequest *subreq); + struct netfs_io_subrequest *subreq, + struct iov_iter *source); +void netfs_issue_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream); int netfs_advance_write(struct netfs_io_request *wreq, struct netfs_io_stream *stream, loff_t start, size_t len, bool to_eof); diff --git a/fs/netfs/io.c b/fs/netfs/io.c deleted file mode 100644 index c93851b98368..000000000000 --- a/fs/netfs/io.c +++ /dev/null @@ -1,647 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Network filesystem high-level read support. - * - * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/module.h> -#include <linux/export.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/slab.h> -#include <linux/uio.h> -#include <linux/sched/mm.h> -#include <linux/task_io_accounting_ops.h> -#include "internal.h" - -/* - * Clear the unread part of an I/O request. 
- */ -static void netfs_clear_unread(struct netfs_io_subrequest *subreq) -{ - iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter); -} - -static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_io_subrequest *subreq = priv; - - netfs_subreq_terminated(subreq, transferred_or_error, was_async); -} - -/* - * Issue a read against the cache. - * - Eats the caller's ref on subreq. - */ -static void netfs_read_from_cache(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq, - enum netfs_read_from_hole read_hole) -{ - struct netfs_cache_resources *cres = &rreq->cache_resources; - - netfs_stat(&netfs_n_rh_read); - cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole, - netfs_cache_read_terminated, subreq); -} - -/* - * Fill a subrequest region with zeroes. - */ -static void netfs_fill_with_zeroes(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) -{ - netfs_stat(&netfs_n_rh_zero); - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - netfs_subreq_terminated(subreq, 0, false); -} - -/* - * Ask the netfs to issue a read request to the server for us. - * - * The netfs is expected to read from subreq->pos + subreq->transferred to - * subreq->pos + subreq->len - 1. It may not backtrack and write data into the - * buffer prior to the transferred point as it might clobber dirty data - * obtained from the cache. - * - * Alternatively, the netfs is allowed to indicate one of two things: - * - * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and - * make progress. - * - * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be - * cleared. - */ -static void netfs_read_from_server(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) -{ - netfs_stat(&netfs_n_rh_download); - - if (rreq->origin != NETFS_DIO_READ && - iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) - pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n", - rreq->debug_id, subreq->debug_index, - iov_iter_count(&subreq->io_iter), subreq->len, - subreq->transferred, subreq->flags); - rreq->netfs_ops->issue_read(subreq); -} - -/* - * Release those waiting. - */ -static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async) -{ - trace_netfs_rreq(rreq, netfs_rreq_trace_done); - netfs_clear_subrequests(rreq, was_async); - netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete); -} - -/* - * Handle a short read. - */ -static void netfs_rreq_short_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) -{ - __clear_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); - __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags); - - netfs_stat(&netfs_n_rh_short_read); - trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short); - - netfs_get_subrequest(subreq, netfs_sreq_trace_get_short_read); - atomic_inc(&rreq->nr_outstanding); - if (subreq->source == NETFS_READ_FROM_CACHE) - netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR); - else - netfs_read_from_server(rreq, subreq); -} - -/* - * Reset the subrequest iterator prior to resubmission. 
- */ -static void netfs_reset_subreq_iter(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) -{ - size_t remaining = subreq->len - subreq->transferred; - size_t count = iov_iter_count(&subreq->io_iter); - - if (count == remaining) - return; - - _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", - rreq->debug_id, subreq->debug_index, - iov_iter_count(&subreq->io_iter), subreq->transferred, - subreq->len, rreq->i_size, - subreq->io_iter.iter_type); - - if (count < remaining) - iov_iter_revert(&subreq->io_iter, remaining - count); - else - iov_iter_advance(&subreq->io_iter, count - remaining); -} - -/* - * Resubmit any short or failed operations. Returns true if we got the rreq - * ref back. - */ -static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - - WARN_ON(in_interrupt()); - - trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); - - /* We don't want terminating submissions trying to wake us up whilst - * we're still going through the list. - */ - atomic_inc(&rreq->nr_outstanding); - - __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - if (subreq->error) { - if (subreq->source != NETFS_READ_FROM_CACHE) - break; - subreq->source = NETFS_DOWNLOAD_FROM_SERVER; - subreq->error = 0; - netfs_stat(&netfs_n_rh_download_instead); - trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - atomic_inc(&rreq->nr_outstanding); - netfs_reset_subreq_iter(rreq, subreq); - netfs_read_from_server(rreq, subreq); - } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { - netfs_rreq_short_read(rreq, subreq); - } - } - - /* If we decrement nr_outstanding to 0, the usage ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_outstanding)) - return true; - - wake_up_var(&rreq->nr_outstanding); - return false; -} - -/* - * Check to see if the data read is still valid. - */ -static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - - if (!rreq->netfs_ops->is_still_valid || - rreq->netfs_ops->is_still_valid(rreq)) - return; - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - if (subreq->source == NETFS_READ_FROM_CACHE) { - subreq->error = -ESTALE; - __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - } - } -} - -/* - * Determine how much we can admit to having read from a DIO read. - */ -static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - unsigned int i; - size_t transferred = 0; - - for (i = 0; i < rreq->direct_bv_count; i++) { - flush_dcache_page(rreq->direct_bv[i].bv_page); - // TODO: cifs marks pages in the destination buffer - // dirty under some circumstances after a read. Do we - // need to do that too? - set_page_dirty(rreq->direct_bv[i].bv_page); - } - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - if (subreq->error || subreq->transferred == 0) - break; - transferred += subreq->transferred; - if (subreq->transferred < subreq->len) - break; - } - - for (i = 0; i < rreq->direct_bv_count; i++) - flush_dcache_page(rreq->direct_bv[i].bv_page); - - rreq->transferred = transferred; - task_io_account_read(transferred); - - if (rreq->iocb) { - rreq->iocb->ki_pos += transferred; - if (rreq->iocb->ki_complete) - rreq->iocb->ki_complete( - rreq->iocb, rreq->error ? 
rreq->error : transferred); - } - if (rreq->netfs_ops->done) - rreq->netfs_ops->done(rreq); - inode_dio_end(rreq->inode); -} - -/* - * Assess the state of a read request and decide what to do next. - * - * Note that we could be in an ordinary kernel thread, on a workqueue or in - * softirq context at this point. We inherit a ref from the caller. - */ -static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async) -{ - trace_netfs_rreq(rreq, netfs_rreq_trace_assess); - -again: - netfs_rreq_is_still_valid(rreq); - - if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) && - test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) { - if (netfs_rreq_perform_resubmissions(rreq)) - goto again; - return; - } - - if (rreq->origin != NETFS_DIO_READ) - netfs_rreq_unlock_folios(rreq); - else - netfs_rreq_assess_dio(rreq); - - trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); - - netfs_rreq_completed(rreq, was_async); -} - -static void netfs_rreq_work(struct work_struct *work) -{ - struct netfs_io_request *rreq = - container_of(work, struct netfs_io_request, work); - netfs_rreq_assess(rreq, false); -} - -/* - * Handle the completion of all outstanding I/O operations on a read request. - * We inherit a ref from the caller. - */ -static void netfs_rreq_terminated(struct netfs_io_request *rreq, - bool was_async) -{ - if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) && - was_async) { - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); - } else { - netfs_rreq_assess(rreq, was_async); - } -} - -/** - * netfs_subreq_terminated - Note the termination of an I/O operation. - * @subreq: The I/O request that has terminated. - * @transferred_or_error: The amount of data transferred or an error code. - * @was_async: The termination was asynchronous - * - * This tells the read helper that a contributory I/O operation has terminated, - * one way or another, and that it should integrate the results. - * - * The caller indicates in @transferred_or_error the outcome of the operation, - * supplying a positive value to indicate the number of bytes transferred, 0 to - * indicate a failure to transfer anything that should be retried or a negative - * error code. The helper will look after reissuing I/O operations as - * appropriate and writing downloaded data to the cache. - * - * If @was_async is true, the caller might be running in softirq or interrupt - * context and we can't sleep. 
- */ -void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, - ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_io_request *rreq = subreq->rreq; - int u; - - _enter("R=%x[%x]{%llx,%lx},%zd", - rreq->debug_id, subreq->debug_index, - subreq->start, subreq->flags, transferred_or_error); - - switch (subreq->source) { - case NETFS_READ_FROM_CACHE: - netfs_stat(&netfs_n_rh_read_done); - break; - case NETFS_DOWNLOAD_FROM_SERVER: - netfs_stat(&netfs_n_rh_download_done); - break; - default: - break; - } - - if (IS_ERR_VALUE(transferred_or_error)) { - subreq->error = transferred_or_error; - trace_netfs_failure(rreq, subreq, transferred_or_error, - netfs_fail_read); - goto failed; - } - - if (WARN(transferred_or_error > subreq->len - subreq->transferred, - "Subreq overread: R%x[%x] %zd > %zu - %zu", - rreq->debug_id, subreq->debug_index, - transferred_or_error, subreq->len, subreq->transferred)) - transferred_or_error = subreq->len - subreq->transferred; - - subreq->error = 0; - subreq->transferred += transferred_or_error; - if (subreq->transferred < subreq->len) - goto incomplete; - -complete: - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) - set_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags); - -out: - trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - - /* If we decrement nr_outstanding to 0, the ref belongs to us. */ - u = atomic_dec_return(&rreq->nr_outstanding); - if (u == 0) - netfs_rreq_terminated(rreq, was_async); - else if (u == 1) - wake_up_var(&rreq->nr_outstanding); - - netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); - return; - -incomplete: - if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) { - netfs_clear_unread(subreq); - subreq->transferred = subreq->len; - goto complete; - } - - if (transferred_or_error == 0) { - if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { - subreq->error = -ENODATA; - goto failed; - } - } else { - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - } - - __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); - set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - goto out; - -failed: - if (subreq->source == NETFS_READ_FROM_CACHE) { - netfs_stat(&netfs_n_rh_read_failed); - set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - } else { - netfs_stat(&netfs_n_rh_download_failed); - set_bit(NETFS_RREQ_FAILED, &rreq->flags); - rreq->error = subreq->error; - } - goto out; -} -EXPORT_SYMBOL(netfs_subreq_terminated); - -static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest *subreq, - loff_t i_size) -{ - struct netfs_io_request *rreq = subreq->rreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; - - if (cres->ops) - return cres->ops->prepare_read(subreq, i_size); - if (subreq->start >= rreq->i_size) - return NETFS_FILL_WITH_ZEROES; - return NETFS_DOWNLOAD_FROM_SERVER; -} - -/* - * Work out what sort of subrequest the next one will be. 
- */ -static enum netfs_io_source -netfs_rreq_prepare_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq, - struct iov_iter *io_iter) -{ - enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; - struct netfs_inode *ictx = netfs_inode(rreq->inode); - size_t lsize; - - _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); - - if (rreq->origin != NETFS_DIO_READ) { - source = netfs_cache_prepare_read(subreq, rreq->i_size); - if (source == NETFS_INVALID_READ) - goto out; - } - - if (source == NETFS_DOWNLOAD_FROM_SERVER) { - /* Call out to the netfs to let it shrink the request to fit - * its own I/O sizes and boundaries. If it shinks it here, it - * will be called again to make simultaneous calls; if it wants - * to make serial calls, it can indicate a short read and then - * we will call it again. - */ - if (rreq->origin != NETFS_DIO_READ) { - if (subreq->start >= ictx->zero_point) { - source = NETFS_FILL_WITH_ZEROES; - goto set; - } - if (subreq->len > ictx->zero_point - subreq->start) - subreq->len = ictx->zero_point - subreq->start; - } - if (subreq->len > rreq->i_size - subreq->start) - subreq->len = rreq->i_size - subreq->start; - if (rreq->rsize && subreq->len > rreq->rsize) - subreq->len = rreq->rsize; - - if (rreq->netfs_ops->clamp_length && - !rreq->netfs_ops->clamp_length(subreq)) { - source = NETFS_INVALID_READ; - goto out; - } - - if (subreq->max_nr_segs) { - lsize = netfs_limit_iter(io_iter, 0, subreq->len, - subreq->max_nr_segs); - if (subreq->len > lsize) { - subreq->len = lsize; - trace_netfs_sreq(subreq, netfs_sreq_trace_limited); - } - } - } - -set: - if (subreq->len > rreq->len) - pr_warn("R=%08x[%u] SREQ>RREQ %zx > %llx\n", - rreq->debug_id, subreq->debug_index, - subreq->len, rreq->len); - - if (WARN_ON(subreq->len == 0)) { - source = NETFS_INVALID_READ; - goto out; - } - - subreq->source = source; - trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); - - subreq->io_iter = *io_iter; - iov_iter_truncate(&subreq->io_iter, subreq->len); - iov_iter_advance(io_iter, subreq->len); -out: - subreq->source = source; - trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); - return source; -} - -/* - * Slice off a piece of a read request and submit an I/O request for it. - */ -static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, - struct iov_iter *io_iter) -{ - struct netfs_io_subrequest *subreq; - enum netfs_io_source source; - - subreq = netfs_alloc_subrequest(rreq); - if (!subreq) - return false; - - subreq->start = rreq->start + rreq->submitted; - subreq->len = io_iter->count; - - _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted); - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - - /* Call out to the cache to find out what it can do with the remaining - * subset. It tells us in subreq->flags what it decided should be done - * and adjusts subreq->len down if the subset crosses a cache boundary. - * - * Then when we hand the subset, it can choose to take a subset of that - * (the starts must coincide), in which case, we go around the loop - * again and ask it to download the next piece. 
- */ - source = netfs_rreq_prepare_read(rreq, subreq, io_iter); - if (source == NETFS_INVALID_READ) - goto subreq_failed; - - atomic_inc(&rreq->nr_outstanding); - - rreq->submitted += subreq->len; - - trace_netfs_sreq(subreq, netfs_sreq_trace_submit); - switch (source) { - case NETFS_FILL_WITH_ZEROES: - netfs_fill_with_zeroes(rreq, subreq); - break; - case NETFS_DOWNLOAD_FROM_SERVER: - netfs_read_from_server(rreq, subreq); - break; - case NETFS_READ_FROM_CACHE: - netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE); - break; - default: - BUG(); - } - - return true; - -subreq_failed: - rreq->error = subreq->error; - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_failed); - return false; -} - -/* - * Begin the process of reading in a chunk of data, where that data may be - * stitched together from multiple sources, including multiple servers and the - * local cache. - */ -int netfs_begin_read(struct netfs_io_request *rreq, bool sync) -{ - struct iov_iter io_iter; - int ret; - - _enter("R=%x %llx-%llx", - rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); - - if (rreq->len == 0) { - pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); - return -EIO; - } - - if (rreq->origin == NETFS_DIO_READ) - inode_dio_begin(rreq->inode); - - // TODO: Use bounce buffer if requested - rreq->io_iter = rreq->iter; - - INIT_WORK(&rreq->work, netfs_rreq_work); - - /* Chop the read into slices according to what the cache and the netfs - * want and submit each one. - */ - netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding); - atomic_set(&rreq->nr_outstanding, 1); - io_iter = rreq->io_iter; - do { - _debug("submit %llx + %llx >= %llx", - rreq->start, rreq->submitted, rreq->i_size); - if (rreq->origin == NETFS_DIO_READ && - rreq->start + rreq->submitted >= rreq->i_size) - break; - if (!netfs_rreq_submit_slice(rreq, &io_iter)) - break; - if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && - test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) - break; - - } while (rreq->submitted < rreq->len); - - if (!rreq->submitted) { - netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); - if (rreq->origin == NETFS_DIO_READ) - inode_dio_end(rreq->inode); - ret = 0; - goto out; - } - - if (sync) { - /* Keep nr_outstanding incremented so that the ref always - * belongs to us, and the service code isn't punted off to a - * random thread pool to process. Note that this might start - * further work, such as writing to the cache. - */ - wait_var_event(&rreq->nr_outstanding, - atomic_read(&rreq->nr_outstanding) == 1); - if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_assess(rreq, false); - - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); - wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - - ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len && - rreq->origin != NETFS_DIO_READ) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); - ret = -EIO; - } - } else { - /* If we decrement nr_outstanding to 0, the ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_assess(rreq, false); - ret = -EIOCBQUEUED; - } - -out: - return ret; -} diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c index b781bbbf1d8d..72a435e5fc6d 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -188,9 +188,59 @@ static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offse return min(span, max_size); } +/* + * Select the span of a folio queue iterator we're going to use. 
Limit it by + * both maximum size and maximum number of segments. Returns the size of the + * span in bytes. + */ +static size_t netfs_limit_folioq(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + const struct folio_queue *folioq = iter->folioq; + unsigned int nsegs = 0; + unsigned int slot = iter->folioq_slot; + size_t span = 0, n = iter->count; + + if (WARN_ON(!iov_iter_is_folioq(iter)) || + WARN_ON(start_offset > n) || + n == 0) + return 0; + max_size = umin(max_size, n - start_offset); + + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + slot = 0; + } + + start_offset += iter->iov_offset; + do { + size_t flen = folioq_folio_size(folioq, slot); + + if (start_offset < flen) { + span += flen - start_offset; + nsegs++; + start_offset = 0; + } else { + start_offset -= flen; + } + if (span >= max_size || nsegs >= max_segs) + break; + + slot++; + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + slot = 0; + } + } while (folioq); + + return umin(span, max_size); +} + size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, size_t max_size, size_t max_segs) { + if (iov_iter_is_folioq(iter)) + return netfs_limit_folioq(iter, start_offset, max_size, max_segs); if (iov_iter_is_bvec(iter)) return netfs_limit_bvec(iter, start_offset, max_size, max_segs); if (iov_iter_is_xarray(iter)) diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c index 75dc52a49b3a..21eab56ee2f9 100644 --- a/fs/netfs/locking.c +++ b/fs/netfs/locking.c @@ -19,25 +19,13 @@ * Must be called under a lock that serializes taking new references * to i_dio_count, usually by inode->i_mutex. */ -static int inode_dio_wait_interruptible(struct inode *inode) +static int netfs_inode_dio_wait_interruptible(struct inode *inode) { - if (!atomic_read(&inode->i_dio_count)) + if (inode_dio_finished(inode)) return 0; - wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); - DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); - - for (;;) { - prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE); - if (!atomic_read(&inode->i_dio_count)) - break; - if (signal_pending(current)) - break; - schedule(); - } - finish_wait(wq, &q.wq_entry); - - return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0; + inode_dio_wait_interruptible(inode); + return !inode_dio_finished(inode) ? 
-ERESTARTSYS : 0; } /* Call with exclusively locked inode->i_rwsem */ @@ -46,7 +34,7 @@ static int netfs_block_o_direct(struct netfs_inode *ictx) if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) return 0; clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags); - return inode_dio_wait_interruptible(&ictx->inode); + return netfs_inode_dio_wait_interruptible(&ictx->inode); } /** diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 5f0f438e5d21..6c7be1377ee0 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -36,13 +36,14 @@ DEFINE_SPINLOCK(netfs_proc_lock); static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_READAHEAD] = "RA", [NETFS_READPAGE] = "RP", + [NETFS_READ_GAPS] = "RG", [NETFS_READ_FOR_WRITE] = "RW", - [NETFS_COPY_TO_CACHE] = "CC", + [NETFS_DIO_READ] = "DR", [NETFS_WRITEBACK] = "WB", [NETFS_WRITETHROUGH] = "WT", [NETFS_UNBUFFERED_WRITE] = "UW", - [NETFS_DIO_READ] = "DR", [NETFS_DIO_WRITE] = "DW", + [NETFS_PGPRIV2_COPY_TO_CACHE] = "2C", }; /* @@ -62,7 +63,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v) rreq = list_entry(v, struct netfs_io_request, proc_link); seq_printf(m, - "%08x %s %3d %2lx %4d %3d @%04llx %llx/%llx", + "%08x %s %3d %2lx %4ld %3d @%04llx %llx/%llx", rreq->debug_id, netfs_origins[rreq->origin], refcount_read(&rreq->ref), @@ -142,7 +143,7 @@ static int __init netfs_init(void) error_fscache: error_procfile: - remove_proc_entry("fs/netfs", NULL); + remove_proc_subtree("fs/netfs", NULL); error_proc: mempool_exit(&netfs_subrequest_pool); error_subreqpool: @@ -159,7 +160,7 @@ fs_initcall(netfs_init); static void __exit netfs_exit(void) { fscache_exit(); - remove_proc_entry("fs/netfs", NULL); + remove_proc_subtree("fs/netfs", NULL); mempool_exit(&netfs_subrequest_pool); kmem_cache_destroy(netfs_subrequest_slab); mempool_exit(&netfs_request_pool); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 83e644bd518f..0ad0982ce0e2 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -8,6 +8,100 @@ #include <linux/swap.h> #include "internal.h" +/* + * Append a folio to the rolling queue. + */ +int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, + bool needs_put) +{ + struct folio_queue *tail = rreq->buffer_tail; + unsigned int slot, order = folio_order(folio); + + if (WARN_ON_ONCE(!rreq->buffer && tail) || + WARN_ON_ONCE(rreq->buffer && !tail)) + return -EIO; + + if (!tail || folioq_full(tail)) { + tail = kmalloc(sizeof(*tail), GFP_NOFS); + if (!tail) + return -ENOMEM; + netfs_stat(&netfs_n_folioq); + folioq_init(tail); + tail->prev = rreq->buffer_tail; + if (tail->prev) + tail->prev->next = tail; + rreq->buffer_tail = tail; + if (!rreq->buffer) { + rreq->buffer = tail; + iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0); + } + rreq->buffer_tail_slot = 0; + } + + rreq->io_iter.count += PAGE_SIZE << order; + + slot = folioq_append(tail, folio); + /* Store the counter after setting the slot. */ + smp_store_release(&rreq->buffer_tail_slot, slot); + return 0; +} + +/* + * Delete the head of a rolling queue. + */ +struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq) +{ + struct folio_queue *head = wreq->buffer, *next = head->next; + + if (next) + next->prev = NULL; + netfs_stat_d(&netfs_n_folioq); + kfree(head); + wreq->buffer = next; + return next; +} + +/* + * Clear out a rolling queue. 
+ */ +void netfs_clear_buffer(struct netfs_io_request *rreq) +{ + struct folio_queue *p; + + while ((p = rreq->buffer)) { + rreq->buffer = p->next; + for (int slot = 0; slot < folioq_nr_slots(p); slot++) { + struct folio *folio = folioq_folio(p, slot); + if (!folio) + continue; + if (folioq_is_marked(p, slot)) { + trace_netfs_folio(folio, netfs_folio_trace_put); + folio_put(folio); + } + } + netfs_stat_d(&netfs_n_folioq); + kfree(p); + } +} + +/* + * Reset the subrequest iterator to refer just to the region remaining to be + * read. The iterator may or may not have been advanced by socket ops or + * extraction ops to an extent that may or may not match the amount actually + * read. + */ +void netfs_reset_iter(struct netfs_io_subrequest *subreq) +{ + struct iov_iter *io_iter = &subreq->io_iter; + size_t remain = subreq->len - subreq->transferred; + + if (io_iter->count > remain) + iov_iter_advance(io_iter, io_iter->count - remain); + else if (io_iter->count < remain) + iov_iter_revert(io_iter, remain - io_iter->count); + iov_iter_truncate(&subreq->io_iter, remain); +} + /** * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback * @mapping: The mapping the folio belongs to. @@ -97,10 +191,22 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct netfs_folio *finfo; + struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); size_t flen = folio_size(folio); _enter("{%lx},%zx,%zx", folio->index, offset, length); + if (offset == 0 && length == flen) { + unsigned long long i_size = i_size_read(&ctx->inode); + unsigned long long fpos = folio_pos(folio), end; + + end = umin(fpos + flen, i_size); + if (fpos < i_size && end > ctx->zero_point) + ctx->zero_point = end; + } + + folio_wait_private_2(folio); /* [DEPRECATED] */ + if (!folio_test_private(folio)) return; @@ -113,18 +219,34 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) /* We have a partially uptodate page from a streaming write. */ unsigned int fstart = finfo->dirty_offset; unsigned int fend = fstart + finfo->dirty_len; - unsigned int end = offset + length; + unsigned int iend = offset + length; if (offset >= fend) return; - if (end <= fstart) + if (iend <= fstart) return; - if (offset <= fstart && end >= fend) - goto erase_completely; - if (offset <= fstart && end > fstart) - goto reduce_len; - if (offset > fstart && end >= fend) - goto move_start; + + /* The invalidation region overlaps the data. If the region + * covers the start of the data, we either move along the start + * or just erase the data entirely. + */ + if (offset <= fstart) { + if (iend >= fend) + goto erase_completely; + /* Move the start of the data. */ + finfo->dirty_len = fend - iend; + finfo->dirty_offset = offset; + return; + } + + /* Reduce the length of the data if the invalidation region + * covers the tail part. + */ + if (iend >= fend) { + finfo->dirty_len = offset - fstart; + return; + } + /* A partial write was split. The caller has already zeroed * it, so just absorb the hole. 
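+	 * For example, with dirty data at [fstart, fend) and an invalidation
+	 * of [offset, iend) strictly inside it, the streaming record is left
+	 * covering the (now zeroed) hole rather than being split in two.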
*/ @@ -137,12 +259,6 @@ erase_completely: folio_clear_uptodate(folio); kfree(finfo); return; -reduce_len: - finfo->dirty_len = offset + length - finfo->dirty_offset; - return; -move_start: - finfo->dirty_len -= offset - finfo->dirty_offset; - finfo->dirty_offset = offset; } EXPORT_SYMBOL(netfs_invalidate_folio); @@ -159,12 +275,20 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); unsigned long long end; - end = folio_pos(folio) + folio_size(folio); + if (folio_test_dirty(folio)) + return false; + + end = umin(folio_pos(folio) + folio_size(folio), i_size_read(&ctx->inode)); if (end > ctx->zero_point) ctx->zero_point = end; if (folio_test_private(folio)) return false; + if (unlikely(folio_test_private_2(folio))) { /* [DEPRECATED] */ + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + folio_wait_private_2(folio); + } fscache_note_page_release(netfs_i_cookie(ctx)); return true; } diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index f4a642727479..31e388ec6e48 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -24,10 +24,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct netfs_io_request *rreq; mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool; struct kmem_cache *cache = mempool->pool_data; - bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE || - origin == NETFS_DIO_READ || - origin == NETFS_DIO_WRITE); - bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx); int ret; for (;;) { @@ -40,7 +36,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, memset(rreq, 0, kmem_cache_size(cache)); rreq->start = start; rreq->len = len; - rreq->upper_len = len; rreq->origin = origin; rreq->netfs_ops = ctx->ops; rreq->mapping = mapping; @@ -48,20 +43,24 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, rreq->i_size = i_size_read(inode); rreq->debug_id = atomic_inc_return(&debug_ids); rreq->wsize = INT_MAX; + rreq->io_streams[0].sreq_max_len = ULONG_MAX; + rreq->io_streams[0].sreq_max_segs = 0; spin_lock_init(&rreq->lock); INIT_LIST_HEAD(&rreq->io_streams[0].subrequests); INIT_LIST_HEAD(&rreq->io_streams[1].subrequests); INIT_LIST_HEAD(&rreq->subrequests); - INIT_WORK(&rreq->work, NULL); refcount_set(&rreq->ref, 1); + if (origin == NETFS_READAHEAD || + origin == NETFS_READPAGE || + origin == NETFS_READ_GAPS || + origin == NETFS_READ_FOR_WRITE || + origin == NETFS_DIO_READ) + INIT_WORK(&rreq->work, netfs_read_termination_worker); + else + INIT_WORK(&rreq->work, netfs_write_collection_worker); + __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - if (cached) { - __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) - /* Filesystem uses deprecated PG_private_2 marking. 
*/ - __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); - } if (file && file->f_flags & O_NONBLOCK) __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { @@ -144,6 +143,7 @@ static void netfs_free_request(struct work_struct *work) } kvfree(rreq->direct_bv); } + netfs_clear_buffer(rreq); if (atomic_dec_and_test(&ictx->io_count)) wake_up_var(&ictx->io_count); @@ -165,7 +165,7 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async, if (was_async) { rreq->work.func = netfs_free_request; if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); + WARN_ON(1); } else { netfs_free_request(&rreq->work); } diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c new file mode 100644 index 000000000000..b18c65ba5580 --- /dev/null +++ b/fs/netfs/read_collect.c @@ -0,0 +1,544 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem read subrequest result collection, assessment and + * retrying. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/task_io_accounting_ops.h> +#include "internal.h" + +/* + * Clear the unread part of an I/O request. + */ +static void netfs_clear_unread(struct netfs_io_subrequest *subreq) +{ + netfs_reset_iter(subreq); + WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter)); + iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter); + if (subreq->start + subreq->transferred >= subreq->rreq->i_size) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); +} + +/* + * Flush, mark and unlock a folio that's now completely read. If we want to + * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it + * dirty and let writeback handle it. + */ +static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, + struct netfs_io_request *rreq, + struct folio_queue *folioq, + int slot) +{ + struct netfs_folio *finfo; + struct folio *folio = folioq_folio(folioq, slot); + + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + + if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { + finfo = netfs_folio_info(folio); + if (finfo) { + trace_netfs_folio(folio, netfs_folio_trace_filled_gaps); + if (finfo->netfs_group) + folio_change_private(folio, finfo->netfs_group); + else + folio_detach_private(folio); + kfree(finfo); + } + + if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE); + folio_mark_dirty(folio); + } + } else { + trace_netfs_folio(folio, netfs_folio_trace_read_done); + } + } else { + // TODO: Use of PG_private_2 is deprecated. + if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) + netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot); + } + + if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { + if (folio->index == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { + _debug("no unlock"); + } else { + trace_netfs_folio(folio, netfs_folio_trace_read_unlock); + folio_unlock(folio); + } + } +} + +/* + * Unlock any folios that are now completely read. Returns true if the + * subrequest is removed from the list. 
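+ * Folios are consumed whole: bytes short of a folio boundary are settled
+ * by donating them to the preceding or following subrequest.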
+ */ +static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async) +{ + struct netfs_io_subrequest *prev, *next; + struct netfs_io_request *rreq = subreq->rreq; + struct folio_queue *folioq = subreq->curr_folioq; + size_t avail, prev_donated, next_donated, fsize, part, excess; + loff_t fpos, start; + loff_t fend; + int slot = subreq->curr_folioq_slot; + + if (WARN(subreq->transferred > subreq->len, + "Subreq overread: R%x[%x] %zu > %zu", + rreq->debug_id, subreq->debug_index, + subreq->transferred, subreq->len)) + subreq->transferred = subreq->len; + +next_folio: + fsize = PAGE_SIZE << subreq->curr_folio_order; + fpos = round_down(subreq->start + subreq->consumed, fsize); + fend = fpos + fsize; + + if (WARN_ON_ONCE(!folioq) || + WARN_ON_ONCE(!folioq_folio(folioq, slot)) || + WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) { + pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n", + rreq->debug_id, subreq->debug_index, + subreq->start, subreq->start + subreq->transferred - 1, + subreq->consumed, subreq->transferred, subreq->len, + slot); + if (folioq) { + struct folio *folio = folioq_folio(folioq, slot); + + pr_err("folioq: orders=%02x%02x%02x%02x\n", + folioq->orders[0], folioq->orders[1], + folioq->orders[2], folioq->orders[3]); + if (folio) + pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n", + fpos, fend - 1, folio_pos(folio), folio_order(folio), + folioq_folio_order(folioq, slot)); + } + } + +donation_changed: + /* Try to consume the current folio if we've hit or passed the end of + * it. There's a possibility that this subreq doesn't start at the + * beginning of the folio, in which case we need to donate to/from the + * preceding subreq. + * + * We also need to include any potential donation back from the + * following subreq. + */ + prev_donated = READ_ONCE(subreq->prev_donated); + next_donated = READ_ONCE(subreq->next_donated); + if (prev_donated || next_donated) { + spin_lock_bh(&rreq->lock); + prev_donated = subreq->prev_donated; + next_donated = subreq->next_donated; + subreq->start -= prev_donated; + subreq->len += prev_donated; + subreq->transferred += prev_donated; + prev_donated = subreq->prev_donated = 0; + if (subreq->transferred == subreq->len) { + subreq->len += next_donated; + subreq->transferred += next_donated; + next_donated = subreq->next_donated = 0; + } + trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations); + spin_unlock_bh(&rreq->lock); + } + + avail = subreq->transferred; + if (avail == subreq->len) + avail += next_donated; + start = subreq->start; + if (subreq->consumed == 0) { + start -= prev_donated; + avail += prev_donated; + } else { + start += subreq->consumed; + avail -= subreq->consumed; + } + part = umin(avail, fsize); + + trace_netfs_progress(subreq, start, avail, part); + + if (start + avail >= fend) { + if (fpos == start) { + /* Flush, unlock and mark for caching any folio we've just read. */ + subreq->consumed = fend - subreq->start; + netfs_unlock_read_folio(subreq, rreq, folioq, slot); + folioq_mark2(folioq, slot); + if (subreq->consumed >= subreq->len) + goto remove_subreq; + } else if (fpos < start) { + excess = fend - subreq->start; + + spin_lock_bh(&rreq->lock); + /* If we complete first on a folio split with the + * preceding subreq, donate to that subreq - otherwise + * we get the responsibility. 
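+			 * (The span handed over is [subreq->start, fend); it is
+			 *  accounted by bumping prev->next_donated.)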
+ */ + if (subreq->prev_donated != prev_donated) { + spin_unlock_bh(&rreq->lock); + goto donation_changed; + } + + if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) { + spin_unlock_bh(&rreq->lock); + pr_err("Can't donate prior to front\n"); + goto bad; + } + + prev = list_prev_entry(subreq, rreq_link); + WRITE_ONCE(prev->next_donated, prev->next_donated + excess); + subreq->start += excess; + subreq->len -= excess; + subreq->transferred -= excess; + trace_netfs_donate(rreq, subreq, prev, excess, + netfs_trace_donate_tail_to_prev); + trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev); + + if (subreq->consumed >= subreq->len) + goto remove_subreq_locked; + spin_unlock_bh(&rreq->lock); + } else { + pr_err("fpos > start\n"); + goto bad; + } + + /* Advance the rolling buffer to the next folio. */ + slot++; + if (slot >= folioq_nr_slots(folioq)) { + slot = 0; + folioq = folioq->next; + subreq->curr_folioq = folioq; + } + subreq->curr_folioq_slot = slot; + if (folioq && folioq_folio(folioq, slot)) + subreq->curr_folio_order = folioq->orders[slot]; + if (!was_async) + cond_resched(); + goto next_folio; + } + + /* Deal with partial progress. */ + if (subreq->transferred < subreq->len) + return false; + + /* Donate the remaining downloaded data to one of the neighbouring + * subrequests. Note that we may race with them doing the same thing. + */ + spin_lock_bh(&rreq->lock); + + if (subreq->prev_donated != prev_donated || + subreq->next_donated != next_donated) { + spin_unlock_bh(&rreq->lock); + cond_resched(); + goto donation_changed; + } + + /* Deal with the trickiest case: that this subreq is in the middle of a + * folio, not touching either edge, but finishes first. In such a + * case, we donate to the previous subreq, if there is one, so that the + * donation is only handled when that completes - and remove this + * subreq from the list. + * + * If the previous subreq finished first, we will have acquired their + * donation and should be able to unlock folios and/or donate nextwards. + */ + if (!subreq->consumed && + !prev_donated && + !list_is_first(&subreq->rreq_link, &rreq->subrequests)) { + prev = list_prev_entry(subreq, rreq_link); + WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len); + subreq->start += subreq->len; + subreq->len = 0; + subreq->transferred = 0; + trace_netfs_donate(rreq, subreq, prev, subreq->len, + netfs_trace_donate_to_prev); + trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev); + goto remove_subreq_locked; + } + + /* If we can't donate down the chain, donate up the chain instead. */ + excess = subreq->len - subreq->consumed + next_donated; + + if (!subreq->consumed) + excess += prev_donated; + + if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + rreq->prev_donated = excess; + trace_netfs_donate(rreq, subreq, NULL, excess, + netfs_trace_donate_to_deferred_next); + } else { + next = list_next_entry(subreq, rreq_link); + WRITE_ONCE(next->prev_donated, excess); + trace_netfs_donate(rreq, subreq, next, excess, + netfs_trace_donate_to_next); + } + trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next); + subreq->len = subreq->consumed; + subreq->transferred = subreq->consumed; + goto remove_subreq_locked; + +remove_subreq: + spin_lock_bh(&rreq->lock); +remove_subreq_locked: + subreq->consumed = subreq->len; + list_del(&subreq->rreq_link); + spin_unlock_bh(&rreq->lock); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed); + return true; + +bad: + /* Errr... 
prev and next both donated to us, but insufficient to finish + * the folio. + */ + printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n", + rreq->debug_id, subreq->debug_index, + subreq->start, subreq->start + subreq->transferred - 1, + subreq->consumed, subreq->transferred, subreq->len); + printk("folio: %llx-%llx\n", fpos, fend - 1); + printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated); + printk("s=%llx av=%zx part=%zx\n", start, avail, part); + BUG(); +} + +/* + * Do page flushing and suchlike after DIO. + */ +static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + unsigned int i; + + /* Collect unbuffered reads and direct reads, adding up the transfer + * sizes until we find the first short or failed subrequest. + */ + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + rreq->transferred += subreq->transferred; + + if (subreq->transferred < subreq->len || + test_bit(NETFS_SREQ_FAILED, &subreq->flags)) { + rreq->error = subreq->error; + break; + } + } + + if (rreq->origin == NETFS_DIO_READ) { + for (i = 0; i < rreq->direct_bv_count; i++) { + flush_dcache_page(rreq->direct_bv[i].bv_page); + // TODO: cifs marks pages in the destination buffer + // dirty under some circumstances after a read. Do we + // need to do that too? + set_page_dirty(rreq->direct_bv[i].bv_page); + } + } + + if (rreq->iocb) { + rreq->iocb->ki_pos += rreq->transferred; + if (rreq->iocb->ki_complete) + rreq->iocb->ki_complete( + rreq->iocb, rreq->error ? rreq->error : rreq->transferred); + } + if (rreq->netfs_ops->done) + rreq->netfs_ops->done(rreq); + if (rreq->origin == NETFS_DIO_READ) + inode_dio_end(rreq->inode); +} + +/* + * Assess the state of a read request and decide what to do next. + * + * Note that we're in normal kernel thread context at this point, possibly + * running on a workqueue. + */ +static void netfs_rreq_assess(struct netfs_io_request *rreq) +{ + trace_netfs_rreq(rreq, netfs_rreq_trace_assess); + + //netfs_rreq_is_still_valid(rreq); + + if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) { + netfs_retry_reads(rreq); + return; + } + + if (rreq->origin == NETFS_DIO_READ || + rreq->origin == NETFS_READ_GAPS) + netfs_rreq_assess_dio(rreq); + task_io_account_read(rreq->transferred); + + trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); + clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + + trace_netfs_rreq(rreq, netfs_rreq_trace_done); + netfs_clear_subrequests(rreq, false); + netfs_unlock_abandoned_read_pages(rreq); + if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags))) + netfs_pgpriv2_write_to_the_cache(rreq); +} + +void netfs_read_termination_worker(struct work_struct *work) +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, work); + netfs_see_request(rreq, netfs_rreq_trace_see_work); + netfs_rreq_assess(rreq); + netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete); +} + +/* + * Handle the completion of all outstanding I/O operations on a read request. + * We inherit a ref from the caller. + */ +void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async) +{ + if (!was_async) + return netfs_rreq_assess(rreq); + if (!work_pending(&rreq->work)) { + netfs_get_request(rreq, netfs_rreq_trace_get_work); + if (!queue_work(system_unbound_wq, &rreq->work)) + netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq); + } +} + +/** + * netfs_read_subreq_progress - Note progress of a read operation. 
+ * @subreq: The read request that has terminated. + * @was_async: True if we're in an asynchronous context. + * + * This tells the read side of netfs lib that a contributory I/O operation has + * made some progress and that it may be possible to unlock some folios. + * + * Before calling, the filesystem should update subreq->transferred to track + * the amount of data copied into the output buffer. + * + * If @was_async is true, the caller might be running in softirq or interrupt + * context and we can't sleep. + */ +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, + bool was_async) +{ + struct netfs_io_request *rreq = subreq->rreq; + + trace_netfs_sreq(subreq, netfs_sreq_trace_progress); + + if (subreq->transferred > subreq->consumed && + (rreq->origin == NETFS_READAHEAD || + rreq->origin == NETFS_READPAGE || + rreq->origin == NETFS_READ_FOR_WRITE)) { + netfs_consume_read_data(subreq, was_async); + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + } +} +EXPORT_SYMBOL(netfs_read_subreq_progress); + +/** + * netfs_read_subreq_terminated - Note the termination of an I/O operation. + * @subreq: The I/O request that has terminated. + * @error: Error code indicating type of completion. + * @was_async: The termination was asynchronous + * + * This tells the read helper that a contributory I/O operation has terminated, + * one way or another, and that it should integrate the results. + * + * The caller indicates the outcome of the operation through @error, supplying + * 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY + * is set) or a negative error code. The helper will look after reissuing I/O + * operations as appropriate and writing downloaded data to the cache. + * + * Before calling, the filesystem should update subreq->transferred to track + * the amount of data copied into the output buffer. + * + * If @was_async is true, the caller might be running in softirq or interrupt + * context and we can't sleep. + */ +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, + int error, bool was_async) +{ + struct netfs_io_request *rreq = subreq->rreq; + + switch (subreq->source) { + case NETFS_READ_FROM_CACHE: + netfs_stat(&netfs_n_rh_read_done); + break; + case NETFS_DOWNLOAD_FROM_SERVER: + netfs_stat(&netfs_n_rh_download_done); + break; + default: + break; + } + + if (rreq->origin != NETFS_DIO_READ) { + /* Collect buffered reads. + * + * If the read completed validly short, then we can clear the + * tail before going on to unlock the folios. + */ + if (error == 0 && subreq->transferred < subreq->len && + (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) || + test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) { + netfs_clear_unread(subreq); + subreq->transferred = subreq->len; + trace_netfs_sreq(subreq, netfs_sreq_trace_clear); + } + if (subreq->transferred > subreq->consumed && + (rreq->origin == NETFS_READAHEAD || + rreq->origin == NETFS_READPAGE || + rreq->origin == NETFS_READ_FOR_WRITE)) { + netfs_consume_read_data(subreq, was_async); + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + } + rreq->transferred += subreq->transferred; + } + + /* Deal with retry requests, short reads and errors. If we retry + * but don't make progress, we abandon the attempt. 
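+	 * Roughly: a short read that consumed something is retried; one that
+	 * made no progress is retried once (NO_PROGRESS); on a second stall
+	 * the subrequest is failed with -ENODATA.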
+ */ + if (!error && subreq->transferred < subreq->len) { + if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) { + trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof); + } else { + trace_netfs_sreq(subreq, netfs_sreq_trace_short); + if (subreq->transferred > subreq->consumed) { + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); + } else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); + } else { + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); + error = -ENODATA; + } + } + } + + subreq->error = error; + trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); + + if (unlikely(error < 0)) { + trace_netfs_failure(rreq, subreq, error, netfs_fail_read); + if (subreq->source == NETFS_READ_FROM_CACHE) { + netfs_stat(&netfs_n_rh_read_failed); + } else { + netfs_stat(&netfs_n_rh_download_failed); + set_bit(NETFS_RREQ_FAILED, &rreq->flags); + rreq->error = subreq->error; + } + } + + if (atomic_dec_and_test(&rreq->nr_outstanding)) + netfs_rreq_terminated(rreq, was_async); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); +} +EXPORT_SYMBOL(netfs_read_subreq_terminated); diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c new file mode 100644 index 000000000000..ba5af89d37fa --- /dev/null +++ b/fs/netfs/read_pgpriv2.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Read with PG_private_2 [DEPRECATED]. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/task_io_accounting_ops.h> +#include "internal.h" + +/* + * [DEPRECATED] Mark page as requiring copy-to-cache using PG_private_2. The + * third mark in the folio queue is used to indicate that this folio needs + * writing. + */ +void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, + struct netfs_io_request *rreq, + struct folio_queue *folioq, + int slot) +{ + struct folio *folio = folioq_folio(folioq, slot); + + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_start_private_2(folio); + folioq_mark3(folioq, slot); +} + +/* + * [DEPRECATED] Cancel PG_private_2 on all marked folios in the event of an + * unrecoverable error. + */ +static void netfs_pgpriv2_cancel(struct folio_queue *folioq) +{ + struct folio *folio; + int slot; + + while (folioq) { + if (!folioq->marks3) { + folioq = folioq->next; + continue; + } + + slot = __ffs(folioq->marks3); + folio = folioq_folio(folioq, slot); + + trace_netfs_folio(folio, netfs_folio_trace_cancel_copy); + folio_end_private_2(folio); + folioq_unmark3(folioq, slot); + } +} + +/* + * [DEPRECATED] Copy a folio to the cache with PG_private_2 set. + */ +static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio *folio) +{ + struct netfs_io_stream *cache = &wreq->io_streams[1]; + size_t fsize = folio_size(folio), flen = fsize; + loff_t fpos = folio_pos(folio), i_size; + bool to_eof = false; + + _enter(""); + + /* netfs_perform_write() may shift i_size around the page or from out + * of the page to beyond it, but cannot move i_size into or through the + * page since we have it locked. + */ + i_size = i_size_read(wreq->inode); + + if (fpos >= i_size) { + /* mmap beyond eof. 
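+		 * (The folio lies wholly at or beyond i_size, so there is
+		 *  nothing to copy; just drop PG_private_2 and return.)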
*/ + _debug("beyond eof"); + folio_end_private_2(folio); + return 0; + } + + if (fpos + fsize > wreq->i_size) + wreq->i_size = i_size; + + if (flen > i_size - fpos) { + flen = i_size - fpos; + to_eof = true; + } else if (flen == i_size - fpos) { + to_eof = true; + } + + _debug("folio %zx %zx", flen, fsize); + + trace_netfs_folio(folio, netfs_folio_trace_store_copy); + + /* Attach the folio to the rolling buffer. */ + if (netfs_buffer_append_folio(wreq, folio, false) < 0) + return -ENOMEM; + + cache->submit_extendable_to = fsize; + cache->submit_off = 0; + cache->submit_len = flen; + + /* Attach the folio to one or more subrequests. For a big folio, we + * could end up with thousands of subrequests if the wsize is small - + * but we might need to wait during the creation of subrequests for + * network resources (eg. SMB credits). + */ + do { + ssize_t part; + + wreq->io_iter.iov_offset = cache->submit_off; + + atomic64_set(&wreq->issued_to, fpos + cache->submit_off); + cache->submit_extendable_to = fsize - cache->submit_off; + part = netfs_advance_write(wreq, cache, fpos + cache->submit_off, + cache->submit_len, to_eof); + cache->submit_off += part; + if (part > cache->submit_len) + cache->submit_len = 0; + else + cache->submit_len -= part; + } while (cache->submit_len > 0); + + wreq->io_iter.iov_offset = 0; + iov_iter_advance(&wreq->io_iter, fsize); + atomic64_set(&wreq->issued_to, fpos + fsize); + + if (flen < fsize) + netfs_issue_write(wreq, cache); + + _leave(" = 0"); + return 0; +} + +/* + * [DEPRECATED] Go through the buffer and write any folios that are marked with + * the third mark to the cache. + */ +void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq) +{ + struct netfs_io_request *wreq; + struct folio_queue *folioq; + struct folio *folio; + int error = 0; + int slot = 0; + + _enter(""); + + if (!fscache_resources_valid(&rreq->cache_resources)) + goto couldnt_start; + + /* Need the first folio to be able to set up the op. */ + for (folioq = rreq->buffer; folioq; folioq = folioq->next) { + if (folioq->marks3) { + slot = __ffs(folioq->marks3); + break; + } + } + if (!folioq) + return; + folio = folioq_folio(folioq, slot); + + wreq = netfs_create_write_req(rreq->mapping, NULL, folio_pos(folio), + NETFS_PGPRIV2_COPY_TO_CACHE); + if (IS_ERR(wreq)) { + kleave(" [create %ld]", PTR_ERR(wreq)); + goto couldnt_start; + } + + trace_netfs_write(wreq, netfs_write_trace_copy_to_cache); + netfs_stat(&netfs_n_wh_copy_to_cache); + + for (;;) { + error = netfs_pgpriv2_copy_folio(wreq, folio); + if (error < 0) + break; + + folioq_unmark3(folioq, slot); + if (!folioq->marks3) { + folioq = folioq->next; + if (!folioq) + break; + } + + slot = __ffs(folioq->marks3); + folio = folioq_folio(folioq, slot); + } + + netfs_issue_write(wreq, &wreq->io_streams[1]); + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + _leave(" = %d", error); +couldnt_start: + netfs_pgpriv2_cancel(rreq->buffer); +} + +/* + * [DEPRECATED] Remove the PG_private_2 mark from any folios we've finished + * copying. 
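+ * We walk the rolling buffer in file order, ending PG_private_2 on each
+ * folio written out up to wreq->collected_to and freeing head folio_queue
+ * nodes as they empty.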
+ */ +bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) +{ + struct folio_queue *folioq = wreq->buffer; + unsigned long long collected_to = wreq->collected_to; + unsigned int slot = wreq->buffer_head_slot; + bool made_progress = false; + + if (slot >= folioq_nr_slots(folioq)) { + folioq = netfs_delete_buffer_head(wreq); + slot = 0; + } + + for (;;) { + struct folio *folio; + unsigned long long fpos, fend; + size_t fsize, flen; + + folio = folioq_folio(folioq, slot); + if (WARN_ONCE(!folio_test_private_2(folio), + "R=%08x: folio %lx is not marked private_2\n", + wreq->debug_id, folio->index)) + trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); + + fpos = folio_pos(folio); + fsize = folio_size(folio); + flen = fsize; + + fend = min_t(unsigned long long, fpos + flen, wreq->i_size); + + trace_netfs_collect_folio(wreq, folio, fend, collected_to); + + /* Unlock any folio we've transferred all of. */ + if (collected_to < fend) + break; + + trace_netfs_folio(folio, netfs_folio_trace_end_copy); + folio_end_private_2(folio); + wreq->cleaned_to = fpos + fsize; + made_progress = true; + + /* Clean up the head folioq. If we clear an entire folioq, then + * we can get rid of it provided it's not also the tail folioq + * being filled by the issuer. + */ + folioq_clear(folioq, slot); + slot++; + if (slot >= folioq_nr_slots(folioq)) { + if (READ_ONCE(wreq->buffer_tail) == folioq) + break; + folioq = netfs_delete_buffer_head(wreq); + slot = 0; + } + + if (fpos + fsize >= collected_to) + break; + } + + wreq->buffer = folioq; + wreq->buffer_head_slot = slot; + return made_progress; +} diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c new file mode 100644 index 000000000000..0350592ea804 --- /dev/null +++ b/fs/netfs/read_retry.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem read subrequest retrying. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include "internal.h" + +static void netfs_reissue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + struct iov_iter *io_iter = &subreq->io_iter; + + if (iov_iter_is_folioq(io_iter)) { + subreq->curr_folioq = (struct folio_queue *)io_iter->folioq; + subreq->curr_folioq_slot = io_iter->folioq_slot; + subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot]; + } + + atomic_inc(&rreq->nr_outstanding); + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + subreq->rreq->netfs_ops->issue_read(subreq); +} + +/* + * Go through the list of failed/short reads, retrying all retryable ones. We + * need to switch failed cache reads to network downloads. + */ +static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream0 = &rreq->io_streams[0]; + LIST_HEAD(sublist); + LIST_HEAD(queue); + + _enter("R=%x", rreq->debug_id); + + if (list_empty(&rreq->subrequests)) + return; + + if (rreq->netfs_ops->retry_request) + rreq->netfs_ops->retry_request(rreq, NULL); + + /* If there's no renegotiation to do, just resend each retryable subreq + * up to the first permanently failed one. 
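+	 * (No buffer reshuffling is needed: each such subreq keeps its
+	 *  iterator, which is rewound with netfs_reset_iter() before reissue.)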
+ */ + if (!rreq->netfs_ops->prepare_read && + !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) { + struct netfs_io_subrequest *subreq; + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) + break; + if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + netfs_reset_iter(subreq); + netfs_reissue_read(rreq, subreq); + } + } + return; + } + + /* Okay, we need to renegotiate all the download requests and flip any + * failed cache reads over to being download requests and negotiate + * those also. All fully successful subreqs have been removed from the + * list and any spare data from those has been donated. + * + * What we do is decant the list and rebuild it one subreq at a time so + * that we don't end up with donations jumping over a gap we're busy + * populating with smaller subrequests. In the event that the subreq + * we just launched finishes before we insert the next subreq, it'll + * fill in rreq->prev_donated instead. + + * Note: Alternatively, we could split the tail subrequest right before + * we reissue it and fix up the donations under lock. + */ + list_splice_init(&rreq->subrequests, &queue); + + do { + struct netfs_io_subrequest *from; + struct iov_iter source; + unsigned long long start, len; + size_t part, deferred_next_donated = 0; + bool boundary = false; + + /* Go through the subreqs and find the next span of contiguous + * buffer that we then rejig (cifs, for example, needs the + * rsize renegotiating) and reissue. + */ + from = list_first_entry(&queue, struct netfs_io_subrequest, rreq_link); + list_move_tail(&from->rreq_link, &sublist); + start = from->start + from->transferred; + len = from->len - from->transferred; + + _debug("from R=%08x[%x] s=%llx ctl=%zx/%zx/%zx", + rreq->debug_id, from->debug_index, + from->start, from->consumed, from->transferred, from->len); + + if (test_bit(NETFS_SREQ_FAILED, &from->flags) || + !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) + goto abandon; + + deferred_next_donated = from->next_donated; + while ((subreq = list_first_entry_or_null( + &queue, struct netfs_io_subrequest, rreq_link))) { + if (subreq->start != start + len || + subreq->transferred > 0 || + !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) + break; + list_move_tail(&subreq->rreq_link, &sublist); + len += subreq->len; + deferred_next_donated = subreq->next_donated; + if (test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags)) + break; + } + + _debug(" - range: %llx-%llx %llx", start, start + len - 1, len); + + /* Determine the set of buffers we're going to use. Each + * subreq gets a subset of a single overall contiguous buffer. + */ + netfs_reset_iter(from); + source = from->io_iter; + source.count = len; + + /* Work through the sublist. 
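+		 * Each subreq takes the next slice of the contiguous span: its
+		 * size is renegotiated via ->prepare_read(), clamped to
+		 * sreq_max_len/sreq_max_segs, and the subreq is then reissued.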
*/ + while ((subreq = list_first_entry_or_null( + &sublist, struct netfs_io_subrequest, rreq_link))) { + list_del(&subreq->rreq_link); + + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->start = start - subreq->transferred; + subreq->len = len + subreq->transferred; + stream0->sreq_max_len = subreq->len; + + __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + + spin_lock_bh(&rreq->lock); + list_add_tail(&subreq->rreq_link, &rreq->subrequests); + subreq->prev_donated += rreq->prev_donated; + rreq->prev_donated = 0; + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + spin_unlock_bh(&rreq->lock); + + BUG_ON(!len); + + /* Renegotiate max_len (rsize) */ + if (rreq->netfs_ops->prepare_read(subreq) < 0) { + trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed); + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); + } + + part = umin(len, stream0->sreq_max_len); + if (unlikely(rreq->io_streams[0].sreq_max_segs)) + part = netfs_limit_iter(&source, 0, part, stream0->sreq_max_segs); + subreq->len = subreq->transferred + part; + subreq->io_iter = source; + iov_iter_truncate(&subreq->io_iter, part); + iov_iter_advance(&source, part); + len -= part; + start += part; + if (!len) { + if (boundary) + __set_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); + subreq->next_donated = deferred_next_donated; + } else { + __clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); + subreq->next_donated = 0; + } + + netfs_reissue_read(rreq, subreq); + if (!len) + break; + + /* If we ran out of subrequests, allocate another. */ + if (list_empty(&sublist)) { + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) + goto abandon; + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->start = start; + + /* We get two refs, but need just one. */ + netfs_put_subrequest(subreq, false, netfs_sreq_trace_new); + trace_netfs_sreq(subreq, netfs_sreq_trace_split); + list_add_tail(&subreq->rreq_link, &sublist); + } + } + + /* If we managed to use fewer subreqs, we can discard the + * excess. + */ + while ((subreq = list_first_entry_or_null( + &sublist, struct netfs_io_subrequest, rreq_link))) { + trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + } + + } while (!list_empty(&queue)); + + return; + + /* If we hit ENOMEM, fail all remaining subrequests */ +abandon: + list_splice_init(&sublist, &queue); + list_for_each_entry(subreq, &queue, rreq_link) { + if (!subreq->error) + subreq->error = -ENOMEM; + __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); + __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + __clear_bit(NETFS_SREQ_RETRYING, &subreq->flags); + } + spin_lock_bh(&rreq->lock); + list_splice_tail_init(&queue, &rreq->subrequests); + spin_unlock_bh(&rreq->lock); +} + +/* + * Retry reads. + */ +void netfs_retry_reads(struct netfs_io_request *rreq) +{ + trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); + + atomic_inc(&rreq->nr_outstanding); + + netfs_retry_read_subrequests(rreq); + + if (atomic_dec_and_test(&rreq->nr_outstanding)) + netfs_rreq_terminated(rreq, false); +} + +/* + * Unlock any the pages that haven't been unlocked yet due to abandoned + * subrequests. 
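+ * (Folios that were never consumed carry no second mark; unlock them so
+ *  an abandoned read doesn't leave folios locked.)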
+ */ +void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq) +{ + struct folio_queue *p; + + for (p = rreq->buffer; p; p = p->next) { + for (int slot = 0; slot < folioq_count(p); slot++) { + struct folio *folio = folioq_folio(p, slot); + + if (folio && !folioq_is_marked2(p, slot)) { + trace_netfs_folio(folio, netfs_folio_trace_abandon); + folio_unlock(folio); + } + } + } +} diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 0892768eea32..8e63516b40f6 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -32,6 +32,7 @@ atomic_t netfs_n_wh_buffered_write; atomic_t netfs_n_wh_writethrough; atomic_t netfs_n_wh_dio_write; atomic_t netfs_n_wh_writepages; +atomic_t netfs_n_wh_copy_to_cache; atomic_t netfs_n_wh_wstream_conflict; atomic_t netfs_n_wh_upload; atomic_t netfs_n_wh_upload_done; @@ -39,45 +40,53 @@ atomic_t netfs_n_wh_upload_failed; atomic_t netfs_n_wh_write; atomic_t netfs_n_wh_write_done; atomic_t netfs_n_wh_write_failed; +atomic_t netfs_n_wb_lock_skip; +atomic_t netfs_n_wb_lock_wait; +atomic_t netfs_n_folioq; int netfs_stats_show(struct seq_file *m, void *v) { - seq_printf(m, "Netfs : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n", + seq_printf(m, "Reads : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n", atomic_read(&netfs_n_rh_dio_read), atomic_read(&netfs_n_rh_readahead), atomic_read(&netfs_n_rh_read_folio), atomic_read(&netfs_n_rh_write_begin), atomic_read(&netfs_n_rh_write_zskip)); - seq_printf(m, "Netfs : BW=%u WT=%u DW=%u WP=%u\n", + seq_printf(m, "Writes : BW=%u WT=%u DW=%u WP=%u 2C=%u\n", atomic_read(&netfs_n_wh_buffered_write), atomic_read(&netfs_n_wh_writethrough), atomic_read(&netfs_n_wh_dio_write), - atomic_read(&netfs_n_wh_writepages)); - seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n", + atomic_read(&netfs_n_wh_writepages), + atomic_read(&netfs_n_wh_copy_to_cache)); + seq_printf(m, "ZeroOps: ZR=%u sh=%u sk=%u\n", atomic_read(&netfs_n_rh_zero), atomic_read(&netfs_n_rh_short_read), atomic_read(&netfs_n_rh_write_zskip)); - seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n", + seq_printf(m, "DownOps: DL=%u ds=%u df=%u di=%u\n", atomic_read(&netfs_n_rh_download), atomic_read(&netfs_n_rh_download_done), atomic_read(&netfs_n_rh_download_failed), atomic_read(&netfs_n_rh_download_instead)); - seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n", + seq_printf(m, "CaRdOps: RD=%u rs=%u rf=%u\n", atomic_read(&netfs_n_rh_read), atomic_read(&netfs_n_rh_read_done), atomic_read(&netfs_n_rh_read_failed)); - seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n", + seq_printf(m, "UpldOps: UL=%u us=%u uf=%u\n", atomic_read(&netfs_n_wh_upload), atomic_read(&netfs_n_wh_upload_done), atomic_read(&netfs_n_wh_upload_failed)); - seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n", + seq_printf(m, "CaWrOps: WR=%u ws=%u wf=%u\n", atomic_read(&netfs_n_wh_write), atomic_read(&netfs_n_wh_write_done), atomic_read(&netfs_n_wh_write_failed)); - seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n", + seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n", atomic_read(&netfs_n_rh_rreq), atomic_read(&netfs_n_rh_sreq), + atomic_read(&netfs_n_folioq), atomic_read(&netfs_n_wh_wstream_conflict)); + seq_printf(m, "WbLock : skip=%u wait=%u\n", + atomic_read(&netfs_n_wb_lock_skip), + atomic_read(&netfs_n_wb_lock_wait)); return fscache_stats_show(m); } EXPORT_SYMBOL(netfs_stats_show); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 426cf87aaf2e..1d438be2e1b4 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -15,15 +15,11 @@ /* Notes made in the collector */ #define HIT_PENDING 0x01 /* A front op was still pending */ 
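 /* The collector ORs these notes into a single unsigned int while sweeping
  * the streams; see netfs_collect_write_results(). */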
-#define SOME_EMPTY 0x02 /* One of more streams are empty */ -#define ALL_EMPTY 0x04 /* All streams are empty */ -#define MAYBE_DISCONTIG 0x08 /* A front op may be discontiguous (rounded to PAGE_SIZE) */ -#define NEED_REASSESS 0x10 /* Need to loop round and reassess */ -#define REASSESS_DISCONTIG 0x20 /* Reassess discontiguity if contiguity advances */ -#define MADE_PROGRESS 0x40 /* Made progress cleaning up a stream or the folio set */ -#define BUFFERED 0x80 /* The pagecache needs cleaning up */ -#define NEED_RETRY 0x100 /* A front op requests retrying */ -#define SAW_FAILURE 0x200 /* One stream or hit a permanent failure */ +#define NEED_REASSESS 0x02 /* Need to loop round and reassess */ +#define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */ +#define BUFFERED 0x08 /* The pagecache needs cleaning up */ +#define NEED_RETRY 0x10 /* A front op requests retrying */ +#define SAW_FAILURE 0x20 /* One stream or hit a permanent failure */ /* * Successful completion of write of a folio to the server and/or cache. Note @@ -33,6 +29,7 @@ int netfs_folio_written_back(struct folio *folio) { enum netfs_folio_trace why = netfs_folio_trace_clear; + struct netfs_inode *ictx = netfs_inode(folio->mapping->host); struct netfs_folio *finfo; struct netfs_group *group = NULL; int gcount = 0; @@ -41,6 +38,12 @@ int netfs_folio_written_back(struct folio *folio) /* Streaming writes cannot be redirtied whilst under writeback, * so discard the streaming record. */ + unsigned long long fend; + + fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len; + if (fend > ictx->zero_point) + ictx->zero_point = fend; + folio_detach_private(folio); group = finfo->netfs_group; gcount++; @@ -75,55 +78,37 @@ end_wb: } /* - * Get hold of a folio we have under writeback. We don't want to get the - * refcount on it. + * Unlock any folios we've finished with. */ -static struct folio *netfs_writeback_lookup_folio(struct netfs_io_request *wreq, loff_t pos) +static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, + unsigned int *notes) { - XA_STATE(xas, &wreq->mapping->i_pages, pos / PAGE_SIZE); - struct folio *folio; - - rcu_read_lock(); - - for (;;) { - xas_reset(&xas); - folio = xas_load(&xas); - if (xas_retry(&xas, folio)) - continue; + struct folio_queue *folioq = wreq->buffer; + unsigned long long collected_to = wreq->collected_to; + unsigned int slot = wreq->buffer_head_slot; - if (!folio || xa_is_value(folio)) - kdebug("R=%08x: folio %lx (%llx) not present", - wreq->debug_id, xas.xa_index, pos / PAGE_SIZE); - BUG_ON(!folio || xa_is_value(folio)); - - if (folio == xas_reload(&xas)) - break; + if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) { + if (netfs_pgpriv2_unlock_copied_folios(wreq)) + *notes |= MADE_PROGRESS; + return; } - rcu_read_unlock(); - - if (WARN_ONCE(!folio_test_writeback(folio), - "R=%08x: folio %lx is not under writeback\n", - wreq->debug_id, folio->index)) { - trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); + if (slot >= folioq_nr_slots(folioq)) { + folioq = netfs_delete_buffer_head(wreq); + slot = 0; } - return folio; -} -/* - * Unlock any folios we've finished with. 
- */ -static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, - unsigned long long collected_to, - unsigned int *notes) -{ for (;;) { struct folio *folio; struct netfs_folio *finfo; unsigned long long fpos, fend; size_t fsize, flen; - folio = netfs_writeback_lookup_folio(wreq, wreq->cleaned_to); + folio = folioq_folio(folioq, slot); + if (WARN_ONCE(!folio_test_writeback(folio), + "R=%08x: folio %lx is not under writeback\n", + wreq->debug_id, folio->index)) + trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); fpos = folio_pos(folio); fsize = folio_size(folio); @@ -134,12 +119,6 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, trace_netfs_collect_folio(wreq, folio, fend, collected_to); - if (fpos + fsize > wreq->contiguity) { - trace_netfs_collect_contig(wreq, fpos + fsize, - netfs_contig_trace_unlock); - wreq->contiguity = fpos + fsize; - } - /* Unlock any folio we've transferred all of. */ if (collected_to < fend) break; @@ -148,9 +127,25 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, wreq->cleaned_to = fpos + fsize; *notes |= MADE_PROGRESS; + /* Clean up the head folioq. If we clear an entire folioq, then + * we can get rid of it provided it's not also the tail folioq + * being filled by the issuer. + */ + folioq_clear(folioq, slot); + slot++; + if (slot >= folioq_nr_slots(folioq)) { + if (READ_ONCE(wreq->buffer_tail) == folioq) + break; + folioq = netfs_delete_buffer_head(wreq); + slot = 0; + } + if (fpos + fsize >= collected_to) break; } + + wreq->buffer = folioq; + wreq->buffer_head_slot = slot; } /* @@ -181,9 +176,12 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) break; if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + struct iov_iter source = subreq->io_iter; + + iov_iter_revert(&source, subreq->len - source.count); __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - netfs_reissue_write(stream, subreq); + netfs_reissue_write(stream, subreq, &source); } } return; @@ -193,6 +191,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, do { struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp; + struct iov_iter source; unsigned long long start, len; size_t part; bool boundary = false; @@ -220,6 +219,13 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, len += to->len; } + /* Determine the set of buffers we're going to use. Each + * subreq gets a subset of a single overall contiguous buffer. + */ + netfs_reset_iter(from); + source = from->io_iter; + source.count = len; + /* Work through the sublist. 
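		 * Each subreq is re-prepared via ->prepare_write() and given the
		 * next part of the span from the shared source iterator before
		 * being reissued.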
*/ subreq = from; list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { @@ -231,7 +237,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); stream->prepare_write(subreq); - part = min(len, subreq->max_len); + part = min(len, stream->sreq_max_len); subreq->len = part; subreq->start = start; subreq->transferred = 0; @@ -242,7 +248,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, boundary = true; netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - netfs_reissue_write(stream, subreq); + netfs_reissue_write(stream, subreq, &source); if (subreq == to) break; } @@ -271,8 +277,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, subreq = netfs_alloc_subrequest(wreq); subreq->source = to->source; subreq->start = start; - subreq->max_len = len; - subreq->max_nr_segs = INT_MAX; subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); subreq->stream_nr = to->stream_nr; __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); @@ -286,10 +290,12 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, to = list_next_entry(to, rreq_link); trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + stream->sreq_max_len = len; + stream->sreq_max_segs = INT_MAX; switch (stream->source) { case NETFS_UPLOAD_TO_SERVER: netfs_stat(&netfs_n_wh_upload); - subreq->max_len = min(len, wreq->wsize); + stream->sreq_max_len = umin(len, wreq->wsize); break; case NETFS_WRITE_TO_CACHE: netfs_stat(&netfs_n_wh_write); @@ -300,7 +306,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, stream->prepare_write(subreq); - part = min(len, subreq->max_len); + part = umin(len, stream->sreq_max_len); subreq->len = subreq->transferred + part; len -= part; start += part; @@ -309,7 +315,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, boundary = false; } - netfs_reissue_write(stream, subreq); + netfs_reissue_write(stream, subreq, &source); if (!len) break; @@ -370,7 +376,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) { struct netfs_io_subrequest *front, *remove; struct netfs_io_stream *stream; - unsigned long long collected_to; + unsigned long long collected_to, issued_to; unsigned int notes; int s; @@ -379,28 +385,22 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) trace_netfs_rreq(wreq, netfs_rreq_trace_collect); reassess_streams: + issued_to = atomic64_read(&wreq->issued_to); smp_rmb(); collected_to = ULLONG_MAX; - if (wreq->origin == NETFS_WRITEBACK) - notes = ALL_EMPTY | BUFFERED | MAYBE_DISCONTIG; - else if (wreq->origin == NETFS_WRITETHROUGH) - notes = ALL_EMPTY | BUFFERED; + if (wreq->origin == NETFS_WRITEBACK || + wreq->origin == NETFS_WRITETHROUGH || + wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) + notes = BUFFERED; else - notes = ALL_EMPTY; + notes = 0; /* Remove completed subrequests from the front of the streams and * advance the completion point on each stream. We stop when we hit * something that's in progress. The issuer thread may be adding stuff * to the tail whilst we're doing this. - * - * We must not, however, merge in discontiguities that span whole - * folios that aren't under writeback. This is made more complicated - * by the folios in the gap being of unpredictable sizes - if they even - * exist - but we don't want to look them up. 
*/ for (s = 0; s < NR_IO_STREAMS; s++) { - loff_t rstart, rend; - stream = &wreq->io_streams[s]; /* Read active flag before list pointers */ if (!smp_load_acquire(&stream->active)) @@ -412,26 +412,10 @@ reassess_streams: //_debug("sreq [%x] %llx %zx/%zx", // front->debug_index, front->start, front->transferred, front->len); - /* Stall if there may be a discontinuity. */ - rstart = round_down(front->start, PAGE_SIZE); - if (rstart > wreq->contiguity) { - if (wreq->contiguity > stream->collected_to) { - trace_netfs_collect_gap(wreq, stream, - wreq->contiguity, 'D'); - stream->collected_to = wreq->contiguity; - } - notes |= REASSESS_DISCONTIG; - break; + if (stream->collected_to < front->start) { + trace_netfs_collect_gap(wreq, stream, issued_to, 'F'); + stream->collected_to = front->start; } - rend = round_up(front->start + front->len, PAGE_SIZE); - if (rend > wreq->contiguity) { - trace_netfs_collect_contig(wreq, rend, - netfs_contig_trace_collect); - wreq->contiguity = rend; - if (notes & REASSESS_DISCONTIG) - notes |= NEED_REASSESS; - } - notes &= ~MAYBE_DISCONTIG; /* Stall if the front is still undergoing I/O. */ if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) { @@ -466,33 +450,27 @@ reassess_streams: cancel: /* Remove if completely consumed. */ - spin_lock(&wreq->lock); + spin_lock_bh(&wreq->lock); remove = front; list_del_init(&front->rreq_link); front = list_first_entry_or_null(&stream->subrequests, struct netfs_io_subrequest, rreq_link); stream->front = front; - if (!front) { - unsigned long long jump_to = atomic64_read(&wreq->issued_to); - - if (stream->collected_to < jump_to) { - trace_netfs_collect_gap(wreq, stream, jump_to, 'A'); - stream->collected_to = jump_to; - } - } - - spin_unlock(&wreq->lock); + spin_unlock_bh(&wreq->lock); netfs_put_subrequest(remove, false, notes & SAW_FAILURE ? netfs_sreq_trace_put_cancel : netfs_sreq_trace_put_done); } - if (front) - notes &= ~ALL_EMPTY; - else - notes |= SOME_EMPTY; + /* If we have an empty stream, we need to jump it forward + * otherwise the collection point will never advance. + */ + if (!front && issued_to > stream->collected_to) { + trace_netfs_collect_gap(wreq, stream, issued_to, 'E'); + stream->collected_to = issued_to; + } if (stream->collected_to < collected_to) collected_to = stream->collected_to; @@ -501,36 +479,6 @@ reassess_streams: if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to) wreq->collected_to = collected_to; - /* If we have an empty stream, we need to jump it forward over any gap - * otherwise the collection point will never advance. - * - * Note that the issuer always adds to the stream with the lowest - * so-far submitted start, so if we see two consecutive subreqs in one - * stream with nothing between then in another stream, then the second - * stream has a gap that can be jumped. 
- */ - if (notes & SOME_EMPTY) { - unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted); - - for (s = 0; s < NR_IO_STREAMS; s++) { - stream = &wreq->io_streams[s]; - if (stream->active && - stream->front && - stream->front->start < jump_to) - jump_to = stream->front->start; - } - - for (s = 0; s < NR_IO_STREAMS; s++) { - stream = &wreq->io_streams[s]; - if (stream->active && - !stream->front && - stream->collected_to < jump_to) { - trace_netfs_collect_gap(wreq, stream, jump_to, 'B'); - stream->collected_to = jump_to; - } - } - } - for (s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; if (stream->active) @@ -541,43 +489,14 @@ reassess_streams: /* Unlock any folios that we have now finished with. */ if (notes & BUFFERED) { - unsigned long long clean_to = min(wreq->collected_to, wreq->contiguity); - - if (wreq->cleaned_to < clean_to) - netfs_writeback_unlock_folios(wreq, clean_to, ¬es); + if (wreq->cleaned_to < wreq->collected_to) + netfs_writeback_unlock_folios(wreq, ¬es); } else { wreq->cleaned_to = wreq->collected_to; } // TODO: Discard encryption buffers - /* If all streams are discontiguous with the last folio we cleared, we - * may need to skip a set of folios. - */ - if ((notes & (MAYBE_DISCONTIG | ALL_EMPTY)) == MAYBE_DISCONTIG) { - unsigned long long jump_to = ULLONG_MAX; - - for (s = 0; s < NR_IO_STREAMS; s++) { - stream = &wreq->io_streams[s]; - if (stream->active && stream->front && - stream->front->start < jump_to) - jump_to = stream->front->start; - } - - trace_netfs_collect_contig(wreq, jump_to, netfs_contig_trace_jump); - wreq->contiguity = jump_to; - wreq->cleaned_to = jump_to; - wreq->collected_to = jump_to; - for (s = 0; s < NR_IO_STREAMS; s++) { - stream = &wreq->io_streams[s]; - if (stream->collected_to < jump_to) - stream->collected_to = jump_to; - } - //cond_resched(); - notes |= MADE_PROGRESS; - goto reassess_streams; - } - if (notes & NEED_RETRY) goto need_retry; if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 9258d30cffe3..04e66d587f77 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -94,6 +94,9 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, { struct netfs_io_request *wreq; struct netfs_inode *ictx; + bool is_buffered = (origin == NETFS_WRITEBACK || + origin == NETFS_WRITETHROUGH || + origin == NETFS_PGPRIV2_COPY_TO_CACHE); wreq = netfs_alloc_request(mapping, file, start, 0, origin); if (IS_ERR(wreq)) @@ -102,12 +105,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, _enter("R=%x", wreq->debug_id); ictx = netfs_inode(wreq->inode); - if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) + if (is_buffered && netfs_is_cache_enabled(ictx)) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); - wreq->contiguity = wreq->start; wreq->cleaned_to = wreq->start; - INIT_WORK(&wreq->work, netfs_write_collection_worker); wreq->io_streams[0].stream_nr = 0; wreq->io_streams[0].source = NETFS_UPLOAD_TO_SERVER; @@ -156,22 +157,19 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, subreq = netfs_alloc_subrequest(wreq); subreq->source = stream->source; subreq->start = start; - subreq->max_len = ULONG_MAX; - subreq->max_nr_segs = INT_MAX; subreq->stream_nr = stream->stream_nr; + subreq->io_iter = wreq->io_iter; _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index); - trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, - 
refcount_read(&subreq->ref), - netfs_sreq_trace_new); - trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + stream->sreq_max_len = UINT_MAX; + stream->sreq_max_segs = INT_MAX; switch (stream->source) { case NETFS_UPLOAD_TO_SERVER: netfs_stat(&netfs_n_wh_upload); - subreq->max_len = wreq->wsize; + stream->sreq_max_len = wreq->wsize; break; case NETFS_WRITE_TO_CACHE: netfs_stat(&netfs_n_wh_write); @@ -190,7 +188,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, * the list. The collector only goes nextwards and uses the lock to * remove entries off of the front. */ - spin_lock(&wreq->lock); + spin_lock_bh(&wreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { stream->front = subreq; @@ -201,7 +199,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, } } - spin_unlock(&wreq->lock); + spin_unlock_bh(&wreq->lock); stream->construct = subreq; } @@ -221,41 +219,34 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream, if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) return netfs_write_subrequest_terminated(subreq, subreq->error, false); - // TODO: Use encrypted buffer - if (test_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags)) { - subreq->io_iter = wreq->io_iter; - iov_iter_advance(&subreq->io_iter, - subreq->start + subreq->transferred - wreq->start); - iov_iter_truncate(&subreq->io_iter, - subreq->len - subreq->transferred); - } else { - iov_iter_xarray(&subreq->io_iter, ITER_SOURCE, &wreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - } - trace_netfs_sreq(subreq, netfs_sreq_trace_submit); stream->issue_write(subreq); } void netfs_reissue_write(struct netfs_io_stream *stream, - struct netfs_io_subrequest *subreq) + struct netfs_io_subrequest *subreq, + struct iov_iter *source) { + size_t size = subreq->len - subreq->transferred; + + // TODO: Use encrypted buffer + subreq->io_iter = *source; + iov_iter_advance(source, size); + iov_iter_truncate(&subreq->io_iter, size); + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); netfs_do_issue_write(stream, subreq); } -static void netfs_issue_write(struct netfs_io_request *wreq, - struct netfs_io_stream *stream) +void netfs_issue_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream) { struct netfs_io_subrequest *subreq = stream->construct; if (!subreq) return; stream->construct = NULL; - - if (subreq->start + subreq->len > wreq->start + wreq->submitted) - WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start); + subreq->io_iter.count = subreq->len; netfs_do_issue_write(stream, subreq); } @@ -288,13 +279,14 @@ int netfs_advance_write(struct netfs_io_request *wreq, netfs_prepare_write(wreq, stream, start); subreq = stream->construct; - part = min(subreq->max_len - subreq->len, len); - _debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len); + part = umin(stream->sreq_max_len - subreq->len, len); + _debug("part %zx/%zx %zx/%zx", subreq->len, stream->sreq_max_len, part, len); subreq->len += part; subreq->nr_segs++; + stream->submit_extendable_to -= part; - if (subreq->len >= subreq->max_len || - subreq->nr_segs >= subreq->max_nr_segs || + if (subreq->len >= stream->sreq_max_len || + subreq->nr_segs >= stream->sreq_max_segs || to_eof) { netfs_issue_write(wreq, stream); subreq = NULL; @@ -408,19 +400,26 @@ static int netfs_write_folio(struct netfs_io_request *wreq, folio_unlock(folio); if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) { - if 
(!fscache_resources_valid(&wreq->cache_resources)) { + if (!cache->avail) { trace_netfs_folio(folio, netfs_folio_trace_cancel_copy); netfs_issue_write(wreq, upload); netfs_folio_written_back(folio); return 0; } trace_netfs_folio(folio, netfs_folio_trace_store_copy); + } else if (!upload->avail && !cache->avail) { + trace_netfs_folio(folio, netfs_folio_trace_cancel_store); + netfs_folio_written_back(folio); + return 0; } else if (!upload->construct) { trace_netfs_folio(folio, netfs_folio_trace_store); } else { trace_netfs_folio(folio, netfs_folio_trace_store_plus); } + /* Attach the folio to the rolling buffer. */ + netfs_buffer_append_folio(wreq, folio, false); + /* Move the submission point forward to allow for write-streaming data * not starting at the front of the page. We don't do write-streaming * with the cache as the cache requires DIO alignment. @@ -430,7 +429,6 @@ static int netfs_write_folio(struct netfs_io_request *wreq, */ for (int s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; - stream->submit_max_len = fsize; stream->submit_off = foff; stream->submit_len = flen; if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) || @@ -438,7 +436,6 @@ static int netfs_write_folio(struct netfs_io_request *wreq, fgroup == NETFS_FOLIO_COPY_TO_CACHE)) { stream->submit_off = UINT_MAX; stream->submit_len = 0; - stream->submit_max_len = 0; } } @@ -465,12 +462,13 @@ static int netfs_write_folio(struct netfs_io_request *wreq, if (choose_s < 0) break; stream = &wreq->io_streams[choose_s]; + wreq->io_iter.iov_offset = stream->submit_off; + atomic64_set(&wreq->issued_to, fpos + stream->submit_off); + stream->submit_extendable_to = fsize - stream->submit_off; part = netfs_advance_write(wreq, stream, fpos + stream->submit_off, stream->submit_len, to_eof); - atomic64_set(&wreq->issued_to, fpos + stream->submit_off); stream->submit_off += part; - stream->submit_max_len -= part; if (part > stream->submit_len) stream->submit_len = 0; else @@ -479,6 +477,8 @@ static int netfs_write_folio(struct netfs_io_request *wreq, debug = true; } + wreq->io_iter.iov_offset = 0; + iov_iter_advance(&wreq->io_iter, fsize); atomic64_set(&wreq->issued_to, fpos + fsize); if (!debug) @@ -503,10 +503,14 @@ int netfs_writepages(struct address_space *mapping, struct folio *folio; int error = 0; - if (wbc->sync_mode == WB_SYNC_ALL) + if (!mutex_trylock(&ictx->wb_lock)) { + if (wbc->sync_mode == WB_SYNC_NONE) { + netfs_stat(&netfs_n_wb_lock_skip); + return 0; + } + netfs_stat(&netfs_n_wb_lock_wait); mutex_lock(&ictx->wb_lock); - else if (!mutex_trylock(&ictx->wb_lock)) - return 0; + } /* Need the first folio to be able to set up the op. */ folio = writeback_iter(mapping, wbc, NULL, &error); @@ -523,10 +527,10 @@ int netfs_writepages(struct address_space *mapping, netfs_stat(&netfs_n_wh_writepages); do { - _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted); + _debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to)); /* It appears we don't have to handle cyclic writeback wrapping. 
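* The WARN_ON_ONCE() below asserts that assumption: writeback_iter() is * expected to hand back folios at ascending positions, so issued_to only * ever moves forward.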
*/ - WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted); + WARN_ON_ONCE(wreq && folio_pos(folio) < atomic64_read(&wreq->issued_to)); if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE && unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) { @@ -670,6 +674,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t part = netfs_advance_write(wreq, upload, start, len, false); start += part; len -= part; + iov_iter_advance(&wreq->io_iter, part); if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause); wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 29c49a7e5fe1..6df77f008d3f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -118,7 +118,9 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) if (likely(attrlen > 0)) bitmap[0] = ntohl(*p++); if (attrlen > 1) - bitmap[1] = ntohl(*p); + bitmap[1] = ntohl(*p++); + if (attrlen > 2) + bitmap[2] = ntohl(*p); return 0; } @@ -446,7 +448,7 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, void *argp) { struct cb_recallanyargs *args = argp; - uint32_t bitmap[2]; + uint32_t bitmap[3]; __be32 *p, status; p = xdr_inline_decode(xdr, 4); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d5edb3b3eeef..20cb2008f9e4 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -647,6 +647,9 @@ restart: prev = delegation; continue; } + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; if (prev) { struct inode *tmp = nfs_delegation_grab_inode(prev); @@ -657,12 +660,6 @@ restart: } } - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - iput(to_put); - goto restart; - } delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); @@ -1184,7 +1181,6 @@ static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server, struct inode *inode; restart: rcu_read_lock(); -restart_locked: list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags) || @@ -1195,7 +1191,7 @@ restart_locked: continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) - goto restart_locked; + continue; delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); if (delegation != NULL) { @@ -1318,7 +1314,6 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server, restart: rcu_read_lock(); -restart_locked: list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags) || @@ -1330,7 +1325,7 @@ restart_locked: continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) - goto restart_locked; + continue; spin_lock(&delegation->lock); cred = get_cred_rcu(delegation->cred); nfs4_stateid_copy(&stateid, &delegation->stateid); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 61a8cdb9f1e1..6800ee92d742 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -336,7 +336,7 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, * increment the page use counts until he is done with the page. 
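* (nfs_write_begin() below returns that locked folio to the caller through * *foliop.)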
*/ static int nfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { fgf_t fgp = FGP_WRITEBEGIN; @@ -353,7 +353,7 @@ start: mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; ret = nfs_flush_incompatible(file, folio); if (ret) { @@ -372,10 +372,9 @@ start: static int nfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct folio *folio = page_folio(page); unsigned offset = offset_in_folio(folio, pos); int status; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 7202ce84d0eb..810269ee0a50 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -265,6 +265,9 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi { rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + rreq->io_streams[0].sreq_max_len = NFS_SB(rreq->inode->i_sb)->rsize; return 0; } @@ -286,14 +289,6 @@ static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sre return netfs; } -static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq) -{ - size_t rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize; - - sreq->len = min(sreq->len, rsize); - return true; -} - static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) { struct nfs_netfs_io_data *netfs; @@ -302,17 +297,18 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) struct nfs_open_context *ctx = sreq->rreq->netfs_priv; struct page *page; unsigned long idx; + pgoff_t start, last; int err; - pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT; - pgoff_t last = ((sreq->start + sreq->len - - sreq->transferred - 1) >> PAGE_SHIFT); + + start = (sreq->start + sreq->transferred) >> PAGE_SHIFT; + last = ((sreq->start + sreq->len - sreq->transferred - 1) >> PAGE_SHIFT); nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); netfs = nfs_netfs_alloc(sreq); if (!netfs) - return netfs_subreq_terminated(sreq, -ENOMEM, false); + return netfs_read_subreq_terminated(sreq, -ENOMEM, false); pgio.pg_netfs = netfs; /* used in completion */ @@ -361,7 +357,8 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) return; sreq = netfs->sreq; - if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) + if (test_bit(NFS_IOHDR_EOF, &hdr->flags) && + sreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags); if (hdr->error) @@ -377,5 +374,4 @@ const struct netfs_request_ops nfs_netfs_ops = { .init_request = nfs_netfs_init_request, .free_request = nfs_netfs_free_request, .issue_read = nfs_netfs_issue_read, - .clamp_length = nfs_netfs_clamp_length }; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index fbed0027996f..772d485e96d3 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -60,8 +60,6 @@ static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs) static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) { - ssize_t final_len; - /* Only the last RPC completion should call netfs_subreq_terminated() */ if (!refcount_dec_and_test(&netfs->refcount)) return; @@ -74,15 
+72,14 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) * Correct the final length here to be no larger than the netfs subrequest * length, and thus avoid netfs's "Subreq overread" warning message. */ - final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred)); - netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false); + netfs->sreq->transferred = min_t(s64, netfs->sreq->len, + atomic64_read(&netfs->transferred)); + netfs_read_subreq_terminated(netfs->sreq, netfs->error, false); kfree(netfs); } static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) { netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &nfsi->netfs.flags); } extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8883016c551c..b8ffbe52ba15 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3931,7 +3931,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_CASE_INSENSITIVE | FATTR4_WORD0_CASE_PRESERVING; if (minorversion) - bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT | + FATTR4_WORD2_OPEN_ARGUMENTS; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { @@ -9997,6 +9998,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) fallthrough; default: task->tk_status = 0; + lrp->res.lrs_present = 0; fallthrough; case 0: break; @@ -10010,9 +10012,11 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) task->tk_status = 0; break; case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) - break; - goto out_restart; + if (nfs4_async_handle_error(task, server, NULL, NULL) == + -EAGAIN) + goto out_restart; + lrp->res.lrs_present = 0; + break; } return; out_restart: diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index aa698481bec8..0d16b383a452 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1284,10 +1284,9 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, LIST_HEAD(freeme); spin_lock(&inode->i_lock); - if (!pnfs_layout_is_valid(lo) || - !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + if (!nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) goto out_unlock; - if (stateid) { + if (stateid && pnfs_layout_is_valid(lo)) { u32 seq = be32_to_cpu(arg_stateid->seqid); pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index cbbd4866b0b7..97b386032b71 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -47,6 +47,7 @@ #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> +#include <linux/sched.h> #include <linux/slab.h> #include <net/ipv6.h> #include <linux/netdevice.h> @@ -228,6 +229,7 @@ static int __nfs_list_for_each_server(struct list_head *head, ret = fn(server, data); if (ret) goto out; + cond_resched(); rcu_read_lock(); } rcu_read_unlock(); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a20c2c9d7d45..a366fb1c1b9b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2789,15 +2789,18 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", deny & NFS4_SHARE_ACCESS_WRITE ? 
"w" : "-"); - spin_lock(&nf->fi_lock); - file = find_any_file_locked(nf); - if (file) { - nfs4_show_superblock(s, file); - seq_puts(s, ", "); - nfs4_show_fname(s, file); - seq_puts(s, ", "); - } - spin_unlock(&nf->fi_lock); + if (nf) { + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } + spin_unlock(&nf->fi_lock); + } else + seq_puts(s, "closed, "); nfs4_show_owner(s, oo); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); @@ -3075,9 +3078,9 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb) struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); - nfs4_put_stid(&dp->dl_stid); clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags); wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY); + nfs4_put_stid(&dp->dl_stid); } static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { @@ -8812,7 +8815,7 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, /** * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context - * @inode: file to be checked for a conflict + * @dentry: dentry of inode to be checked for a conflict * @modified: return true if file was modified * @size: new size of file if modified is true * @@ -8827,16 +8830,16 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * code is returned. */ __be32 -nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, bool *modified, u64 *size) { __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file_lock_context *ctx; struct file_lease *fl; - struct nfs4_delegation *dp; struct iattr attrs; struct nfs4_cb_fattr *ncf; + struct inode *inode = d_inode(dentry); *modified = false; ctx = locks_inode_context(inode); @@ -8856,17 +8859,26 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, */ if (type == F_RDLCK) break; - goto break_lease; + + nfsd_stats_wdeleg_getattr_inc(nn); + spin_unlock(&ctx->flc_lock); + + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + return 0; } if (type == F_WRLCK) { - dp = fl->c.flc_owner; + struct nfs4_delegation *dp = fl->c.flc_owner; + if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { spin_unlock(&ctx->flc_lock); return 0; } -break_lease: nfsd_stats_wdeleg_getattr_inc(nn); dp = fl->c.flc_owner; + refcount_inc(&dp->dl_stid.sc_count); ncf = &dp->dl_cb_fattr; nfs4_cb_getattr(&dp->dl_cb_fattr); spin_unlock(&ctx->flc_lock); @@ -8876,27 +8888,37 @@ break_lease: /* Recall delegation only if client didn't respond */ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (status != nfserr_jukebox || - !nfsd_wait_for_delegreturn(rqstp, inode)) + !nfsd_wait_for_delegreturn(rqstp, inode)) { + nfs4_put_stid(&dp->dl_stid); return status; + } } if (!ncf->ncf_file_modified && (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) ncf->ncf_file_modified = true; if (ncf->ncf_file_modified) { + int err; + /* * Per section 10.4.3 of RFC 8881, the server would * not update the file's metadata with the client's * modified size */ attrs.ia_mtime = attrs.ia_ctime = current_time(inode); - attrs.ia_valid = ATTR_MTIME | ATTR_CTIME; - setattr_copy(&nop_mnt_idmap, inode, &attrs); - 
mark_inode_dirty(inode); + attrs.ia_valid = ATTR_MTIME | ATTR_CTIME | ATTR_DELEG; + inode_lock(inode); + err = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL); + inode_unlock(inode); + if (err) { + nfs4_put_stid(&dp->dl_stid); + return nfserrno(err); + } ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; *size = ncf->ncf_cur_fsize; *modified = true; } + nfs4_put_stid(&dp->dl_stid); return 0; } break; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 42b41d55d4ed..97f583777972 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3545,6 +3545,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.dentry = dentry; args.ignore_crossmnt = (ignore_crossmnt != 0); args.acl = NULL; +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + args.context = NULL; +#endif /* * Make a local copy of the attribute bitmap that can be modified. @@ -3562,7 +3565,7 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, } args.size = 0; if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { - status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), + status = nfsd4_deleg_getattr_conflict(rqstp, dentry, &file_modified, &size); if (status) goto out; @@ -3617,7 +3620,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.contextsupport = false; #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - args.context = NULL; if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { if (exp->ex_flags & NFSEXP_SECURITY_LABEL) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 9e0ea6fc2aa3..34eb2c2cbcde 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2069,8 +2069,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) continue; } - ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa, - SVC_SOCK_ANONYMOUS, + ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa, 0, get_current_cred()); /* always save the latest error */ if (ret < 0) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ffc217099d19..ec4559ecd193 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -781,5 +781,5 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) } extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, - struct inode *inode, bool *file_modified, u64 *size); + struct dentry *dentry, bool *file_modified, u64 *size); #endif /* NFSD4_STATE_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 4a29b0138d75..4b3e19d74925 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -83,7 +83,7 @@ static int nilfs_prepare_chunk(struct folio *folio, unsigned int from, { loff_t pos = folio_pos(folio) + from; - return __block_write_begin(&folio->page, pos, to - from, nilfs_get_block); + return __block_write_begin(folio, pos, to - from, nilfs_get_block); } static void nilfs_commit_chunk(struct folio *folio, @@ -96,7 +96,7 @@ static void nilfs_commit_chunk(struct folio *folio, int err; nr_dirty = nilfs_page_count_clean_buffers(&folio->page, from, to); - copied = block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); + copied = block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); if (IS_DIRSYNC(dir)) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 7340a01d80e1..8661f452dba6 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -250,7 +250,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to) static int nilfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, 
- struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; @@ -259,7 +259,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping, if (unlikely(err)) return err; - err = block_write_begin(mapping, pos, len, pagep, nilfs_get_block); + err = block_write_begin(mapping, pos, len, foliop, nilfs_get_block); if (unlikely(err)) { nilfs_write_failed(mapping, pos + len); nilfs_transaction_abort(inode->i_sb); @@ -269,16 +269,16 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping, static int nilfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; unsigned int start = pos & (PAGE_SIZE - 1); unsigned int nr_dirty; int err; - nr_dirty = nilfs_page_count_clean_buffers(page, start, + nr_dirty = nilfs_page_count_clean_buffers(&folio->page, start, start + copied); - copied = generic_write_end(file, mapping, pos, len, copied, page, + copied = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); nilfs_set_file_dirty(inode, nr_dirty); err = nilfs_transaction_commit(inode->i_sb); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index b638dc06df2f..ec61ce9f29a2 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -498,7 +498,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, struct inode *inode; struct nilfs_recovery_block *rb, *n; unsigned int blocksize = nilfs->ns_blocksize; - struct page *page; + struct folio *folio; loff_t pos; int err = 0, err2 = 0; @@ -512,7 +512,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, pos = rb->blkoff << inode->i_blkbits; err = block_write_begin(inode->i_mapping, pos, blocksize, - &page, nilfs_get_block); + &folio, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; @@ -522,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_inode; } - err = nilfs_recovery_copy_block(nilfs, rb, pos, page); + err = nilfs_recovery_copy_block(nilfs, rb, pos, &folio->page); if (unlikely(err)) goto failed_page; @@ -531,17 +531,17 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_page; block_write_end(NULL, inode->i_mapping, pos, blocksize, - blocksize, page, NULL); + blocksize, folio, NULL); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); (*nr_salvaged_blocks)++; goto next; failed_page: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); failed_inode: nilfs_warn(sb, @@ -716,6 +716,33 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, } /** + * nilfs_abort_roll_forward - cleaning up after a failed rollforward recovery + * @nilfs: nilfs object + */ +static void nilfs_abort_roll_forward(struct the_nilfs *nilfs) +{ + struct nilfs_inode_info *ii, *n; + LIST_HEAD(head); + + /* Abandon inodes that have read recovery data */ + spin_lock(&nilfs->ns_inode_lock); + list_splice_init(&nilfs->ns_dirty_files, &head); + spin_unlock(&nilfs->ns_inode_lock); + if (list_empty(&head)) + return; + + set_nilfs_purging(nilfs); + list_for_each_entry_safe(ii, n, &head, i_dirty) { + spin_lock(&nilfs->ns_inode_lock); + list_del_init(&ii->i_dirty); + spin_unlock(&nilfs->ns_inode_lock); + + iput(&ii->vfs_inode); + } + clear_nilfs_purging(nilfs); +} + +/** * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint * 
@nilfs: nilfs object * @sb: super block instance @@ -773,15 +800,19 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, if (unlikely(err)) { nilfs_err(sb, "error %d writing segment for recovery", err); - goto failed; + goto put_root; } nilfs_finish_roll_forward(nilfs, ri); } - failed: +put_root: nilfs_put_root(root); return err; + +failed: + nilfs_abort_roll_forward(nilfs); + goto put_root; } /** diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0ca3110d6386..871ec35ea8e8 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1812,6 +1812,9 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, nilfs_abort_logs(&logs, ret ? : err); list_splice_tail_init(&sci->sc_segbufs, &logs); + if (list_empty(&logs)) + return; /* if the first segment buffer preparation failed */ + nilfs_cancel_segusage(&logs, nilfs->ns_sufile); nilfs_free_incomplete_logs(&logs, nilfs); @@ -2056,7 +2059,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) err = nilfs_segctor_begin_construction(sci, nilfs); if (unlikely(err)) - goto out; + goto failed; /* Update time stamp */ sci->sc_seg_ctime = ktime_get_real_seconds(); @@ -2120,10 +2123,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) return err; failed_to_write: - if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) - nilfs_redirty_inodes(&sci->sc_dirty_files); - failed: + if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_IFILE) + nilfs_redirty_inodes(&sci->sc_dirty_files); if (nilfs_doing_gc()) nilfs_redirty_inodes(&sci->sc_gc_inodes); nilfs_segctor_abort_construction(sci, nilfs, err); diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index a5569b7f47a3..14868a3dd592 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -836,9 +836,15 @@ ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; - u32 major = le32_to_cpu(sbp[0]->s_rev_level); - u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level); + struct nilfs_super_block *raw_sb; + u32 major; + u16 minor; + + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + major = le32_to_cpu(raw_sb->s_rev_level); + minor = le16_to_cpu(raw_sb->s_minor_rev_level); + up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%d.%d\n", major, minor); } @@ -856,8 +862,13 @@ ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; - u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size); + struct nilfs_super_block *raw_sb; + u64 dev_size; + + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + dev_size = le64_to_cpu(raw_sb->s_dev_size); + up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%llu\n", dev_size); } @@ -879,9 +890,15 @@ ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct nilfs_super_block *raw_sb; + ssize_t len; - return sysfs_emit(buf, "%pUb\n", sbp[0]->s_uuid); + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + len = sysfs_emit(buf, "%pUb\n", raw_sb->s_uuid); + up_read(&nilfs->ns_sem); + + return len; } static @@ -889,10 +906,16 @@ ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct nilfs_super_block *raw_sb; + ssize_t len; + + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + len = scnprintf(buf, 
sizeof(raw_sb->s_volume_name), "%s\n", + raw_sb->s_volume_name); + up_read(&nilfs->ns_sem); - return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n", - sbp[0]->s_volume_name); + return len; } static const char dev_readme_str[] = diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index f3669403fabf..46440fbb8662 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -110,7 +110,7 @@ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask, prev = &dn->dn_next; continue; } - fown = &dn->dn_filp->f_owner; + fown = file_f_owner(dn->dn_filp); send_sigio(fown, dn->dn_fd, POLL_MSG); if (dn->dn_mask & FS_DN_MULTISHOT) prev = &dn->dn_next; @@ -316,6 +316,10 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) goto out_err; } + error = file_f_owner_allocate(filp); + if (error) + goto out_err; + /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; fsnotify_init_mark(new_fsn_mark, dnotify_group); diff --git a/fs/nsfs.c b/fs/nsfs.c index 97c37a9631e5..67ee176b8824 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -12,6 +12,7 @@ #include <linux/user_namespace.h> #include <linux/nsfs.h> #include <linux/uaccess.h> +#include <linux/mnt_namespace.h> #include "mount.h" #include "internal.h" @@ -128,6 +129,30 @@ int open_related_ns(struct ns_common *ns, } EXPORT_SYMBOL_GPL(open_related_ns); +static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns, + struct mnt_ns_info __user *uinfo, size_t usize, + struct mnt_ns_info *kinfo) +{ + /* + * If userspace and the kernel have the same struct size it can just + * be copied. If userspace provides an older struct, only the bits that + * userspace knows about will be copied. If userspace provides a new + * struct, only the bits that the kernel knows about will be copied and + * the size value will be set to the size the kernel knows about. + */ + kinfo->size = min(usize, sizeof(*kinfo)); + kinfo->mnt_ns_id = mnt_ns->seq; + kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts); + /* Subtract the root mount of the mount namespace.
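The root mount is included in mnt_ns->nr_mounts but is not itself an entry userspace can iterate, hence the decrement just below.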
*/ + if (kinfo->nr_mounts) + kinfo->nr_mounts--; + + if (copy_to_user(uinfo, kinfo, kinfo->size)) + return -EFAULT; + + return 0; +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -135,6 +160,8 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct pid_namespace *pid_ns; struct task_struct *tsk; struct ns_common *ns = get_proc_ns(file_inode(filp)); + struct mnt_namespace *mnt_ns; + bool previous = false; uid_t __user *argp; uid_t uid; int ret; @@ -156,7 +183,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, uid = from_kuid_munged(current_user_ns(), user_ns->owner); return put_user(uid, argp); case NS_GET_MNTNS_ID: { - struct mnt_namespace *mnt_ns; __u64 __user *idp; __u64 id; @@ -211,7 +237,79 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (!ret) ret = -ESRCH; - break; + return ret; + } + } + + /* extensible ioctls */ + switch (_IOC_NR(ioctl)) { + case _IOC_NR(NS_MNT_GET_INFO): { + struct mnt_ns_info kinfo = {}; + struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; + size_t usize = _IOC_SIZE(ioctl); + + if (ns->ops->type != CLONE_NEWNS) + return -EINVAL; + + if (!uinfo) + return -EINVAL; + + if (usize < MNT_NS_INFO_SIZE_VER0) + return -EINVAL; + + return copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo); + } + case _IOC_NR(NS_MNT_GET_PREV): + previous = true; + fallthrough; + case _IOC_NR(NS_MNT_GET_NEXT): { + struct mnt_ns_info kinfo = {}; + struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; + struct path path __free(path_put) = {}; + struct file *f __free(fput) = NULL; + size_t usize = _IOC_SIZE(ioctl); + + if (ns->ops->type != CLONE_NEWNS) + return -EINVAL; + + if (usize < MNT_NS_INFO_SIZE_VER0) + return -EINVAL; + + if (previous) + mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns)); + else + mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns)); + if (IS_ERR(mnt_ns)) + return PTR_ERR(mnt_ns); + + ns = to_ns_common(mnt_ns); + /* Transfer ownership of @mnt_ns reference to @path. */ + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (ret) + return ret; + + CLASS(get_unused_fd, fd)(O_CLOEXEC); + if (fd < 0) + return fd; + + f = dentry_open(&path, O_RDONLY, current_cred()); + if (IS_ERR(f)) + return PTR_ERR(f); + + if (uinfo) { + /* + * If @uinfo is passed return all information about the + * mount namespace as well. + */ + ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo); + if (ret) + return ret; + } + + /* Transfer reference of @f to caller's fdtable. */ + fd_install(fd, no_free_ptr(f)); + /* File descriptor is live so hand it off to the caller. */ + return take_fd(fd); } default: ret = -ENOTTY; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index ca1ddc46bd86..6202895a4542 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -182,7 +182,7 @@ static int ntfs_extend_initialized_size(struct file *file, for (;;) { u32 zerofrom, len; - struct page *page; + struct folio *folio; u8 bits; CLST vcn, lcn, clen; @@ -208,14 +208,13 @@ static int ntfs_extend_initialized_size(struct file *file, if (pos + len > new_valid) len = new_valid - pos; - err = ntfs_write_begin(file, mapping, pos, len, &page, NULL); + err = ntfs_write_begin(file, mapping, pos, len, &folio, NULL); if (err) goto out; - zero_user_segment(page, zerofrom, PAGE_SIZE); + folio_zero_range(folio, zerofrom, folio_size(folio)); - /* This function in any case puts page. 
*/ - err = ntfs_write_end(file, mapping, pos, len, len, page, NULL); + err = ntfs_write_end(file, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; pos += len; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 6b0bdc474e76..f672072e6bd4 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -901,7 +901,7 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, } int ntfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, struct page **pagep, void **fsdata) + loff_t pos, u32 len, struct folio **foliop, void **fsdata) { int err; struct inode *inode = mapping->host; @@ -910,7 +910,6 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; - *pagep = NULL; if (is_resident(ni)) { struct folio *folio = __filemap_get_folio( mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN, @@ -926,7 +925,7 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, ni_unlock(ni); if (!err) { - *pagep = &folio->page; + *foliop = folio; goto out; } folio_unlock(folio); @@ -936,7 +935,7 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, goto out; } - err = block_write_begin(mapping, pos, len, pagep, + err = block_write_begin(mapping, pos, len, foliop, ntfs_get_block_write_begin); out: @@ -947,9 +946,8 @@ out: * ntfs_write_end - Address_space_operations::write_end. */ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, - u32 len, u32 copied, struct page *page, void *fsdata) + u32 len, u32 copied, struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); u64 valid = ni->i_valid; @@ -979,7 +977,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, folio_unlock(folio); folio_put(folio); } else { - err = generic_write_end(file, mapping, pos, len, copied, page, + err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); } @@ -1008,45 +1006,6 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, return err; } -int reset_log_file(struct inode *inode) -{ - int err; - loff_t pos = 0; - u32 log_size = inode->i_size; - struct address_space *mapping = inode->i_mapping; - - for (;;) { - u32 len; - void *kaddr; - struct page *page; - - len = pos + PAGE_SIZE > log_size ? 
(log_size - pos) : PAGE_SIZE; - - err = block_write_begin(mapping, pos, len, &page, - ntfs_get_block_write_begin); - if (err) - goto out; - - kaddr = kmap_atomic(page); - memset(kaddr, -1, len); - kunmap_atomic(kaddr); - flush_dcache_page(page); - - err = block_write_end(NULL, mapping, pos, len, len, page, NULL); - if (err < 0) - goto out; - pos += len; - - if (pos >= log_size) - break; - balance_dirty_pages_ratelimited(mapping); - } -out: - mark_inode_dirty_sync(inode); - - return err; -} - int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc) { return _ni_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index e5255a251929..584f814715f4 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -708,13 +708,12 @@ int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi, struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, const struct cpu_str *name); int ntfs_set_size(struct inode *inode, u64 new_size); -int reset_log_file(struct inode *inode); int ntfs_get_block(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create); int ntfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, struct page **pagep, void **fsdata); + loff_t pos, u32 len, struct folio **foliop, void **fsdata); int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, - u32 len, u32 copied, struct page *page, void *fsdata); + u32 len, u32 copied, struct folio *folio, void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); int ntfs_sync_inode(struct inode *inode); int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 6be175a1ab3c..d6c985cc6353 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1643,7 +1643,7 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, ocfs2_write_type_t type, - struct page **pagep, void **fsdata, + struct folio **foliop, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) { int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; @@ -1826,8 +1826,8 @@ try_again: ocfs2_free_alloc_context(meta_ac); success: - if (pagep) - *pagep = wc->w_target_page; + if (foliop) + *foliop = page_folio(wc->w_target_page); *fsdata = wc; return 0; out_quota: @@ -1879,7 +1879,7 @@ out: static int ocfs2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; struct buffer_head *di_bh = NULL; @@ -1901,7 +1901,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, down_write(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER, - pagep, fsdata, di_bh, NULL); + foliop, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); goto out_fail; @@ -2076,7 +2076,7 @@ out: static int ocfs2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { int ret; struct inode *inode = mapping->host; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 3a520117fa59..45db1781ea73 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -38,7 +38,7 @@ typedef enum { int ocfs2_write_begin_nolock(struct address_space *mapping, 
loff_t pos, unsigned len, ocfs2_write_type_t type, - struct page **pagep, void **fsdata, + struct folio **foliop, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page); int ocfs2_read_inline_data(struct inode *inode, struct page *page, diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f0beb173dbba..ccef3f42b333 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1932,6 +1932,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) { int error = 0; struct inode *inode = file_inode(file); + struct ocfs2_file_private *fp = file->private_data; int lock_level = 0; trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1952,7 +1953,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) goto bail_nolock; } - error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false); + error = ocfs2_dir_foreach_blk(inode, &fp->cookie, ctx, false); ocfs2_inode_unlock(inode, lock_level); if (error) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ccc57038a977..ad131a2fc58e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -755,7 +755,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, u64 abs_to, struct buffer_head *di_bh) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; unsigned long index = abs_from >> PAGE_SHIFT; handle_t *handle; int ret = 0; @@ -774,9 +774,10 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, goto out; } - page = find_or_create_page(mapping, index, GFP_NOFS); - if (!page) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); mlog_errno(ret); goto out_commit_trans; } @@ -803,7 +804,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, * __block_write_begin and block_commit_write to zero the * whole block. */ - ret = __block_write_begin(page, block_start + 1, 0, + ret = __block_write_begin(folio, block_start + 1, 0, ocfs2_get_block); if (ret < 0) { mlog_errno(ret); @@ -812,7 +813,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* must not update i_size! */ - block_commit_write(page, block_start + 1, block_start + 1); + block_commit_write(&folio->page, block_start + 1, block_start + 1); } /* @@ -833,8 +834,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, } out_unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out_commit_trans: if (handle) ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); @@ -2750,6 +2751,13 @@ out_unlock: return remapped > 0 ? remapped : ret; } +static loff_t ocfs2_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct ocfs2_file_private *fp = file->private_data; + + return generic_llseek_cookie(file, offset, whence, &fp->cookie); +} + const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, @@ -2797,7 +2805,7 @@ const struct file_operations ocfs2_fops = { WRAP_DIR_ITER(ocfs2_readdir) // FIXME! 
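/*
 * The cookie-based seek used above, reduced to its core (hypothetical
 * foo_* names, shown for illustration only): a per-open u64 stashed in
 * file->private_data is handed to generic_llseek_cookie() in place of the
 * old generic_file_llseek() call, so directory seeks no longer depend on
 * shared per-file state:
 *
 *	static loff_t foo_dir_llseek(struct file *file, loff_t offset, int whence)
 *	{
 *		struct foo_file_private *fp = file->private_data;
 *
 *		return generic_llseek_cookie(file, offset, whence, &fp->cookie);
 *	}
 */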
const struct file_operations ocfs2_dops = { - .llseek = generic_file_llseek, + .llseek = ocfs2_dir_llseek, .read = generic_read_dir, .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, @@ -2843,7 +2851,7 @@ const struct file_operations ocfs2_fops_no_plocks = { }; const struct file_operations ocfs2_dops_no_plocks = { - .llseek = generic_file_llseek, + .llseek = ocfs2_dir_llseek, .read = generic_read_dir, .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 8e53e4ac1120..41e65e45a9f3 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -20,6 +20,7 @@ struct ocfs2_alloc_context; enum ocfs2_alloc_restarted; struct ocfs2_file_private { + u64 cookie; struct file *fp_file; struct mutex fp_mutex; struct ocfs2_lock_res fp_flock; diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 1834f26522ed..6ef4cb045ccd 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -53,7 +53,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, loff_t pos = page_offset(page); unsigned int len = PAGE_SIZE; pgoff_t last_index; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; void *fsdata; loff_t size = i_size_read(inode); @@ -91,7 +91,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, len = ((size - 1) & ~PAGE_MASK) + 1; err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, - &locked_page, &fsdata, di_bh, page); + &locked_folio, &fsdata, di_bh, page); if (err) { if (err != -ENOSPC) mlog_errno(err); @@ -99,7 +99,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, goto out; } - if (!locked_page) { + if (!locked_folio) { ret = VM_FAULT_NOPAGE; goto out; } diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 6b580b9da8e3..98358d405b6a 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -312,11 +312,11 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to) static int omfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, omfs_get_block); + ret = block_write_begin(mapping, pos, len, foliop, omfs_get_block); if (unlikely(ret)) omfs_write_failed(mapping, pos + len); diff --git a/fs/open.c b/fs/open.c index 22adbef7ecc2..daf1b55ca818 100644 --- a/fs/open.c +++ b/fs/open.c @@ -252,40 +252,39 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (offset < 0 || len <= 0) return -EINVAL; - /* Return error if mode is not supported */ - if (mode & ~FALLOC_FL_SUPPORTED_MASK) + if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; - /* Punch hole and zero range are mutually exclusive */ - if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) == - (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) - return -EOPNOTSUPP; - - /* Punch hole must have keep size set */ - if ((mode & FALLOC_FL_PUNCH_HOLE) && - !(mode & FALLOC_FL_KEEP_SIZE)) + /* + * Modes are exclusive, even if that is not obvious from the encoding + * as bit masks and the mix with the flag in the same namespace. + * + * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is + * encoded as no bit set. 
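+ * + * For example: a mode of 0 requests plain preallocation + * (FALLOC_FL_ALLOCATE_RANGE), FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE + * punches a hole, and FALLOC_FL_PUNCH_HOLE without FALLOC_FL_KEEP_SIZE is + * rejected by the switch below.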
+ */ + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_ALLOCATE_RANGE: + case FALLOC_FL_UNSHARE_RANGE: + case FALLOC_FL_ZERO_RANGE: + break; + case FALLOC_FL_PUNCH_HOLE: + if (!(mode & FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + break; + case FALLOC_FL_COLLAPSE_RANGE: + case FALLOC_FL_INSERT_RANGE: + if (mode & FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + break; + default: return -EOPNOTSUPP; - - /* Collapse range should only be used exclusively. */ - if ((mode & FALLOC_FL_COLLAPSE_RANGE) && - (mode & ~FALLOC_FL_COLLAPSE_RANGE)) - return -EINVAL; - - /* Insert range should only be used exclusively. */ - if ((mode & FALLOC_FL_INSERT_RANGE) && - (mode & ~FALLOC_FL_INSERT_RANGE)) - return -EINVAL; - - /* Unshare range should only be used with allocate mode. */ - if ((mode & FALLOC_FL_UNSHARE_RANGE) && - (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))) - return -EINVAL; + } if (!(file->f_mode & FMODE_WRITE)) return -EBADF; /* - * We can only allow pure fallocate on append only files + * On append-only files only space preallocation is supported. */ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index fdb9b65db1de..aae6d2b8767d 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -309,22 +309,18 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) static int orangefs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct orangefs_write_range *wr; struct folio *folio; - struct page *page; - pgoff_t index; int ret; - index = pos >> PAGE_SHIFT; + folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - - *pagep = page; - folio = page_folio(page); + *foliop = folio; if (folio_test_dirty(folio) && !folio_test_private(folio)) { /* @@ -365,9 +361,10 @@ okay: } static int orangefs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) + loff_t pos, unsigned len, unsigned copied, struct folio *folio, + void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; /* @@ -377,23 +374,23 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping, if (last_pos > inode->i_size) i_size_write(inode, last_pos); - /* zero the stale part of the page if we did a short copy */ - if (!PageUptodate(page)) { + /* zero the stale part of the folio if we did a short copy */ + if (!folio_test_uptodate(folio)) { unsigned from = pos & (PAGE_SIZE - 1); if (copied < len) { - zero_user(page, from + copied, len - copied); + folio_zero_range(folio, from + copied, len - copied); } /* Set fully written pages uptodate. 
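* A write that starts at the folio and either spans a whole page or * reaches i_size leaves no stale bytes behind after the zeroing above, so * the folio contents are fully defined.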
*/ - if (pos == page_offset(page) && + if (pos == folio_pos(folio) && (len == PAGE_SIZE || pos + len == inode->i_size)) { - zero_user_segment(page, from + copied, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, from + copied, PAGE_SIZE); + folio_mark_uptodate(folio); } } - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); mark_inode_dirty_sync(file_inode(file)); return copied; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index a5ef2005a2cc..2ed6ad641a20 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -115,12 +115,12 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct de continue; error = security_inode_copy_up_xattr(old, name); - if (error < 0 && error != -EOPNOTSUPP) - break; - if (error == 1) { + if (error == -ECANCELED) { error = 0; continue; /* Discard */ } + if (error < 0 && error != -EOPNOTSUPP) + break; if (is_posix_acl_xattr(name)) { error = ovl_copy_acl(OVL_FS(sb), oldpath, new, name); @@ -243,8 +243,24 @@ static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen) return 0; } +static int ovl_sync_file(struct path *path) +{ + struct file *new_file; + int err; + + new_file = ovl_path_open(path, O_LARGEFILE | O_RDONLY); + if (IS_ERR(new_file)) + return PTR_ERR(new_file); + + err = vfs_fsync(new_file, 0); + fput(new_file); + + return err; +} + static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, - struct file *new_file, loff_t len) + struct file *new_file, loff_t len, + bool datasync) { struct path datapath; struct file *old_file; @@ -342,7 +358,8 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, len -= bytes; } - if (!error && ovl_should_sync(ofs)) + /* call fsync once, either now or later along with metadata */ + if (!error && ovl_should_sync(ofs) && datasync) error = vfs_fsync(new_file, 0); out_fput: fput(old_file); @@ -574,6 +591,7 @@ struct ovl_copy_up_ctx { bool indexed; bool metacopy; bool metacopy_digest; + bool metadata_fsync; }; static int ovl_link_up(struct ovl_copy_up_ctx *c) @@ -634,7 +652,8 @@ static int ovl_copy_up_data(struct ovl_copy_up_ctx *c, const struct path *temp) if (IS_ERR(new_file)) return PTR_ERR(new_file); - err = ovl_copy_up_file(ofs, c->dentry, new_file, c->stat.size); + err = ovl_copy_up_file(ofs, c->dentry, new_file, c->stat.size, + !c->metadata_fsync); fput(new_file); return err; @@ -701,6 +720,10 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) err = ovl_set_attr(ofs, temp, &c->stat); inode_unlock(temp->d_inode); + /* fsync metadata before moving it into upper dir */ + if (!err && ovl_should_sync(ofs) && c->metadata_fsync) + err = ovl_sync_file(&upperpath); + return err; } @@ -860,7 +883,8 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) temp = tmpfile->f_path.dentry; if (!c->metacopy && c->stat.size) { - err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size); + err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size, + !c->metadata_fsync); if (err) goto out_fput; } @@ -1135,6 +1159,17 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, !kgid_has_mapping(current_user_ns(), ctx.stat.gid)) return -EOVERFLOW; + /* + * With metacopy disabled, we fsync after final metadata copyup, for + * both regular files and directories to get atomic copyup semantics + * on filesystems that do not use strict metadata ordering (e.g. ubifs). 
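+ * Without that fsync a crash after the copied-up file becomes visible + * could expose data or metadata that never reached stable storage.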
+ * + * With metacopy enabled we want to avoid fsync on all meta copyup + * that will hurt performance of workloads such as chown -R, so we + * only fsync on data copyup as legacy behavior. + */ + ctx.metadata_fsync = !OVL_FS(dentry->d_sb)->config.metacopy && + (S_ISREG(ctx.stat.mode) || S_ISDIR(ctx.stat.mode)); ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags); if (parent) { diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 4860fcc4611b..e42546c6c5df 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -353,6 +353,8 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, case Opt_datadir_add: ctx->nr_data++; fallthrough; + case Opt_lowerdir: + fallthrough; case Opt_lowerdir_add: WARN_ON(ctx->nr >= ctx->capacity); l = &ctx->lower[ctx->nr++]; @@ -365,10 +367,9 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, } } -static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, - enum ovl_opt layer) +static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum ovl_opt layer) { - char *name = kstrdup(param->string, GFP_KERNEL); + char *name = kstrdup(layer_name, GFP_KERNEL); bool upper = (layer == Opt_upperdir || layer == Opt_workdir); struct path path; int err; @@ -376,7 +377,7 @@ static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, if (!name) return -ENOMEM; - if (upper) + if (upper || layer == Opt_lowerdir) err = ovl_mount_dir(name, &path); else err = ovl_mount_dir_noesc(name, &path); @@ -432,7 +433,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) { int err; struct ovl_fs_context *ctx = fc->fs_private; - struct ovl_fs_context_layer *l; char *dup = NULL, *iter; ssize_t nr_lower, nr; bool data_layer = false; @@ -449,7 +449,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) return 0; if (*name == ':') { - pr_err("cannot append lower layer"); + pr_err("cannot append lower layer\n"); return -EINVAL; } @@ -472,35 +472,11 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) goto out_err; } - if (nr_lower > ctx->capacity) { - err = -ENOMEM; - l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower), - GFP_KERNEL_ACCOUNT); - if (!l) - goto out_err; - - ctx->lower = l; - ctx->capacity = nr_lower; - } - iter = dup; - l = ctx->lower; - for (nr = 0; nr < nr_lower; nr++, l++) { - ctx->nr++; - memset(l, 0, sizeof(*l)); - - err = ovl_mount_dir(iter, &l->path); - if (err) - goto out_put; - - err = ovl_mount_dir_check(fc, &l->path, Opt_lowerdir, iter, false); + for (nr = 0; nr < nr_lower; nr++) { + err = ovl_parse_layer(fc, iter, Opt_lowerdir); if (err) - goto out_put; - - err = -ENOMEM; - l->name = kstrdup(iter, GFP_KERNEL_ACCOUNT); - if (!l->name) - goto out_put; + goto out_err; if (data_layer) ctx->nr_data++; @@ -517,8 +493,8 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) * there are no data layers. 
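* Once ctx->nr_data is non-zero, any further regular lower dir fails the * parse with the error below.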
*/ if (ctx->nr_data > 0) { - pr_err("regular lower layers cannot follow data lower layers"); - goto out_put; + pr_err("regular lower layers cannot follow data lower layers\n"); + goto out_err; } data_layer = false; @@ -532,9 +508,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) kfree(dup); return 0; -out_put: - ovl_reset_lowerdirs(ctx); - out_err: kfree(dup); @@ -582,7 +555,7 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_datadir_add: case Opt_upperdir: case Opt_workdir: - err = ovl_parse_layer(fc, param, opt); + err = ovl_parse_layer(fc, param->string, opt); break; case Opt_default_permissions: config->default_permissions = true; @@ -782,11 +755,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, { struct ovl_opt_set set = ctx->set; - if (ctx->nr_data > 0 && !config->metacopy) { - pr_err("lower data-only dirs require metacopy support.\n"); - return -EINVAL; - } - /* Workdir/index are useless in non-upper mount */ if (!config->upperdir) { if (config->workdir) { @@ -938,6 +906,39 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, config->metacopy = false; } + /* + * Fail if we don't have trusted xattr capability and a feature was + * explicitly requested that requires them. + */ + if (!config->userxattr && !capable(CAP_SYS_ADMIN)) { + if (set.redirect && + config->redirect_mode != OVL_REDIRECT_NOFOLLOW) { + pr_err("redirect_dir requires permission to access trusted xattrs\n"); + return -EPERM; + } + if (config->metacopy && set.metacopy) { + pr_err("metacopy requires permission to access trusted xattrs\n"); + return -EPERM; + } + if (config->verity_mode) { + pr_err("verity requires permission to access trusted xattrs\n"); + return -EPERM; + } + if (ctx->nr_data > 0) { + pr_err("lower data-only dirs require permission to access trusted xattrs\n"); + return -EPERM; + } + /* + * Other xattr-dependent features should be disabled without + * great disturbance to the user in ovl_make_workdir(). + */ + } + + if (ctx->nr_data > 0 && !config->metacopy) { + pr_err("lower data-only dirs require metacopy support.\n"); + return -EINVAL; + } + return 0; } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 06a231970cb5..fe511192f83c 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -202,15 +202,9 @@ static int ovl_sync_fs(struct super_block *sb, int wait) int ret; ret = ovl_sync_status(ofs); - /* - * We have to always set the err, because the return value isn't - * checked in syncfs, and instead indirectly return an error via - * the sb's writeback errseq, which VFS inspects after this call. 
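With the errseq_set() gone, the error travels back through the ->sync_fs() return value instead; the contract observable from userspace stays the same, as in this minimal sketch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* syncfs(2) reports the EIO that ovl_sync_fs() now returns */
	if (syncfs(fd) < 0)
		perror("syncfs");
	close(fd);
	return 0;
}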
- */ - if (ret < 0) { - errseq_set(&sb->s_wb_err, -EIO); + + if (ret < 0) return -EIO; - } if (!ret) return ret; diff --git a/fs/pipe.c b/fs/pipe.c index 7dff2aa50a6d..4083ba492cb6 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -686,7 +686,7 @@ pipe_poll(struct file *filp, poll_table *wait) if (filp->f_mode & FMODE_READ) { if (!pipe_empty(head, tail)) mask |= EPOLLIN | EPOLLRDNORM; - if (!pipe->writers && filp->f_version != pipe->w_counter) + if (!pipe->writers && filp->f_pipe != pipe->w_counter) mask |= EPOLLHUP; } @@ -945,6 +945,7 @@ int create_pipe_files(struct file **res, int flags) } f->private_data = inode->i_pipe; + f->f_pipe = 0; res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), &pipefifo_fops); @@ -954,6 +955,7 @@ int create_pipe_files(struct file **res, int flags) return PTR_ERR(res[0]); } res[0]->private_data = inode->i_pipe; + res[0]->f_pipe = 0; res[1] = f; stream_open(inode, res[0]); stream_open(inode, res[1]); @@ -1108,7 +1110,7 @@ static int fifo_open(struct inode *inode, struct file *filp) bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; int ret; - filp->f_version = 0; + filp->f_pipe = 0; spin_lock(&inode->i_lock); if (inode->i_pipe) { @@ -1155,7 +1157,7 @@ static int fifo_open(struct inode *inode, struct file *filp) if ((filp->f_flags & O_NONBLOCK)) { /* suppress EPOLLHUP until we have * seen a writer */ - filp->f_version = pipe->w_counter; + filp->f_pipe = pipe->w_counter; } else { if (wait_for_partner(pipe, &pipe->w_counter)) goto err_rd; @@ -1427,7 +1429,7 @@ static const struct super_operations pipefs_ops = { /* * pipefs should _never_ be mounted by userland - too much of security hassle, - * no real gain from having the whole whorehouse mounted. So we don't need + * no real gain from having the whole file system mounted. So we don't need * any operations on the root directory. However, we need a non-trivial * d_name - pipe: will go nicely and kill the special-casing in procfs. */ diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 3f87297dbfdb..6c66a37522d0 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -715,8 +715,8 @@ int posix_acl_update_mode(struct mnt_idmap *idmap, return error; if (error == 0) *acl = NULL; - if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) && - !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) + if (!in_group_or_capable(idmap, inode, + i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; *mode_p = mode; return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 72a1acd03675..e7810f3bd522 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -85,6 +85,7 @@ #include <linux/elf.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> +#include <linux/fs_parser.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> @@ -117,6 +118,40 @@ static u8 nlink_tid __ro_after_init; static u8 nlink_tgid __ro_after_init; +enum proc_mem_force { + PROC_MEM_FORCE_ALWAYS, + PROC_MEM_FORCE_PTRACE, + PROC_MEM_FORCE_NEVER +}; + +static enum proc_mem_force proc_mem_force_override __ro_after_init = + IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER : + IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? 
PROC_MEM_FORCE_PTRACE :
+	PROC_MEM_FORCE_ALWAYS;
+
+static const struct constant_table proc_mem_force_table[] __initconst = {
+	{ "always", PROC_MEM_FORCE_ALWAYS },
+	{ "ptrace", PROC_MEM_FORCE_PTRACE },
+	{ "never", PROC_MEM_FORCE_NEVER },
+	{ }
+};
+
+static int __init early_proc_mem_force_override(char *buf)
+{
+	if (!buf)
+		return -EINVAL;
+
+	/*
+	 * lookup_constant() defaults to proc_mem_force_override to preserve
+	 * the initial Kconfig choice in case an invalid param gets passed.
+	 */
+	proc_mem_force_override = lookup_constant(proc_mem_force_table,
+						  buf, proc_mem_force_override);
+
+	return 0;
+}
+early_param("proc_mem.force_override", early_proc_mem_force_override);
+
 struct pid_entry {
 	const char *name;
 	unsigned int len;
@@ -827,12 +862,31 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
 
 static int mem_open(struct inode *inode, struct file *file)
 {
-	int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
-
-	/* OK to pass negative loff_t, we can catch out-of-range */
-	file->f_mode |= FMODE_UNSIGNED_OFFSET;
+	if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
+		return -EINVAL;
+	return __mem_open(inode, file, PTRACE_MODE_ATTACH);
+}
 
-	return ret;
+static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm)
+{
+	struct task_struct *task;
+	bool ptrace_active = false;
+
+	switch (proc_mem_force_override) {
+	case PROC_MEM_FORCE_NEVER:
+		return false;
+	case PROC_MEM_FORCE_PTRACE:
+		task = get_proc_task(file_inode(file));
+		if (task) {
+			ptrace_active = READ_ONCE(task->ptrace) &&
+					READ_ONCE(task->mm) == mm &&
+					READ_ONCE(task->parent) == current;
+			put_task_struct(task);
+		}
+		return ptrace_active;
+	default:
+		return true;
+	}
 }
 
 static ssize_t mem_rw(struct file *file, char __user *buf,
@@ -855,7 +909,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
 	if (!mmget_not_zero(mm))
 		goto free;
 
-	flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
+	flags = write ?
FOLL_WRITE : 0;
+	if (proc_mem_foll_force(file, mm))
+		flags |= FOLL_FORCE;
 
 	while (count > 0) {
 		size_t this_len = min_t(size_t, count, PAGE_SIZE);
@@ -932,6 +988,7 @@ static const struct file_operations proc_mem_operations = {
 	.write		= mem_write,
 	.open		= mem_open,
 	.release	= mem_release,
+	.fop_flags	= FOP_UNSIGNED_OFFSET,
 };
 
 static int environ_open(struct inode *inode, struct file *file)
@@ -2276,8 +2333,8 @@ proc_map_files_instantiate(struct dentry *dentry,
 	inode->i_op = &proc_map_files_link_inode_operations;
 	inode->i_size = 64;
 
-	d_set_d_op(dentry, &tid_map_files_dentry_operations);
-	return d_splice_alias(inode, dentry);
+	return proc_splice_unmountable(inode, dentry,
+				       &tid_map_files_dentry_operations);
 }
 
 static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -2456,13 +2513,13 @@ static void *timers_start(struct seq_file *m, loff_t *pos)
 	if (!tp->sighand)
 		return ERR_PTR(-ESRCH);
 
-	return seq_list_start(&tp->task->signal->posix_timers, *pos);
+	return seq_hlist_start(&tp->task->signal->posix_timers, *pos);
 }
 
 static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct timers_private *tp = m->private;
 
-	return seq_list_next(v, &tp->task->signal->posix_timers, pos);
+	return seq_hlist_next(v, &tp->task->signal->posix_timers, pos);
 }
 
 static void timers_stop(struct seq_file *m, void *v)
@@ -2491,7 +2548,7 @@ static int show_timer(struct seq_file *m, void *v)
 		[SIGEV_THREAD] = "thread",
 	};
 
-	timer = list_entry((struct list_head *)v, struct k_itimer, list);
+	timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
 	notify = timer->it_sigev_notify;
 
 	seq_printf(m, "ID: %d\n", timer->it_id);
@@ -2569,10 +2626,11 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
 	}
 
 	task_lock(p);
-	if (slack_ns == 0)
-		p->timer_slack_ns = p->default_timer_slack_ns;
-	else
-		p->timer_slack_ns = slack_ns;
+	if (task_is_realtime(p))
+		slack_ns = 0;
+	else if (slack_ns == 0)
+		slack_ns = p->default_timer_slack_ns;
+	p->timer_slack_ns = slack_ns;
 	task_unlock(p);
 
 out:
@@ -3870,12 +3928,12 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
 	if (!dir_emit_dots(file, ctx))
 		return 0;
 
-	/* f_version caches the tgid value that the last readdir call couldn't
-	 * return. lseek aka telldir automagically resets f_version to 0.
+	/* We cache the tgid value that the last readdir call couldn't
+	 * return and lseek resets it to 0.
 	 */
 	ns = proc_pid_ns(inode->i_sb);
-	tid = (int)file->f_version;
-	file->f_version = 0;
+	tid = (int)(intptr_t)file->private_data;
+	file->private_data = NULL;
 	for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
 	     task;
 	     task = next_tid(task), ctx->pos++) {
@@ -3890,7 +3948,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
 				proc_task_instantiate, task, NULL)) {
 			/* returning this tgid failed, save it as the first
			 * pid for the next readdir call */
-			file->f_version = (u64)tid;
+			file->private_data = (void *)(intptr_t)tid;
			put_task_struct(task);
			break;
		}
@@ -3915,6 +3973,24 @@ static int proc_task_getattr(struct mnt_idmap *idmap,
 	return 0;
 }
 
+/*
+ * proc_task_readdir() sets @file->private_data to a positive integer
+ * value, so casting that to u64 is safe. generic_llseek_cookie() will
+ * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is
+ * here to catch any unexpected change in behavior either in
+ * proc_task_readdir() or generic_llseek_cookie().
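The cast chain this comment defends can be exercised in isolation; a small userspace round-trip check, illustrative only:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	int tid = 12345;			/* what readdir stashes */
	void *priv = (void *)(intptr_t)tid;
	uint64_t cookie = (uint64_t)(intptr_t)priv;	/* what llseek reads */

	assert(cookie <= INT32_MAX);		/* mirrors the WARN_ON_ONCE() */
	assert((int)(intptr_t)priv == tid);	/* lossless round trip */
	return 0;
}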
+ */ +static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence) +{ + u64 cookie = (u64)(intptr_t)file->private_data; + loff_t off; + + off = generic_llseek_cookie(file, offset, whence, &cookie); + WARN_ON_ONCE(cookie > INT_MAX); + file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */ + return off; +} + static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, @@ -3925,7 +4001,7 @@ static const struct inode_operations proc_task_inode_operations = { static const struct file_operations proc_task_operations = { .read = generic_read_dir, .iterate_shared = proc_task_readdir, - .llseek = generic_file_llseek, + .llseek = proc_dir_llseek, }; void __init set_proc_pid_nlink(void) diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index e0758fe7936d..b7cab1ad990d 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -21,6 +21,7 @@ static int show_console_dev(struct seq_file *m, void *v) { CON_ENABLED, 'E' }, { CON_CONSDEV, 'C' }, { CON_BOOT, 'B' }, + { CON_NBCON, 'N' }, { CON_PRINTBUFFER, 'p' }, { CON_BRL, 'b' }, { CON_ANYTIME, 'a' }, @@ -58,8 +59,8 @@ static int show_console_dev(struct seq_file *m, void *v) seq_printf(m, "%s%d", con->name, con->index); seq_pad(m, ' '); seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', - con->write ? 'W' : '-', con->unblank ? 'U' : '-', - flags); + ((con->flags & CON_NBCON) || con->write) ? 'W' : '-', + con->unblank ? 'U' : '-', flags); if (dev) seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); @@ -68,6 +69,7 @@ static int show_console_dev(struct seq_file *m, void *v) } static void *c_start(struct seq_file *m, loff_t *pos) + __acquires(&console_mutex) { struct console *con; loff_t off = 0; @@ -94,6 +96,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) } static void c_stop(struct seq_file *m, void *v) + __releases(&console_mutex) { console_list_unlock(); } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 586bbc84ca04..1f54a54bfb91 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -59,7 +59,7 @@ static int seq_show(struct seq_file *m, void *v) real_mount(file->f_path.mnt)->mnt_id, file_inode(file)->i_ino); - /* show_fd_locks() never deferences files so a stale value is safe */ + /* show_fd_locks() never dereferences files, so a stale value is safe */ show_fd_locks(m, file, files); if (seq_has_overflowed(m)) goto out; @@ -220,8 +220,8 @@ static struct dentry *proc_fd_instantiate(struct dentry *dentry, ei->op.proc_get_link = proc_fd_link; tid_fd_update_inode(task, inode, data->mode); - d_set_d_op(dentry, &tid_fd_dentry_operations); - return d_splice_alias(inode, dentry); + return proc_splice_unmountable(inode, dentry, + &tid_fd_dentry_operations); } static struct dentry *proc_lookupfd_common(struct inode *dir, @@ -312,14 +312,14 @@ static int proc_readfd_count(struct inode *inode, loff_t *count) return 0; } -static int proc_readfd(struct file *file, struct dir_context *ctx) +static int proc_fd_iterate(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fd_instantiate); } const struct file_operations proc_fd_operations = { .read = generic_read_dir, - .iterate_shared = proc_readfd, + .iterate_shared = proc_fd_iterate, .llseek = generic_file_llseek, }; @@ -397,8 +397,8 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry, inode->i_fop = &proc_fdinfo_file_operations; tid_fd_update_inode(task, inode, 0); - d_set_d_op(dentry, &tid_fd_dentry_operations); - return d_splice_alias(inode, dentry); + 
return proc_splice_unmountable(inode, dentry, + &tid_fd_dentry_operations); } static struct dentry * @@ -407,7 +407,7 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); } -static int proc_readfdinfo(struct file *file, struct dir_context *ctx) +static int proc_fdinfo_iterate(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fdinfo_instantiate); @@ -421,6 +421,6 @@ const struct inode_operations proc_fdinfo_inode_operations = { const struct file_operations proc_fdinfo_operations = { .read = generic_read_dir, - .iterate_shared = proc_readfdinfo, + .iterate_shared = proc_fdinfo_iterate, .llseek = generic_file_llseek, }; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index c02f1e63f82d..dbe82cf23ee4 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -464,9 +464,9 @@ struct proc_dir_entry *proc_symlink(const char *name, (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1); if (ent) { - ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); + ent->size = strlen(dest); + ent->data = kmemdup(dest, ent->size + 1, GFP_KERNEL); if (ent->data) { - strcpy((char*)ent->data,dest); ent->proc_iops = &proc_link_inode_operations; ent = proc_register(parent, ent); } else { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index a8a8576d8592..9e3f25e4c188 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -349,3 +349,16 @@ static inline void pde_force_lookup(struct proc_dir_entry *pde) /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ pde->proc_dops = &proc_net_dentry_ops; } + +/* + * Add a new procfs dentry that can't serve as a mountpoint. That should + * encompass anything that is ephemeral and can just disappear while the + * process is still around. 
+ */ +static inline struct dentry *proc_splice_unmountable(struct inode *inode, + struct dentry *dentry, const struct dentry_operations *d_ops) +{ + d_set_d_op(dentry, d_ops); + dont_mount(dentry); + return d_splice_alias(inode, dentry); +} diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 8e08a9a1b7ed..7d0acdad74e2 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -235,7 +235,7 @@ static int kcore_ram_list(struct list_head *list) int nid, ret; unsigned long end_pfn; - /* Not inialized....update now */ + /* Not initialized....update now */ /* find out "max pfn" */ end_pfn = 0; for_each_node_state(nid, N_MEMORY) { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5f171ad7b436..ade74a396968 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -976,7 +976,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", +#if VM_PKEY_BIT3 [ilog2(VM_PKEY_BIT3)] = "", +#endif #if VM_PKEY_BIT4 [ilog2(VM_PKEY_BIT4)] = "", #endif @@ -987,8 +989,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_X86_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif -#ifdef CONFIG_64BIT +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) [ilog2(VM_DROPPABLE)] = "dp", +#endif +#ifdef CONFIG_64BIT [ilog2(VM_SEALED)] = "sl", #endif }; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 3497ede88aa0..84719e2bcbc4 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -288,13 +288,13 @@ static void pstore_dump(struct kmsg_dumper *dumper, why = kmsg_dump_reason_str(reason); if (pstore_cannot_block_path(reason)) { - if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) { + if (!raw_spin_trylock_irqsave(&psinfo->buf_lock, flags)) { pr_err("dump skipped in %s path because of concurrent dump\n", in_nmi() ? 
"NMI" : why); return; } } else { - spin_lock_irqsave(&psinfo->buf_lock, flags); + raw_spin_lock_irqsave(&psinfo->buf_lock, flags); } kmsg_dump_rewind(&iter); @@ -364,7 +364,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += record.size; part++; } - spin_unlock_irqrestore(&psinfo->buf_lock, flags); + raw_spin_unlock_irqrestore(&psinfo->buf_lock, flags); if (saved_ret) { pr_err_once("backend (%s) writing error (%d)\n", psinfo->name, @@ -503,7 +503,7 @@ int pstore_register(struct pstore_info *psi) psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); - spin_lock_init(&psinfo->buf_lock); + raw_spin_lock_init(&psinfo->buf_lock); if (psi->flags & PSTORE_FLAGS_DMESG) allocate_buf_for_compression(); diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index c1cfb8a19e9d..b4d10e45f2e4 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -24,13 +24,15 @@ static unsigned qnx6_lfile_checksum(char *name, unsigned size) return crc; } -static struct page *qnx6_get_page(struct inode *dir, unsigned long n) +static void *qnx6_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) { - struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) - kmap(page); - return page; + struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); + + if (IS_ERR(folio)) + return folio; + *foliop = folio; + return kmap_local_folio(folio, 0); } static unsigned last_entry(struct inode *inode, unsigned long page_nr) @@ -44,19 +46,20 @@ static unsigned last_entry(struct inode *inode, unsigned long page_nr) static struct qnx6_long_filename *qnx6_longname(struct super_block *sb, struct qnx6_long_dir_entry *de, - struct page **p) + struct folio **foliop) { struct qnx6_sb_info *sbi = QNX6_SB(sb); u32 s = fs32_to_cpu(sbi, de->de_long_inode); /* in block units */ u32 n = s >> (PAGE_SHIFT - sb->s_blocksize_bits); /* in pages */ - /* within page */ - u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_MASK; + u32 offs; struct address_space *mapping = sbi->longfile->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); - kmap(*p = page); - return (struct qnx6_long_filename *)(page_address(page) + offs); + struct folio *folio = read_mapping_folio(mapping, n, NULL); + + if (IS_ERR(folio)) + return ERR_CAST(folio); + offs = offset_in_folio(folio, s << sb->s_blocksize_bits); + *foliop = folio; + return kmap_local_folio(folio, offs); } static int qnx6_dir_longfilename(struct inode *inode, @@ -67,7 +70,7 @@ static int qnx6_dir_longfilename(struct inode *inode, struct qnx6_long_filename *lf; struct super_block *s = inode->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); - struct page *page; + struct folio *folio; int lf_size; if (de->de_size != 0xff) { @@ -76,7 +79,7 @@ static int qnx6_dir_longfilename(struct inode *inode, pr_err("invalid direntry size (%i).\n", de->de_size); return 0; } - lf = qnx6_longname(s, de, &page); + lf = qnx6_longname(s, de, &folio); if (IS_ERR(lf)) { pr_err("Error reading longname\n"); return 0; @@ -87,7 +90,7 @@ static int qnx6_dir_longfilename(struct inode *inode, if (lf_size > QNX6_LONG_NAME_MAX) { pr_debug("file %s\n", lf->lf_fname); pr_err("Filename too long (%i)\n", lf_size); - qnx6_put_page(page); + folio_release_kmap(folio, lf); return 0; } @@ -100,11 +103,11 @@ static int qnx6_dir_longfilename(struct inode *inode, pr_debug("qnx6_readdir:%.*s inode:%u\n", lf_size, lf->lf_fname, de_inode); if (!dir_emit(ctx, lf->lf_fname, lf_size, 
de_inode, DT_UNKNOWN)) { - qnx6_put_page(page); + folio_release_kmap(folio, lf); return 0; } - qnx6_put_page(page); + folio_release_kmap(folio, lf); /* success */ return 1; } @@ -117,26 +120,27 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1); unsigned long npages = dir_pages(inode); unsigned long n = pos >> PAGE_SHIFT; - unsigned start = (pos & ~PAGE_MASK) / QNX6_DIR_ENTRY_SIZE; + unsigned offset = (pos & ~PAGE_MASK) / QNX6_DIR_ENTRY_SIZE; bool done = false; ctx->pos = pos; if (ctx->pos >= inode->i_size) return 0; - for ( ; !done && n < npages; n++, start = 0) { - struct page *page = qnx6_get_page(inode, n); - int limit = last_entry(inode, n); + for ( ; !done && n < npages; n++, offset = 0) { struct qnx6_dir_entry *de; - int i = start; + struct folio *folio; + char *kaddr = qnx6_get_folio(inode, n, &folio); + char *limit; - if (IS_ERR(page)) { + if (IS_ERR(kaddr)) { pr_err("%s(): read failed\n", __func__); ctx->pos = (n + 1) << PAGE_SHIFT; - return PTR_ERR(page); + return PTR_ERR(kaddr); } - de = ((struct qnx6_dir_entry *)page_address(page)) + start; - for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) { + de = (struct qnx6_dir_entry *)(kaddr + offset); + limit = kaddr + last_entry(inode, n); + for (; (char *)de < limit; de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) { int size = de->de_size; u32 no_inode = fs32_to_cpu(sbi, de->de_inode); @@ -164,7 +168,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) } } } - qnx6_put_page(page); + folio_release_kmap(folio, kaddr); } return 0; } @@ -177,23 +181,23 @@ static unsigned qnx6_long_match(int len, const char *name, { struct super_block *s = dir->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); - struct page *page; + struct folio *folio; int thislen; - struct qnx6_long_filename *lf = qnx6_longname(s, de, &page); + struct qnx6_long_filename *lf = qnx6_longname(s, de, &folio); if (IS_ERR(lf)) return 0; thislen = fs16_to_cpu(sbi, lf->lf_size); if (len != thislen) { - qnx6_put_page(page); + folio_release_kmap(folio, lf); return 0; } if (memcmp(name, lf->lf_fname, len) == 0) { - qnx6_put_page(page); + folio_release_kmap(folio, lf); return fs32_to_cpu(sbi, de->de_inode); } - qnx6_put_page(page); + folio_release_kmap(folio, lf); return 0; } @@ -210,20 +214,17 @@ static unsigned qnx6_match(struct super_block *s, int len, const char *name, } -unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, - struct page **res_page) +unsigned qnx6_find_ino(int len, struct inode *dir, const char *name) { struct super_block *s = dir->i_sb; struct qnx6_inode_info *ei = QNX6_I(dir); - struct page *page = NULL; + struct folio *folio; unsigned long start, n; unsigned long npages = dir_pages(dir); unsigned ino; struct qnx6_dir_entry *de; struct qnx6_long_dir_entry *lde; - *res_page = NULL; - if (npages == 0) return 0; start = ei->i_dir_start_lookup; @@ -232,12 +233,11 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, n = start; do { - page = qnx6_get_page(dir, n); - if (!IS_ERR(page)) { + de = qnx6_get_folio(dir, n, &folio); + if (!IS_ERR(de)) { int limit = last_entry(dir, n); int i; - de = (struct qnx6_dir_entry *)page_address(page); for (i = 0; i < limit; i++, de++) { if (len <= QNX6_SHORT_NAME_MAX) { /* short filename */ @@ -256,7 +256,7 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, } else pr_err("undefined filename size in inode.\n"); } - qnx6_put_page(page); + folio_release_kmap(folio, de - i); } if (++n >= 
npages) @@ -265,8 +265,8 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, return 0; found: - *res_page = page; ei->i_dir_start_lookup = n; + folio_release_kmap(folio, de); return ino; } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 4f1735b882b1..85925ec0051a 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -184,17 +184,17 @@ static const char *qnx6_checkroot(struct super_block *s) struct qnx6_dir_entry *dir_entry; struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; - struct page *page = read_mapping_page(mapping, 0, NULL); - if (IS_ERR(page)) + struct folio *folio = read_mapping_folio(mapping, 0, NULL); + + if (IS_ERR(folio)) return "error reading root directory"; - kmap(page); - dir_entry = page_address(page); + dir_entry = kmap_local_folio(folio, 0); for (i = 0; i < 2; i++) { /* maximum 3 bytes - due to match_root limitation */ if (strncmp(dir_entry[i].de_fname, match_root[i], 3)) error = 1; } - qnx6_put_page(page); + folio_release_kmap(folio, dir_entry); if (error) return "error reading root directory."; return NULL; @@ -518,7 +518,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) struct inode *inode; struct qnx6_inode_info *ei; struct address_space *mapping; - struct page *page; + struct folio *folio; u32 n, offs; inode = iget_locked(sb, ino); @@ -538,17 +538,16 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) return ERR_PTR(-EIO); } n = (ino - 1) >> (PAGE_SHIFT - QNX6_INODE_SIZE_BITS); - offs = (ino - 1) & (~PAGE_MASK >> QNX6_INODE_SIZE_BITS); mapping = sbi->inodes->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) { + folio = read_mapping_folio(mapping, n, NULL); + if (IS_ERR(folio)) { pr_err("major problem: unable to read inode from dev %s\n", sb->s_id); iget_failed(inode); - return ERR_CAST(page); + return ERR_CAST(folio); } - kmap(page); - raw_inode = ((struct qnx6_inode_entry *)page_address(page)) + offs; + offs = offset_in_folio(folio, (ino - 1) << QNX6_INODE_SIZE_BITS); + raw_inode = kmap_local_folio(folio, offs); inode->i_mode = fs16_to_cpu(sbi, raw_inode->di_mode); i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid)); @@ -578,7 +577,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode->i_mapping->a_ops = &qnx6_aops; } else init_special_inode(inode, inode->i_mode, 0); - qnx6_put_page(page); + folio_release_kmap(folio, raw_inode); unlock_new_inode(inode); return inode; } diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c index e2e98e653b8d..0f0755a9ecb5 100644 --- a/fs/qnx6/namei.c +++ b/fs/qnx6/namei.c @@ -17,7 +17,6 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned ino; - struct page *page; struct inode *foundinode = NULL; const char *name = dentry->d_name.name; int len = dentry->d_name.len; @@ -25,10 +24,9 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, if (len > QNX6_LONG_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - ino = qnx6_find_entry(len, dir, name, &page); + ino = qnx6_find_ino(len, dir, name); if (ino) { foundinode = qnx6_iget(dir->i_sb, ino); - qnx6_put_page(page); if (IS_ERR(foundinode)) pr_debug("lookup->iget -> error %ld\n", PTR_ERR(foundinode)); diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h index 34a6b126a3a9..56ed1367499e 100644 --- a/fs/qnx6/qnx6.h +++ b/fs/qnx6/qnx6.h @@ -126,11 +126,4 @@ static inline __fs16 cpu_to_fs16(struct qnx6_sb_info *sbi, __u16 n) extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, 
int silent); -static inline void qnx6_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - -extern unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, - struct page **res_page); +unsigned qnx6_find_ino(int len, struct inode *dir, const char *name); diff --git a/fs/read_write.c b/fs/read_write.c index 90e283b31ca1..070a7c33b9dd 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -36,22 +36,24 @@ EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { - return file->f_mode & FMODE_UNSIGNED_OFFSET; + return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET; } /** - * vfs_setpos - update the file offset for lseek + * vfs_setpos_cookie - update the file offset for lseek and reset cookie * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size + * @cookie: cookie to reset * - * This is a low-level filesystem helper for updating the file offset to - * the value specified by @offset if the given offset is valid and it is - * not equal to the current file offset. + * Update the file offset to the value specified by @offset if the given + * offset is valid and it is not equal to the current file offset and + * reset the specified cookie to indicate that a seek happened. * * Return the specified offset on success and -EINVAL on invalid offset. */ -loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) +static loff_t vfs_setpos_cookie(struct file *file, loff_t offset, + loff_t maxsize, u64 *cookie) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; @@ -60,35 +62,48 @@ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) if (offset != file->f_pos) { file->f_pos = offset; - file->f_version = 0; + if (cookie) + *cookie = 0; } return offset; } -EXPORT_SYMBOL(vfs_setpos); /** - * generic_file_llseek_size - generic llseek implementation for regular files - * @file: file structure to seek on + * vfs_setpos - update the file offset for lseek + * @file: file structure in question * @offset: file offset to seek to - * @whence: type of seek - * @maxsize: max size of this file in file system - * @eof: offset used for SEEK_END position + * @maxsize: maximum file size * - * This is a variant of generic_file_llseek that allows passing in a custom - * maximum file size and a custom EOF position, for e.g. hashed directories + * This is a low-level filesystem helper for updating the file offset to + * the value specified by @offset if the given offset is valid and it is + * not equal to the current file offset. * - * Synchronization: - * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) - * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. - * read/writes behave like SEEK_SET against seeks. + * Return the specified offset on success and -EINVAL on invalid offset. */ -loff_t -generic_file_llseek_size(struct file *file, loff_t offset, int whence, - loff_t maxsize, loff_t eof) +loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) +{ + return vfs_setpos_cookie(file, offset, maxsize, NULL); +} +EXPORT_SYMBOL(vfs_setpos); + +/** + * must_set_pos - check whether f_pos has to be updated + * @file: file to seek on + * @offset: offset to use + * @whence: type of seek operation + * @eof: end of file + * + * Check whether f_pos needs to be updated and update @offset according + * to @whence. + * + * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be + * updated, and negative error code on failure. 
+ */ +static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof) { switch (whence) { case SEEK_END: - offset += eof; + *offset += eof; break; case SEEK_CUR: /* @@ -97,23 +112,17 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ - if (offset == 0) - return file->f_pos; - /* - * f_lock protects against read/modify/write race with other - * SEEK_CURs. Note that parallel writes and reads behave - * like SEEK_SET. - */ - spin_lock(&file->f_lock); - offset = vfs_setpos(file, file->f_pos + offset, maxsize); - spin_unlock(&file->f_lock); - return offset; + if (*offset == 0) { + *offset = file->f_pos; + return 0; + } + break; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ - if ((unsigned long long)offset >= eof) + if ((unsigned long long)*offset >= eof) return -ENXIO; break; case SEEK_HOLE: @@ -121,17 +130,103 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ - if ((unsigned long long)offset >= eof) + if ((unsigned long long)*offset >= eof) return -ENXIO; - offset = eof; + *offset = eof; break; } + return 1; +} + +/** + * generic_file_llseek_size - generic llseek implementation for regular files + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * @maxsize: max size of this file in file system + * @eof: offset used for SEEK_END position + * + * This is a variant of generic_file_llseek that allows passing in a custom + * maximum file size and a custom EOF position, for e.g. hashed directories + * + * Synchronization: + * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) + * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. + * read/writes behave like SEEK_SET against seeks. + */ +loff_t +generic_file_llseek_size(struct file *file, loff_t offset, int whence, + loff_t maxsize, loff_t eof) +{ + int ret; + + ret = must_set_pos(file, &offset, whence, eof); + if (ret < 0) + return ret; + if (ret == 0) + return offset; + + if (whence == SEEK_CUR) { + /* + * f_lock protects against read/modify/write race with + * other SEEK_CURs. Note that parallel writes and reads + * behave like SEEK_SET. + */ + guard(spinlock)(&file->f_lock); + return vfs_setpos(file, file->f_pos + offset, maxsize); + } + return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** + * generic_llseek_cookie - versioned llseek implementation + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * @cookie: cookie to update + * + * See generic_file_llseek for a general description and locking assumptions. + * + * In contrast to generic_file_llseek, this function also resets a + * specified cookie to indicate a seek took place. 
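The expected caller shape, as a kernel-side fragment for a hypothetical "foo" filesystem that allocates per-open state at open time (procfs reuses file->private_data directly instead, see proc_dir_llseek() above):

/* hypothetical per-open state, e.g. allocated in foo_dir_open() */
struct foo_dir_data {
	u64 cookie;
};

static loff_t foo_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct foo_dir_data *d = file->private_data;

	/* any successful repositioning zeroes d->cookie, so a later
	 * readdir can tell that its saved resume state is stale */
	return generic_llseek_cookie(file, offset, whence, &d->cookie);
}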
+ */ +loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, + u64 *cookie) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxsize = inode->i_sb->s_maxbytes; + loff_t eof = i_size_read(inode); + int ret; + + if (WARN_ON_ONCE(!cookie)) + return -EINVAL; + + /* + * Require that this is only used for directories that guarantee + * synchronization between readdir and seek so that an update to + * @cookie is correctly synchronized with concurrent readdir. + */ + if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS))) + return -EINVAL; + + ret = must_set_pos(file, &offset, whence, eof); + if (ret < 0) + return ret; + if (ret == 0) + return offset; + + /* No need to hold f_lock because we know that f_pos_lock is held. */ + if (whence == SEEK_CUR) + return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie); + + return vfs_setpos_cookie(file, offset, maxsize, cookie); +} +EXPORT_SYMBOL(generic_llseek_cookie); + +/** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to @@ -270,10 +365,8 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { - if (offset != file->f_pos) { + if (offset != file->f_pos) file->f_pos = offset; - file->f_version = 0; - } retval = offset; } out: diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 9b43a81a6488..72c53129c952 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2178,7 +2178,7 @@ static int grab_tail_page(struct inode *inode, unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1); struct buffer_head *bh; struct buffer_head *head; - struct page *page; + struct folio *folio; int error; /* @@ -2190,20 +2190,20 @@ static int grab_tail_page(struct inode *inode, if ((offset & (blocksize - 1)) == 0) { return -ENOENT; } - page = grab_cache_page(inode->i_mapping, index); - error = -ENOMEM; - if (!page) { - goto out; - } + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(inode->i_mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); /* start within the page of the last block in the file */ start = (offset / blocksize) * blocksize; - error = __block_write_begin(page, start, offset - start, + error = __block_write_begin(folio, start, offset - start, reiserfs_get_block_create_0); if (error) goto unlock; - head = page_buffers(page); + head = folio_buffers(folio); bh = head; do { if (pos >= start) { @@ -2226,14 +2226,13 @@ static int grab_tail_page(struct inode *inode, goto unlock; } *bh_result = bh; - *page_result = page; + *page_result = &folio->page; -out: return error; unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return error; } @@ -2736,23 +2735,24 @@ static void reiserfs_truncate_failed_write(struct inode *inode) static int reiserfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode; - struct page *page; + struct folio *folio; pgoff_t index; int ret; int old_ref = 0; inode = mapping->host; index = pos >> PAGE_SHIFT; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + *foliop = folio; 
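/*
 * FGP_WRITEBEGIN expands to FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE,
 * and __filemap_get_folio() returns either a locked folio or an ERR_PTR(),
 * never NULL -- hence the IS_ERR()/PTR_ERR() handling above where the old
 * grab_cache_page_write_begin() code checked for NULL and returned -ENOMEM.
 */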
reiserfs_wait_on_write_block(inode->i_sb); - fix_tail_page_for_writing(page); + fix_tail_page_for_writing(&folio->page); if (reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th; th = (struct reiserfs_transaction_handle *)current-> @@ -2762,7 +2762,7 @@ static int reiserfs_write_begin(struct file *file, old_ref = th->t_refcount; th->t_refcount++; } - ret = __block_write_begin(page, pos, len, reiserfs_get_block); + ret = __block_write_begin(folio, pos, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; /* @@ -2792,8 +2792,8 @@ static int reiserfs_write_begin(struct file *file, } } if (ret) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* Truncate allocated blocks */ reiserfs_truncate_failed_write(inode); } @@ -2822,7 +2822,7 @@ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) th->t_refcount++; } - ret = __block_write_begin(page, from, len, reiserfs_get_block); + ret = __block_write_begin(page_folio(page), from, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; /* @@ -2862,10 +2862,9 @@ static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) static int reiserfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; int ret = 0; int update_sd = 0; struct reiserfs_transaction_handle *th; @@ -2887,7 +2886,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, } flush_dcache_folio(folio); - reiserfs_commit_page(inode, page, start, start + copied); + reiserfs_commit_page(inode, &folio->page, start, start + copied); /* * generic_commit_write does this for us, but does not update the @@ -2942,8 +2941,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, out: if (locked) reiserfs_write_unlock(inode->i_sb); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (pos + len > inode->i_size) reiserfs_truncate_failed_write(inode); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 68758b6fed94..0addcc849ff2 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -126,7 +126,7 @@ static int romfs_read_folio(struct file *file, struct folio *folio) } } - buf = folio_zero_tail(folio, fillsize, buf); + buf = folio_zero_tail(folio, fillsize, buf + fillsize); kunmap_local(buf); folio_end_read(folio, ret == 0); return ret; diff --git a/fs/select.c b/fs/select.c index 9515c3fa1a03..cae82e9e0dcc 100644 --- a/fs/select.c +++ b/fs/select.c @@ -77,19 +77,16 @@ u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; + u64 slack = current->timer_slack_ns; - /* - * Realtime tasks get a slack of 0 for obvious reasons. 
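The per-task value consulted here is the same one a process manages for itself with prctl(2); a minimal userspace sketch:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	int slack = prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0);

	printf("timer slack: %d ns\n", slack);
	/* a slack of 0 is what realtime tasks are expected to report,
	 * matching the slack == 0 check above */
	if (prctl(PR_SET_TIMERSLACK, 50000, 0, 0, 0) < 0)	/* 50 us */
		perror("PR_SET_TIMERSLACK");
	return 0;
}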
- */ - - if (rt_task(current)) + if (slack == 0) return 0; ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); - if (ret < current->timer_slack_ns) - return current->timer_slack_ns; + if (ret < slack) + return slack; return ret; } @@ -840,7 +837,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) struct poll_list { struct poll_list *next; unsigned int len; - struct pollfd entries[]; + struct pollfd entries[] __counted_by(len); }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) diff --git a/fs/signalfd.c b/fs/signalfd.c index ec7b2da2477a..d0333bce015e 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -159,7 +159,7 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info DECLARE_WAITQUEUE(wait, current); spin_lock_irq(¤t->sighand->siglock); - ret = dequeue_signal(current, &ctx->sigmask, info, &type); + ret = dequeue_signal(&ctx->sigmask, info, &type); switch (ret) { case 0: if (!nonblock) @@ -174,7 +174,7 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info add_wait_queue(¤t->sighand->signalfd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); - ret = dequeue_signal(current, &ctx->sigmask, info, &type); + ret = dequeue_signal(&ctx->sigmask, info, &type); if (ret != 0) break; if (signal_pending(current)) { diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig index 2517dc242386..2aff6d1395ce 100644 --- a/fs/smb/client/Kconfig +++ b/fs/smb/client/Kconfig @@ -204,4 +204,18 @@ config CIFS_ROOT Most people say N here. +config CIFS_COMPRESSION + bool "SMB message compression (Experimental)" + depends on CIFS + default n + help + Enables over-the-wire message compression for SMB 3.1.1 + mounts when negotiated with the server. + + Only write requests with data size >= PAGE_SIZE will be + compressed to avoid wasting resources. + + Say Y here if you want SMB traffic to be compressed. + If unsure, say N. 
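The size threshold from the help text amounts to a simple gate before attempting compression; a hedged sketch with illustrative names (the kernel's PAGE_SIZE is per-arch, hardcoded here):

#include <stdbool.h>
#include <stddef.h>

#define EXAMPLE_PAGE_SIZE 4096u

static bool should_compress_write(bool negotiated, size_t data_len)
{
	/* compressing small payloads costs more than it saves */
	return negotiated && data_len >= EXAMPLE_PAGE_SIZE;
}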
+ endif diff --git a/fs/smb/client/Makefile b/fs/smb/client/Makefile index e11985f2460b..22023e30915b 100644 --- a/fs/smb/client/Makefile +++ b/fs/smb/client/Makefile @@ -33,3 +33,5 @@ cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o cifs-$(CONFIG_CIFS_ROOT) += cifsroot.o cifs-$(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) += smb1ops.o cifssmb.o + +cifs-$(CONFIG_CIFS_COMPRESSION) += compress.o compress/lz77.o diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index c71ae5c04306..e03c890de0a0 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -350,6 +350,9 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_CIFS_SWN_UPCALL seq_puts(m, ",WITNESS"); #endif +#ifdef CONFIG_CIFS_COMPRESSION + seq_puts(m, ",COMPRESSION"); +#endif seq_putc(m, '\n'); seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); @@ -475,7 +478,9 @@ skip_rdma: } seq_puts(m, "\nCompression: "); - if (!server->compression.requested) + if (!IS_ENABLED(CONFIG_CIFS_COMPRESSION)) + seq_puts(m, "no built-in support"); + else if (!server->compression.requested) seq_puts(m, "disabled on mount"); else if (server->compression.enabled) seq_printf(m, "enabled (%s)", compression_alg_str(server->compression.alg)); @@ -1072,7 +1077,7 @@ static int cifs_security_flags_proc_open(struct inode *inode, struct file *file) static void cifs_security_flags_handle_must_flags(unsigned int *flags) { - unsigned int signflags = *flags & CIFSSEC_MUST_SIGN; + unsigned int signflags = *flags & (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL); if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) *flags = CIFSSEC_MUST_KRB5; diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index f5b6df82e857..1d294d53f662 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -27,18 +27,18 @@ #include "cifs_unicode.h" /* security id for everyone/world system group */ -static const struct cifs_sid sid_everyone = { +static const struct smb_sid sid_everyone = { 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; /* security id for Authenticated Users system group */ -static const struct cifs_sid sid_authusers = { +static const struct smb_sid sid_authusers = { 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} }; /* S-1-22-1 Unmapped Unix users */ -static const struct cifs_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22}, +static const struct smb_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22}, {cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; /* S-1-22-2 Unmapped Unix groups */ -static const struct cifs_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22}, +static const struct smb_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22}, {cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; /* @@ -48,17 +48,17 @@ static const struct cifs_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22}, /* S-1-5-88 MS NFS and Apple style UID/GID/mode */ /* S-1-5-88-1 Unix uid */ -static const struct cifs_sid sid_unix_NFS_users = { 1, 2, {0, 0, 0, 0, 0, 5}, +static const struct smb_sid sid_unix_NFS_users = { 1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(88), cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; /* S-1-5-88-2 Unix gid */ -static const struct cifs_sid sid_unix_NFS_groups = { 1, 2, {0, 0, 0, 0, 0, 5}, +static const struct smb_sid sid_unix_NFS_groups = { 1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(88), cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; /* S-1-5-88-3 Unix mode */ -static const struct cifs_sid sid_unix_NFS_mode = { 1, 2, {0, 0, 0, 0, 0, 
5}, +static const struct smb_sid sid_unix_NFS_mode = { 1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(88), cpu_to_le32(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; @@ -106,7 +106,7 @@ static struct key_type cifs_idmap_key_type = { }; static char * -sid_to_key_str(struct cifs_sid *sidptr, unsigned int type) +sid_to_key_str(struct smb_sid *sidptr, unsigned int type) { int i, len; unsigned int saval; @@ -158,7 +158,7 @@ sid_to_key_str(struct cifs_sid *sidptr, unsigned int type) * the same returns zero, if they do not match returns non-zero. */ static int -compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) +compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid) { int i; int num_subauth, num_sat, num_saw; @@ -187,7 +187,7 @@ compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) /* compare all of the subauth values if any */ num_sat = ctsid->num_subauth; num_saw = cwsid->num_subauth; - num_subauth = num_sat < num_saw ? num_sat : num_saw; + num_subauth = min(num_sat, num_saw); if (num_subauth) { for (i = 0; i < num_subauth; ++i) { if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { @@ -204,11 +204,11 @@ compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) } static bool -is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group) +is_well_known_sid(const struct smb_sid *psid, uint32_t *puid, bool is_group) { int i; int num_subauth; - const struct cifs_sid *pwell_known_sid; + const struct smb_sid *pwell_known_sid; if (!psid || (puid == NULL)) return false; @@ -260,7 +260,7 @@ is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group) } static __u16 -cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) +cifs_copy_sid(struct smb_sid *dst, const struct smb_sid *src) { int i; __u16 size = 1 + 1 + 6; @@ -277,11 +277,11 @@ cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) } static int -id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) +id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid) { int rc; struct key *sidkey; - struct cifs_sid *ksid; + struct smb_sid *ksid; unsigned int ksid_size; char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */ const struct cred *saved_cred; @@ -312,8 +312,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) * it could be. */ ksid = sidkey->datalen <= sizeof(sidkey->payload) ? 
- (struct cifs_sid *)&sidkey->payload : - (struct cifs_sid *)sidkey->payload.data[0]; + (struct smb_sid *)&sidkey->payload : + (struct smb_sid *)sidkey->payload.data[0]; ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32)); if (ksid_size > sidkey->datalen) { @@ -336,7 +336,7 @@ invalidate_key: } int -sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, +sid_to_id(struct cifs_sb_info *cifs_sb, struct smb_sid *psid, struct cifs_fattr *fattr, uint sidtype) { int rc = 0; @@ -515,43 +515,43 @@ exit_cifs_idmap(void) } /* copy ntsd, owner sid, and group sid from a security descriptor to another */ -static __u32 copy_sec_desc(const struct cifs_ntsd *pntsd, - struct cifs_ntsd *pnntsd, +static __u32 copy_sec_desc(const struct smb_ntsd *pntsd, + struct smb_ntsd *pnntsd, __u32 sidsoffset, - struct cifs_sid *pownersid, - struct cifs_sid *pgrpsid) + struct smb_sid *pownersid, + struct smb_sid *pgrpsid) { - struct cifs_sid *owner_sid_ptr, *group_sid_ptr; - struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; + struct smb_sid *owner_sid_ptr, *group_sid_ptr; + struct smb_sid *nowner_sid_ptr, *ngroup_sid_ptr; /* copy security descriptor control portion */ pnntsd->revision = pntsd->revision; pnntsd->type = pntsd->type; - pnntsd->dacloffset = cpu_to_le32(sizeof(struct cifs_ntsd)); + pnntsd->dacloffset = cpu_to_le32(sizeof(struct smb_ntsd)); pnntsd->sacloffset = 0; pnntsd->osidoffset = cpu_to_le32(sidsoffset); - pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid)); + pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct smb_sid)); /* copy owner sid */ if (pownersid) owner_sid_ptr = pownersid; else - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + owner_sid_ptr = (struct smb_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); - nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); + nowner_sid_ptr = (struct smb_sid *)((char *)pnntsd + sidsoffset); cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr); /* copy group sid */ if (pgrpsid) group_sid_ptr = pgrpsid; else - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + group_sid_ptr = (struct smb_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); - ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + - sizeof(struct cifs_sid)); + ngroup_sid_ptr = (struct smb_sid *)((char *)pnntsd + sidsoffset + + sizeof(struct smb_sid)); cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr); - return sidsoffset + (2 * sizeof(struct cifs_sid)); + return sidsoffset + (2 * sizeof(struct smb_sid)); } @@ -666,7 +666,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, return; } -static __u16 cifs_copy_ace(struct cifs_ace *dst, struct cifs_ace *src, struct cifs_sid *psid) +static __u16 cifs_copy_ace(struct smb_ace *dst, struct smb_ace *src, struct smb_sid *psid) { __u16 size = 1 + 1 + 2 + 4; @@ -685,8 +685,8 @@ static __u16 cifs_copy_ace(struct cifs_ace *dst, struct cifs_ace *src, struct ci return size; } -static __u16 fill_ace_for_sid(struct cifs_ace *pntace, - const struct cifs_sid *psid, __u64 nmode, +static __u16 fill_ace_for_sid(struct smb_ace *pntace, + const struct smb_sid *psid, __u64 nmode, umode_t bits, __u8 access_type, bool allow_delete_child) { @@ -723,7 +723,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace, #ifdef CONFIG_CIFS_DEBUG2 -static void dump_ace(struct cifs_ace *pace, char *end_of_acl) +static void dump_ace(struct smb_ace *pace, char *end_of_acl) { int num_subauth; @@ -758,15 +758,15 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl) } #endif 
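The renamed smb_sid keeps the standard NT SID layout, so the well-known IDs above render in the familiar S-R-A-... form; a self-contained sketch (struct duplicated here for illustration; on the wire the sub_auth values are little-endian):

#include <stdint.h>
#include <stdio.h>

struct sid_example {
	uint8_t revision;
	uint8_t num_subauth;
	uint8_t authority[6];
	uint32_t sub_auth[15];
};

static void print_sid(const struct sid_example *sid)
{
	uint64_t auth = 0;

	for (int i = 0; i < 6; i++)
		auth = (auth << 8) | sid->authority[i];
	printf("S-%u-%llu", sid->revision, (unsigned long long)auth);
	for (int i = 0; i < sid->num_subauth; i++)
		printf("-%u", (unsigned int)sid->sub_auth[i]);
	putchar('\n');
}

int main(void)
{
	/* S-1-5-88-3, the NFS-style "mode" SID defined earlier */
	struct sid_example nfs_mode = { 1, 2, {0, 0, 0, 0, 0, 5}, {88, 3} };

	print_sid(&nfs_mode);
	return 0;
}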
-static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, - struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, +static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, + struct smb_sid *pownersid, struct smb_sid *pgrpsid, struct cifs_fattr *fattr, bool mode_from_special_sid) { int i; int num_aces = 0; int acl_size; char *acl_base; - struct cifs_ace **ppace; + struct smb_ace **ppace; /* BB need to add parm so we can store the SID BB */ @@ -793,21 +793,21 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, fattr->cf_mode &= ~(0777); acl_base = (char *)pdacl; - acl_size = sizeof(struct cifs_acl); + acl_size = sizeof(struct smb_acl); num_aces = le32_to_cpu(pdacl->num_aces); if (num_aces > 0) { umode_t denied_mode = 0; - if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *)) + if (num_aces > ULONG_MAX / sizeof(struct smb_ace *)) return; - ppace = kmalloc_array(num_aces, sizeof(struct cifs_ace *), + ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL); if (!ppace) return; for (i = 0; i < num_aces; ++i) { - ppace[i] = (struct cifs_ace *) (acl_base + acl_size); + ppace[i] = (struct smb_ace *) (acl_base + acl_size); #ifdef CONFIG_CIFS_DEBUG2 dump_ace(ppace[i], end_of_acl); #endif @@ -849,7 +849,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, /* memcpy((void *)(&(cifscred->aces[i])), (void *)ppace[i], - sizeof(struct cifs_ace)); */ + sizeof(struct smb_ace)); */ acl_base = (char *)ppace[i]; acl_size = le16_to_cpu(ppace[i]->size); @@ -861,7 +861,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, return; } -unsigned int setup_authusers_ACE(struct cifs_ace *pntace) +unsigned int setup_authusers_ACE(struct smb_ace *pntace) { int i; unsigned int ace_size = 20; @@ -885,7 +885,7 @@ unsigned int setup_authusers_ACE(struct cifs_ace *pntace) * Fill in the special SID based on the mode. See * https://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */ -unsigned int setup_special_mode_ACE(struct cifs_ace *pntace, __u64 nmode) +unsigned int setup_special_mode_ACE(struct smb_ace *pntace, __u64 nmode) { int i; unsigned int ace_size = 28; @@ -907,7 +907,7 @@ unsigned int setup_special_mode_ACE(struct cifs_ace *pntace, __u64 nmode) return ace_size; } -unsigned int setup_special_user_owner_ACE(struct cifs_ace *pntace) +unsigned int setup_special_user_owner_ACE(struct smb_ace *pntace) { int i; unsigned int ace_size = 28; @@ -930,8 +930,8 @@ unsigned int setup_special_user_owner_ACE(struct cifs_ace *pntace) } static void populate_new_aces(char *nacl_base, - struct cifs_sid *pownersid, - struct cifs_sid *pgrpsid, + struct smb_sid *pownersid, + struct smb_sid *pgrpsid, __u64 *pnmode, u32 *pnum_aces, u16 *pnsize, bool modefromsid) { @@ -944,17 +944,17 @@ static void populate_new_aces(char *nacl_base, __u64 deny_user_mode = 0; __u64 deny_group_mode = 0; bool sticky_set = false; - struct cifs_ace *pnntace = NULL; + struct smb_ace *pnntace = NULL; nmode = *pnmode; num_aces = *pnum_aces; nsize = *pnsize; if (modefromsid) { - pnntace = (struct cifs_ace *) (nacl_base + nsize); + pnntace = (struct smb_ace *) (nacl_base + nsize); nsize += setup_special_mode_ACE(pnntace, nmode); num_aces++; - pnntace = (struct cifs_ace *) (nacl_base + nsize); + pnntace = (struct smb_ace *) (nacl_base + nsize); nsize += setup_authusers_ACE(pnntace); num_aces++; goto set_size; @@ -967,7 +967,7 @@ static void populate_new_aces(char *nacl_base, * updated in the inode. 
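The fixed ACE sizes used above (20 bytes for the authenticated-users ACE, 28 for the special mode ACE) follow directly from the layout; an illustrative sizing helper:

#include <stdio.h>

#define ACE_HDR_SIZE	(1 + 1 + 2 + 4)	/* type, flags, size, access mask */
#define SID_BASE_SIZE	(1 + 1 + 6)	/* revision, count, 6-byte authority */

static unsigned int ace_size(unsigned int num_subauth)
{
	return ACE_HDR_SIZE + SID_BASE_SIZE + 4 * num_subauth;
}

int main(void)
{
	printf("authusers ACE: %u\n", ace_size(1));	/* 20 */
	printf("special mode ACE: %u\n", ace_size(3));	/* 28 */
	return 0;
}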
*/ - if (!memcmp(pownersid, pgrpsid, sizeof(struct cifs_sid))) { + if (!memcmp(pownersid, pgrpsid, sizeof(struct smb_sid))) { /* * Case when owner and group SIDs are the same. * Set the more restrictive of the two modes. @@ -992,7 +992,7 @@ static void populate_new_aces(char *nacl_base, sticky_set = true; if (deny_user_mode) { - pnntace = (struct cifs_ace *) (nacl_base + nsize); + pnntace = (struct smb_ace *) (nacl_base + nsize); nsize += fill_ace_for_sid(pnntace, pownersid, deny_user_mode, 0700, ACCESS_DENIED, false); num_aces++; @@ -1000,31 +1000,31 @@ static void populate_new_aces(char *nacl_base, /* Group DENY ACE does not conflict with owner ALLOW ACE. Keep in preferred order*/ if (deny_group_mode && !(deny_group_mode & (user_mode >> 3))) { - pnntace = (struct cifs_ace *) (nacl_base + nsize); + pnntace = (struct smb_ace *) (nacl_base + nsize); nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode, 0070, ACCESS_DENIED, false); num_aces++; } - pnntace = (struct cifs_ace *) (nacl_base + nsize); + pnntace = (struct smb_ace *) (nacl_base + nsize); nsize += fill_ace_for_sid(pnntace, pownersid, user_mode, 0700, ACCESS_ALLOWED, true); num_aces++; /* Group DENY ACE conflicts with owner ALLOW ACE. So keep it after. */ if (deny_group_mode && (deny_group_mode & (user_mode >> 3))) { - pnntace = (struct cifs_ace *) (nacl_base + nsize); + pnntace = (struct smb_ace *) (nacl_base + nsize); nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode, 0070, ACCESS_DENIED, false); num_aces++; } - pnntace = (struct cifs_ace *) (nacl_base + nsize); + pnntace = (struct smb_ace *) (nacl_base + nsize); nsize += fill_ace_for_sid(pnntace, pgrpsid, group_mode, 0070, ACCESS_ALLOWED, !sticky_set); num_aces++; - pnntace = (struct cifs_ace *) (nacl_base + nsize); + pnntace = (struct smb_ace *) (nacl_base + nsize); nsize += fill_ace_for_sid(pnntace, &sid_everyone, other_mode, 0007, ACCESS_ALLOWED, !sticky_set); num_aces++; @@ -1034,31 +1034,31 @@ set_size: *pnsize = nsize; } -static __u16 replace_sids_and_copy_aces(struct cifs_acl *pdacl, struct cifs_acl *pndacl, - struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, - struct cifs_sid *pnownersid, struct cifs_sid *pngrpsid) +static __u16 replace_sids_and_copy_aces(struct smb_acl *pdacl, struct smb_acl *pndacl, + struct smb_sid *pownersid, struct smb_sid *pgrpsid, + struct smb_sid *pnownersid, struct smb_sid *pngrpsid) { int i; u16 size = 0; - struct cifs_ace *pntace = NULL; + struct smb_ace *pntace = NULL; char *acl_base = NULL; u32 src_num_aces = 0; u16 nsize = 0; - struct cifs_ace *pnntace = NULL; + struct smb_ace *pnntace = NULL; char *nacl_base = NULL; u16 ace_size = 0; acl_base = (char *)pdacl; - size = sizeof(struct cifs_acl); + size = sizeof(struct smb_acl); src_num_aces = le32_to_cpu(pdacl->num_aces); nacl_base = (char *)pndacl; - nsize = sizeof(struct cifs_acl); + nsize = sizeof(struct smb_acl); /* Go through all the ACEs */ for (i = 0; i < src_num_aces; ++i) { - pntace = (struct cifs_ace *) (acl_base + size); - pnntace = (struct cifs_ace *) (nacl_base + nsize); + pntace = (struct smb_ace *) (acl_base + size); + pnntace = (struct smb_ace *) (nacl_base + nsize); if (pnownersid && compare_sids(&pntace->sid, pownersid) == 0) ace_size = cifs_copy_ace(pnntace, pntace, pnownersid); @@ -1074,24 +1074,24 @@ static __u16 replace_sids_and_copy_aces(struct cifs_acl *pdacl, struct cifs_acl return nsize; } -static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, - struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, +static int 
set_chmod_dacl(struct smb_acl *pdacl, struct smb_acl *pndacl, + struct smb_sid *pownersid, struct smb_sid *pgrpsid, __u64 *pnmode, bool mode_from_sid) { int i; u16 size = 0; - struct cifs_ace *pntace = NULL; + struct smb_ace *pntace = NULL; char *acl_base = NULL; u32 src_num_aces = 0; u16 nsize = 0; - struct cifs_ace *pnntace = NULL; + struct smb_ace *pnntace = NULL; char *nacl_base = NULL; u32 num_aces = 0; bool new_aces_set = false; /* Assuming that pndacl and pnmode are never NULL */ nacl_base = (char *)pndacl; - nsize = sizeof(struct cifs_acl); + nsize = sizeof(struct smb_acl); /* If pdacl is NULL, we don't have a src. Simply populate new ACL. */ if (!pdacl) { @@ -1103,12 +1103,12 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, } acl_base = (char *)pdacl; - size = sizeof(struct cifs_acl); + size = sizeof(struct smb_acl); src_num_aces = le32_to_cpu(pdacl->num_aces); /* Retain old ACEs which we can retain */ for (i = 0; i < src_num_aces; ++i) { - pntace = (struct cifs_ace *) (acl_base + size); + pntace = (struct smb_ace *) (acl_base + size); if (!new_aces_set && (pntace->flags & INHERITED_ACE)) { /* Place the new ACEs in between existing explicit and inherited */ @@ -1130,7 +1130,7 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, } /* update the pointer to the next ACE to populate*/ - pnntace = (struct cifs_ace *) (nacl_base + nsize); + pnntace = (struct smb_ace *) (nacl_base + nsize); nsize += cifs_copy_ace(pnntace, pntace, NULL); num_aces++; @@ -1156,7 +1156,7 @@ finalize_dacl: return 0; } -static int parse_sid(struct cifs_sid *psid, char *end_of_acl) +static int parse_sid(struct smb_sid *psid, char *end_of_acl) { /* BB need to add parm so we can store the SID BB */ @@ -1191,24 +1191,24 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl) /* Convert CIFS ACL to POSIX form */ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, - struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr, + struct smb_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr, bool get_mode_from_special_sid) { int rc = 0; - struct cifs_sid *owner_sid_ptr, *group_sid_ptr; - struct cifs_acl *dacl_ptr; /* no need for SACL ptr */ + struct smb_sid *owner_sid_ptr, *group_sid_ptr; + struct smb_acl *dacl_ptr; /* no need for SACL ptr */ char *end_of_acl = ((char *)pntsd) + acl_len; __u32 dacloffset; if (pntsd == NULL) return -EIO; - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + owner_sid_ptr = (struct smb_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + group_sid_ptr = (struct smb_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); dacloffset = le32_to_cpu(pntsd->dacloffset); - dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n", pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), le32_to_cpu(pntsd->gsidoffset), @@ -1249,7 +1249,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, } /* Convert permission bits from mode to equivalent CIFS ACL */ -static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, +static int build_sec_desc(struct smb_ntsd *pntsd, struct smb_ntsd *pnntsd, __u32 secdesclen, __u32 *pnsecdesclen, __u64 *pnmode, kuid_t uid, kgid_t gid, bool mode_from_sid, bool id_from_sid, int *aclflag) { @@ -1257,30 +1257,30 @@ static int build_sec_desc(struct 
cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, __u32 dacloffset; __u32 ndacloffset; __u32 sidsoffset; - struct cifs_sid *owner_sid_ptr, *group_sid_ptr; - struct cifs_sid *nowner_sid_ptr = NULL, *ngroup_sid_ptr = NULL; - struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ - struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ + struct smb_sid *owner_sid_ptr, *group_sid_ptr; + struct smb_sid *nowner_sid_ptr = NULL, *ngroup_sid_ptr = NULL; + struct smb_acl *dacl_ptr = NULL; /* no need for SACL ptr */ + struct smb_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ char *end_of_acl = ((char *)pntsd) + secdesclen; u16 size = 0; dacloffset = le32_to_cpu(pntsd->dacloffset); if (dacloffset) { - dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); if (end_of_acl < (char *)dacl_ptr + le16_to_cpu(dacl_ptr->size)) { cifs_dbg(VFS, "Server returned illegal ACL size\n"); return -EINVAL; } } - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + owner_sid_ptr = (struct smb_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + group_sid_ptr = (struct smb_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ - ndacloffset = sizeof(struct cifs_ntsd); - ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacloffset = sizeof(struct smb_ntsd); + ndacl_ptr = (struct smb_acl *)((char *)pnntsd + ndacloffset); ndacl_ptr->revision = dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION); @@ -1297,15 +1297,15 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, *aclflag |= CIFS_ACL_DACL; } else { - ndacloffset = sizeof(struct cifs_ntsd); - ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacloffset = sizeof(struct smb_ntsd); + ndacl_ptr = (struct smb_acl *)((char *)pnntsd + ndacloffset); ndacl_ptr->revision = dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION); ndacl_ptr->num_aces = dacl_ptr ? 
dacl_ptr->num_aces : 0; if (uid_valid(uid)) { /* chown */ uid_t id; - nowner_sid_ptr = kzalloc(sizeof(struct cifs_sid), + nowner_sid_ptr = kzalloc(sizeof(struct smb_sid), GFP_KERNEL); if (!nowner_sid_ptr) { rc = -ENOMEM; @@ -1334,7 +1334,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, } if (gid_valid(gid)) { /* chgrp */ gid_t id; - ngroup_sid_ptr = kzalloc(sizeof(struct cifs_sid), + ngroup_sid_ptr = kzalloc(sizeof(struct smb_sid), GFP_KERNEL); if (!ngroup_sid_ptr) { rc = -ENOMEM; @@ -1385,11 +1385,11 @@ chown_chgrp_exit: } #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY -struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, +struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, const struct cifs_fid *cifsfid, u32 *pacllen, u32 __maybe_unused unused) { - struct cifs_ntsd *pntsd = NULL; + struct smb_ntsd *pntsd = NULL; unsigned int xid; int rc; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -1410,10 +1410,10 @@ struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, return pntsd; } -static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, +static struct smb_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, u32 *pacllen) { - struct cifs_ntsd *pntsd = NULL; + struct smb_ntsd *pntsd = NULL; int oplock = 0; unsigned int xid; int rc; @@ -1454,11 +1454,11 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, } /* Retrieve an ACL from the server */ -struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, +struct smb_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, struct inode *inode, const char *path, u32 *pacllen, u32 info) { - struct cifs_ntsd *pntsd = NULL; + struct smb_ntsd *pntsd = NULL; struct cifsFileInfo *open_file = NULL; if (inode) @@ -1472,7 +1472,7 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, } /* Set an ACL on the server */ -int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, +int set_cifs_acl(struct smb_ntsd *pnntsd, __u32 acllen, struct inode *inode, const char *path, int aclflag) { int oplock = 0; @@ -1528,7 +1528,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, bool mode_from_special_sid, const char *path, const struct cifs_fid *pfid) { - struct cifs_ntsd *pntsd = NULL; + struct smb_ntsd *pntsd = NULL; u32 acllen = 0; int rc = 0; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -1580,9 +1580,9 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, __u32 secdesclen = 0; __u32 nsecdesclen = 0; __u32 dacloffset = 0; - struct cifs_acl *dacl_ptr = NULL; - struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ - struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ + struct smb_acl *dacl_ptr = NULL; + struct smb_ntsd *pntsd = NULL; /* acl obtained from server */ + struct smb_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); struct smb_version_operations *ops; @@ -1625,18 +1625,18 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, nsecdesclen = secdesclen; if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ if (mode_from_sid) - nsecdesclen += 2 * sizeof(struct cifs_ace); + nsecdesclen += 2 * sizeof(struct smb_ace); else /* cifsacl */ - nsecdesclen += 5 * sizeof(struct cifs_ace); + nsecdesclen += 5 * sizeof(struct smb_ace); } else { /* chown */ /* When ownership changes, changes 
new owner sid length could be different */ - nsecdesclen = sizeof(struct cifs_ntsd) + (sizeof(struct cifs_sid) * 2); + nsecdesclen = sizeof(struct smb_ntsd) + (sizeof(struct smb_sid) * 2); dacloffset = le32_to_cpu(pntsd->dacloffset); if (dacloffset) { - dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); if (mode_from_sid) nsecdesclen += - le32_to_cpu(dacl_ptr->num_aces) * sizeof(struct cifs_ace); + le32_to_cpu(dacl_ptr->num_aces) * sizeof(struct smb_ace); else /* cifsacl */ nsecdesclen += le16_to_cpu(dacl_ptr->size); } diff --git a/fs/smb/client/cifsacl.h b/fs/smb/client/cifsacl.h index ccbfc754bd3c..6529478b7f48 100644 --- a/fs/smb/client/cifsacl.h +++ b/fs/smb/client/cifsacl.h @@ -9,8 +9,7 @@ #ifndef _CIFSACL_H #define _CIFSACL_H -#define NUM_AUTHS (6) /* number of authority fields */ -#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */ +#include "../common/smbacl.h" #define READ_BIT 0x4 #define WRITE_BIT 0x2 @@ -23,101 +22,13 @@ #define UBITSHIFT 6 #define GBITSHIFT 3 -#define ACCESS_ALLOWED 0 -#define ACCESS_DENIED 1 - -#define SIDOWNER 1 -#define SIDGROUP 2 - /* * Security Descriptor length containing DACL with 3 ACEs (one each for * owner, group and world). */ -#define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \ - sizeof(struct cifs_acl) + \ - (sizeof(struct cifs_ace) * 4)) - -/* - * Maximum size of a string representation of a SID: - * - * The fields are unsigned values in decimal. So: - * - * u8: max 3 bytes in decimal - * u32: max 10 bytes in decimal - * - * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator - * - * For authority field, max is when all 6 values are non-zero and it must be - * represented in hex. So "-0x" + 12 hex digits. 
- * - * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-') - */ -#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1) -#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */ - -struct cifs_ntsd { - __le16 revision; /* revision level */ - __le16 type; - __le32 osidoffset; - __le32 gsidoffset; - __le32 sacloffset; - __le32 dacloffset; -} __attribute__((packed)); - -struct cifs_sid { - __u8 revision; /* revision level */ - __u8 num_subauth; - __u8 authority[NUM_AUTHS]; - __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */ -} __attribute__((packed)); - -/* size of a struct cifs_sid, sans sub_auth array */ -#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS) - -struct cifs_acl { - __le16 revision; /* revision level */ - __le16 size; - __le32 num_aces; -} __attribute__((packed)); - -/* ACE types - see MS-DTYP 2.4.4.1 */ -#define ACCESS_ALLOWED_ACE_TYPE 0x00 -#define ACCESS_DENIED_ACE_TYPE 0x01 -#define SYSTEM_AUDIT_ACE_TYPE 0x02 -#define SYSTEM_ALARM_ACE_TYPE 0x03 -#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 -#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 -#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 -#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 -#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 -#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 -#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A -#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B -#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C -#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D -#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */ -#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F -#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */ -#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11 -#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12 -#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13 - -/* ACE flags */ -#define OBJECT_INHERIT_ACE 0x01 -#define CONTAINER_INHERIT_ACE 0x02 -#define NO_PROPAGATE_INHERIT_ACE 0x04 -#define INHERIT_ONLY_ACE 0x08 -#define INHERITED_ACE 0x10 -#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40 -#define FAILED_ACCESS_ACE_FLAG 0x80 - -struct cifs_ace { - __u8 type; /* see above and MS-DTYP 2.4.4.1 */ - __u8 flags; - __le16 size; - __le32 access_req; - struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ -} __attribute__((packed)); +#define DEFAULT_SEC_DESC_LEN (sizeof(struct smb_ntsd) + \ + sizeof(struct smb_acl) + \ + (sizeof(struct smb_ace) * 4)) /* * The current SMB3 form of security descriptor is similar to what was used for @@ -194,6 +105,6 @@ struct owner_group_sids { * Minimum security descriptor can be one without any SACL and DACL and can * consist of revision, type, and two sids of minimum size for owner and group */ -#define MIN_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + (2 * MIN_SID_LEN)) +#define MIN_SEC_DESC_LEN (sizeof(struct smb_ntsd) + (2 * MIN_SID_LEN)) #endif /* _CIFSACL_H */ diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 6322f0f68a17..7481b21a0489 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -21,127 +21,21 @@ #include <linux/random.h> #include <linux/highmem.h> #include <linux/fips.h> +#include <linux/iov_iter.h> #include "../common/arc4.h" #include <crypto/aead.h> -/* - * Hash data from a BVEC-type iterator. 
- */ -static int cifs_shash_bvec(const struct iov_iter *iter, ssize_t maxsize, - struct shash_desc *shash) +static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - const struct bio_vec *bv = iter->bvec; - unsigned long start = iter->iov_offset; - unsigned int i; - void *p; - int ret; - - for (i = 0; i < iter->nr_segs; i++) { - size_t off, len; - - len = bv[i].bv_len; - if (start >= len) { - start -= len; - continue; - } - - len = min_t(size_t, maxsize, len - start); - off = bv[i].bv_offset + start; + struct shash_desc *shash = priv; + int ret, *pret = priv2; - p = kmap_local_page(bv[i].bv_page); - ret = crypto_shash_update(shash, p + off, len); - kunmap_local(p); - if (ret < 0) - return ret; - - maxsize -= len; - if (maxsize <= 0) - break; - start = 0; + ret = crypto_shash_update(shash, iter_base, len); + if (ret < 0) { + *pret = ret; + return len; } - - return 0; -} - -/* - * Hash data from a KVEC-type iterator. - */ -static int cifs_shash_kvec(const struct iov_iter *iter, ssize_t maxsize, - struct shash_desc *shash) -{ - const struct kvec *kv = iter->kvec; - unsigned long start = iter->iov_offset; - unsigned int i; - int ret; - - for (i = 0; i < iter->nr_segs; i++) { - size_t len; - - len = kv[i].iov_len; - if (start >= len) { - start -= len; - continue; - } - - len = min_t(size_t, maxsize, len - start); - ret = crypto_shash_update(shash, kv[i].iov_base + start, len); - if (ret < 0) - return ret; - maxsize -= len; - - if (maxsize <= 0) - break; - start = 0; - } - - return 0; -} - -/* - * Hash data from an XARRAY-type iterator. - */ -static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize, - struct shash_desc *shash) -{ - struct folio *folios[16], *folio; - unsigned int nr, i, j, npages; - loff_t start = iter->xarray_start + iter->iov_offset; - pgoff_t last, index = start / PAGE_SIZE; - ssize_t ret = 0; - size_t len, offset, foffset; - void *p; - - if (maxsize == 0) - return 0; - - last = (start + maxsize - 1) / PAGE_SIZE; - do { - nr = xa_extract(iter->xarray, (void **)folios, index, last, - ARRAY_SIZE(folios), XA_PRESENT); - if (nr == 0) - return -EIO; - - for (i = 0; i < nr; i++) { - folio = folios[i]; - npages = folio_nr_pages(folio); - foffset = start - folio_pos(folio); - offset = foffset % PAGE_SIZE; - for (j = foffset / PAGE_SIZE; j < npages; j++) { - len = min_t(size_t, maxsize, PAGE_SIZE - offset); - p = kmap_local_page(folio_page(folio, j)); - ret = crypto_shash_update(shash, p, len); - kunmap_local(p); - if (ret < 0) - return ret; - maxsize -= len; - if (maxsize <= 0) - return 0; - start += len; - offset = 0; - index++; - } - } - } while (nr == ARRAY_SIZE(folios)); return 0; } @@ -151,21 +45,13 @@ static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize, static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize, struct shash_desc *shash) { - if (maxsize == 0) - return 0; + struct iov_iter tmp_iter = *iter; + int err = -EIO; - switch (iov_iter_type(iter)) { - case ITER_BVEC: - return cifs_shash_bvec(iter, maxsize, shash); - case ITER_KVEC: - return cifs_shash_kvec(iter, maxsize, shash); - case ITER_XARRAY: - return cifs_shash_xarray(iter, maxsize, shash); - default: - pr_err("cifs_shash_iter(%u) unsupported\n", iov_iter_type(iter)); - WARN_ON_ONCE(1); - return -EIO; - } + if (iterate_and_advance_kernel(&tmp_iter, maxsize, shash, &err, + cifs_shash_step) != maxsize) + return err; + return 0; } int __cifs_calc_signature(struct smb_rqst *rqst, diff --git 
a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 2c4b357d85e2..2a2523c93944 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -75,9 +75,9 @@ unsigned int sign_CIFS_PDUs = 1; /* * Global transaction id (XID) information */ -unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ -unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ -unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +unsigned int GlobalCurrentXid; /* protected by GlobalMid_Lock */ +unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Lock */ +unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Lock */ spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */ /* @@ -1341,7 +1341,6 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, struct cifsFileInfo *smb_file_target; struct cifs_tcon *src_tcon; struct cifs_tcon *target_tcon; - unsigned long long destend, fstart, fend; ssize_t rc; cifs_dbg(FYI, "copychunk range\n"); @@ -1391,25 +1390,13 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, goto unlock; } - destend = destoff + len - 1; - - /* Flush the folios at either end of the destination range to prevent - * accidental loss of dirty data outside of the range. + /* Flush and invalidate all the folios in the destination region. If + * the copy was successful, then some of the flush is extra overhead, + * but we need to allow for the copy failing in some way (eg. ENOSPC). */ - fstart = destoff; - fend = destend; - - rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true); + rc = filemap_invalidate_inode(target_inode, true, destoff, destoff + len - 1); if (rc) goto unlock; - rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false); - if (rc) - goto unlock; - if (fend > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = fend + 1; - - /* Discard all the folios that overlap the destination region. 
*/ - truncate_inode_pages_range(&target_inode->i_data, fstart, fend); fscache_invalidate(cifs_inode_cookie(target_inode), NULL, i_size_read(target_inode), 0); diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index ca2bd204bcc5..61ded59b858f 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -106,7 +106,6 @@ extern int cifs_flush(struct file *, fl_owner_t id); extern int cifs_file_mmap(struct file *file, struct vm_area_struct *vma); extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma); extern const struct file_operations cifs_dir_ops; -extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, struct dir_context *ctx); /* Functions related to dir entries */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index f6d1f075987f..a71a988a92f9 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -202,10 +202,10 @@ struct cifs_cred { int gid; int mode; int cecount; - struct cifs_sid osid; - struct cifs_sid gsid; + struct smb_sid osid; + struct smb_sid gsid; struct cifs_ntace *ntaces; - struct cifs_ace *aces; + struct smb_ace *aces; }; struct cifs_open_info_data { @@ -231,8 +231,8 @@ struct cifs_open_info_data { unsigned int eas_len; } wsl; char *symlink_target; - struct cifs_sid posix_owner; - struct cifs_sid posix_group; + struct smb_sid posix_owner; + struct smb_sid posix_group; union { struct smb2_file_all_info fi; struct smb311_posix_qinfo posix_fi; @@ -254,9 +254,8 @@ struct cifs_open_info_data { struct smb_rqst { struct kvec *rq_iov; /* array of kvecs */ unsigned int rq_nvec; /* number of kvecs in array */ - size_t rq_iter_size; /* Amount of data in ->rq_iter */ struct iov_iter rq_iter; /* Data iterator */ - struct xarray rq_buffer; /* Page buffer for encryption */ + struct folio_queue *rq_buffer; /* Buffer for encryption */ }; struct mid_q_entry; @@ -345,7 +344,7 @@ struct smb_version_operations { /* connect to a server share */ int (*tree_connect)(const unsigned int, struct cifs_ses *, const char *, struct cifs_tcon *, const struct nls_table *); - /* close tree connecion */ + /* close tree connection */ int (*tree_disconnect)(const unsigned int, struct cifs_tcon *); /* get DFS referrals */ int (*get_dfs_refer)(const unsigned int, struct cifs_ses *, @@ -537,12 +536,12 @@ struct smb_version_operations { int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, const char *, const void *, const __u16, const struct nls_table *, struct cifs_sb_info *); - struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *, - const char *, u32 *, u32); - struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *, - const struct cifs_fid *, u32 *, u32); - int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *, - int); + struct smb_ntsd * (*get_acl)(struct cifs_sb_info *cifssb, struct inode *ino, + const char *patch, u32 *plen, u32 info); + struct smb_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *cifssmb, + const struct cifs_fid *pfid, u32 *plen, u32 info); + int (*set_acl)(struct smb_ntsd *pntsd, __u32 len, struct inode *ino, const char *path, + int flag); /* writepages retry size */ unsigned int (*wp_retry_size)(struct inode *); /* get mtu credits */ @@ -556,7 +555,7 @@ struct smb_version_operations { bool (*dir_needs_close)(struct cifsFileInfo *); long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t, loff_t); - /* init transform request - used for encryption for now */ + /* init transform (compress/encrypt) request */ 
int (*init_transform_rq)(struct TCP_Server_Info *, int num_rqst, struct smb_rqst *, struct smb_rqst *); int (*is_transform_hdr)(void *buf); @@ -816,7 +815,7 @@ struct TCP_Server_Info { * Protected by @refpath_lock and @srv_lock. The @refpath_lock is * mostly used for not requiring a copy of @leaf_fullpath when getting * cached or new DFS referrals (which might also sleep during I/O). - * While @srv_lock is held for making string and NULL comparions against + * While @srv_lock is held for making string and NULL comparisons against * both fields as in mount(2) and cache refresh. * * format: \\HOST\SHARE[\OPTIONAL PATH] @@ -1550,7 +1549,6 @@ struct cifsInodeInfo { #define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */ #define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */ #define CIFS_INO_LOCK (5) /* lock bit for synchronization */ -#define CIFS_INO_MODIFIED_ATTR (6) /* Indicate change in mtime/ctime */ #define CIFS_INO_CLOSE_ON_LOCK (7) /* Not to defer the close when lock is set */ unsigned long flags; spinlock_t writers_lock; @@ -1876,12 +1874,13 @@ static inline bool is_replayable_error(int error) #define CIFS_HAS_CREDITS 0x0400 /* already has credits */ #define CIFS_TRANSFORM_REQ 0x0800 /* transform request before sending */ #define CIFS_NO_SRV_RSP 0x1000 /* there is no server response */ +#define CIFS_COMPRESS_REQ 0x4000 /* compress request before sending */ /* Security Flags: indicate type of session setup needed */ #define CIFSSEC_MAY_SIGN 0x00001 #define CIFSSEC_MAY_NTLMV2 0x00004 #define CIFSSEC_MAY_KRB5 0x00008 -#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */ +#define CIFSSEC_MAY_SEAL 0x00040 #define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */ #define CIFSSEC_MUST_SIGN 0x01001 @@ -1891,11 +1890,11 @@ require use of the stronger protocol */ #define CIFSSEC_MUST_NTLMV2 0x04004 #define CIFSSEC_MUST_KRB5 0x08008 #ifdef CONFIG_CIFS_UPCALL -#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */ +#define CIFSSEC_MASK 0xCF0CF /* flags supported if no weak allowed */ #else -#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */ +#define CIFSSEC_MASK 0xC70C7 /* flags supported if no weak allowed */ #endif /* UPCALL */ -#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ +#define CIFSSEC_MUST_SEAL 0x40040 #define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ #define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP | CIFSSEC_MAY_SEAL) @@ -2017,9 +2016,9 @@ extern spinlock_t cifs_tcp_ses_lock; /* * Global transaction id (XID) information */ -extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ -extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ -extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Lock */ +extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Lock */ +extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Lock */ extern spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */ /* diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index a2072ab9e586..c3b6263060b0 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -2573,12 +2573,6 @@ typedef struct { } __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */ -struct win_dev { - unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO or LnxSOCK */ - __le64 major; - __le64 minor; -} __attribute__((packed)); - 
struct fea { unsigned char EA_flags; __u8 name_len; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 497bf3c447bc..c69e3f48a60c 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -225,7 +225,7 @@ extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs, extern int cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, const unsigned int xid); -extern int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, +extern int sid_to_id(struct cifs_sb_info *cifs_sb, struct smb_sid *psid, struct cifs_fattr *fattr, uint sidtype); extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, @@ -233,19 +233,19 @@ extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, const char *path, const struct cifs_fid *pfid); extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, kuid_t uid, kgid_t gid); -extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, - const char *, u32 *, u32); -extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, - const struct cifs_fid *, u32 *, u32); +extern struct smb_ntsd *get_cifs_acl(struct cifs_sb_info *cifssmb, struct inode *ino, + const char *path, u32 *plen, u32 info); +extern struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifssb, + const struct cifs_fid *pfid, u32 *plen, u32 info); extern struct posix_acl *cifs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type); extern int cifs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); -extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, - const char *, int); -extern unsigned int setup_authusers_ACE(struct cifs_ace *pace); -extern unsigned int setup_special_mode_ACE(struct cifs_ace *pace, __u64 nmode); -extern unsigned int setup_special_user_owner_ACE(struct cifs_ace *pace); +extern int set_cifs_acl(struct smb_ntsd *pntsd, __u32 len, struct inode *ino, + const char *path, int flag); +extern unsigned int setup_authusers_ACE(struct smb_ace *pace); +extern unsigned int setup_special_mode_ACE(struct smb_ace *pace, __u64 nmode); +extern unsigned int setup_special_user_owner_ACE(struct smb_ace *pace); extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, @@ -570,9 +570,9 @@ extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nls_codepage, struct cifs_sb_info *cifs_sb); extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, - __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); + __u16 fid, struct smb_ntsd **acl_inf, __u32 *buflen); extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16, - struct cifs_ntsd *, __u32, int); + struct smb_ntsd *pntsd, __u32 len, int aclflag); extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, struct posix_acl **acl, const int acl_type, @@ -676,6 +676,10 @@ char *extract_sharename(const char *unc); int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, bool unicode, struct cifs_open_info_data *data); +int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev, + const char *symname); int cifs_sfu_make_node(unsigned int xid, struct inode *inode, struct dentry 
*dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 595c4b673707..131f20b91c3e 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1076,8 +1076,8 @@ OldOpenRetry: pSMB->OpenFlags |= cpu_to_le16(REQ_MORE_INFO); pSMB->Mode = cpu_to_le16(access_flags_to_smbopen_mode(access_flags)); pSMB->Mode |= cpu_to_le16(0x40); /* deny none */ - /* set file as system file if special file such - as fifo and server expecting SFU style and + /* set file as system file if special file such as fifo, + * socket, char or block and server expecting SFU style and no Unix extensions */ if (create_options & CREATE_OPTION_SPECIAL) @@ -1193,8 +1193,8 @@ openRetry: req->AllocationSize = 0; /* - * Set file as system file if special file such as fifo and server - * expecting SFU style and no Unix extensions. + * Set file as system file if special file such as fifo, socket, char + * or block and server expecting SFU style and no Unix extensions. */ if (create_options & CREATE_OPTION_SPECIAL) req->FileAttributes = cpu_to_le32(ATTR_SYSTEM); @@ -1261,16 +1261,30 @@ openRetry: return rc; } +static void cifs_readv_worker(struct work_struct *work) +{ + struct cifs_io_subrequest *rdata = + container_of(work, struct cifs_io_subrequest, subreq.work); + + netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false); +} + static void cifs_readv_callback(struct mid_q_entry *mid) { struct cifs_io_subrequest *rdata = mid->callback_data; + struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode); struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_iter = rdata->subreq.io_iter }; - struct cifs_credits credits = { .value = 1, .instance = 0 }; + struct cifs_credits credits = { + .value = 1, + .instance = 0, + .rreq_debug_id = rdata->rreq->debug_id, + .rreq_debug_index = rdata->subreq.debug_index, + }; cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n", __func__, mid->mid, mid->mid_state, rdata->result, @@ -1282,6 +1296,7 @@ cifs_readv_callback(struct mid_q_entry *mid) if (server->sign) { int rc = 0; + iov_iter_truncate(&rqst.rq_iter, rdata->got_bytes); rc = cifs_verify_signature(&rqst, server, mid->sequence_number); if (rc) @@ -1306,13 +1321,22 @@ cifs_readv_callback(struct mid_q_entry *mid) rdata->result = -EIO; } - if (rdata->result == 0 || rdata->result == -EAGAIN) - iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes); + if (rdata->result == -ENODATA) { + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + rdata->result = 0; + } else { + size_t trans = rdata->subreq.transferred + rdata->got_bytes; + if (trans < rdata->subreq.len && + rdata->subreq.start + trans == ictx->remote_i_size) { + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + rdata->result = 0; + } + } + rdata->credits.value = 0; - netfs_subreq_terminated(&rdata->subreq, - (rdata->result == 0 || rdata->result == -EAGAIN) ? 
- rdata->got_bytes : rdata->result, - false); + rdata->subreq.transferred += rdata->got_bytes; + INIT_WORK(&rdata->subreq.work, cifs_readv_worker); + queue_work(cifsiod_wq, &rdata->subreq.work); release_mid(mid); add_credits(server, &credits, 0); } @@ -1619,9 +1643,15 @@ static void cifs_writev_callback(struct mid_q_entry *mid) { struct cifs_io_subrequest *wdata = mid->callback_data; + struct TCP_Server_Info *server = wdata->server; struct cifs_tcon *tcon = tlink_tcon(wdata->req->cfile->tlink); WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; - struct cifs_credits credits = { .value = 1, .instance = 0 }; + struct cifs_credits credits = { + .value = 1, + .instance = 0, + .rreq_debug_id = wdata->rreq->debug_id, + .rreq_debug_index = wdata->subreq.debug_index, + }; ssize_t result; size_t written; @@ -1657,9 +1687,16 @@ cifs_writev_callback(struct mid_q_entry *mid) break; } + trace_smb3_rw_credits(credits.rreq_debug_id, credits.rreq_debug_index, + wdata->credits.value, + server->credits, server->in_flight, + 0, cifs_trace_rw_credits_write_response_clear); wdata->credits.value = 0; cifs_write_subrequest_terminated(wdata, result, true); release_mid(mid); + trace_smb3_rw_credits(credits.rreq_debug_id, credits.rreq_debug_index, 0, + server->credits, server->in_flight, + credits.value, cifs_trace_rw_credits_write_response_add); add_credits(tcon->ses->server, &credits, 0); } @@ -1713,7 +1750,6 @@ cifs_async_writev(struct cifs_io_subrequest *wdata) rqst.rq_iov = iov; rqst.rq_nvec = 2; rqst.rq_iter = wdata->subreq.io_iter; - rqst.rq_iter_size = iov_iter_count(&wdata->subreq.io_iter); cifs_dbg(FYI, "async write at %llu %zu bytes\n", wdata->subreq.start, wdata->subreq.len); @@ -3391,7 +3427,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata, /* Get Security Descriptor (by handle) from remote server for a file or dir */ int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, - struct cifs_ntsd **acl_inf, __u32 *pbuflen) + struct smb_ntsd **acl_inf, __u32 *pbuflen) { int rc = 0; int buf_type = 0; @@ -3461,7 +3497,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, /* check if buffer is big enough for the acl header followed by the smallest SID */ - if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) || + if ((*pbuflen < sizeof(struct smb_ntsd) + 8) || (*pbuflen >= 64 * 1024)) { cifs_dbg(VFS, "bad acl length %d\n", *pbuflen); rc = -EINVAL; @@ -3481,7 +3517,7 @@ qsec_out: int CIFSSMBSetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, - struct cifs_ntsd *pntsd, __u32 acllen, int aclflag) + struct smb_ntsd *pntsd, __u32 acllen, int aclflag) { __u16 byte_count, param_count, data_count, param_offset, data_offset; int rc = 0; diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c new file mode 100644 index 000000000000..63b5a55b7a57 --- /dev/null +++ b/fs/smb/client/compress.c @@ -0,0 +1,390 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024, SUSE LLC + * + * Authors: Enzo Matsumiya <ematsumiya@suse.de> + * + * This file implements I/O compression support for SMB2 messages (SMB 3.1.1 only). + * See compress/ for implementation details of each algorithm. 
+ * + * References: + * MS-SMB2 "3.1.4.4 Compressing the Message" + * MS-SMB2 "3.1.5.3 Decompressing the Chained Message" + * MS-XCA - for details of the supported algorithms + */ +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/uio.h> +#include <linux/sort.h> + +#include "cifsglob.h" +#include "../common/smb2pdu.h" +#include "cifsproto.h" +#include "smb2proto.h" + +#include "compress/lz77.h" +#include "compress.h" + +/* + * The heuristic_*() functions below try to determine data compressibility. + * + * Derived from fs/btrfs/compression.c, changing coding style, some parameters, and removing + * unused parts. + * + * Read that file for better and more detailed explanation of the calculations. + * + * The algorithms are ran in a collected sample of the input (uncompressed) data. + * The sample is formed of 2K reads in PAGE_SIZE intervals, with a maximum size of 4M. + * + * Parsing the sample goes from "low-hanging fruits" (fastest algorithms, likely compressible) + * to "need more analysis" (likely uncompressible). + */ + +struct bucket { + unsigned int count; +}; + +/** + * has_low_entropy() - Compute Shannon entropy of the sampled data. + * @bkt: Bytes counts of the sample. + * @slen: Size of the sample. + * + * Return: true if the level (percentage of number of bits that would be required to + * compress the data) is below the minimum threshold. + * + * Note: + * There _is_ an entropy level here that's > 65 (minimum threshold) that would indicate a + * possibility of compression, but compressing, or even further analysing, it would waste so much + * resources that it's simply not worth it. + * + * Also Shannon entropy is the last computed heuristic; if we got this far and ended up + * with uncertainty, just stay on the safe side and call it uncompressible. + */ +static bool has_low_entropy(struct bucket *bkt, size_t slen) +{ + const size_t threshold = 65, max_entropy = 8 * ilog2(16); + size_t i, p, p2, len, sum = 0; + +#define pow4(n) (n * n * n * n) + len = ilog2(pow4(slen)); + + for (i = 0; i < 256 && bkt[i].count > 0; i++) { + p = bkt[i].count; + p2 = ilog2(pow4(p)); + sum += p * (len - p2); + } + + sum /= slen; + + return ((sum * 100 / max_entropy) <= threshold); +} + +#define BYTE_DIST_BAD 0 +#define BYTE_DIST_GOOD 1 +#define BYTE_DIST_MAYBE 2 +/** + * calc_byte_distribution() - Compute byte distribution on the sampled data. + * @bkt: Byte counts of the sample. + * @slen: Size of the sample. + * + * Return: + * BYTE_DIST_BAD: A "hard no" for compression -- a computed uniform distribution of + * the bytes (e.g. random or encrypted data). + * BYTE_DIST_GOOD: High probability (normal (Gaussian) distribution) of the data being + * compressible. + * BYTE_DIST_MAYBE: When computed byte distribution resulted in "low > n < high" + * grounds. has_low_entropy() should be used for a final decision. + */ +static int calc_byte_distribution(struct bucket *bkt, size_t slen) +{ + const size_t low = 64, high = 200, threshold = slen * 90 / 100; + size_t sum = 0; + int i; + + for (i = 0; i < low; i++) + sum += bkt[i].count; + + if (sum > threshold) + return BYTE_DIST_BAD; + + for (; i < high && bkt[i].count > 0; i++) { + sum += bkt[i].count; + if (sum > threshold) + break; + } + + if (i <= low) + return BYTE_DIST_GOOD; + + if (i >= high) + return BYTE_DIST_BAD; + + return BYTE_DIST_MAYBE; +} + +static bool is_mostly_ascii(const struct bucket *bkt) +{ + size_t count = 0; + int i; + + for (i = 0; i < 256; i++) + if (bkt[i].count > 0) + /* Too many non-ASCII (0-63) bytes. 
*/ + if (++count > 64) + return false; + + return true; +} + +static bool has_repeated_data(const u8 *sample, size_t len) +{ + size_t s = len / 2; + + return (!memcmp(&sample[0], &sample[s], s)); +} + +static int cmp_bkt(const void *_a, const void *_b) +{ + const struct bucket *a = _a, *b = _b; + + /* Reverse sort. */ + if (a->count > b->count) + return -1; + + return 1; +} + +/* + * TODO: + * Support other iter types, if required. + * Only ITER_XARRAY is supported for now. + */ +static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample) +{ + struct folio *folios[16], *folio; + unsigned int nr, i, j, npages; + loff_t start = iter->xarray_start + iter->iov_offset; + pgoff_t last, index = start / PAGE_SIZE; + size_t len, off, foff; + ssize_t ret = 0; + void *p; + int s = 0; + + last = (start + max - 1) / PAGE_SIZE; + do { + nr = xa_extract(iter->xarray, (void **)folios, index, last, ARRAY_SIZE(folios), + XA_PRESENT); + if (nr == 0) + return -EIO; + + for (i = 0; i < nr; i++) { + folio = folios[i]; + npages = folio_nr_pages(folio); + foff = start - folio_pos(folio); + off = foff % PAGE_SIZE; + + for (j = foff / PAGE_SIZE; j < npages; j++) { + size_t len2; + + len = min_t(size_t, max, PAGE_SIZE - off); + len2 = min_t(size_t, len, SZ_2K); + + p = kmap_local_page(folio_page(folio, j)); + memcpy(&sample[s], p, len2); + kunmap_local(p); + + if (ret < 0) + return ret; + + s += len2; + + if (len2 < SZ_2K || s >= max - SZ_2K) + return s; + + max -= len; + if (max <= 0) + return s; + + start += len; + off = 0; + index++; + } + } + } while (nr == ARRAY_SIZE(folios)); + + return s; +} + +/** + * is_compressible() - Determines if a chunk of data is compressible. + * @data: Iterator containing uncompressed data. + * + * Return: true if @data is compressible, false otherwise. + * + * Tests shows that this function is quite reliable in predicting data compressibility, + * matching close to 1:1 with the behaviour of LZ77 compression success and failures. + */ +static bool is_compressible(const struct iov_iter *data) +{ + const size_t read_size = SZ_2K, bkt_size = 256, max = SZ_4M; + struct bucket *bkt = NULL; + size_t len; + u8 *sample; + bool ret = false; + int i; + + /* Preventive double check -- already checked in should_compress(). */ + len = iov_iter_count(data); + if (unlikely(len < read_size)) + return ret; + + if (len - read_size > max) + len = max; + + sample = kvzalloc(len, GFP_KERNEL); + if (!sample) { + WARN_ON_ONCE(1); + + return ret; + } + + /* Sample 2K bytes per page of the uncompressed data. 
*/ + i = collect_sample(data, len, sample); + if (i <= 0) { + WARN_ON_ONCE(1); + + goto out; + } + + len = i; + ret = true; + + if (has_repeated_data(sample, len)) + goto out; + + bkt = kcalloc(bkt_size, sizeof(*bkt), GFP_KERNEL); + if (!bkt) { + WARN_ON_ONCE(1); + ret = false; + + goto out; + } + + for (i = 0; i < len; i++) + bkt[sample[i]].count++; + + if (is_mostly_ascii(bkt)) + goto out; + + /* Sort in descending order */ + sort(bkt, bkt_size, sizeof(*bkt), cmp_bkt, NULL); + + i = calc_byte_distribution(bkt, len); + if (i != BYTE_DIST_MAYBE) { + ret = !!i; + + goto out; + } + + ret = has_low_entropy(bkt, len); +out: + kvfree(sample); + kfree(bkt); + + return ret; +} + +bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq) +{ + const struct smb2_hdr *shdr = rq->rq_iov->iov_base; + + if (unlikely(!tcon || !tcon->ses || !tcon->ses->server)) + return false; + + if (!tcon->ses->server->compression.enabled) + return false; + + if (!(tcon->share_flags & SMB2_SHAREFLAG_COMPRESS_DATA)) + return false; + + if (shdr->Command == SMB2_WRITE) { + const struct smb2_write_req *wreq = rq->rq_iov->iov_base; + + if (le32_to_cpu(wreq->Length) < SMB_COMPRESS_MIN_LEN) + return false; + + return is_compressible(&rq->rq_iter); + } + + return (shdr->Command == SMB2_READ); +} + +int smb_compress(struct TCP_Server_Info *server, struct smb_rqst *rq, compress_send_fn send_fn) +{ + struct iov_iter iter; + u32 slen, dlen; + void *src, *dst = NULL; + int ret; + + if (!server || !rq || !rq->rq_iov || !rq->rq_iov->iov_base) + return -EINVAL; + + if (rq->rq_iov->iov_len != sizeof(struct smb2_write_req)) + return -EINVAL; + + slen = iov_iter_count(&rq->rq_iter); + src = kvzalloc(slen, GFP_KERNEL); + if (!src) { + ret = -ENOMEM; + goto err_free; + } + + /* Keep the original iter intact. */ + iter = rq->rq_iter; + + if (!copy_from_iter_full(src, slen, &iter)) { + ret = -EIO; + goto err_free; + } + + /* + * This is just overprovisioning, as the algorithm will error out if @dst reaches 7/8 + * of @slen. + */ + dlen = slen; + dst = kvzalloc(dlen, GFP_KERNEL); + if (!dst) { + ret = -ENOMEM; + goto err_free; + } + + ret = lz77_compress(src, slen, dst, &dlen); + if (!ret) { + struct smb2_compression_hdr hdr = { 0 }; + struct smb_rqst comp_rq = { .rq_nvec = 3, }; + struct kvec iov[3]; + + hdr.ProtocolId = SMB2_COMPRESSION_TRANSFORM_ID; + hdr.OriginalCompressedSegmentSize = cpu_to_le32(slen); + hdr.CompressionAlgorithm = SMB3_COMPRESS_LZ77; + hdr.Flags = SMB2_COMPRESSION_FLAG_NONE; + hdr.Offset = cpu_to_le32(rq->rq_iov[0].iov_len); + + iov[0].iov_base = &hdr; + iov[0].iov_len = sizeof(hdr); + iov[1] = rq->rq_iov[0]; + iov[2].iov_base = dst; + iov[2].iov_len = dlen; + + comp_rq.rq_iov = iov; + + ret = send_fn(server, 1, &comp_rq); + } else if (ret == -EMSGSIZE || dlen >= slen) { + ret = send_fn(server, 1, rq); + } +err_free: + kvfree(dst); + kvfree(src); + + return ret; +} diff --git a/fs/smb/client/compress.h b/fs/smb/client/compress.h new file mode 100644 index 000000000000..f3ed1d3e52fb --- /dev/null +++ b/fs/smb/client/compress.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024, SUSE LLC + * + * Authors: Enzo Matsumiya <ematsumiya@suse.de> + * + * This file implements I/O compression support for SMB2 messages (SMB 3.1.1 only). + * See compress/ for implementation details of each algorithm. 
+ * + * References: + * MS-SMB2 "3.1.4.4 Compressing the Message" - for compression details + * MS-SMB2 "3.1.5.3 Decompressing the Chained Message" - for decompression details + * MS-XCA - for details of the supported algorithms + */ +#ifndef _SMB_COMPRESS_H +#define _SMB_COMPRESS_H + +#include <linux/uio.h> +#include <linux/kernel.h> +#include "../common/smb2pdu.h" +#include "cifsglob.h" + +/* sizeof(smb2_compression_hdr) - sizeof(OriginalPayloadSize) */ +#define SMB_COMPRESS_HDR_LEN 16 +/* sizeof(smb2_compression_payload_hdr) - sizeof(OriginalPayloadSize) */ +#define SMB_COMPRESS_PAYLOAD_HDR_LEN 8 +#define SMB_COMPRESS_MIN_LEN PAGE_SIZE + +#ifdef CONFIG_CIFS_COMPRESSION +typedef int (*compress_send_fn)(struct TCP_Server_Info *, int, struct smb_rqst *); + +int smb_compress(struct TCP_Server_Info *server, struct smb_rqst *rq, compress_send_fn send_fn); + +/** + * should_compress() - Determines if a request (write) or the response to a + * request (read) should be compressed. + * @tcon: tcon of the request is being sent to + * @rqst: request to evaluate + * + * Return: true iff: + * - compression was successfully negotiated with server + * - server has enabled compression for the share + * - it's a read or write request + * - (write only) request length is >= SMB_COMPRESS_MIN_LEN + * - (write only) is_compressible() returns 1 + * + * Return false otherwise. + */ +bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq); + +/** + * smb_compress_alg_valid() - Validate a compression algorithm. + * @alg: Compression algorithm to check. + * @valid_none: Conditional check whether NONE algorithm should be + * considered valid or not. + * + * If @alg is SMB3_COMPRESS_NONE, this function returns @valid_none. + * + * Note that 'NONE' (0) compressor type is considered invalid in protocol + * negotiation, as it's never requested to/returned from the server. + * + * Return: true if @alg is valid/supported, false otherwise. + */ +static __always_inline int smb_compress_alg_valid(__le16 alg, bool valid_none) +{ + if (alg == SMB3_COMPRESS_NONE) + return valid_none; + + if (alg == SMB3_COMPRESS_LZ77 || alg == SMB3_COMPRESS_PATTERN) + return true; + + return false; +} +#else /* !CONFIG_CIFS_COMPRESSION */ +static inline int smb_compress(void *unused1, void *unused2, void *unused3) +{ + return -EOPNOTSUPP; +} + +static inline bool should_compress(void *unused1, void *unused2) +{ + return false; +} + +static inline int smb_compress_alg_valid(__le16 unused1, bool unused2) +{ + return -EOPNOTSUPP; +} +#endif /* !CONFIG_CIFS_COMPRESSION */ +#endif /* _SMB_COMPRESS_H */ diff --git a/fs/smb/client/compress/lz77.c b/fs/smb/client/compress/lz77.c new file mode 100644 index 000000000000..553e253ada29 --- /dev/null +++ b/fs/smb/client/compress/lz77.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024, SUSE LLC + * + * Authors: Enzo Matsumiya <ematsumiya@suse.de> + * + * Implementation of the LZ77 "plain" compression algorithm, as per MS-XCA spec. + */ +#include <linux/slab.h> +#include <linux/sizes.h> +#include <linux/count_zeros.h> +#include <asm/unaligned.h> + +#include "lz77.h" + +/* + * Compression parameters. 
+ */ +#define LZ77_MATCH_MIN_LEN 4 +#define LZ77_MATCH_MIN_DIST 1 +#define LZ77_MATCH_MAX_DIST SZ_1K +#define LZ77_HASH_LOG 15 +#define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG) +#define LZ77_STEP_SIZE sizeof(u64) + +static __always_inline u8 lz77_read8(const u8 *ptr) +{ + return get_unaligned(ptr); +} + +static __always_inline u64 lz77_read64(const u64 *ptr) +{ + return get_unaligned(ptr); +} + +static __always_inline void lz77_write8(u8 *ptr, u8 v) +{ + put_unaligned(v, ptr); +} + +static __always_inline void lz77_write16(u16 *ptr, u16 v) +{ + put_unaligned_le16(v, ptr); +} + +static __always_inline void lz77_write32(u32 *ptr, u32 v) +{ + put_unaligned_le32(v, ptr); +} + +static __always_inline u32 lz77_match_len(const void *wnd, const void *cur, const void *end) +{ + const void *start = cur; + u64 diff; + + /* Safe for a do/while because otherwise we wouldn't reach here from the main loop. */ + do { + diff = lz77_read64(cur) ^ lz77_read64(wnd); + if (!diff) { + cur += LZ77_STEP_SIZE; + wnd += LZ77_STEP_SIZE; + + continue; + } + + /* This computes the number of common bytes in @diff. */ + cur += count_trailing_zeros(diff) >> 3; + + return (cur - start); + } while (likely(cur + LZ77_STEP_SIZE < end)); + + while (cur < end && lz77_read8(cur++) == lz77_read8(wnd++)) + ; + + return (cur - start); +} + +static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u32 len) +{ + len -= 3; + dist--; + dist <<= 3; + + if (len < 7) { + lz77_write16(dst, dist + len); + + return dst + 2; + } + + dist |= 7; + lz77_write16(dst, dist); + dst += 2; + len -= 7; + + if (!*nib) { + lz77_write8(dst, umin(len, 15)); + *nib = dst; + dst++; + } else { + u8 *b = *nib; + + lz77_write8(b, *b | umin(len, 15) << 4); + *nib = NULL; + } + + if (len < 15) + return dst; + + len -= 15; + if (len < 255) { + lz77_write8(dst, len); + + return dst + 1; + } + + lz77_write8(dst, 0xff); + dst++; + len += 7 + 15; + if (len <= 0xffff) { + lz77_write16(dst, len); + + return dst + 2; + } + + lz77_write16(dst, 0); + dst += 2; + lz77_write32(dst, len); + + return dst + 4; +} + +noinline int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen) +{ + const void *srcp, *end; + void *dstp, *nib, *flag_pos; + u32 flag_count = 0; + long flag = 0; + u64 *htable; + + srcp = src; + end = src + slen; + dstp = dst; + nib = NULL; + flag_pos = dstp; + dstp += 4; + + htable = kvcalloc(LZ77_HASH_SIZE, sizeof(*htable), GFP_KERNEL); + if (!htable) + return -ENOMEM; + + /* Main loop. */ + do { + u32 dist, len = 0; + const void *wnd; + u64 hash; + + hash = ((lz77_read64(srcp) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG); + wnd = src + htable[hash]; + htable[hash] = srcp - src; + dist = srcp - wnd; + + if (dist && dist < LZ77_MATCH_MAX_DIST) + len = lz77_match_len(wnd, srcp, end); + + if (len < LZ77_MATCH_MIN_LEN) { + lz77_write8(dstp, lz77_read8(srcp)); + + dstp++; + srcp++; + + flag <<= 1; + flag_count++; + if (flag_count == 32) { + lz77_write32(flag_pos, flag); + flag_count = 0; + flag_pos = dstp; + dstp += 4; + } + + continue; + } + + /* + * Bail out if @dstp reached >= 7/8 of @slen -- already compressed badly, not worth + * going further. 
+ */ + if (unlikely(dstp - dst >= slen - (slen >> 3))) { + *dlen = slen; + goto out; + } + + dstp = lz77_write_match(dstp, &nib, dist, len); + srcp += len; + + flag = (flag << 1) | 1; + flag_count++; + if (flag_count == 32) { + lz77_write32(flag_pos, flag); + flag_count = 0; + flag_pos = dstp; + dstp += 4; + } + } while (likely(srcp + LZ77_STEP_SIZE < end)); + + while (srcp < end) { + u32 c = umin(end - srcp, 32 - flag_count); + + memcpy(dstp, srcp, c); + + dstp += c; + srcp += c; + + flag <<= c; + flag_count += c; + if (flag_count == 32) { + lz77_write32(flag_pos, flag); + flag_count = 0; + flag_pos = dstp; + dstp += 4; + } + } + + flag <<= (32 - flag_count); + flag |= (1 << (32 - flag_count)) - 1; + lz77_write32(flag_pos, flag); + + *dlen = dstp - dst; +out: + kvfree(htable); + + if (*dlen < slen) + return 0; + + return -EMSGSIZE; +} diff --git a/fs/smb/client/compress/lz77.h b/fs/smb/client/compress/lz77.h new file mode 100644 index 000000000000..cdcb191b48a2 --- /dev/null +++ b/fs/smb/client/compress/lz77.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024, SUSE LLC + * + * Authors: Enzo Matsumiya <ematsumiya@suse.de> + * + * Implementation of the LZ77 "plain" compression algorithm, as per MS-XCA spec. + */ +#ifndef _SMB_COMPRESS_LZ77_H +#define _SMB_COMPRESS_LZ77_H + +#include <linux/kernel.h> + +int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen); +#endif /* _SMB_COMPRESS_LZ77_H */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index d2307162a2de..08a41c7aaf72 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -657,6 +657,19 @@ static bool server_unresponsive(struct TCP_Server_Info *server) { /* + * If we're in the process of mounting a share or reconnecting a session + * and the server abruptly shut down (e.g. socket wasn't closed, packet + * had been ACK'ed but no SMB response), don't wait longer than 20s to + * negotiate protocol. + */ + spin_lock(&server->srv_lock); + if (server->tcpStatus == CifsInNegotiate && + time_after(jiffies, server->lstrp + 20 * HZ)) { + spin_unlock(&server->srv_lock); + cifs_reconnect(server, false); + return true; + } + /* * We need to wait 3 echo intervals to make sure we handle such * situations right: * 1s client sends a normal SMB request @@ -667,7 +680,6 @@ server_unresponsive(struct TCP_Server_Info *server) * 65s kernel_recvmsg times out, and we see that we haven't gotten * a response in >60s. */ - spin_lock(&server->srv_lock); if ((server->tcpStatus == CifsGood || server->tcpStatus == CifsNeedNegotiate) && (!server->ops->can_echo || server->ops->can_echo(server)) && @@ -997,11 +1009,10 @@ clean_demultiplex_info(struct TCP_Server_Info *server) } if (!list_empty(&server->pending_mid_q)) { - struct list_head dispose_list; struct mid_q_entry *mid_entry; struct list_head *tmp, *tmp2; + LIST_HEAD(dispose_list); - INIT_LIST_HEAD(&dispose_list); spin_lock(&server->mid_lock); list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); @@ -4069,7 +4080,7 @@ __cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ses = cifs_get_smb_ses(master_tcon->ses->server, ctx); if (IS_ERR(ses)) { - tcon = (struct cifs_tcon *)ses; + tcon = ERR_CAST(ses); cifs_put_tcp_session(master_tcon->ses->server, 0); goto out; } @@ -4194,6 +4205,9 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) * * If one doesn't exist then insert a new tcon_link struct into the tree and * try to construct a new one. 
+ * + * REMEMBER to call cifs_put_tlink() after successful calls to cifs_sb_tlink, + * to avoid refcount issues */ struct tcon_link * cifs_sb_tlink(struct cifs_sb_info *cifs_sb) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index b2405dd4d4d4..78b59c4ef3ce 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -49,6 +49,7 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) struct cifs_io_subrequest *wdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = wdata->req; + struct netfs_io_stream *stream = &req->rreq.io_streams[subreq->stream_nr]; struct TCP_Server_Info *server; struct cifsFileInfo *open_file = req->cfile; size_t wsize = req->rreq.wsize; @@ -73,7 +74,7 @@ retry: } } - rc = server->ops->wait_mtu_credits(server, wsize, &wdata->subreq.max_len, + rc = server->ops->wait_mtu_credits(server, wsize, &stream->sreq_max_len, &wdata->credits); if (rc < 0) { subreq->error = rc; @@ -92,7 +93,7 @@ retry: #ifdef CONFIG_CIFS_SMB_DIRECT if (server->smbd_conn) - subreq->max_nr_segs = server->smbd_conn->max_frmr_depth; + stream->sreq_max_segs = server->smbd_conn->max_frmr_depth; #endif } @@ -139,25 +140,22 @@ static void cifs_netfs_invalidate_cache(struct netfs_io_request *wreq) } /* - * Split the read up according to how many credits we can get for each piece. - * It's okay to sleep here if we need to wait for more credit to become - * available. - * - * We also choose the server and allocate an operation ID to be cleaned up - * later. + * Negotiate the size of a read operation on behalf of the netfs library. */ -static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) +static int cifs_prepare_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); struct TCP_Server_Info *server = req->server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); - size_t rsize = 0; - int rc; + size_t size; + int rc = 0; - rdata->xid = get_xid(); - rdata->have_xid = true; + if (!rdata->have_xid) { + rdata->xid = get_xid(); + rdata->have_xid = true; + } rdata->server = server; if (cifs_sb->ctx->rsize == 0) @@ -165,13 +163,12 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) server->ops->negotiate_rsize(tlink_tcon(req->cfile->tlink), cifs_sb->ctx); + rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, + &size, &rdata->credits); + if (rc) + return rc; - rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize, - &rdata->credits); - if (rc) { - subreq->error = rc; - return false; - } + rreq->io_streams[0].sreq_max_len = size; rdata->credits.in_flight_check = 1; rdata->credits.rreq_debug_id = rreq->debug_id; @@ -183,13 +180,11 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) server->credits, server->in_flight, 0, cifs_trace_rw_credits_read_submit); - subreq->len = min_t(size_t, subreq->len, rsize); - #ifdef CONFIG_CIFS_SMB_DIRECT if (server->smbd_conn) - subreq->max_nr_segs = server->smbd_conn->max_frmr_depth; + rreq->io_streams[0].sreq_max_segs = server->smbd_conn->max_frmr_depth; #endif - return true; + return 0; } /* @@ -198,31 +193,41 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) * to only read a portion of that, but as long as we read something, the netfs * helper will call us again so that we can issue another read. 
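/* [Editor's aside -- interpretive sketch of the reworked netfs contract,
 * not code from the patch: cifs_prepare_read() above no longer trims
 * subreq->len itself; it only advertises limits via sreq_max_len and
 * sreq_max_segs, and the netfs core clamps the subrequest before
 * ->issue_read() runs, conceptually: */
	subreq->len = umin(subreq->len, stream->sreq_max_len);	/* netfs-side, not cifs */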
*/ -static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) +static void cifs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); + struct TCP_Server_Info *server = req->server; int rc = 0; cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", __func__, rreq->debug_id, subreq->debug_index, rreq->mapping, subreq->transferred, subreq->len); + rc = adjust_credits(server, rdata, cifs_trace_rw_credits_issue_read_adjust); + if (rc) + goto failed; + if (req->cfile->invalidHandle) { do { rc = cifs_reopen_file(req->cfile, true); } while (rc == -EAGAIN); if (rc) - goto out; + goto failed; } - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (subreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); rc = rdata->server->ops->async_readv(rdata); -out: if (rc) - netfs_subreq_terminated(subreq, rc, false); + goto failed; + return; + +failed: + netfs_read_subreq_terminated(subreq, rc, false); } /* @@ -286,12 +291,6 @@ static void cifs_rreq_done(struct netfs_io_request *rreq) inode_set_atime_to_ts(inode, inode_get_mtime(inode)); } -static void cifs_post_modify(struct inode *inode) -{ - /* Indication to update ctime and mtime as close is deferred */ - set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); -} - static void cifs_free_request(struct netfs_io_request *rreq) { struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq); @@ -315,7 +314,7 @@ static void cifs_free_subrequest(struct netfs_io_subrequest *subreq) #endif } - if (rdata->credits.value != 0) + if (rdata->credits.value != 0) { trace_smb3_rw_credits(rdata->rreq->debug_id, rdata->subreq.debug_index, rdata->credits.value, @@ -323,8 +322,12 @@ static void cifs_free_subrequest(struct netfs_io_subrequest *subreq) rdata->server ? 
rdata->server->in_flight : 0, -rdata->credits.value, cifs_trace_rw_credits_free_subreq); + if (rdata->server) + add_credits_and_wake_if(rdata->server, &rdata->credits, 0); + else + rdata->credits.value = 0; + } - add_credits_and_wake_if(rdata->server, &rdata->credits, 0); if (rdata->have_xid) free_xid(rdata->xid); } @@ -335,10 +338,9 @@ const struct netfs_request_ops cifs_req_ops = { .init_request = cifs_init_request, .free_request = cifs_free_request, .free_subrequest = cifs_free_subrequest, - .clamp_length = cifs_clamp_length, - .issue_read = cifs_req_issue_read, + .prepare_read = cifs_prepare_read, + .issue_read = cifs_issue_read, .done = cifs_rreq_done, - .post_modify = cifs_post_modify, .begin_writeback = cifs_begin_writeback, .prepare_write = cifs_prepare_write, .issue_write = cifs_issue_write, @@ -1362,7 +1364,7 @@ int cifs_close(struct inode *inode, struct file *file) dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); if ((cfile->status_file_deleted == false) && (smb2_can_defer_close(inode, dclose))) { - if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { + if (test_and_clear_bit(NETFS_ICTX_MODIFIED_ATTR, &cinode->netfs.flags)) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } @@ -1401,7 +1403,7 @@ void cifs_reopen_persistent_handles(struct cifs_tcon *tcon) { struct cifsFileInfo *open_file, *tmp; - struct list_head tmp_list; + LIST_HEAD(tmp_list); if (!tcon->use_persistent || !tcon->need_reopen_files) return; @@ -1409,7 +1411,6 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon) tcon->need_reopen_files = false; cifs_dbg(FYI, "Reopen persistent handles\n"); - INIT_LIST_HEAD(&tmp_list); /* list all files open on tree connection, reopen resilient handles */ spin_lock(&tcon->open_file_lock); @@ -2092,9 +2093,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifsLockInfo *li, *tmp; __u64 length = cifs_flock_len(flock); - struct list_head tmp_llist; - - INIT_LIST_HEAD(&tmp_llist); + LIST_HEAD(tmp_llist); /* * Accessing maxBuf is racy with cifs_reconnect - need to store value @@ -2749,6 +2748,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file->f_mapping->host; struct cifsInodeInfo *cinode = CIFS_I(inode); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); ssize_t rc; rc = netfs_start_io_write(inode); @@ -2765,12 +2765,16 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) if (rc <= 0) goto out; - if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) && + (cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), server->vals->exclusive_lock_type, 0, - NULL, CIFS_WRITE_OP)) - rc = netfs_buffered_write_iter_locked(iocb, from, NULL); - else + NULL, CIFS_WRITE_OP))) { rc = -EACCES; + goto out; + } + + rc = netfs_buffered_write_iter_locked(iocb, from, NULL); + out: up_read(&cinode->lock_sem); netfs_end_io_write(inode); @@ -2902,9 +2906,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) if (!CIFS_CACHE_READ(cinode)) return netfs_unbuffered_read_iter(iocb, to); - if (cap_unix(tcon->ses) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) { if (iocb->ki_flags & IOCB_DIRECT) return 
netfs_unbuffered_read_iter(iocb, to); return netfs_buffered_read_iter(iocb, to); diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index bc926ab2555b..28c4e576d460 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -978,9 +978,12 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, switch (opt) { case Opt_compress: + if (!IS_ENABLED(CONFIG_CIFS_COMPRESSION)) { + cifs_errorf(fc, "CONFIG_CIFS_COMPRESSION kernel config option is unset\n"); + goto cifs_parse_mount_err; + } ctx->compress = true; - cifs_dbg(VFS, - "SMB3 compression support is experimental\n"); + cifs_dbg(VFS, "SMB3 compression support is experimental\n"); break; case Opt_nodfs: ctx->nodfs = 1; @@ -1896,14 +1899,17 @@ void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb) if (ctx->mfsymlinks) { if (ctx->sfu_emul) { /* - * Our SFU ("Services for Unix" emulation does not allow - * creating symlinks but does allow reading existing SFU - * symlinks (it does allow both creating and reading SFU - * style mknod and FIFOs though). When "mfsymlinks" and + * Our SFU ("Services for Unix") emulation now allows + * both creating new and reading existing SFU symlinks. + * Older Linux kernels could neither read existing nor + * create new SFU symlinks, though creating and reading + * SFU-style mknod and FIFOs has long been supported. + * When "mfsymlinks" and * "sfu" are both enabled at the same time, it allows * reading both types of symlinks, but will only create * them with mfsymlinks format. This allows better - * Apple compatibility (probably better for Samba too) + * Apple compatibility and compatibility with older + * Linux kernel clients (probably better for Samba too) * while still recognizing old Windows style symlinks. */ cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n"); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index dd0afa23734c..331a86074ae7 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -172,6 +172,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, CIFS_I(inode)->time = 0; /* force reval */ return -ESTALE; } + if (inode->i_state & I_NEW) + CIFS_I(inode)->netfs.zero_point = fattr->cf_eof; cifs_revalidate_cache(inode, fattr); @@ -527,6 +529,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, struct cifs_fid fid; struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {0}; + char *symlink_buf_utf16; + unsigned int symlink_len_utf16; char buf[24]; unsigned int bytes_read; char *pbuf; @@ -537,10 +541,11 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, fattr->cf_mode &= ~S_IFMT; if (fattr->cf_eof == 0) { + cifs_dbg(FYI, "Fifo\n"); fattr->cf_mode |= S_IFIFO; fattr->cf_dtype = DT_FIFO; return 0; - } else if (fattr->cf_eof < 8) { + } else if (fattr->cf_eof > 1 && fattr->cf_eof < 8) { fattr->cf_mode |= S_IFREG; fattr->cf_dtype = DT_REG; return -EINVAL; /* EOPNOTSUPP?
*/ @@ -582,7 +587,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, rc = tcon->ses->server->ops->sync_read(xid, &fid, &io_parms, &bytes_read, &pbuf, &buf_type); if ((rc == 0) && (bytes_read >= 8)) { - if (memcmp("IntxBLK", pbuf, 8) == 0) { + if (memcmp("IntxBLK\0", pbuf, 8) == 0) { cifs_dbg(FYI, "Block device\n"); fattr->cf_mode |= S_IFBLK; fattr->cf_dtype = DT_BLK; @@ -594,7 +599,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); fattr->cf_rdev = MKDEV(mjr, mnr); } - } else if (memcmp("IntxCHR", pbuf, 8) == 0) { + } else if (memcmp("IntxCHR\0", pbuf, 8) == 0) { cifs_dbg(FYI, "Char device\n"); fattr->cf_mode |= S_IFCHR; fattr->cf_dtype = DT_CHR; @@ -610,10 +615,37 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, cifs_dbg(FYI, "Socket\n"); fattr->cf_mode |= S_IFSOCK; fattr->cf_dtype = DT_SOCK; - } else if (memcmp("IntxLNK", pbuf, 7) == 0) { + } else if (memcmp("IntxLNK\1", pbuf, 8) == 0) { cifs_dbg(FYI, "Symlink\n"); fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; + if ((fattr->cf_eof > 8) && (fattr->cf_eof % 2 == 0)) { + symlink_buf_utf16 = kmalloc(fattr->cf_eof-8 + 1, GFP_KERNEL); + if (symlink_buf_utf16) { + io_parms.offset = 8; + io_parms.length = fattr->cf_eof-8 + 1; + buf_type = CIFS_NO_BUFFER; + rc = tcon->ses->server->ops->sync_read(xid, &fid, &io_parms, + &symlink_len_utf16, + &symlink_buf_utf16, + &buf_type); + if ((rc == 0) && + (symlink_len_utf16 > 0) && + (symlink_len_utf16 < fattr->cf_eof-8 + 1) && + (symlink_len_utf16 % 2 == 0)) { + fattr->cf_symlink_target = + cifs_strndup_from_utf16(symlink_buf_utf16, + symlink_len_utf16, + true, + cifs_sb->local_nls); + if (!fattr->cf_symlink_target) + rc = -ENOMEM; + } + kfree(symlink_buf_utf16); + } else { + rc = -ENOMEM; + } + } } else if (memcmp("LnxFIFO", pbuf, 8) == 0) { cifs_dbg(FYI, "FIFO\n"); fattr->cf_mode |= S_IFIFO; @@ -623,6 +655,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, fattr->cf_dtype = DT_REG; rc = -EOPNOTSUPP; } + } else if ((rc == 0) && (bytes_read == 1) && (pbuf[0] == '\0')) { + cifs_dbg(FYI, "Socket\n"); + fattr->cf_mode |= S_IFSOCK; + fattr->cf_dtype = DT_SOCK; } else { fattr->cf_mode |= S_IFREG; /* then it is a file */ fattr->cf_dtype = DT_REG; diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 44dbaf9929a4..9bb5c869f4db 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -229,9 +229,11 @@ static int cifs_shutdown(struct super_block *sb, unsigned long arg) shutdown_good: trace_smb3_shutdown_done(flags, tcon->tid); + cifs_put_tlink(tlink); return 0; shutdown_out_err: trace_smb3_shutdown_err(rc, flags, tcon->tid); + cifs_put_tlink(tlink); return rc; } diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index d86da949a919..47ddeb7fa111 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -588,6 +588,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { rc = PTR_ERR(tlink); + /* BB could be clearer if skipped put_tlink on error here, but harmless */ goto symlink_exit; } pTcon = tlink_tcon(tlink); @@ -605,6 +606,9 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, /* BB what if DFS and this volume is on different share? 
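/* [Editor's aside -- an on-the-wire layout inferred from the parsing code
 * above; the struct name is illustrative, not from the patch: an SFU
 * symlink file is the 8-byte tag "IntxLNK\1" followed by the UTF-16LE
 * target with no trailing wide NUL, hence the checks that cf_eof > 8 and
 * that the total size is even: */
struct sfu_symlink_body {
	u8     tag[8];		/* "IntxLNK\1" */
	__le16 target[];	/* UTF-16LE path, (cf_eof - 8) / 2 code units */
} __packed;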
BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + rc = __cifs_sfu_make_node(xid, inode, direntry, pTcon, + full_path, S_IFLNK, 0, symname); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY } else if (pTcon->unix_ext) { rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index b28ff62f1f15..dab526191b07 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -352,7 +352,7 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server) * on simple responses (wct, bcc both zero) * in particular have seen this on * ulogoffX and FindClose. This leaves - * one byte of bcc potentially unitialized + * one byte of bcc potentially uninitialized */ /* zero rest of bcc */ tmp[sizeof(struct smb_hdr)+1] = 0; @@ -751,12 +751,11 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *cfile = NULL; struct file_list *tmp_list, *tmp_next_list; - struct list_head file_head; + LIST_HEAD(file_head); if (cifs_inode == NULL) return; - INIT_LIST_HEAD(&file_head); spin_lock(&cifs_inode->open_file_lock); list_for_each_entry(cfile, &cifs_inode->openFileList, flist) { if (delayed_work_pending(&cfile->deferred)) { @@ -787,9 +786,8 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon) { struct cifsFileInfo *cfile; struct file_list *tmp_list, *tmp_next_list; - struct list_head file_head; + LIST_HEAD(file_head); - INIT_LIST_HEAD(&file_head); spin_lock(&tcon->open_file_lock); list_for_each_entry(cfile, &tcon->openFileList, tlist) { if (delayed_work_pending(&cfile->deferred)) { @@ -819,11 +817,10 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) { struct cifsFileInfo *cfile; struct file_list *tmp_list, *tmp_next_list; - struct list_head file_head; void *page; const char *full_path; + LIST_HEAD(file_head); - INIT_LIST_HEAD(&file_head); page = alloc_dentry_path(); spin_lock(&tcon->open_file_lock); list_for_each_entry(cfile, &tcon->openFileList, tlist) { @@ -1234,6 +1231,7 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid, const char *full_path, bool *islink) { + struct TCP_Server_Info *server = tcon->ses->server; struct cifs_ses *ses = tcon->ses; size_t len; char *path; @@ -1250,12 +1248,12 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid, !is_tcon_dfs(tcon)) return 0; - spin_lock(&tcon->tc_lock); - if (!tcon->origin_fullpath) { - spin_unlock(&tcon->tc_lock); + spin_lock(&server->srv_lock); + if (!server->leaf_fullpath) { + spin_unlock(&server->srv_lock); return 0; } - spin_unlock(&tcon->tc_lock); + spin_unlock(&server->srv_lock); /* * Slow path - tcon is DFS and @full_path has prefix path, so attempt diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 689d8a506d45..48c27581ec51 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -378,6 +378,8 @@ int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, bool unicode, struct cifs_open_info_data *data) { + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + data->reparse.buf = buf; /* See MS-FSCC 2.1.2 */ @@ -394,12 +396,13 @@ int parse_reparse_point(struct reparse_data_buffer *buf, case IO_REPARSE_TAG_LX_FIFO: case IO_REPARSE_TAG_LX_CHR: case IO_REPARSE_TAG_LX_BLK: - return 0; + break; default: - cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n", - __func__, le32_to_cpu(buf->ReparseTag)); - 
return -EOPNOTSUPP; + cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", + le32_to_cpu(buf->ReparseTag)); + break; } + return 0; } int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index e1f2feb56f45..e03c91a49650 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -1078,7 +1078,7 @@ cifs_make_node(unsigned int xid, struct inode *inode, /* * Check if mounted with mount parm 'sfu' mount parm. * SFU emulation should work with all servers, but only - * supports block and char device (no socket & fifo), + * supports block and char device, socket & fifo, * and was used by default in earlier versions of Windows */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index c23478ab1cf8..e301349b0078 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -21,7 +21,7 @@ #include "cifs_unicode.h" #include "fscache.h" #include "smb2proto.h" -#include "smb2status.h" +#include "../common/smb2status.h" static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) { @@ -196,9 +196,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifsLockInfo *li, *tmp; __u64 length = 1 + flock->fl_end - flock->fl_start; - struct list_head tmp_llist; - - INIT_LIST_HEAD(&tmp_llist); + LIST_HEAD(tmp_llist); /* * Accessing maxBuf is racy with cifs_reconnect - need to store value diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 9f5bc41433c1..b992117377e9 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -24,7 +24,7 @@ #include "smb2pdu.h" #include "smb2proto.h" #include "cached_dir.h" -#include "smb2status.h" +#include "../common/smb2status.h" static struct reparse_data_buffer *reparse_buf_ptr(struct kvec *iov) { @@ -315,7 +315,7 @@ replay_again: SMB2_O_INFO_FILE, 0, sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) + - (sizeof(struct cifs_sid) * 2), 0, NULL); + (sizeof(struct smb_sid) * 2), 0, NULL); } else { rc = SMB2_query_info_init(tcon, server, &rqst[num_rqst], @@ -325,7 +325,7 @@ replay_again: SMB2_O_INFO_FILE, 0, sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) + - (sizeof(struct cifs_sid) * 2), 0, NULL); + (sizeof(struct smb_sid) * 2), 0, NULL); } if (!rc && (!cfile || num_rqst > 1)) { smb2_set_next_command(tcon, &rqst[num_rqst]); @@ -1106,6 +1106,8 @@ int smb2_rename_path(const unsigned int xid, co, DELETE, SMB2_OP_RENAME, cfile, source_dentry); if (rc == -EINVAL) { cifs_dbg(FYI, "invalid lease key, resending request without lease"); + cifs_get_writable_path(tcon, from_name, + FIND_WR_WITH_DELETE, &cfile); rc = smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, co, DELETE, SMB2_OP_RENAME, cfile, NULL); } @@ -1149,6 +1151,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, cfile, NULL, NULL, dentry); if (rc == -EINVAL) { cifs_dbg(FYI, "invalid lease key, resending request without lease"); + cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, &in_iov, &(int){SMB2_OP_SET_EOF}, 1, diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c index ac1895358908..b05313acf9b2 100644 --- a/fs/smb/client/smb2maperror.c +++ b/fs/smb/client/smb2maperror.c @@ -12,7 +12,7 @@ #include "cifs_debug.h" #include "smb2pdu.h" #include "smb2proto.h" -#include "smb2status.h" +#include 
"../common/smb2status.h" #include "smb2glob.h" #include "trace.h" diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index 677ef6f99a5b..f3c4b70b77b9 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -13,7 +13,7 @@ #include "smb2proto.h" #include "cifs_debug.h" #include "cifs_unicode.h" -#include "smb2status.h" +#include "../common/smb2status.h" #include "smb2glob.h" #include "nterr.h" #include "cached_dir.h" diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 322cabc69c6f..7381ec333c6d 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -13,6 +13,7 @@ #include <linux/sort.h> #include <crypto/aead.h> #include <linux/fiemap.h> +#include <linux/folio_queue.h> #include <uapi/linux/magic.h> #include "cifsfs.h" #include "cifsglob.h" @@ -21,7 +22,7 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifs_unicode.h" -#include "smb2status.h" +#include "../common/smb2status.h" #include "smb2glob.h" #include "cifs_ioctl.h" #include "smbdirect.h" @@ -301,7 +302,8 @@ smb2_adjust_credits(struct TCP_Server_Info *server, unsigned int /*enum smb3_rw_credits_trace*/ trace) { struct cifs_credits *credits = &subreq->credits; - int new_val = DIV_ROUND_UP(subreq->subreq.len, SMB2_MAX_BUFFER_SIZE); + int new_val = DIV_ROUND_UP(subreq->subreq.len - subreq->subreq.transferred, + SMB2_MAX_BUFFER_SIZE); int scredits, in_flight; if (!credits->value || credits->value == new_val) @@ -316,7 +318,8 @@ smb2_adjust_credits(struct TCP_Server_Info *server, cifs_trace_rw_credits_no_adjust_up); trace_smb3_too_many_credits(server->CurrentMid, server->conn_id, server->hostname, 0, credits->value - new_val, 0); - cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)", + cifs_server_dbg(VFS, "R=%x[%x] request has less credits (%d) than required (%d)", + subreq->rreq->debug_id, subreq->subreq.debug_index, credits->value, new_val); return -EOPNOTSUPP; @@ -338,8 +341,9 @@ smb2_adjust_credits(struct TCP_Server_Info *server, trace_smb3_reconnect_detected(server->CurrentMid, server->conn_id, server->hostname, scredits, credits->value - new_val, in_flight); - cifs_server_dbg(VFS, "trying to return %d credits to old session\n", - credits->value - new_val); + cifs_server_dbg(VFS, "R=%x[%x] trying to return %d credits to old session\n", + subreq->rreq->debug_id, subreq->subreq.debug_index, + credits->value - new_val); return -EAGAIN; } @@ -3046,11 +3050,11 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, return rc; } -static struct cifs_ntsd * +static struct smb_ntsd * get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb, const struct cifs_fid *cifsfid, u32 *pacllen, u32 info) { - struct cifs_ntsd *pntsd = NULL; + struct smb_ntsd *pntsd = NULL; unsigned int xid; int rc = -EOPNOTSUPP; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -3075,11 +3079,11 @@ get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb, } -static struct cifs_ntsd * +static struct smb_ntsd * get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, u32 *pacllen, u32 info) { - struct cifs_ntsd *pntsd = NULL; + struct smb_ntsd *pntsd = NULL; u8 oplock = SMB2_OPLOCK_LEVEL_NONE; unsigned int xid; int rc; @@ -3142,7 +3146,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, } static int -set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen, +set_smb2_acl(struct smb_ntsd *pnntsd, __u32 acllen, struct inode *inode, const char *path, int aclflag) { u8 oplock = SMB2_OPLOCK_LEVEL_NONE; @@ -3200,12 +3204,12 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 
acllen, } /* Retrieve an ACL from the server */ -static struct cifs_ntsd * +static struct smb_ntsd * get_smb2_acl(struct cifs_sb_info *cifs_sb, struct inode *inode, const char *path, u32 *pacllen, u32 info) { - struct cifs_ntsd *pntsd = NULL; + struct smb_ntsd *pntsd = NULL; struct cifsFileInfo *open_file = NULL; if (inode && !(info & SACL_SECINFO)) @@ -3237,13 +3241,15 @@ static long smb3_zero_data(struct file *file, struct cifs_tcon *tcon, } static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, - loff_t offset, loff_t len, bool keep_size) + unsigned long long offset, unsigned long long len, + bool keep_size) { struct cifs_ses *ses = tcon->ses; struct inode *inode = file_inode(file); struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsFileInfo *cfile = file->private_data; - unsigned long long new_size; + struct netfs_inode *ictx = netfs_inode(inode); + unsigned long long i_size, new_size, remote_size; long rc; unsigned int xid; @@ -3255,6 +3261,16 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); + i_size = i_size_read(inode); + remote_size = ictx->remote_i_size; + if (offset + len >= remote_size && offset < i_size) { + unsigned long long top = umin(offset + len, i_size); + + rc = filemap_write_and_wait_range(inode->i_mapping, offset, top - 1); + if (rc < 0) + goto zero_range_exit; + } + /* * We zero the range through ioctl, so we need remove the page caches * first, otherwise the data may be inconsistent with the server. @@ -3305,6 +3321,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; + unsigned long long end = offset + len, i_size, remote_i_size; long rc; unsigned int xid; __u8 set_sparse = 1; @@ -3336,6 +3353,27 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, (char *)&fsctl_buf, sizeof(struct file_zero_data_information), CIFSMaxBufSize, NULL, NULL); + + if (rc) + goto unlock; + + /* If there's dirty data in the buffer that would extend the EOF if it + * were written, then we need to move the EOF marker over to the lower + * of the high end of the hole and the proposed EOF. The problem is + * that we locally hole-punch the tail of the dirty data, the proposed + * EOF update will end up in the wrong place. + */ + i_size = i_size_read(inode); + remote_i_size = netfs_inode(inode)->remote_i_size; + if (end > remote_i_size && i_size > remote_i_size) { + unsigned long long extend_to = umin(end, i_size); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, extend_to); + if (rc >= 0) + netfs_inode(inode)->remote_i_size = extend_to; + } + +unlock: filemap_invalidate_unlock(inode->i_mapping); out: inode_unlock(inode); @@ -4356,30 +4394,86 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, } /* - * Clear a read buffer, discarding the folios which have XA_MARK_0 set. + * Clear a read buffer, discarding the folios which have the 1st mark set. + */ +static void cifs_clear_folioq_buffer(struct folio_queue *buffer) +{ + struct folio_queue *folioq; + + while ((folioq = buffer)) { + for (int s = 0; s < folioq_count(folioq); s++) + if (folioq_is_marked(folioq, s)) + folio_put(folioq_folio(folioq, s)); + buffer = folioq->next; + kfree(folioq); + } +} + +/* + * Allocate buffer space into a folio queue. 
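/* [Editor's aside -- a minimal walking sketch using only helpers that
 * appear in this patch; the function itself is illustrative: a folio_queue
 * is a chained list of small folio batches, so totalling its payload is: */
static size_t folioq_buffer_bytes(struct folio_queue *q)
{
	size_t bytes = 0;

	for (; q; q = q->next)
		for (int s = 0; s < folioq_count(q); s++)
			bytes += folioq_folio_size(q, s);
	return bytes;
}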
*/ -static void cifs_clear_xarray_buffer(struct xarray *buffer) +static struct folio_queue *cifs_alloc_folioq_buffer(ssize_t size) { + struct folio_queue *buffer = NULL, *tail = NULL, *p; struct folio *folio; + unsigned int slot; + + do { + if (!tail || folioq_full(tail)) { + p = kmalloc(sizeof(*p), GFP_NOFS); + if (!p) + goto nomem; + folioq_init(p); + if (tail) { + tail->next = p; + p->prev = tail; + } else { + buffer = p; + } + tail = p; + } + + folio = folio_alloc(GFP_KERNEL|__GFP_HIGHMEM, 0); + if (!folio) + goto nomem; - XA_STATE(xas, buffer, 0); + slot = folioq_append_mark(tail, folio); + size -= folioq_folio_size(tail, slot); + } while (size > 0); + + return buffer; + +nomem: + cifs_clear_folioq_buffer(buffer); + return NULL; +} + +/* + * Copy data from an iterator to the folios in a folio queue buffer. + */ +static bool cifs_copy_iter_to_folioq(struct iov_iter *iter, size_t size, + struct folio_queue *buffer) +{ + for (; buffer; buffer = buffer->next) { + for (int s = 0; s < folioq_count(buffer); s++) { + struct folio *folio = folioq_folio(buffer, s); + size_t part = folioq_folio_size(buffer, s); - rcu_read_lock(); - xas_for_each_marked(&xas, folio, ULONG_MAX, XA_MARK_0) { - folio_put(folio); + part = umin(part, size); + + if (copy_folio_from_iter(folio, 0, part, iter) != part) + return false; + size -= part; + } } - rcu_read_unlock(); - xa_destroy(buffer); + return true; } void smb3_free_compound_rqst(int num_rqst, struct smb_rqst *rqst) { - int i; - - for (i = 0; i < num_rqst; i++) - if (!xa_empty(&rqst[i].rq_buffer)) - cifs_clear_xarray_buffer(&rqst[i].rq_buffer); + for (int i = 0; i < num_rqst; i++) + cifs_clear_folioq_buffer(rqst[i].rq_buffer); } /* @@ -4400,53 +4494,32 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *new_rq, struct smb_rqst *old_rq) { struct smb2_transform_hdr *tr_hdr = new_rq[0].rq_iov[0].iov_base; - struct page *page; unsigned int orig_len = 0; - int i, j; int rc = -ENOMEM; - for (i = 1; i < num_rqst; i++) { + for (int i = 1; i < num_rqst; i++) { struct smb_rqst *old = &old_rq[i - 1]; struct smb_rqst *new = &new_rq[i]; - struct xarray *buffer = &new->rq_buffer; - size_t size = iov_iter_count(&old->rq_iter), seg, copied = 0; + struct folio_queue *buffer; + size_t size = iov_iter_count(&old->rq_iter); orig_len += smb_rqst_len(server, old); new->rq_iov = old->rq_iov; new->rq_nvec = old->rq_nvec; - xa_init(buffer); - if (size > 0) { - unsigned int npages = DIV_ROUND_UP(size, PAGE_SIZE); - - for (j = 0; j < npages; j++) { - void *o; - - rc = -ENOMEM; - page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!page) - goto err_free; - page->index = j; - o = xa_store(buffer, j, page, GFP_KERNEL); - if (xa_is_err(o)) { - rc = xa_err(o); - put_page(page); - goto err_free; - } + buffer = cifs_alloc_folioq_buffer(size); + if (!buffer) + goto err_free; - xa_set_mark(buffer, j, XA_MARK_0); + new->rq_buffer = buffer; + iov_iter_folio_queue(&new->rq_iter, ITER_SOURCE, + buffer, 0, 0, size); - seg = min_t(size_t, size - copied, PAGE_SIZE); - if (copy_page_from_iter(page, 0, seg, &old->rq_iter) != seg) { - rc = -EFAULT; - goto err_free; - } - copied += seg; + if (!cifs_copy_iter_to_folioq(&old->rq_iter, size, buffer)) { + rc = -EIO; + goto err_free; } - iov_iter_xarray(&new->rq_iter, ITER_SOURCE, - buffer, 0, size); - new->rq_iter_size = size; } } @@ -4492,7 +4565,6 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, rqst.rq_nvec = 2; if (iter) { rqst.rq_iter = *iter; - rqst.rq_iter_size = iov_iter_count(iter); iter_size = 
iov_iter_count(iter); } @@ -4511,22 +4583,23 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, } static int -cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size, - unsigned int skip, struct iov_iter *iter) +cifs_copy_folioq_to_iter(struct folio_queue *folioq, size_t data_size, + size_t skip, struct iov_iter *iter) { - struct page *page; - unsigned long index; - - xa_for_each(pages, index, page) { - size_t n, len = min_t(unsigned int, PAGE_SIZE - skip, data_size); - - n = copy_page_to_iter(page, skip, len, iter); - if (n != len) { - cifs_dbg(VFS, "%s: something went wrong\n", __func__); - return -EIO; + for (; folioq; folioq = folioq->next) { + for (int s = 0; s < folioq_count(folioq); s++) { + struct folio *folio = folioq_folio(folioq, s); + size_t fsize = folio_size(folio); + size_t n, len = umin(fsize - skip, data_size); + + n = copy_folio_to_iter(folio, skip, len, iter); + if (n != len) { + cifs_dbg(VFS, "%s: something went wrong\n", __func__); + return -EIO; + } + data_size -= n; + skip = 0; } - data_size -= n; - skip = 0; } return 0; @@ -4534,8 +4607,8 @@ cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size, static int handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, - char *buf, unsigned int buf_len, struct xarray *pages, - unsigned int pages_len, bool is_offloaded) + char *buf, unsigned int buf_len, struct folio_queue *buffer, + unsigned int buffer_len, bool is_offloaded) { unsigned int data_offset; unsigned int data_len; @@ -4632,7 +4705,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - if (data_len > pages_len - pad_len) { + if (data_len > buffer_len - pad_len) { /* data_len is corrupt -- discard frame */ rdata->result = -EIO; if (is_offloaded) @@ -4643,8 +4716,8 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, } /* Copy the data to the output I/O iterator. 
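/* [Editor's aside -- the dispatch below, condensed into a paraphrase for
 * readability; this is not patch code: a read payload arrives either in
 * the separately allocated folio queue (encrypted/offloaded path) or
 * inline after the response header, never both -- which is what the
 * WARN_ONCE below asserts: */
	if (buffer)
		rc = cifs_copy_folioq_to_iter(buffer, buffer_len, cur_off, &iter);
	else
		rc = copy_to_iter(buf + data_offset, data_len, &iter) == data_len ? 0 : -EIO;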
*/ - rdata->result = cifs_copy_pages_to_iter(pages, pages_len, - cur_off, &rdata->subreq.io_iter); + rdata->result = cifs_copy_folioq_to_iter(buffer, buffer_len, + cur_off, &rdata->subreq.io_iter); if (rdata->result != 0) { if (is_offloaded) mid->mid_state = MID_RESPONSE_MALFORMED; @@ -4652,12 +4725,11 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, dequeue_mid(mid, rdata->result); return 0; } - rdata->got_bytes = pages_len; + rdata->got_bytes = buffer_len; } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ - WARN_ONCE(pages && !xa_empty(pages), - "read data can be either in buf or in pages"); + WARN_ONCE(buffer, "read data can be either in buf or in buffer"); length = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter); if (length < 0) return length; @@ -4683,7 +4755,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, struct smb2_decrypt_work { struct work_struct decrypt; struct TCP_Server_Info *server; - struct xarray buffer; + struct folio_queue *buffer; char *buf; unsigned int len; }; @@ -4697,7 +4769,7 @@ static void smb2_decrypt_offload(struct work_struct *work) struct mid_q_entry *mid; struct iov_iter iter; - iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, dw->len); + iov_iter_folio_queue(&iter, ITER_DEST, dw->buffer, 0, 0, dw->len); rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size, &iter, true); if (rc) { @@ -4713,7 +4785,7 @@ static void smb2_decrypt_offload(struct work_struct *work) mid->decrypted = true; rc = handle_read_data(dw->server, mid, dw->buf, dw->server->vals->read_rsp_size, - &dw->buffer, dw->len, + dw->buffer, dw->len, true); if (rc >= 0) { #ifdef CONFIG_CIFS_STATS2 @@ -4746,7 +4818,7 @@ static void smb2_decrypt_offload(struct work_struct *work) } free_pages: - cifs_clear_xarray_buffer(&dw->buffer); + cifs_clear_folioq_buffer(dw->buffer); cifs_small_buf_release(dw->buf); kfree(dw); } @@ -4756,20 +4828,17 @@ static int receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, int *num_mids) { - struct page *page; char *buf = server->smallbuf; struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; struct iov_iter iter; - unsigned int len, npages; + unsigned int len; unsigned int buflen = server->pdu_size; int rc; - int i = 0; struct smb2_decrypt_work *dw; dw = kzalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL); if (!dw) return -ENOMEM; - xa_init(&dw->buffer); INIT_WORK(&dw->decrypt, smb2_decrypt_offload); dw->server = server; @@ -4785,26 +4854,14 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, len = le32_to_cpu(tr_hdr->OriginalMessageSize) - server->vals->read_rsp_size; dw->len = len; - npages = DIV_ROUND_UP(len, PAGE_SIZE); + len = round_up(dw->len, PAGE_SIZE); rc = -ENOMEM; - for (; i < npages; i++) { - void *old; - - page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!page) - goto discard_data; - page->index = i; - old = xa_store(&dw->buffer, i, page, GFP_KERNEL); - if (xa_is_err(old)) { - rc = xa_err(old); - put_page(page); - goto discard_data; - } - xa_set_mark(&dw->buffer, i, XA_MARK_0); - } + dw->buffer = cifs_alloc_folioq_buffer(len); + if (!dw->buffer) + goto discard_data; - iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, npages * PAGE_SIZE); + iov_iter_folio_queue(&iter, ITER_DEST, dw->buffer, 0, 0, len); /* Read the data into the buffer and clear excess bufferage. 
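/* [Editor's aside -- annotated restatement of the iterator fix-up that
 * follows, for orientation only: the buffer was rounded up to whole pages,
 * so a short socket read is padded with zeroes, the iterator rewound over
 * the rounded length, then re-limited to the real payload: */
	if (rc < len)
		iov_iter_zero(len - rc, &iter);	/* pad the unread tail */
	iov_iter_revert(&iter, len);		/* rewind to the start */
	iov_iter_truncate(&iter, dw->len);	/* expose only real data */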
*/ rc = cifs_read_iter_from_socket(server, &iter, dw->len); @@ -4812,9 +4869,9 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, goto discard_data; server->total_read += rc; - if (rc < npages * PAGE_SIZE) - iov_iter_zero(npages * PAGE_SIZE - rc, &iter); - iov_iter_revert(&iter, npages * PAGE_SIZE); + if (rc < len) + iov_iter_zero(len - rc, &iter); + iov_iter_revert(&iter, len); iov_iter_truncate(&iter, dw->len); rc = cifs_discard_remaining_data(server); @@ -4849,7 +4906,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, (*mid)->decrypted = true; rc = handle_read_data(server, *mid, buf, server->vals->read_rsp_size, - &dw->buffer, dw->len, false); + dw->buffer, dw->len, false); if (rc >= 0) { if (server->ops->is_network_name_deleted) { server->ops->is_network_name_deleted(buf, @@ -4859,7 +4916,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, } free_pages: - cifs_clear_xarray_buffer(&dw->buffer); + cifs_clear_folioq_buffer(dw->buffer); free_dw: kfree(dw); return rc; @@ -5021,9 +5078,10 @@ static int smb2_next_header(struct TCP_Server_Info *server, char *buf, return 0; } -static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, +int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) + const char *full_path, umode_t mode, dev_t dev, + const char *symname) { struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; @@ -5031,30 +5089,64 @@ static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_fid fid; unsigned int bytes_written; - struct win_dev pdev = {}; - struct kvec iov[2]; + u8 type[8]; + int type_len = 0; + struct { + __le64 major; + __le64 minor; + } __packed pdev = {}; + __le16 *symname_utf16 = NULL; + u8 *data = NULL; + int data_len = 0; + struct kvec iov[3]; __u32 oplock = server->oplocks ? 
REQ_OPLOCK : 0; int rc; switch (mode & S_IFMT) { case S_IFCHR: - strscpy(pdev.type, "IntxCHR"); + type_len = 8; + memcpy(type, "IntxCHR\0", type_len); pdev.major = cpu_to_le64(MAJOR(dev)); pdev.minor = cpu_to_le64(MINOR(dev)); + data = (u8 *)&pdev; + data_len = sizeof(pdev); break; case S_IFBLK: - strscpy(pdev.type, "IntxBLK"); + type_len = 8; + memcpy(type, "IntxBLK\0", type_len); pdev.major = cpu_to_le64(MAJOR(dev)); pdev.minor = cpu_to_le64(MINOR(dev)); + data = (u8 *)&pdev; + data_len = sizeof(pdev); + break; + case S_IFLNK: + type_len = 8; + memcpy(type, "IntxLNK\1", type_len); + symname_utf16 = cifs_strndup_to_utf16(symname, strlen(symname), + &data_len, cifs_sb->local_nls, + NO_MAP_UNI_RSVD); + if (!symname_utf16) { + rc = -ENOMEM; + goto out; + } + data_len -= 2; /* symlink is without trailing wide-nul */ + data = (u8 *)symname_utf16; break; case S_IFSOCK: - strscpy(pdev.type, "LnxSOCK"); + type_len = 8; + strscpy(type, "LnxSOCK"); + data = (u8 *)&pdev; + data_len = sizeof(pdev); break; case S_IFIFO: - strscpy(pdev.type, "LnxFIFO"); + type_len = 8; + strscpy(type, "LnxFIFO"); + data = (u8 *)&pdev; + data_len = sizeof(pdev); break; default: - return -EPERM; + rc = -EPERM; + goto out; } oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, GENERIC_WRITE, @@ -5064,17 +5156,26 @@ static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, rc = server->ops->open(xid, &oparms, &oplock, NULL); if (rc) - return rc; + goto out; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.length = sizeof(pdev); - iov[1].iov_base = &pdev; - iov[1].iov_len = sizeof(pdev); + if (type_len + data_len > 0) { + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.length = type_len + data_len; + iov[1].iov_base = type; + iov[1].iov_len = type_len; + iov[2].iov_base = data; + iov[2].iov_len = data_len; + + rc = server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, + iov, ARRAY_SIZE(iov)-1); + } - rc = server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); server->ops->close(xid, tcon, &fid); + +out: + kfree(symname_utf16); return rc; } @@ -5086,7 +5187,7 @@ int cifs_sfu_make_node(unsigned int xid, struct inode *inode, int rc; rc = __cifs_sfu_make_node(xid, inode, dentry, tcon, - full_path, mode, dev); + full_path, mode, dev, NULL); if (rc) return rc; @@ -5115,7 +5216,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, /* * Check if mounted with mount parm 'sfu' mount parm. 
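/* [Editor's aside -- how the tag and payload iovecs assembled in
 * __cifs_sfu_make_node() above land on the wire for a device node; the
 * struct is illustrative, not from the patch: */
struct sfu_device_body {
	u8     tag[8];		/* "IntxBLK\0" or "IntxCHR\0" */
	__le64 major;
	__le64 minor;
} __packed;			/* 24 bytes, written at offset 0 */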
* SFU emulation should work with all servers, but only - * supports block and char device (no socket & fifo), + * supports block and char device, socket & fifo, * and was used by default in earlier versions of Windows */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 9a06b5594669..2cb1bf65a172 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -32,7 +32,7 @@ #include "cifs_unicode.h" #include "cifs_debug.h" #include "ntlmssp.h" -#include "smb2status.h" +#include "../common/smb2status.h" #include "smb2glob.h" #include "cifspdu.h" #include "cifs_spnego.h" @@ -42,6 +42,7 @@ #include "dfs_cache.h" #endif #include "cached_dir.h" +#include "compress.h" /* * The following table defines the expected "StructureSize" of SMB2 requests @@ -82,6 +83,9 @@ int smb3_encryption_required(const struct cifs_tcon *tcon) if (tcon->seal && (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) return 1; + if (((global_secflags & CIFSSEC_MUST_SEAL) == CIFSSEC_MUST_SEAL) && + (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) + return 1; return 0; } @@ -2620,7 +2624,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) unsigned int group_offset = 0; struct smb3_acl acl = {}; - *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); + *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct smb_ace) * 4), 8); if (set_owner) { /* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */ @@ -2669,21 +2673,21 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) ptr += sizeof(struct smb3_acl); /* create one ACE to hold the mode embedded in reserved special SID */ - acelen = setup_special_mode_ACE((struct cifs_ace *)ptr, (__u64)mode); + acelen = setup_special_mode_ACE((struct smb_ace *)ptr, (__u64)mode); ptr += acelen; acl_size = acelen + sizeof(struct smb3_acl); ace_count = 1; if (set_owner) { /* we do not need to reallocate buffer to add the two more ACEs. plenty of space */ - acelen = setup_special_user_owner_ACE((struct cifs_ace *)ptr); + acelen = setup_special_user_owner_ACE((struct smb_ace *)ptr); ptr += acelen; acl_size += acelen; ace_count += 1; } /* and one more ACE to allow access for authenticated users */ - acelen = setup_authusers_ACE((struct cifs_ace *)ptr); + acelen = setup_authusers_ACE((struct smb_ace *)ptr); ptr += acelen; acl_size += acelen; ace_count += 1; @@ -3903,7 +3907,7 @@ SMB311_posix_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct smb311_posix_qinfo *data, u32 *plen) { size_t output_len = sizeof(struct smb311_posix_qinfo *) + - (sizeof(struct cifs_sid) * 2) + (PATH_MAX * 2); + (sizeof(struct smb_sid) * 2) + (PATH_MAX * 2); *plen = 0; return query_info(xid, tcon, persistent_fid, volatile_fid, @@ -4438,7 +4442,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * If we want to do a RDMA write, fill in and append * smbd_buffer_descriptor_v1 to the end of read request */ - if (smb3_use_rdma_offload(io_parms)) { + if (rdata && smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; bool need_invalidate = server->dialect == SMB30_PROT_ID; @@ -4495,15 +4499,14 @@ static void smb2_readv_worker(struct work_struct *work) struct cifs_io_subrequest *rdata = container_of(work, struct cifs_io_subrequest, subreq.work); - netfs_subreq_terminated(&rdata->subreq, - (rdata->result == 0 || rdata->result == -EAGAIN) ? 
- rdata->got_bytes : rdata->result, true); + netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false); } static void smb2_readv_callback(struct mid_q_entry *mid) { struct cifs_io_subrequest *rdata = mid->callback_data; + struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode); struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink); struct TCP_Server_Info *server = rdata->server; struct smb2_hdr *shdr = @@ -4520,16 +4523,15 @@ smb2_readv_callback(struct mid_q_entry *mid) if (rdata->got_bytes) { rqst.rq_iter = rdata->subreq.io_iter; - rqst.rq_iter_size = iov_iter_count(&rdata->subreq.io_iter); } WARN_ONCE(rdata->server != mid->server, "rdata server %p != mid server %p", rdata->server, mid->server); - cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n", + cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu/%zu\n", __func__, mid->mid, mid->mid_state, rdata->result, - rdata->subreq.len); + rdata->got_bytes, rdata->subreq.len - rdata->subreq.transferred); switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -4551,6 +4553,7 @@ smb2_readv_callback(struct mid_q_entry *mid) break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: + __set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags); rdata->result = -EAGAIN; if (server->sign && rdata->got_bytes) /* reset bytes number since we can not check a sign */ @@ -4583,27 +4586,36 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->subreq.debug_index, rdata->xid, rdata->req->cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, rdata->subreq.start, - rdata->subreq.len, rdata->result); + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, + rdata->subreq.len - rdata->subreq.transferred, + rdata->result); } else trace_smb3_read_done(rdata->rreq->debug_id, rdata->subreq.debug_index, rdata->xid, rdata->req->cfile->fid.persistent_fid, tcon->tid, tcon->ses->Suid, - rdata->subreq.start, rdata->got_bytes); + rdata->subreq.start + rdata->subreq.transferred, + rdata->got_bytes); if (rdata->result == -ENODATA) { - /* We may have got an EOF error because fallocate - * failed to enlarge the file. 
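/* [Editor's aside -- the replacement EOF logic below, compressed into a
 * single hypothetical predicate; trans stands for transferred + got_bytes: */
	bool hit_eof = rdata->result == -ENODATA ||
		       (trans < rdata->subreq.len &&
			rdata->subreq.start + trans == ictx->remote_i_size);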
- */ - if (rdata->subreq.start < rdata->subreq.rreq->i_size) + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + rdata->result = 0; + } else { + size_t trans = rdata->subreq.transferred + rdata->got_bytes; + if (trans < rdata->subreq.len && + rdata->subreq.start + trans == ictx->remote_i_size) { + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + } } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, server->credits, server->in_flight, 0, cifs_trace_rw_credits_read_response_clear); rdata->credits.value = 0; + rdata->subreq.transferred += rdata->got_bytes; + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress); INIT_WORK(&rdata->subreq.work, smb2_readv_worker); queue_work(cifsiod_wq, &rdata->subreq.work); release_mid(mid); @@ -4619,6 +4631,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) { int rc, flags = 0; char *buf; + struct netfs_io_subrequest *subreq = &rdata->subreq; struct smb2_hdr *shdr; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = rdata->iov, @@ -4629,15 +4642,15 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) int credit_request; cifs_dbg(FYI, "%s: offset=%llu bytes=%zu\n", - __func__, rdata->subreq.start, rdata->subreq.len); + __func__, subreq->start, subreq->len); if (!rdata->server) rdata->server = cifs_pick_channel(tcon->ses); io_parms.tcon = tlink_tcon(rdata->req->cfile->tlink); io_parms.server = server = rdata->server; - io_parms.offset = rdata->subreq.start; - io_parms.length = rdata->subreq.len; + io_parms.offset = subreq->start + subreq->transferred; + io_parms.length = subreq->len - subreq->transferred; io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid; io_parms.pid = rdata->req->pid; @@ -4652,11 +4665,13 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) rdata->iov[0].iov_base = buf; rdata->iov[0].iov_len = total_len; + rdata->got_bytes = 0; + rdata->result = 0; shdr = (struct smb2_hdr *)buf; if (rdata->credits.value > 0) { - shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->subreq.len, + shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(io_parms.length, SMB2_MAX_BUFFER_SIZE)); credit_request = le16_to_cpu(shdr->CreditCharge) + 8; if (server->credits >= server->max_credits) @@ -4680,11 +4695,12 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) if (rc) { cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); trace_smb3_read_err(rdata->rreq->debug_id, - rdata->subreq.debug_index, + subreq->debug_index, rdata->xid, io_parms.persistent_fid, io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length, rc); + io_parms.offset, + subreq->len - subreq->transferred, rc); } async_readv_out: @@ -4867,6 +4883,7 @@ smb2_writev_callback(struct mid_q_entry *mid) server->credits, server->in_flight, 0, cifs_trace_rw_credits_write_response_clear); wdata->credits.value = 0; + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress); cifs_write_subrequest_terminated(wdata, result ?: written, true); release_mid(mid); trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, @@ -4911,6 +4928,13 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) if (rc) goto out; + rqst.rq_iov = iov; + rqst.rq_iter = wdata->subreq.io_iter; + + rqst.rq_iov[0].iov_len = total_len - 1; + rqst.rq_iov[0].iov_base = (char *)req; + rqst.rq_nvec += 1; + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -4922,6 +4946,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) 
req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = SMB2_CHANNEL_NONE; + req->Length = cpu_to_le32(io_parms->length); req->Offset = cpu_to_le64(io_parms->offset); req->DataOffset = cpu_to_le16( offsetof(struct smb2_write_req, Buffer)); @@ -4941,7 +4966,6 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) */ if (smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; - size_t data_size = iov_iter_count(&wdata->subreq.io_iter); bool need_invalidate = server->dialect == SMB30_PROT_ID; wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->subreq.io_iter, @@ -4950,9 +4974,10 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) rc = -EAGAIN; goto async_writev_out; } + /* For RDMA read, I/O size is in RemainingBytes not in Length */ + req->RemainingBytes = req->Length; req->Length = 0; req->DataOffset = 0; - req->RemainingBytes = cpu_to_le32(data_size); req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE; if (need_invalidate) req->Channel = SMB2_CHANNEL_RDMA_V1; @@ -4964,31 +4989,22 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) v1->offset = cpu_to_le64(wdata->mr->mr->iova); v1->token = cpu_to_le32(wdata->mr->mr->rkey); v1->length = cpu_to_le32(wdata->mr->mr->length); + + rqst.rq_iov[0].iov_len += sizeof(*v1); + + /* + * We keep wdata->subreq.io_iter, + * but we have to truncate rqst.rq_iter + */ + iov_iter_truncate(&rqst.rq_iter, 0); } #endif - iov[0].iov_len = total_len - 1; - iov[0].iov_base = (char *)req; - rqst.rq_iov = iov; - rqst.rq_nvec = 1; - rqst.rq_iter = wdata->subreq.io_iter; - rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter); if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags)) smb2_set_replay(server, &rqst); -#ifdef CONFIG_CIFS_SMB_DIRECT - if (wdata->mr) - iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); -#endif - cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", - io_parms->offset, io_parms->length, iov_iter_count(&rqst.rq_iter)); -#ifdef CONFIG_CIFS_SMB_DIRECT - /* For RDMA read, I/O size is in RemainingBytes not in Length */ - if (!wdata->mr) - req->Length = cpu_to_le32(io_parms->length); -#else - req->Length = cpu_to_le32(io_parms->length); -#endif + cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", + io_parms->offset, io_parms->length, iov_iter_count(&wdata->subreq.io_iter)); if (wdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->subreq.len, @@ -5008,6 +5024,10 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) flags |= CIFS_HAS_CREDITS; } + /* XXX: compression + encryption is unsupported for now */ + if (((flags & CIFS_TRANSFORM_REQ) != CIFS_TRANSFORM_REQ) && should_compress(tcon, &rqst)) + flags |= CIFS_COMPRESS_REQ; + rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL, wdata, flags, &wdata->credits); /* Can't touch wdata if rc == 0 */ @@ -5671,7 +5691,7 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, - struct cifs_ntsd *pnntsd, int pacllen, int aclflag) + struct smb_ntsd *pnntsd, int pacllen, int aclflag) { return send_set_info(xid, tcon, persistent_fid, volatile_fid, current->tgid, 0, SMB2_O_INFO_SECURITY, aclflag, diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h index 5c458ab3b05a..076d9e83e1a0 100644 --- a/fs/smb/client/smb2pdu.h +++ b/fs/smb/client/smb2pdu.h @@ -364,8 +364,8 @@ struct create_posix_rsp { u32 nlink; u32 reparse_tag; u32 mode; - struct cifs_sid owner; /* var-sized on 
the wire */ - struct cifs_sid group; /* var-sized on the wire */ + struct smb_sid owner; /* var-sized on the wire */ + struct smb_sid group; /* var-sized on the wire */ } __packed; #define SMB2_QUERY_DIRECTORY_IOV_SIZE 2 @@ -408,8 +408,8 @@ struct smb2_posix_info { struct smb2_posix_info_parsed { const struct smb2_posix_info *base; size_t size; - struct cifs_sid owner; - struct cifs_sid group; + struct smb_sid owner; + struct smb_sid group; int name_len; const u8 *name; }; diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index b208232b12a2..c7e1b149877a 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -238,7 +238,7 @@ extern int SMB2_set_info_init(struct cifs_tcon *tcon, extern void SMB2_set_info_free(struct smb_rqst *rqst); extern int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, - struct cifs_ntsd *pnntsd, int pacllen, int aclflag); + struct smb_ntsd *pnntsd, int pacllen, int aclflag); extern int SMB2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct smb2_file_full_ea_info *buf, int len); diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 1476c445cadc..e4636fca821d 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -23,7 +23,7 @@ #include "cifsproto.h" #include "smb2proto.h" #include "cifs_debug.h" -#include "smb2status.h" +#include "../common/smb2status.h" #include "smb2glob.h" static int diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index d74e829de51c..0c64b37e2660 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -6,6 +6,7 @@ */ #include <linux/module.h> #include <linux/highmem.h> +#include <linux/folio_queue.h> #include "smbdirect.h" #include "cifs_debug.h" #include "cifsproto.h" @@ -406,7 +407,7 @@ static void smbd_post_send_credits(struct work_struct *work) else response = get_empty_queue_buffer(info); if (!response) { - /* now switch to emtpy packet queue */ + /* now switch to empty packet queue */ if (use_receive_queue) { use_receive_queue = 0; continue; @@ -618,7 +619,7 @@ out: /* * Test if FRWR (Fast Registration Work Requests) is supported on the device - * This implementation requries FRWR on RDMA read/write + * This implementation requires FRWR on RDMA read/write * return value: true if it is supported */ static bool frwr_is_supported(struct ib_device_attr *attrs) @@ -1585,10 +1586,8 @@ static struct smbd_connection *_smbd_get_connection( conn_param.initiator_depth = 0; conn_param.responder_resources = - info->id->device->attrs.max_qp_rd_atom - < SMBD_CM_RESPONDER_RESOURCES ? - info->id->device->attrs.max_qp_rd_atom : - SMBD_CM_RESPONDER_RESOURCES; + min(info->id->device->attrs.max_qp_rd_atom, + SMBD_CM_RESPONDER_RESOURCES); info->responder_resources = conn_param.responder_resources; log_rdma_mr(INFO, "responder_resources=%d\n", info->responder_resources); @@ -2177,7 +2176,7 @@ cleanup_entries: * MR available in the list. It may access the list while the * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock * as they never modify the same places. However, there may be several CPUs - * issueing I/O trying to get MR at the same time, mr_list_lock is used to + * issuing I/O trying to get MR at the same time, mr_list_lock is used to * protect this situation. 
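/* [Editor's aside -- orientation for the folio-queue extractor further
 * below; a sketch, not patch code: an ITER_FOLIOQ iterator's position is
 * the triple (folioq, slot, iov_offset), so suspending and resuming a walk
 * is just saving and restoring those fields: */
	iter->folioq = folioq;		/* current batch in the chain */
	iter->folioq_slot = slot;	/* folio index within the batch */
	iter->iov_offset = offset;	/* byte offset within that folio */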
 */ static struct smbd_mr *get_mr(struct smbd_connection *info) @@ -2311,7 +2310,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, /* * There is no need for waiting for complemtion on ib_post_send * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution - * on the next ib_post_send when we actaully send I/O to remote peer + * on the next ib_post_send when we actually send I/O to remote peer */ rc = ib_post_send(info->id->qp, &reg_wr->wr, NULL); if (!rc) @@ -2463,6 +2462,8 @@ static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter, start = 0; } + if (ret > 0) + iov_iter_advance(iter, ret); return ret; } @@ -2519,50 +2520,65 @@ static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter, start = 0; } + if (ret > 0) + iov_iter_advance(iter, ret); return ret; } /* - * Extract folio fragments from an XARRAY-class iterator and add them to an - * RDMA list. The folios are not pinned. + * Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA + * list. The folios are not pinned. */ -static ssize_t smb_extract_xarray_to_rdma(struct iov_iter *iter, +static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter, struct smb_extract_to_rdma *rdma, ssize_t maxsize) { - struct xarray *xa = iter->xarray; - struct folio *folio; - loff_t start = iter->xarray_start + iter->iov_offset; - pgoff_t index = start / PAGE_SIZE; + const struct folio_queue *folioq = iter->folioq; + unsigned int slot = iter->folioq_slot; ssize_t ret = 0; - size_t off, len; - XA_STATE(xas, xa, index); + size_t offset = iter->iov_offset; - rcu_read_lock(); + BUG_ON(!folioq); - xas_for_each(&xas, folio, ULONG_MAX) { - if (xas_retry(&xas, folio)) - continue; - if (WARN_ON(xa_is_value(folio))) - break; - if (WARN_ON(folio_test_hugetlb(folio))) - break; + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + if (WARN_ON_ONCE(!folioq)) + return -EIO; + slot = 0; + } - off = offset_in_folio(folio, start); - len = min_t(size_t, maxsize, folio_size(folio) - off); + do { + struct folio *folio = folioq_folio(folioq, slot); + size_t fsize = folioq_folio_size(folioq, slot); - if (!smb_set_sge(rdma, folio_page(folio, 0), off, len)) { - rcu_read_unlock(); - return -EIO; + if (offset < fsize) { + size_t part = umin(maxsize - ret, fsize - offset); + + if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part)) + return -EIO; + + offset += part; + ret += part; } - maxsize -= len; - ret += len; - if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) - break; - } + if (offset >= fsize) { + offset = 0; + slot++; + if (slot >= folioq_nr_slots(folioq)) { + if (!folioq->next) { + WARN_ON_ONCE(ret < iter->count); + break; + } + folioq = folioq->next; + slot = 0; + } + } + } while (rdma->nr_sge < rdma->max_sge || maxsize > 0); - rcu_read_unlock(); + iter->folioq = folioq; + iter->folioq_slot = slot; + iter->iov_offset = offset; + iter->count -= ret; return ret; } @@ -2590,17 +2606,15 @@ static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, case ITER_KVEC: ret = smb_extract_kvec_to_rdma(iter, rdma, len); break; - case ITER_XARRAY: - ret = smb_extract_xarray_to_rdma(iter, rdma, len); + case ITER_FOLIOQ: + ret = smb_extract_folioq_to_rdma(iter, rdma, len); break; default: WARN_ON_ONCE(1); return -EIO; } - if (ret > 0) { - iov_iter_advance(iter, ret); - } else if (ret < 0) { + if (ret < 0) { while (rdma->nr_sge > before) { struct ib_sge *sge = &rdma->sge[rdma->nr_sge--]; diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 0f0c10c7ada7..8e9964001e2a --- 
a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -30,6 +30,7 @@ EM(cifs_trace_rw_credits_old_session, "old-session") \ EM(cifs_trace_rw_credits_read_response_add, "rd-resp-add") \ EM(cifs_trace_rw_credits_read_response_clear, "rd-resp-clr") \ + EM(cifs_trace_rw_credits_read_resubmit, "rd-resubmit") \ EM(cifs_trace_rw_credits_read_submit, "rd-submit ") \ EM(cifs_trace_rw_credits_write_prepare, "wr-prepare ") \ EM(cifs_trace_rw_credits_write_response_add, "wr-resp-add") \ diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index adfe0d058701..fd5a85d43759 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -28,6 +28,7 @@ #include "cifs_debug.h" #include "smb2proto.h" #include "smbdirect.h" +#include "compress.h" /* Max number of iovectors we can use off the stack when sending requests. */ #define CIFS_MAX_IOV_SIZE 8 @@ -432,6 +433,9 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, struct kvec *iov; int rc; + if (flags & CIFS_COMPRESS_REQ) + return smb_compress(server, &rqst[0], __smb_send_rqst); + if (!(flags & CIFS_TRANSFORM_REQ)) return __smb_send_rqst(server, num_rqst, rqst); @@ -1289,7 +1293,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, out: /* * This will dequeue all mids. After this it is important that the - * demultiplex_thread will not process any of these mids any futher. + * demultiplex_thread will not process any of these mids any further. * This is prevented above by using a noop callback that will not * wake this thread except for the very last PDU. */ diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c index 6780aa3e98a1..58a584f0b27e 100644 --- a/fs/smb/client/xattr.c +++ b/fs/smb/client/xattr.c @@ -162,7 +162,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, case XATTR_CIFS_ACL: case XATTR_CIFS_NTSD: case XATTR_CIFS_NTSD_FULL: { - struct cifs_ntsd *pacl; + struct smb_ntsd *pacl; if (!value) goto out; @@ -315,7 +315,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, * fetch owner and DACL otherwise */ u32 acllen, extra_info; - struct cifs_ntsd *pacl; + struct smb_ntsd *pacl; if (pTcon->ses->server->ops->get_acl == NULL) goto out; /* rc already EOPNOTSUPP */ diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index c3ee42188d25..c769f9dbc0b4 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1216,6 +1216,8 @@ struct create_context { ); __u8 Buffer[]; } __packed; +static_assert(offsetof(struct create_context, Buffer) == sizeof(struct create_context_hdr), + "struct member likely outside of __struct_group()"); struct smb2_create_req { struct smb2_hdr hdr; diff --git a/fs/smb/client/smb2status.h b/fs/smb/common/smb2status.h index 9c6d79b0bd49..14b4a5f04564 100644 --- a/fs/smb/client/smb2status.h +++ b/fs/smb/common/smb2status.h @@ -901,6 +901,10 @@ struct ntstatus { #define STATUS_DEVICE_ENUMERATION_ERROR cpu_to_le32(0xC0000366) #define STATUS_MOUNT_POINT_NOT_RESOLVED cpu_to_le32(0xC0000368) #define STATUS_INVALID_DEVICE_OBJECT_PARAMETER cpu_to_le32(0xC0000369) +/* + * 'OCCURED' is typo in MS-ERREF, it should be 'OCCURRED', + * but we'll keep it consistent with MS-ERREF. 
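The smb_extract_folioq_to_rdma() rewrite above drops the xarray walk in favour of a cursor over a folio_queue: a linked chain of fixed-size slot arrays traversed as (segment, slot, intra-folio offset), with the iterator state written back directly instead of calling iov_iter_advance(). A self-contained model of that cursor logic, using local stand-in types rather than the kernel's folio_queue API:

#include <stddef.h>
#include <stdio.h>

#define NR_SLOTS 3                       /* stand-in for folioq_nr_slots() */

struct seg {                             /* stand-in for struct folio_queue */
	size_t fsize[NR_SLOTS];          /* stand-in for folioq_folio_size() */
	struct seg *next;
};

struct cursor {                          /* iter->folioq / folioq_slot / iov_offset */
	struct seg *seg;
	unsigned int slot;
	size_t offset;
};

/* Emit up to maxsize bytes as (slot, offset, len) fragments, advancing
 * the cursor the same way the rewritten extractor does. */
static size_t extract(struct cursor *c, size_t maxsize)
{
	size_t done = 0;

	while (c->seg && done < maxsize) {
		size_t fsize = c->seg->fsize[c->slot];

		if (c->offset < fsize) {
			size_t part = maxsize - done;

			if (part > fsize - c->offset)
				part = fsize - c->offset;   /* umin() */
			printf("sge: slot %u off %zu len %zu\n",
			       c->slot, c->offset, part);
			c->offset += part;
			done += part;
		}
		if (c->offset >= fsize) {        /* folio used up: advance */
			c->offset = 0;
			if (++c->slot >= NR_SLOTS) {
				c->seg = c->seg->next;   /* hop segments */
				c->slot = 0;
			}
		}
	}
	return done;
}

int main(void)
{
	struct seg b = { .fsize = { 4096, 4096, 0 } };
	struct seg a = { .fsize = { 4096, 8192, 4096 }, .next = &b };
	struct cursor c = { .seg = &a, .slot = 0, .offset = 512 };

	printf("extracted %zu bytes\n", extract(&c, 20000));
	return 0;
}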
+ */ #define STATUS_MCA_OCCURED cpu_to_le32(0xC000036A) #define STATUS_DRIVER_BLOCKED_CRITICAL cpu_to_le32(0xC000036B) #define STATUS_DRIVER_BLOCKED cpu_to_le32(0xC000036C) @@ -1769,3 +1773,5 @@ struct ntstatus { #define STATUS_IPSEC_INVALID_PACKET cpu_to_le32(0xC0360005) #define STATUS_IPSEC_INTEGRITY_CHECK_FAILED cpu_to_le32(0xC0360006) #define STATUS_IPSEC_CLEAR_TEXT_DROP cpu_to_le32(0xC0360007) +#define STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP cpu_to_le32(0xC05D0000) +#define STATUS_INVALID_LOCK_RANGE cpu_to_le32(0xC00001a1) diff --git a/fs/smb/common/smbacl.h b/fs/smb/common/smbacl.h new file mode 100644 index 000000000000..6a60698fc6f0 --- /dev/null +++ b/fs/smb/common/smbacl.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/* + * Copyright (c) International Business Machines Corp., 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * Modified by Namjae Jeon (linkinjeon@kernel.org) + */ + +#ifndef _COMMON_SMBACL_H +#define _COMMON_SMBACL_H + +#define NUM_AUTHS (6) /* number of authority fields */ +#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */ + +/* ACE types - see MS-DTYP 2.4.4.1 */ +#define ACCESS_ALLOWED_ACE_TYPE 0x00 +#define ACCESS_DENIED_ACE_TYPE 0x01 +#define SYSTEM_AUDIT_ACE_TYPE 0x02 +#define SYSTEM_ALARM_ACE_TYPE 0x03 +#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */ +#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */ +#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11 +#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12 +#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13 + +/* ACE flags */ +#define OBJECT_INHERIT_ACE 0x01 +#define CONTAINER_INHERIT_ACE 0x02 +#define NO_PROPAGATE_INHERIT_ACE 0x04 +#define INHERIT_ONLY_ACE 0x08 +#define INHERITED_ACE 0x10 +#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40 +#define FAILED_ACCESS_ACE_FLAG 0x80 + +/* + * Maximum size of a string representation of a SID: + * + * The fields are unsigned values in decimal. So: + * + * u8: max 3 bytes in decimal + * u32: max 10 bytes in decimal + * + * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator + * + * For authority field, max is when all 6 values are non-zero and it must be + * represented in hex. So "-0x" + 12 hex digits. 
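As a concrete check of the sizing arithmetic in the comment above: "S-" (2 bytes) plus a u8 revision in decimal (3) plus "-0x" and 12 hex digits for the authority (15) plus the NUL terminator (1) gives 21 bytes, and each of the up-to-15 sub-authorities adds 11 more. A compile-time restatement (macro values copied from the definitions that follow just below):

/* Restated locally purely to check the arithmetic. */
#define SID_STRING_BASE_SIZE    (2 + 3 + 15 + 1)   /* "S-" + u8 + "-0x" + 12 hex + NUL */
#define SID_STRING_SUBAUTH_SIZE (11)               /* "-" + u32 in decimal */

/* Worst case, e.g. "S-255-0x123456789ABC" followed by 15 "-4294967295"
 * fields: 21 + 15 * 11 = 186 bytes including the terminator. */
_Static_assert(SID_STRING_BASE_SIZE + 15 * SID_STRING_SUBAUTH_SIZE == 186,
	       "worst-case SID string is 186 bytes");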
+ * + * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-') + */ +#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1) +#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */ + +#define DOMAIN_USER_RID_LE cpu_to_le32(513) + +/* + * ACE types - see MS-DTYP 2.4.4.1 + */ +enum { + ACCESS_ALLOWED, + ACCESS_DENIED, +}; + +/* + * Security ID types + */ +enum { + SIDOWNER = 1, + SIDGROUP, + SIDCREATOR_OWNER, + SIDCREATOR_GROUP, + SIDUNIX_USER, + SIDUNIX_GROUP, + SIDNFS_USER, + SIDNFS_GROUP, + SIDNFS_MODE, +}; + +struct smb_ntsd { + __le16 revision; /* revision level */ + __le16 type; + __le32 osidoffset; + __le32 gsidoffset; + __le32 sacloffset; + __le32 dacloffset; +} __attribute__((packed)); + +struct smb_sid { + __u8 revision; /* revision level */ + __u8 num_subauth; + __u8 authority[NUM_AUTHS]; + __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */ +} __attribute__((packed)); + +/* size of a struct smb_sid, sans sub_auth array */ +#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS) + +struct smb_acl { + __le16 revision; /* revision level */ + __le16 size; + __le32 num_aces; +} __attribute__((packed)); + +struct smb_ace { + __u8 type; /* see above and MS-DTYP 2.4.4.1 */ + __u8 flags; + __le16 size; + __le32 access_req; + struct smb_sid sid; /* ie UUID of user or group who gets these perms */ +} __attribute__((packed)); + +#endif /* _COMMON_SMBACL_H */ diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 09e1e7771592..cac80e7bfefc 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -39,7 +39,8 @@ void ksmbd_conn_free(struct ksmbd_conn *conn) xa_destroy(&conn->sessions); kvfree(conn->request_buf); kfree(conn->preauth_info); - kfree(conn); + if (atomic_dec_and_test(&conn->refcnt)) + kfree(conn); } /** @@ -68,6 +69,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) conn->um = NULL; atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); + atomic_set(&conn->refcnt, 1); conn->total_credits = 1; conn->outstanding_credits = 0; @@ -165,11 +167,43 @@ void ksmbd_all_conn_set_status(u64 sess_id, u32 status) up_read(&conn_list_lock); } -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id) +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn) { wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2); } +int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id) +{ + struct ksmbd_conn *conn; + int rc, retry_count = 0, max_timeout = 120; + int rcount = 1; + +retry_idle: + if (retry_count >= max_timeout) + return -EIO; + + down_read(&conn_list_lock); + list_for_each_entry(conn, &conn_list, conns_list) { + if (conn->binding || xa_load(&conn->sessions, sess_id)) { + if (conn == curr_conn) + rcount = 2; + if (atomic_read(&conn->req_running) >= rcount) { + rc = wait_event_timeout(conn->req_running_q, + atomic_read(&conn->req_running) < rcount, + HZ); + if (!rc) { + up_read(&conn_list_lock); + retry_count++; + goto retry_idle; + } + } + } + } + up_read(&conn_list_lock); + + return 0; +} + int ksmbd_conn_write(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 5c2845e47cf2..b379ae4fdcdf 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -106,6 +106,7 @@ struct ksmbd_conn { bool signing_negotiated; __le16 signing_algorithm; bool binding; + atomic_t refcnt; }; struct ksmbd_conn_ops { @@ -145,7 +146,8 @@ extern struct list_head conn_list; extern struct rw_semaphore 
conn_list_lock; bool ksmbd_conn_alive(struct ksmbd_conn *conn); -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id); +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn); +int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id); struct ksmbd_conn *ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c index e0a6b758094f..d8d03070ae44 100644 --- a/fs/smb/server/mgmt/share_config.c +++ b/fs/smb/server/mgmt/share_config.c @@ -15,6 +15,7 @@ #include "share_config.h" #include "user_config.h" #include "user_session.h" +#include "../connection.h" #include "../transport_ipc.h" #include "../misc.h" @@ -120,12 +121,13 @@ static int parse_veto_list(struct ksmbd_share_config *share, return 0; } -static struct ksmbd_share_config *share_config_request(struct unicode_map *um, +static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work, const char *name) { struct ksmbd_share_config_response *resp; struct ksmbd_share_config *share = NULL; struct ksmbd_share_config *lookup; + struct unicode_map *um = work->conn->um; int ret; resp = ksmbd_ipc_share_config_request(name); @@ -181,7 +183,14 @@ static struct ksmbd_share_config *share_config_request(struct unicode_map *um, KSMBD_SHARE_CONFIG_VETO_LIST(resp), resp->veto_list_sz); if (!ret && share->path) { + if (__ksmbd_override_fsids(work, share)) { + kill_share(share); + share = NULL; + goto out; + } + ret = kern_path(share->path, 0, &share->vfs_path); + ksmbd_revert_fsids(work); if (ret) { ksmbd_debug(SMB, "failed to access '%s'\n", share->path); @@ -214,7 +223,7 @@ out: return share; } -struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, +struct ksmbd_share_config *ksmbd_share_config_get(struct ksmbd_work *work, const char *name) { struct ksmbd_share_config *share; @@ -227,7 +236,7 @@ struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, if (share) return share; - return share_config_request(um, name); + return share_config_request(work, name); } bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, diff --git a/fs/smb/server/mgmt/share_config.h b/fs/smb/server/mgmt/share_config.h index 5f591751b923..d4ac2dd4de20 100644 --- a/fs/smb/server/mgmt/share_config.h +++ b/fs/smb/server/mgmt/share_config.h @@ -11,6 +11,8 @@ #include <linux/path.h> #include <linux/unicode.h> +struct ksmbd_work; + struct ksmbd_share_config { char *name; char *path; @@ -68,7 +70,7 @@ static inline void ksmbd_share_config_put(struct ksmbd_share_config *share) __ksmbd_share_config_put(share); } -struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, +struct ksmbd_share_config *ksmbd_share_config_get(struct ksmbd_work *work, const char *name); bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, const char *filename); diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c index d2c81a8a11dd..94a52a75014a 100644 --- a/fs/smb/server/mgmt/tree_connect.c +++ b/fs/smb/server/mgmt/tree_connect.c @@ -16,17 +16,18 @@ #include "user_session.h" struct ksmbd_tree_conn_status -ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - const char *share_name) +ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name) { struct ksmbd_tree_conn_status status = {-ENOENT, NULL}; struct ksmbd_tree_connect_response *resp = NULL; struct ksmbd_share_config *sc; 
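ksmbd_conn_wait_idle_sess_id() above implements a bounded quiesce: scan the connection list under a read lock, wait up to HZ for each busy connection's req_running to drop below the permitted count (two for the calling connection, since its own request is still in flight; one for any other), and on timeout drop the lock and rescan from the top, giving up with -EIO after 120 rounds. A user-space model of the per-connection wait using POSIX primitives (all names here are stand-ins):

#include <pthread.h>
#include <time.h>

struct conn {
	pthread_mutex_t lock;
	pthread_cond_t drained;          /* signalled when req_running drops */
	int req_running;
};

/* Wait until c->req_running < limit, in one-second rounds (HZ), at most
 * max_rounds rounds overall; -1 mirrors the -EIO give-up path above. */
static int wait_idle(struct conn *c, int limit, int max_rounds)
{
	for (int round = 0; round < max_rounds; round++) {
		struct timespec deadline;
		int timed_out = 0;

		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_sec += 1;

		pthread_mutex_lock(&c->lock);
		while (c->req_running >= limit && !timed_out)
			timed_out = pthread_cond_timedwait(&c->drained,
							   &c->lock,
							   &deadline) != 0;
		pthread_mutex_unlock(&c->lock);

		if (!timed_out)
			return 0;        /* connection quiesced */
		/* timed out: the kernel code rescans the whole list here */
	}
	return -1;
}

int main(void)
{
	struct conn c = { PTHREAD_MUTEX_INITIALIZER,
			  PTHREAD_COND_INITIALIZER, 0 };

	return wait_idle(&c, 1, 120) ? 1 : 0;
}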
struct ksmbd_tree_connect *tree_conn = NULL; struct sockaddr *peer_addr; + struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; int ret; - sc = ksmbd_share_config_get(conn->um, share_name); + sc = ksmbd_share_config_get(work, share_name); if (!sc) return status; @@ -61,7 +62,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, struct ksmbd_share_config *new_sc; ksmbd_share_config_del(sc); - new_sc = ksmbd_share_config_get(conn->um, share_name); + new_sc = ksmbd_share_config_get(work, share_name); if (!new_sc) { pr_err("Failed to update stale share config\n"); status.ret = -ESTALE; diff --git a/fs/smb/server/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h index 6377a70b811c..a42cdd051041 100644 --- a/fs/smb/server/mgmt/tree_connect.h +++ b/fs/smb/server/mgmt/tree_connect.h @@ -13,6 +13,7 @@ struct ksmbd_share_config; struct ksmbd_user; struct ksmbd_conn; +struct ksmbd_work; enum { TREE_NEW = 0, @@ -50,8 +51,7 @@ static inline int test_tree_conn_flag(struct ksmbd_tree_connect *tree_conn, struct ksmbd_session; struct ksmbd_tree_conn_status -ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - const char *share_name); +ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name); void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon); int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 162a12685d2c..99416ce9f501 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -311,6 +311,7 @@ void destroy_previous_session(struct ksmbd_conn *conn, { struct ksmbd_session *prev_sess; struct ksmbd_user *prev_user; + int err; down_write(&sessions_table_lock); down_write(&conn->session_lock); @@ -325,8 +326,16 @@ void destroy_previous_session(struct ksmbd_conn *conn, memcmp(user->passkey, prev_user->passkey, user->passkey_sz)) goto out; + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_RECONNECT); + err = ksmbd_conn_wait_idle_sess_id(conn, id); + if (err) { + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); + goto out; + } + ksmbd_destroy_file_table(&prev_sess->file_table); prev_sess->state = SMB2_SESSION_EXPIRED; + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); ksmbd_launch_ksmbd_durable_scavenger(); out: up_write(&conn->session_lock); diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index a8f52c4ebbda..246cde380dfb 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -10,7 +10,7 @@ #include "oplock.h" #include "smb_common.h" -#include "smbstatus.h" +#include "../common/smb2status.h" #include "connection.h" #include "mgmt/user_session.h" #include "mgmt/share_config.h" @@ -51,6 +51,7 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work, init_waitqueue_head(&opinfo->oplock_brk); atomic_set(&opinfo->refcount, 1); atomic_set(&opinfo->breaking_cnt, 0); + atomic_inc(&opinfo->conn->refcnt); return opinfo; } @@ -124,6 +125,8 @@ static void free_opinfo(struct oplock_info *opinfo) { if (opinfo->is_lease) free_lease(opinfo); + if (opinfo->conn && atomic_dec_and_test(&opinfo->conn->refcnt)) + kfree(opinfo->conn); kfree(opinfo); } @@ -163,9 +166,7 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci) !atomic_inc_not_zero(&opinfo->refcount)) opinfo = NULL; else { - atomic_inc(&opinfo->conn->r_count); if (ksmbd_conn_releasing(opinfo->conn)) { - atomic_dec(&opinfo->conn->r_count); atomic_dec(&opinfo->refcount); 
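The alloc_opinfo()/free_opinfo() hunks above, together with the ksmbd_conn_alloc()/ksmbd_conn_free() changes earlier, replace the r_count pinning dance with a plain reference count on the connection: the allocator holds one reference, alloc_opinfo() takes another, and whichever of ksmbd_conn_free() or free_opinfo() drops the count to zero does the kfree(), so an oplock_info can safely outlive the connection that created it. A minimal sketch of that ownership scheme in C11 atomics (stand-in types):

#include <stdatomic.h>
#include <stdlib.h>

struct conn {
	atomic_int refcnt;
	/* ... sessions, buffers, ... */
};

static struct conn *conn_alloc(void)
{
	struct conn *c = calloc(1, sizeof(*c));

	if (c)
		atomic_init(&c->refcnt, 1);      /* allocator's reference */
	return c;
}

static void conn_get(struct conn *c)             /* alloc_opinfo() side */
{
	atomic_fetch_add(&c->refcnt, 1);
}

/* Both ksmbd_conn_free() and free_opinfo() reduce to this: whoever
 * drops the last reference frees the object. */
static void conn_put(struct conn *c)
{
	if (atomic_fetch_sub(&c->refcnt, 1) == 1)
		free(c);
}

int main(void)
{
	struct conn *c = conn_alloc();

	conn_get(c);     /* an opinfo takes a reference */
	conn_put(c);     /* connection torn down first... */
	conn_put(c);     /* ...the opinfo release frees it */
	return 0;
}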
opinfo = NULL; } @@ -177,26 +178,11 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci) return opinfo; } -static void opinfo_conn_put(struct oplock_info *opinfo) +void opinfo_put(struct oplock_info *opinfo) { - struct ksmbd_conn *conn; - if (!opinfo) return; - conn = opinfo->conn; - /* - * Checking waitqueue to dropping pending requests on - * disconnection. waitqueue_active is safe because it - * uses atomic operation for condition. - */ - if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q)) - wake_up(&conn->r_count_q); - opinfo_put(opinfo); -} - -void opinfo_put(struct oplock_info *opinfo) -{ if (!atomic_dec_and_test(&opinfo->refcount)) return; @@ -1127,14 +1113,11 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp, if (!atomic_inc_not_zero(&opinfo->refcount)) continue; - atomic_inc(&opinfo->conn->r_count); - if (ksmbd_conn_releasing(opinfo->conn)) { - atomic_dec(&opinfo->conn->r_count); + if (ksmbd_conn_releasing(opinfo->conn)) continue; - } oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE); - opinfo_conn_put(opinfo); + opinfo_put(opinfo); } } up_read(&p_ci->m_lock); @@ -1167,13 +1150,10 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp) if (!atomic_inc_not_zero(&opinfo->refcount)) continue; - atomic_inc(&opinfo->conn->r_count); - if (ksmbd_conn_releasing(opinfo->conn)) { - atomic_dec(&opinfo->conn->r_count); + if (ksmbd_conn_releasing(opinfo->conn)) continue; - } oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE); - opinfo_conn_put(opinfo); + opinfo_put(opinfo); } } up_read(&p_ci->m_lock); @@ -1252,7 +1232,7 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, prev_opinfo = opinfo_get_list(ci); if (!prev_opinfo || (prev_opinfo->level == SMB2_OPLOCK_LEVEL_NONE && lctx)) { - opinfo_conn_put(prev_opinfo); + opinfo_put(prev_opinfo); goto set_lev; } prev_op_has_lease = prev_opinfo->is_lease; @@ -1262,19 +1242,19 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, if (share_ret < 0 && prev_opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { err = share_ret; - opinfo_conn_put(prev_opinfo); + opinfo_put(prev_opinfo); goto err_out; } if (prev_opinfo->level != SMB2_OPLOCK_LEVEL_BATCH && prev_opinfo->level != SMB2_OPLOCK_LEVEL_EXCLUSIVE) { - opinfo_conn_put(prev_opinfo); + opinfo_put(prev_opinfo); goto op_break_not_needed; } list_add(&work->interim_entry, &prev_opinfo->interim_list); err = oplock_break(prev_opinfo, SMB2_OPLOCK_LEVEL_II); - opinfo_conn_put(prev_opinfo); + opinfo_put(prev_opinfo); if (err == -ENOENT) goto set_lev; /* Check all oplock was freed by close */ @@ -1337,14 +1317,14 @@ static void smb_break_all_write_oplock(struct ksmbd_work *work, return; if (brk_opinfo->level != SMB2_OPLOCK_LEVEL_BATCH && brk_opinfo->level != SMB2_OPLOCK_LEVEL_EXCLUSIVE) { - opinfo_conn_put(brk_opinfo); + opinfo_put(brk_opinfo); return; } brk_opinfo->open_trunc = is_trunc; list_add(&work->interim_entry, &brk_opinfo->interim_list); oplock_break(brk_opinfo, SMB2_OPLOCK_LEVEL_II); - opinfo_conn_put(brk_opinfo); + opinfo_put(brk_opinfo); } /** @@ -1376,11 +1356,8 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, if (!atomic_inc_not_zero(&brk_op->refcount)) continue; - atomic_inc(&brk_op->conn->r_count); - if (ksmbd_conn_releasing(brk_op->conn)) { - atomic_dec(&brk_op->conn->r_count); + if (ksmbd_conn_releasing(brk_op->conn)) continue; - } rcu_read_unlock(); if (brk_op->is_lease && (brk_op->o_lease->state & @@ -1411,7 +1388,7 @@ void smb_break_all_levII_oplock(struct 
ksmbd_work *work, struct ksmbd_file *fp, brk_op->open_trunc = is_trunc; oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE); next: - opinfo_conn_put(brk_op); + opinfo_put(brk_op); rcu_read_lock(); } rcu_read_unlock(); @@ -1510,7 +1487,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease) * parse_lease_state() - parse lease context containted in file open request * @open_req: buffer containing smb2 file open(create) request * - * Return: oplock state, -ENOENT if create lease context not found + * Return: allocated lease context object on success, otherwise NULL */ struct lease_ctx_info *parse_lease_state(void *open_req) { diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 4d24cc105ef6..c402d4abe826 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -15,7 +15,7 @@ #include "server.h" #include "smb_common.h" -#include "smbstatus.h" +#include "../common/smb2status.h" #include "connection.h" #include "transport_ipc.h" #include "mgmt/user_session.h" diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index 727cb49926ee..ae501024665e 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -7,7 +7,7 @@ #include "glob.h" #include "nterr.h" #include "smb_common.h" -#include "smbstatus.h" +#include "../common/smb2status.h" #include "mgmt/user_session.h" #include "connection.h" diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 37a39ab4ee65..e6bdc1b20727 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -30,7 +30,7 @@ #include "server.h" #include "smb_common.h" -#include "smbstatus.h" +#include "../common/smb2status.h" #include "ksmbd_work.h" #include "mgmt/user_config.h" #include "mgmt/share_config.h" @@ -519,7 +519,7 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work) * smb2_allocate_rsp_buf() - allocate smb2 response buffer * @work: smb work containing smb request buffer * - * Return: 0 on success, otherwise -ENOMEM + * Return: 0 on success, otherwise error */ int smb2_allocate_rsp_buf(struct ksmbd_work *work) { @@ -1370,7 +1370,8 @@ static int ntlm_negotiate(struct ksmbd_work *work, } sz = le16_to_cpu(rsp->SecurityBufferOffset); - memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); + unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len, + /* alloc is larger than blob, see smb2_allocate_rsp_buf() */); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); out: @@ -1453,7 +1454,9 @@ static int ntlm_authenticate(struct ksmbd_work *work, return -ENOMEM; sz = le16_to_cpu(rsp->SecurityBufferOffset); - memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); + unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, + spnego_blob_len, + /* alloc is larger than blob, see smb2_allocate_rsp_buf() */); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); kfree(spnego_blob); } @@ -1687,6 +1690,8 @@ int smb2_sess_setup(struct ksmbd_work *work) rc = ksmbd_session_register(conn, sess); if (rc) goto out_err; + + conn->binding = false; } else if (conn->dialect >= SMB30_PROT_ID && (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) && req->Flags & SMB2_SESSION_REQ_FLAG_BINDING) { @@ -1765,6 +1770,8 @@ int smb2_sess_setup(struct ksmbd_work *work) sess = NULL; goto out_err; } + + conn->binding = false; } work->sess = sess; @@ -1955,7 +1962,7 @@ int smb2_tree_connect(struct ksmbd_work *work) ksmbd_debug(SMB, "tree connect request for tree %s treename %s\n", name, treename); - status = ksmbd_tree_conn_connect(conn, sess, name); + status = 
ksmbd_tree_conn_connect(work, name); if (status.ret == KSMBD_TREE_CONN_STATUS_OK) rsp->hdr.Id.SyncId.TreeId = cpu_to_le32(status.tree_conn->id); else @@ -2210,7 +2217,7 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_conn_unlock(conn); ksmbd_close_session_fds(work); - ksmbd_conn_wait_idle(conn, sess_id); + ksmbd_conn_wait_idle(conn); /* * Re-lookup session to validate if session is deleted @@ -2767,8 +2774,8 @@ static int parse_durable_handle_context(struct ksmbd_work *work, } } - if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || - req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) { + if ((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || + req_op_level == SMB2_OPLOCK_LEVEL_BATCH) { dh_info->CreateGuid = durable_v2_blob->CreateGuid; dh_info->persistent = @@ -2788,8 +2795,8 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } - if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || - req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) { + if ((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || + req_op_level == SMB2_OPLOCK_LEVEL_BATCH) { ksmbd_debug(SMB, "Request for durable open\n"); dh_info->type = dh_idx; } @@ -3093,7 +3100,6 @@ int smb2_open(struct ksmbd_work *work) goto err_out; } - file_present = true; idmap = mnt_idmap(path.mnt); } else { if (rc != -ENOENT) @@ -3411,7 +3417,7 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } } else { - if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) { + if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE && lc) { if (S_ISDIR(file_inode(filp)->i_mode)) { lc->req_state &= ~SMB2_LEASE_WRITE_CACHING_LE; lc->is_dir = true; @@ -3710,7 +3716,7 @@ err_out2: kfree(name); kfree(lc); - return 0; + return rc; } static int readdir_info_level_struct_sz(int info_level) @@ -4406,7 +4412,8 @@ int smb2_query_dir(struct ksmbd_work *work) rsp->OutputBufferLength = cpu_to_le32(0); rsp->Buffer[0] = 0; rc = ksmbd_iov_pin_rsp(work, (void *)rsp, - sizeof(struct smb2_query_directory_rsp)); + offsetof(struct smb2_query_directory_rsp, Buffer) + + 1); if (rc) goto err_out; } else { @@ -5357,7 +5364,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, "NTFS", PATH_MAX, conn->local_nls, 0); len = len * 2; info->FileSystemNameLen = cpu_to_le32(len); - sz = sizeof(struct filesystem_attribute_info) - 2 + len; + sz = sizeof(struct filesystem_attribute_info) + len; rsp->OutputBufferLength = cpu_to_le32(sz); break; } @@ -5383,7 +5390,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, len = len * 2; info->VolumeLabelSize = cpu_to_le32(len); info->Reserved = 0; - sz = sizeof(struct filesystem_vol_info) - 2 + len; + sz = sizeof(struct filesystem_vol_info) + len; rsp->OutputBufferLength = cpu_to_le32(sz); break; } @@ -5596,6 +5603,11 @@ int smb2_query_info(struct ksmbd_work *work) ksmbd_debug(SMB, "GOT query info request\n"); + if (ksmbd_override_fsids(work)) { + rc = -ENOMEM; + goto err_out; + } + switch (req->InfoType) { case SMB2_O_INFO_FILE: ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n"); @@ -5614,6 +5626,7 @@ int smb2_query_info(struct ksmbd_work *work) req->InfoType); rc = -EOPNOTSUPP; } + ksmbd_revert_fsids(work); if (!rc) { rsp->StructureSize = cpu_to_le16(9); @@ -5623,6 +5636,7 @@ int smb2_query_info(struct ksmbd_work *work) le32_to_cpu(rsp->OutputBufferLength)); } +err_out: if (rc < 0) { if (rc == -EACCES) rsp->hdr.Status = STATUS_ACCESS_DENIED; diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 474dadf6b7b8..cc4bb2377cbd 100644 --- a/fs/smb/server/smb_common.c +++ 
b/fs/smb/server/smb_common.c @@ -9,7 +9,7 @@ #include "smb_common.h" #include "server.h" #include "misc.h" -#include "smbstatus.h" +#include "../common/smb2status.h" #include "connection.h" #include "ksmbd_work.h" #include "mgmt/user_session.h" @@ -732,10 +732,10 @@ bool is_asterisk(char *p) return p && p[0] == '*'; } -int ksmbd_override_fsids(struct ksmbd_work *work) +int __ksmbd_override_fsids(struct ksmbd_work *work, + struct ksmbd_share_config *share) { struct ksmbd_session *sess = work->sess; - struct ksmbd_share_config *share = work->tcon->share_conf; struct cred *cred; struct group_info *gi; unsigned int uid; @@ -775,6 +775,11 @@ int ksmbd_override_fsids(struct ksmbd_work *work) return 0; } +int ksmbd_override_fsids(struct ksmbd_work *work) +{ + return __ksmbd_override_fsids(work, work->tcon->share_conf); +} + void ksmbd_revert_fsids(struct ksmbd_work *work) { const struct cred *cred; diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index f1092519c0c2..cc1d6dfe29d5 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -213,7 +213,7 @@ struct filesystem_attribute_info { __le32 Attributes; __le32 MaxPathNameComponentLength; __le32 FileSystemNameLen; - __le16 FileSystemName[1]; /* do not have to save this - get subset? */ + __le16 FileSystemName[]; /* do not have to save this - get subset? */ } __packed; struct filesystem_device_info { @@ -226,7 +226,7 @@ struct filesystem_vol_info { __le32 SerialNumber; __le32 VolumeLabelSize; __le16 Reserved; - __le16 VolumeLabel[1]; + __le16 VolumeLabel[]; } __packed; struct filesystem_info { @@ -447,6 +447,8 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command); int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp); +int __ksmbd_override_fsids(struct ksmbd_work *work, + struct ksmbd_share_config *share); int ksmbd_override_fsids(struct ksmbd_work *work); void ksmbd_revert_fsids(struct ksmbd_work *work); diff --git a/fs/smb/server/smbacl.h b/fs/smb/server/smbacl.h index 2b52861707d8..24ce576fc292 100644 --- a/fs/smb/server/smbacl.h +++ b/fs/smb/server/smbacl.h @@ -8,6 +8,7 @@ #ifndef _SMBACL_H #define _SMBACL_H +#include "../common/smbacl.h" #include <linux/fs.h> #include <linux/namei.h> #include <linux/posix_acl.h> @@ -15,32 +16,6 @@ #include "mgmt/tree_connect.h" -#define NUM_AUTHS (6) /* number of authority fields */ -#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */ - -/* - * ACE types - see MS-DTYP 2.4.4.1 - */ -enum { - ACCESS_ALLOWED, - ACCESS_DENIED, -}; - -/* - * Security ID types - */ -enum { - SIDOWNER = 1, - SIDGROUP, - SIDCREATOR_OWNER, - SIDCREATOR_GROUP, - SIDUNIX_USER, - SIDUNIX_GROUP, - SIDNFS_USER, - SIDNFS_GROUP, - SIDNFS_MODE, -}; - /* Revision for ACLs */ #define SD_REVISION 1 @@ -62,92 +37,8 @@ enum { #define RM_CONTROL_VALID 0x4000 #define SELF_RELATIVE 0x8000 -/* ACE types - see MS-DTYP 2.4.4.1 */ -#define ACCESS_ALLOWED_ACE_TYPE 0x00 -#define ACCESS_DENIED_ACE_TYPE 0x01 -#define SYSTEM_AUDIT_ACE_TYPE 0x02 -#define SYSTEM_ALARM_ACE_TYPE 0x03 -#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 -#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 -#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 -#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 -#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 -#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 -#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A -#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B -#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 
0x0C -#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D -#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */ -#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F -#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */ -#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11 -#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12 -#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13 - -/* ACE flags */ -#define OBJECT_INHERIT_ACE 0x01 -#define CONTAINER_INHERIT_ACE 0x02 -#define NO_PROPAGATE_INHERIT_ACE 0x04 -#define INHERIT_ONLY_ACE 0x08 -#define INHERITED_ACE 0x10 -#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40 -#define FAILED_ACCESS_ACE_FLAG 0x80 - -/* - * Maximum size of a string representation of a SID: - * - * The fields are unsigned values in decimal. So: - * - * u8: max 3 bytes in decimal - * u32: max 10 bytes in decimal - * - * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator - * - * For authority field, max is when all 6 values are non-zero and it must be - * represented in hex. So "-0x" + 12 hex digits. - * - * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-') - */ -#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1) -#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */ - -#define DOMAIN_USER_RID_LE cpu_to_le32(513) - struct ksmbd_conn; -struct smb_ntsd { - __le16 revision; /* revision level */ - __le16 type; - __le32 osidoffset; - __le32 gsidoffset; - __le32 sacloffset; - __le32 dacloffset; -} __packed; - -struct smb_sid { - __u8 revision; /* revision level */ - __u8 num_subauth; - __u8 authority[NUM_AUTHS]; - __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */ -} __packed; - -/* size of a struct cifs_sid, sans sub_auth array */ -#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS) - -struct smb_acl { - __le16 revision; /* revision level */ - __le16 size; - __le32 num_aces; -} __packed; - -struct smb_ace { - __u8 type; - __u8 flags; - __le16 size; - __le32 access_req; - struct smb_sid sid; /* ie UUID of user or group who gets these perms */ -} __packed; - struct smb_fattr { kuid_t cf_uid; kgid_t cf_gid; diff --git a/fs/smb/server/smbstatus.h b/fs/smb/server/smbstatus.h deleted file mode 100644 index 8963deb42404..000000000000 --- a/fs/smb/server/smbstatus.h +++ /dev/null @@ -1,1822 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1+ */ -/* - * fs/server/smb2status.h - * - * SMB2 Status code (network error) definitions - * Definitions are from MS-ERREF - * - * Copyright (c) International Business Machines Corp., 2009,2011 - * Author(s): Steve French (sfrench@us.ibm.com) - */ - -/* - * 0 1 2 3 4 5 6 7 8 9 0 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F - * SEV C N <-------Facility--------> <------Error Status Code------> - * - * C is set if "customer defined" error, N bit is reserved and MBZ - */ - -#define STATUS_SEVERITY_SUCCESS cpu_to_le32(0x0000) -#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001) -#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002) -#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003) - -struct ntstatus { - /* Facility is the high 12 bits of the following field */ - __le32 Facility; /* low 2 bits Severity, next is Customer, then rsrvd */ - __le32 Code; -}; - -#define STATUS_SUCCESS 0x00000000 -#define STATUS_WAIT_0 cpu_to_le32(0x00000000) -#define STATUS_WAIT_1 cpu_to_le32(0x00000001) -#define STATUS_WAIT_2 cpu_to_le32(0x00000002) -#define STATUS_WAIT_3 cpu_to_le32(0x00000003) -#define STATUS_WAIT_63 cpu_to_le32(0x0000003F) -#define STATUS_ABANDONED cpu_to_le32(0x00000080) -#define STATUS_ABANDONED_WAIT_0 
cpu_to_le32(0x00000080) -#define STATUS_ABANDONED_WAIT_63 cpu_to_le32(0x000000BF) -#define STATUS_USER_APC cpu_to_le32(0x000000C0) -#define STATUS_KERNEL_APC cpu_to_le32(0x00000100) -#define STATUS_ALERTED cpu_to_le32(0x00000101) -#define STATUS_TIMEOUT cpu_to_le32(0x00000102) -#define STATUS_PENDING cpu_to_le32(0x00000103) -#define STATUS_REPARSE cpu_to_le32(0x00000104) -#define STATUS_MORE_ENTRIES cpu_to_le32(0x00000105) -#define STATUS_NOT_ALL_ASSIGNED cpu_to_le32(0x00000106) -#define STATUS_SOME_NOT_MAPPED cpu_to_le32(0x00000107) -#define STATUS_OPLOCK_BREAK_IN_PROGRESS cpu_to_le32(0x00000108) -#define STATUS_VOLUME_MOUNTED cpu_to_le32(0x00000109) -#define STATUS_RXACT_COMMITTED cpu_to_le32(0x0000010A) -#define STATUS_NOTIFY_CLEANUP cpu_to_le32(0x0000010B) -#define STATUS_NOTIFY_ENUM_DIR cpu_to_le32(0x0000010C) -#define STATUS_NO_QUOTAS_FOR_ACCOUNT cpu_to_le32(0x0000010D) -#define STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED cpu_to_le32(0x0000010E) -#define STATUS_PAGE_FAULT_TRANSITION cpu_to_le32(0x00000110) -#define STATUS_PAGE_FAULT_DEMAND_ZERO cpu_to_le32(0x00000111) -#define STATUS_PAGE_FAULT_COPY_ON_WRITE cpu_to_le32(0x00000112) -#define STATUS_PAGE_FAULT_GUARD_PAGE cpu_to_le32(0x00000113) -#define STATUS_PAGE_FAULT_PAGING_FILE cpu_to_le32(0x00000114) -#define STATUS_CACHE_PAGE_LOCKED cpu_to_le32(0x00000115) -#define STATUS_CRASH_DUMP cpu_to_le32(0x00000116) -#define STATUS_BUFFER_ALL_ZEROS cpu_to_le32(0x00000117) -#define STATUS_REPARSE_OBJECT cpu_to_le32(0x00000118) -#define STATUS_RESOURCE_REQUIREMENTS_CHANGED cpu_to_le32(0x00000119) -#define STATUS_TRANSLATION_COMPLETE cpu_to_le32(0x00000120) -#define STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY cpu_to_le32(0x00000121) -#define STATUS_NOTHING_TO_TERMINATE cpu_to_le32(0x00000122) -#define STATUS_PROCESS_NOT_IN_JOB cpu_to_le32(0x00000123) -#define STATUS_PROCESS_IN_JOB cpu_to_le32(0x00000124) -#define STATUS_VOLSNAP_HIBERNATE_READY cpu_to_le32(0x00000125) -#define STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY cpu_to_le32(0x00000126) -#define STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED cpu_to_le32(0x00000127) -#define STATUS_INTERRUPT_STILL_CONNECTED cpu_to_le32(0x00000128) -#define STATUS_PROCESS_CLONED cpu_to_le32(0x00000129) -#define STATUS_FILE_LOCKED_WITH_ONLY_READERS cpu_to_le32(0x0000012A) -#define STATUS_FILE_LOCKED_WITH_WRITERS cpu_to_le32(0x0000012B) -#define STATUS_RESOURCEMANAGER_READ_ONLY cpu_to_le32(0x00000202) -#define STATUS_WAIT_FOR_OPLOCK cpu_to_le32(0x00000367) -#define DBG_EXCEPTION_HANDLED cpu_to_le32(0x00010001) -#define DBG_CONTINUE cpu_to_le32(0x00010002) -#define STATUS_FLT_IO_COMPLETE cpu_to_le32(0x001C0001) -#define STATUS_OBJECT_NAME_EXISTS cpu_to_le32(0x40000000) -#define STATUS_THREAD_WAS_SUSPENDED cpu_to_le32(0x40000001) -#define STATUS_WORKING_SET_LIMIT_RANGE cpu_to_le32(0x40000002) -#define STATUS_IMAGE_NOT_AT_BASE cpu_to_le32(0x40000003) -#define STATUS_RXACT_STATE_CREATED cpu_to_le32(0x40000004) -#define STATUS_SEGMENT_NOTIFICATION cpu_to_le32(0x40000005) -#define STATUS_LOCAL_USER_SESSION_KEY cpu_to_le32(0x40000006) -#define STATUS_BAD_CURRENT_DIRECTORY cpu_to_le32(0x40000007) -#define STATUS_SERIAL_MORE_WRITES cpu_to_le32(0x40000008) -#define STATUS_REGISTRY_RECOVERED cpu_to_le32(0x40000009) -#define STATUS_FT_READ_RECOVERY_FROM_BACKUP cpu_to_le32(0x4000000A) -#define STATUS_FT_WRITE_RECOVERY cpu_to_le32(0x4000000B) -#define STATUS_SERIAL_COUNTER_TIMEOUT cpu_to_le32(0x4000000C) -#define STATUS_NULL_LM_PASSWORD cpu_to_le32(0x4000000D) -#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH cpu_to_le32(0x4000000E) -#define 
STATUS_RECEIVE_PARTIAL cpu_to_le32(0x4000000F) -#define STATUS_RECEIVE_EXPEDITED cpu_to_le32(0x40000010) -#define STATUS_RECEIVE_PARTIAL_EXPEDITED cpu_to_le32(0x40000011) -#define STATUS_EVENT_DONE cpu_to_le32(0x40000012) -#define STATUS_EVENT_PENDING cpu_to_le32(0x40000013) -#define STATUS_CHECKING_FILE_SYSTEM cpu_to_le32(0x40000014) -#define STATUS_FATAL_APP_EXIT cpu_to_le32(0x40000015) -#define STATUS_PREDEFINED_HANDLE cpu_to_le32(0x40000016) -#define STATUS_WAS_UNLOCKED cpu_to_le32(0x40000017) -#define STATUS_SERVICE_NOTIFICATION cpu_to_le32(0x40000018) -#define STATUS_WAS_LOCKED cpu_to_le32(0x40000019) -#define STATUS_LOG_HARD_ERROR cpu_to_le32(0x4000001A) -#define STATUS_ALREADY_WIN32 cpu_to_le32(0x4000001B) -#define STATUS_WX86_UNSIMULATE cpu_to_le32(0x4000001C) -#define STATUS_WX86_CONTINUE cpu_to_le32(0x4000001D) -#define STATUS_WX86_SINGLE_STEP cpu_to_le32(0x4000001E) -#define STATUS_WX86_BREAKPOINT cpu_to_le32(0x4000001F) -#define STATUS_WX86_EXCEPTION_CONTINUE cpu_to_le32(0x40000020) -#define STATUS_WX86_EXCEPTION_LASTCHANCE cpu_to_le32(0x40000021) -#define STATUS_WX86_EXCEPTION_CHAIN cpu_to_le32(0x40000022) -#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE cpu_to_le32(0x40000023) -#define STATUS_NO_YIELD_PERFORMED cpu_to_le32(0x40000024) -#define STATUS_TIMER_RESUME_IGNORED cpu_to_le32(0x40000025) -#define STATUS_ARBITRATION_UNHANDLED cpu_to_le32(0x40000026) -#define STATUS_CARDBUS_NOT_SUPPORTED cpu_to_le32(0x40000027) -#define STATUS_WX86_CREATEWX86TIB cpu_to_le32(0x40000028) -#define STATUS_MP_PROCESSOR_MISMATCH cpu_to_le32(0x40000029) -#define STATUS_HIBERNATED cpu_to_le32(0x4000002A) -#define STATUS_RESUME_HIBERNATION cpu_to_le32(0x4000002B) -#define STATUS_FIRMWARE_UPDATED cpu_to_le32(0x4000002C) -#define STATUS_DRIVERS_LEAKING_LOCKED_PAGES cpu_to_le32(0x4000002D) -#define STATUS_MESSAGE_RETRIEVED cpu_to_le32(0x4000002E) -#define STATUS_SYSTEM_POWERSTATE_TRANSITION cpu_to_le32(0x4000002F) -#define STATUS_ALPC_CHECK_COMPLETION_LIST cpu_to_le32(0x40000030) -#define STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION cpu_to_le32(0x40000031) -#define STATUS_ACCESS_AUDIT_BY_POLICY cpu_to_le32(0x40000032) -#define STATUS_ABANDON_HIBERFILE cpu_to_le32(0x40000033) -#define STATUS_BIZRULES_NOT_ENABLED cpu_to_le32(0x40000034) -#define STATUS_WAKE_SYSTEM cpu_to_le32(0x40000294) -#define STATUS_DS_SHUTTING_DOWN cpu_to_le32(0x40000370) -#define DBG_REPLY_LATER cpu_to_le32(0x40010001) -#define DBG_UNABLE_TO_PROVIDE_HANDLE cpu_to_le32(0x40010002) -#define DBG_TERMINATE_THREAD cpu_to_le32(0x40010003) -#define DBG_TERMINATE_PROCESS cpu_to_le32(0x40010004) -#define DBG_CONTROL_C cpu_to_le32(0x40010005) -#define DBG_PRINTEXCEPTION_C cpu_to_le32(0x40010006) -#define DBG_RIPEXCEPTION cpu_to_le32(0x40010007) -#define DBG_CONTROL_BREAK cpu_to_le32(0x40010008) -#define DBG_COMMAND_EXCEPTION cpu_to_le32(0x40010009) -#define RPC_NT_UUID_LOCAL_ONLY cpu_to_le32(0x40020056) -#define RPC_NT_SEND_INCOMPLETE cpu_to_le32(0x400200AF) -#define STATUS_CTX_CDM_CONNECT cpu_to_le32(0x400A0004) -#define STATUS_CTX_CDM_DISCONNECT cpu_to_le32(0x400A0005) -#define STATUS_SXS_RELEASE_ACTIVATION_CONTEXT cpu_to_le32(0x4015000D) -#define STATUS_RECOVERY_NOT_NEEDED cpu_to_le32(0x40190034) -#define STATUS_RM_ALREADY_STARTED cpu_to_le32(0x40190035) -#define STATUS_LOG_NO_RESTART cpu_to_le32(0x401A000C) -#define STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST cpu_to_le32(0x401B00EC) -#define STATUS_GRAPHICS_PARTIAL_DATA_POPULATED cpu_to_le32(0x401E000A) -#define STATUS_GRAPHICS_DRIVER_MISMATCH cpu_to_le32(0x401E0117) -#define 
STATUS_GRAPHICS_MODE_NOT_PINNED cpu_to_le32(0x401E0307) -#define STATUS_GRAPHICS_NO_PREFERRED_MODE cpu_to_le32(0x401E031E) -#define STATUS_GRAPHICS_DATASET_IS_EMPTY cpu_to_le32(0x401E034B) -#define STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET cpu_to_le32(0x401E034C) -#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED \ - cpu_to_le32(0x401E0351) -#define STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS cpu_to_le32(0x401E042F) -#define STATUS_GRAPHICS_LEADLINK_START_DEFERRED cpu_to_le32(0x401E0437) -#define STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY cpu_to_le32(0x401E0439) -#define STATUS_GRAPHICS_START_DEFERRED cpu_to_le32(0x401E043A) -#define STATUS_NDIS_INDICATION_REQUIRED cpu_to_le32(0x40230001) -#define STATUS_GUARD_PAGE_VIOLATION cpu_to_le32(0x80000001) -#define STATUS_DATATYPE_MISALIGNMENT cpu_to_le32(0x80000002) -#define STATUS_BREAKPOINT cpu_to_le32(0x80000003) -#define STATUS_SINGLE_STEP cpu_to_le32(0x80000004) -#define STATUS_BUFFER_OVERFLOW cpu_to_le32(0x80000005) -#define STATUS_NO_MORE_FILES cpu_to_le32(0x80000006) -#define STATUS_WAKE_SYSTEM_DEBUGGER cpu_to_le32(0x80000007) -#define STATUS_HANDLES_CLOSED cpu_to_le32(0x8000000A) -#define STATUS_NO_INHERITANCE cpu_to_le32(0x8000000B) -#define STATUS_GUID_SUBSTITUTION_MADE cpu_to_le32(0x8000000C) -#define STATUS_PARTIAL_COPY cpu_to_le32(0x8000000D) -#define STATUS_DEVICE_PAPER_EMPTY cpu_to_le32(0x8000000E) -#define STATUS_DEVICE_POWERED_OFF cpu_to_le32(0x8000000F) -#define STATUS_DEVICE_OFF_LINE cpu_to_le32(0x80000010) -#define STATUS_DEVICE_BUSY cpu_to_le32(0x80000011) -#define STATUS_NO_MORE_EAS cpu_to_le32(0x80000012) -#define STATUS_INVALID_EA_NAME cpu_to_le32(0x80000013) -#define STATUS_EA_LIST_INCONSISTENT cpu_to_le32(0x80000014) -#define STATUS_INVALID_EA_FLAG cpu_to_le32(0x80000015) -#define STATUS_VERIFY_REQUIRED cpu_to_le32(0x80000016) -#define STATUS_EXTRANEOUS_INFORMATION cpu_to_le32(0x80000017) -#define STATUS_RXACT_COMMIT_NECESSARY cpu_to_le32(0x80000018) -#define STATUS_NO_MORE_ENTRIES cpu_to_le32(0x8000001A) -#define STATUS_FILEMARK_DETECTED cpu_to_le32(0x8000001B) -#define STATUS_MEDIA_CHANGED cpu_to_le32(0x8000001C) -#define STATUS_BUS_RESET cpu_to_le32(0x8000001D) -#define STATUS_END_OF_MEDIA cpu_to_le32(0x8000001E) -#define STATUS_BEGINNING_OF_MEDIA cpu_to_le32(0x8000001F) -#define STATUS_MEDIA_CHECK cpu_to_le32(0x80000020) -#define STATUS_SETMARK_DETECTED cpu_to_le32(0x80000021) -#define STATUS_NO_DATA_DETECTED cpu_to_le32(0x80000022) -#define STATUS_REDIRECTOR_HAS_OPEN_HANDLES cpu_to_le32(0x80000023) -#define STATUS_SERVER_HAS_OPEN_HANDLES cpu_to_le32(0x80000024) -#define STATUS_ALREADY_DISCONNECTED cpu_to_le32(0x80000025) -#define STATUS_LONGJUMP cpu_to_le32(0x80000026) -#define STATUS_CLEANER_CARTRIDGE_INSTALLED cpu_to_le32(0x80000027) -#define STATUS_PLUGPLAY_QUERY_VETOED cpu_to_le32(0x80000028) -#define STATUS_UNWIND_CONSOLIDATE cpu_to_le32(0x80000029) -#define STATUS_REGISTRY_HIVE_RECOVERED cpu_to_le32(0x8000002A) -#define STATUS_DLL_MIGHT_BE_INSECURE cpu_to_le32(0x8000002B) -#define STATUS_DLL_MIGHT_BE_INCOMPATIBLE cpu_to_le32(0x8000002C) -#define STATUS_STOPPED_ON_SYMLINK cpu_to_le32(0x8000002D) -#define STATUS_DEVICE_REQUIRES_CLEANING cpu_to_le32(0x80000288) -#define STATUS_DEVICE_DOOR_OPEN cpu_to_le32(0x80000289) -#define STATUS_DATA_LOST_REPAIR cpu_to_le32(0x80000803) -#define DBG_EXCEPTION_NOT_HANDLED cpu_to_le32(0x80010001) -#define STATUS_CLUSTER_NODE_ALREADY_UP cpu_to_le32(0x80130001) -#define STATUS_CLUSTER_NODE_ALREADY_DOWN cpu_to_le32(0x80130002) -#define 
STATUS_CLUSTER_NETWORK_ALREADY_ONLINE cpu_to_le32(0x80130003) -#define STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE cpu_to_le32(0x80130004) -#define STATUS_CLUSTER_NODE_ALREADY_MEMBER cpu_to_le32(0x80130005) -#define STATUS_COULD_NOT_RESIZE_LOG cpu_to_le32(0x80190009) -#define STATUS_NO_TXF_METADATA cpu_to_le32(0x80190029) -#define STATUS_CANT_RECOVER_WITH_HANDLE_OPEN cpu_to_le32(0x80190031) -#define STATUS_TXF_METADATA_ALREADY_PRESENT cpu_to_le32(0x80190041) -#define STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET cpu_to_le32(0x80190042) -#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED \ - cpu_to_le32(0x801B00EB) -#define STATUS_FLT_BUFFER_TOO_SMALL cpu_to_le32(0x801C0001) -#define STATUS_FVE_PARTIAL_METADATA cpu_to_le32(0x80210001) -#define STATUS_UNSUCCESSFUL cpu_to_le32(0xC0000001) -#define STATUS_NOT_IMPLEMENTED cpu_to_le32(0xC0000002) -#define STATUS_INVALID_INFO_CLASS cpu_to_le32(0xC0000003) -#define STATUS_INFO_LENGTH_MISMATCH cpu_to_le32(0xC0000004) -#define STATUS_ACCESS_VIOLATION cpu_to_le32(0xC0000005) -#define STATUS_IN_PAGE_ERROR cpu_to_le32(0xC0000006) -#define STATUS_PAGEFILE_QUOTA cpu_to_le32(0xC0000007) -#define STATUS_INVALID_HANDLE cpu_to_le32(0xC0000008) -#define STATUS_BAD_INITIAL_STACK cpu_to_le32(0xC0000009) -#define STATUS_BAD_INITIAL_PC cpu_to_le32(0xC000000A) -#define STATUS_INVALID_CID cpu_to_le32(0xC000000B) -#define STATUS_TIMER_NOT_CANCELED cpu_to_le32(0xC000000C) -#define STATUS_INVALID_PARAMETER cpu_to_le32(0xC000000D) -#define STATUS_NO_SUCH_DEVICE cpu_to_le32(0xC000000E) -#define STATUS_NO_SUCH_FILE cpu_to_le32(0xC000000F) -#define STATUS_INVALID_DEVICE_REQUEST cpu_to_le32(0xC0000010) -#define STATUS_END_OF_FILE cpu_to_le32(0xC0000011) -#define STATUS_WRONG_VOLUME cpu_to_le32(0xC0000012) -#define STATUS_NO_MEDIA_IN_DEVICE cpu_to_le32(0xC0000013) -#define STATUS_UNRECOGNIZED_MEDIA cpu_to_le32(0xC0000014) -#define STATUS_NONEXISTENT_SECTOR cpu_to_le32(0xC0000015) -#define STATUS_MORE_PROCESSING_REQUIRED cpu_to_le32(0xC0000016) -#define STATUS_NO_MEMORY cpu_to_le32(0xC0000017) -#define STATUS_CONFLICTING_ADDRESSES cpu_to_le32(0xC0000018) -#define STATUS_NOT_MAPPED_VIEW cpu_to_le32(0xC0000019) -#define STATUS_UNABLE_TO_FREE_VM cpu_to_le32(0xC000001A) -#define STATUS_UNABLE_TO_DELETE_SECTION cpu_to_le32(0xC000001B) -#define STATUS_INVALID_SYSTEM_SERVICE cpu_to_le32(0xC000001C) -#define STATUS_ILLEGAL_INSTRUCTION cpu_to_le32(0xC000001D) -#define STATUS_INVALID_LOCK_SEQUENCE cpu_to_le32(0xC000001E) -#define STATUS_INVALID_VIEW_SIZE cpu_to_le32(0xC000001F) -#define STATUS_INVALID_FILE_FOR_SECTION cpu_to_le32(0xC0000020) -#define STATUS_ALREADY_COMMITTED cpu_to_le32(0xC0000021) -#define STATUS_ACCESS_DENIED cpu_to_le32(0xC0000022) -#define STATUS_BUFFER_TOO_SMALL cpu_to_le32(0xC0000023) -#define STATUS_OBJECT_TYPE_MISMATCH cpu_to_le32(0xC0000024) -#define STATUS_NONCONTINUABLE_EXCEPTION cpu_to_le32(0xC0000025) -#define STATUS_INVALID_DISPOSITION cpu_to_le32(0xC0000026) -#define STATUS_UNWIND cpu_to_le32(0xC0000027) -#define STATUS_BAD_STACK cpu_to_le32(0xC0000028) -#define STATUS_INVALID_UNWIND_TARGET cpu_to_le32(0xC0000029) -#define STATUS_NOT_LOCKED cpu_to_le32(0xC000002A) -#define STATUS_PARITY_ERROR cpu_to_le32(0xC000002B) -#define STATUS_UNABLE_TO_DECOMMIT_VM cpu_to_le32(0xC000002C) -#define STATUS_NOT_COMMITTED cpu_to_le32(0xC000002D) -#define STATUS_INVALID_PORT_ATTRIBUTES cpu_to_le32(0xC000002E) -#define STATUS_PORT_MESSAGE_TOO_LONG cpu_to_le32(0xC000002F) -#define STATUS_INVALID_PARAMETER_MIX cpu_to_le32(0xC0000030) -#define STATUS_INVALID_QUOTA_LOWER 
cpu_to_le32(0xC0000031) -#define STATUS_DISK_CORRUPT_ERROR cpu_to_le32(0xC0000032) -#define STATUS_OBJECT_NAME_INVALID cpu_to_le32(0xC0000033) -#define STATUS_OBJECT_NAME_NOT_FOUND cpu_to_le32(0xC0000034) -#define STATUS_OBJECT_NAME_COLLISION cpu_to_le32(0xC0000035) -#define STATUS_PORT_DISCONNECTED cpu_to_le32(0xC0000037) -#define STATUS_DEVICE_ALREADY_ATTACHED cpu_to_le32(0xC0000038) -#define STATUS_OBJECT_PATH_INVALID cpu_to_le32(0xC0000039) -#define STATUS_OBJECT_PATH_NOT_FOUND cpu_to_le32(0xC000003A) -#define STATUS_OBJECT_PATH_SYNTAX_BAD cpu_to_le32(0xC000003B) -#define STATUS_DATA_OVERRUN cpu_to_le32(0xC000003C) -#define STATUS_DATA_LATE_ERROR cpu_to_le32(0xC000003D) -#define STATUS_DATA_ERROR cpu_to_le32(0xC000003E) -#define STATUS_CRC_ERROR cpu_to_le32(0xC000003F) -#define STATUS_SECTION_TOO_BIG cpu_to_le32(0xC0000040) -#define STATUS_PORT_CONNECTION_REFUSED cpu_to_le32(0xC0000041) -#define STATUS_INVALID_PORT_HANDLE cpu_to_le32(0xC0000042) -#define STATUS_SHARING_VIOLATION cpu_to_le32(0xC0000043) -#define STATUS_QUOTA_EXCEEDED cpu_to_le32(0xC0000044) -#define STATUS_INVALID_PAGE_PROTECTION cpu_to_le32(0xC0000045) -#define STATUS_MUTANT_NOT_OWNED cpu_to_le32(0xC0000046) -#define STATUS_SEMAPHORE_LIMIT_EXCEEDED cpu_to_le32(0xC0000047) -#define STATUS_PORT_ALREADY_SET cpu_to_le32(0xC0000048) -#define STATUS_SECTION_NOT_IMAGE cpu_to_le32(0xC0000049) -#define STATUS_SUSPEND_COUNT_EXCEEDED cpu_to_le32(0xC000004A) -#define STATUS_THREAD_IS_TERMINATING cpu_to_le32(0xC000004B) -#define STATUS_BAD_WORKING_SET_LIMIT cpu_to_le32(0xC000004C) -#define STATUS_INCOMPATIBLE_FILE_MAP cpu_to_le32(0xC000004D) -#define STATUS_SECTION_PROTECTION cpu_to_le32(0xC000004E) -#define STATUS_EAS_NOT_SUPPORTED cpu_to_le32(0xC000004F) -#define STATUS_EA_TOO_LARGE cpu_to_le32(0xC0000050) -#define STATUS_NONEXISTENT_EA_ENTRY cpu_to_le32(0xC0000051) -#define STATUS_NO_EAS_ON_FILE cpu_to_le32(0xC0000052) -#define STATUS_EA_CORRUPT_ERROR cpu_to_le32(0xC0000053) -#define STATUS_FILE_LOCK_CONFLICT cpu_to_le32(0xC0000054) -#define STATUS_LOCK_NOT_GRANTED cpu_to_le32(0xC0000055) -#define STATUS_DELETE_PENDING cpu_to_le32(0xC0000056) -#define STATUS_CTL_FILE_NOT_SUPPORTED cpu_to_le32(0xC0000057) -#define STATUS_UNKNOWN_REVISION cpu_to_le32(0xC0000058) -#define STATUS_REVISION_MISMATCH cpu_to_le32(0xC0000059) -#define STATUS_INVALID_OWNER cpu_to_le32(0xC000005A) -#define STATUS_INVALID_PRIMARY_GROUP cpu_to_le32(0xC000005B) -#define STATUS_NO_IMPERSONATION_TOKEN cpu_to_le32(0xC000005C) -#define STATUS_CANT_DISABLE_MANDATORY cpu_to_le32(0xC000005D) -#define STATUS_NO_LOGON_SERVERS cpu_to_le32(0xC000005E) -#define STATUS_NO_SUCH_LOGON_SESSION cpu_to_le32(0xC000005F) -#define STATUS_NO_SUCH_PRIVILEGE cpu_to_le32(0xC0000060) -#define STATUS_PRIVILEGE_NOT_HELD cpu_to_le32(0xC0000061) -#define STATUS_INVALID_ACCOUNT_NAME cpu_to_le32(0xC0000062) -#define STATUS_USER_EXISTS cpu_to_le32(0xC0000063) -#define STATUS_NO_SUCH_USER cpu_to_le32(0xC0000064) -#define STATUS_GROUP_EXISTS cpu_to_le32(0xC0000065) -#define STATUS_NO_SUCH_GROUP cpu_to_le32(0xC0000066) -#define STATUS_MEMBER_IN_GROUP cpu_to_le32(0xC0000067) -#define STATUS_MEMBER_NOT_IN_GROUP cpu_to_le32(0xC0000068) -#define STATUS_LAST_ADMIN cpu_to_le32(0xC0000069) -#define STATUS_WRONG_PASSWORD cpu_to_le32(0xC000006A) -#define STATUS_ILL_FORMED_PASSWORD cpu_to_le32(0xC000006B) -#define STATUS_PASSWORD_RESTRICTION cpu_to_le32(0xC000006C) -#define STATUS_LOGON_FAILURE cpu_to_le32(0xC000006D) -#define STATUS_ACCOUNT_RESTRICTION cpu_to_le32(0xC000006E) -#define 
STATUS_INVALID_LOGON_HOURS cpu_to_le32(0xC000006F) -#define STATUS_INVALID_WORKSTATION cpu_to_le32(0xC0000070) -#define STATUS_PASSWORD_EXPIRED cpu_to_le32(0xC0000071) -#define STATUS_ACCOUNT_DISABLED cpu_to_le32(0xC0000072) -#define STATUS_NONE_MAPPED cpu_to_le32(0xC0000073) -#define STATUS_TOO_MANY_LUIDS_REQUESTED cpu_to_le32(0xC0000074) -#define STATUS_LUIDS_EXHAUSTED cpu_to_le32(0xC0000075) -#define STATUS_INVALID_SUB_AUTHORITY cpu_to_le32(0xC0000076) -#define STATUS_INVALID_ACL cpu_to_le32(0xC0000077) -#define STATUS_INVALID_SID cpu_to_le32(0xC0000078) -#define STATUS_INVALID_SECURITY_DESCR cpu_to_le32(0xC0000079) -#define STATUS_PROCEDURE_NOT_FOUND cpu_to_le32(0xC000007A) -#define STATUS_INVALID_IMAGE_FORMAT cpu_to_le32(0xC000007B) -#define STATUS_NO_TOKEN cpu_to_le32(0xC000007C) -#define STATUS_BAD_INHERITANCE_ACL cpu_to_le32(0xC000007D) -#define STATUS_RANGE_NOT_LOCKED cpu_to_le32(0xC000007E) -#define STATUS_DISK_FULL cpu_to_le32(0xC000007F) -#define STATUS_SERVER_DISABLED cpu_to_le32(0xC0000080) -#define STATUS_SERVER_NOT_DISABLED cpu_to_le32(0xC0000081) -#define STATUS_TOO_MANY_GUIDS_REQUESTED cpu_to_le32(0xC0000082) -#define STATUS_GUIDS_EXHAUSTED cpu_to_le32(0xC0000083) -#define STATUS_INVALID_ID_AUTHORITY cpu_to_le32(0xC0000084) -#define STATUS_AGENTS_EXHAUSTED cpu_to_le32(0xC0000085) -#define STATUS_INVALID_VOLUME_LABEL cpu_to_le32(0xC0000086) -#define STATUS_SECTION_NOT_EXTENDED cpu_to_le32(0xC0000087) -#define STATUS_NOT_MAPPED_DATA cpu_to_le32(0xC0000088) -#define STATUS_RESOURCE_DATA_NOT_FOUND cpu_to_le32(0xC0000089) -#define STATUS_RESOURCE_TYPE_NOT_FOUND cpu_to_le32(0xC000008A) -#define STATUS_RESOURCE_NAME_NOT_FOUND cpu_to_le32(0xC000008B) -#define STATUS_ARRAY_BOUNDS_EXCEEDED cpu_to_le32(0xC000008C) -#define STATUS_FLOAT_DENORMAL_OPERAND cpu_to_le32(0xC000008D) -#define STATUS_FLOAT_DIVIDE_BY_ZERO cpu_to_le32(0xC000008E) -#define STATUS_FLOAT_INEXACT_RESULT cpu_to_le32(0xC000008F) -#define STATUS_FLOAT_INVALID_OPERATION cpu_to_le32(0xC0000090) -#define STATUS_FLOAT_OVERFLOW cpu_to_le32(0xC0000091) -#define STATUS_FLOAT_STACK_CHECK cpu_to_le32(0xC0000092) -#define STATUS_FLOAT_UNDERFLOW cpu_to_le32(0xC0000093) -#define STATUS_INTEGER_DIVIDE_BY_ZERO cpu_to_le32(0xC0000094) -#define STATUS_INTEGER_OVERFLOW cpu_to_le32(0xC0000095) -#define STATUS_PRIVILEGED_INSTRUCTION cpu_to_le32(0xC0000096) -#define STATUS_TOO_MANY_PAGING_FILES cpu_to_le32(0xC0000097) -#define STATUS_FILE_INVALID cpu_to_le32(0xC0000098) -#define STATUS_ALLOTTED_SPACE_EXCEEDED cpu_to_le32(0xC0000099) -#define STATUS_INSUFFICIENT_RESOURCES cpu_to_le32(0xC000009A) -#define STATUS_DFS_EXIT_PATH_FOUND cpu_to_le32(0xC000009B) -#define STATUS_DEVICE_DATA_ERROR cpu_to_le32(0xC000009C) -#define STATUS_DEVICE_NOT_CONNECTED cpu_to_le32(0xC000009D) -#define STATUS_DEVICE_POWER_FAILURE cpu_to_le32(0xC000009E) -#define STATUS_FREE_VM_NOT_AT_BASE cpu_to_le32(0xC000009F) -#define STATUS_MEMORY_NOT_ALLOCATED cpu_to_le32(0xC00000A0) -#define STATUS_WORKING_SET_QUOTA cpu_to_le32(0xC00000A1) -#define STATUS_MEDIA_WRITE_PROTECTED cpu_to_le32(0xC00000A2) -#define STATUS_DEVICE_NOT_READY cpu_to_le32(0xC00000A3) -#define STATUS_INVALID_GROUP_ATTRIBUTES cpu_to_le32(0xC00000A4) -#define STATUS_BAD_IMPERSONATION_LEVEL cpu_to_le32(0xC00000A5) -#define STATUS_CANT_OPEN_ANONYMOUS cpu_to_le32(0xC00000A6) -#define STATUS_BAD_VALIDATION_CLASS cpu_to_le32(0xC00000A7) -#define STATUS_BAD_TOKEN_TYPE cpu_to_le32(0xC00000A8) -#define STATUS_BAD_MASTER_BOOT_RECORD cpu_to_le32(0xC00000A9) -#define STATUS_INSTRUCTION_MISALIGNMENT 
cpu_to_le32(0xC00000AA) -#define STATUS_INSTANCE_NOT_AVAILABLE cpu_to_le32(0xC00000AB) -#define STATUS_PIPE_NOT_AVAILABLE cpu_to_le32(0xC00000AC) -#define STATUS_INVALID_PIPE_STATE cpu_to_le32(0xC00000AD) -#define STATUS_PIPE_BUSY cpu_to_le32(0xC00000AE) -#define STATUS_ILLEGAL_FUNCTION cpu_to_le32(0xC00000AF) -#define STATUS_PIPE_DISCONNECTED cpu_to_le32(0xC00000B0) -#define STATUS_PIPE_CLOSING cpu_to_le32(0xC00000B1) -#define STATUS_PIPE_CONNECTED cpu_to_le32(0xC00000B2) -#define STATUS_PIPE_LISTENING cpu_to_le32(0xC00000B3) -#define STATUS_INVALID_READ_MODE cpu_to_le32(0xC00000B4) -#define STATUS_IO_TIMEOUT cpu_to_le32(0xC00000B5) -#define STATUS_FILE_FORCED_CLOSED cpu_to_le32(0xC00000B6) -#define STATUS_PROFILING_NOT_STARTED cpu_to_le32(0xC00000B7) -#define STATUS_PROFILING_NOT_STOPPED cpu_to_le32(0xC00000B8) -#define STATUS_COULD_NOT_INTERPRET cpu_to_le32(0xC00000B9) -#define STATUS_FILE_IS_A_DIRECTORY cpu_to_le32(0xC00000BA) -#define STATUS_NOT_SUPPORTED cpu_to_le32(0xC00000BB) -#define STATUS_REMOTE_NOT_LISTENING cpu_to_le32(0xC00000BC) -#define STATUS_DUPLICATE_NAME cpu_to_le32(0xC00000BD) -#define STATUS_BAD_NETWORK_PATH cpu_to_le32(0xC00000BE) -#define STATUS_NETWORK_BUSY cpu_to_le32(0xC00000BF) -#define STATUS_DEVICE_DOES_NOT_EXIST cpu_to_le32(0xC00000C0) -#define STATUS_TOO_MANY_COMMANDS cpu_to_le32(0xC00000C1) -#define STATUS_ADAPTER_HARDWARE_ERROR cpu_to_le32(0xC00000C2) -#define STATUS_INVALID_NETWORK_RESPONSE cpu_to_le32(0xC00000C3) -#define STATUS_UNEXPECTED_NETWORK_ERROR cpu_to_le32(0xC00000C4) -#define STATUS_BAD_REMOTE_ADAPTER cpu_to_le32(0xC00000C5) -#define STATUS_PRINT_QUEUE_FULL cpu_to_le32(0xC00000C6) -#define STATUS_NO_SPOOL_SPACE cpu_to_le32(0xC00000C7) -#define STATUS_PRINT_CANCELLED cpu_to_le32(0xC00000C8) -#define STATUS_NETWORK_NAME_DELETED cpu_to_le32(0xC00000C9) -#define STATUS_NETWORK_ACCESS_DENIED cpu_to_le32(0xC00000CA) -#define STATUS_BAD_DEVICE_TYPE cpu_to_le32(0xC00000CB) -#define STATUS_BAD_NETWORK_NAME cpu_to_le32(0xC00000CC) -#define STATUS_TOO_MANY_NAMES cpu_to_le32(0xC00000CD) -#define STATUS_TOO_MANY_SESSIONS cpu_to_le32(0xC00000CE) -#define STATUS_SHARING_PAUSED cpu_to_le32(0xC00000CF) -#define STATUS_REQUEST_NOT_ACCEPTED cpu_to_le32(0xC00000D0) -#define STATUS_REDIRECTOR_PAUSED cpu_to_le32(0xC00000D1) -#define STATUS_NET_WRITE_FAULT cpu_to_le32(0xC00000D2) -#define STATUS_PROFILING_AT_LIMIT cpu_to_le32(0xC00000D3) -#define STATUS_NOT_SAME_DEVICE cpu_to_le32(0xC00000D4) -#define STATUS_FILE_RENAMED cpu_to_le32(0xC00000D5) -#define STATUS_VIRTUAL_CIRCUIT_CLOSED cpu_to_le32(0xC00000D6) -#define STATUS_NO_SECURITY_ON_OBJECT cpu_to_le32(0xC00000D7) -#define STATUS_CANT_WAIT cpu_to_le32(0xC00000D8) -#define STATUS_PIPE_EMPTY cpu_to_le32(0xC00000D9) -#define STATUS_CANT_ACCESS_DOMAIN_INFO cpu_to_le32(0xC00000DA) -#define STATUS_CANT_TERMINATE_SELF cpu_to_le32(0xC00000DB) -#define STATUS_INVALID_SERVER_STATE cpu_to_le32(0xC00000DC) -#define STATUS_INVALID_DOMAIN_STATE cpu_to_le32(0xC00000DD) -#define STATUS_INVALID_DOMAIN_ROLE cpu_to_le32(0xC00000DE) -#define STATUS_NO_SUCH_DOMAIN cpu_to_le32(0xC00000DF) -#define STATUS_DOMAIN_EXISTS cpu_to_le32(0xC00000E0) -#define STATUS_DOMAIN_LIMIT_EXCEEDED cpu_to_le32(0xC00000E1) -#define STATUS_OPLOCK_NOT_GRANTED cpu_to_le32(0xC00000E2) -#define STATUS_INVALID_OPLOCK_PROTOCOL cpu_to_le32(0xC00000E3) -#define STATUS_INTERNAL_DB_CORRUPTION cpu_to_le32(0xC00000E4) -#define STATUS_INTERNAL_ERROR cpu_to_le32(0xC00000E5) -#define STATUS_GENERIC_NOT_MAPPED cpu_to_le32(0xC00000E6) -#define STATUS_BAD_DESCRIPTOR_FORMAT 
cpu_to_le32(0xC00000E7) -#define STATUS_INVALID_USER_BUFFER cpu_to_le32(0xC00000E8) -#define STATUS_UNEXPECTED_IO_ERROR cpu_to_le32(0xC00000E9) -#define STATUS_UNEXPECTED_MM_CREATE_ERR cpu_to_le32(0xC00000EA) -#define STATUS_UNEXPECTED_MM_MAP_ERROR cpu_to_le32(0xC00000EB) -#define STATUS_UNEXPECTED_MM_EXTEND_ERR cpu_to_le32(0xC00000EC) -#define STATUS_NOT_LOGON_PROCESS cpu_to_le32(0xC00000ED) -#define STATUS_LOGON_SESSION_EXISTS cpu_to_le32(0xC00000EE) -#define STATUS_INVALID_PARAMETER_1 cpu_to_le32(0xC00000EF) -#define STATUS_INVALID_PARAMETER_2 cpu_to_le32(0xC00000F0) -#define STATUS_INVALID_PARAMETER_3 cpu_to_le32(0xC00000F1) -#define STATUS_INVALID_PARAMETER_4 cpu_to_le32(0xC00000F2) -#define STATUS_INVALID_PARAMETER_5 cpu_to_le32(0xC00000F3) -#define STATUS_INVALID_PARAMETER_6 cpu_to_le32(0xC00000F4) -#define STATUS_INVALID_PARAMETER_7 cpu_to_le32(0xC00000F5) -#define STATUS_INVALID_PARAMETER_8 cpu_to_le32(0xC00000F6) -#define STATUS_INVALID_PARAMETER_9 cpu_to_le32(0xC00000F7) -#define STATUS_INVALID_PARAMETER_10 cpu_to_le32(0xC00000F8) -#define STATUS_INVALID_PARAMETER_11 cpu_to_le32(0xC00000F9) -#define STATUS_INVALID_PARAMETER_12 cpu_to_le32(0xC00000FA) -#define STATUS_REDIRECTOR_NOT_STARTED cpu_to_le32(0xC00000FB) -#define STATUS_REDIRECTOR_STARTED cpu_to_le32(0xC00000FC) -#define STATUS_STACK_OVERFLOW cpu_to_le32(0xC00000FD) -#define STATUS_NO_SUCH_PACKAGE cpu_to_le32(0xC00000FE) -#define STATUS_BAD_FUNCTION_TABLE cpu_to_le32(0xC00000FF) -#define STATUS_VARIABLE_NOT_FOUND cpu_to_le32(0xC0000100) -#define STATUS_DIRECTORY_NOT_EMPTY cpu_to_le32(0xC0000101) -#define STATUS_FILE_CORRUPT_ERROR cpu_to_le32(0xC0000102) -#define STATUS_NOT_A_DIRECTORY cpu_to_le32(0xC0000103) -#define STATUS_BAD_LOGON_SESSION_STATE cpu_to_le32(0xC0000104) -#define STATUS_LOGON_SESSION_COLLISION cpu_to_le32(0xC0000105) -#define STATUS_NAME_TOO_LONG cpu_to_le32(0xC0000106) -#define STATUS_FILES_OPEN cpu_to_le32(0xC0000107) -#define STATUS_CONNECTION_IN_USE cpu_to_le32(0xC0000108) -#define STATUS_MESSAGE_NOT_FOUND cpu_to_le32(0xC0000109) -#define STATUS_PROCESS_IS_TERMINATING cpu_to_le32(0xC000010A) -#define STATUS_INVALID_LOGON_TYPE cpu_to_le32(0xC000010B) -#define STATUS_NO_GUID_TRANSLATION cpu_to_le32(0xC000010C) -#define STATUS_CANNOT_IMPERSONATE cpu_to_le32(0xC000010D) -#define STATUS_IMAGE_ALREADY_LOADED cpu_to_le32(0xC000010E) -#define STATUS_ABIOS_NOT_PRESENT cpu_to_le32(0xC000010F) -#define STATUS_ABIOS_LID_NOT_EXIST cpu_to_le32(0xC0000110) -#define STATUS_ABIOS_LID_ALREADY_OWNED cpu_to_le32(0xC0000111) -#define STATUS_ABIOS_NOT_LID_OWNER cpu_to_le32(0xC0000112) -#define STATUS_ABIOS_INVALID_COMMAND cpu_to_le32(0xC0000113) -#define STATUS_ABIOS_INVALID_LID cpu_to_le32(0xC0000114) -#define STATUS_ABIOS_SELECTOR_NOT_AVAILABLE cpu_to_le32(0xC0000115) -#define STATUS_ABIOS_INVALID_SELECTOR cpu_to_le32(0xC0000116) -#define STATUS_NO_LDT cpu_to_le32(0xC0000117) -#define STATUS_INVALID_LDT_SIZE cpu_to_le32(0xC0000118) -#define STATUS_INVALID_LDT_OFFSET cpu_to_le32(0xC0000119) -#define STATUS_INVALID_LDT_DESCRIPTOR cpu_to_le32(0xC000011A) -#define STATUS_INVALID_IMAGE_NE_FORMAT cpu_to_le32(0xC000011B) -#define STATUS_RXACT_INVALID_STATE cpu_to_le32(0xC000011C) -#define STATUS_RXACT_COMMIT_FAILURE cpu_to_le32(0xC000011D) -#define STATUS_MAPPED_FILE_SIZE_ZERO cpu_to_le32(0xC000011E) -#define STATUS_TOO_MANY_OPENED_FILES cpu_to_le32(0xC000011F) -#define STATUS_CANCELLED cpu_to_le32(0xC0000120) -#define STATUS_CANNOT_DELETE cpu_to_le32(0xC0000121) -#define STATUS_INVALID_COMPUTER_NAME cpu_to_le32(0xC0000122) 
-#define STATUS_FILE_DELETED cpu_to_le32(0xC0000123) -#define STATUS_SPECIAL_ACCOUNT cpu_to_le32(0xC0000124) -#define STATUS_SPECIAL_GROUP cpu_to_le32(0xC0000125) -#define STATUS_SPECIAL_USER cpu_to_le32(0xC0000126) -#define STATUS_MEMBERS_PRIMARY_GROUP cpu_to_le32(0xC0000127) -#define STATUS_FILE_CLOSED cpu_to_le32(0xC0000128) -#define STATUS_TOO_MANY_THREADS cpu_to_le32(0xC0000129) -#define STATUS_THREAD_NOT_IN_PROCESS cpu_to_le32(0xC000012A) -#define STATUS_TOKEN_ALREADY_IN_USE cpu_to_le32(0xC000012B) -#define STATUS_PAGEFILE_QUOTA_EXCEEDED cpu_to_le32(0xC000012C) -#define STATUS_COMMITMENT_LIMIT cpu_to_le32(0xC000012D) -#define STATUS_INVALID_IMAGE_LE_FORMAT cpu_to_le32(0xC000012E) -#define STATUS_INVALID_IMAGE_NOT_MZ cpu_to_le32(0xC000012F) -#define STATUS_INVALID_IMAGE_PROTECT cpu_to_le32(0xC0000130) -#define STATUS_INVALID_IMAGE_WIN_16 cpu_to_le32(0xC0000131) -#define STATUS_LOGON_SERVER_CONFLICT cpu_to_le32(0xC0000132) -#define STATUS_TIME_DIFFERENCE_AT_DC cpu_to_le32(0xC0000133) -#define STATUS_SYNCHRONIZATION_REQUIRED cpu_to_le32(0xC0000134) -#define STATUS_DLL_NOT_FOUND cpu_to_le32(0xC0000135) -#define STATUS_OPEN_FAILED cpu_to_le32(0xC0000136) -#define STATUS_IO_PRIVILEGE_FAILED cpu_to_le32(0xC0000137) -#define STATUS_ORDINAL_NOT_FOUND cpu_to_le32(0xC0000138) -#define STATUS_ENTRYPOINT_NOT_FOUND cpu_to_le32(0xC0000139) -#define STATUS_CONTROL_C_EXIT cpu_to_le32(0xC000013A) -#define STATUS_LOCAL_DISCONNECT cpu_to_le32(0xC000013B) -#define STATUS_REMOTE_DISCONNECT cpu_to_le32(0xC000013C) -#define STATUS_REMOTE_RESOURCES cpu_to_le32(0xC000013D) -#define STATUS_LINK_FAILED cpu_to_le32(0xC000013E) -#define STATUS_LINK_TIMEOUT cpu_to_le32(0xC000013F) -#define STATUS_INVALID_CONNECTION cpu_to_le32(0xC0000140) -#define STATUS_INVALID_ADDRESS cpu_to_le32(0xC0000141) -#define STATUS_DLL_INIT_FAILED cpu_to_le32(0xC0000142) -#define STATUS_MISSING_SYSTEMFILE cpu_to_le32(0xC0000143) -#define STATUS_UNHANDLED_EXCEPTION cpu_to_le32(0xC0000144) -#define STATUS_APP_INIT_FAILURE cpu_to_le32(0xC0000145) -#define STATUS_PAGEFILE_CREATE_FAILED cpu_to_le32(0xC0000146) -#define STATUS_NO_PAGEFILE cpu_to_le32(0xC0000147) -#define STATUS_INVALID_LEVEL cpu_to_le32(0xC0000148) -#define STATUS_WRONG_PASSWORD_CORE cpu_to_le32(0xC0000149) -#define STATUS_ILLEGAL_FLOAT_CONTEXT cpu_to_le32(0xC000014A) -#define STATUS_PIPE_BROKEN cpu_to_le32(0xC000014B) -#define STATUS_REGISTRY_CORRUPT cpu_to_le32(0xC000014C) -#define STATUS_REGISTRY_IO_FAILED cpu_to_le32(0xC000014D) -#define STATUS_NO_EVENT_PAIR cpu_to_le32(0xC000014E) -#define STATUS_UNRECOGNIZED_VOLUME cpu_to_le32(0xC000014F) -#define STATUS_SERIAL_NO_DEVICE_INITED cpu_to_le32(0xC0000150) -#define STATUS_NO_SUCH_ALIAS cpu_to_le32(0xC0000151) -#define STATUS_MEMBER_NOT_IN_ALIAS cpu_to_le32(0xC0000152) -#define STATUS_MEMBER_IN_ALIAS cpu_to_le32(0xC0000153) -#define STATUS_ALIAS_EXISTS cpu_to_le32(0xC0000154) -#define STATUS_LOGON_NOT_GRANTED cpu_to_le32(0xC0000155) -#define STATUS_TOO_MANY_SECRETS cpu_to_le32(0xC0000156) -#define STATUS_SECRET_TOO_LONG cpu_to_le32(0xC0000157) -#define STATUS_INTERNAL_DB_ERROR cpu_to_le32(0xC0000158) -#define STATUS_FULLSCREEN_MODE cpu_to_le32(0xC0000159) -#define STATUS_TOO_MANY_CONTEXT_IDS cpu_to_le32(0xC000015A) -#define STATUS_LOGON_TYPE_NOT_GRANTED cpu_to_le32(0xC000015B) -#define STATUS_NOT_REGISTRY_FILE cpu_to_le32(0xC000015C) -#define STATUS_NT_CROSS_ENCRYPTION_REQUIRED cpu_to_le32(0xC000015D) -#define STATUS_DOMAIN_CTRLR_CONFIG_ERROR cpu_to_le32(0xC000015E) -#define STATUS_FT_MISSING_MEMBER cpu_to_le32(0xC000015F) 
-#define STATUS_ILL_FORMED_SERVICE_ENTRY cpu_to_le32(0xC0000160) -#define STATUS_ILLEGAL_CHARACTER cpu_to_le32(0xC0000161) -#define STATUS_UNMAPPABLE_CHARACTER cpu_to_le32(0xC0000162) -#define STATUS_UNDEFINED_CHARACTER cpu_to_le32(0xC0000163) -#define STATUS_FLOPPY_VOLUME cpu_to_le32(0xC0000164) -#define STATUS_FLOPPY_ID_MARK_NOT_FOUND cpu_to_le32(0xC0000165) -#define STATUS_FLOPPY_WRONG_CYLINDER cpu_to_le32(0xC0000166) -#define STATUS_FLOPPY_UNKNOWN_ERROR cpu_to_le32(0xC0000167) -#define STATUS_FLOPPY_BAD_REGISTERS cpu_to_le32(0xC0000168) -#define STATUS_DISK_RECALIBRATE_FAILED cpu_to_le32(0xC0000169) -#define STATUS_DISK_OPERATION_FAILED cpu_to_le32(0xC000016A) -#define STATUS_DISK_RESET_FAILED cpu_to_le32(0xC000016B) -#define STATUS_SHARED_IRQ_BUSY cpu_to_le32(0xC000016C) -#define STATUS_FT_ORPHANING cpu_to_le32(0xC000016D) -#define STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT cpu_to_le32(0xC000016E) -#define STATUS_PARTITION_FAILURE cpu_to_le32(0xC0000172) -#define STATUS_INVALID_BLOCK_LENGTH cpu_to_le32(0xC0000173) -#define STATUS_DEVICE_NOT_PARTITIONED cpu_to_le32(0xC0000174) -#define STATUS_UNABLE_TO_LOCK_MEDIA cpu_to_le32(0xC0000175) -#define STATUS_UNABLE_TO_UNLOAD_MEDIA cpu_to_le32(0xC0000176) -#define STATUS_EOM_OVERFLOW cpu_to_le32(0xC0000177) -#define STATUS_NO_MEDIA cpu_to_le32(0xC0000178) -#define STATUS_NO_SUCH_MEMBER cpu_to_le32(0xC000017A) -#define STATUS_INVALID_MEMBER cpu_to_le32(0xC000017B) -#define STATUS_KEY_DELETED cpu_to_le32(0xC000017C) -#define STATUS_NO_LOG_SPACE cpu_to_le32(0xC000017D) -#define STATUS_TOO_MANY_SIDS cpu_to_le32(0xC000017E) -#define STATUS_LM_CROSS_ENCRYPTION_REQUIRED cpu_to_le32(0xC000017F) -#define STATUS_KEY_HAS_CHILDREN cpu_to_le32(0xC0000180) -#define STATUS_CHILD_MUST_BE_VOLATILE cpu_to_le32(0xC0000181) -#define STATUS_DEVICE_CONFIGURATION_ERROR cpu_to_le32(0xC0000182) -#define STATUS_DRIVER_INTERNAL_ERROR cpu_to_le32(0xC0000183) -#define STATUS_INVALID_DEVICE_STATE cpu_to_le32(0xC0000184) -#define STATUS_IO_DEVICE_ERROR cpu_to_le32(0xC0000185) -#define STATUS_DEVICE_PROTOCOL_ERROR cpu_to_le32(0xC0000186) -#define STATUS_BACKUP_CONTROLLER cpu_to_le32(0xC0000187) -#define STATUS_LOG_FILE_FULL cpu_to_le32(0xC0000188) -#define STATUS_TOO_LATE cpu_to_le32(0xC0000189) -#define STATUS_NO_TRUST_LSA_SECRET cpu_to_le32(0xC000018A) -#define STATUS_NO_TRUST_SAM_ACCOUNT cpu_to_le32(0xC000018B) -#define STATUS_TRUSTED_DOMAIN_FAILURE cpu_to_le32(0xC000018C) -#define STATUS_TRUSTED_RELATIONSHIP_FAILURE cpu_to_le32(0xC000018D) -#define STATUS_EVENTLOG_FILE_CORRUPT cpu_to_le32(0xC000018E) -#define STATUS_EVENTLOG_CANT_START cpu_to_le32(0xC000018F) -#define STATUS_TRUST_FAILURE cpu_to_le32(0xC0000190) -#define STATUS_MUTANT_LIMIT_EXCEEDED cpu_to_le32(0xC0000191) -#define STATUS_NETLOGON_NOT_STARTED cpu_to_le32(0xC0000192) -#define STATUS_ACCOUNT_EXPIRED cpu_to_le32(0xC0000193) -#define STATUS_POSSIBLE_DEADLOCK cpu_to_le32(0xC0000194) -#define STATUS_NETWORK_CREDENTIAL_CONFLICT cpu_to_le32(0xC0000195) -#define STATUS_REMOTE_SESSION_LIMIT cpu_to_le32(0xC0000196) -#define STATUS_EVENTLOG_FILE_CHANGED cpu_to_le32(0xC0000197) -#define STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT cpu_to_le32(0xC0000198) -#define STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT cpu_to_le32(0xC0000199) -#define STATUS_NOLOGON_SERVER_TRUST_ACCOUNT cpu_to_le32(0xC000019A) -#define STATUS_DOMAIN_TRUST_INCONSISTENT cpu_to_le32(0xC000019B) -#define STATUS_FS_DRIVER_REQUIRED cpu_to_le32(0xC000019C) -#define STATUS_IMAGE_ALREADY_LOADED_AS_DLL cpu_to_le32(0xC000019D) -#define 
STATUS_NETWORK_OPEN_RESTRICTION cpu_to_le32(0xC0000201) -#define STATUS_NO_USER_SESSION_KEY cpu_to_le32(0xC0000202) -#define STATUS_USER_SESSION_DELETED cpu_to_le32(0xC0000203) -#define STATUS_RESOURCE_LANG_NOT_FOUND cpu_to_le32(0xC0000204) -#define STATUS_INSUFF_SERVER_RESOURCES cpu_to_le32(0xC0000205) -#define STATUS_INVALID_BUFFER_SIZE cpu_to_le32(0xC0000206) -#define STATUS_INVALID_ADDRESS_COMPONENT cpu_to_le32(0xC0000207) -#define STATUS_INVALID_ADDRESS_WILDCARD cpu_to_le32(0xC0000208) -#define STATUS_TOO_MANY_ADDRESSES cpu_to_le32(0xC0000209) -#define STATUS_ADDRESS_ALREADY_EXISTS cpu_to_le32(0xC000020A) -#define STATUS_ADDRESS_CLOSED cpu_to_le32(0xC000020B) -#define STATUS_CONNECTION_DISCONNECTED cpu_to_le32(0xC000020C) -#define STATUS_CONNECTION_RESET cpu_to_le32(0xC000020D) -#define STATUS_TOO_MANY_NODES cpu_to_le32(0xC000020E) -#define STATUS_TRANSACTION_ABORTED cpu_to_le32(0xC000020F) -#define STATUS_TRANSACTION_TIMED_OUT cpu_to_le32(0xC0000210) -#define STATUS_TRANSACTION_NO_RELEASE cpu_to_le32(0xC0000211) -#define STATUS_TRANSACTION_NO_MATCH cpu_to_le32(0xC0000212) -#define STATUS_TRANSACTION_RESPONDED cpu_to_le32(0xC0000213) -#define STATUS_TRANSACTION_INVALID_ID cpu_to_le32(0xC0000214) -#define STATUS_TRANSACTION_INVALID_TYPE cpu_to_le32(0xC0000215) -#define STATUS_NOT_SERVER_SESSION cpu_to_le32(0xC0000216) -#define STATUS_NOT_CLIENT_SESSION cpu_to_le32(0xC0000217) -#define STATUS_CANNOT_LOAD_REGISTRY_FILE cpu_to_le32(0xC0000218) -#define STATUS_DEBUG_ATTACH_FAILED cpu_to_le32(0xC0000219) -#define STATUS_SYSTEM_PROCESS_TERMINATED cpu_to_le32(0xC000021A) -#define STATUS_DATA_NOT_ACCEPTED cpu_to_le32(0xC000021B) -#define STATUS_NO_BROWSER_SERVERS_FOUND cpu_to_le32(0xC000021C) -#define STATUS_VDM_HARD_ERROR cpu_to_le32(0xC000021D) -#define STATUS_DRIVER_CANCEL_TIMEOUT cpu_to_le32(0xC000021E) -#define STATUS_REPLY_MESSAGE_MISMATCH cpu_to_le32(0xC000021F) -#define STATUS_MAPPED_ALIGNMENT cpu_to_le32(0xC0000220) -#define STATUS_IMAGE_CHECKSUM_MISMATCH cpu_to_le32(0xC0000221) -#define STATUS_LOST_WRITEBEHIND_DATA cpu_to_le32(0xC0000222) -#define STATUS_CLIENT_SERVER_PARAMETERS_INVALID cpu_to_le32(0xC0000223) -#define STATUS_PASSWORD_MUST_CHANGE cpu_to_le32(0xC0000224) -#define STATUS_NOT_FOUND cpu_to_le32(0xC0000225) -#define STATUS_NOT_TINY_STREAM cpu_to_le32(0xC0000226) -#define STATUS_RECOVERY_FAILURE cpu_to_le32(0xC0000227) -#define STATUS_STACK_OVERFLOW_READ cpu_to_le32(0xC0000228) -#define STATUS_FAIL_CHECK cpu_to_le32(0xC0000229) -#define STATUS_DUPLICATE_OBJECTID cpu_to_le32(0xC000022A) -#define STATUS_OBJECTID_EXISTS cpu_to_le32(0xC000022B) -#define STATUS_CONVERT_TO_LARGE cpu_to_le32(0xC000022C) -#define STATUS_RETRY cpu_to_le32(0xC000022D) -#define STATUS_FOUND_OUT_OF_SCOPE cpu_to_le32(0xC000022E) -#define STATUS_ALLOCATE_BUCKET cpu_to_le32(0xC000022F) -#define STATUS_PROPSET_NOT_FOUND cpu_to_le32(0xC0000230) -#define STATUS_MARSHALL_OVERFLOW cpu_to_le32(0xC0000231) -#define STATUS_INVALID_VARIANT cpu_to_le32(0xC0000232) -#define STATUS_DOMAIN_CONTROLLER_NOT_FOUND cpu_to_le32(0xC0000233) -#define STATUS_ACCOUNT_LOCKED_OUT cpu_to_le32(0xC0000234) -#define STATUS_HANDLE_NOT_CLOSABLE cpu_to_le32(0xC0000235) -#define STATUS_CONNECTION_REFUSED cpu_to_le32(0xC0000236) -#define STATUS_GRACEFUL_DISCONNECT cpu_to_le32(0xC0000237) -#define STATUS_ADDRESS_ALREADY_ASSOCIATED cpu_to_le32(0xC0000238) -#define STATUS_ADDRESS_NOT_ASSOCIATED cpu_to_le32(0xC0000239) -#define STATUS_CONNECTION_INVALID cpu_to_le32(0xC000023A) -#define STATUS_CONNECTION_ACTIVE cpu_to_le32(0xC000023B) 
-#define STATUS_NETWORK_UNREACHABLE cpu_to_le32(0xC000023C) -#define STATUS_HOST_UNREACHABLE cpu_to_le32(0xC000023D) -#define STATUS_PROTOCOL_UNREACHABLE cpu_to_le32(0xC000023E) -#define STATUS_PORT_UNREACHABLE cpu_to_le32(0xC000023F) -#define STATUS_REQUEST_ABORTED cpu_to_le32(0xC0000240) -#define STATUS_CONNECTION_ABORTED cpu_to_le32(0xC0000241) -#define STATUS_BAD_COMPRESSION_BUFFER cpu_to_le32(0xC0000242) -#define STATUS_USER_MAPPED_FILE cpu_to_le32(0xC0000243) -#define STATUS_AUDIT_FAILED cpu_to_le32(0xC0000244) -#define STATUS_TIMER_RESOLUTION_NOT_SET cpu_to_le32(0xC0000245) -#define STATUS_CONNECTION_COUNT_LIMIT cpu_to_le32(0xC0000246) -#define STATUS_LOGIN_TIME_RESTRICTION cpu_to_le32(0xC0000247) -#define STATUS_LOGIN_WKSTA_RESTRICTION cpu_to_le32(0xC0000248) -#define STATUS_IMAGE_MP_UP_MISMATCH cpu_to_le32(0xC0000249) -#define STATUS_INSUFFICIENT_LOGON_INFO cpu_to_le32(0xC0000250) -#define STATUS_BAD_DLL_ENTRYPOINT cpu_to_le32(0xC0000251) -#define STATUS_BAD_SERVICE_ENTRYPOINT cpu_to_le32(0xC0000252) -#define STATUS_LPC_REPLY_LOST cpu_to_le32(0xC0000253) -#define STATUS_IP_ADDRESS_CONFLICT1 cpu_to_le32(0xC0000254) -#define STATUS_IP_ADDRESS_CONFLICT2 cpu_to_le32(0xC0000255) -#define STATUS_REGISTRY_QUOTA_LIMIT cpu_to_le32(0xC0000256) -#define STATUS_PATH_NOT_COVERED cpu_to_le32(0xC0000257) -#define STATUS_NO_CALLBACK_ACTIVE cpu_to_le32(0xC0000258) -#define STATUS_LICENSE_QUOTA_EXCEEDED cpu_to_le32(0xC0000259) -#define STATUS_PWD_TOO_SHORT cpu_to_le32(0xC000025A) -#define STATUS_PWD_TOO_RECENT cpu_to_le32(0xC000025B) -#define STATUS_PWD_HISTORY_CONFLICT cpu_to_le32(0xC000025C) -#define STATUS_PLUGPLAY_NO_DEVICE cpu_to_le32(0xC000025E) -#define STATUS_UNSUPPORTED_COMPRESSION cpu_to_le32(0xC000025F) -#define STATUS_INVALID_HW_PROFILE cpu_to_le32(0xC0000260) -#define STATUS_INVALID_PLUGPLAY_DEVICE_PATH cpu_to_le32(0xC0000261) -#define STATUS_DRIVER_ORDINAL_NOT_FOUND cpu_to_le32(0xC0000262) -#define STATUS_DRIVER_ENTRYPOINT_NOT_FOUND cpu_to_le32(0xC0000263) -#define STATUS_RESOURCE_NOT_OWNED cpu_to_le32(0xC0000264) -#define STATUS_TOO_MANY_LINKS cpu_to_le32(0xC0000265) -#define STATUS_QUOTA_LIST_INCONSISTENT cpu_to_le32(0xC0000266) -#define STATUS_FILE_IS_OFFLINE cpu_to_le32(0xC0000267) -#define STATUS_EVALUATION_EXPIRATION cpu_to_le32(0xC0000268) -#define STATUS_ILLEGAL_DLL_RELOCATION cpu_to_le32(0xC0000269) -#define STATUS_LICENSE_VIOLATION cpu_to_le32(0xC000026A) -#define STATUS_DLL_INIT_FAILED_LOGOFF cpu_to_le32(0xC000026B) -#define STATUS_DRIVER_UNABLE_TO_LOAD cpu_to_le32(0xC000026C) -#define STATUS_DFS_UNAVAILABLE cpu_to_le32(0xC000026D) -#define STATUS_VOLUME_DISMOUNTED cpu_to_le32(0xC000026E) -#define STATUS_WX86_INTERNAL_ERROR cpu_to_le32(0xC000026F) -#define STATUS_WX86_FLOAT_STACK_CHECK cpu_to_le32(0xC0000270) -#define STATUS_VALIDATE_CONTINUE cpu_to_le32(0xC0000271) -#define STATUS_NO_MATCH cpu_to_le32(0xC0000272) -#define STATUS_NO_MORE_MATCHES cpu_to_le32(0xC0000273) -#define STATUS_NOT_A_REPARSE_POINT cpu_to_le32(0xC0000275) -#define STATUS_IO_REPARSE_TAG_INVALID cpu_to_le32(0xC0000276) -#define STATUS_IO_REPARSE_TAG_MISMATCH cpu_to_le32(0xC0000277) -#define STATUS_IO_REPARSE_DATA_INVALID cpu_to_le32(0xC0000278) -#define STATUS_IO_REPARSE_TAG_NOT_HANDLED cpu_to_le32(0xC0000279) -#define STATUS_REPARSE_POINT_NOT_RESOLVED cpu_to_le32(0xC0000280) -#define STATUS_DIRECTORY_IS_A_REPARSE_POINT cpu_to_le32(0xC0000281) -#define STATUS_RANGE_LIST_CONFLICT cpu_to_le32(0xC0000282) -#define STATUS_SOURCE_ELEMENT_EMPTY cpu_to_le32(0xC0000283) -#define STATUS_DESTINATION_ELEMENT_FULL 
cpu_to_le32(0xC0000284) -#define STATUS_ILLEGAL_ELEMENT_ADDRESS cpu_to_le32(0xC0000285) -#define STATUS_MAGAZINE_NOT_PRESENT cpu_to_le32(0xC0000286) -#define STATUS_REINITIALIZATION_NEEDED cpu_to_le32(0xC0000287) -#define STATUS_ENCRYPTION_FAILED cpu_to_le32(0xC000028A) -#define STATUS_DECRYPTION_FAILED cpu_to_le32(0xC000028B) -#define STATUS_RANGE_NOT_FOUND cpu_to_le32(0xC000028C) -#define STATUS_NO_RECOVERY_POLICY cpu_to_le32(0xC000028D) -#define STATUS_NO_EFS cpu_to_le32(0xC000028E) -#define STATUS_WRONG_EFS cpu_to_le32(0xC000028F) -#define STATUS_NO_USER_KEYS cpu_to_le32(0xC0000290) -#define STATUS_FILE_NOT_ENCRYPTED cpu_to_le32(0xC0000291) -#define STATUS_NOT_EXPORT_FORMAT cpu_to_le32(0xC0000292) -#define STATUS_FILE_ENCRYPTED cpu_to_le32(0xC0000293) -#define STATUS_WMI_GUID_NOT_FOUND cpu_to_le32(0xC0000295) -#define STATUS_WMI_INSTANCE_NOT_FOUND cpu_to_le32(0xC0000296) -#define STATUS_WMI_ITEMID_NOT_FOUND cpu_to_le32(0xC0000297) -#define STATUS_WMI_TRY_AGAIN cpu_to_le32(0xC0000298) -#define STATUS_SHARED_POLICY cpu_to_le32(0xC0000299) -#define STATUS_POLICY_OBJECT_NOT_FOUND cpu_to_le32(0xC000029A) -#define STATUS_POLICY_ONLY_IN_DS cpu_to_le32(0xC000029B) -#define STATUS_VOLUME_NOT_UPGRADED cpu_to_le32(0xC000029C) -#define STATUS_REMOTE_STORAGE_NOT_ACTIVE cpu_to_le32(0xC000029D) -#define STATUS_REMOTE_STORAGE_MEDIA_ERROR cpu_to_le32(0xC000029E) -#define STATUS_NO_TRACKING_SERVICE cpu_to_le32(0xC000029F) -#define STATUS_SERVER_SID_MISMATCH cpu_to_le32(0xC00002A0) -#define STATUS_DS_NO_ATTRIBUTE_OR_VALUE cpu_to_le32(0xC00002A1) -#define STATUS_DS_INVALID_ATTRIBUTE_SYNTAX cpu_to_le32(0xC00002A2) -#define STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED cpu_to_le32(0xC00002A3) -#define STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS cpu_to_le32(0xC00002A4) -#define STATUS_DS_BUSY cpu_to_le32(0xC00002A5) -#define STATUS_DS_UNAVAILABLE cpu_to_le32(0xC00002A6) -#define STATUS_DS_NO_RIDS_ALLOCATED cpu_to_le32(0xC00002A7) -#define STATUS_DS_NO_MORE_RIDS cpu_to_le32(0xC00002A8) -#define STATUS_DS_INCORRECT_ROLE_OWNER cpu_to_le32(0xC00002A9) -#define STATUS_DS_RIDMGR_INIT_ERROR cpu_to_le32(0xC00002AA) -#define STATUS_DS_OBJ_CLASS_VIOLATION cpu_to_le32(0xC00002AB) -#define STATUS_DS_CANT_ON_NON_LEAF cpu_to_le32(0xC00002AC) -#define STATUS_DS_CANT_ON_RDN cpu_to_le32(0xC00002AD) -#define STATUS_DS_CANT_MOD_OBJ_CLASS cpu_to_le32(0xC00002AE) -#define STATUS_DS_CROSS_DOM_MOVE_FAILED cpu_to_le32(0xC00002AF) -#define STATUS_DS_GC_NOT_AVAILABLE cpu_to_le32(0xC00002B0) -#define STATUS_DIRECTORY_SERVICE_REQUIRED cpu_to_le32(0xC00002B1) -#define STATUS_REPARSE_ATTRIBUTE_CONFLICT cpu_to_le32(0xC00002B2) -#define STATUS_CANT_ENABLE_DENY_ONLY cpu_to_le32(0xC00002B3) -#define STATUS_FLOAT_MULTIPLE_FAULTS cpu_to_le32(0xC00002B4) -#define STATUS_FLOAT_MULTIPLE_TRAPS cpu_to_le32(0xC00002B5) -#define STATUS_DEVICE_REMOVED cpu_to_le32(0xC00002B6) -#define STATUS_JOURNAL_DELETE_IN_PROGRESS cpu_to_le32(0xC00002B7) -#define STATUS_JOURNAL_NOT_ACTIVE cpu_to_le32(0xC00002B8) -#define STATUS_NOINTERFACE cpu_to_le32(0xC00002B9) -#define STATUS_DS_ADMIN_LIMIT_EXCEEDED cpu_to_le32(0xC00002C1) -#define STATUS_DRIVER_FAILED_SLEEP cpu_to_le32(0xC00002C2) -#define STATUS_MUTUAL_AUTHENTICATION_FAILED cpu_to_le32(0xC00002C3) -#define STATUS_CORRUPT_SYSTEM_FILE cpu_to_le32(0xC00002C4) -#define STATUS_DATATYPE_MISALIGNMENT_ERROR cpu_to_le32(0xC00002C5) -#define STATUS_WMI_READ_ONLY cpu_to_le32(0xC00002C6) -#define STATUS_WMI_SET_FAILURE cpu_to_le32(0xC00002C7) -#define STATUS_COMMITMENT_MINIMUM cpu_to_le32(0xC00002C8) -#define STATUS_REG_NAT_CONSUMPTION 
cpu_to_le32(0xC00002C9) -#define STATUS_TRANSPORT_FULL cpu_to_le32(0xC00002CA) -#define STATUS_DS_SAM_INIT_FAILURE cpu_to_le32(0xC00002CB) -#define STATUS_ONLY_IF_CONNECTED cpu_to_le32(0xC00002CC) -#define STATUS_DS_SENSITIVE_GROUP_VIOLATION cpu_to_le32(0xC00002CD) -#define STATUS_PNP_RESTART_ENUMERATION cpu_to_le32(0xC00002CE) -#define STATUS_JOURNAL_ENTRY_DELETED cpu_to_le32(0xC00002CF) -#define STATUS_DS_CANT_MOD_PRIMARYGROUPID cpu_to_le32(0xC00002D0) -#define STATUS_SYSTEM_IMAGE_BAD_SIGNATURE cpu_to_le32(0xC00002D1) -#define STATUS_PNP_REBOOT_REQUIRED cpu_to_le32(0xC00002D2) -#define STATUS_POWER_STATE_INVALID cpu_to_le32(0xC00002D3) -#define STATUS_DS_INVALID_GROUP_TYPE cpu_to_le32(0xC00002D4) -#define STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN cpu_to_le32(0xC00002D5) -#define STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN cpu_to_le32(0xC00002D6) -#define STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER cpu_to_le32(0xC00002D7) -#define STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER cpu_to_le32(0xC00002D8) -#define STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER cpu_to_le32(0xC00002D9) -#define STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER cpu_to_le32(0xC00002DA) -#define STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER \ - cpu_to_le32(0xC00002DB) -#define STATUS_DS_HAVE_PRIMARY_MEMBERS cpu_to_le32(0xC00002DC) -#define STATUS_WMI_NOT_SUPPORTED cpu_to_le32(0xC00002DD) -#define STATUS_INSUFFICIENT_POWER cpu_to_le32(0xC00002DE) -#define STATUS_SAM_NEED_BOOTKEY_PASSWORD cpu_to_le32(0xC00002DF) -#define STATUS_SAM_NEED_BOOTKEY_FLOPPY cpu_to_le32(0xC00002E0) -#define STATUS_DS_CANT_START cpu_to_le32(0xC00002E1) -#define STATUS_DS_INIT_FAILURE cpu_to_le32(0xC00002E2) -#define STATUS_SAM_INIT_FAILURE cpu_to_le32(0xC00002E3) -#define STATUS_DS_GC_REQUIRED cpu_to_le32(0xC00002E4) -#define STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY cpu_to_le32(0xC00002E5) -#define STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS cpu_to_le32(0xC00002E6) -#define STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED cpu_to_le32(0xC00002E7) -#define STATUS_MULTIPLE_FAULT_VIOLATION cpu_to_le32(0xC00002E8) -#define STATUS_CURRENT_DOMAIN_NOT_ALLOWED cpu_to_le32(0xC00002E9) -#define STATUS_CANNOT_MAKE cpu_to_le32(0xC00002EA) -#define STATUS_SYSTEM_SHUTDOWN cpu_to_le32(0xC00002EB) -#define STATUS_DS_INIT_FAILURE_CONSOLE cpu_to_le32(0xC00002EC) -#define STATUS_DS_SAM_INIT_FAILURE_CONSOLE cpu_to_le32(0xC00002ED) -#define STATUS_UNFINISHED_CONTEXT_DELETED cpu_to_le32(0xC00002EE) -#define STATUS_NO_TGT_REPLY cpu_to_le32(0xC00002EF) -#define STATUS_OBJECTID_NOT_FOUND cpu_to_le32(0xC00002F0) -#define STATUS_NO_IP_ADDRESSES cpu_to_le32(0xC00002F1) -#define STATUS_WRONG_CREDENTIAL_HANDLE cpu_to_le32(0xC00002F2) -#define STATUS_CRYPTO_SYSTEM_INVALID cpu_to_le32(0xC00002F3) -#define STATUS_MAX_REFERRALS_EXCEEDED cpu_to_le32(0xC00002F4) -#define STATUS_MUST_BE_KDC cpu_to_le32(0xC00002F5) -#define STATUS_STRONG_CRYPTO_NOT_SUPPORTED cpu_to_le32(0xC00002F6) -#define STATUS_TOO_MANY_PRINCIPALS cpu_to_le32(0xC00002F7) -#define STATUS_NO_PA_DATA cpu_to_le32(0xC00002F8) -#define STATUS_PKINIT_NAME_MISMATCH cpu_to_le32(0xC00002F9) -#define STATUS_SMARTCARD_LOGON_REQUIRED cpu_to_le32(0xC00002FA) -#define STATUS_KDC_INVALID_REQUEST cpu_to_le32(0xC00002FB) -#define STATUS_KDC_UNABLE_TO_REFER cpu_to_le32(0xC00002FC) -#define STATUS_KDC_UNKNOWN_ETYPE cpu_to_le32(0xC00002FD) -#define STATUS_SHUTDOWN_IN_PROGRESS cpu_to_le32(0xC00002FE) -#define STATUS_SERVER_SHUTDOWN_IN_PROGRESS cpu_to_le32(0xC00002FF) -#define STATUS_NOT_SUPPORTED_ON_SBS cpu_to_le32(0xC0000300) -#define 
STATUS_WMI_GUID_DISCONNECTED cpu_to_le32(0xC0000301) -#define STATUS_WMI_ALREADY_DISABLED cpu_to_le32(0xC0000302) -#define STATUS_WMI_ALREADY_ENABLED cpu_to_le32(0xC0000303) -#define STATUS_MFT_TOO_FRAGMENTED cpu_to_le32(0xC0000304) -#define STATUS_COPY_PROTECTION_FAILURE cpu_to_le32(0xC0000305) -#define STATUS_CSS_AUTHENTICATION_FAILURE cpu_to_le32(0xC0000306) -#define STATUS_CSS_KEY_NOT_PRESENT cpu_to_le32(0xC0000307) -#define STATUS_CSS_KEY_NOT_ESTABLISHED cpu_to_le32(0xC0000308) -#define STATUS_CSS_SCRAMBLED_SECTOR cpu_to_le32(0xC0000309) -#define STATUS_CSS_REGION_MISMATCH cpu_to_le32(0xC000030A) -#define STATUS_CSS_RESETS_EXHAUSTED cpu_to_le32(0xC000030B) -#define STATUS_PKINIT_FAILURE cpu_to_le32(0xC0000320) -#define STATUS_SMARTCARD_SUBSYSTEM_FAILURE cpu_to_le32(0xC0000321) -#define STATUS_NO_KERB_KEY cpu_to_le32(0xC0000322) -#define STATUS_HOST_DOWN cpu_to_le32(0xC0000350) -#define STATUS_UNSUPPORTED_PREAUTH cpu_to_le32(0xC0000351) -#define STATUS_EFS_ALG_BLOB_TOO_BIG cpu_to_le32(0xC0000352) -#define STATUS_PORT_NOT_SET cpu_to_le32(0xC0000353) -#define STATUS_DEBUGGER_INACTIVE cpu_to_le32(0xC0000354) -#define STATUS_DS_VERSION_CHECK_FAILURE cpu_to_le32(0xC0000355) -#define STATUS_AUDITING_DISABLED cpu_to_le32(0xC0000356) -#define STATUS_PRENT4_MACHINE_ACCOUNT cpu_to_le32(0xC0000357) -#define STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER cpu_to_le32(0xC0000358) -#define STATUS_INVALID_IMAGE_WIN_32 cpu_to_le32(0xC0000359) -#define STATUS_INVALID_IMAGE_WIN_64 cpu_to_le32(0xC000035A) -#define STATUS_BAD_BINDINGS cpu_to_le32(0xC000035B) -#define STATUS_NETWORK_SESSION_EXPIRED cpu_to_le32(0xC000035C) -#define STATUS_APPHELP_BLOCK cpu_to_le32(0xC000035D) -#define STATUS_ALL_SIDS_FILTERED cpu_to_le32(0xC000035E) -#define STATUS_NOT_SAFE_MODE_DRIVER cpu_to_le32(0xC000035F) -#define STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT cpu_to_le32(0xC0000361) -#define STATUS_ACCESS_DISABLED_BY_POLICY_PATH cpu_to_le32(0xC0000362) -#define STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER cpu_to_le32(0xC0000363) -#define STATUS_ACCESS_DISABLED_BY_POLICY_OTHER cpu_to_le32(0xC0000364) -#define STATUS_FAILED_DRIVER_ENTRY cpu_to_le32(0xC0000365) -#define STATUS_DEVICE_ENUMERATION_ERROR cpu_to_le32(0xC0000366) -#define STATUS_MOUNT_POINT_NOT_RESOLVED cpu_to_le32(0xC0000368) -#define STATUS_INVALID_DEVICE_OBJECT_PARAMETER cpu_to_le32(0xC0000369) -#define STATUS_MCA_OCCURRED cpu_to_le32(0xC000036A) -#define STATUS_DRIVER_BLOCKED_CRITICAL cpu_to_le32(0xC000036B) -#define STATUS_DRIVER_BLOCKED cpu_to_le32(0xC000036C) -#define STATUS_DRIVER_DATABASE_ERROR cpu_to_le32(0xC000036D) -#define STATUS_SYSTEM_HIVE_TOO_LARGE cpu_to_le32(0xC000036E) -#define STATUS_INVALID_IMPORT_OF_NON_DLL cpu_to_le32(0xC000036F) -#define STATUS_NO_SECRETS cpu_to_le32(0xC0000371) -#define STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY cpu_to_le32(0xC0000372) -#define STATUS_FAILED_STACK_SWITCH cpu_to_le32(0xC0000373) -#define STATUS_HEAP_CORRUPTION cpu_to_le32(0xC0000374) -#define STATUS_SMARTCARD_WRONG_PIN cpu_to_le32(0xC0000380) -#define STATUS_SMARTCARD_CARD_BLOCKED cpu_to_le32(0xC0000381) -#define STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED cpu_to_le32(0xC0000382) -#define STATUS_SMARTCARD_NO_CARD cpu_to_le32(0xC0000383) -#define STATUS_SMARTCARD_NO_KEY_CONTAINER cpu_to_le32(0xC0000384) -#define STATUS_SMARTCARD_NO_CERTIFICATE cpu_to_le32(0xC0000385) -#define STATUS_SMARTCARD_NO_KEYSET cpu_to_le32(0xC0000386) -#define STATUS_SMARTCARD_IO_ERROR cpu_to_le32(0xC0000387) -#define STATUS_DOWNGRADE_DETECTED cpu_to_le32(0xC0000388) -#define 
STATUS_SMARTCARD_CERT_REVOKED cpu_to_le32(0xC0000389) -#define STATUS_ISSUING_CA_UNTRUSTED cpu_to_le32(0xC000038A) -#define STATUS_REVOCATION_OFFLINE_C cpu_to_le32(0xC000038B) -#define STATUS_PKINIT_CLIENT_FAILURE cpu_to_le32(0xC000038C) -#define STATUS_SMARTCARD_CERT_EXPIRED cpu_to_le32(0xC000038D) -#define STATUS_DRIVER_FAILED_PRIOR_UNLOAD cpu_to_le32(0xC000038E) -#define STATUS_SMARTCARD_SILENT_CONTEXT cpu_to_le32(0xC000038F) -#define STATUS_PER_USER_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000401) -#define STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000402) -#define STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000403) -#define STATUS_DS_NAME_NOT_UNIQUE cpu_to_le32(0xC0000404) -#define STATUS_DS_DUPLICATE_ID_FOUND cpu_to_le32(0xC0000405) -#define STATUS_DS_GROUP_CONVERSION_ERROR cpu_to_le32(0xC0000406) -#define STATUS_VOLSNAP_PREPARE_HIBERNATE cpu_to_le32(0xC0000407) -#define STATUS_USER2USER_REQUIRED cpu_to_le32(0xC0000408) -#define STATUS_STACK_BUFFER_OVERRUN cpu_to_le32(0xC0000409) -#define STATUS_NO_S4U_PROT_SUPPORT cpu_to_le32(0xC000040A) -#define STATUS_CROSSREALM_DELEGATION_FAILURE cpu_to_le32(0xC000040B) -#define STATUS_REVOCATION_OFFLINE_KDC cpu_to_le32(0xC000040C) -#define STATUS_ISSUING_CA_UNTRUSTED_KDC cpu_to_le32(0xC000040D) -#define STATUS_KDC_CERT_EXPIRED cpu_to_le32(0xC000040E) -#define STATUS_KDC_CERT_REVOKED cpu_to_le32(0xC000040F) -#define STATUS_PARAMETER_QUOTA_EXCEEDED cpu_to_le32(0xC0000410) -#define STATUS_HIBERNATION_FAILURE cpu_to_le32(0xC0000411) -#define STATUS_DELAY_LOAD_FAILED cpu_to_le32(0xC0000412) -#define STATUS_AUTHENTICATION_FIREWALL_FAILED cpu_to_le32(0xC0000413) -#define STATUS_VDM_DISALLOWED cpu_to_le32(0xC0000414) -#define STATUS_HUNG_DISPLAY_DRIVER_THREAD cpu_to_le32(0xC0000415) -#define STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE \ - cpu_to_le32(0xC0000416) -#define STATUS_INVALID_CRUNTIME_PARAMETER cpu_to_le32(0xC0000417) -#define STATUS_NTLM_BLOCKED cpu_to_le32(0xC0000418) -#define STATUS_ASSERTION_FAILURE cpu_to_le32(0xC0000420) -#define STATUS_VERIFIER_STOP cpu_to_le32(0xC0000421) -#define STATUS_CALLBACK_POP_STACK cpu_to_le32(0xC0000423) -#define STATUS_INCOMPATIBLE_DRIVER_BLOCKED cpu_to_le32(0xC0000424) -#define STATUS_HIVE_UNLOADED cpu_to_le32(0xC0000425) -#define STATUS_COMPRESSION_DISABLED cpu_to_le32(0xC0000426) -#define STATUS_FILE_SYSTEM_LIMITATION cpu_to_le32(0xC0000427) -#define STATUS_INVALID_IMAGE_HASH cpu_to_le32(0xC0000428) -#define STATUS_NOT_CAPABLE cpu_to_le32(0xC0000429) -#define STATUS_REQUEST_OUT_OF_SEQUENCE cpu_to_le32(0xC000042A) -#define STATUS_IMPLEMENTATION_LIMIT cpu_to_le32(0xC000042B) -#define STATUS_ELEVATION_REQUIRED cpu_to_le32(0xC000042C) -#define STATUS_BEYOND_VDL cpu_to_le32(0xC0000432) -#define STATUS_ENCOUNTERED_WRITE_IN_PROGRESS cpu_to_le32(0xC0000433) -#define STATUS_PTE_CHANGED cpu_to_le32(0xC0000434) -#define STATUS_PURGE_FAILED cpu_to_le32(0xC0000435) -#define STATUS_CRED_REQUIRES_CONFIRMATION cpu_to_le32(0xC0000440) -#define STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE cpu_to_le32(0xC0000441) -#define STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER cpu_to_le32(0xC0000442) -#define STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE cpu_to_le32(0xC0000443) -#define STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE cpu_to_le32(0xC0000444) -#define STATUS_CS_ENCRYPTION_FILE_NOT_CSE cpu_to_le32(0xC0000445) -#define STATUS_INVALID_LABEL cpu_to_le32(0xC0000446) -#define STATUS_DRIVER_PROCESS_TERMINATED cpu_to_le32(0xC0000450) -#define STATUS_AMBIGUOUS_SYSTEM_DEVICE cpu_to_le32(0xC0000451) -#define 
STATUS_SYSTEM_DEVICE_NOT_FOUND cpu_to_le32(0xC0000452) -#define STATUS_RESTART_BOOT_APPLICATION cpu_to_le32(0xC0000453) -#define STATUS_INVALID_TASK_NAME cpu_to_le32(0xC0000500) -#define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501) -#define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502) -#define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503) -#define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700) -#define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701) -#define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702) -#define STATUS_REQUEST_CANCELED cpu_to_le32(0xC0000703) -#define STATUS_RECURSIVE_DISPATCH cpu_to_le32(0xC0000704) -#define STATUS_LPC_RECEIVE_BUFFER_EXPECTED cpu_to_le32(0xC0000705) -#define STATUS_LPC_INVALID_CONNECTION_USAGE cpu_to_le32(0xC0000706) -#define STATUS_LPC_REQUESTS_NOT_ALLOWED cpu_to_le32(0xC0000707) -#define STATUS_RESOURCE_IN_USE cpu_to_le32(0xC0000708) -#define STATUS_HARDWARE_MEMORY_ERROR cpu_to_le32(0xC0000709) -#define STATUS_THREADPOOL_HANDLE_EXCEPTION cpu_to_le32(0xC000070A) -#define STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED cpu_to_le32(0xC000070B) -#define STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED \ - cpu_to_le32(0xC000070C) -#define STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED \ - cpu_to_le32(0xC000070D) -#define STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED \ - cpu_to_le32(0xC000070E) -#define STATUS_THREADPOOL_RELEASED_DURING_OPERATION cpu_to_le32(0xC000070F) -#define STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING cpu_to_le32(0xC0000710) -#define STATUS_APC_RETURNED_WHILE_IMPERSONATING cpu_to_le32(0xC0000711) -#define STATUS_PROCESS_IS_PROTECTED cpu_to_le32(0xC0000712) -#define STATUS_MCA_EXCEPTION cpu_to_le32(0xC0000713) -#define STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE cpu_to_le32(0xC0000714) -#define STATUS_SYMLINK_CLASS_DISABLED cpu_to_le32(0xC0000715) -#define STATUS_INVALID_IDN_NORMALIZATION cpu_to_le32(0xC0000716) -#define STATUS_NO_UNICODE_TRANSLATION cpu_to_le32(0xC0000717) -#define STATUS_ALREADY_REGISTERED cpu_to_le32(0xC0000718) -#define STATUS_CONTEXT_MISMATCH cpu_to_le32(0xC0000719) -#define STATUS_PORT_ALREADY_HAS_COMPLETION_LIST cpu_to_le32(0xC000071A) -#define STATUS_CALLBACK_RETURNED_THREAD_PRIORITY cpu_to_le32(0xC000071B) -#define STATUS_INVALID_THREAD cpu_to_le32(0xC000071C) -#define STATUS_CALLBACK_RETURNED_TRANSACTION cpu_to_le32(0xC000071D) -#define STATUS_CALLBACK_RETURNED_LDR_LOCK cpu_to_le32(0xC000071E) -#define STATUS_CALLBACK_RETURNED_LANG cpu_to_le32(0xC000071F) -#define STATUS_CALLBACK_RETURNED_PRI_BACK cpu_to_le32(0xC0000720) -#define STATUS_CALLBACK_RETURNED_THREAD_AFFINITY cpu_to_le32(0xC0000721) -#define STATUS_DISK_REPAIR_DISABLED cpu_to_le32(0xC0000800) -#define STATUS_DS_DOMAIN_RENAME_IN_PROGRESS cpu_to_le32(0xC0000801) -#define STATUS_DISK_QUOTA_EXCEEDED cpu_to_le32(0xC0000802) -#define STATUS_CONTENT_BLOCKED cpu_to_le32(0xC0000804) -#define STATUS_BAD_CLUSTERS cpu_to_le32(0xC0000805) -#define STATUS_VOLUME_DIRTY cpu_to_le32(0xC0000806) -#define STATUS_FILE_CHECKED_OUT cpu_to_le32(0xC0000901) -#define STATUS_CHECKOUT_REQUIRED cpu_to_le32(0xC0000902) -#define STATUS_BAD_FILE_TYPE cpu_to_le32(0xC0000903) -#define STATUS_FILE_TOO_LARGE cpu_to_le32(0xC0000904) -#define STATUS_FORMS_AUTH_REQUIRED cpu_to_le32(0xC0000905) -#define STATUS_VIRUS_INFECTED cpu_to_le32(0xC0000906) -#define STATUS_VIRUS_DELETED cpu_to_le32(0xC0000907) -#define STATUS_BAD_MCFG_TABLE cpu_to_le32(0xC0000908) -#define STATUS_WOW_ASSERTION cpu_to_le32(0xC0009898) -#define STATUS_INVALID_SIGNATURE cpu_to_le32(0xC000A000) 
-#define STATUS_HMAC_NOT_SUPPORTED cpu_to_le32(0xC000A001) -#define STATUS_IPSEC_QUEUE_OVERFLOW cpu_to_le32(0xC000A010) -#define STATUS_ND_QUEUE_OVERFLOW cpu_to_le32(0xC000A011) -#define STATUS_HOPLIMIT_EXCEEDED cpu_to_le32(0xC000A012) -#define STATUS_PROTOCOL_NOT_SUPPORTED cpu_to_le32(0xC000A013) -#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED \ - cpu_to_le32(0xC000A080) -#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR \ - cpu_to_le32(0xC000A081) -#define STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR cpu_to_le32(0xC000A082) -#define STATUS_XML_PARSE_ERROR cpu_to_le32(0xC000A083) -#define STATUS_XMLDSIG_ERROR cpu_to_le32(0xC000A084) -#define STATUS_WRONG_COMPARTMENT cpu_to_le32(0xC000A085) -#define STATUS_AUTHIP_FAILURE cpu_to_le32(0xC000A086) -#define DBG_NO_STATE_CHANGE cpu_to_le32(0xC0010001) -#define DBG_APP_NOT_IDLE cpu_to_le32(0xC0010002) -#define RPC_NT_INVALID_STRING_BINDING cpu_to_le32(0xC0020001) -#define RPC_NT_WRONG_KIND_OF_BINDING cpu_to_le32(0xC0020002) -#define RPC_NT_INVALID_BINDING cpu_to_le32(0xC0020003) -#define RPC_NT_PROTSEQ_NOT_SUPPORTED cpu_to_le32(0xC0020004) -#define RPC_NT_INVALID_RPC_PROTSEQ cpu_to_le32(0xC0020005) -#define RPC_NT_INVALID_STRING_UUID cpu_to_le32(0xC0020006) -#define RPC_NT_INVALID_ENDPOINT_FORMAT cpu_to_le32(0xC0020007) -#define RPC_NT_INVALID_NET_ADDR cpu_to_le32(0xC0020008) -#define RPC_NT_NO_ENDPOINT_FOUND cpu_to_le32(0xC0020009) -#define RPC_NT_INVALID_TIMEOUT cpu_to_le32(0xC002000A) -#define RPC_NT_OBJECT_NOT_FOUND cpu_to_le32(0xC002000B) -#define RPC_NT_ALREADY_REGISTERED cpu_to_le32(0xC002000C) -#define RPC_NT_TYPE_ALREADY_REGISTERED cpu_to_le32(0xC002000D) -#define RPC_NT_ALREADY_LISTENING cpu_to_le32(0xC002000E) -#define RPC_NT_NO_PROTSEQS_REGISTERED cpu_to_le32(0xC002000F) -#define RPC_NT_NOT_LISTENING cpu_to_le32(0xC0020010) -#define RPC_NT_UNKNOWN_MGR_TYPE cpu_to_le32(0xC0020011) -#define RPC_NT_UNKNOWN_IF cpu_to_le32(0xC0020012) -#define RPC_NT_NO_BINDINGS cpu_to_le32(0xC0020013) -#define RPC_NT_NO_PROTSEQS cpu_to_le32(0xC0020014) -#define RPC_NT_CANT_CREATE_ENDPOINT cpu_to_le32(0xC0020015) -#define RPC_NT_OUT_OF_RESOURCES cpu_to_le32(0xC0020016) -#define RPC_NT_SERVER_UNAVAILABLE cpu_to_le32(0xC0020017) -#define RPC_NT_SERVER_TOO_BUSY cpu_to_le32(0xC0020018) -#define RPC_NT_INVALID_NETWORK_OPTIONS cpu_to_le32(0xC0020019) -#define RPC_NT_NO_CALL_ACTIVE cpu_to_le32(0xC002001A) -#define RPC_NT_CALL_FAILED cpu_to_le32(0xC002001B) -#define RPC_NT_CALL_FAILED_DNE cpu_to_le32(0xC002001C) -#define RPC_NT_PROTOCOL_ERROR cpu_to_le32(0xC002001D) -#define RPC_NT_UNSUPPORTED_TRANS_SYN cpu_to_le32(0xC002001F) -#define RPC_NT_UNSUPPORTED_TYPE cpu_to_le32(0xC0020021) -#define RPC_NT_INVALID_TAG cpu_to_le32(0xC0020022) -#define RPC_NT_INVALID_BOUND cpu_to_le32(0xC0020023) -#define RPC_NT_NO_ENTRY_NAME cpu_to_le32(0xC0020024) -#define RPC_NT_INVALID_NAME_SYNTAX cpu_to_le32(0xC0020025) -#define RPC_NT_UNSUPPORTED_NAME_SYNTAX cpu_to_le32(0xC0020026) -#define RPC_NT_UUID_NO_ADDRESS cpu_to_le32(0xC0020028) -#define RPC_NT_DUPLICATE_ENDPOINT cpu_to_le32(0xC0020029) -#define RPC_NT_UNKNOWN_AUTHN_TYPE cpu_to_le32(0xC002002A) -#define RPC_NT_MAX_CALLS_TOO_SMALL cpu_to_le32(0xC002002B) -#define RPC_NT_STRING_TOO_LONG cpu_to_le32(0xC002002C) -#define RPC_NT_PROTSEQ_NOT_FOUND cpu_to_le32(0xC002002D) -#define RPC_NT_PROCNUM_OUT_OF_RANGE cpu_to_le32(0xC002002E) -#define RPC_NT_BINDING_HAS_NO_AUTH cpu_to_le32(0xC002002F) -#define RPC_NT_UNKNOWN_AUTHN_SERVICE cpu_to_le32(0xC0020030) -#define RPC_NT_UNKNOWN_AUTHN_LEVEL cpu_to_le32(0xC0020031) 
-#define RPC_NT_INVALID_AUTH_IDENTITY cpu_to_le32(0xC0020032) -#define RPC_NT_UNKNOWN_AUTHZ_SERVICE cpu_to_le32(0xC0020033) -#define EPT_NT_INVALID_ENTRY cpu_to_le32(0xC0020034) -#define EPT_NT_CANT_PERFORM_OP cpu_to_le32(0xC0020035) -#define EPT_NT_NOT_REGISTERED cpu_to_le32(0xC0020036) -#define RPC_NT_NOTHING_TO_EXPORT cpu_to_le32(0xC0020037) -#define RPC_NT_INCOMPLETE_NAME cpu_to_le32(0xC0020038) -#define RPC_NT_INVALID_VERS_OPTION cpu_to_le32(0xC0020039) -#define RPC_NT_NO_MORE_MEMBERS cpu_to_le32(0xC002003A) -#define RPC_NT_NOT_ALL_OBJS_UNEXPORTED cpu_to_le32(0xC002003B) -#define RPC_NT_INTERFACE_NOT_FOUND cpu_to_le32(0xC002003C) -#define RPC_NT_ENTRY_ALREADY_EXISTS cpu_to_le32(0xC002003D) -#define RPC_NT_ENTRY_NOT_FOUND cpu_to_le32(0xC002003E) -#define RPC_NT_NAME_SERVICE_UNAVAILABLE cpu_to_le32(0xC002003F) -#define RPC_NT_INVALID_NAF_ID cpu_to_le32(0xC0020040) -#define RPC_NT_CANNOT_SUPPORT cpu_to_le32(0xC0020041) -#define RPC_NT_NO_CONTEXT_AVAILABLE cpu_to_le32(0xC0020042) -#define RPC_NT_INTERNAL_ERROR cpu_to_le32(0xC0020043) -#define RPC_NT_ZERO_DIVIDE cpu_to_le32(0xC0020044) -#define RPC_NT_ADDRESS_ERROR cpu_to_le32(0xC0020045) -#define RPC_NT_FP_DIV_ZERO cpu_to_le32(0xC0020046) -#define RPC_NT_FP_UNDERFLOW cpu_to_le32(0xC0020047) -#define RPC_NT_FP_OVERFLOW cpu_to_le32(0xC0020048) -#define RPC_NT_CALL_IN_PROGRESS cpu_to_le32(0xC0020049) -#define RPC_NT_NO_MORE_BINDINGS cpu_to_le32(0xC002004A) -#define RPC_NT_GROUP_MEMBER_NOT_FOUND cpu_to_le32(0xC002004B) -#define EPT_NT_CANT_CREATE cpu_to_le32(0xC002004C) -#define RPC_NT_INVALID_OBJECT cpu_to_le32(0xC002004D) -#define RPC_NT_NO_INTERFACES cpu_to_le32(0xC002004F) -#define RPC_NT_CALL_CANCELLED cpu_to_le32(0xC0020050) -#define RPC_NT_BINDING_INCOMPLETE cpu_to_le32(0xC0020051) -#define RPC_NT_COMM_FAILURE cpu_to_le32(0xC0020052) -#define RPC_NT_UNSUPPORTED_AUTHN_LEVEL cpu_to_le32(0xC0020053) -#define RPC_NT_NO_PRINC_NAME cpu_to_le32(0xC0020054) -#define RPC_NT_NOT_RPC_ERROR cpu_to_le32(0xC0020055) -#define RPC_NT_SEC_PKG_ERROR cpu_to_le32(0xC0020057) -#define RPC_NT_NOT_CANCELLED cpu_to_le32(0xC0020058) -#define RPC_NT_INVALID_ASYNC_HANDLE cpu_to_le32(0xC0020062) -#define RPC_NT_INVALID_ASYNC_CALL cpu_to_le32(0xC0020063) -#define RPC_NT_PROXY_ACCESS_DENIED cpu_to_le32(0xC0020064) -#define RPC_NT_NO_MORE_ENTRIES cpu_to_le32(0xC0030001) -#define RPC_NT_SS_CHAR_TRANS_OPEN_FAIL cpu_to_le32(0xC0030002) -#define RPC_NT_SS_CHAR_TRANS_SHORT_FILE cpu_to_le32(0xC0030003) -#define RPC_NT_SS_IN_NULL_CONTEXT cpu_to_le32(0xC0030004) -#define RPC_NT_SS_CONTEXT_MISMATCH cpu_to_le32(0xC0030005) -#define RPC_NT_SS_CONTEXT_DAMAGED cpu_to_le32(0xC0030006) -#define RPC_NT_SS_HANDLES_MISMATCH cpu_to_le32(0xC0030007) -#define RPC_NT_SS_CANNOT_GET_CALL_HANDLE cpu_to_le32(0xC0030008) -#define RPC_NT_NULL_REF_POINTER cpu_to_le32(0xC0030009) -#define RPC_NT_ENUM_VALUE_OUT_OF_RANGE cpu_to_le32(0xC003000A) -#define RPC_NT_BYTE_COUNT_TOO_SMALL cpu_to_le32(0xC003000B) -#define RPC_NT_BAD_STUB_DATA cpu_to_le32(0xC003000C) -#define RPC_NT_INVALID_ES_ACTION cpu_to_le32(0xC0030059) -#define RPC_NT_WRONG_ES_VERSION cpu_to_le32(0xC003005A) -#define RPC_NT_WRONG_STUB_VERSION cpu_to_le32(0xC003005B) -#define RPC_NT_INVALID_PIPE_OBJECT cpu_to_le32(0xC003005C) -#define RPC_NT_INVALID_PIPE_OPERATION cpu_to_le32(0xC003005D) -#define RPC_NT_WRONG_PIPE_VERSION cpu_to_le32(0xC003005E) -#define RPC_NT_PIPE_CLOSED cpu_to_le32(0xC003005F) -#define RPC_NT_PIPE_DISCIPLINE_ERROR cpu_to_le32(0xC0030060) -#define RPC_NT_PIPE_EMPTY cpu_to_le32(0xC0030061) -#define 
STATUS_PNP_BAD_MPS_TABLE cpu_to_le32(0xC0040035) -#define STATUS_PNP_TRANSLATION_FAILED cpu_to_le32(0xC0040036) -#define STATUS_PNP_IRQ_TRANSLATION_FAILED cpu_to_le32(0xC0040037) -#define STATUS_PNP_INVALID_ID cpu_to_le32(0xC0040038) -#define STATUS_IO_REISSUE_AS_CACHED cpu_to_le32(0xC0040039) -#define STATUS_CTX_WINSTATION_NAME_INVALID cpu_to_le32(0xC00A0001) -#define STATUS_CTX_INVALID_PD cpu_to_le32(0xC00A0002) -#define STATUS_CTX_PD_NOT_FOUND cpu_to_le32(0xC00A0003) -#define STATUS_CTX_CLOSE_PENDING cpu_to_le32(0xC00A0006) -#define STATUS_CTX_NO_OUTBUF cpu_to_le32(0xC00A0007) -#define STATUS_CTX_MODEM_INF_NOT_FOUND cpu_to_le32(0xC00A0008) -#define STATUS_CTX_INVALID_MODEMNAME cpu_to_le32(0xC00A0009) -#define STATUS_CTX_RESPONSE_ERROR cpu_to_le32(0xC00A000A) -#define STATUS_CTX_MODEM_RESPONSE_TIMEOUT cpu_to_le32(0xC00A000B) -#define STATUS_CTX_MODEM_RESPONSE_NO_CARRIER cpu_to_le32(0xC00A000C) -#define STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE cpu_to_le32(0xC00A000D) -#define STATUS_CTX_MODEM_RESPONSE_BUSY cpu_to_le32(0xC00A000E) -#define STATUS_CTX_MODEM_RESPONSE_VOICE cpu_to_le32(0xC00A000F) -#define STATUS_CTX_TD_ERROR cpu_to_le32(0xC00A0010) -#define STATUS_CTX_LICENSE_CLIENT_INVALID cpu_to_le32(0xC00A0012) -#define STATUS_CTX_LICENSE_NOT_AVAILABLE cpu_to_le32(0xC00A0013) -#define STATUS_CTX_LICENSE_EXPIRED cpu_to_le32(0xC00A0014) -#define STATUS_CTX_WINSTATION_NOT_FOUND cpu_to_le32(0xC00A0015) -#define STATUS_CTX_WINSTATION_NAME_COLLISION cpu_to_le32(0xC00A0016) -#define STATUS_CTX_WINSTATION_BUSY cpu_to_le32(0xC00A0017) -#define STATUS_CTX_BAD_VIDEO_MODE cpu_to_le32(0xC00A0018) -#define STATUS_CTX_GRAPHICS_INVALID cpu_to_le32(0xC00A0022) -#define STATUS_CTX_NOT_CONSOLE cpu_to_le32(0xC00A0024) -#define STATUS_CTX_CLIENT_QUERY_TIMEOUT cpu_to_le32(0xC00A0026) -#define STATUS_CTX_CONSOLE_DISCONNECT cpu_to_le32(0xC00A0027) -#define STATUS_CTX_CONSOLE_CONNECT cpu_to_le32(0xC00A0028) -#define STATUS_CTX_SHADOW_DENIED cpu_to_le32(0xC00A002A) -#define STATUS_CTX_WINSTATION_ACCESS_DENIED cpu_to_le32(0xC00A002B) -#define STATUS_CTX_INVALID_WD cpu_to_le32(0xC00A002E) -#define STATUS_CTX_WD_NOT_FOUND cpu_to_le32(0xC00A002F) -#define STATUS_CTX_SHADOW_INVALID cpu_to_le32(0xC00A0030) -#define STATUS_CTX_SHADOW_DISABLED cpu_to_le32(0xC00A0031) -#define STATUS_RDP_PROTOCOL_ERROR cpu_to_le32(0xC00A0032) -#define STATUS_CTX_CLIENT_LICENSE_NOT_SET cpu_to_le32(0xC00A0033) -#define STATUS_CTX_CLIENT_LICENSE_IN_USE cpu_to_le32(0xC00A0034) -#define STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE cpu_to_le32(0xC00A0035) -#define STATUS_CTX_SHADOW_NOT_RUNNING cpu_to_le32(0xC00A0036) -#define STATUS_CTX_LOGON_DISABLED cpu_to_le32(0xC00A0037) -#define STATUS_CTX_SECURITY_LAYER_ERROR cpu_to_le32(0xC00A0038) -#define STATUS_TS_INCOMPATIBLE_SESSIONS cpu_to_le32(0xC00A0039) -#define STATUS_MUI_FILE_NOT_FOUND cpu_to_le32(0xC00B0001) -#define STATUS_MUI_INVALID_FILE cpu_to_le32(0xC00B0002) -#define STATUS_MUI_INVALID_RC_CONFIG cpu_to_le32(0xC00B0003) -#define STATUS_MUI_INVALID_LOCALE_NAME cpu_to_le32(0xC00B0004) -#define STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME cpu_to_le32(0xC00B0005) -#define STATUS_MUI_FILE_NOT_LOADED cpu_to_le32(0xC00B0006) -#define STATUS_RESOURCE_ENUM_USER_STOP cpu_to_le32(0xC00B0007) -#define STATUS_CLUSTER_INVALID_NODE cpu_to_le32(0xC0130001) -#define STATUS_CLUSTER_NODE_EXISTS cpu_to_le32(0xC0130002) -#define STATUS_CLUSTER_JOIN_IN_PROGRESS cpu_to_le32(0xC0130003) -#define STATUS_CLUSTER_NODE_NOT_FOUND cpu_to_le32(0xC0130004) -#define STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND cpu_to_le32(0xC0130005) -#define 
STATUS_CLUSTER_NETWORK_EXISTS cpu_to_le32(0xC0130006) -#define STATUS_CLUSTER_NETWORK_NOT_FOUND cpu_to_le32(0xC0130007) -#define STATUS_CLUSTER_NETINTERFACE_EXISTS cpu_to_le32(0xC0130008) -#define STATUS_CLUSTER_NETINTERFACE_NOT_FOUND cpu_to_le32(0xC0130009) -#define STATUS_CLUSTER_INVALID_REQUEST cpu_to_le32(0xC013000A) -#define STATUS_CLUSTER_INVALID_NETWORK_PROVIDER cpu_to_le32(0xC013000B) -#define STATUS_CLUSTER_NODE_DOWN cpu_to_le32(0xC013000C) -#define STATUS_CLUSTER_NODE_UNREACHABLE cpu_to_le32(0xC013000D) -#define STATUS_CLUSTER_NODE_NOT_MEMBER cpu_to_le32(0xC013000E) -#define STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS cpu_to_le32(0xC013000F) -#define STATUS_CLUSTER_INVALID_NETWORK cpu_to_le32(0xC0130010) -#define STATUS_CLUSTER_NO_NET_ADAPTERS cpu_to_le32(0xC0130011) -#define STATUS_CLUSTER_NODE_UP cpu_to_le32(0xC0130012) -#define STATUS_CLUSTER_NODE_PAUSED cpu_to_le32(0xC0130013) -#define STATUS_CLUSTER_NODE_NOT_PAUSED cpu_to_le32(0xC0130014) -#define STATUS_CLUSTER_NO_SECURITY_CONTEXT cpu_to_le32(0xC0130015) -#define STATUS_CLUSTER_NETWORK_NOT_INTERNAL cpu_to_le32(0xC0130016) -#define STATUS_CLUSTER_POISONED cpu_to_le32(0xC0130017) -#define STATUS_ACPI_INVALID_OPCODE cpu_to_le32(0xC0140001) -#define STATUS_ACPI_STACK_OVERFLOW cpu_to_le32(0xC0140002) -#define STATUS_ACPI_ASSERT_FAILED cpu_to_le32(0xC0140003) -#define STATUS_ACPI_INVALID_INDEX cpu_to_le32(0xC0140004) -#define STATUS_ACPI_INVALID_ARGUMENT cpu_to_le32(0xC0140005) -#define STATUS_ACPI_FATAL cpu_to_le32(0xC0140006) -#define STATUS_ACPI_INVALID_SUPERNAME cpu_to_le32(0xC0140007) -#define STATUS_ACPI_INVALID_ARGTYPE cpu_to_le32(0xC0140008) -#define STATUS_ACPI_INVALID_OBJTYPE cpu_to_le32(0xC0140009) -#define STATUS_ACPI_INVALID_TARGETTYPE cpu_to_le32(0xC014000A) -#define STATUS_ACPI_INCORRECT_ARGUMENT_COUNT cpu_to_le32(0xC014000B) -#define STATUS_ACPI_ADDRESS_NOT_MAPPED cpu_to_le32(0xC014000C) -#define STATUS_ACPI_INVALID_EVENTTYPE cpu_to_le32(0xC014000D) -#define STATUS_ACPI_HANDLER_COLLISION cpu_to_le32(0xC014000E) -#define STATUS_ACPI_INVALID_DATA cpu_to_le32(0xC014000F) -#define STATUS_ACPI_INVALID_REGION cpu_to_le32(0xC0140010) -#define STATUS_ACPI_INVALID_ACCESS_SIZE cpu_to_le32(0xC0140011) -#define STATUS_ACPI_ACQUIRE_GLOBAL_LOCK cpu_to_le32(0xC0140012) -#define STATUS_ACPI_ALREADY_INITIALIZED cpu_to_le32(0xC0140013) -#define STATUS_ACPI_NOT_INITIALIZED cpu_to_le32(0xC0140014) -#define STATUS_ACPI_INVALID_MUTEX_LEVEL cpu_to_le32(0xC0140015) -#define STATUS_ACPI_MUTEX_NOT_OWNED cpu_to_le32(0xC0140016) -#define STATUS_ACPI_MUTEX_NOT_OWNER cpu_to_le32(0xC0140017) -#define STATUS_ACPI_RS_ACCESS cpu_to_le32(0xC0140018) -#define STATUS_ACPI_INVALID_TABLE cpu_to_le32(0xC0140019) -#define STATUS_ACPI_REG_HANDLER_FAILED cpu_to_le32(0xC0140020) -#define STATUS_ACPI_POWER_REQUEST_FAILED cpu_to_le32(0xC0140021) -#define STATUS_SXS_SECTION_NOT_FOUND cpu_to_le32(0xC0150001) -#define STATUS_SXS_CANT_GEN_ACTCTX cpu_to_le32(0xC0150002) -#define STATUS_SXS_INVALID_ACTCTXDATA_FORMAT cpu_to_le32(0xC0150003) -#define STATUS_SXS_ASSEMBLY_NOT_FOUND cpu_to_le32(0xC0150004) -#define STATUS_SXS_MANIFEST_FORMAT_ERROR cpu_to_le32(0xC0150005) -#define STATUS_SXS_MANIFEST_PARSE_ERROR cpu_to_le32(0xC0150006) -#define STATUS_SXS_ACTIVATION_CONTEXT_DISABLED cpu_to_le32(0xC0150007) -#define STATUS_SXS_KEY_NOT_FOUND cpu_to_le32(0xC0150008) -#define STATUS_SXS_VERSION_CONFLICT cpu_to_le32(0xC0150009) -#define STATUS_SXS_WRONG_SECTION_TYPE cpu_to_le32(0xC015000A) -#define STATUS_SXS_THREAD_QUERIES_DISABLED cpu_to_le32(0xC015000B) -#define 
STATUS_SXS_ASSEMBLY_MISSING cpu_to_le32(0xC015000C) -#define STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET cpu_to_le32(0xC015000E) -#define STATUS_SXS_EARLY_DEACTIVATION cpu_to_le32(0xC015000F) -#define STATUS_SXS_INVALID_DEACTIVATION cpu_to_le32(0xC0150010) -#define STATUS_SXS_MULTIPLE_DEACTIVATION cpu_to_le32(0xC0150011) -#define STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY \ - cpu_to_le32(0xC0150012) -#define STATUS_SXS_PROCESS_TERMINATION_REQUESTED cpu_to_le32(0xC0150013) -#define STATUS_SXS_CORRUPT_ACTIVATION_STACK cpu_to_le32(0xC0150014) -#define STATUS_SXS_CORRUPTION cpu_to_le32(0xC0150015) -#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE cpu_to_le32(0xC0150016) -#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME cpu_to_le32(0xC0150017) -#define STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE cpu_to_le32(0xC0150018) -#define STATUS_SXS_IDENTITY_PARSE_ERROR cpu_to_le32(0xC0150019) -#define STATUS_SXS_COMPONENT_STORE_CORRUPT cpu_to_le32(0xC015001A) -#define STATUS_SXS_FILE_HASH_MISMATCH cpu_to_le32(0xC015001B) -#define STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT \ - cpu_to_le32(0xC015001C) -#define STATUS_SXS_IDENTITIES_DIFFERENT cpu_to_le32(0xC015001D) -#define STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT cpu_to_le32(0xC015001E) -#define STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY cpu_to_le32(0xC015001F) -#define STATUS_ADVANCED_INSTALLER_FAILED cpu_to_le32(0xC0150020) -#define STATUS_XML_ENCODING_MISMATCH cpu_to_le32(0xC0150021) -#define STATUS_SXS_MANIFEST_TOO_BIG cpu_to_le32(0xC0150022) -#define STATUS_SXS_SETTING_NOT_REGISTERED cpu_to_le32(0xC0150023) -#define STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE cpu_to_le32(0xC0150024) -#define STATUS_SMI_PRIMITIVE_INSTALLER_FAILED cpu_to_le32(0xC0150025) -#define STATUS_GENERIC_COMMAND_FAILED cpu_to_le32(0xC0150026) -#define STATUS_SXS_FILE_HASH_MISSING cpu_to_le32(0xC0150027) -#define STATUS_TRANSACTIONAL_CONFLICT cpu_to_le32(0xC0190001) -#define STATUS_INVALID_TRANSACTION cpu_to_le32(0xC0190002) -#define STATUS_TRANSACTION_NOT_ACTIVE cpu_to_le32(0xC0190003) -#define STATUS_TM_INITIALIZATION_FAILED cpu_to_le32(0xC0190004) -#define STATUS_RM_NOT_ACTIVE cpu_to_le32(0xC0190005) -#define STATUS_RM_METADATA_CORRUPT cpu_to_le32(0xC0190006) -#define STATUS_TRANSACTION_NOT_JOINED cpu_to_le32(0xC0190007) -#define STATUS_DIRECTORY_NOT_RM cpu_to_le32(0xC0190008) -#define STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE cpu_to_le32(0xC019000A) -#define STATUS_LOG_RESIZE_INVALID_SIZE cpu_to_le32(0xC019000B) -#define STATUS_REMOTE_FILE_VERSION_MISMATCH cpu_to_le32(0xC019000C) -#define STATUS_CRM_PROTOCOL_ALREADY_EXISTS cpu_to_le32(0xC019000F) -#define STATUS_TRANSACTION_PROPAGATION_FAILED cpu_to_le32(0xC0190010) -#define STATUS_CRM_PROTOCOL_NOT_FOUND cpu_to_le32(0xC0190011) -#define STATUS_TRANSACTION_SUPERIOR_EXISTS cpu_to_le32(0xC0190012) -#define STATUS_TRANSACTION_REQUEST_NOT_VALID cpu_to_le32(0xC0190013) -#define STATUS_TRANSACTION_NOT_REQUESTED cpu_to_le32(0xC0190014) -#define STATUS_TRANSACTION_ALREADY_ABORTED cpu_to_le32(0xC0190015) -#define STATUS_TRANSACTION_ALREADY_COMMITTED cpu_to_le32(0xC0190016) -#define STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER cpu_to_le32(0xC0190017) -#define STATUS_CURRENT_TRANSACTION_NOT_VALID cpu_to_le32(0xC0190018) -#define STATUS_LOG_GROWTH_FAILED cpu_to_le32(0xC0190019) -#define STATUS_OBJECT_NO_LONGER_EXISTS cpu_to_le32(0xC0190021) -#define STATUS_STREAM_MINIVERSION_NOT_FOUND cpu_to_le32(0xC0190022) -#define STATUS_STREAM_MINIVERSION_NOT_VALID cpu_to_le32(0xC0190023) -#define 
STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION \ - cpu_to_le32(0xC0190024) -#define STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT cpu_to_le32(0xC0190025) -#define STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS cpu_to_le32(0xC0190026) -#define STATUS_HANDLE_NO_LONGER_VALID cpu_to_le32(0xC0190028) -#define STATUS_LOG_CORRUPTION_DETECTED cpu_to_le32(0xC0190030) -#define STATUS_RM_DISCONNECTED cpu_to_le32(0xC0190032) -#define STATUS_ENLISTMENT_NOT_SUPERIOR cpu_to_le32(0xC0190033) -#define STATUS_FILE_IDENTITY_NOT_PERSISTENT cpu_to_le32(0xC0190036) -#define STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY cpu_to_le32(0xC0190037) -#define STATUS_CANT_CROSS_RM_BOUNDARY cpu_to_le32(0xC0190038) -#define STATUS_TXF_DIR_NOT_EMPTY cpu_to_le32(0xC0190039) -#define STATUS_INDOUBT_TRANSACTIONS_EXIST cpu_to_le32(0xC019003A) -#define STATUS_TM_VOLATILE cpu_to_le32(0xC019003B) -#define STATUS_ROLLBACK_TIMER_EXPIRED cpu_to_le32(0xC019003C) -#define STATUS_TXF_ATTRIBUTE_CORRUPT cpu_to_le32(0xC019003D) -#define STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC019003E) -#define STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED cpu_to_le32(0xC019003F) -#define STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE cpu_to_le32(0xC0190040) -#define STATUS_TRANSACTION_REQUIRED_PROMOTION cpu_to_le32(0xC0190043) -#define STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION cpu_to_le32(0xC0190044) -#define STATUS_TRANSACTIONS_NOT_FROZEN cpu_to_le32(0xC0190045) -#define STATUS_TRANSACTION_FREEZE_IN_PROGRESS cpu_to_le32(0xC0190046) -#define STATUS_NOT_SNAPSHOT_VOLUME cpu_to_le32(0xC0190047) -#define STATUS_NO_SAVEPOINT_WITH_OPEN_FILES cpu_to_le32(0xC0190048) -#define STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC0190049) -#define STATUS_TM_IDENTITY_MISMATCH cpu_to_le32(0xC019004A) -#define STATUS_FLOATED_SECTION cpu_to_le32(0xC019004B) -#define STATUS_CANNOT_ACCEPT_TRANSACTED_WORK cpu_to_le32(0xC019004C) -#define STATUS_CANNOT_ABORT_TRANSACTIONS cpu_to_le32(0xC019004D) -#define STATUS_TRANSACTION_NOT_FOUND cpu_to_le32(0xC019004E) -#define STATUS_RESOURCEMANAGER_NOT_FOUND cpu_to_le32(0xC019004F) -#define STATUS_ENLISTMENT_NOT_FOUND cpu_to_le32(0xC0190050) -#define STATUS_TRANSACTIONMANAGER_NOT_FOUND cpu_to_le32(0xC0190051) -#define STATUS_TRANSACTIONMANAGER_NOT_ONLINE cpu_to_le32(0xC0190052) -#define STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION \ - cpu_to_le32(0xC0190053) -#define STATUS_TRANSACTION_NOT_ROOT cpu_to_le32(0xC0190054) -#define STATUS_TRANSACTION_OBJECT_EXPIRED cpu_to_le32(0xC0190055) -#define STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC0190056) -#define STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED cpu_to_le32(0xC0190057) -#define STATUS_TRANSACTION_RECORD_TOO_LONG cpu_to_le32(0xC0190058) -#define STATUS_NO_LINK_TRACKING_IN_TRANSACTION cpu_to_le32(0xC0190059) -#define STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION cpu_to_le32(0xC019005A) -#define STATUS_TRANSACTION_INTEGRITY_VIOLATED cpu_to_le32(0xC019005B) -#define STATUS_LOG_SECTOR_INVALID cpu_to_le32(0xC01A0001) -#define STATUS_LOG_SECTOR_PARITY_INVALID cpu_to_le32(0xC01A0002) -#define STATUS_LOG_SECTOR_REMAPPED cpu_to_le32(0xC01A0003) -#define STATUS_LOG_BLOCK_INCOMPLETE cpu_to_le32(0xC01A0004) -#define STATUS_LOG_INVALID_RANGE cpu_to_le32(0xC01A0005) -#define STATUS_LOG_BLOCKS_EXHAUSTED cpu_to_le32(0xC01A0006) -#define STATUS_LOG_READ_CONTEXT_INVALID cpu_to_le32(0xC01A0007) -#define STATUS_LOG_RESTART_INVALID cpu_to_le32(0xC01A0008) -#define STATUS_LOG_BLOCK_VERSION cpu_to_le32(0xC01A0009) -#define STATUS_LOG_BLOCK_INVALID cpu_to_le32(0xC01A000A) 
-#define STATUS_LOG_READ_MODE_INVALID cpu_to_le32(0xC01A000B)
-#define STATUS_LOG_METADATA_CORRUPT cpu_to_le32(0xC01A000D)
-#define STATUS_LOG_METADATA_INVALID cpu_to_le32(0xC01A000E)
-#define STATUS_LOG_METADATA_INCONSISTENT cpu_to_le32(0xC01A000F)
-#define STATUS_LOG_RESERVATION_INVALID cpu_to_le32(0xC01A0010)
-#define STATUS_LOG_CANT_DELETE cpu_to_le32(0xC01A0011)
-#define STATUS_LOG_CONTAINER_LIMIT_EXCEEDED cpu_to_le32(0xC01A0012)
-#define STATUS_LOG_START_OF_LOG cpu_to_le32(0xC01A0013)
-#define STATUS_LOG_POLICY_ALREADY_INSTALLED cpu_to_le32(0xC01A0014)
-#define STATUS_LOG_POLICY_NOT_INSTALLED cpu_to_le32(0xC01A0015)
-#define STATUS_LOG_POLICY_INVALID cpu_to_le32(0xC01A0016)
-#define STATUS_LOG_POLICY_CONFLICT cpu_to_le32(0xC01A0017)
-#define STATUS_LOG_PINNED_ARCHIVE_TAIL cpu_to_le32(0xC01A0018)
-#define STATUS_LOG_RECORD_NONEXISTENT cpu_to_le32(0xC01A0019)
-#define STATUS_LOG_RECORDS_RESERVED_INVALID cpu_to_le32(0xC01A001A)
-#define STATUS_LOG_SPACE_RESERVED_INVALID cpu_to_le32(0xC01A001B)
-#define STATUS_LOG_TAIL_INVALID cpu_to_le32(0xC01A001C)
-#define STATUS_LOG_FULL cpu_to_le32(0xC01A001D)
-#define STATUS_LOG_MULTIPLEXED cpu_to_le32(0xC01A001E)
-#define STATUS_LOG_DEDICATED cpu_to_le32(0xC01A001F)
-#define STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS cpu_to_le32(0xC01A0020)
-#define STATUS_LOG_ARCHIVE_IN_PROGRESS cpu_to_le32(0xC01A0021)
-#define STATUS_LOG_EPHEMERAL cpu_to_le32(0xC01A0022)
-#define STATUS_LOG_NOT_ENOUGH_CONTAINERS cpu_to_le32(0xC01A0023)
-#define STATUS_LOG_CLIENT_ALREADY_REGISTERED cpu_to_le32(0xC01A0024)
-#define STATUS_LOG_CLIENT_NOT_REGISTERED cpu_to_le32(0xC01A0025)
-#define STATUS_LOG_FULL_HANDLER_IN_PROGRESS cpu_to_le32(0xC01A0026)
-#define STATUS_LOG_CONTAINER_READ_FAILED cpu_to_le32(0xC01A0027)
-#define STATUS_LOG_CONTAINER_WRITE_FAILED cpu_to_le32(0xC01A0028)
-#define STATUS_LOG_CONTAINER_OPEN_FAILED cpu_to_le32(0xC01A0029)
-#define STATUS_LOG_CONTAINER_STATE_INVALID cpu_to_le32(0xC01A002A)
-#define STATUS_LOG_STATE_INVALID cpu_to_le32(0xC01A002B)
-#define STATUS_LOG_PINNED cpu_to_le32(0xC01A002C)
-#define STATUS_LOG_METADATA_FLUSH_FAILED cpu_to_le32(0xC01A002D)
-#define STATUS_LOG_INCONSISTENT_SECURITY cpu_to_le32(0xC01A002E)
-#define STATUS_LOG_APPENDED_FLUSH_FAILED cpu_to_le32(0xC01A002F)
-#define STATUS_LOG_PINNED_RESERVATION cpu_to_le32(0xC01A0030)
-#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD cpu_to_le32(0xC01B00EA)
-#define STATUS_FLT_NO_HANDLER_DEFINED cpu_to_le32(0xC01C0001)
-#define STATUS_FLT_CONTEXT_ALREADY_DEFINED cpu_to_le32(0xC01C0002)
-#define STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST cpu_to_le32(0xC01C0003)
-#define STATUS_FLT_DISALLOW_FAST_IO cpu_to_le32(0xC01C0004)
-#define STATUS_FLT_INVALID_NAME_REQUEST cpu_to_le32(0xC01C0005)
-#define STATUS_FLT_NOT_SAFE_TO_POST_OPERATION cpu_to_le32(0xC01C0006)
-#define STATUS_FLT_NOT_INITIALIZED cpu_to_le32(0xC01C0007)
-#define STATUS_FLT_FILTER_NOT_READY cpu_to_le32(0xC01C0008)
-#define STATUS_FLT_POST_OPERATION_CLEANUP cpu_to_le32(0xC01C0009)
-#define STATUS_FLT_INTERNAL_ERROR cpu_to_le32(0xC01C000A)
-#define STATUS_FLT_DELETING_OBJECT cpu_to_le32(0xC01C000B)
-#define STATUS_FLT_MUST_BE_NONPAGED_POOL cpu_to_le32(0xC01C000C)
-#define STATUS_FLT_DUPLICATE_ENTRY cpu_to_le32(0xC01C000D)
-#define STATUS_FLT_CBDQ_DISABLED cpu_to_le32(0xC01C000E)
-#define STATUS_FLT_DO_NOT_ATTACH cpu_to_le32(0xC01C000F)
-#define STATUS_FLT_DO_NOT_DETACH cpu_to_le32(0xC01C0010)
-#define STATUS_FLT_INSTANCE_ALTITUDE_COLLISION cpu_to_le32(0xC01C0011)
-#define STATUS_FLT_INSTANCE_NAME_COLLISION cpu_to_le32(0xC01C0012)
-#define STATUS_FLT_FILTER_NOT_FOUND cpu_to_le32(0xC01C0013)
-#define STATUS_FLT_VOLUME_NOT_FOUND cpu_to_le32(0xC01C0014)
-#define STATUS_FLT_INSTANCE_NOT_FOUND cpu_to_le32(0xC01C0015)
-#define STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND cpu_to_le32(0xC01C0016)
-#define STATUS_FLT_INVALID_CONTEXT_REGISTRATION cpu_to_le32(0xC01C0017)
-#define STATUS_FLT_NAME_CACHE_MISS cpu_to_le32(0xC01C0018)
-#define STATUS_FLT_NO_DEVICE_OBJECT cpu_to_le32(0xC01C0019)
-#define STATUS_FLT_VOLUME_ALREADY_MOUNTED cpu_to_le32(0xC01C001A)
-#define STATUS_FLT_ALREADY_ENLISTED cpu_to_le32(0xC01C001B)
-#define STATUS_FLT_CONTEXT_ALREADY_LINKED cpu_to_le32(0xC01C001C)
-#define STATUS_FLT_NO_WAITER_FOR_REPLY cpu_to_le32(0xC01C0020)
-#define STATUS_MONITOR_NO_DESCRIPTOR cpu_to_le32(0xC01D0001)
-#define STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT cpu_to_le32(0xC01D0002)
-#define STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM cpu_to_le32(0xC01D0003)
-#define STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK cpu_to_le32(0xC01D0004)
-#define STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED cpu_to_le32(0xC01D0005)
-#define STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK \
-	cpu_to_le32(0xC01D0006)
-#define STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK \
-	cpu_to_le32(0xC01D0007)
-#define STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA cpu_to_le32(0xC01D0008)
-#define STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK cpu_to_le32(0xC01D0009)
-#define STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER cpu_to_le32(0xC01E0000)
-#define STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER cpu_to_le32(0xC01E0001)
-#define STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER cpu_to_le32(0xC01E0002)
-#define STATUS_GRAPHICS_ADAPTER_WAS_RESET cpu_to_le32(0xC01E0003)
-#define STATUS_GRAPHICS_INVALID_DRIVER_MODEL cpu_to_le32(0xC01E0004)
-#define STATUS_GRAPHICS_PRESENT_MODE_CHANGED cpu_to_le32(0xC01E0005)
-#define STATUS_GRAPHICS_PRESENT_OCCLUDED cpu_to_le32(0xC01E0006)
-#define STATUS_GRAPHICS_PRESENT_DENIED cpu_to_le32(0xC01E0007)
-#define STATUS_GRAPHICS_CANNOTCOLORCONVERT cpu_to_le32(0xC01E0008)
-#define STATUS_GRAPHICS_NO_VIDEO_MEMORY cpu_to_le32(0xC01E0100)
-#define STATUS_GRAPHICS_CANT_LOCK_MEMORY cpu_to_le32(0xC01E0101)
-#define STATUS_GRAPHICS_ALLOCATION_BUSY cpu_to_le32(0xC01E0102)
-#define STATUS_GRAPHICS_TOO_MANY_REFERENCES cpu_to_le32(0xC01E0103)
-#define STATUS_GRAPHICS_TRY_AGAIN_LATER cpu_to_le32(0xC01E0104)
-#define STATUS_GRAPHICS_TRY_AGAIN_NOW cpu_to_le32(0xC01E0105)
-#define STATUS_GRAPHICS_ALLOCATION_INVALID cpu_to_le32(0xC01E0106)
-#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE cpu_to_le32(0xC01E0107)
-#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED cpu_to_le32(0xC01E0108)
-#define STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION cpu_to_le32(0xC01E0109)
-#define STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE cpu_to_le32(0xC01E0110)
-#define STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION cpu_to_le32(0xC01E0111)
-#define STATUS_GRAPHICS_ALLOCATION_CLOSED cpu_to_le32(0xC01E0112)
-#define STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE cpu_to_le32(0xC01E0113)
-#define STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE cpu_to_le32(0xC01E0114)
-#define STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE cpu_to_le32(0xC01E0115)
-#define STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST cpu_to_le32(0xC01E0116)
-#define STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE cpu_to_le32(0xC01E0200)
-#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY cpu_to_le32(0xC01E0300)
-#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED cpu_to_le32(0xC01E0301)
-#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED \
-	cpu_to_le32(0xC01E0302)
-#define STATUS_GRAPHICS_INVALID_VIDPN cpu_to_le32(0xC01E0303)
-#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE cpu_to_le32(0xC01E0304)
-#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET cpu_to_le32(0xC01E0305)
-#define STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED cpu_to_le32(0xC01E0306)
-#define STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET cpu_to_le32(0xC01E0308)
-#define STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET cpu_to_le32(0xC01E0309)
-#define STATUS_GRAPHICS_INVALID_FREQUENCY cpu_to_le32(0xC01E030A)
-#define STATUS_GRAPHICS_INVALID_ACTIVE_REGION cpu_to_le32(0xC01E030B)
-#define STATUS_GRAPHICS_INVALID_TOTAL_REGION cpu_to_le32(0xC01E030C)
-#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE \
-	cpu_to_le32(0xC01E0310)
-#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE \
-	cpu_to_le32(0xC01E0311)
-#define STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET cpu_to_le32(0xC01E0312)
-#define STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY cpu_to_le32(0xC01E0313)
-#define STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET cpu_to_le32(0xC01E0314)
-#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET cpu_to_le32(0xC01E0315)
-#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET cpu_to_le32(0xC01E0316)
-#define STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET cpu_to_le32(0xC01E0317)
-#define STATUS_GRAPHICS_TARGET_ALREADY_IN_SET cpu_to_le32(0xC01E0318)
-#define STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH cpu_to_le32(0xC01E0319)
-#define STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY cpu_to_le32(0xC01E031A)
-#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET \
-	cpu_to_le32(0xC01E031B)
-#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE cpu_to_le32(0xC01E031C)
-#define STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET cpu_to_le32(0xC01E031D)
-#define STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET cpu_to_le32(0xC01E031F)
-#define STATUS_GRAPHICS_STALE_MODESET cpu_to_le32(0xC01E0320)
-#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET cpu_to_le32(0xC01E0321)
-#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE cpu_to_le32(0xC01E0322)
-#define STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN cpu_to_le32(0xC01E0323)
-#define STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0324)
-#define STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION \
-	cpu_to_le32(0xC01E0325)
-#define STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES \
-	cpu_to_le32(0xC01E0326)
-#define STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0327)
-#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE \
-	cpu_to_le32(0xC01E0328)
-#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET \
-	cpu_to_le32(0xC01E0329)
-#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET cpu_to_le32(0xC01E032A)
-#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR cpu_to_le32(0xC01E032B)
-#define STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET cpu_to_le32(0xC01E032C)
-#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET cpu_to_le32(0xC01E032D)
-#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE \
-	cpu_to_le32(0xC01E032E)
-#define STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE cpu_to_le32(0xC01E032F)
-#define STATUS_GRAPHICS_RESOURCES_NOT_RELATED cpu_to_le32(0xC01E0330)
-#define STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0331)
-#define STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0332)
-#define STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET cpu_to_le32(0xC01E0333)
-#define STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER \
-	cpu_to_le32(0xC01E0334)
-#define STATUS_GRAPHICS_NO_VIDPNMGR cpu_to_le32(0xC01E0335)
-#define STATUS_GRAPHICS_NO_ACTIVE_VIDPN cpu_to_le32(0xC01E0336)
-#define STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY cpu_to_le32(0xC01E0337)
-#define STATUS_GRAPHICS_MONITOR_NOT_CONNECTED cpu_to_le32(0xC01E0338)
-#define STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0339)
-#define STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE cpu_to_le32(0xC01E033A)
-#define STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE cpu_to_le32(0xC01E033B)
-#define STATUS_GRAPHICS_INVALID_STRIDE cpu_to_le32(0xC01E033C)
-#define STATUS_GRAPHICS_INVALID_PIXELFORMAT cpu_to_le32(0xC01E033D)
-#define STATUS_GRAPHICS_INVALID_COLORBASIS cpu_to_le32(0xC01E033E)
-#define STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE cpu_to_le32(0xC01E033F)
-#define STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0340)
-#define STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT \
-	cpu_to_le32(0xC01E0341)
-#define STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE cpu_to_le32(0xC01E0342)
-#define STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN cpu_to_le32(0xC01E0343)
-#define STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL cpu_to_le32(0xC01E0344)
-#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION \
-	cpu_to_le32(0xC01E0345)
-#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED \
-	cpu_to_le32(0xC01E0346)
-#define STATUS_GRAPHICS_INVALID_GAMMA_RAMP cpu_to_le32(0xC01E0347)
-#define STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED cpu_to_le32(0xC01E0348)
-#define STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED cpu_to_le32(0xC01E0349)
-#define STATUS_GRAPHICS_MODE_NOT_IN_MODESET cpu_to_le32(0xC01E034A)
-#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON \
-	cpu_to_le32(0xC01E034D)
-#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE cpu_to_le32(0xC01E034E)
-#define STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE cpu_to_le32(0xC01E034F)
-#define STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS \
-	cpu_to_le32(0xC01E0350)
-#define STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING cpu_to_le32(0xC01E0352)
-#define STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED cpu_to_le32(0xC01E0353)
-#define STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS cpu_to_le32(0xC01E0354)
-#define STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT cpu_to_le32(0xC01E0355)
-#define STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM cpu_to_le32(0xC01E0356)
-#define STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN \
-	cpu_to_le32(0xC01E0357)
-#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT \
-	cpu_to_le32(0xC01E0358)
-#define STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED cpu_to_le32(0xC01E0359)
-#define STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION \
-	cpu_to_le32(0xC01E035A)
-#define STATUS_GRAPHICS_INVALID_CLIENT_TYPE cpu_to_le32(0xC01E035B)
-#define STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET cpu_to_le32(0xC01E035C)
-#define STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED \
-	cpu_to_le32(0xC01E0400)
-#define STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED cpu_to_le32(0xC01E0401)
-#define STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER cpu_to_le32(0xC01E0430)
-#define STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED cpu_to_le32(0xC01E0431)
-#define STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED cpu_to_le32(0xC01E0432)
-#define STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY cpu_to_le32(0xC01E0433)
-#define STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED cpu_to_le32(0xC01E0434)
-#define STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON cpu_to_le32(0xC01E0435)
-#define STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE cpu_to_le32(0xC01E0436)
-#define STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER cpu_to_le32(0xC01E0438)
-#define STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED cpu_to_le32(0xC01E043B)
-#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS \
-	cpu_to_le32(0xC01E051C)
-#define STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST cpu_to_le32(0xC01E051D)
-#define STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR cpu_to_le32(0xC01E051E)
-#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS \
-	cpu_to_le32(0xC01E051F)
-#define STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED cpu_to_le32(0xC01E0520)
-#define STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST \
-	cpu_to_le32(0xC01E0521)
-#define STATUS_GRAPHICS_OPM_NOT_SUPPORTED cpu_to_le32(0xC01E0500)
-#define STATUS_GRAPHICS_COPP_NOT_SUPPORTED cpu_to_le32(0xC01E0501)
-#define STATUS_GRAPHICS_UAB_NOT_SUPPORTED cpu_to_le32(0xC01E0502)
-#define STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS cpu_to_le32(0xC01E0503)
-#define STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL cpu_to_le32(0xC01E0504)
-#define STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST cpu_to_le32(0xC01E0505)
-#define STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME \
-	cpu_to_le32(0xC01E0506)
-#define STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP \
-	cpu_to_le32(0xC01E0507)
-#define STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED \
-	cpu_to_le32(0xC01E0508)
-#define STATUS_GRAPHICS_OPM_INVALID_POINTER cpu_to_le32(0xC01E050A)
-#define STATUS_GRAPHICS_OPM_INTERNAL_ERROR cpu_to_le32(0xC01E050B)
-#define STATUS_GRAPHICS_OPM_INVALID_HANDLE cpu_to_le32(0xC01E050C)
-#define STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE \
-	cpu_to_le32(0xC01E050D)
-#define STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH cpu_to_le32(0xC01E050E)
-#define STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED cpu_to_le32(0xC01E050F)
-#define STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED cpu_to_le32(0xC01E0510)
-#define STATUS_GRAPHICS_PVP_HFS_FAILED cpu_to_le32(0xC01E0511)
-#define STATUS_GRAPHICS_OPM_INVALID_SRM cpu_to_le32(0xC01E0512)
-#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP cpu_to_le32(0xC01E0513)
-#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP cpu_to_le32(0xC01E0514)
-#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA \
-	cpu_to_le32(0xC01E0515)
-#define STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET cpu_to_le32(0xC01E0516)
-#define STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH cpu_to_le32(0xC01E0517)
-#define STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE \
-	cpu_to_le32(0xC01E0518)
-#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS \
-	cpu_to_le32(0xC01E051A)
-#define STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS \
-	cpu_to_le32(0xC01E051B)
-#define STATUS_GRAPHICS_I2C_NOT_SUPPORTED cpu_to_le32(0xC01E0580)
-#define STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST cpu_to_le32(0xC01E0581)
-#define STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA cpu_to_le32(0xC01E0582)
-#define STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA cpu_to_le32(0xC01E0583)
-#define STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED cpu_to_le32(0xC01E0584)
-#define STATUS_GRAPHICS_DDCCI_INVALID_DATA cpu_to_le32(0xC01E0585)
-#define STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE \
-	cpu_to_le32(0xC01E0586)
-#define STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING \
-	cpu_to_le32(0xC01E0587)
-#define STATUS_GRAPHICS_MCA_INTERNAL_ERROR cpu_to_le32(0xC01E0588)
-#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND cpu_to_le32(0xC01E0589)
-#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH cpu_to_le32(0xC01E058A)
-#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM cpu_to_le32(0xC01E058B)
-#define STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE cpu_to_le32(0xC01E058C)
-#define STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS cpu_to_le32(0xC01E058D)
-#define STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED cpu_to_le32(0xC01E05E0)
-#define STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME \
-	cpu_to_le32(0xC01E05E1)
-#define STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP \
-	cpu_to_le32(0xC01E05E2)
-#define STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED cpu_to_le32(0xC01E05E3)
-#define STATUS_GRAPHICS_INVALID_POINTER cpu_to_le32(0xC01E05E4)
-#define STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE \
-	cpu_to_le32(0xC01E05E5)
-#define STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL cpu_to_le32(0xC01E05E6)
-#define STATUS_GRAPHICS_INTERNAL_ERROR cpu_to_le32(0xC01E05E7)
-#define STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS cpu_to_le32(0xC01E05E8)
-#define STATUS_FVE_LOCKED_VOLUME cpu_to_le32(0xC0210000)
-#define STATUS_FVE_NOT_ENCRYPTED cpu_to_le32(0xC0210001)
-#define STATUS_FVE_BAD_INFORMATION cpu_to_le32(0xC0210002)
-#define STATUS_FVE_TOO_SMALL cpu_to_le32(0xC0210003)
-#define STATUS_FVE_FAILED_WRONG_FS cpu_to_le32(0xC0210004)
-#define STATUS_FVE_FAILED_BAD_FS cpu_to_le32(0xC0210005)
-#define STATUS_FVE_FS_NOT_EXTENDED cpu_to_le32(0xC0210006)
-#define STATUS_FVE_FS_MOUNTED cpu_to_le32(0xC0210007)
-#define STATUS_FVE_NO_LICENSE cpu_to_le32(0xC0210008)
-#define STATUS_FVE_ACTION_NOT_ALLOWED cpu_to_le32(0xC0210009)
-#define STATUS_FVE_BAD_DATA cpu_to_le32(0xC021000A)
-#define STATUS_FVE_VOLUME_NOT_BOUND cpu_to_le32(0xC021000B)
-#define STATUS_FVE_NOT_DATA_VOLUME cpu_to_le32(0xC021000C)
-#define STATUS_FVE_CONV_READ_ERROR cpu_to_le32(0xC021000D)
-#define STATUS_FVE_CONV_WRITE_ERROR cpu_to_le32(0xC021000E)
-#define STATUS_FVE_OVERLAPPED_UPDATE cpu_to_le32(0xC021000F)
-#define STATUS_FVE_FAILED_SECTOR_SIZE cpu_to_le32(0xC0210010)
-#define STATUS_FVE_FAILED_AUTHENTICATION cpu_to_le32(0xC0210011)
-#define STATUS_FVE_NOT_OS_VOLUME cpu_to_le32(0xC0210012)
-#define STATUS_FVE_KEYFILE_NOT_FOUND cpu_to_le32(0xC0210013)
-#define STATUS_FVE_KEYFILE_INVALID cpu_to_le32(0xC0210014)
-#define STATUS_FVE_KEYFILE_NO_VMK cpu_to_le32(0xC0210015)
-#define STATUS_FVE_TPM_DISABLED cpu_to_le32(0xC0210016)
-#define STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO cpu_to_le32(0xC0210017)
-#define STATUS_FVE_TPM_INVALID_PCR cpu_to_le32(0xC0210018)
-#define STATUS_FVE_TPM_NO_VMK cpu_to_le32(0xC0210019)
-#define STATUS_FVE_PIN_INVALID cpu_to_le32(0xC021001A)
-#define STATUS_FVE_AUTH_INVALID_APPLICATION cpu_to_le32(0xC021001B)
-#define STATUS_FVE_AUTH_INVALID_CONFIG cpu_to_le32(0xC021001C)
-#define STATUS_FVE_DEBUGGER_ENABLED cpu_to_le32(0xC021001D)
-#define STATUS_FVE_DRY_RUN_FAILED cpu_to_le32(0xC021001E)
-#define STATUS_FVE_BAD_METADATA_POINTER cpu_to_le32(0xC021001F)
-#define STATUS_FVE_OLD_METADATA_COPY cpu_to_le32(0xC0210020)
-#define STATUS_FVE_REBOOT_REQUIRED cpu_to_le32(0xC0210021)
-#define STATUS_FVE_RAW_ACCESS cpu_to_le32(0xC0210022)
-#define STATUS_FVE_RAW_BLOCKED cpu_to_le32(0xC0210023)
-#define STATUS_FWP_CALLOUT_NOT_FOUND cpu_to_le32(0xC0220001)
-#define STATUS_FWP_CONDITION_NOT_FOUND cpu_to_le32(0xC0220002)
-#define STATUS_FWP_FILTER_NOT_FOUND cpu_to_le32(0xC0220003)
-#define STATUS_FWP_LAYER_NOT_FOUND cpu_to_le32(0xC0220004)
-#define STATUS_FWP_PROVIDER_NOT_FOUND cpu_to_le32(0xC0220005)
-#define STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND cpu_to_le32(0xC0220006)
-#define STATUS_FWP_SUBLAYER_NOT_FOUND cpu_to_le32(0xC0220007)
-#define STATUS_FWP_NOT_FOUND cpu_to_le32(0xC0220008)
-#define STATUS_FWP_ALREADY_EXISTS cpu_to_le32(0xC0220009)
-#define STATUS_FWP_IN_USE cpu_to_le32(0xC022000A)
-#define STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS cpu_to_le32(0xC022000B)
-#define STATUS_FWP_WRONG_SESSION cpu_to_le32(0xC022000C)
-#define STATUS_FWP_NO_TXN_IN_PROGRESS cpu_to_le32(0xC022000D)
-#define STATUS_FWP_TXN_IN_PROGRESS cpu_to_le32(0xC022000E)
-#define STATUS_FWP_TXN_ABORTED cpu_to_le32(0xC022000F)
-#define STATUS_FWP_SESSION_ABORTED cpu_to_le32(0xC0220010)
-#define STATUS_FWP_INCOMPATIBLE_TXN cpu_to_le32(0xC0220011)
-#define STATUS_FWP_TIMEOUT cpu_to_le32(0xC0220012)
-#define STATUS_FWP_NET_EVENTS_DISABLED cpu_to_le32(0xC0220013)
-#define STATUS_FWP_INCOMPATIBLE_LAYER cpu_to_le32(0xC0220014)
-#define STATUS_FWP_KM_CLIENTS_ONLY cpu_to_le32(0xC0220015)
-#define STATUS_FWP_LIFETIME_MISMATCH cpu_to_le32(0xC0220016)
-#define STATUS_FWP_BUILTIN_OBJECT cpu_to_le32(0xC0220017)
-#define STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS cpu_to_le32(0xC0220018)
-#define STATUS_FWP_TOO_MANY_CALLOUTS cpu_to_le32(0xC0220018)
-#define STATUS_FWP_NOTIFICATION_DROPPED cpu_to_le32(0xC0220019)
-#define STATUS_FWP_TRAFFIC_MISMATCH cpu_to_le32(0xC022001A)
-#define STATUS_FWP_INCOMPATIBLE_SA_STATE cpu_to_le32(0xC022001B)
-#define STATUS_FWP_NULL_POINTER cpu_to_le32(0xC022001C)
-#define STATUS_FWP_INVALID_ENUMERATOR cpu_to_le32(0xC022001D)
-#define STATUS_FWP_INVALID_FLAGS cpu_to_le32(0xC022001E)
-#define STATUS_FWP_INVALID_NET_MASK cpu_to_le32(0xC022001F)
-#define STATUS_FWP_INVALID_RANGE cpu_to_le32(0xC0220020)
-#define STATUS_FWP_INVALID_INTERVAL cpu_to_le32(0xC0220021)
-#define STATUS_FWP_ZERO_LENGTH_ARRAY cpu_to_le32(0xC0220022)
-#define STATUS_FWP_NULL_DISPLAY_NAME cpu_to_le32(0xC0220023)
-#define STATUS_FWP_INVALID_ACTION_TYPE cpu_to_le32(0xC0220024)
-#define STATUS_FWP_INVALID_WEIGHT cpu_to_le32(0xC0220025)
-#define STATUS_FWP_MATCH_TYPE_MISMATCH cpu_to_le32(0xC0220026)
-#define STATUS_FWP_TYPE_MISMATCH cpu_to_le32(0xC0220027)
-#define STATUS_FWP_OUT_OF_BOUNDS cpu_to_le32(0xC0220028)
-#define STATUS_FWP_RESERVED cpu_to_le32(0xC0220029)
-#define STATUS_FWP_DUPLICATE_CONDITION cpu_to_le32(0xC022002A)
-#define STATUS_FWP_DUPLICATE_KEYMOD cpu_to_le32(0xC022002B)
-#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER cpu_to_le32(0xC022002C)
-#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER cpu_to_le32(0xC022002D)
-#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER cpu_to_le32(0xC022002E)
-#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT cpu_to_le32(0xC022002F)
-#define STATUS_FWP_INCOMPATIBLE_AUTH_METHOD cpu_to_le32(0xC0220030)
-#define STATUS_FWP_INCOMPATIBLE_DH_GROUP cpu_to_le32(0xC0220031)
-#define STATUS_FWP_EM_NOT_SUPPORTED cpu_to_le32(0xC0220032)
-#define STATUS_FWP_NEVER_MATCH cpu_to_le32(0xC0220033)
-#define STATUS_FWP_PROVIDER_CONTEXT_MISMATCH cpu_to_le32(0xC0220034)
-#define STATUS_FWP_INVALID_PARAMETER cpu_to_le32(0xC0220035)
-#define STATUS_FWP_TOO_MANY_SUBLAYERS cpu_to_le32(0xC0220036)
-#define STATUS_FWP_CALLOUT_NOTIFICATION_FAILED cpu_to_le32(0xC0220037)
-#define STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG cpu_to_le32(0xC0220038)
-#define STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG cpu_to_le32(0xC0220039)
-#define STATUS_FWP_TCPIP_NOT_READY cpu_to_le32(0xC0220100)
-#define STATUS_FWP_INJECT_HANDLE_CLOSING cpu_to_le32(0xC0220101)
-#define STATUS_FWP_INJECT_HANDLE_STALE cpu_to_le32(0xC0220102)
-#define STATUS_FWP_CANNOT_PEND cpu_to_le32(0xC0220103)
-#define STATUS_NDIS_CLOSING cpu_to_le32(0xC0230002)
-#define STATUS_NDIS_BAD_VERSION cpu_to_le32(0xC0230004)
-#define STATUS_NDIS_BAD_CHARACTERISTICS cpu_to_le32(0xC0230005)
-#define STATUS_NDIS_ADAPTER_NOT_FOUND cpu_to_le32(0xC0230006)
-#define STATUS_NDIS_OPEN_FAILED cpu_to_le32(0xC0230007)
-#define STATUS_NDIS_DEVICE_FAILED cpu_to_le32(0xC0230008)
-#define STATUS_NDIS_MULTICAST_FULL cpu_to_le32(0xC0230009)
-#define STATUS_NDIS_MULTICAST_EXISTS cpu_to_le32(0xC023000A)
-#define STATUS_NDIS_MULTICAST_NOT_FOUND cpu_to_le32(0xC023000B)
-#define STATUS_NDIS_REQUEST_ABORTED cpu_to_le32(0xC023000C)
-#define STATUS_NDIS_RESET_IN_PROGRESS cpu_to_le32(0xC023000D)
-#define STATUS_NDIS_INVALID_PACKET cpu_to_le32(0xC023000F)
-#define STATUS_NDIS_INVALID_DEVICE_REQUEST cpu_to_le32(0xC0230010)
-#define STATUS_NDIS_ADAPTER_NOT_READY cpu_to_le32(0xC0230011)
-#define STATUS_NDIS_INVALID_LENGTH cpu_to_le32(0xC0230014)
-#define STATUS_NDIS_INVALID_DATA cpu_to_le32(0xC0230015)
-#define STATUS_NDIS_BUFFER_TOO_SHORT cpu_to_le32(0xC0230016)
-#define STATUS_NDIS_INVALID_OID cpu_to_le32(0xC0230017)
-#define STATUS_NDIS_ADAPTER_REMOVED cpu_to_le32(0xC0230018)
-#define STATUS_NDIS_UNSUPPORTED_MEDIA cpu_to_le32(0xC0230019)
-#define STATUS_NDIS_GROUP_ADDRESS_IN_USE cpu_to_le32(0xC023001A)
-#define STATUS_NDIS_FILE_NOT_FOUND cpu_to_le32(0xC023001B)
-#define STATUS_NDIS_ERROR_READING_FILE cpu_to_le32(0xC023001C)
-#define STATUS_NDIS_ALREADY_MAPPED cpu_to_le32(0xC023001D)
-#define STATUS_NDIS_RESOURCE_CONFLICT cpu_to_le32(0xC023001E)
-#define STATUS_NDIS_MEDIA_DISCONNECTED cpu_to_le32(0xC023001F)
-#define STATUS_NDIS_INVALID_ADDRESS cpu_to_le32(0xC0230022)
-#define STATUS_NDIS_PAUSED cpu_to_le32(0xC023002A)
-#define STATUS_NDIS_INTERFACE_NOT_FOUND cpu_to_le32(0xC023002B)
-#define STATUS_NDIS_UNSUPPORTED_REVISION cpu_to_le32(0xC023002C)
-#define STATUS_NDIS_INVALID_PORT cpu_to_le32(0xC023002D)
-#define STATUS_NDIS_INVALID_PORT_STATE cpu_to_le32(0xC023002E)
-#define STATUS_NDIS_LOW_POWER_STATE cpu_to_le32(0xC023002F)
-#define STATUS_NDIS_NOT_SUPPORTED cpu_to_le32(0xC02300BB)
-#define STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED cpu_to_le32(0xC0232000)
-#define STATUS_NDIS_DOT11_MEDIA_IN_USE cpu_to_le32(0xC0232001)
-#define STATUS_NDIS_DOT11_POWER_STATE_INVALID cpu_to_le32(0xC0232002)
-#define STATUS_IPSEC_BAD_SPI cpu_to_le32(0xC0360001)
-#define STATUS_IPSEC_SA_LIFETIME_EXPIRED cpu_to_le32(0xC0360002)
-#define STATUS_IPSEC_WRONG_SA cpu_to_le32(0xC0360003)
-#define STATUS_IPSEC_REPLAY_CHECK_FAILED cpu_to_le32(0xC0360004)
-#define STATUS_IPSEC_INVALID_PACKET cpu_to_le32(0xC0360005)
-#define STATUS_IPSEC_INTEGRITY_CHECK_FAILED cpu_to_le32(0xC0360006)
-#define STATUS_IPSEC_CLEAR_TEXT_DROP cpu_to_le32(0xC0360007)
-
-#define STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP cpu_to_le32(0xC05D0000)
-#define STATUS_INVALID_LOCK_RANGE cpu_to_le32(0xC00001a1)
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index cf4418f72772..44c87e300c16 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -21,7 +21,7 @@
 #include "glob.h"
 #include "connection.h"
 #include "smb_common.h"
-#include "smbstatus.h"
+#include "../common/smb2status.h"
 #include "transport_rdma.h"
 
 #define SMB_DIRECT_PORT_IWARP		5445
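The block above deletes fs/smb/server/smbstatus.h in favour of the shared ../common/smb2status.h; the constants themselves are unchanged. As the cpu_to_le32() wrappers show, SMB2 status codes travel little-endian on the wire, so handlers compare the raw wire value against a pre-swapped constant instead of byte-swapping every packet. A minimal sketch of that idiom (the helper name is illustrative and not part of this patch; 0x00000103 is the NT STATUS_PENDING value):

	/* the compare needs no runtime byte swap on any endianness */
	static inline bool smb2_status_is_pending(__le32 status)
	{
		return status == cpu_to_le32(0x00000103);
	}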
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index a84788396daa..aaed9e293b2e 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -624,8 +624,10 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz)
 		for_each_netdev(&init_net, netdev) {
 			if (netif_is_bridge_port(netdev))
 				continue;
-			if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL)))
+			if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL))) {
+				rtnl_unlock();
 				return -ENOMEM;
+			}
 		}
 		rtnl_unlock();
 		bind_additional_ifaces = 1;
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 9e859ba010cf..7cbd580120d1 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -496,7 +496,7 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
 	int err = 0;
 
 	if (work->conn->connection_type) {
-		if (!(fp->daccess & FILE_WRITE_DATA_LE)) {
+		if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) {
 			pr_err("no right to write(%pD)\n", fp->filp);
 			err = -EACCES;
 			goto out;
@@ -1115,9 +1115,10 @@ static bool __dir_empty(struct dir_context *ctx, const char *name, int namlen,
 	struct ksmbd_readdir_data *buf;
 
 	buf = container_of(ctx, struct ksmbd_readdir_data, ctx);
-	buf->dirent_count++;
+	if (!is_dot_dotdot(name, namlen))
+		buf->dirent_count++;
 
-	return buf->dirent_count <= 2;
+	return !buf->dirent_count;
 }
 
 /**
@@ -1137,7 +1138,7 @@ int ksmbd_vfs_empty_dir(struct ksmbd_file *fp)
 	readdir_data.dirent_count = 0;
 
 	err = iterate_dir(fp->filp, &readdir_data.ctx);
-	if (readdir_data.dirent_count > 2)
+	if (readdir_data.dirent_count)
 		err = -ENOTEMPTY;
 	else
 		err = 0;
@@ -1166,7 +1167,7 @@ static bool __caseless_lookup(struct dir_context *ctx, const char *name,
 		if (cmp < 0)
 			cmp = strncasecmp((char *)buf->private, name, namlen);
 		if (!cmp) {
-			memcpy((char *)buf->private, name, namlen);
+			memcpy((char *)buf->private, name, buf->used);
 			buf->dirent_count = 1;
 			return false;
 		}
@@ -1234,10 +1235,7 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
 	char *filepath;
 	size_t path_len, remain_len;
 
-	filepath = kstrdup(name, GFP_KERNEL);
-	if (!filepath)
-		return -ENOMEM;
-
+	filepath = name;
 	path_len = strlen(filepath);
 	remain_len = path_len;
 
@@ -1280,10 +1278,9 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
 		err = -EINVAL;
 out2:
 	path_put(parent_path);
-out1:
-	kfree(filepath);
 }
 
+out1:
 	if (!err) {
 		err = mnt_want_write(parent_path->mnt);
 		if (err) {
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 4d4ee696e37c..a19f4e563c7e 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -863,6 +863,8 @@ static bool session_fd_check(struct ksmbd_tree_connect *tcon,
 	list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) {
 		if (op->conn != conn)
 			continue;
+		if (op->conn && atomic_dec_and_test(&op->conn->refcnt))
+			kfree(op->conn);
 		op->conn = NULL;
 	}
 	up_write(&ci->m_lock);
@@ -965,6 +967,7 @@ int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp)
 		if (op->conn)
 			continue;
 		op->conn = fp->conn;
+		atomic_inc(&op->conn->refcnt);
 	}
 	up_write(&ci->m_lock);
diff --git a/fs/smb/server/xattr.h b/fs/smb/server/xattr.h
index 16499ca5c82d..fa3e27d6971b 100644
--- a/fs/smb/server/xattr.h
+++ b/fs/smb/server/xattr.h
@@ -76,7 +76,7 @@ struct xattr_acl_entry {
 struct xattr_smb_acl {
 	int count;
 	int next;
-	struct xattr_acl_entry entries[];
+	struct xattr_acl_entry entries[] __counted_by(count);
};

 /* 64bytes hash in xattr_ntacl is computed with sha256 */
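The ksmbd directory-emptiness fix above changes the rule from "tolerate up to two entries of any name" to "count only entries that are not '.' or '..'", so a directory is empty exactly when dirent_count stays zero. A standalone sketch of the name test the new code relies on (is_dot_dotdot() is the kernel helper actually called above; this version only illustrates its semantics):

	/* true only for the names "." and ".." */
	static bool name_is_dot_dotdot(const char *name, int namlen)
	{
		if (namlen == 1)
			return name[0] == '.';
		return namlen == 2 && name[0] == '.' && name[1] == '.';
	}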
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index a8c1e7f9a609..21aaa96856c1 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -494,39 +494,73 @@ out:
 }
 
 static int squashfs_readahead_fragment(struct page **page,
-	unsigned int pages, unsigned int expected)
+	unsigned int pages, unsigned int expected, loff_t start)
 {
 	struct inode *inode = page[0]->mapping->host;
 	struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
 		squashfs_i(inode)->fragment_block,
 		squashfs_i(inode)->fragment_size);
 	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
-	unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
-	int error = buffer->error;
+	int i, bytes, copied;
+	struct squashfs_page_actor *actor;
+	unsigned int offset;
+	void *addr;
+	struct page *last_page;
+
+	if (buffer->error)
+		goto out;
 
-	if (error)
+	actor = squashfs_page_actor_init_special(msblk, page, pages,
+			expected, start);
+	if (!actor)
 		goto out;
 
-	expected += squashfs_i(inode)->fragment_offset;
+	squashfs_actor_nobuff(actor);
+	addr = squashfs_first_page(actor);
+
+	for (copied = offset = 0; offset < expected; offset += PAGE_SIZE) {
+		int avail = min_t(int, expected - offset, PAGE_SIZE);
+
+		if (!IS_ERR(addr)) {
+			bytes = squashfs_copy_data(addr, buffer, offset +
+				squashfs_i(inode)->fragment_offset, avail);
+
+			if (bytes != avail)
+				goto failed;
+		}
+
+		copied += avail;
+		addr = squashfs_next_page(actor);
+	}
 
-	for (n = 0; n < pages; n++) {
-		unsigned int base = (page[n]->index & mask) << PAGE_SHIFT;
-		unsigned int offset = base + squashfs_i(inode)->fragment_offset;
+	last_page = squashfs_page_actor_free(actor);
 
-		if (expected > offset) {
-			unsigned int avail = min_t(unsigned int, expected -
-				offset, PAGE_SIZE);
+	if (copied == expected && !IS_ERR(last_page)) {
+		/* Last page (if present) may have trailing bytes not filled */
+		bytes = copied % PAGE_SIZE;
+		if (bytes && last_page)
+			memzero_page(last_page, bytes, PAGE_SIZE - bytes);
 
-			squashfs_fill_page(page[n], buffer, offset, avail);
+		for (i = 0; i < pages; i++) {
+			flush_dcache_page(page[i]);
+			SetPageUptodate(page[i]);
 		}
+	}
 
-		unlock_page(page[n]);
-		put_page(page[n]);
+	for (i = 0; i < pages; i++) {
+		unlock_page(page[i]);
+		put_page(page[i]);
 	}
 
+	squashfs_cache_put(buffer);
+	return 0;
+
+failed:
+	squashfs_page_actor_free(actor);
+
 out:
 	squashfs_cache_put(buffer);
-	return error;
+	return 1;
 }
 
 static void squashfs_readahead(struct readahead_control *ractl)
@@ -551,7 +585,6 @@ static void squashfs_readahead(struct readahead_control *ractl)
 		return;
 
 	for (;;) {
-		pgoff_t index;
 		int res, bsize;
 		u64 block = 0;
 		unsigned int expected;
@@ -570,26 +603,21 @@ static void squashfs_readahead(struct readahead_control *ractl)
 		if (readahead_pos(ractl) >= i_size_read(inode))
 			goto skip_pages;
 
-		index = pages[0]->index >> shift;
-
-		if ((pages[nr_pages - 1]->index >> shift) != index)
-			goto skip_pages;
-
-		if (index == file_end && squashfs_i(inode)->fragment_block !=
-						SQUASHFS_INVALID_BLK) {
+		if (start >> msblk->block_log == file_end &&
+		    squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) {
 			res = squashfs_readahead_fragment(pages, nr_pages,
-							  expected);
+							  expected, start);
 			if (res)
 				goto skip_pages;
 			continue;
 		}
 
-		bsize = read_blocklist(inode, index, &block);
+		bsize = read_blocklist(inode, start >> msblk->block_log, &block);
 		if (bsize == 0)
 			goto skip_pages;
 
 		actor = squashfs_page_actor_init_special(msblk, pages, nr_pages,
-							 expected);
+							 expected, start);
 		if (!actor)
 			goto skip_pages;
 
@@ -597,12 +625,12 @@ static void squashfs_readahead(struct readahead_control *ractl)
 
 		last_page = squashfs_page_actor_free(actor);
 
-		if (res == expected) {
+		if (res == expected && !IS_ERR(last_page)) {
 			int bytes;
 
 			/* Last page (if present) may have trailing bytes not filled */
 			bytes = res % PAGE_SIZE;
-			if (index == file_end && bytes && last_page)
+			if (start >> msblk->block_log == file_end && bytes && last_page)
 				memzero_page(last_page, bytes,
 					     PAGE_SIZE - bytes);
@@ -616,6 +644,8 @@ static void squashfs_readahead(struct readahead_control *ractl)
 			unlock_page(pages[i]);
 			put_page(pages[i]);
 		}
+
+		start += readahead_batch_length(ractl);
 	}
 
 	kfree(pages);
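The readahead rework above stops deriving the file position from page->index and instead carries an explicit byte offset, start, advanced by readahead_batch_length() after each batch. Two consequences of that are worth spelling out as arithmetic; a hedged sketch, assuming a block that decompressed to res bytes across PAGE_SIZE pages:

	/* which squashfs block does a byte offset fall in? */
	u64 blk = start >> msblk->block_log;

	/* the last page holds res % PAGE_SIZE valid bytes; zero the tail */
	int bytes = res % PAGE_SIZE;
	if (bytes && last_page)
		memzero_page(last_page, bytes, PAGE_SIZE - bytes);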
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 2a689ce71de9..22251743fadf 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -23,15 +23,15 @@
 int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 			    int expected)
 {
+	struct folio *folio = page_folio(target_page);
 	struct inode *inode = target_page->mapping->host;
 	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
-	loff_t file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 	int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
-	loff_t start_index = target_page->index & ~mask;
+	loff_t start_index = folio->index & ~mask;
 	loff_t end_index = start_index | mask;
 	int i, n, pages, bytes, res = -ENOMEM;
-	struct page **page;
+	struct page **page, *last_page;
 	struct squashfs_page_actor *actor;
 	void *pageaddr;
 
@@ -46,7 +46,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 
 	/* Try to grab all the pages covered by the Squashfs block */
 	for (i = 0, n = start_index; n <= end_index; n++) {
-		page[i] = (n == target_page->index) ? target_page :
+		page[i] = (n == folio->index) ? target_page :
 			grab_cache_page_nowait(target_page->mapping, n);
 
 		if (page[i] == NULL)
@@ -67,27 +67,28 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 	 * Create a "page actor" which will kmap and kunmap the
 	 * page cache pages appropriately within the decompressor
 	 */
-	actor = squashfs_page_actor_init_special(msblk, page, pages, expected);
+	actor = squashfs_page_actor_init_special(msblk, page, pages, expected,
+						 start_index << PAGE_SHIFT);
 	if (actor == NULL)
 		goto out;
 
 	/* Decompress directly into the page cache buffers */
 	res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
 
-	squashfs_page_actor_free(actor);
+	last_page = squashfs_page_actor_free(actor);
 
 	if (res < 0)
 		goto mark_errored;
 
-	if (res != expected) {
+	if (res != expected || IS_ERR(last_page)) {
 		res = -EIO;
 		goto mark_errored;
 	}
 
 	/* Last page (if present) may have trailing bytes not filled */
 	bytes = res % PAGE_SIZE;
-	if (page[pages - 1]->index == end_index && bytes) {
-		pageaddr = kmap_local_page(page[pages - 1]);
+	if (end_index == file_end && last_page && bytes) {
+		pageaddr = kmap_local_page(last_page);
 		memset(pageaddr + bytes, 0, PAGE_SIZE - bytes);
 		kunmap_local(pageaddr);
 	}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 16bd693d0b3a..d5918eba27e3 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -279,8 +279,13 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		if (err < 0)
 			goto failed_read;
 
-		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
 		inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
+		if (inode->i_size > PAGE_SIZE) {
+			ERROR("Corrupted symlink\n");
+			return -EINVAL;
+		}
+
+		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
 		inode->i_op = &squashfs_symlink_inode_ops;
 		inode_nohighmem(inode);
 		inode->i_data.a_ops = &squashfs_symlink_aops;
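The squashfs_read_inode() hunk above rejects an on-disk symlink size larger than PAGE_SIZE before the value is ever used, turning a corrupted image into a clean -EINVAL rather than trouble later when the symlink body is read. The same validate-before-trust shape applies to any little-endian on-disk length field; a minimal sketch (helper name illustrative, not from the patch):

	static int check_ondisk_len(__le32 raw, u32 limit)
	{
		u32 len = le32_to_cpu(raw);

		/* refuse to act on a length the caller cannot safely honour */
		return len <= limit ? 0 : -EINVAL;
	}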
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 81af6c4ca115..2b3e807d4dea 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -60,6 +60,11 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
 }
 
 /* Implementation of page_actor for decompressing directly into page cache. */
+static loff_t page_next_index(struct squashfs_page_actor *actor)
+{
+	return page_folio(actor->page[actor->next_page])->index;
+}
+
 static void *handle_next_page(struct squashfs_page_actor *actor)
 {
 	int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -68,7 +73,7 @@ static void *handle_next_page(struct squashfs_page_actor *actor)
 		return NULL;
 
 	if ((actor->next_page == actor->pages) ||
-	    (actor->next_index != actor->page[actor->next_page]->index)) {
+	    (actor->next_index != page_next_index(actor))) {
 		actor->next_index++;
 		actor->returned_pages++;
 		actor->last_page = NULL;
@@ -103,7 +108,7 @@ static void direct_finish_page(struct squashfs_page_actor *actor)
 }
 
 struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk,
-	struct page **page, int pages, int length)
+	struct page **page, int pages, int length, loff_t start_index)
 {
 	struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
 
@@ -125,7 +130,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_
 	actor->pages = pages;
 	actor->next_page = 0;
 	actor->returned_pages = 0;
-	actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1);
+	actor->next_index = start_index >> PAGE_SHIFT;
 	actor->pageaddr = NULL;
 	actor->last_page = NULL;
 	actor->alloc_buffer = msblk->decompressor->alloc_buffer;
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 97d4983559b1..ffe25eb77c32 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -29,13 +29,15 @@ extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
 		int pages, int length);
 extern struct squashfs_page_actor *squashfs_page_actor_init_special(
 				struct squashfs_sb_info *msblk,
-				struct page **page, int pages, int length);
+				struct page **page, int pages, int length,
+				loff_t start_index);
 static inline struct page *squashfs_page_actor_free(struct squashfs_page_actor *actor)
 {
-	struct page *last_page = actor->last_page;
+	struct page *last_page = actor->next_page == actor->pages ? actor->last_page : ERR_PTR(-EIO);
 
 	kfree(actor->tmp_buffer);
 	kfree(actor);
+
 	return last_page;
 }
 static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
diff --git a/fs/super.c b/fs/super.c
index 38d72a3cf6fc..1db230432960 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -621,7 +621,7 @@ void generic_shutdown_super(struct super_block *sb)
 		sync_filesystem(sb);
 		sb->s_flags &= ~SB_ACTIVE;
 
-		cgroup_writeback_umount();
+		cgroup_writeback_umount(sb);
 
 		/* Evict all inodes with zero refcount. */
 		evict_inodes(sb);
@@ -1802,8 +1802,8 @@ int vfs_get_tree(struct fs_context *fc)
 		return error;
 
 	if (!fc->root) {
-		pr_err("Filesystem %s get_tree() didn't set fc->root\n",
-		       fc->fs_type->name);
+		pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n",
+		       fc->fs_type->name, error);
 		/* We don't know what the locking state of the superblock is -
 		 * if there is a superblock.
 		 */
@@ -1905,7 +1905,7 @@ static void lockdep_sb_freeze_release(struct super_block *sb)
 	int level;
 
 	for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
-		percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
+		percpu_rwsem_release(sb->s_writers.rw_sem + level, _THIS_IP_);
 }
 
 /*
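squashfs_page_actor_free() now reports a short decompression — the actor not having consumed every page — as ERR_PTR(-EIO), so each caller's existing last_page handling doubles as the error check. A sketch of the consuming side of that ERR_PTR idiom (the wrapper function is hypothetical; the two calls it makes are the kernel APIs used above):

	static int finish_actor(struct squashfs_page_actor *actor)
	{
		struct page *last = squashfs_page_actor_free(actor);

		/* a short decompression surfaces here as ERR_PTR(-EIO) */
		return IS_ERR(last) ? PTR_ERR(last) : 0;
	}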
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 2e126d72d619..639307e2ff8c 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -28,17 +28,17 @@ const struct file_operations sysv_dir_operations = {
 	.fsync		= generic_file_fsync,
 };
 
-static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
+static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = folio->mapping;
 	struct inode *dir = mapping->host;
 
-	block_write_end(NULL, mapping, pos, len, len, page, NULL);
+	block_write_end(NULL, mapping, pos, len, len, folio, NULL);
 	if (pos+len > dir->i_size) {
 		i_size_write(dir, pos+len);
 		mark_inode_dirty(dir);
 	}
-	unlock_page(page);
+	folio_unlock(folio);
 }
 
 static int sysv_handle_dirsync(struct inode *dir)
@@ -52,20 +52,21 @@ static int sysv_handle_dirsync(struct inode *dir)
 }
 
 /*
- * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the
+ * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the
 * rules documented in mm/highmem.rst.
 *
- * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_page()
+ * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_folio()
 * and must be treated accordingly for nesting purposes.
 */
-static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p)
+static void *dir_get_folio(struct inode *dir, unsigned long n,
+		struct folio **foliop)
 {
-	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_mapping_page(mapping, n, NULL);
-	if (IS_ERR(page))
-		return ERR_CAST(page);
-	*p = page;
-	return kmap_local_page(page);
+	struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL);
+
+	if (IS_ERR(folio))
+		return ERR_CAST(folio);
+	*foliop = folio;
+	return kmap_local_folio(folio, 0);
 }
 
 static int sysv_readdir(struct file *file, struct dir_context *ctx)
@@ -87,9 +88,9 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx)
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
 		struct sysv_dir_entry *de;
-		struct page *page;
+		struct folio *folio;
 
-		kaddr = dir_get_page(inode, n, &page);
+		kaddr = dir_get_folio(inode, n, &folio);
 		if (IS_ERR(kaddr))
 			continue;
 		de = (struct sysv_dir_entry *)(kaddr+offset);
@@ -103,11 +104,11 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx)
 			if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
 					fs16_to_cpu(SYSV_SB(sb), de->inode),
 					DT_UNKNOWN)) {
-				unmap_and_put_page(page, kaddr);
+				folio_release_kmap(folio, kaddr);
 				return 0;
 			}
 		}
-		unmap_and_put_page(page, kaddr);
+		folio_release_kmap(folio, kaddr);
 	}
 	return 0;
 }
@@ -126,39 +127,35 @@ static inline int namecompare(int len, int maxlen,
 /*
 * sysv_find_entry()
 *
- * finds an entry in the specified directory with the wanted name. It
- * returns the cache buffer in which the entry was found, and the entry
- * itself (as a parameter - res_dir). It does NOT read the inode of the
+ * finds an entry in the specified directory with the wanted name.
+ * It does NOT read the inode of the
 * entry - you'll have to do that yourself if you want to.
 *
- * On Success unmap_and_put_page() should be called on *res_page.
+ * On Success folio_release_kmap() should be called on *foliop.
 *
- * sysv_find_entry() acts as a call to dir_get_page() and must be treated
+ * sysv_find_entry() acts as a call to dir_get_folio() and must be treated
 * accordingly for nesting purposes.
 */
-struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_page)
+struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct folio **foliop)
 {
 	const char * name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
 	struct inode * dir = d_inode(dentry->d_parent);
 	unsigned long start, n;
 	unsigned long npages = dir_pages(dir);
-	struct page *page = NULL;
 	struct sysv_dir_entry *de;
 
-	*res_page = NULL;
-
 	start = SYSV_I(dir)->i_dir_start_lookup;
 	if (start >= npages)
 		start = 0;
 	n = start;
 
 	do {
-		char *kaddr = dir_get_page(dir, n, &page);
+		char *kaddr = dir_get_folio(dir, n, foliop);
+
 		if (!IS_ERR(kaddr)) {
 			de = (struct sysv_dir_entry *)kaddr;
-			kaddr += PAGE_SIZE - SYSV_DIRSIZE;
+			kaddr += folio_size(*foliop) - SYSV_DIRSIZE;
 			for ( ; (char *) de <= kaddr ; de++) {
 				if (!de->inode)
 					continue;
@@ -166,7 +163,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
 					name, de->name))
 					goto found;
 			}
-			unmap_and_put_page(page, kaddr);
+			folio_release_kmap(*foliop, kaddr);
 		}
 
 		if (++n >= npages)
@@ -177,7 +174,6 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
 
 found:
 	SYSV_I(dir)->i_dir_start_lookup = n;
-	*res_page = page;
 	return de;
 }
 
@@ -186,7 +182,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode)
 	struct inode *dir = d_inode(dentry->d_parent);
 	const char * name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
-	struct page *page = NULL;
+	struct folio *folio = NULL;
 	struct sysv_dir_entry * de;
 	unsigned long npages = dir_pages(dir);
 	unsigned long n;
@@ -196,7 +192,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode)
 
 	/* We take care of directory expansion in the same loop */
 	for (n = 0; n <= npages; n++) {
-		kaddr = dir_get_page(dir, n, &page);
+		kaddr = dir_get_folio(dir, n, &folio);
 		if (IS_ERR(kaddr))
 			return PTR_ERR(kaddr);
 		de = (struct sysv_dir_entry *)kaddr;
@@ -206,49 +202,49 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode)
 				goto got_it;
 			err = -EEXIST;
 			if (namecompare(namelen, SYSV_NAMELEN, name, de->name))
-				goto out_page;
+				goto out_folio;
 			de++;
 		}
-		unmap_and_put_page(page, kaddr);
+		folio_release_kmap(folio, kaddr);
 	}
 	BUG();
 	return -EINVAL;
 
got_it:
-	pos = page_offset(page) + offset_in_page(de);
-	lock_page(page);
-	err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
+	pos = folio_pos(folio) + offset_in_folio(folio, de);
+	folio_lock(folio);
+	err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
 	if (err)
 		goto out_unlock;
 	memcpy (de->name, name, namelen);
 	memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
 	de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
-	dir_commit_chunk(page, pos, SYSV_DIRSIZE);
+	dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	err = sysv_handle_dirsync(dir);
-out_page:
-	unmap_and_put_page(page, kaddr);
+out_folio:
+	folio_release_kmap(folio, kaddr);
 	return err;
out_unlock:
-	unlock_page(page);
-	goto out_page;
+	folio_unlock(folio);
+	goto out_folio;
 }
 
-int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
+int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio)
 {
-	struct inode *inode = page->mapping->host;
-	loff_t pos = page_offset(page) + offset_in_page(de);
+	struct inode *inode = folio->mapping->host;
+	loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
 	int err;
 
-	lock_page(page);
-	err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
+	folio_lock(folio);
+	err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
 	if (err) {
-		unlock_page(page);
+		folio_unlock(folio);
 		return err;
 	}
 	de->inode = 0;
-	dir_commit_chunk(page, pos, SYSV_DIRSIZE);
+	dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
 	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	mark_inode_dirty(inode);
 	return sysv_handle_dirsync(inode);
@@ -256,33 +252,33 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio)
 
 int sysv_make_empty(struct inode *inode, struct inode *dir)
 {
-	struct page *page = grab_cache_page(inode->i_mapping, 0);
+	struct folio *folio = filemap_grab_folio(inode->i_mapping, 0);
 	struct sysv_dir_entry * de;
-	char *base;
+	char *kaddr;
 	int err;
 
-	if (!page)
-		return -ENOMEM;
-	err = sysv_prepare_chunk(page, 0, 2 * SYSV_DIRSIZE);
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
+	err = sysv_prepare_chunk(folio, 0, 2 * SYSV_DIRSIZE);
 	if (err) {
-		unlock_page(page);
+		folio_unlock(folio);
 		goto fail;
 	}
-	base = kmap_local_page(page);
-	memset(base, 0, PAGE_SIZE);
+	kaddr = kmap_local_folio(folio, 0);
+	memset(kaddr, 0, folio_size(folio));
 
-	de = (struct sysv_dir_entry *) base;
+	de = (struct sysv_dir_entry *)kaddr;
 	de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
 	strcpy(de->name,".");
 	de++;
 	de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino);
 	strcpy(de->name,"..");
 
-	kunmap_local(base);
-	dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE);
+	kunmap_local(kaddr);
+	dir_commit_chunk(folio, 0, 2 * SYSV_DIRSIZE);
 	err = sysv_handle_dirsync(inode);
fail:
-	put_page(page);
+	folio_put(folio);
 	return err;
 }
 
@@ -292,19 +288,19 @@ fail:
 int sysv_empty_dir(struct inode * inode)
 {
 	struct super_block *sb = inode->i_sb;
-	struct page *page = NULL;
+	struct folio *folio = NULL;
 	unsigned long i, npages = dir_pages(inode);
 	char *kaddr;
 
 	for (i = 0; i < npages; i++) {
 		struct sysv_dir_entry *de;
 
-		kaddr = dir_get_page(inode, i, &page);
+		kaddr = dir_get_folio(inode, i, &folio);
 		if (IS_ERR(kaddr))
 			continue;
 
 		de = (struct sysv_dir_entry *)kaddr;
-		kaddr += PAGE_SIZE-SYSV_DIRSIZE;
+		kaddr += folio_size(folio) - SYSV_DIRSIZE;
 
 		for ( ;(char *)de <= kaddr; de++) {
 			if (!de->inode)
@@ -321,46 +317,46 @@ int sysv_empty_dir(struct inode * inode)
 			if (de->name[1] != '.' || de->name[2])
 				goto not_empty;
 		}
-		unmap_and_put_page(page, kaddr);
+		folio_release_kmap(folio, kaddr);
 	}
 	return 1;
 
not_empty:
-	unmap_and_put_page(page, kaddr);
+	folio_release_kmap(folio, kaddr);
 	return 0;
 }
 
 /* Releases the page */
-int sysv_set_link(struct sysv_dir_entry *de, struct page *page,
-	struct inode *inode)
+int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio,
+		struct inode *inode)
 {
-	struct inode *dir = page->mapping->host;
-	loff_t pos = page_offset(page) + offset_in_page(de);
+	struct inode *dir = folio->mapping->host;
+	loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
 	int err;
 
-	lock_page(page);
-	err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
+	folio_lock(folio);
+	err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
 	if (err) {
-		unlock_page(page);
+		folio_unlock(folio);
 		return err;
 	}
 	de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
-	dir_commit_chunk(page, pos, SYSV_DIRSIZE);
+	dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	return sysv_handle_dirsync(inode);
 }
 
 /*
- * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the
+ * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the
 * rules documented in mm/highmem.rst.
 *
- * sysv_dotdot() acts as a call to dir_get_page() and must be treated
+ * sysv_dotdot() acts as a call to dir_get_folio() and must be treated
 * accordingly for nesting purposes.
 */
-struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct page **p)
+struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct folio **foliop)
 {
-	struct sysv_dir_entry *de = dir_get_page(dir, 0, p);
+	struct sysv_dir_entry *de = dir_get_folio(dir, 0, foliop);
 
 	if (IS_ERR(de))
 		return NULL;
@@ -370,13 +366,13 @@ struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct folio **foliop)
 
 ino_t sysv_inode_by_name(struct dentry *dentry)
 {
-	struct page *page;
-	struct sysv_dir_entry *de = sysv_find_entry (dentry, &page);
+	struct folio *folio;
+	struct sysv_dir_entry *de = sysv_find_entry (dentry, &folio);
 	ino_t res = 0;
 
 	if (de) {
 		res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode);
-		unmap_and_put_page(page, de);
+		folio_release_kmap(folio, de);
 	}
 	return res;
 }
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 19bcb51a2203..451e95f474fa 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -466,9 +466,9 @@ static int sysv_read_folio(struct file *file, struct folio *folio)
 	return block_read_full_folio(folio, get_block);
 }
 
-int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len)
 {
-	return __block_write_begin(page, pos, len, get_block);
+	return __block_write_begin(folio, pos, len, get_block);
 }
 
 static void sysv_write_failed(struct address_space *mapping, loff_t to)
@@ -483,11 +483,11 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to)
 
 static int sysv_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len,
-			struct page **pagep, void **fsdata)
+			struct folio **foliop, void **fsdata)
 {
 	int ret;
 
-	ret = block_write_begin(mapping, pos, len, pagep, get_block);
+	ret = block_write_begin(mapping, pos, len, foliop, get_block);
 	if (unlikely(ret))
 		sysv_write_failed(mapping, pos + len);
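The sysv conversion above swaps the page-based kmap helpers for their folio equivalents, and the pairing rule carries over unchanged: whatever dir_get_folio() kmaps must be released with folio_release_kmap(), nested like any kmap_local pair. A minimal sketch of a read-only walk in the converted style (the wrapper function is hypothetical; the entry scan is elided):

	static int sysv_walk_first_block(struct inode *dir)
	{
		struct folio *folio;
		char *kaddr = dir_get_folio(dir, 0, &folio);

		if (IS_ERR(kaddr))
			return PTR_ERR(kaddr);
		/* ... scan sysv_dir_entry records in kaddr ... */
		folio_release_kmap(folio, kaddr);	/* kunmap + folio_put */
		return 0;
	}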
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index d6b73798071b..fb8bd8437872 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -151,20 +151,20 @@ out_dir:
 static int sysv_unlink(struct inode * dir, struct dentry * dentry)
 {
 	struct inode * inode = d_inode(dentry);
-	struct page * page;
+	struct folio *folio;
 	struct sysv_dir_entry * de;
 	int err;
 
-	de = sysv_find_entry(dentry, &page);
+	de = sysv_find_entry(dentry, &folio);
 	if (!de)
 		return -ENOENT;
 
-	err = sysv_delete_entry(de, page);
+	err = sysv_delete_entry(de, folio);
 	if (!err) {
 		inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
 		inode_dec_link_count(inode);
 	}
-	unmap_and_put_page(page, de);
+	folio_release_kmap(folio, de);
 	return err;
 }
 
@@ -194,28 +194,28 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 {
 	struct inode * old_inode = d_inode(old_dentry);
 	struct inode * new_inode = d_inode(new_dentry);
-	struct page * dir_page = NULL;
+	struct folio *dir_folio;
 	struct sysv_dir_entry * dir_de = NULL;
-	struct page * old_page;
+	struct folio *old_folio;
 	struct sysv_dir_entry * old_de;
 	int err = -ENOENT;
 
 	if (flags & ~RENAME_NOREPLACE)
 		return -EINVAL;
 
-	old_de = sysv_find_entry(old_dentry, &old_page);
+	old_de = sysv_find_entry(old_dentry, &old_folio);
 	if (!old_de)
 		goto out;
 
 	if (S_ISDIR(old_inode->i_mode)) {
 		err = -EIO;
-		dir_de = sysv_dotdot(old_inode, &dir_page);
+		dir_de = sysv_dotdot(old_inode, &dir_folio);
 		if (!dir_de)
 			goto out_old;
 	}
 
 	if (new_inode) {
-		struct page * new_page;
+		struct folio *new_folio;
 		struct sysv_dir_entry * new_de;
 
 		err = -ENOTEMPTY;
@@ -223,11 +223,11 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 			goto out_dir;
 
 		err = -ENOENT;
-		new_de = sysv_find_entry(new_dentry, &new_page);
+		new_de = sysv_find_entry(new_dentry, &new_folio);
 		if (!new_de)
 			goto out_dir;
-		err = sysv_set_link(new_de, new_page, old_inode);
-		unmap_and_put_page(new_page, new_de);
+		err = sysv_set_link(new_de, new_folio, old_inode);
+		folio_release_kmap(new_folio, new_de);
 		if (err)
 			goto out_dir;
 		inode_set_ctime_current(new_inode);
@@ -242,23 +242,23 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 		inode_inc_link_count(new_dir);
 	}
 
-	err = sysv_delete_entry(old_de, old_page);
+	err = sysv_delete_entry(old_de, old_folio);
 	if (err)
 		goto out_dir;
 
 	mark_inode_dirty(old_inode);
 
 	if (dir_de) {
-		err = sysv_set_link(dir_de, dir_page, new_dir);
+		err = sysv_set_link(dir_de, dir_folio, new_dir);
 		if (!err)
 			inode_dec_link_count(old_dir);
 	}
 
out_dir:
 	if (dir_de)
-		unmap_and_put_page(dir_page, dir_de);
+		folio_release_kmap(dir_folio, dir_de);
out_old:
-	unmap_and_put_page(old_page, old_de);
+	folio_release_kmap(old_folio, old_de);
out:
 	return err;
 }
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index e3f988b469ee..0a48b2e7edb1 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -133,8 +133,8 @@ extern void sysv_free_block(struct super_block *, sysv_zone_t);
 extern unsigned long sysv_count_free_blocks(struct super_block *);
 
 /* itree.c */
-extern void sysv_truncate(struct inode *);
-extern int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len);
+void sysv_truncate(struct inode *);
+int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len);
 
 /* inode.c */
 extern struct inode *sysv_iget(struct super_block *, unsigned int);
@@ -148,15 +148,15 @@ extern void sysv_destroy_icache(void);
 
 
 /* dir.c */
-extern struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct page **);
-extern int sysv_add_link(struct dentry *, struct inode *);
-extern int sysv_delete_entry(struct sysv_dir_entry *, struct page *);
-extern int sysv_make_empty(struct inode *, struct inode *);
-extern int sysv_empty_dir(struct inode *);
-extern int sysv_set_link(struct sysv_dir_entry *, struct page *,
+struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct folio **);
+int sysv_add_link(struct dentry *, struct inode *);
+int sysv_delete_entry(struct sysv_dir_entry *, struct folio *);
+int sysv_make_empty(struct inode *, struct inode *);
+int sysv_empty_dir(struct inode *);
+int sysv_set_link(struct sysv_dir_entry *, struct folio *,
 			struct inode *);
-extern struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **);
-extern ino_t sysv_inode_by_name(struct dentry *);
+struct sysv_dir_entry *sysv_dotdot(struct inode *, struct folio **);
+ino_t sysv_inode_by_name(struct dentry *);
 
 extern const struct inode_operations sysv_file_inode_operations;
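Besides the struct page to struct folio swap, the sysv.h hunk drops the redundant extern specifiers: on a function declaration extern is implicit, so the two forms below declare exactly the same thing, and the shorter one is the style the file now uses:

	extern int sysv_empty_dir(struct inode *);	/* old style */
	int sysv_empty_dir(struct inode *);		/* identical meaning */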
+struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct folio **); +int sysv_add_link(struct dentry *, struct inode *); +int sysv_delete_entry(struct sysv_dir_entry *, struct folio *); +int sysv_make_empty(struct inode *, struct inode *); +int sysv_empty_dir(struct inode *); +int sysv_set_link(struct sysv_dir_entry *, struct folio *, struct inode *); -extern struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **); -extern ino_t sysv_inode_by_name(struct dentry *); +struct sysv_dir_entry *sysv_dotdot(struct inode *, struct folio **); +ino_t sysv_inode_by_name(struct dentry *); extern const struct inode_operations sysv_file_inode_operations; diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 5d88c184f0fc..8705c77a9e75 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -112,7 +112,7 @@ static void release_ei(struct kref *ref) entry->release(entry->name, ei->data); } - call_rcu(&ei->rcu, free_ei_rcu); + call_srcu(&eventfs_srcu, &ei->rcu, free_ei_rcu); } static inline void put_ei(struct eventfs_inode *ei) @@ -736,7 +736,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode /* Was the parent freed? */ if (list_empty(&ei->list)) { cleanup_ei(ei); - ei = NULL; + ei = ERR_PTR(-EBUSY); } return ei; } @@ -862,7 +862,7 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level) list_for_each_entry(ei_child, &ei->children, list) eventfs_remove_rec(ei_child, level + 1); - list_del(&ei->list); + list_del_rcu(&ei->list); free_ei(ei); } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 1028ab6d9a74..1748dff58c3b 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -42,7 +42,7 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb) struct tracefs_inode *ti; unsigned long flags; - ti = kmem_cache_alloc(tracefs_inode_cachep, GFP_KERNEL); + ti = alloc_inode_sb(sb, tracefs_inode_cachep, GFP_KERNEL); if (!ti) return NULL; @@ -53,15 +53,14 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb) return &ti->vfs_inode; } -static void tracefs_free_inode_rcu(struct rcu_head *rcu) +static void tracefs_free_inode(struct inode *inode) { - struct tracefs_inode *ti; + struct tracefs_inode *ti = get_tracefs(inode); - ti = container_of(rcu, struct tracefs_inode, rcu); kmem_cache_free(tracefs_inode_cachep, ti); } -static void tracefs_free_inode(struct inode *inode) +static void tracefs_destroy_inode(struct inode *inode) { struct tracefs_inode *ti = get_tracefs(inode); unsigned long flags; @@ -69,8 +68,6 @@ static void tracefs_free_inode(struct inode *inode) spin_lock_irqsave(&tracefs_inode_lock, flags); list_del_rcu(&ti->list); spin_unlock_irqrestore(&tracefs_inode_lock, flags); - - call_rcu(&ti->rcu, tracefs_free_inode_rcu); } static ssize_t default_read_file(struct file *file, char __user *buf, @@ -437,6 +434,7 @@ static int tracefs_drop_inode(struct inode *inode) static const struct super_operations tracefs_super_operations = { .alloc_inode = tracefs_alloc_inode, .free_inode = tracefs_free_inode, + .destroy_inode = tracefs_destroy_inode, .drop_inode = tracefs_drop_inode, .statfs = simple_statfs, .show_options = tracefs_show_options, diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index f704d8348357..d83c2a25f288 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -10,10 +10,7 @@ enum { }; struct tracefs_inode { - union { - struct inode vfs_inode; - struct rcu_head rcu; - }; + struct inode vfs_inode; /* The below gets initialized with memset_after(ti, 0, 
vfs_inode) */ struct list_head list; unsigned long flags; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index c77ea57fe696..fda82f3e16e8 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -555,6 +555,11 @@ static unsigned int vfs_dent_type(uint8_t type) return 0; } +struct ubifs_dir_data { + struct ubifs_dent_node *dent; + u64 cookie; +}; + /* * The classical Unix view for directory is that it is a linear array of * (name, inode number) entries. Linux/VFS assumes this model as well. @@ -582,6 +587,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) struct inode *dir = file_inode(file); struct ubifs_info *c = dir->i_sb->s_fs_info; bool encrypted = IS_ENCRYPTED(dir); + struct ubifs_dir_data *data = file->private_data; dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); @@ -604,27 +610,27 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) fstr_real_len = fstr.len; } - if (file->f_version == 0) { + if (data->cookie == 0) { /* - * The file was seek'ed, which means that @file->private_data + * The file was seek'ed, which means that @data->dent * is now invalid. This may also be just the first * 'ubifs_readdir()' invocation, in which case - * @file->private_data is NULL, and the below code is + * @data->dent is NULL, and the below code is * basically a no-op. */ - kfree(file->private_data); - file->private_data = NULL; + kfree(data->dent); + data->dent = NULL; } /* - * 'generic_file_llseek()' unconditionally sets @file->f_version to - * zero, and we use this for detecting whether the file was seek'ed. + * 'ubifs_dir_llseek()' sets @data->cookie to zero, and we use this + * for detecting whether the file was seek'ed. */ - file->f_version = 1; + data->cookie = 1; /* File positions 0 and 1 correspond to "." and ".." */ if (ctx->pos < 2) { - ubifs_assert(c, !file->private_data); + ubifs_assert(c, !data->dent); if (!dir_emit_dots(file, ctx)) { if (encrypted) fscrypt_fname_free_buffer(&fstr); @@ -641,10 +647,10 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) } ctx->pos = key_hash_flash(c, &dent->key); - file->private_data = dent; + data->dent = dent; } - dent = file->private_data; + dent = data->dent; if (!dent) { /* * The directory was seek'ed to and is now readdir'ed. 
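The ubifs hunks above and the udf/ufs hunks below repeat one pattern: the scan cursor that used to live in file->f_version moves into per-open state, and generic_llseek_cookie() zeroes that cookie whenever a seek actually moves the position, so the next readdir can tell the directory was seek'ed and drop its cached state (the ubifs comment below spells this out). A toy model of the handshake, in plain C with hypothetical names — not the kernel implementation:

#include <stdlib.h>

struct toy_dir_state {
	unsigned long long cookie;	/* 0 => position was moved by llseek */
	void *cached_dent;		/* only valid while a scan is running */
};

/* llseek side: any real reposition invalidates the cookie. */
static long long toy_llseek(struct toy_dir_state *s, long long *f_pos,
			    long long newpos)
{
	if (newpos != *f_pos) {
		s->cookie = 0;
		*f_pos = newpos;
	}
	return newpos;
}

/* readdir side: cookie == 0 means any cached entry is stale. */
static void toy_readdir(struct toy_dir_state *s)
{
	if (s->cookie == 0) {
		free(s->cached_dent);	/* NULL on the first call is fine */
		s->cached_dent = NULL;
	}
	s->cookie = 1;	/* mark a scan in progress */
	/* ... emit entries, stashing the current one in cached_dent ... */
}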
@@ -658,7 +664,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) goto out; } ctx->pos = key_hash_flash(c, &dent->key); - file->private_data = dent; + data->dent = dent; } while (1) { @@ -701,15 +707,15 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) goto out; } - kfree(file->private_data); + kfree(data->dent); ctx->pos = key_hash_flash(c, &dent->key); - file->private_data = dent; + data->dent = dent; cond_resched(); } out: - kfree(file->private_data); - file->private_data = NULL; + kfree(data->dent); + data->dent = NULL; if (encrypted) fscrypt_fname_free_buffer(&fstr); @@ -733,7 +739,10 @@ out: /* Free saved readdir() state when the directory is closed */ static int ubifs_dir_release(struct inode *dir, struct file *file) { - kfree(file->private_data); + struct ubifs_dir_data *data = file->private_data; + + kfree(data->dent); + kfree(data); file->private_data = NULL; return 0; } @@ -1712,6 +1721,24 @@ int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path, return 0; } +static int ubifs_dir_open(struct inode *inode, struct file *file) +{ + struct ubifs_dir_data *data; + + data = kzalloc(sizeof(struct ubifs_dir_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + file->private_data = data; + return 0; +} + +static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct ubifs_dir_data *data = file->private_data; + + return generic_llseek_cookie(file, offset, whence, &data->cookie); +} + const struct inode_operations ubifs_dir_inode_operations = { .lookup = ubifs_lookup, .create = ubifs_create, @@ -1732,7 +1759,8 @@ const struct inode_operations ubifs_dir_inode_operations = { }; const struct file_operations ubifs_dir_operations = { - .llseek = generic_file_llseek, + .open = ubifs_dir_open, + .llseek = ubifs_dir_llseek, .release = ubifs_dir_release, .read = generic_read_dir, .iterate_shared = ubifs_readdir, diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 68e104423a48..5130123005e4 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -211,7 +211,7 @@ static void release_existing_page_budget(struct ubifs_info *c) } static int write_begin_slow(struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep) + loff_t pos, unsigned len, struct folio **foliop) { struct inode *inode = mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -298,7 +298,7 @@ static int write_begin_slow(struct address_space *mapping, ubifs_release_dirty_inode_budget(c, ui); } - *pagep = &folio->page; + *foliop = folio; return 0; } @@ -414,7 +414,7 @@ static int allocate_budget(struct ubifs_info *c, struct folio *folio, */ static int ubifs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -483,7 +483,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, folio_unlock(folio); folio_put(folio); - return write_begin_slow(mapping, pos, len, pagep); + return write_begin_slow(mapping, pos, len, foliop); } /* @@ -492,7 +492,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * with @ui->ui_mutex locked if we are appending pages, and unlocked * otherwise. This is an optimization (slightly hacky though). 
*/ - *pagep = &folio->page; + *foliop = folio; return 0; } @@ -524,9 +524,8 @@ static void cancel_budget(struct ubifs_info *c, struct folio *folio, static int ubifs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; diff --git a/fs/udf/dir.c b/fs/udf/dir.c index f94f45fe2c91..5023dfe191e8 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -60,7 +60,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) * identifying beginning of dir entry (names are under user control), * we need to scan the directory from the beginning. */ - if (!inode_eq_iversion(dir, file->f_version)) { + if (!inode_eq_iversion(dir, *(u64 *)file->private_data)) { emit_pos = nf_pos; nf_pos = 0; } else { @@ -122,15 +122,37 @@ out_iter: udf_fiiter_release(&iter); out: if (pos_valid) - file->f_version = inode_query_iversion(dir); + *(u64 *)file->private_data = inode_query_iversion(dir); kfree(fname); return ret; } +static int udf_dir_open(struct inode *inode, struct file *file) +{ + file->private_data = kzalloc(sizeof(u64), GFP_KERNEL); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +static int udf_dir_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static loff_t udf_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return generic_llseek_cookie(file, offset, whence, + (u64 *)file->private_data); +} + /* readdir and lookup functions */ const struct file_operations udf_dir_operations = { - .llseek = generic_file_llseek, + .open = udf_dir_open, + .release = udf_dir_release, + .llseek = udf_dir_llseek, .read = generic_read_dir, .iterate_shared = udf_readdir, .unlocked_ioctl = udf_ioctl, diff --git a/fs/udf/file.c b/fs/udf/file.c index 3a4179de316b..412fe7c4d348 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -62,7 +62,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) end = size & ~PAGE_MASK; else end = PAGE_SIZE; - err = __block_write_begin(&folio->page, 0, end, udf_get_block); + err = __block_write_begin(folio, 0, end, udf_get_block); if (err) { folio_unlock(folio); ret = vmf_fs_error(err); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4726a4d014b6..eaee57b91c6c 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -246,14 +246,14 @@ static void udf_readahead(struct readahead_control *rac) static int udf_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct udf_inode_info *iinfo = UDF_I(file_inode(file)); struct folio *folio; int ret; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - ret = block_write_begin(mapping, pos, len, pagep, + ret = block_write_begin(mapping, pos, len, foliop, udf_get_block); if (unlikely(ret)) udf_write_failed(mapping, pos + len); @@ -265,7 +265,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; if (!folio_test_uptodate(folio)) udf_adinicb_read_folio(folio); return 0; @@ -273,16 +273,14 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, static int udf_write_end(struct file *file, struct address_space *mapping, 
loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = file_inode(file); - struct folio *folio; loff_t last_pos; if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) - return generic_write_end(file, mapping, pos, len, copied, page, + return generic_write_end(file, mapping, pos, len, copied, folio, fsdata); - folio = page_folio(page); last_pos = pos + copied; if (last_pos > inode->i_size) i_size_write(inode, last_pos); diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 61f25d3cf3f7..d6e6a2198971 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -42,18 +42,18 @@ static inline int ufs_match(struct super_block *sb, int len, return !memcmp(name, de->d_name, len); } -static void ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void ufs_commit_chunk(struct folio *folio, loff_t pos, unsigned len) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, page, NULL); + block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } - unlock_page(page); + folio_unlock(folio); } static int ufs_handle_dirsync(struct inode *dir) @@ -66,22 +66,16 @@ static int ufs_handle_dirsync(struct inode *dir) return err; } -static inline void ufs_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; struct ufs_dir_entry *de; - struct page *page; + struct folio *folio; - de = ufs_find_entry(dir, qstr, &page); + de = ufs_find_entry(dir, qstr, &folio); if (de) { res = fs32_to_cpu(dir->i_sb, de->d_ino); - ufs_put_page(page); + folio_release_kmap(folio, de); } return res; } @@ -89,43 +83,40 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) /* Releases the page */ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode, + struct folio *folio, struct inode *inode, bool update_times) { - loff_t pos = page_offset(page) + - (char *) de - (char *) page_address(page); + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen); int err; - lock_page(page); - err = ufs_prepare_chunk(page, pos, len); + folio_lock(folio); + err = ufs_prepare_chunk(folio, pos, len); BUG_ON(err); de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); ufs_set_de_type(dir->i_sb, de, inode->i_mode); - ufs_commit_chunk(page, pos, len); - ufs_put_page(page); + ufs_commit_chunk(folio, pos, len); + folio_release_kmap(folio, de); if (update_times) inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); ufs_handle_dirsync(dir); } - -static bool ufs_check_page(struct page *page) +static bool ufs_check_folio(struct folio *folio, char *kaddr) { - struct inode *dir = page->mapping->host; + struct inode *dir = folio->mapping->host; struct super_block *sb = dir->i_sb; - char *kaddr = page_address(page); unsigned offs, rec_len; - unsigned limit = PAGE_SIZE; + unsigned limit = folio_size(folio); const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1; struct ufs_dir_entry *p; char *error; - if ((dir->i_size >> PAGE_SHIFT) == page->index) { - limit = dir->i_size & ~PAGE_MASK; + if (dir->i_size < folio_pos(folio) + limit) { + limit = offset_in_folio(folio, dir->i_size); if (limit & chunk_mask) 
goto Ebadsize; if (!limit) @@ -150,13 +141,13 @@ static bool ufs_check_page(struct page *page) if (offs != limit) goto Eend; out: - SetPageChecked(page); + folio_set_checked(folio); return true; /* Too bad, we had an error */ Ebadsize: - ufs_error(sb, "ufs_check_page", + ufs_error(sb, __func__, "size of directory #%lu is not a multiple of chunk size", dir->i_ino ); @@ -176,36 +167,40 @@ Espan: Einumber: error = "inode out of bounds"; bad_entry: - ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - " - "offset=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, + ufs_error(sb, __func__, "bad entry in directory #%lu: %s - " + "offset=%llu, rec_len=%d, name_len=%d", + dir->i_ino, error, folio_pos(folio) + offs, rec_len, ufs_get_de_namlen(sb, p)); goto fail; Eend: p = (struct ufs_dir_entry *)(kaddr + offs); ufs_error(sb, __func__, "entry in directory #%lu spans the page boundary" - "offset=%lu", - dir->i_ino, (page->index<<PAGE_SHIFT)+offs); + "offset=%llu", + dir->i_ino, folio_pos(folio) + offs); fail: return false; } -static struct page *ufs_get_page(struct inode *dir, unsigned long n) +static void *ufs_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) { struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) { - kmap(page); - if (unlikely(!PageChecked(page))) { - if (!ufs_check_page(page)) - goto fail; - } + struct folio *folio = read_mapping_folio(mapping, n, NULL); + void *kaddr; + + if (IS_ERR(folio)) + return ERR_CAST(folio); + kaddr = kmap_local_folio(folio, 0); + if (unlikely(!folio_test_checked(folio))) { + if (!ufs_check_folio(folio, kaddr)) + goto fail; } - return page; + *foliop = folio; + return kaddr; fail: - ufs_put_page(page); + folio_release_kmap(folio, kaddr); return ERR_PTR(-EIO); } @@ -231,17 +226,14 @@ ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p) fs16_to_cpu(sb, p->d_reclen)); } -struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) +struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct folio **foliop) { - struct page *page = ufs_get_page(dir, 0); - struct ufs_dir_entry *de = NULL; + struct ufs_dir_entry *de = ufs_get_folio(dir, 0, foliop); - if (!IS_ERR(page)) { - de = ufs_next_entry(dir->i_sb, - (struct ufs_dir_entry *)page_address(page)); - *p = page; - } - return de; + if (!IS_ERR(de)) + return ufs_next_entry(dir->i_sb, de); + + return NULL; } /* @@ -253,7 +245,7 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) * Entry is guaranteed to be valid. 
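The ufs_find_entry() rewrite below keeps the highmem discipline already noted for sysv: ufs_get_folio() hands back a kmap_local address, and mappings must be released in strict LIFO order. A sketch of the required nesting, assuming only the kmap_local_folio()/folio_release_kmap() semantics documented in mm/highmem.rst — the helper is hypothetical and not part of the patch:

/* Map two directory folios at once: release inner first, then outer. */
static int toy_compare_first_blocks(struct inode *a, struct inode *b)
{
	struct folio *fa, *fb;
	char *ka, *kb;
	int ret;

	ka = ufs_get_folio(a, 0, &fa);		/* outer mapping */
	if (IS_ERR(ka))
		return PTR_ERR(ka);
	kb = ufs_get_folio(b, 0, &fb);		/* inner mapping */
	if (IS_ERR(kb)) {
		folio_release_kmap(fa, ka);
		return PTR_ERR(kb);
	}
	ret = memcmp(ka, kb, UFS_DIR_REC_LEN(1));
	folio_release_kmap(fb, kb);		/* inner released first */
	folio_release_kmap(fa, ka);		/* then the outer one */
	return ret;
}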
*/ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, - struct page **res_page) + struct folio **foliop) { struct super_block *sb = dir->i_sb; const unsigned char *name = qstr->name; @@ -261,7 +253,6 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, unsigned reclen = UFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); - struct page *page = NULL; struct ufs_inode_info *ui = UFS_I(dir); struct ufs_dir_entry *de; @@ -270,27 +261,23 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, if (npages == 0 || namelen > UFS_MAXNAMLEN) goto out; - /* OFFSET_CACHE */ - *res_page = NULL; - start = ui->i_dir_start_lookup; if (start >= npages) start = 0; n = start; do { - char *kaddr; - page = ufs_get_page(dir, n); - if (!IS_ERR(page)) { - kaddr = page_address(page); - de = (struct ufs_dir_entry *) kaddr; + char *kaddr = ufs_get_folio(dir, n, foliop); + + if (!IS_ERR(kaddr)) { + de = (struct ufs_dir_entry *)kaddr; kaddr += ufs_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { if (ufs_match(sb, namelen, name, de)) goto found; de = ufs_next_entry(sb, de); } - ufs_put_page(page); + folio_release_kmap(*foliop, kaddr); } if (++n >= npages) n = 0; @@ -299,7 +286,6 @@ out: return NULL; found: - *res_page = page; ui->i_dir_start_lookup = n; return de; } @@ -316,11 +302,10 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) unsigned reclen = UFS_DIR_REC_LEN(namelen); const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; unsigned short rec_len, name_len; - struct page *page = NULL; + struct folio *folio = NULL; struct ufs_dir_entry *de; unsigned long npages = dir_pages(dir); unsigned long n; - char *kaddr; loff_t pos; int err; @@ -328,21 +313,19 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) /* * We take care of directory expansion in the same loop. - * This code plays outside i_size, so it locks the page + * This code plays outside i_size, so it locks the folio * to protect that region. 
*/ for (n = 0; n <= npages; n++) { + char *kaddr = ufs_get_folio(dir, n, &folio); char *dir_end; - page = ufs_get_page(dir, n); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto out; - lock_page(page); - kaddr = page_address(page); + if (IS_ERR(kaddr)) + return PTR_ERR(kaddr); + folio_lock(folio); dir_end = kaddr + ufs_last_byte(dir, n); de = (struct ufs_dir_entry *)kaddr; - kaddr += PAGE_SIZE - reclen; + kaddr += folio_size(folio) - reclen; while ((char *)de <= kaddr) { if ((char *)de == dir_end) { /* We hit i_size */ @@ -369,16 +352,15 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) goto got_it; de = (struct ufs_dir_entry *) ((char *) de + rec_len); } - unlock_page(page); - ufs_put_page(page); + folio_unlock(folio); + folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + - (char*)de - (char*)page_address(page); - err = ufs_prepare_chunk(page, pos, rec_len); + pos = folio_pos(folio) + offset_in_folio(folio, de); + err = ufs_prepare_chunk(folio, pos, rec_len); if (err) goto out_unlock; if (de->d_ino) { @@ -395,18 +377,17 @@ got_it: de->d_ino = cpu_to_fs32(sb, inode->i_ino); ufs_set_de_type(sb, de, inode->i_mode); - ufs_commit_chunk(page, pos, rec_len); + ufs_commit_chunk(folio, pos, rec_len); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = ufs_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: - ufs_put_page(page); -out: + folio_release_kmap(folio, de); return err; out_unlock: - unlock_page(page); + folio_unlock(folio); goto out_put; } @@ -435,7 +416,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx) unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); - bool need_revalidate = !inode_eq_iversion(inode, file->f_version); + bool need_revalidate = !inode_eq_iversion(inode, *(u64 *)file->private_data); unsigned flags = UFS_SB(sb)->s_flags; UFSD("BEGIN\n"); @@ -444,25 +425,24 @@ ufs_readdir(struct file *file, struct dir_context *ctx) return 0; for ( ; n < npages; n++, offset = 0) { - char *kaddr, *limit; struct ufs_dir_entry *de; + struct folio *folio; + char *kaddr = ufs_get_folio(inode, n, &folio); + char *limit; - struct page *page = ufs_get_page(inode, n); - - if (IS_ERR(page)) { + if (IS_ERR(kaddr)) { ufs_error(sb, __func__, "bad page in #%lu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; - return -EIO; + return PTR_ERR(kaddr); } - kaddr = page_address(page); if (unlikely(need_revalidate)) { if (offset) { offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); ctx->pos = (n<<PAGE_SHIFT) + offset; } - file->f_version = inode_query_iversion(inode); + *(u64 *)file->private_data = inode_query_iversion(inode); need_revalidate = false; } de = (struct ufs_dir_entry *)(kaddr+offset); @@ -482,13 +462,13 @@ ufs_readdir(struct file *file, struct dir_context *ctx) ufs_get_de_namlen(sb, de), fs32_to_cpu(sb, de->d_ino), d_type)) { - ufs_put_page(page); + folio_release_kmap(folio, de); return 0; } } ctx->pos += fs16_to_cpu(sb, de->d_reclen); } - ufs_put_page(page); + folio_release_kmap(folio, kaddr); } return 0; } @@ -499,19 +479,23 @@ ufs_readdir(struct file *file, struct dir_context *ctx) * previous entry. 
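The ufs_delete_entry() rewrite below relies on the folio position arithmetic used throughout this series: offset_in_folio() gives a mapped dirent's byte offset inside its folio, folio_pos() plus that offset reconstructs the file position the old page_offset()/page_address() arithmetic produced, and subtracting the offset from the dirent pointer recovers the mapping base. A worked example with assumed values, illustration only:

#include <assert.h>

int main(void)
{
	unsigned long long folio_pos = 16384;	/* assumed folio_pos(folio) */
	unsigned long long from = 0x230;	/* offset_in_folio(folio, dir) */
	unsigned long long dirblksize = 512;	/* s_uspi->s_dirblksize */

	unsigned long long to = from + 12;	/* from + d_reclen */
	unsigned long long pos;

	from &= ~(dirblksize - 1);		/* round down to chunk start */
	pos = folio_pos + from;			/* file position to rewrite */

	assert(from == 0x200);
	assert(to == 0x23c);
	assert(pos == 16384 + 0x200);
	return 0;
}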
*/ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, - struct page * page) + struct folio *folio) { struct super_block *sb = inode->i_sb; - char *kaddr = page_address(page); - unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); - unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen); + size_t from, to; + char *kaddr; loff_t pos; - struct ufs_dir_entry *pde = NULL; - struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from); + struct ufs_dir_entry *de, *pde = NULL; int err; UFSD("ENTER\n"); + from = offset_in_folio(folio, dir); + to = from + fs16_to_cpu(sb, dir->d_reclen); + kaddr = (char *)dir - from; + from &= ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); + de = (struct ufs_dir_entry *) (kaddr + from); + UFSD("ino %u, reclen %u, namlen %u, name %s\n", fs32_to_cpu(sb, de->d_ino), fs16_to_cpu(sb, de->d_reclen), @@ -528,21 +512,20 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, de = ufs_next_entry(sb, de); } if (pde) - from = (char*)pde - (char*)page_address(page); - - pos = page_offset(page) + from; - lock_page(page); - err = ufs_prepare_chunk(page, pos, to - from); + from = offset_in_folio(folio, pde); + pos = folio_pos(folio) + from; + folio_lock(folio); + err = ufs_prepare_chunk(folio, pos, to - from); BUG_ON(err); if (pde) pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; - ufs_commit_chunk(page, pos, to - from); + ufs_commit_chunk(folio, pos, to - from); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); err = ufs_handle_dirsync(inode); out: - ufs_put_page(page); + folio_release_kmap(folio, kaddr); UFSD("EXIT\n"); return err; } @@ -551,26 +534,25 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) { struct super_block * sb = dir->i_sb; struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); + struct folio *folio = filemap_grab_folio(mapping, 0); const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; struct ufs_dir_entry * de; - char *base; int err; + char *kaddr; - if (!page) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); - err = ufs_prepare_chunk(page, 0, chunk_size); + err = ufs_prepare_chunk(folio, 0, chunk_size); if (err) { - unlock_page(page); + folio_unlock(folio); goto fail; } - kmap(page); - base = (char*)page_address(page); - memset(base, 0, PAGE_SIZE); + kaddr = kmap_local_folio(folio, 0); + memset(kaddr, 0, folio_size(folio)); - de = (struct ufs_dir_entry *) base; + de = (struct ufs_dir_entry *)kaddr; de->d_ino = cpu_to_fs32(sb, inode->i_ino); ufs_set_de_type(sb, de, inode->i_mode); @@ -584,12 +566,12 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) de->d_reclen = cpu_to_fs16(sb, chunk_size - UFS_DIR_REC_LEN(1)); ufs_set_de_namlen(sb, de, 2); strcpy (de->d_name, ".."); - kunmap(page); + kunmap_local(kaddr); - ufs_commit_chunk(page, 0, chunk_size); + ufs_commit_chunk(folio, 0, chunk_size); err = ufs_handle_dirsync(inode); fail: - put_page(page); + folio_put(folio); return err; } @@ -599,18 +581,17 @@ fail: int ufs_empty_dir(struct inode * inode) { struct super_block *sb = inode->i_sb; - struct page *page = NULL; + struct folio *folio; + char *kaddr; unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { - char *kaddr; struct ufs_dir_entry *de; - page = ufs_get_page(inode, i); - if (IS_ERR(page)) + kaddr = ufs_get_folio(inode, i, &folio); + if (IS_ERR(kaddr)) continue; - kaddr = page_address(page); de = (struct ufs_dir_entry 
*)kaddr; kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1); @@ -637,18 +618,40 @@ int ufs_empty_dir(struct inode * inode) } de = ufs_next_entry(sb, de); } - ufs_put_page(page); + folio_release_kmap(folio, kaddr); } return 1; not_empty: - ufs_put_page(page); + folio_release_kmap(folio, kaddr); return 0; } +static int ufs_dir_open(struct inode *inode, struct file *file) +{ + file->private_data = kzalloc(sizeof(u64), GFP_KERNEL); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +static int ufs_dir_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static loff_t ufs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return generic_llseek_cookie(file, offset, whence, + (u64 *)file->private_data); +} + const struct file_operations ufs_dir_operations = { + .open = ufs_dir_open, + .release = ufs_dir_release, .read = generic_read_dir, .iterate_shared = ufs_readdir, .fsync = generic_file_fsync, - .llseek = generic_file_llseek, + .llseek = ufs_dir_llseek, }; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a7bb2e63cdde..5331ae7ebf3e 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -479,9 +479,9 @@ static int ufs_read_folio(struct file *file, struct folio *folio) return block_read_full_folio(folio, ufs_getfrag_block); } -int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len) +int ufs_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(page, pos, len, ufs_getfrag_block); + return __block_write_begin(folio, pos, len, ufs_getfrag_block); } static void ufs_truncate_blocks(struct inode *); @@ -498,11 +498,11 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to) static int ufs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, ufs_getfrag_block); + ret = block_write_begin(mapping, pos, len, foliop, ufs_getfrag_block); if (unlikely(ret)) ufs_write_failed(mapping, pos + len); @@ -511,11 +511,11 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping, static int ufs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (ret < len) ufs_write_failed(mapping, pos + len); return ret; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 9cad29463791..24bd12186647 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -209,14 +209,14 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) { struct inode * inode = d_inode(dentry); struct ufs_dir_entry *de; - struct page *page; + struct folio *folio; int err = -ENOENT; - de = ufs_find_entry(dir, &dentry->d_name, &page); + de = ufs_find_entry(dir, &dentry->d_name, &folio); if (!de) goto out; - err = ufs_delete_entry(dir, de, page); + err = ufs_delete_entry(dir, de, folio); if (err) goto out; @@ -249,28 +249,28 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct page *dir_page = NULL; + struct folio *dir_folio = NULL; struct ufs_dir_entry * dir_de = NULL; - struct page *old_page; + struct folio *old_folio; struct 
ufs_dir_entry *old_de; int err = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; - old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_folio); if (!old_de) goto out; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = ufs_dotdot(old_inode, &dir_page); + dir_de = ufs_dotdot(old_inode, &dir_folio); if (!dir_de) goto out_old; } if (new_inode) { - struct page *new_page; + struct folio *new_folio; struct ufs_dir_entry *new_de; err = -ENOTEMPTY; @@ -278,10 +278,10 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out_dir; err = -ENOENT; - new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); + new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_folio); if (!new_de) goto out_dir; - ufs_set_link(new_dir, new_de, new_page, old_inode, 1); + ufs_set_link(new_dir, new_de, new_folio, old_inode, 1); inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); @@ -300,29 +300,24 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, */ inode_set_ctime_current(old_inode); - ufs_delete_entry(old_dir, old_de, old_page); + ufs_delete_entry(old_dir, old_de, old_folio); mark_inode_dirty(old_inode); if (dir_de) { if (old_dir != new_dir) - ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0); - else { - kunmap(dir_page); - put_page(dir_page); - } + ufs_set_link(old_inode, dir_de, dir_folio, new_dir, 0); + else + folio_release_kmap(dir_folio, dir_de); inode_dec_link_count(old_dir); } return 0; out_dir: - if (dir_de) { - kunmap(dir_page); - put_page(dir_page); - } + if (dir_de) + folio_release_kmap(dir_folio, dir_de); out_old: - kunmap(old_page); - put_page(old_page); + folio_release_kmap(old_folio, old_de); out: return err; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 6b499180643b..a2c762cb65a0 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -99,15 +99,17 @@ extern void ufs_put_cylinder (struct super_block *, unsigned); /* dir.c */ extern const struct inode_operations ufs_dir_inode_operations; -extern int ufs_add_link (struct dentry *, struct inode *); -extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *); -extern int ufs_make_empty(struct inode *, struct inode *); -extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **); -extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); -extern int ufs_empty_dir (struct inode *); -extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); -extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode, bool update_times); + +int ufs_add_link(struct dentry *, struct inode *); +ino_t ufs_inode_by_name(struct inode *, const struct qstr *); +int ufs_make_empty(struct inode *, struct inode *); +struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, + struct folio **); +int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct folio *); +int ufs_empty_dir(struct inode *); +struct ufs_dir_entry *ufs_dotdot(struct inode *, struct folio **); +void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct folio *folio, struct inode *inode, bool update_times); /* file.c */ extern const struct inode_operations ufs_file_inode_operations; diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 0ecd2ed792f5..bf708b68f150 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -250,9 +250,9 @@ ufs_set_inode_gid(struct super_block *sb, struct ufs_inode 
*inode, u32 value) } } -extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); -extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); -extern int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len); +dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); +void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); +int ufs_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); /* * These functions manipulate ufs buffers diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index fdb4da24d662..b780deb81b02 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -300,23 +300,23 @@ static int vboxsf_writepage(struct page *page, struct writeback_control *wbc) static int vboxsf_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; struct vboxsf_handle *sf_handle = file->private_data; - unsigned int from = pos & ~PAGE_MASK; + size_t from = offset_in_folio(folio, pos); u32 nwritten = len; u8 *buf; int err; - /* zero the stale part of the page if we did a short copy */ - if (!PageUptodate(page) && copied < len) - zero_user(page, from + copied, len - copied); + /* zero the stale part of the folio if we did a short copy */ + if (!folio_test_uptodate(folio) && copied < len) + folio_zero_range(folio, from + copied, len - copied); - buf = kmap(page); + buf = kmap(&folio->page); err = vboxsf_write(sf_handle->root, sf_handle->handle, pos, &nwritten, buf + from); - kunmap(page); + kunmap(&folio->page); if (err) { nwritten = 0; @@ -326,16 +326,16 @@ static int vboxsf_write_end(struct file *file, struct address_space *mapping, /* mtime changed */ VBOXSF_I(inode)->force_restat = 1; - if (!PageUptodate(page) && nwritten == PAGE_SIZE) - SetPageUptodate(page); + if (!folio_test_uptodate(folio) && nwritten == folio_size(folio)) + folio_mark_uptodate(folio); pos += nwritten; if (pos > inode->i_size) i_size_write(inode, pos); out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return nwritten; } @@ -343,7 +343,7 @@ out: /* * Note simple_write_begin does not read the page from disk on partial writes * this is ok since vboxsf_write_end only writes the written parts of the - * page and it does not call SetPageUptodate for partial writes. + * page and it does not call folio_mark_uptodate for partial writes. */ const struct address_space_operations vboxsf_reg_aops = { .read_folio = vboxsf_read_folio, diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 90c07573dd77..0302a4e506ec 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -17,6 +17,7 @@ #include <linux/cred.h> #include <linux/key.h> +#include <linux/security.h> #include <linux/slab.h> #include <linux/verification.h> @@ -41,7 +42,11 @@ static struct key *fsverity_keyring; * @sig_size: size of signature in bytes, or 0 if no signature * * If the file includes a signature of its fs-verity file digest, verify it - * against the certificates in the fs-verity keyring. + * against the certificates in the fs-verity keyring. Note that signatures + * are verified regardless of the state of the 'fsverity_require_signatures' + * variable and the LSM subsystem relies on this behavior to help enforce + * file integrity policies. Please discuss changes with the LSM list + * (thank you!). 
* * Return: 0 on success (signature valid or not required); -errno on failure */ @@ -106,6 +111,17 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return err; } + err = security_inode_setintegrity(inode, + LSM_INT_FSVERITY_BUILTINSIG_VALID, + signature, + sig_size); + + if (err) { + fsverity_err(inode, "Error %d exposing file signature to LSMs", + err); + return err; + } + return 0; } diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 7e80732cb547..5f0494702e0b 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -46,7 +46,7 @@ xfs_perag_get( struct xfs_perag *pag; rcu_read_lock(); - pag = radix_tree_lookup(&mp->m_perag_tree, agno); + pag = xa_load(&mp->m_perags, agno); if (pag) { trace_xfs_perag_get(pag, _RET_IP_); ASSERT(atomic_read(&pag->pag_ref) >= 0); @@ -56,31 +56,6 @@ xfs_perag_get( return pag; } -/* - * search from @first to find the next perag with the given tag set. - */ -struct xfs_perag * -xfs_perag_get_tag( - struct xfs_mount *mp, - xfs_agnumber_t first, - unsigned int tag) -{ - struct xfs_perag *pag; - int found; - - rcu_read_lock(); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, first, 1, tag); - if (found <= 0) { - rcu_read_unlock(); - return NULL; - } - trace_xfs_perag_get_tag(pag, _RET_IP_); - atomic_inc(&pag->pag_ref); - rcu_read_unlock(); - return pag; -} - /* Get a passive reference to the given perag. */ struct xfs_perag * xfs_perag_hold( @@ -117,7 +92,7 @@ xfs_perag_grab( struct xfs_perag *pag; rcu_read_lock(); - pag = radix_tree_lookup(&mp->m_perag_tree, agno); + pag = xa_load(&mp->m_perags, agno); if (pag) { trace_xfs_perag_grab(pag, _RET_IP_); if (!atomic_inc_not_zero(&pag->pag_active_ref)) @@ -127,32 +102,6 @@ xfs_perag_grab( return pag; } -/* - * search from @first to find the next perag with the given tag set. - */ -struct xfs_perag * -xfs_perag_grab_tag( - struct xfs_mount *mp, - xfs_agnumber_t first, - int tag) -{ - struct xfs_perag *pag; - int found; - - rcu_read_lock(); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, first, 1, tag); - if (found <= 0) { - rcu_read_unlock(); - return NULL; - } - trace_xfs_perag_grab_tag(pag, _RET_IP_); - if (!atomic_inc_not_zero(&pag->pag_active_ref)) - pag = NULL; - rcu_read_unlock(); - return pag; -} - void xfs_perag_rele( struct xfs_perag *pag) @@ -235,16 +184,6 @@ out: return error; } -STATIC void -__xfs_free_perag( - struct rcu_head *head) -{ - struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); - - ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); - kfree(pag); -} - /* * Free up the per-ag resources associated with the mount structure. 
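The perag hunks above replace the radix tree with an xarray: lookups become xa_load() under rcu_read_lock() with the reference taken before the lock drops, insertion no longer needs radix_tree_preload() or m_perag_lock because xa_insert() locks and allocates internally (reporting an occupied slot as -EBUSY), and teardown pairs xa_erase() with kfree_rcu_mightsleep() instead of an embedded rcu_head. A condensed sketch of the lookup and insert halves, abbreviated from the hunks above with error handling elided:

/* Lookup: RCU protects the xarray walk; the ref must outlive the lock. */
static struct xfs_perag *sketch_perag_get(struct xfs_mount *mp,
					  xfs_agnumber_t agno)
{
	struct xfs_perag *pag;

	rcu_read_lock();
	pag = xa_load(&mp->m_perags, agno);
	if (pag)
		atomic_inc(&pag->pag_ref);	/* before rcu_read_unlock() */
	rcu_read_unlock();
	return pag;
}

/* Insert: no preload, no external spinlock; -EBUSY means slot occupied. */
static int sketch_perag_insert(struct xfs_mount *mp, xfs_agnumber_t agno,
			       struct xfs_perag *pag)
{
	return xa_insert(&mp->m_perags, agno, pag, GFP_KERNEL);
}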
*/ @@ -256,9 +195,7 @@ xfs_free_perag( xfs_agnumber_t agno; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - spin_lock(&mp->m_perag_lock); - pag = radix_tree_delete(&mp->m_perag_tree, agno); - spin_unlock(&mp->m_perag_lock); + pag = xa_erase(&mp->m_perags, agno); ASSERT(pag); XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); xfs_defer_drain_free(&pag->pag_intents_drain); @@ -270,7 +207,7 @@ xfs_free_perag( xfs_perag_rele(pag); XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_active_ref) != 0); - call_rcu(&pag->rcu_head, __xfs_free_perag); + kfree_rcu_mightsleep(pag); } } @@ -347,9 +284,7 @@ xfs_free_unused_perag_range( xfs_agnumber_t index; for (index = agstart; index < agend; index++) { - spin_lock(&mp->m_perag_lock); - pag = radix_tree_delete(&mp->m_perag_tree, index); - spin_unlock(&mp->m_perag_lock); + pag = xa_erase(&mp->m_perags, index); if (!pag) break; xfs_buf_cache_destroy(&pag->pag_bcache); @@ -390,20 +325,11 @@ xfs_initialize_perag( pag->pag_agno = index; pag->pag_mount = mp; - error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL); - if (error) - goto out_free_pag; - - spin_lock(&mp->m_perag_lock); - if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { - WARN_ON_ONCE(1); - spin_unlock(&mp->m_perag_lock); - radix_tree_preload_end(); - error = -EEXIST; + error = xa_insert(&mp->m_perags, index, pag, GFP_KERNEL); + if (error) { + WARN_ON_ONCE(error == -EBUSY); goto out_free_pag; } - spin_unlock(&mp->m_perag_lock); - radix_tree_preload_end(); #ifdef __KERNEL__ /* Place kernel structure only init below this point. */ @@ -451,9 +377,7 @@ xfs_initialize_perag( out_remove_pag: xfs_defer_drain_free(&pag->pag_intents_drain); - spin_lock(&mp->m_perag_lock); - radix_tree_delete(&mp->m_perag_tree, index); - spin_unlock(&mp->m_perag_lock); + pag = xa_erase(&mp->m_perags, index); out_free_pag: kfree(pag); out_unwind_new_pags: diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 35de09a2516c..d9cccd093b60 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -63,9 +63,6 @@ struct xfs_perag { /* Blocks reserved for the reverse mapping btree. 
*/ struct xfs_ag_resv pag_rmapbt_resv; - /* for rcu-safe freeing */ - struct rcu_head rcu_head; - /* Precalculated geometry info */ xfs_agblock_t block_count; xfs_agblock_t min_block; @@ -156,15 +153,11 @@ void xfs_free_perag(struct xfs_mount *mp); /* Passive AG references */ struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); -struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, - unsigned int tag); struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag); void xfs_perag_put(struct xfs_perag *pag); /* Active AG references */ struct xfs_perag *xfs_perag_grab(struct xfs_mount *, xfs_agnumber_t); -struct xfs_perag *xfs_perag_grab_tag(struct xfs_mount *, xfs_agnumber_t, - int tag); void xfs_perag_rele(struct xfs_perag *pag); /* @@ -266,13 +259,6 @@ xfs_perag_next( (agno) = 0; \ for_each_perag_from((mp), (agno), (pag)) -#define for_each_perag_tag(mp, agno, pag, tag) \ - for ((agno) = 0, (pag) = xfs_perag_grab_tag((mp), 0, (tag)); \ - (pag) != NULL; \ - (agno) = (pag)->pag_agno + 1, \ - xfs_perag_rele(pag), \ - (pag) = xfs_perag_grab_tag((mp), (agno), (tag))) - static inline struct xfs_perag * xfs_perag_next_wrap( struct xfs_perag *pag, diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 585e98e87ef9..aada676eee51 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -569,11 +569,11 @@ xfs_allocbt_block_maxrecs( /* * Calculate number of records in an alloc btree block. */ -int +unsigned int xfs_allocbt_maxrecs( struct xfs_mount *mp, - int blocklen, - int leaf) + unsigned int blocklen, + bool leaf) { blocklen -= XFS_ALLOC_BLOCK_LEN(mp); return xfs_allocbt_block_maxrecs(blocklen, leaf); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 155b47f231ab..12647f9aaa6d 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -53,7 +53,8 @@ struct xfs_btree_cur *xfs_bnobt_init_cursor(struct xfs_mount *mp, struct xfs_btree_cur *xfs_cntbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_perag *pag); -extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); +unsigned int xfs_allocbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, unsigned long long len); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index b9e98950eb3d..6aaec1246c95 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -686,7 +686,7 @@ xfs_attr_shortform_bytesfit( */ if (!dp->i_forkoff && dp->i_df.if_bytes > xfs_default_attroffset(dp)) - dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + dsize = xfs_bmdr_space_calc(MINDBTPTRS); break; case XFS_DINODE_FMT_BTREE: /* @@ -700,7 +700,7 @@ xfs_attr_shortform_bytesfit( return 0; return dp->i_forkoff; } - dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot); + dsize = xfs_bmap_bmdr_space(dp->i_df.if_broot); break; } @@ -708,11 +708,11 @@ xfs_attr_shortform_bytesfit( * A data fork btree root must have space for at least * MINDBTPTRS key/ptr pairs if the data fork is small or empty. 
*/ - minforkoff = max_t(int64_t, dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); + minforkoff = max_t(int64_t, dsize, xfs_bmdr_space_calc(MINDBTPTRS)); minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 7df74c35d9f9..8090e8249116 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -79,9 +79,9 @@ xfs_bmap_compute_maxlevels( maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), whichfork); if (whichfork == XFS_DATA_FORK) - sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + sz = xfs_bmdr_space_calc(MINDBTPTRS); else - sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); + sz = xfs_bmdr_space_calc(MINABTPTRS); maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; @@ -102,8 +102,8 @@ xfs_bmap_compute_attr_offset( struct xfs_mount *mp) { if (mp->m_sb.sb_inodesize == 256) - return XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); - return XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); + return XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS); + return xfs_bmdr_space_calc(6 * MINABTPTRS); } STATIC int /* error */ @@ -298,7 +298,7 @@ xfs_check_block( prevp = NULL; for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { dmxr = mp->m_bmap_dmxr[0]; - keyp = XFS_BMBT_KEY_ADDR(mp, block, i); + keyp = xfs_bmbt_key_addr(mp, block, i); if (prevp) { ASSERT(be64_to_cpu(prevp->br_startoff) < @@ -310,15 +310,15 @@ xfs_check_block( * Compare the block numbers to see if there are dups. */ if (root) - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); + pp = xfs_bmap_broot_ptr_addr(mp, block, i, sz); else - pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); + pp = xfs_bmbt_ptr_addr(mp, block, i, dmxr); for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { if (root) - thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); + thispa = xfs_bmap_broot_ptr_addr(mp, block, j, sz); else - thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); + thispa = xfs_bmbt_ptr_addr(mp, block, j, dmxr); if (*thispa == *pp) { xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld", __func__, j, i, @@ -373,7 +373,7 @@ xfs_bmap_check_leaf_extents( level = be16_to_cpu(block->bb_level); ASSERT(level > 0); xfs_check_block(block, mp, 1, ifp->if_broot_bytes); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + pp = xfs_bmap_broot_ptr_addr(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); ASSERT(bno != NULLFSBLOCK); @@ -406,7 +406,7 @@ xfs_bmap_check_leaf_extents( */ xfs_check_block(block, mp, 0, 0); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + pp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) { xfs_btree_mark_sick(cur); @@ -446,14 +446,14 @@ xfs_bmap_check_leaf_extents( * conform with the first entry in this one. 
*/ - ep = XFS_BMBT_REC_ADDR(mp, block, 1); + ep = xfs_bmbt_rec_addr(mp, block, 1); if (i) { ASSERT(xfs_bmbt_disk_get_startoff(&last) + xfs_bmbt_disk_get_blockcount(&last) <= xfs_bmbt_disk_get_startoff(ep)); } for (j = 1; j < num_recs; j++) { - nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); + nextp = xfs_bmbt_rec_addr(mp, block, j + 1); ASSERT(xfs_bmbt_disk_get_startoff(ep) + xfs_bmbt_disk_get_blockcount(ep) <= xfs_bmbt_disk_get_startoff(nextp)); @@ -584,9 +584,9 @@ xfs_bmap_btree_to_extents( ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); - ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false) == 1); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); + pp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); #ifdef DEBUG if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_verify_fsbno(mp, cbno))) { @@ -714,7 +714,7 @@ xfs_bmap_extents_to_btree( for_each_xfs_iext(ifp, &icur, &rec) { if (isnullstartblock(rec.br_startblock)) continue; - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1 + cnt); + arp = xfs_bmbt_rec_addr(mp, ablock, 1 + cnt); xfs_bmbt_disk_set_all(arp, &rec); cnt++; } @@ -724,10 +724,10 @@ xfs_bmap_extents_to_btree( /* * Fill in the root key and pointer. */ - kp = XFS_BMBT_KEY_ADDR(mp, block, 1); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + kp = xfs_bmbt_key_addr(mp, block, 1); + arp = xfs_bmbt_rec_addr(mp, ablock, 1); kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, + pp = xfs_bmbt_ptr_addr(mp, block, 1, xfs_bmbt_get_maxrecs(cur, be16_to_cpu(block->bb_level))); *pp = cpu_to_be64(args.fsbno); @@ -896,7 +896,7 @@ xfs_bmap_add_attrfork_btree( mp = ip->i_mount; - if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip)) + if (xfs_bmap_bmdr_space(block) <= xfs_inode_data_fork_size(ip)) *flags |= XFS_ILOG_DBROOT; else { cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); @@ -1160,7 +1160,7 @@ xfs_iread_bmbt_block( } /* Copy records into the incore cache. */ - frp = XFS_BMBT_REC_ADDR(mp, block, 1); + frp = xfs_bmbt_rec_addr(mp, block, 1); for (j = 0; j < num_recs; j++, frp++, ir->loaded++) { struct xfs_bmbt_irec new; xfs_failaddr_t fa; @@ -3112,6 +3112,23 @@ xfs_bmap_extsize_align( return 0; } +static inline bool +xfs_bmap_adjacent_valid( + struct xfs_bmalloca *ap, + xfs_fsblock_t x, + xfs_fsblock_t y) +{ + struct xfs_mount *mp = ap->ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ap->ip) && + (ap->datatype & XFS_ALLOC_USERDATA)) + return x < mp->m_sb.sb_rblocks; + + return XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && + XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && + XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks; +} + #define XFS_ALLOC_GAP_UNITS 4 /* returns true if ap->blkno was modified */ @@ -3119,36 +3136,25 @@ bool xfs_bmap_adjacent( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { - xfs_fsblock_t adjust; /* adjustment to block numbers */ - xfs_mount_t *mp; /* mount point structure */ - int rt; /* true if inode is realtime */ - -#define ISVALID(x,y) \ - (rt ? 
\ - (x) < mp->m_sb.sb_rblocks : \ - XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \ - XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \ - XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) - - mp = ap->ip->i_mount; - rt = XFS_IS_REALTIME_INODE(ap->ip) && - (ap->datatype & XFS_ALLOC_USERDATA); + xfs_fsblock_t adjust; /* adjustment to block numbers */ + /* * If allocating at eof, and there's a previous real block, * try to use its last block as our starting point. */ if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && !isnullstartblock(ap->prev.br_startblock) && - ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, - ap->prev.br_startblock)) { + xfs_bmap_adjacent_valid(ap, + ap->prev.br_startblock + ap->prev.br_blockcount, + ap->prev.br_startblock)) { ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; /* * Adjust for the gap between prevp and us. */ adjust = ap->offset - (ap->prev.br_startoff + ap->prev.br_blockcount); - if (adjust && - ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) + if (adjust && xfs_bmap_adjacent_valid(ap, ap->blkno + adjust, + ap->prev.br_startblock)) ap->blkno += adjust; return true; } @@ -3171,7 +3177,8 @@ xfs_bmap_adjacent( !isnullstartblock(ap->prev.br_startblock) && (prevbno = ap->prev.br_startblock + ap->prev.br_blockcount) && - ISVALID(prevbno, ap->prev.br_startblock)) { + xfs_bmap_adjacent_valid(ap, prevbno, + ap->prev.br_startblock)) { /* * Calculate gap to end of previous block. */ @@ -3187,8 +3194,8 @@ xfs_bmap_adjacent( * number, then just use the end of the previous block. */ if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && - ISVALID(prevbno + prevdiff, - ap->prev.br_startblock)) + xfs_bmap_adjacent_valid(ap, prevbno + prevdiff, + ap->prev.br_startblock)) prevbno += adjust; else prevdiff += adjust; @@ -3220,9 +3227,11 @@ xfs_bmap_adjacent( * offset by our length. 
*/ if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && - ISVALID(gotbno - gotdiff, gotbno)) + xfs_bmap_adjacent_valid(ap, gotbno - gotdiff, + gotbno)) gotbno -= adjust; - else if (ISVALID(gotbno - ap->length, gotbno)) { + else if (xfs_bmap_adjacent_valid(ap, gotbno - ap->length, + gotbno)) { gotbno -= ap->length; gotdiff += adjust - ap->length; } else @@ -3250,7 +3259,7 @@ xfs_bmap_adjacent( return true; } } -#undef ISVALID + return false; } @@ -4847,6 +4856,7 @@ xfs_bmapi_remap( } ip->i_nblocks += len; + ip->i_delayed_blks -= len; /* see xfs_bmap_defer_add */ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (ifp->if_format == XFS_DINODE_FMT_BTREE) @@ -5376,7 +5386,8 @@ xfs_bmap_del_extent_real( */ if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) { tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED; - xfs_rtbitmap_lock(tp, mp); + xfs_rtbitmap_lock(mp); + xfs_rtbitmap_trans_join(tp); } error = xfs_rtfree_blocks(tp, del->br_startblock, del->br_blockcount); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index d1b06ccde19e..3464be771f95 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -65,10 +65,10 @@ xfs_bmdr_to_bmbt( ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; dmxr = xfs_bmdr_maxrecs(dblocklen, 0); - fkp = XFS_BMDR_KEY_ADDR(dblock, 1); - tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); - fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); - tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); + fkp = xfs_bmdr_key_addr(dblock, 1); + tkp = xfs_bmbt_key_addr(mp, rblock, 1); + fpp = xfs_bmdr_ptr_addr(dblock, 1, dmxr); + tpp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, rblocklen); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); memcpy(tpp, fpp, sizeof(*fpp) * dmxr); @@ -168,10 +168,10 @@ xfs_bmbt_to_bmdr( dblock->bb_level = rblock->bb_level; dblock->bb_numrecs = rblock->bb_numrecs; dmxr = xfs_bmdr_maxrecs(dblocklen, 0); - fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); - tkp = XFS_BMDR_KEY_ADDR(dblock, 1); - fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); - tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); + fkp = xfs_bmbt_key_addr(mp, rblock, 1); + tkp = xfs_bmdr_key_addr(dblock, 1); + fpp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, rblocklen); + tpp = xfs_bmdr_ptr_addr(dblock, 1, dmxr); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); memcpy(tpp, fpp, sizeof(*fpp) * dmxr); @@ -645,13 +645,13 @@ xfs_bmbt_commit_staged_btree( /* * Calculate number of records in a bmap btree block. */ -int +unsigned int xfs_bmbt_maxrecs( struct xfs_mount *mp, - int blocklen, - int leaf) + unsigned int blocklen, + bool leaf) { - blocklen -= XFS_BMBT_BLOCK_LEN(mp); + blocklen -= xfs_bmbt_block_len(mp); return xfs_bmbt_block_maxrecs(blocklen, leaf); } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index de1b73f1225c..49a3bae3f6ec 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -14,70 +14,6 @@ struct xfs_trans; struct xbtree_ifakeroot; /* - * Btree block header size depends on a superblock flag. - */ -#define XFS_BMBT_BLOCK_LEN(mp) \ - (xfs_has_crc(((mp))) ? 
\ - XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) - -#define XFS_BMBT_REC_ADDR(mp, block, index) \ - ((xfs_bmbt_rec_t *) \ - ((char *)(block) + \ - XFS_BMBT_BLOCK_LEN(mp) + \ - ((index) - 1) * sizeof(xfs_bmbt_rec_t))) - -#define XFS_BMBT_KEY_ADDR(mp, block, index) \ - ((xfs_bmbt_key_t *) \ - ((char *)(block) + \ - XFS_BMBT_BLOCK_LEN(mp) + \ - ((index) - 1) * sizeof(xfs_bmbt_key_t))) - -#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \ - ((xfs_bmbt_ptr_t *) \ - ((char *)(block) + \ - XFS_BMBT_BLOCK_LEN(mp) + \ - (maxrecs) * sizeof(xfs_bmbt_key_t) + \ - ((index) - 1) * sizeof(xfs_bmbt_ptr_t))) - -#define XFS_BMDR_REC_ADDR(block, index) \ - ((xfs_bmdr_rec_t *) \ - ((char *)(block) + \ - sizeof(struct xfs_bmdr_block) + \ - ((index) - 1) * sizeof(xfs_bmdr_rec_t))) - -#define XFS_BMDR_KEY_ADDR(block, index) \ - ((xfs_bmdr_key_t *) \ - ((char *)(block) + \ - sizeof(struct xfs_bmdr_block) + \ - ((index) - 1) * sizeof(xfs_bmdr_key_t))) - -#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \ - ((xfs_bmdr_ptr_t *) \ - ((char *)(block) + \ - sizeof(struct xfs_bmdr_block) + \ - (maxrecs) * sizeof(xfs_bmdr_key_t) + \ - ((index) - 1) * sizeof(xfs_bmdr_ptr_t))) - -/* - * These are to be used when we know the size of the block and - * we don't have a cursor. - */ -#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ - XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) - -#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \ - (int)(XFS_BMBT_BLOCK_LEN(mp) + \ - ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) - -#define XFS_BMAP_BROOT_SPACE(mp, bb) \ - (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs))) -#define XFS_BMDR_SPACE_CALC(nrecs) \ - (int)(sizeof(xfs_bmdr_block_t) + \ - ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) -#define XFS_BMAP_BMDR_SPACE(bb) \ - (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) - -/* * Maximum number of bmap btree levels. */ #define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)]) @@ -99,7 +35,8 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(int blocklen, int leaf); -extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); +unsigned int xfs_bmbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_ino_t new_owner, @@ -121,4 +58,144 @@ void xfs_bmbt_destroy_cur_cache(void); void xfs_bmbt_init_block(struct xfs_inode *ip, struct xfs_btree_block *buf, struct xfs_buf *bp, __u16 level, __u16 numrecs); +/* + * Btree block header size depends on a superblock flag. + */ +static inline size_t +xfs_bmbt_block_len(struct xfs_mount *mp) +{ + return xfs_has_crc(mp) ? + XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN; +} + +/* Addresses of key, pointers, and records within an incore bmbt block. 
*/
+
+static inline struct xfs_bmbt_rec *
+xfs_bmbt_rec_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_bmbt_rec *)
+		((char *)block + xfs_bmbt_block_len(mp) +
+		 (index - 1) * sizeof(struct xfs_bmbt_rec));
+}
+
+static inline struct xfs_bmbt_key *
+xfs_bmbt_key_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_bmbt_key *)
+		((char *)block + xfs_bmbt_block_len(mp) +
+		 (index - 1) * sizeof(struct xfs_bmbt_key));
+}
+
+static inline xfs_bmbt_ptr_t *
+xfs_bmbt_ptr_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	return (xfs_bmbt_ptr_t *)
+		((char *)block + xfs_bmbt_block_len(mp) +
+		 maxrecs * sizeof(struct xfs_bmbt_key) +
+		 (index - 1) * sizeof(xfs_bmbt_ptr_t));
+}
+
+/* Addresses of key, pointers, and records within an ondisk bmbt block. */
+
+static inline struct xfs_bmbt_rec *
+xfs_bmdr_rec_addr(
+	struct xfs_bmdr_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_bmbt_rec *)
+		((char *)(block + 1) +
+		 (index - 1) * sizeof(struct xfs_bmbt_rec));
+}
+
+static inline struct xfs_bmbt_key *
+xfs_bmdr_key_addr(
+	struct xfs_bmdr_block	*block,
+	unsigned int		index)
+{
+	return (struct xfs_bmbt_key *)
+		((char *)(block + 1) +
+		 (index - 1) * sizeof(struct xfs_bmbt_key));
+}
+
+static inline xfs_bmbt_ptr_t *
+xfs_bmdr_ptr_addr(
+	struct xfs_bmdr_block	*block,
+	unsigned int		index,
+	unsigned int		maxrecs)
+{
+	return (xfs_bmbt_ptr_t *)
+		((char *)(block + 1) +
+		 maxrecs * sizeof(struct xfs_bmbt_key) +
+		 (index - 1) * sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+static inline xfs_bmbt_ptr_t *
+xfs_bmap_broot_ptr_addr(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*bb,
+	unsigned int		i,
+	unsigned int		sz)
+{
+	return xfs_bmbt_ptr_addr(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, false));
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records.
+ */
+static inline size_t
+xfs_bmap_broot_space_calc(
+	struct xfs_mount	*mp,
+	unsigned int		nrecs)
+{
+	return xfs_bmbt_block_len(mp) +
+	       (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t)));
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block.
+ */
+static inline size_t
+xfs_bmap_broot_space(
+	struct xfs_mount	*mp,
+	struct xfs_bmdr_block	*bb)
+{
+	return xfs_bmap_broot_space_calc(mp, be16_to_cpu(bb->bb_numrecs));
+}
+
+/* Compute the space required for the ondisk root block. */
+static inline size_t
+xfs_bmdr_space_calc(unsigned int nrecs)
+{
+	return sizeof(struct xfs_bmdr_block) +
+	       (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t)));
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block.
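
For scale, a worked note on the helpers above (a sketch: the 8-byte figures assume xfs_bmbt_key and xfs_bmbt_ptr_t each wrap a single __be64, which is not shown in this hunk):

	/*
	 * On a V5 (crc-enabled) filesystem xfs_bmbt_block_len(mp) is
	 * XFS_BTREE_LBLOCK_CRC_LEN, so an incore root holding n records costs
	 *	xfs_bmap_broot_space_calc(mp, n) == block_len + n * (8 + 8)
	 * bytes.  The index arguments of the *_addr() helpers are 1-based,
	 * hence the (index - 1) in the offset arithmetic.
	 */
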
+ */ +static inline size_t +xfs_bmap_bmdr_space(struct xfs_btree_block *bb) +{ + return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs)); +} + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 40021849b42f..2cd212ad2c1d 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -28,7 +28,6 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" -#include "xfs_trans_priv.h" #include "xfs_exchmaps.h" static struct kmem_cache *xfs_defer_pending_cache; diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 454b63ef7201..860284064c5a 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -8,6 +8,7 @@ /* * SGI's XFS filesystem's major stuff (constants, structures) + * NOTE: This file must be compile-able with C++ compilers. */ /* @@ -826,6 +827,30 @@ struct xfs_exchange_range { }; /* + * Using the same definition of file2 as struct xfs_exchange_range, commit the + * contents of file1 into file2 if file2 has the same inode number, mtime, and + * ctime as the arguments provided to the call. The old contents of file2 will + * be moved to file1. + * + * Returns -EBUSY if there isn't an exact match for the file2 fields. + * + * Filesystems must be able to restart and complete the operation even after + * the system goes down. + */ +struct xfs_commit_range { + __s32 file1_fd; + __u32 pad; /* must be zeroes */ + __u64 file1_offset; /* file1 offset, bytes */ + __u64 file2_offset; /* file2 offset, bytes */ + __u64 length; /* bytes to exchange */ + + __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */ + + /* opaque file2 metadata for freshness checks */ + __u64 file2_freshness[6]; +}; + +/* * Exchange file data all the way to the ends of both files, and then exchange * the file sizes. This flag can be used to replace a file's contents with a * different amount of data. length will be ignored. @@ -906,13 +931,13 @@ static inline struct xfs_getparents_rec * xfs_getparents_next_rec(struct xfs_getparents *gp, struct xfs_getparents_rec *gpr) { - void *next = ((void *)gpr + gpr->gpr_reclen); + void *next = ((char *)gpr + gpr->gpr_reclen); void *end = (void *)(uintptr_t)(gp->gp_buffer + gp->gp_bufsize); if (next >= end) return NULL; - return next; + return (struct xfs_getparents_rec *)next; } /* Iterate through this file handle's directory parent pointers. */ @@ -997,6 +1022,8 @@ struct xfs_getparents_by_handle { #define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req) #define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req) #define XFS_IOC_EXCHANGE_RANGE _IOW ('X', 129, struct xfs_exchange_range) +#define XFS_IOC_START_COMMIT _IOR ('X', 130, struct xfs_commit_range) +#define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 0af5b7a33d05..20bb5ce38134 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1855,11 +1855,12 @@ out_release: int xfs_dialloc( struct xfs_trans **tpp, - xfs_ino_t parent, - umode_t mode, + const struct xfs_icreate_args *args, xfs_ino_t *new_ino) { struct xfs_mount *mp = (*tpp)->t_mountp; + xfs_ino_t parent = args->pip ? args->pip->i_ino : 0; + umode_t mode = args->mode & S_IFMT; xfs_agnumber_t agno; int error = 0; xfs_agnumber_t start_agno; @@ -2947,8 +2948,8 @@ xfs_ialloc_setup_geometry( /* Compute inode btree geometry. 
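
A hedged userspace sketch of the two new ioctls defined above (descriptor names and error handling are illustrative only; the freshness blob is opaque to userspace and is filled in by XFS_IOC_START_COMMIT, then checked by XFS_IOC_COMMIT_RANGE, which fails with EBUSY if file2 changed in between):

	#include <stdio.h>
	#include <errno.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs_fs.h>

	struct xfs_commit_range	xcr = {
		.file1_fd	= fd1,		/* staging file */
		.file1_offset	= 0,
		.file2_offset	= 0,
		.length		= len,
	};

	/* kernel snapshots file2's ino/mtime/ctime into xcr.file2_freshness */
	if (ioctl(fd2, XFS_IOC_START_COMMIT, &xcr) < 0)
		return -1;

	/* ... stage the replacement contents in the file behind fd1 ... */

	/* exchange the ranges only if file2 is still unchanged */
	if (ioctl(fd2, XFS_IOC_COMMIT_RANGE, &xcr) < 0 && errno == EBUSY)
		fprintf(stderr, "file2 changed; commit refused\n");
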
*/
 	igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
 
-	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
-	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true);
+	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false);
 	igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
 	igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;
 
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index b549627e3a61..3a1323155a45 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -33,11 +33,13 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
 	return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog);
 }
 
+struct xfs_icreate_args;
+
 /*
  * Allocate an inode on disk.  Mode is used to tell whether the new inode will
  * need space, and whether it is a directory.
  */
-int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode,
+int xfs_dialloc(struct xfs_trans **tpp, const struct xfs_icreate_args *args,
 		xfs_ino_t *new_ino);
 
 int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 496e2f72a85b..401b42d52af6 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -572,11 +572,11 @@ xfs_inobt_block_maxrecs(
 /*
  * Calculate number of records in an inobt btree block.
  */
-int
+unsigned int
 xfs_inobt_maxrecs(
 	struct xfs_mount	*mp,
-	int			blocklen,
-	int			leaf)
+	unsigned int		blocklen,
+	bool			leaf)
 {
 	blocklen -= XFS_INOBT_BLOCK_LEN(mp);
 	return xfs_inobt_block_maxrecs(blocklen, leaf);
@@ -749,7 +749,7 @@ xfs_finobt_count_blocks(
 	if (error)
 		return error;
 
-	cur = xfs_inobt_init_cursor(pag, tp, agbp);
+	cur = xfs_finobt_init_cursor(pag, tp, agbp);
 	error = xfs_btree_count_blocks(cur, tree_blocks);
 	xfs_btree_del_cursor(cur, error);
 	xfs_trans_brelse(tp, agbp);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 6472ec1ecbb4..300edf5bc009 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -50,7 +50,8 @@ struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
 		struct xfs_trans *tp, struct xfs_buf *agbp);
 struct xfs_btree_cur *xfs_finobt_init_cursor(struct xfs_perag *pag,
 		struct xfs_trans *tp, struct xfs_buf *agbp);
-extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+unsigned int xfs_inobt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
 
 /* ir_holemask to inode allocation bitmap conversion */
 uint64_t xfs_inobt_irec_to_allocmask(const struct xfs_inobt_rec_incore *irec);
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 513b50da6215..79babeac9d75 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -514,12 +514,18 @@ xfs_dinode_verify(
 		return __this_address;
 	}
 
-	if (dip->di_version > 1) {
+	/*
+	 * Historical note: xfsprogs in the 3.2 era set up its incore inodes to
+	 * have di_nlink track the link count, even if the actual filesystem
+	 * only supported V1 inodes (i.e. di_onlink). When writing out the
+	 * ondisk inode, it would set both the ondisk di_nlink and di_onlink to
+	 * the incore di_nlink value, which is why we cannot check for
+	 * di_nlink==0 on a V1 inode. V2/3 inodes would get written out with
+	 * di_onlink==0, so we can check that.
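
The new xfs_dialloc() signature above reads only two fields of the args structure (args->pip for the parent allocation hint, args->mode for the S_IFMT bits); a minimal caller sketch under that assumption — the full struct xfs_icreate_args, which is not shown in this diff, carries more state:

	struct xfs_icreate_args	args = {
		.pip	= dp,			/* parent inode; may be NULL */
		.mode	= S_IFREG | 0644,	/* only S_IFMT matters here */
	};
	xfs_ino_t		ino;
	int			error;

	error = xfs_dialloc(&tp, &args, &ino);
	if (error)
		goto out_trans_cancel;
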
+ */ + if (dip->di_version >= 2) { if (dip->di_onlink) return __this_address; - } else { - if (dip->di_nlink) - return __this_address; } /* don't allow invalid i_size */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9d11ae015909..1158ca48626b 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -185,7 +185,7 @@ xfs_iformat_btree( ifp = xfs_ifork_ptr(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); - size = XFS_BMAP_BROOT_SPACE(mp, dfp); + size = xfs_bmap_broot_space(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); level = be16_to_cpu(dfp->bb_level); @@ -198,7 +198,7 @@ xfs_iformat_btree( */ if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) || nrecs == 0 || - XFS_BMDR_SPACE_CALC(nrecs) > + xfs_bmdr_space_calc(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || ifp->if_nextents > ip->i_nblocks) || level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) { @@ -409,7 +409,7 @@ xfs_iroot_realloc( * allocate it now and get out. */ if (ifp->if_broot_bytes == 0) { - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); + new_size = xfs_bmap_broot_space_calc(mp, rec_diff); ifp->if_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); ifp->if_broot_bytes = (int)new_size; @@ -422,17 +422,17 @@ xfs_iroot_realloc( * location. The records don't change location because * they are kept butted up against the btree block header. */ - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false); new_max = cur_max + rec_diff; - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); + new_size = xfs_bmap_broot_space_calc(mp, new_max); ifp->if_broot = krealloc(ifp->if_broot, new_size, GFP_KERNEL | __GFP_NOFAIL); - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, (int)new_size); ifp->if_broot_bytes = (int)new_size; - ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= xfs_inode_fork_size(ip, whichfork)); memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); return; @@ -444,11 +444,11 @@ xfs_iroot_realloc( * records, just get rid of the root and clear the status bit. */ ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false); new_max = cur_max + rec_diff; ASSERT(new_max >= 0); if (new_max > 0) - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); + new_size = xfs_bmap_broot_space_calc(mp, new_max); else new_size = 0; if (new_size > 0) { @@ -457,28 +457,28 @@ xfs_iroot_realloc( * First copy over the btree block header. */ memcpy(new_broot, ifp->if_broot, - XFS_BMBT_BLOCK_LEN(ip->i_mount)); + xfs_bmbt_block_len(ip->i_mount)); } else { new_broot = NULL; } /* - * Only copy the records and pointers if there are any. + * Only copy the keys and pointers if there are any. */ if (new_max > 0) { /* - * First copy the records. + * First copy the keys. 
*/ - op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); - np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); - memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); + op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1); + np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1); + memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t)); /* * Then copy the pointers. */ - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, + np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1, (int)new_size); memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); } @@ -486,7 +486,7 @@ xfs_iroot_realloc( ifp->if_broot = new_broot; ifp->if_broot_bytes = (int)new_size; if (ifp->if_broot) - ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= xfs_inode_fork_size(ip, whichfork)); return; } @@ -655,7 +655,7 @@ xfs_iflush_fork( if ((iip->ili_fields & brootflag[whichfork]) && (ifp->if_broot_bytes > 0)) { ASSERT(ifp->if_broot != NULL); - ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= xfs_inode_fork_size(ip, whichfork)); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 032333289113..cc38e1c3c3e1 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -308,7 +308,7 @@ xfs_inode_init( !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode))) inode->i_mode &= ~S_ISGID; - ip->i_projid = pip ? xfs_get_initial_prid(pip) : 0; + ip->i_projid = xfs_get_initial_prid(pip); } ip->i_disk_size = 0; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index cb3b1d42ae9a..795928d1a66d 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -417,9 +417,10 @@ xfs_refcountbt_block_maxrecs( /* * Calculate the number of records in a refcount btree block. */ -int +unsigned int xfs_refcountbt_maxrecs( - int blocklen, + struct xfs_mount *mp, + unsigned int blocklen, bool leaf) { blocklen -= XFS_REFCOUNT_BLOCK_LEN; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index 1e0ab25f6c68..beb93bef6a81 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -48,7 +48,8 @@ struct xbtree_afakeroot; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, struct xfs_perag *pag); -extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); +unsigned int xfs_refcountbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 56fd6c4bd8b4..ac2f1f499b76 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -731,10 +731,11 @@ xfs_rmapbt_block_maxrecs( /* * Calculate number of records in an rmap btree block. 
*/ -int +unsigned int xfs_rmapbt_maxrecs( - int blocklen, - int leaf) + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) { blocklen -= XFS_RMAP_BLOCK_LEN; return xfs_rmapbt_block_maxrecs(blocklen, leaf); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index eb90d89e8086..119b1567cd0e 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -47,7 +47,8 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_perag *pag); void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); -int xfs_rmapbt_maxrecs(int blocklen, int leaf); +unsigned int xfs_rmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 386b672c5058..27a4472402ba 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -13,6 +13,8 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" #include "xfs_trans.h" #include "xfs_rtalloc.h" #include "xfs_error.h" @@ -69,7 +71,7 @@ xfs_rtbuf_cache_relse( * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ -int +static int xfs_rtbuf_get( struct xfs_rtalloc_args *args, xfs_fileoff_t block, /* block number in bitmap or summary */ @@ -138,15 +140,43 @@ xfs_rtbuf_get( return 0; } +int +xfs_rtbitmap_read_buf( + struct xfs_rtalloc_args *args, + xfs_fileoff_t block) +{ + struct xfs_mount *mp = args->mp; + + if (XFS_IS_CORRUPT(mp, block >= mp->m_sb.sb_rbmblocks)) { + xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP); + return -EFSCORRUPTED; + } + + return xfs_rtbuf_get(args, block, 0); +} + +int +xfs_rtsummary_read_buf( + struct xfs_rtalloc_args *args, + xfs_fileoff_t block) +{ + struct xfs_mount *mp = args->mp; + + if (XFS_IS_CORRUPT(mp, block >= mp->m_rsumblocks)) { + xfs_rt_mark_sick(args->mp, XFS_SICK_RT_SUMMARY); + return -EFSCORRUPTED; + } + return xfs_rtbuf_get(args, block, 1); +} + /* - * Searching backward from start to limit, find the first block whose - * allocated/free state is different from start's. + * Searching backward from start find the first block whose allocated/free state + * is different from start's. */ int xfs_rtfind_back( struct xfs_rtalloc_args *args, xfs_rtxnum_t start, /* starting rtext to look at */ - xfs_rtxnum_t limit, /* last rtext to look at */ xfs_rtxnum_t *rtx) /* out: start rtext found */ { struct xfs_mount *mp = args->mp; @@ -175,7 +205,7 @@ xfs_rtfind_back( */ word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); - len = start - limit + 1; + len = start + 1; /* * Compute match value, based on the bit at start: if 1 (free) * then all-ones, else all-zeroes. @@ -316,6 +346,8 @@ xfs_rtfind_forw( xfs_rtword_t incore; unsigned int word; /* word number in the buffer */ + ASSERT(start <= limit); + /* * Compute and read in starting bitmap block for starting block. */ @@ -698,7 +730,7 @@ xfs_rtfree_range( * We need to find the beginning and end of the extent so we can * properly update the summary. 
*/ - error = xfs_rtfind_back(args, start, 0, &preblock); + error = xfs_rtfind_back(args, start, &preblock); if (error) { return error; } @@ -990,25 +1022,24 @@ xfs_rtfree_blocks( xfs_filblks_t rtlen) { struct xfs_mount *mp = tp->t_mountp; - xfs_rtxnum_t start; - xfs_filblks_t len; xfs_extlen_t mod; ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); - len = xfs_rtb_to_rtxrem(mp, rtlen, &mod); + mod = xfs_rtb_to_rtxoff(mp, rtlen); if (mod) { ASSERT(mod == 0); return -EIO; } - start = xfs_rtb_to_rtxrem(mp, rtbno, &mod); + mod = xfs_rtb_to_rtxoff(mp, rtbno); if (mod) { ASSERT(mod == 0); return -EIO; } - return xfs_rtfree_extent(tp, start, len); + return xfs_rtfree_extent(tp, xfs_rtb_to_rtx(mp, rtbno), + xfs_rtb_to_rtx(mp, rtlen)); } /* Find all the free records within a given range. */ @@ -1016,8 +1047,8 @@ int xfs_rtalloc_query_range( struct xfs_mount *mp, struct xfs_trans *tp, - const struct xfs_rtalloc_rec *low_rec, - const struct xfs_rtalloc_rec *high_rec, + xfs_rtxnum_t start, + xfs_rtxnum_t end, xfs_rtalloc_query_range_fn fn, void *priv) { @@ -1025,45 +1056,42 @@ xfs_rtalloc_query_range( .mp = mp, .tp = tp, }; - struct xfs_rtalloc_rec rec; - xfs_rtxnum_t rtstart; - xfs_rtxnum_t rtend; - xfs_rtxnum_t high_key; - int is_free; int error = 0; - if (low_rec->ar_startext > high_rec->ar_startext) + if (start > end) return -EINVAL; - if (low_rec->ar_startext >= mp->m_sb.sb_rextents || - low_rec->ar_startext == high_rec->ar_startext) + if (start == end || start >= mp->m_sb.sb_rextents) return 0; - high_key = min(high_rec->ar_startext, mp->m_sb.sb_rextents - 1); + end = min(end, mp->m_sb.sb_rextents - 1); /* Iterate the bitmap, looking for discrepancies. */ - rtstart = low_rec->ar_startext; - while (rtstart <= high_key) { + while (start <= end) { + struct xfs_rtalloc_rec rec; + int is_free; + xfs_rtxnum_t rtend; + /* Is the first block free? */ - error = xfs_rtcheck_range(&args, rtstart, 1, 1, &rtend, + error = xfs_rtcheck_range(&args, start, 1, 1, &rtend, &is_free); if (error) break; /* How long does the extent go for? */ - error = xfs_rtfind_forw(&args, rtstart, high_key, &rtend); + error = xfs_rtfind_forw(&args, start, end, &rtend); if (error) break; if (is_free) { - rec.ar_startext = rtstart; - rec.ar_extcount = rtend - rtstart + 1; + rec.ar_startext = start; + rec.ar_extcount = rtend - start + 1; error = fn(mp, tp, &rec, priv); if (error) break; } - rtstart = rtend + 1; + start = rtend + 1; } xfs_rtbuf_cache_relse(&args); @@ -1078,13 +1106,8 @@ xfs_rtalloc_query_all( xfs_rtalloc_query_range_fn fn, void *priv) { - struct xfs_rtalloc_rec keys[2]; - - keys[0].ar_startext = 0; - keys[1].ar_startext = mp->m_sb.sb_rextents - 1; - keys[0].ar_extcount = keys[1].ar_extcount = 0; - - return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv); + return xfs_rtalloc_query_range(mp, tp, 0, mp->m_sb.sb_rextents - 1, fn, + priv); } /* Is the given extent all free? */ @@ -1125,21 +1148,6 @@ xfs_rtbitmap_blockcount( return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize); } -/* - * Compute the number of rtbitmap words needed to populate every block of a - * bitmap that is large enough to track the given number of rt extents. - */ -unsigned long long -xfs_rtbitmap_wordcount( - struct xfs_mount *mp, - xfs_rtbxlen_t rtextents) -{ - xfs_filblks_t blocks; - - blocks = xfs_rtbitmap_blockcount(mp, rtextents); - return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; -} - /* Compute the number of rtsummary blocks needed to track the given rt space. 
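
With the xfs_rtalloc_rec keys gone from the interface, callers now pass plain start/end rtx numbers plus an xfs_rtalloc_query_range_fn callback. A sketch of a callback that totals free rt extents (the function and counter names are hypothetical):

	static int
	xfs_count_free_rtx(
		struct xfs_mount		*mp,
		struct xfs_trans		*tp,
		const struct xfs_rtalloc_rec	*rec,
		void				*priv)
	{
		*(xfs_rtbxlen_t *)priv += rec->ar_extcount;
		return 0;
	}

	/* a caller would then sum every free extent with: */
	error = xfs_rtalloc_query_range(mp, tp, 0, mp->m_sb.sb_rextents - 1,
			xfs_count_free_rtx, &freecount);
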
*/ xfs_filblks_t xfs_rtsummary_blockcount( @@ -1153,39 +1161,25 @@ xfs_rtsummary_blockcount( return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG); } -/* - * Compute the number of rtsummary info words needed to populate every block of - * a summary file that is large enough to track the given rt space. - */ -unsigned long long -xfs_rtsummary_wordcount( - struct xfs_mount *mp, - unsigned int rsumlevels, - xfs_extlen_t rbmblocks) +/* Lock both realtime free space metadata inodes for a freespace update. */ +void +xfs_rtbitmap_lock( + struct xfs_mount *mp) { - xfs_filblks_t blocks; - - blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks); - return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); } /* - * Lock both realtime free space metadata inodes for a freespace update. If a - * transaction is given, the inodes will be joined to the transaction and the + * Join both realtime free space metadata inodes to the transaction. The * ILOCKs will be released on transaction commit. */ void -xfs_rtbitmap_lock( - struct xfs_trans *tp, - struct xfs_mount *mp) +xfs_rtbitmap_trans_join( + struct xfs_trans *tp) { - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); - if (tp) - xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); - - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); - if (tp) - xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tp->t_mountp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tp->t_mountp->m_rsumip, XFS_ILOCK_EXCL); } /* Unlock both realtime free space metadata inodes after a freespace update. */ @@ -1225,3 +1219,127 @@ xfs_rtbitmap_unlock_shared( if (rbmlock_flags & XFS_RBMLOCK_BITMAP) xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); } + +static int +xfs_rtfile_alloc_blocks( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + struct xfs_bmbt_irec *map) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int nmap = 1; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, + XFS_GROWFSRT_SPACE_RES(mp, count_fsb), 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + goto out_trans_cancel; + + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_METADATA, 0, map, &nmap); + if (error) + goto out_trans_cancel; + + return xfs_trans_commit(tp); + +out_trans_cancel: + xfs_trans_cancel(tp); + return error; +} + +/* Get a buffer for the block. 
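
Separating the lock from the transaction join means callers without a transaction can still take both ILOCKs with xfs_rtbitmap_lock(); transactional callers follow the two-step pattern already visible in the xfs_bmap_del_extent_real() hunk earlier:

	xfs_rtbitmap_lock(mp);
	xfs_rtbitmap_trans_join(tp);	/* ILOCKs are released at commit */
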
*/ +static int +xfs_rtfile_initialize_block( + struct xfs_inode *ip, + xfs_fsblock_t fsbno, + void *data) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_buf *bp; + const size_t copylen = mp->m_blockwsize << XFS_WORDLOG; + enum xfs_blft buf_type; + int error; + + if (ip == mp->m_rsumip) + buf_type = XFS_BLFT_RTSUMMARY_BUF; + else + buf_type = XFS_BLFT_RTBITMAP_BUF; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, 0, 0, 0, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, 0, &bp); + if (error) { + xfs_trans_cancel(tp); + return error; + } + + xfs_trans_buf_set_type(tp, bp, buf_type); + bp->b_ops = &xfs_rtbuf_ops; + if (data) + memcpy(bp->b_addr, data, copylen); + else + memset(bp->b_addr, 0, copylen); + xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); + return xfs_trans_commit(tp); +} + +/* + * Allocate space to the bitmap or summary file, and zero it, for growfs. + * @data must be a contiguous buffer large enough to fill all blocks in the + * file; or NULL to initialize the contents to zeroes. + */ +int +xfs_rtfile_initialize_blocks( + struct xfs_inode *ip, /* inode (bitmap/summary) */ + xfs_fileoff_t offset_fsb, /* offset to start from */ + xfs_fileoff_t end_fsb, /* offset to allocate to */ + void *data) /* data to fill the blocks */ +{ + struct xfs_mount *mp = ip->i_mount; + const size_t copylen = mp->m_blockwsize << XFS_WORDLOG; + + while (offset_fsb < end_fsb) { + struct xfs_bmbt_irec map; + xfs_filblks_t i; + int error; + + error = xfs_rtfile_alloc_blocks(ip, offset_fsb, + end_fsb - offset_fsb, &map); + if (error) + return error; + + /* + * Now we need to clear the allocated blocks. + * + * Do this one block per transaction, to keep it simple. + */ + for (i = 0; i < map.br_blockcount; i++) { + error = xfs_rtfile_initialize_block(ip, + map.br_startblock + i, data); + if (error) + return error; + if (data) + data += copylen; + } + + offset_fsb = map.br_startoff + map.br_blockcount; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 6186585f2c37..140513d1d6bc 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -87,24 +87,6 @@ xfs_rtb_to_rtxoff( } /* - * Crack an rt block number into an rt extent number and an offset within that - * rt extent. Returns the rt extent number directly and the offset in @off. - */ -static inline xfs_rtxnum_t -xfs_rtb_to_rtxrem( - struct xfs_mount *mp, - xfs_rtblock_t rtbno, - xfs_extlen_t *off) -{ - if (likely(mp->m_rtxblklog >= 0)) { - *off = rtbno & mp->m_rtxblkmask; - return rtbno >> mp->m_rtxblklog; - } - - return div_u64_rem(rtbno, mp->m_sb.sb_rextsize, off); -} - -/* * Convert an rt block number into an rt extent number, rounding up to the next * rt extent if the rt block is not aligned to an rt extent boundary. 
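
The removed xfs_rtb_to_rtxrem() helper is replaced at its call sites by independent xfs_rtb_to_rtx()/xfs_rtb_to_rtxoff() calls; for a power-of-two sb_rextsize both reduce to a shift and a mask, per the deleted body above. A worked example with assumed values:

	/*
	 * sb_rextsize = 8 blocks, so m_rtxblklog = 3 and m_rtxblkmask = 7:
	 *   xfs_rtb_to_rtx(mp, 53)    == 53 >> 3 == 6	(rt extent number)
	 *   xfs_rtb_to_rtxoff(mp, 53) == 53 & 7  == 5	(offset within it)
	 */
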
*/ @@ -293,30 +275,12 @@ typedef int (*xfs_rtalloc_query_range_fn)( #ifdef CONFIG_XFS_RT void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args); - -int xfs_rtbuf_get(struct xfs_rtalloc_args *args, xfs_fileoff_t block, - int issum); - -static inline int -xfs_rtbitmap_read_buf( - struct xfs_rtalloc_args *args, - xfs_fileoff_t block) -{ - return xfs_rtbuf_get(args, block, 0); -} - -static inline int -xfs_rtsummary_read_buf( - struct xfs_rtalloc_args *args, - xfs_fileoff_t block) -{ - return xfs_rtbuf_get(args, block, 1); -} - +int xfs_rtbitmap_read_buf(struct xfs_rtalloc_args *args, xfs_fileoff_t block); +int xfs_rtsummary_read_buf(struct xfs_rtalloc_args *args, xfs_fileoff_t block); int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat); int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, - xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); + xfs_rtxnum_t *rtblock); int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, @@ -328,8 +292,7 @@ int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len); int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, - const struct xfs_rtalloc_rec *low_rec, - const struct xfs_rtalloc_rec *high_rec, + xfs_rtxnum_t start, xfs_rtxnum_t end, xfs_rtalloc_query_range_fn fn, void *priv); int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, @@ -353,16 +316,15 @@ int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); -unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, - xfs_rtbxlen_t rtextents); - xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp, unsigned int rsumlevels, xfs_extlen_t rbmblocks); -unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, - unsigned int rsumlevels, xfs_extlen_t rbmblocks); -void xfs_rtbitmap_lock(struct xfs_trans *tp, struct xfs_mount *mp); +int xfs_rtfile_initialize_blocks(struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb, void *data); + +void xfs_rtbitmap_lock(struct xfs_mount *mp); void xfs_rtbitmap_unlock(struct xfs_mount *mp); +void xfs_rtbitmap_trans_join(struct xfs_trans *tp); /* Lock the rt bitmap inode in shared mode */ #define XFS_RBMLOCK_BITMAP (1U << 0) @@ -388,10 +350,9 @@ xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) /* shut up gcc */ return 0; } -# define xfs_rtbitmap_wordcount(mp, r) (0) # define xfs_rtsummary_blockcount(mp, l, b) (0) -# define xfs_rtsummary_wordcount(mp, l, b) (0) -# define xfs_rtbitmap_lock(tp, mp) do { } while (0) +# define xfs_rtbitmap_lock(mp) do { } while (0) +# define xfs_rtbitmap_trans_join(tp) do { } while (0) # define xfs_rtbitmap_unlock(mp) do { } while (0) # define xfs_rtbitmap_lock_shared(mp, lf) do { } while (0) # define xfs_rtbitmap_unlock_shared(mp, lf) do { } while (0) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 6b56f0f6d4c1..d95409f3cba6 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -232,6 +232,38 @@ xfs_validate_sb_read( return 0; } +static uint64_t +xfs_sb_calc_rbmblocks( + struct xfs_sb *sbp) +{ + return howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize); +} + +/* Validate the 
realtime geometry */ +bool +xfs_validate_rt_geometry( + struct xfs_sb *sbp) +{ + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) + return false; + + if (sbp->sb_rblocks == 0) { + if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || + sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) + return false; + return true; + } + + if (sbp->sb_rextents == 0 || + sbp->sb_rextents != div_u64(sbp->sb_rblocks, sbp->sb_rextsize) || + sbp->sb_rextslog != xfs_compute_rextslog(sbp->sb_rextents) || + sbp->sb_rbmblocks != xfs_sb_calc_rbmblocks(sbp)) + return false; + + return true; +} + /* Check all the superblock fields we care about when writing one out. */ STATIC int xfs_validate_sb_write( @@ -491,39 +523,13 @@ xfs_validate_sb_common( } } - /* Validate the realtime geometry; stolen from xfs_repair */ - if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || - sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { + if (!xfs_validate_rt_geometry(sbp)) { xfs_notice(mp, - "realtime extent sanity check failed"); + "realtime %sgeometry check failed", + sbp->sb_rblocks ? "" : "zeroed "); return -EFSCORRUPTED; } - if (sbp->sb_rblocks == 0) { - if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || - sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) { - xfs_notice(mp, - "realtime zeroed geometry check failed"); - return -EFSCORRUPTED; - } - } else { - uint64_t rexts; - uint64_t rbmblocks; - - rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize); - rbmblocks = howmany_64(sbp->sb_rextents, - NBBY * sbp->sb_blocksize); - - if (!xfs_validate_rtextents(rexts) || - sbp->sb_rextents != rexts || - sbp->sb_rextslog != xfs_compute_rextslog(rexts) || - sbp->sb_rbmblocks != rbmblocks) { - xfs_notice(mp, - "realtime geometry sanity check failed"); - return -EFSCORRUPTED; - } - } - /* * Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign) * would imply the image is corrupted. 
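
A worked check of the constraints that xfs_validate_rt_geometry() enforces, using made-up but self-consistent values:

	/*
	 * sb_blocksize = 4096, sb_rextsize = 16, sb_rblocks = 1048576:
	 *   sb_rextents  = 1048576 / 16                 == 65536
	 *   sb_rbmblocks = howmany(65536, 8 * 4096)     == 2
	 *   sb_rextslog  = xfs_compute_rextslog(65536)  == 16
	 * Any other combination of these fields fails the check above.
	 */
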
@@ -959,6 +965,15 @@ const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .verify_write = xfs_sb_write_verify, }; +void +xfs_mount_sb_set_rextsize( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); + mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); +} + /* * xfs_mount_common * @@ -983,26 +998,25 @@ xfs_sb_mount_common( mp->m_blockmask = sbp->sb_blocksize - 1; mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; mp->m_blockwmask = mp->m_blockwsize - 1; - mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); - mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); + xfs_mount_sb_set_rextsize(mp, sbp); - mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; - mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; - mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1); - mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0); + mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; - mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true); - mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false); + mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true); + mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false); mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 37b1ed1bc209..885c83755991 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -17,6 +17,8 @@ extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); extern int xfs_sync_sb_buf(struct xfs_mount *mp); extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); +void xfs_mount_sb_set_rextsize(struct xfs_mount *mp, + struct xfs_sb *sbp); extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); @@ -38,6 +40,7 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp, bool xfs_validate_stripe_geometry(struct xfs_mount *mp, __s64 sunit, __s64 swidth, int sectorsize, bool may_repair, bool silent); +bool xfs_validate_rt_geometry(struct xfs_sb *sbp); uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 45aaf169806a..1a7f95bcf069 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -130,7 +130,7 @@ xfs_calc_inode_res( (4 * sizeof(struct xlog_op_header) + sizeof(struct xfs_inode_log_format) + mp->m_sb.sb_inodesize + - 2 * XFS_BMBT_BLOCK_LEN(mp)); + 2 * xfs_bmbt_block_len(mp)); } /* @@ -918,7 +918,7 @@ 
xfs_calc_growrtfree_reservation( return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + - xfs_calc_buf_res(1, mp->m_rsumsize); + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, mp->m_rsumblocks)); } /* diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 76eb9e328835..a8cd44d03ef6 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -235,16 +235,4 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off); bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off, xfs_fileoff_t len); -/* Do we support an rt volume having this number of rtextents? */ -static inline bool -xfs_validate_rtextents( - xfs_rtbxlen_t rtextents) -{ - /* No runt rt volumes */ - if (rtextents == 0) - return false; - - return true; -} - #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 24a15bf784f1..5ab2ac53c920 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -938,7 +938,13 @@ xchk_bmap( } break; case XFS_ATTR_FORK: - if (!xfs_has_attr(mp) && !xfs_has_attr2(mp)) + /* + * "attr" means that an attr fork was created at some point in + * the life of this filesystem. "attr2" means that inodes have + * variable-sized data/attr fork areas. Hence we only check + * attr here. + */ + if (!xfs_has_attr(mp)) xchk_ino_set_corrupt(sc, sc->ip->i_ino); break; default: diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index 1e656fab5e41..49dc38acc66b 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -480,7 +480,7 @@ xrep_bmap_iroot_size( { ASSERT(level > 0); - return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level); + return xfs_bmap_broot_space_calc(cur->bc_mp, nr_this_level); } /* Update the inode counters. 
*/ diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 3d5f1f6b4b7b..47148cc4a833 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -53,6 +53,11 @@ int xchk_checkpoint_log(struct xfs_mount *mp); bool xchk_should_check_xref(struct xfs_scrub *sc, int *error, struct xfs_btree_cur **curpp); +static inline int xchk_setup_nothing(struct xfs_scrub *sc) +{ + return -ENOENT; +} + /* Setup functions */ int xchk_setup_agheader(struct xfs_scrub *sc); int xchk_setup_fs(struct xfs_scrub *sc); @@ -72,16 +77,8 @@ int xchk_setup_dirtree(struct xfs_scrub *sc); int xchk_setup_rtbitmap(struct xfs_scrub *sc); int xchk_setup_rtsummary(struct xfs_scrub *sc); #else -static inline int -xchk_setup_rtbitmap(struct xfs_scrub *sc) -{ - return -ENOENT; -} -static inline int -xchk_setup_rtsummary(struct xfs_scrub *sc) -{ - return -ENOENT; -} +# define xchk_setup_rtbitmap xchk_setup_nothing +# define xchk_setup_rtsummary xchk_setup_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_ino_dqattach(struct xfs_scrub *sc); @@ -93,16 +90,8 @@ xchk_ino_dqattach(struct xfs_scrub *sc) { return 0; } -static inline int -xchk_setup_quota(struct xfs_scrub *sc) -{ - return -ENOENT; -} -static inline int -xchk_setup_quotacheck(struct xfs_scrub *sc) -{ - return -ENOENT; -} +# define xchk_setup_quota xchk_setup_nothing +# define xchk_setup_quotacheck xchk_setup_nothing #endif int xchk_setup_fscounters(struct xfs_scrub *sc); int xchk_setup_nlinks(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index daf9f1ee7c2c..3e45b9b72312 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -846,7 +846,7 @@ xrep_dinode_bad_bmbt_fork( nrecs = be16_to_cpu(dfp->bb_numrecs); level = be16_to_cpu(dfp->bb_level); - if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size) + if (nrecs == 0 || xfs_bmdr_space_calc(nrecs) > dfork_size) return true; if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) return true; @@ -858,12 +858,12 @@ xrep_dinode_bad_bmbt_fork( xfs_fileoff_t fileoff; xfs_fsblock_t fsbno; - fkp = XFS_BMDR_KEY_ADDR(dfp, i); + fkp = xfs_bmdr_key_addr(dfp, i); fileoff = be64_to_cpu(fkp->br_startoff); if (!xfs_verify_fileoff(sc->mp, fileoff)) return true; - fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr); + fpp = xfs_bmdr_ptr_addr(dfp, i, dmxr); fsbno = be64_to_cpu(*fpp); if (!xfs_verify_fsbno(sc->mp, fsbno)) return true; @@ -1121,7 +1121,7 @@ xrep_dinode_ensure_forkoff( struct xfs_bmdr_block *bmdr; struct xfs_scrub *sc = ri->sc; xfs_extnum_t attr_extents, data_extents; - size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1); + size_t bmdr_minsz = xfs_bmdr_space_calc(1); unsigned int lit_sz = XFS_LITINO(sc->mp); unsigned int afork_min, dfork_min; @@ -1173,7 +1173,7 @@ xrep_dinode_ensure_forkoff( case XFS_DINODE_FMT_BTREE: /* Must have space for btree header and key/pointers. */ bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); - afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + afork_min = xfs_bmap_broot_space(sc->mp, bmdr); break; default: /* We should never see any other formats. */ @@ -1223,7 +1223,7 @@ xrep_dinode_ensure_forkoff( case XFS_DINODE_FMT_BTREE: /* Must have space for btree header and key/pointers. 
*/ bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); - dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + dfork_min = xfs_bmap_broot_space(sc->mp, bmdr); break; default: dfork_min = 0; diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 3fee603f5244..7c7366c98338 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -63,7 +63,8 @@ xchk_setup_rtsummary( * us to avoid pinning kernel memory for this purpose. */ descr = xchk_xfile_descr(sc, "realtime summary file"); - error = xfile_create(descr, mp->m_rsumsize, &sc->xfile); + error = xfile_create(descr, XFS_FSB_TO_B(mp, mp->m_rsumblocks), + &sc->xfile); kfree(descr); if (error) return error; @@ -95,16 +96,14 @@ xchk_setup_rtsummary( * volume. Hence it is safe to compute and check the geometry values. */ if (mp->m_sb.sb_rblocks) { - xfs_filblks_t rsumblocks; int rextslog; rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); rextslog = xfs_compute_rextslog(rts->rextents); rts->rsumlevels = rextslog + 1; rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents); - rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels, + rts->rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels, rts->rbmblocks); - rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks); } return 0; } @@ -316,7 +315,7 @@ xchk_rtsummary( } /* Is m_rsumsize correct? */ - if (mp->m_rsumsize != rts->rsumsize) { + if (mp->m_rsumblocks != rts->rsumblocks) { xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); goto out_rbm; } @@ -332,7 +331,7 @@ xchk_rtsummary( * growfsrt expands the summary file before updating sb_rextents, so * the file can be larger than rsumsize. */ - if (mp->m_rsumip->i_disk_size < rts->rsumsize) { + if (mp->m_rsumip->i_disk_size < XFS_FSB_TO_B(mp, rts->rsumblocks)) { xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); goto out_rbm; } diff --git a/fs/xfs/scrub/rtsummary.h b/fs/xfs/scrub/rtsummary.h index e1d50304d8d4..e44b04cb6e2d 100644 --- a/fs/xfs/scrub/rtsummary.h +++ b/fs/xfs/scrub/rtsummary.h @@ -14,7 +14,7 @@ struct xchk_rtsummary { uint64_t rextents; uint64_t rbmblocks; - uint64_t rsumsize; + xfs_filblks_t rsumblocks; unsigned int rsumlevels; unsigned int resblks; diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c index d9e971c4c79f..7deeb948cb70 100644 --- a/fs/xfs/scrub/rtsummary_repair.c +++ b/fs/xfs/scrub/rtsummary_repair.c @@ -56,7 +56,7 @@ xrep_setup_rtsummary( * transaction (which we cannot drop because we cannot drop the * rtsummary ILOCK) and cannot ask for more reservation. */ - blocks = XFS_B_TO_FSB(mp, mp->m_rsumsize); + blocks = mp->m_rsumblocks; blocks += xfs_bmbt_calc_size(mp, blocks) * 2; if (blocks > UINT_MAX) return -EOPNOTSUPP; @@ -100,7 +100,6 @@ xrep_rtsummary( { struct xchk_rtsummary *rts = sc->buf; struct xfs_mount *mp = sc->mp; - xfs_filblks_t rsumblocks; int error; /* We require the rmapbt to rebuild anything. */ @@ -131,10 +130,9 @@ xrep_rtsummary( } /* Make sure we have space allocated for the entire summary file. */ - rsumblocks = XFS_B_TO_FSB(mp, rts->rsumsize); xfs_trans_ijoin(sc->tp, sc->ip, 0); xfs_trans_ijoin(sc->tp, sc->tempip, 0); - error = xrep_tempfile_prealloc(sc, 0, rsumblocks); + error = xrep_tempfile_prealloc(sc, 0, rts->rsumblocks); if (error) return error; @@ -143,11 +141,11 @@ xrep_rtsummary( return error; /* Copy the rtsummary file that we generated. 
*/ - error = xrep_tempfile_copyin(sc, 0, rsumblocks, + error = xrep_tempfile_copyin(sc, 0, rts->rsumblocks, xrep_rtsummary_prep_buf, rts); if (error) return error; - error = xrep_tempfile_set_isize(sc, rts->rsumsize); + error = xrep_tempfile_set_isize(sc, XFS_FSB_TO_B(mp, rts->rsumblocks)); if (error) return error; @@ -168,7 +166,7 @@ xrep_rtsummary( memset(mp->m_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks); mp->m_rsumlevels = rts->rsumlevels; - mp->m_rsumsize = rts->rsumsize; + mp->m_rsumblocks = rts->rsumblocks; /* Free the old rtsummary blocks if they're not in use. */ return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 1bc33f010d0e..5993fcaffb2c 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -231,6 +231,11 @@ xchk_should_terminate( return false; } +static inline int xchk_nothing(struct xfs_scrub *sc) +{ + return -ENOENT; +} + /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc); @@ -254,31 +259,15 @@ int xchk_dirtree(struct xfs_scrub *sc); int xchk_rtbitmap(struct xfs_scrub *sc); int xchk_rtsummary(struct xfs_scrub *sc); #else -static inline int -xchk_rtbitmap(struct xfs_scrub *sc) -{ - return -ENOENT; -} -static inline int -xchk_rtsummary(struct xfs_scrub *sc) -{ - return -ENOENT; -} +# define xchk_rtbitmap xchk_nothing +# define xchk_rtsummary xchk_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_quota(struct xfs_scrub *sc); int xchk_quotacheck(struct xfs_scrub *sc); #else -static inline int -xchk_quota(struct xfs_scrub *sc) -{ - return -ENOENT; -} -static inline int -xchk_quotacheck(struct xfs_scrub *sc) -{ - return -ENOENT; -} +# define xchk_quota xchk_nothing +# define xchk_quotacheck xchk_nothing #endif int xchk_fscounters(struct xfs_scrub *sc); int xchk_nlinks(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index d390d56cd875..177f922acfaf 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -88,7 +88,7 @@ xrep_tempfile_create( goto out_release_dquots; /* Allocate inode, set up directory. */ - error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); + error = xfs_dialloc(&tp, &args, &ino); if (error) goto out_trans_cancel; error = xfs_icreate(tp, ino, &args, &sc->tempip); diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index d848222f802b..9b5d98fe1f8a 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -293,7 +293,7 @@ xfile_get_folio( * (potentially last) reference in xfile_put_folio. */ if (flags & XFILE_ALLOC) - folio_set_dirty(folio); + folio_mark_dirty(folio); return folio; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e224b49b7cff..35a8c1b8b3cb 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -346,6 +346,17 @@ xfs_bmap_defer_add( trace_xfs_bmap_defer(bi); xfs_bmap_update_get_group(tp->t_mountp, bi); + + /* + * Ensure the deferred mapping is pre-recorded in i_delayed_blks. + * + * Otherwise stat can report zero blocks for an inode that actually has + * data when the entire mapping is in the process of being overwritten + * using the out of place write path. This is undone in xfs_bmapi_remap + * after it has incremented di_nblocks for a successful operation. 
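
The adjustment described above is one of four paired counter updates in this patch; summarizing them (see the xfs_bmapi_remap() hunk earlier and the two hunks that follow):

	/*
	 * XFS_BMAP_MAP intents vs. i_delayed_blks:
	 *   xfs_bmap_defer_add():           += br_blockcount  (pre-record)
	 *   xfs_bui_recover_work():         += br_blockcount  (log recovery)
	 *   xfs_bmapi_remap(), on success:  -= len  (and i_nblocks += len)
	 *   xfs_bmap_update_cancel_item():  -= br_blockcount  (intent dropped)
	 */
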
+	 */
+	if (bi->bi_type == XFS_BMAP_MAP)
+		bi->bi_owner->i_delayed_blks += bi->bi_bmap.br_blockcount;
 	xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
 }
 
@@ -367,6 +378,9 @@ xfs_bmap_update_cancel_item(
 {
 	struct xfs_bmap_intent		*bi = bi_entry(item);
 
+	if (bi->bi_type == XFS_BMAP_MAP)
+		bi->bi_owner->i_delayed_blks -= bi->bi_bmap.br_blockcount;
+
 	xfs_bmap_update_put_group(bi);
 	kmem_cache_free(xfs_bmap_intent_cache, bi);
 }
@@ -464,6 +478,9 @@ xfs_bui_recover_work(
 	bi->bi_owner = *ipp;
 	xfs_bmap_update_get_group(mp, bi);
 
+	/* see xfs_bmap_defer_add for details */
+	if (bi->bi_type == XFS_BMAP_MAP)
+		bi->bi_owner->i_delayed_blks += bi->bi_bmap.br_blockcount;
 	xfs_defer_add_item(dfp, &bi->bi_list);
 	return bi;
 }
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index fe2e2c930975..053d567c9108 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -331,8 +331,7 @@ xfs_getbmap(
 	}
 
 	if (xfs_get_extsz_hint(ip) ||
-	    (ip->i_diflags &
-	     (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
+	    (ip->i_diflags & XFS_DIFLAG_PREALLOC))
 		max_len = mp->m_super->s_maxbytes;
 	else
 		max_len = XFS_ISIZE(ip);
@@ -492,12 +491,12 @@ bool
 xfs_can_free_eofblocks(
 	struct xfs_inode	*ip)
 {
-	struct xfs_bmbt_irec	imap;
 	struct xfs_mount	*mp = ip->i_mount;
+	bool			found_blocks = false;
 	xfs_fileoff_t		end_fsb;
 	xfs_fileoff_t		last_fsb;
-	int			nimaps = 1;
-	int			error;
+	struct xfs_bmbt_irec	imap;
+	struct xfs_iext_cursor	icur;
 
 	/*
 	 * Caller must either hold the exclusive io lock; or be inactivating
@@ -524,12 +523,11 @@ xfs_can_free_eofblocks(
 		return false;
 
 	/*
-	 * Only free real extents for inodes with persistent preallocations or
-	 * the append-only flag.
+	 * Do not free real extents in preallocated files unless the file has
+	 * delalloc blocks and we are forced to remove them.
 	 */
-	if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
-		if (ip->i_delayed_blks == 0)
-			return false;
+	if ((ip->i_diflags & XFS_DIFLAG_PREALLOC) && !ip->i_delayed_blks)
+		return false;
 
 	/*
 	 * Do not try to free post-EOF blocks if EOF is beyond the end of the
@@ -544,21 +542,13 @@ xfs_can_free_eofblocks(
 		return false;
 
 	/*
-	 * Look up the mapping for the first block past EOF.  If we can't find
-	 * it, there's nothing to free.
+	 * Check if there is a post-EOF extent to free.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	error = xfs_bmapi_read(ip, end_fsb, last_fsb - end_fsb, &imap, &nimaps,
-			0);
+	if (xfs_iext_lookup_extent(ip, &ip->i_df, end_fsb, &icur, &imap))
+		found_blocks = true;
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-	if (error || nimaps == 0)
-		return false;
-
-	/*
-	 * If there's a real mapping there or there are delayed allocation
-	 * reservations, then we have post-EOF blocks to try to free.
-	 */
-	return imap.br_startblock != HOLESTARTBLOCK || ip->i_delayed_blks;
+	return found_blocks;
 }
 
 /*
@@ -653,6 +643,9 @@ xfs_alloc_file_space(
 	xfs_bmbt_irec_t		imaps[1], *imapp;
 	int			error;
 
+	if (xfs_is_always_cow_inode(ip))
+		return 0;
+
 	trace_xfs_alloc_file_space(ip);
 
 	if (xfs_is_shutdown(mp))
@@ -848,6 +841,14 @@ xfs_free_file_space(
 	if (len <= 0)	/* if nothing being freed */
 		return 0;
 
+	/*
+	 * Now that AIO and DIO have drained, we flush and (if necessary)
+	 * invalidate the cached range over the first operation we are about to run.
+ */ + error = xfs_flush_unmap_range(ip, offset, len); + if (error) + return error; + startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); @@ -1184,7 +1185,7 @@ xfs_swap_extents_check_format( */ if (tifp->if_format == XFS_DINODE_FMT_BTREE) { if (xfs_inode_has_attr_fork(ip) && - XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip)) + xfs_bmap_bmdr_space(tifp->if_broot) > xfs_inode_fork_boff(ip)) return -EINVAL; if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return -EINVAL; @@ -1193,7 +1194,7 @@ xfs_swap_extents_check_format( /* Reciprocal target->temp btree format checks */ if (ifp->if_format == XFS_DINODE_FMT_BTREE) { if (xfs_inode_has_attr_fork(tip) && - XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip)) + xfs_bmap_bmdr_space(ip->i_df.if_broot) > xfs_inode_fork_boff(tip)) return -EINVAL; if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return -EINVAL; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b1580644501f..209a389f2abc 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -210,7 +210,7 @@ struct xfs_buf { * success the write is considered to be failed permanently and the * iodone handler will take appropriate action. * - * For retry timeouts, we record the jiffie of the first failure. This + * For retry timeouts, we record the jiffy of the first failure. This * means that we can change the retry timeout for buffers already under * I/O and thus avoid getting stuck in a retry loop with a long timeout. * diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 6f0fc7fe1f2b..d8c4a5dcca7a 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -158,8 +158,7 @@ static int xfs_trim_gather_extents( struct xfs_perag *pag, struct xfs_trim_cur *tcur, - struct xfs_busy_extents *extents, - uint64_t *blocks_trimmed) + struct xfs_busy_extents *extents) { struct xfs_mount *mp = pag->pag_mount; struct xfs_trans *tp; @@ -280,7 +279,6 @@ xfs_trim_gather_extents( xfs_extent_busy_insert_discard(pag, fbno, flen, &extents->extent_list); - *blocks_trimmed += flen; next_extent: if (tcur->by_bno) error = xfs_btree_increment(cur, 0, &i); @@ -327,8 +325,7 @@ xfs_trim_perag_extents( struct xfs_perag *pag, xfs_agblock_t start, xfs_agblock_t end, - xfs_extlen_t minlen, - uint64_t *blocks_trimmed) + xfs_extlen_t minlen) { struct xfs_trim_cur tcur = { .start = start, @@ -354,8 +351,7 @@ xfs_trim_perag_extents( extents->owner = extents; INIT_LIST_HEAD(&extents->extent_list); - error = xfs_trim_gather_extents(pag, &tcur, extents, - blocks_trimmed); + error = xfs_trim_gather_extents(pag, &tcur, extents); if (error) { kfree(extents); break; @@ -389,8 +385,7 @@ xfs_trim_datadev_extents( struct xfs_mount *mp, xfs_daddr_t start, xfs_daddr_t end, - xfs_extlen_t minlen, - uint64_t *blocks_trimmed) + xfs_extlen_t minlen) { xfs_agnumber_t start_agno, end_agno; xfs_agblock_t start_agbno, end_agbno; @@ -411,8 +406,7 @@ xfs_trim_datadev_extents( if (start_agno == end_agno) agend = end_agbno; - error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen, - blocks_trimmed); + error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen); if (error) last_error = error; @@ -431,9 +425,6 @@ struct xfs_trim_rtdev { /* list of rt extents to free */ struct list_head extent_list; - /* pointer to count of blocks trimmed */ - uint64_t *blocks_trimmed; - /* minimum length that caller allows us to trim */ xfs_rtblock_t minlen_fsb; @@ -551,7 +542,6 @@ xfs_trim_gather_rtextent( busyp->length = rlen; 
INIT_LIST_HEAD(&busyp->list); list_add_tail(&busyp->list, &tr->extent_list); - *tr->blocks_trimmed += rlen; tr->restart_rtx = rec->ar_startext + rec->ar_extcount; return 0; @@ -562,15 +552,12 @@ xfs_trim_rtdev_extents( struct xfs_mount *mp, xfs_daddr_t start, xfs_daddr_t end, - xfs_daddr_t minlen, - uint64_t *blocks_trimmed) + xfs_daddr_t minlen) { - struct xfs_rtalloc_rec low = { }; - struct xfs_rtalloc_rec high = { }; struct xfs_trim_rtdev tr = { - .blocks_trimmed = blocks_trimmed, .minlen_fsb = XFS_BB_TO_FSB(mp, minlen), }; + xfs_rtxnum_t low, high; struct xfs_trans *tp; xfs_daddr_t rtdev_daddr; int error; @@ -596,17 +583,17 @@ xfs_trim_rtdev_extents( XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks) - 1); /* Convert the rt blocks to rt extents */ - low.ar_startext = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start)); - high.ar_startext = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end)); + low = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start)); + high = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end)); /* * Walk the free ranges between low and high. The query_range function * trims the extents returned. */ do { - tr.stop_rtx = low.ar_startext + (mp->m_sb.sb_blocksize * NBBY); + tr.stop_rtx = low + (mp->m_sb.sb_blocksize * NBBY); xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); - error = xfs_rtalloc_query_range(mp, tp, &low, &high, + error = xfs_rtalloc_query_range(mp, tp, low, high, xfs_trim_gather_rtextent, &tr); if (error == -ECANCELED) @@ -627,14 +614,14 @@ xfs_trim_rtdev_extents( if (error) break; - low.ar_startext = tr.restart_rtx; - } while (!xfs_trim_should_stop() && low.ar_startext <= high.ar_startext); + low = tr.restart_rtx; + } while (!xfs_trim_should_stop() && low <= high); xfs_trans_cancel(tp); return error; } #else -# define xfs_trim_rtdev_extents(m,s,e,n,b) (-EOPNOTSUPP) +# define xfs_trim_rtdev_extents(...) (-EOPNOTSUPP) #endif /* CONFIG_XFS_RT */ /* @@ -661,7 +648,6 @@ xfs_ioc_trim( xfs_daddr_t start, end; xfs_extlen_t minlen; xfs_rfsblock_t max_blocks; - uint64_t blocks_trimmed = 0; int error, last_error = 0; if (!capable(CAP_SYS_ADMIN)) @@ -706,15 +692,13 @@ xfs_ioc_trim( end = start + BTOBBT(range.len) - 1; if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) { - error = xfs_trim_datadev_extents(mp, start, end, minlen, - &blocks_trimmed); + error = xfs_trim_datadev_extents(mp, start, end, minlen); if (error) last_error = error; } if (rt_bdev && !xfs_trim_should_stop()) { - error = xfs_trim_rtdev_extents(mp, start, end, minlen, - &blocks_trimmed); + error = xfs_trim_rtdev_extents(mp, start, end, minlen); if (error) last_error = error; } @@ -722,7 +706,8 @@ xfs_ioc_trim( if (last_error) return last_error; - range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + range.len = min_t(unsigned long long, range.len, + XFS_FSB_TO_B(mp, max_blocks) - range.start); if (copy_to_user(urange, &range, sizeof(range))) return -EFAULT; return 0; diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c index c8a655c92c92..d0889190ab7f 100644 --- a/fs/xfs/xfs_exchrange.c +++ b/fs/xfs/xfs_exchrange.c @@ -72,6 +72,34 @@ xfs_exchrange_estimate( return error; } +/* + * Check that file2's metadata agree with the snapshot that we took for the + * range commit request. + * + * This should be called after the filesystem has locked /all/ inode metadata + * against modification. 
+ */ +STATIC int +xfs_exchrange_check_freshness( + const struct xfs_exchrange *fxr, + struct xfs_inode *ip2) +{ + struct inode *inode2 = VFS_I(ip2); + struct timespec64 ctime = inode_get_ctime(inode2); + struct timespec64 mtime = inode_get_mtime(inode2); + + trace_xfs_exchrange_freshness(fxr, ip2); + + /* Check that file2 hasn't otherwise been modified. */ + if (fxr->file2_ino != ip2->i_ino || + fxr->file2_gen != inode2->i_generation || + !timespec64_equal(&fxr->file2_ctime, &ctime) || + !timespec64_equal(&fxr->file2_mtime, &mtime)) + return -EBUSY; + + return 0; +} + #define QRETRY_IP1 (0x1) #define QRETRY_IP2 (0x2) @@ -607,6 +635,12 @@ xfs_exchrange_prep( if (error || fxr->length == 0) return error; + if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) { + error = xfs_exchrange_check_freshness(fxr, ip2); + if (error) + return error; + } + /* Attach dquots to both inodes before changing block maps. */ error = xfs_qm_dqattach(ip2); if (error) @@ -719,7 +753,8 @@ xfs_exchange_range( if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt) return -EXDEV; - if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) + if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS | + __XFS_EXCHANGE_RANGE_CHECK_FRESH2)) return -EINVAL; /* Userspace requests only honored for regular files. */ @@ -802,3 +837,109 @@ xfs_ioc_exchange_range( fdput(file1); return error; } + +/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */ +struct xfs_commit_range_fresh { + xfs_fsid_t fsid; /* m_fixedfsid */ + __u64 file2_ino; /* inode number */ + __s64 file2_mtime; /* modification time */ + __s64 file2_ctime; /* change time */ + __s32 file2_mtime_nsec; /* mod time, nsec */ + __s32 file2_ctime_nsec; /* change time, nsec */ + __u32 file2_gen; /* inode generation */ + __u32 magic; /* zero */ +}; +#define XCR_FRESH_MAGIC 0x444F524B /* DORK */ + +/* Set up a commitrange operation by sampling file2's write-related attrs */ +long +xfs_ioc_start_commit( + struct file *file, + struct xfs_commit_range __user *argp) +{ + struct xfs_commit_range args = { }; + struct timespec64 ts; + struct xfs_commit_range_fresh *kern_f; + struct xfs_commit_range_fresh __user *user_f; + struct inode *inode2 = file_inode(file); + struct xfs_inode *ip2 = XFS_I(inode2); + const unsigned int lockflags = XFS_IOLOCK_SHARED | + XFS_MMAPLOCK_SHARED | + XFS_ILOCK_SHARED; + + BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) != + sizeof(args.file2_freshness)); + + kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; + + memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); + + xfs_ilock(ip2, lockflags); + ts = inode_get_ctime(inode2); + kern_f->file2_ctime = ts.tv_sec; + kern_f->file2_ctime_nsec = ts.tv_nsec; + ts = inode_get_mtime(inode2); + kern_f->file2_mtime = ts.tv_sec; + kern_f->file2_mtime_nsec = ts.tv_nsec; + kern_f->file2_ino = ip2->i_ino; + kern_f->file2_gen = inode2->i_generation; + kern_f->magic = XCR_FRESH_MAGIC; + xfs_iunlock(ip2, lockflags); + + user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness; + if (copy_to_user(user_f, kern_f, sizeof(*kern_f))) + return -EFAULT; + + return 0; +} + +/* + * Exchange file1 and file2 contents if file2 has not been written since the + * start commit operation. 
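
Together with xfs_ioc_start_commit() above, this implements an optimistic update protocol: userspace samples file2's identity and timestamps, prepares the new contents in a donor file, then requests an exchange that succeeds only if file2 is still unchanged. A hedged userspace sketch of the round trip (the xfs/xfs_fs.h include path is an assumption, and error handling is trimmed):

#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>	/* assumed home of struct xfs_commit_range */

/* Swap staged data into target_fd iff it is unchanged since sampling. */
static int commit_staged_update(int target_fd, int staging_fd,
		unsigned long long offset, unsigned long long length)
{
	struct xfs_commit_range cr = { };

	/* Snapshot the target's freshness blob before building the update. */
	if (ioctl(target_fd, XFS_IOC_START_COMMIT, &cr))
		return -1;

	/* ... write the replacement bytes into staging_fd here ... */

	cr.file1_fd = staging_fd;
	cr.file1_offset = offset;
	cr.file2_offset = offset;
	cr.length = length;

	/* Fails with EBUSY if the target changed after START_COMMIT. */
	return ioctl(target_fd, XFS_IOC_COMMIT_RANGE, &cr);
}
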
+ */ +long +xfs_ioc_commit_range( + struct file *file, + struct xfs_commit_range __user *argp) +{ + struct xfs_exchrange fxr = { + .file2 = file, + }; + struct xfs_commit_range args; + struct xfs_commit_range_fresh *kern_f; + struct xfs_inode *ip2 = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip2->i_mount; + struct fd file1; + int error; + + kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) + return -EINVAL; + if (kern_f->magic != XCR_FRESH_MAGIC) + return -EBUSY; + if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t))) + return -EBUSY; + + fxr.file1_offset = args.file1_offset; + fxr.file2_offset = args.file2_offset; + fxr.length = args.length; + fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2; + fxr.file2_ino = kern_f->file2_ino; + fxr.file2_gen = kern_f->file2_gen; + fxr.file2_mtime.tv_sec = kern_f->file2_mtime; + fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec; + fxr.file2_ctime.tv_sec = kern_f->file2_ctime; + fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec; + + file1 = fdget(args.file1_fd); + if (!file1.file) + return -EBADF; + fxr.file1 = file1.file; + + error = xfs_exchange_range(&fxr); + fdput(file1); + return error; +} diff --git a/fs/xfs/xfs_exchrange.h b/fs/xfs/xfs_exchrange.h index 039abcca546e..bc1298aba806 100644 --- a/fs/xfs/xfs_exchrange.h +++ b/fs/xfs/xfs_exchrange.h @@ -10,8 +10,12 @@ #define __XFS_EXCHANGE_RANGE_UPD_CMTIME1 (1ULL << 63) #define __XFS_EXCHANGE_RANGE_UPD_CMTIME2 (1ULL << 62) +/* Freshness check required */ +#define __XFS_EXCHANGE_RANGE_CHECK_FRESH2 (1ULL << 61) + #define XFS_EXCHANGE_RANGE_PRIV_FLAGS (__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \ - __XFS_EXCHANGE_RANGE_UPD_CMTIME2) + __XFS_EXCHANGE_RANGE_UPD_CMTIME2 | \ + __XFS_EXCHANGE_RANGE_CHECK_FRESH2) struct xfs_exchrange { struct file *file1; @@ -22,10 +26,20 @@ struct xfs_exchrange { u64 length; u64 flags; /* XFS_EXCHANGE_RANGE flags */ + + /* file2 metadata for freshness checks */ + u64 file2_ino; + struct timespec64 file2_mtime; + struct timespec64 file2_ctime; + u32 file2_gen; }; long xfs_ioc_exchange_range(struct file *file, struct xfs_exchange_range __user *argp); +long xfs_ioc_start_commit(struct file *file, + struct xfs_commit_range __user *argp); +long xfs_ioc_commit_range(struct file *file, + struct xfs_commit_range __user *argp); struct xfs_exchmaps_req; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4cdc54dc9686..e97d789495a5 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -852,6 +852,192 @@ static inline bool xfs_file_sync_writes(struct file *filp) return false; } +static int +xfs_falloc_newsize( + struct file *file, + int mode, + loff_t offset, + loff_t len, + loff_t *new_size) +{ + struct inode *inode = file_inode(file); + + if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode)) + return 0; + *new_size = offset + len; + return inode_newsize_ok(inode, *new_size); +} + +static int +xfs_falloc_setsize( + struct file *file, + loff_t new_size) +{ + struct iattr iattr = { + .ia_valid = ATTR_SIZE, + .ia_size = new_size, + }; + + if (!new_size) + return 0; + return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file), + &iattr); +} + +static int +xfs_falloc_collapse_range( + struct file *file, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = i_size_read(inode) - len; + int error; + + if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) + return 
-EINVAL; + + /* + * There is no need to overlap collapse range with EOF, in which case it + * is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) + return -EINVAL; + + error = xfs_collapse_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_insert_range( + struct file *file, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t isize = i_size_read(inode); + int error; + + if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) + return -EINVAL; + + /* + * New inode size must not exceed ->s_maxbytes, accounting for + * possible signed overflow. + */ + if (inode->i_sb->s_maxbytes - isize < len) + return -EFBIG; + + /* Offset should be less than i_size */ + if (offset >= isize) + return -EINVAL; + + error = xfs_falloc_setsize(file, isize + len); + if (error) + return error; + + /* + * Perform hole insertion now that the file size has been updated so + * that if we crash during the operation we don't leave shifted extents + * past EOF and hence losing access to the data that is contained within + * them. + */ + return xfs_insert_file_space(XFS_I(inode), offset, len); +} + +/* + * Punch a hole and prealloc the range. We use a hole punch rather than + * unwritten extent conversion for two reasons: + * + * 1.) Hole punch handles partial block zeroing for us. + * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by + * virtue of the hole punch. + */ +static int +xfs_falloc_zero_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + unsigned int blksize = i_blocksize(inode); + loff_t new_size = 0; + int error; + + trace_xfs_zero_file_space(XFS_I(inode)); + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + error = xfs_free_file_space(XFS_I(inode), offset, len); + if (error) + return error; + + len = round_up(offset + len, blksize) - round_down(offset, blksize); + offset = round_down(offset, blksize); + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_unshare_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = 0; + int error; + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + error = xfs_reflink_unshare(XFS_I(inode), offset, len); + if (error) + return error; + + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_allocate_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = 0; + int error; + + /* + * If always_cow mode we can't use preallocations and thus should not + * create them. 
+ */ + if (xfs_is_always_cow_inode(XFS_I(inode))) + return -EOPNOTSUPP; + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + #define XFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ @@ -868,8 +1054,6 @@ xfs_file_fallocate( struct xfs_inode *ip = XFS_I(inode); long error; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - loff_t new_size = 0; - bool do_file_insert = false; if (!S_ISREG(inode->i_mode)) return -EINVAL; @@ -890,156 +1074,35 @@ xfs_file_fallocate( */ inode_dio_wait(inode); - /* - * Now AIO and DIO has drained we flush and (if necessary) invalidate - * the cached range over the first operation we are about to run. - * - * We care about zero and collapse here because they both run a hole - * punch over the range first. Because that can zero data, and the range - * of invalidation for the shift operations is much larger, we still do - * the required flush for collapse in xfs_prepare_shift(). - * - * Insert has the same range requirements as collapse, and we extend the - * file first which can zero data. Hence insert has the same - * flush/invalidate requirements as collapse and so they are both - * handled at the right time by xfs_prepare_shift(). - */ - if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | - FALLOC_FL_COLLAPSE_RANGE)) { - error = xfs_flush_unmap_range(ip, offset, len); - if (error) - goto out_unlock; - } - error = file_modified(file); if (error) goto out_unlock; - if (mode & FALLOC_FL_PUNCH_HOLE) { + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_PUNCH_HOLE: error = xfs_free_file_space(ip, offset, len); - if (error) - goto out_unlock; - } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { - if (!xfs_is_falloc_aligned(ip, offset, len)) { - error = -EINVAL; - goto out_unlock; - } - - /* - * There is no need to overlap collapse range with EOF, - * in which case it is effectively a truncate operation - */ - if (offset + len >= i_size_read(inode)) { - error = -EINVAL; - goto out_unlock; - } - - new_size = i_size_read(inode) - len; - - error = xfs_collapse_file_space(ip, offset, len); - if (error) - goto out_unlock; - } else if (mode & FALLOC_FL_INSERT_RANGE) { - loff_t isize = i_size_read(inode); - - if (!xfs_is_falloc_aligned(ip, offset, len)) { - error = -EINVAL; - goto out_unlock; - } - - /* - * New inode size must not exceed ->s_maxbytes, accounting for - * possible signed overflow. - */ - if (inode->i_sb->s_maxbytes - isize < len) { - error = -EFBIG; - goto out_unlock; - } - new_size = isize + len; - - /* Offset should be less than i_size */ - if (offset >= isize) { - error = -EINVAL; - goto out_unlock; - } - do_file_insert = true; - } else { - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); - if (error) - goto out_unlock; - } - - if (mode & FALLOC_FL_ZERO_RANGE) { - /* - * Punch a hole and prealloc the range. We use a hole - * punch rather than unwritten extent conversion for two - * reasons: - * - * 1.) Hole punch handles partial block zeroing for us. - * 2.) If prealloc returns ENOSPC, the file range is - * still zero-valued by virtue of the hole punch. 
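
Each helper above maps onto one fallocate(2) mode bit, so the userspace view is unchanged by this refactor. For instance, the zero-range path that the rationale above describes is reached with a call like this minimal sketch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/*
 * Zero a byte range; pass FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE
 * instead to guarantee i_size is left alone.
 */
static int zero_range(int fd, off_t offset, off_t len)
{
	return fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, len);
}
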
- */ - unsigned int blksize = i_blocksize(inode); - - trace_xfs_zero_file_space(ip); - - error = xfs_free_file_space(ip, offset, len); - if (error) - goto out_unlock; - - len = round_up(offset + len, blksize) - - round_down(offset, blksize); - offset = round_down(offset, blksize); - } else if (mode & FALLOC_FL_UNSHARE_RANGE) { - error = xfs_reflink_unshare(ip, offset, len); - if (error) - goto out_unlock; - } else { - /* - * If always_cow mode we can't use preallocations and - * thus should not create them. - */ - if (xfs_is_always_cow_inode(ip)) { - error = -EOPNOTSUPP; - goto out_unlock; - } - } - - if (!xfs_is_always_cow_inode(ip)) { - error = xfs_alloc_file_space(ip, offset, len); - if (error) - goto out_unlock; - } - } - - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = xfs_vn_setattr_size(file_mnt_idmap(file), - file_dentry(file), &iattr); - if (error) - goto out_unlock; - } - - /* - * Perform hole insertion now that the file size has been - * updated so that if we crash during the operation we don't - * leave shifted extents past EOF and hence losing access to - * the data that is contained within them. - */ - if (do_file_insert) { - error = xfs_insert_file_space(ip, offset, len); - if (error) - goto out_unlock; + break; + case FALLOC_FL_COLLAPSE_RANGE: + error = xfs_falloc_collapse_range(file, offset, len); + break; + case FALLOC_FL_INSERT_RANGE: + error = xfs_falloc_insert_range(file, offset, len); + break; + case FALLOC_FL_ZERO_RANGE: + error = xfs_falloc_zero_range(file, mode, offset, len); + break; + case FALLOC_FL_UNSHARE_RANGE: + error = xfs_falloc_unshare_range(file, mode, offset, len); + break; + case FALLOC_FL_ALLOCATE_RANGE: + error = xfs_falloc_allocate_range(file, mode, offset, len); + break; + default: + error = -EOPNOTSUPP; + break; } - if (xfs_file_sync_writes(file)) + if (!error && xfs_file_sync_writes(file)) error = xfs_log_force_inode(ip); out_unlock: @@ -1175,12 +1238,78 @@ xfs_dir_open( return error; } +/* + * Don't bother propagating errors. We're just doing cleanup, and the caller + * ignores the return value anyway. + */ STATIC int xfs_file_release( - struct inode *inode, - struct file *filp) + struct inode *inode, + struct file *file) { - return xfs_release(XFS_I(inode)); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + /* + * If this is a read-only mount or the file system has been shut down, + * don't generate I/O. + */ + if (xfs_is_readonly(mp) || xfs_is_shutdown(mp)) + return 0; + + /* + * If we previously truncated this file and removed old data in the + * process, we want to initiate "early" writeout on the last close. + * This is an attempt to combat the notorious NULL files problem which + * is particularly noticeable from a truncate down, buffered (re-)write + * (delalloc), followed by a crash. What we are effectively doing here + * is significantly reducing the time window where we'd otherwise be + * exposed to that problem. + */ + if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) { + xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED); + if (ip->i_delayed_blks > 0) + filemap_flush(inode->i_mapping); + } + + /* + * XFS aggressively preallocates post-EOF space to generate contiguous + * allocations for writers that append to the end of the file. + * + * To support workloads that close and reopen the file frequently, these + * preallocations usually persist after a close unless it is the first + * close for the inode. 
This is a tradeoff to generate tightly packed + * data layouts for unpacking tarballs or similar archives that write + * one file after another without going back to it while keeping the + * preallocation for files that have recurring open/write/close cycles. + * + * This heuristic is skipped for inodes with the append-only flag as + * that flag is rather pointless for inodes written only once. + * + * There is no point in freeing blocks here for open but unlinked files + * as they will be taken care of by the inactivation path soon. + * + * When releasing a read-only context, don't flush data or trim post-EOF + * blocks. This avoids open/read/close workloads from removing EOF + * blocks that other writers depend upon to reduce fragmentation. + * + * If we can't get the iolock just skip truncating the blocks past EOF + * because we could deadlock with the mmap_lock otherwise. We'll get + * another chance to drop them once the last reference to the inode is + * dropped, so we'll never leak blocks permanently. + */ + if (inode->i_nlink && + (file->f_mode & FMODE_WRITE) && + !(ip->i_diflags & XFS_DIFLAG_APPEND) && + !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && + xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (xfs_can_free_eofblocks(ip) && + !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED)) + xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } + + return 0; } STATIC int diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 85dbb46452ca..ae18ab86e608 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -44,7 +44,7 @@ xfs_fsmap_from_internal( } /* Convert an fsmap to an xfs_fsmap. */ -void +static void xfs_fsmap_to_internal( struct xfs_fsmap *dest, struct fsmap *src) @@ -71,7 +71,7 @@ xfs_fsmap_owner_to_rmap( switch (src->fmr_owner) { case 0: /* "lowest owner id possible" */ case -1ULL: /* "highest owner id possible" */ - dest->rm_owner = 0; + dest->rm_owner = src->fmr_owner; break; case XFS_FMR_OWN_FREE: dest->rm_owner = XFS_RMAP_OWN_NULL; @@ -162,6 +162,7 @@ struct xfs_getfsmap_info { xfs_daddr_t next_daddr; /* next daddr we expect */ /* daddr of low fsmap key when we're using the rtbitmap */ xfs_daddr_t low_daddr; + xfs_daddr_t end_daddr; /* daddr of high fsmap key */ u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ /* @@ -182,6 +183,7 @@ struct xfs_getfsmap_dev { int (*fn)(struct xfs_trans *tp, const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info); + sector_t nr_sectors; }; /* Compare two getfsmap device handlers. */ @@ -252,7 +254,7 @@ xfs_getfsmap_rec_before_start( const struct xfs_rmap_irec *rec, xfs_daddr_t rec_daddr) { - if (info->low_daddr != -1ULL) + if (info->low_daddr != XFS_BUF_DADDR_NULL) return rec_daddr < info->low_daddr; if (info->low.rm_blockcount) return xfs_rmap_compare(rec, &info->low) < 0; @@ -294,6 +296,18 @@ xfs_getfsmap_helper( return 0; } + /* + * For an info->last query, we're looking for a gap between the last + * mapping emitted and the high key specified by userspace. If the + * user's query spans less than 1 fsblock, then info->high and + * info->low will have the same rm_startblock, which causes rec_daddr + * and next_daddr to be the same. Therefore, use the end_daddr that + * we calculated from userspace's high key to synthesize the record. + * Note that if the btree query found a mapping, there won't be a gap. + */ + if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) + rec_daddr = info->end_daddr; + /* Are we just counting mappings? 
*/ if (info->head->fmh_count == 0) { if (info->head->fmh_entries == UINT_MAX) @@ -427,141 +441,6 @@ xfs_getfsmap_set_irec_flags( irec->rm_flags |= XFS_RMAP_UNWRITTEN; } -/* Execute a getfsmap query against the log device. */ -STATIC int -xfs_getfsmap_logdev( - struct xfs_trans *tp, - const struct xfs_fsmap *keys, - struct xfs_getfsmap_info *info) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_rmap_irec rmap; - xfs_daddr_t rec_daddr, len_daddr; - xfs_fsblock_t start_fsb, end_fsb; - uint64_t eofs; - - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); - if (keys[0].fmr_physical >= eofs) - return 0; - start_fsb = XFS_BB_TO_FSBT(mp, - keys[0].fmr_physical + keys[0].fmr_length); - end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); - - /* Adjust the low key if we are continuing from where we left off. */ - if (keys[0].fmr_length > 0) - info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb); - - trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb); - trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb); - - if (start_fsb > 0) - return 0; - - /* Fabricate an rmap entry for the external log device. */ - rmap.rm_startblock = 0; - rmap.rm_blockcount = mp->m_sb.sb_logblocks; - rmap.rm_owner = XFS_RMAP_OWN_LOG; - rmap.rm_offset = 0; - rmap.rm_flags = 0; - - rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock); - len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount); - return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr); -} - -#ifdef CONFIG_XFS_RT -/* Transform a rtbitmap "record" into a fsmap */ -STATIC int -xfs_getfsmap_rtdev_rtbitmap_helper( - struct xfs_mount *mp, - struct xfs_trans *tp, - const struct xfs_rtalloc_rec *rec, - void *priv) -{ - struct xfs_getfsmap_info *info = priv; - struct xfs_rmap_irec irec; - xfs_rtblock_t rtbno; - xfs_daddr_t rec_daddr, len_daddr; - - rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); - rec_daddr = XFS_FSB_TO_BB(mp, rtbno); - irec.rm_startblock = rtbno; - - rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount); - len_daddr = XFS_FSB_TO_BB(mp, rtbno); - irec.rm_blockcount = rtbno; - - irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ - irec.rm_offset = 0; - irec.rm_flags = 0; - - return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr); -} - -/* Execute a getfsmap query against the realtime device rtbitmap. */ -STATIC int -xfs_getfsmap_rtdev_rtbitmap( - struct xfs_trans *tp, - const struct xfs_fsmap *keys, - struct xfs_getfsmap_info *info) -{ - - struct xfs_rtalloc_rec alow = { 0 }; - struct xfs_rtalloc_rec ahigh = { 0 }; - struct xfs_mount *mp = tp->t_mountp; - xfs_rtblock_t start_rtb; - xfs_rtblock_t end_rtb; - uint64_t eofs; - int error; - - eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents)); - if (keys[0].fmr_physical >= eofs) - return 0; - start_rtb = XFS_BB_TO_FSBT(mp, - keys[0].fmr_physical + keys[0].fmr_length); - end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); - - info->missing_owner = XFS_FMR_OWN_UNKNOWN; - - /* Adjust the low key if we are continuing from where we left off. */ - if (keys[0].fmr_length > 0) { - info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb); - if (info->low_daddr >= eofs) - return 0; - } - - trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb); - trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb); - - xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); - - /* - * Set up query parameters to return free rtextents covering the range - * we want. 
- */ - alow.ar_startext = xfs_rtb_to_rtx(mp, start_rtb); - ahigh.ar_startext = xfs_rtb_to_rtxup(mp, end_rtb); - error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh, - xfs_getfsmap_rtdev_rtbitmap_helper, info); - if (error) - goto err; - - /* - * Report any gaps at the end of the rtbitmap by simulating a null - * rmap starting at the block after the end of the query range. - */ - info->last = true; - ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext); - - error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info); - if (error) - goto err; -err: - xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); - return error; -} -#endif /* CONFIG_XFS_RT */ - static inline bool rmap_not_shareable(struct xfs_mount *mp, const struct xfs_rmap_irec *r) { @@ -786,6 +665,140 @@ xfs_getfsmap_datadev_bnobt( xfs_getfsmap_datadev_bnobt_query, &akeys[0]); } +/* Execute a getfsmap query against the log device. */ +STATIC int +xfs_getfsmap_logdev( + struct xfs_trans *tp, + const struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rmap_irec rmap; + xfs_daddr_t rec_daddr, len_daddr; + xfs_fsblock_t start_fsb, end_fsb; + uint64_t eofs; + + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + if (keys[0].fmr_physical >= eofs) + return 0; + start_fsb = XFS_BB_TO_FSBT(mp, + keys[0].fmr_physical + keys[0].fmr_length); + end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); + + /* Adjust the low key if we are continuing from where we left off. */ + if (keys[0].fmr_length > 0) + info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb); + + trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb); + trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb); + + if (start_fsb > 0) + return 0; + + /* Fabricate an rmap entry for the external log device. */ + rmap.rm_startblock = 0; + rmap.rm_blockcount = mp->m_sb.sb_logblocks; + rmap.rm_owner = XFS_RMAP_OWN_LOG; + rmap.rm_offset = 0; + rmap.rm_flags = 0; + + rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock); + len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount); + return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr); +} + +#ifdef CONFIG_XFS_RT +/* Transform a rtbitmap "record" into a fsmap */ +STATIC int +xfs_getfsmap_rtdev_rtbitmap_helper( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_getfsmap_info *info = priv; + struct xfs_rmap_irec irec; + xfs_rtblock_t rtbno; + xfs_daddr_t rec_daddr, len_daddr; + + rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); + rec_daddr = XFS_FSB_TO_BB(mp, rtbno); + irec.rm_startblock = rtbno; + + rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount); + len_daddr = XFS_FSB_TO_BB(mp, rtbno); + irec.rm_blockcount = rtbno; + + irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ + irec.rm_offset = 0; + irec.rm_flags = 0; + + return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr); +} + +/* Execute a getfsmap query against the realtime device rtbitmap. 
*/ +STATIC int +xfs_getfsmap_rtdev_rtbitmap( + struct xfs_trans *tp, + const struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + + struct xfs_rtalloc_rec ahigh = { 0 }; + struct xfs_mount *mp = tp->t_mountp; + xfs_rtblock_t start_rtb; + xfs_rtblock_t end_rtb; + xfs_rtxnum_t high; + uint64_t eofs; + int error; + + eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents)); + if (keys[0].fmr_physical >= eofs) + return 0; + start_rtb = XFS_BB_TO_FSBT(mp, + keys[0].fmr_physical + keys[0].fmr_length); + end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); + + info->missing_owner = XFS_FMR_OWN_UNKNOWN; + + /* Adjust the low key if we are continuing from where we left off. */ + if (keys[0].fmr_length > 0) { + info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb); + if (info->low_daddr >= eofs) + return 0; + } + + trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb); + trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb); + + xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); + + /* + * Set up query parameters to return free rtextents covering the range + * we want. + */ + high = xfs_rtb_to_rtxup(mp, end_rtb); + error = xfs_rtalloc_query_range(mp, tp, xfs_rtb_to_rtx(mp, start_rtb), + high, xfs_getfsmap_rtdev_rtbitmap_helper, info); + if (error) + goto err; + + /* + * Report any gaps at the end of the rtbitmap by simulating a null + * rmap starting at the block after the end of the query range. + */ + info->last = true; + ahigh.ar_startext = min(mp->m_sb.sb_rextents, high); + + error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info); + if (error) + goto err; +err: + xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + return error; +} +#endif /* CONFIG_XFS_RT */ + /* Do we recognize the device? */ STATIC bool xfs_getfsmap_is_valid_device( @@ -876,7 +889,7 @@ xfs_getfsmap_check_keys( * xfs_getfsmap_info.low/high -- per-AG low/high keys computed from * dkeys; used to query the metadata. */ -int +STATIC int xfs_getfsmap( struct xfs_mount *mp, struct xfs_fsmap_head *head, @@ -904,17 +917,21 @@ xfs_getfsmap( /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); + handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else handlers[0].fn = xfs_getfsmap_datadev_bnobt; if (mp->m_logdev_targp != mp->m_ddev_targp) { + handlers[1].nr_sectors = XFS_FSB_TO_BB(mp, + mp->m_sb.sb_logblocks); handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); handlers[1].fn = xfs_getfsmap_logdev; } #ifdef CONFIG_XFS_RT if (mp->m_rtdev_targp) { + handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; } @@ -946,6 +963,7 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; + info.end_daddr = XFS_BUF_DADDR_NULL; info.fsmap_recs = fsmap_recs; info.head = head; @@ -966,8 +984,11 @@ xfs_getfsmap( * low key, zero out the low key so that we get * everything from the beginning. 
*/ - if (handlers[i].dev == head->fmh_keys[1].fmr_device) + if (handlers[i].dev == head->fmh_keys[1].fmr_device) { dkeys[1] = head->fmh_keys[1]; + info.end_daddr = min(handlers[i].nr_sectors - 1, + dkeys[1].fmr_physical); + } if (handlers[i].dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); @@ -983,7 +1004,7 @@ xfs_getfsmap( info.dev = handlers[i].dev; info.last = false; info.pag = NULL; - info.low_daddr = -1ULL; + info.low_daddr = XFS_BUF_DADDR_NULL; info.low.rm_blockcount = 0; error = handlers[i].fn(tp, dkeys, &info); if (error) @@ -998,3 +1019,133 @@ xfs_getfsmap( head->fmh_oflags = FMH_OF_DEV_T; return error; } + +int +xfs_ioc_getfsmap( + struct xfs_inode *ip, + struct fsmap_head __user *arg) +{ + struct xfs_fsmap_head xhead = {0}; + struct fsmap_head head; + struct fsmap *recs; + unsigned int count; + __u32 last_flags = 0; + bool done = false; + int error; + + if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) + return -EFAULT; + if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || + memchr_inv(head.fmh_keys[0].fmr_reserved, 0, + sizeof(head.fmh_keys[0].fmr_reserved)) || + memchr_inv(head.fmh_keys[1].fmr_reserved, 0, + sizeof(head.fmh_keys[1].fmr_reserved))) + return -EINVAL; + + /* + * Use an internal memory buffer so that we don't have to copy fsmap + * data to userspace while holding locks. Start by trying to allocate + * up to 128k for the buffer, but fall back to a single page if needed. + */ + count = min_t(unsigned int, head.fmh_count, + 131072 / sizeof(struct fsmap)); + recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL); + if (!recs) { + count = min_t(unsigned int, head.fmh_count, + PAGE_SIZE / sizeof(struct fsmap)); + recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL); + if (!recs) + return -ENOMEM; + } + + xhead.fmh_iflags = head.fmh_iflags; + xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]); + xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]); + + trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); + trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]); + + head.fmh_entries = 0; + do { + struct fsmap __user *user_recs; + struct fsmap *last_rec; + + user_recs = &arg->fmh_recs[head.fmh_entries]; + xhead.fmh_entries = 0; + xhead.fmh_count = min_t(unsigned int, count, + head.fmh_count - head.fmh_entries); + + /* Run query, record how many entries we got. */ + error = xfs_getfsmap(ip->i_mount, &xhead, recs); + switch (error) { + case 0: + /* + * There are no more records in the result set. Copy + * whatever we got to userspace and break out. + */ + done = true; + break; + case -ECANCELED: + /* + * The internal memory buffer is full. Copy whatever + * records we got to userspace and go again if we have + * not yet filled the userspace buffer. + */ + error = 0; + break; + default: + goto out_free; + } + head.fmh_entries += xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; + + /* + * If the caller wanted a record count or there aren't any + * new records to return, we're done. + */ + if (head.fmh_count == 0 || xhead.fmh_entries == 0) + break; + + /* Copy all the records we got out to userspace. */ + if (copy_to_user(user_recs, recs, + xhead.fmh_entries * sizeof(struct fsmap))) { + error = -EFAULT; + goto out_free; + } + + /* Remember the last record flags we copied to userspace. */ + last_rec = &recs[xhead.fmh_entries - 1]; + last_flags = last_rec->fmr_flags; + + /* Set up the low key for the next iteration. 
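
The low-key advance here mirrors the userspace convention: a caller copies the last record it received into fmh_keys[0] and reissues the ioctl until a record carries FMR_OF_LAST. A sketch of that loop (record processing elided; the buffer size is arbitrary):

#include <sys/ioctl.h>
#include <linux/fsmap.h>
#include <limits.h>
#include <stdlib.h>

/* Walk every reverse-mapping record the filesystem will report. */
static int walk_fsmap(int fd)
{
	const unsigned int nr = 128;
	struct fsmap_head *head;
	struct fsmap *last;
	int ret = 0;

	head = calloc(1, sizeof(*head) + nr * sizeof(struct fsmap));
	if (!head)
		return -1;
	head->fmh_count = nr;

	/* High key: everything up to the end of the highest device. */
	head->fmh_keys[1].fmr_device = UINT_MAX;
	head->fmh_keys[1].fmr_physical = ULLONG_MAX;
	head->fmh_keys[1].fmr_owner = ULLONG_MAX;
	head->fmh_keys[1].fmr_offset = ULLONG_MAX;
	head->fmh_keys[1].fmr_flags = UINT_MAX;

	for (;;) {
		if (ioctl(fd, FS_IOC_GETFSMAP, head)) {
			ret = -1;
			break;
		}
		if (!head->fmh_entries)
			break;
		/* ... consume head->fmh_recs[0 .. fmh_entries - 1] ... */
		last = &head->fmh_recs[head->fmh_entries - 1];
		if (last->fmr_flags & FMR_OF_LAST)
			break;
		/* Low key for the next call is the last record we saw. */
		head->fmh_keys[0] = *last;
	}
	free(head);
	return ret;
}
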
*/ + xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec); + trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); + } while (!done && head.fmh_entries < head.fmh_count); + + /* + * If there are no more records in the query result set and we're not + * in counting mode, mark the last record returned with the LAST flag. + */ + if (done && head.fmh_count > 0 && head.fmh_entries > 0) { + struct fsmap __user *user_rec; + + last_flags |= FMR_OF_LAST; + user_rec = &arg->fmh_recs[head.fmh_entries - 1]; + + if (copy_to_user(&user_rec->fmr_flags, &last_flags, + sizeof(last_flags))) { + error = -EFAULT; + goto out_free; + } + } + + /* copy back header */ + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) { + error = -EFAULT; + goto out_free; + } + +out_free: + kvfree(recs); + return error; +} diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h index a0775788e7b1..a0bcc38486a5 100644 --- a/fs/xfs/xfs_fsmap.h +++ b/fs/xfs/xfs_fsmap.h @@ -7,6 +7,7 @@ #define __XFS_FSMAP_H__ struct fsmap; +struct fsmap_head; /* internal fsmap representation */ struct xfs_fsmap { @@ -27,9 +28,6 @@ struct xfs_fsmap_head { struct xfs_fsmap fmh_keys[2]; /* low and high keys */ }; -void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src); - -int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head, - struct fsmap *out_recs); +int xfs_ioc_getfsmap(struct xfs_inode *ip, struct fsmap_head __user *arg); #endif /* __XFS_FSMAP_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c211ea2b63c4..3643cc843f62 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -485,7 +485,7 @@ xfs_do_force_shutdown( const char *why; - if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) { + if (xfs_set_shutdown(mp)) { xlog_shutdown_wait(mp->m_log); return; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index cf629302d48e..20d9924f28c2 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -65,6 +65,18 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, XFS_ICWALK_FLAG_RECLAIM_SICK | \ XFS_ICWALK_FLAG_UNION) +/* Marks for the perag xarray */ +#define XFS_PERAG_RECLAIM_MARK XA_MARK_0 +#define XFS_PERAG_BLOCKGC_MARK XA_MARK_1 + +static inline xa_mark_t ici_tag_to_mark(unsigned int tag) +{ + if (tag == XFS_ICI_RECLAIM_TAG) + return XFS_PERAG_RECLAIM_MARK; + ASSERT(tag == XFS_ICI_BLOCKGC_TAG); + return XFS_PERAG_BLOCKGC_MARK; +} + /* * Allocate and initialise an xfs_inode. */ @@ -191,7 +203,7 @@ xfs_reclaim_work_queue( { rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + if (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) { queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); } @@ -241,9 +253,7 @@ xfs_perag_set_inode_tag( return; /* propagate the tag up into the perag radix tree */ - spin_lock(&mp->m_perag_lock); - radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag); - spin_unlock(&mp->m_perag_lock); + xa_set_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag)); /* start background work */ switch (tag) { @@ -285,14 +295,39 @@ xfs_perag_clear_inode_tag( return; /* clear the tag from the perag radix tree */ - spin_lock(&mp->m_perag_lock); - radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag); - spin_unlock(&mp->m_perag_lock); + xa_clear_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag)); trace_xfs_perag_clear_inode_tag(pag, _RET_IP_); } /* + * Find the next AG after @pag, or the first AG if @pag is NULL. 
+ */ +static struct xfs_perag * +xfs_perag_grab_next_tag( + struct xfs_mount *mp, + struct xfs_perag *pag, + int tag) +{ + unsigned long index = 0; + + if (pag) { + index = pag->pag_agno + 1; + xfs_perag_rele(pag); + } + + rcu_read_lock(); + pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag)); + if (pag) { + trace_xfs_perag_grab_next_tag(pag, _RET_IP_); + if (!atomic_inc_not_zero(&pag->pag_active_ref)) + pag = NULL; + } + rcu_read_unlock(); + return pag; +} + +/* * When we recycle a reclaimable inode, we need to re-initialise the VFS inode * part of the structure. This is made more complex by the fact we store * information about the on-disk values in the VFS inode and so we can't just @@ -755,7 +790,7 @@ xfs_iget( ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); /* reject inode numbers outside existing AGs */ - if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + if (!xfs_verify_ino(mp, ino)) return -EINVAL; XFS_STATS_INC(mp, xs_ig_attempts); @@ -977,7 +1012,7 @@ xfs_reclaim_inodes( if (xfs_want_reclaim_sick(mp)) icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; - while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + while (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) { xfs_ail_push_all_sync(mp->m_ail); xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); } @@ -1019,15 +1054,17 @@ long xfs_reclaim_inodes_count( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t ag = 0; + XA_STATE (xas, &mp->m_perags, 0); long reclaimable = 0; + struct xfs_perag *pag; - while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { - ag = pag->pag_agno + 1; + rcu_read_lock(); + xas_for_each_marked(&xas, pag, ULONG_MAX, XFS_PERAG_RECLAIM_MARK) { + trace_xfs_reclaim_inodes_count(pag, _THIS_IP_); reclaimable += pag->pag_ici_reclaimable; - xfs_perag_put(pag); } + rcu_read_unlock(); + return reclaimable; } @@ -1159,7 +1196,7 @@ xfs_inode_free_eofblocks( if (xfs_can_free_eofblocks(ip)) return xfs_free_eofblocks(ip); - /* inode could be preallocated or append-only */ + /* inode could be preallocated */ trace_xfs_inode_free_eofblocks_invalid(ip); xfs_inode_clear_eofblocks_tag(ip); return 0; @@ -1369,14 +1406,13 @@ void xfs_blockgc_start( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; if (xfs_set_blockgc_enabled(mp)) return; trace_xfs_blockgc_start(mp, __return_address); - for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) xfs_blockgc_queue(pag); } @@ -1492,21 +1528,19 @@ int xfs_blockgc_flush_all( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; trace_xfs_blockgc_flush_all(mp, __return_address); /* - * For each blockgc worker, move its queue time up to now. If it - * wasn't queued, it will not be requeued. Then flush whatever's - * left. + * For each blockgc worker, move its queue time up to now. If it wasn't + * queued, it will not be requeued. Then flush whatever is left. 
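
Both loops above lean on the xarray mark machinery that replaces the old radix-tree tags: a mark is a per-entry bit that can be set or cleared through the normal xarray API and searched without an external lock on the reader side. A standalone illustration of the idiom, not taken from this patch:

#include <linux/xarray.h>

static DEFINE_XARRAY(ag_map);

/* Tag the entry at @index so scanners can find it cheaply. */
static void tag_for_gc(unsigned long index)
{
	xa_set_mark(&ag_map, index, XA_MARK_0);
}

/* Find the first tagged entry at or after *@index, or NULL if none. */
static void *next_gc_candidate(unsigned long *index)
{
	return xa_find(&ag_map, index, ULONG_MAX, XA_MARK_0);
}
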
*/ - for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) mod_delayed_work(pag->pag_mount->m_blockgc_wq, &pag->pag_blockgc_work, 0); - for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) flush_delayed_work(&pag->pag_blockgc_work); return xfs_inodegc_flush(mp); @@ -1752,12 +1786,11 @@ xfs_icwalk( enum xfs_icwalk_goal goal, struct xfs_icwalk *icw) { - struct xfs_perag *pag; + struct xfs_perag *pag = NULL; int error = 0; int last_error = 0; - xfs_agnumber_t agno; - for_each_perag_tag(mp, agno, pag, goal) { + while ((pag = xfs_perag_grab_next_tag(mp, pag, goal))) { error = xfs_icwalk_ag(pag, goal, icw); if (error) { last_error = error; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7dc6f326936c..bcc277fc0a83 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -704,7 +704,7 @@ xfs_create( * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino); + error = xfs_dialloc(&tp, args, &ino); if (!error) error = xfs_icreate(tp, ino, args, &du.ip); if (error) @@ -812,7 +812,7 @@ xfs_create_tmpfile( if (error) goto out_release_dquots; - error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino); + error = xfs_dialloc(&tp, args, &ino); if (!error) error = xfs_icreate(tp, ino, args, &ip); if (error) @@ -1079,88 +1079,6 @@ out: return error; } -int -xfs_release( - xfs_inode_t *ip) -{ - xfs_mount_t *mp = ip->i_mount; - int error = 0; - - if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0)) - return 0; - - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (xfs_is_readonly(mp)) - return 0; - - if (!xfs_is_shutdown(mp)) { - int truncated; - - /* - * If we previously truncated this file and removed old data - * in the process, we want to initiate "early" writeout on - * the last close. This is an attempt to combat the notorious - * NULL files problem which is particularly noticeable from a - * truncate down, buffered (re-)write (delalloc), followed by - * a crash. What we are effectively doing here is - * significantly reducing the time window where we'd otherwise - * be exposed to that problem. - */ - truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); - if (truncated) { - xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); - if (ip->i_delayed_blks > 0) { - error = filemap_flush(VFS_I(ip)->i_mapping); - if (error) - return error; - } - } - } - - if (VFS_I(ip)->i_nlink == 0) - return 0; - - /* - * If we can't get the iolock just skip truncating the blocks past EOF - * because we could deadlock with the mmap_lock otherwise. We'll get - * another chance to drop them once the last reference to the inode is - * dropped, so we'll never leak blocks permanently. - */ - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) - return 0; - - if (xfs_can_free_eofblocks(ip)) { - /* - * Check if the inode is being opened, written and closed - * frequently and we have delayed allocation blocks outstanding - * (e.g. streaming writes from the NFS server), truncating the - * blocks past EOF will cause fragmentation to occur. - * - * In this case don't do the truncation, but we have to be - * careful how we detect this case. Blocks beyond EOF show up as - * i_delayed_blks even when the inode is clean, so we need to - * truncate them away first before checking for a dirty release. 
- * Hence on the first dirty close we will still remove the - * speculative allocation, but after that we will leave it in - * place. - */ - if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) - goto out_unlock; - - error = xfs_free_eofblocks(ip); - if (error) - goto out_unlock; - - /* delalloc blocks after truncation means it really is dirty */ - if (ip->i_delayed_blks) - xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); - } - -out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; -} - /* * Mark all the buffers attached to this directory stale. In theory we should * never be freeing a directory with any blocks at all, but this covers the diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 51defdebef30..97ed912306fd 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -276,12 +276,13 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) return ip->i_diflags2 & XFS_DIFLAG2_REFLINK; } -static inline bool xfs_is_metadata_inode(struct xfs_inode *ip) +static inline bool xfs_is_metadata_inode(const struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; - return ip == mp->m_rbmip || ip == mp->m_rsumip || - xfs_is_quota_inode(&mp->m_sb, ip->i_ino); + return ip->i_ino == mp->m_sb.sb_rbmino || + ip->i_ino == mp->m_sb.sb_rsumino || + xfs_is_quota_inode(&mp->m_sb, ip->i_ino); } bool xfs_is_always_cow_inode(struct xfs_inode *ip); @@ -335,7 +336,7 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) #define XFS_INEW (1 << 3) /* inode has just been allocated */ #define XFS_IPRESERVE_DM_FIELDS (1 << 4) /* has legacy DMAPI fields set */ #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ -#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ +#define XFS_EOFBLOCKS_RELEASED (1 << 6) /* eofblocks were freed in ->release */ #define XFS_IFLUSHING (1 << 7) /* inode is being flushed */ #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) @@ -382,7 +383,7 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) */ #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ - XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ + XFS_EOFBLOCKS_RELEASED | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ XFS_INACTIVATING | XFS_IQUOTAUNCHECKED) /* @@ -512,7 +513,6 @@ enum layout_break_reason { #define XFS_INHERIT_GID(pip) \ (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID)) -int xfs_release(struct xfs_inode *ip); int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 4e933db75b12..7226d27e8afc 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -483,6 +483,17 @@ xfs_ioctl_setattr_xflags( /* Can't change realtime flag if any extents are allocated. */ if (ip->i_df.if_nextents || ip->i_delayed_blks) return -EINVAL; + + /* + * If S_DAX is enabled on this file, we can only switch the + * device if both support fsdax. We can't update S_DAX because + * there might be other threads walking down the access paths. 
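
This new check sits on the FS_IOC_FSSETXATTR path, so the failure becomes visible to a userspace sequence like the following hypothetical sketch, which now gets EINVAL for a DAX file unless both the data and realtime devices are fsdax capable:

#include <sys/ioctl.h>
#include <linux/fs.h>

/* Move a (currently extent-free) file to the realtime device. */
static int make_realtime(int fd)
{
	struct fsxattr fsx;

	if (ioctl(fd, FS_IOC_FSGETXATTR, &fsx))
		return -1;
	fsx.fsx_xflags |= FS_XFLAG_REALTIME;
	/* May now fail with EINVAL on a DAX file; see the check above. */
	return ioctl(fd, FS_IOC_FSSETXATTR, &fsx);
}
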
+ */ + if (IS_DAX(VFS_I(ip)) && + (mp->m_ddev_targp->bt_daxdev == NULL || + (mp->m_rtdev_targp && + mp->m_rtdev_targp->bt_daxdev == NULL))) + return -EINVAL; } if (rtflag) { @@ -865,136 +876,6 @@ out_free_buf: return error; } -STATIC int -xfs_ioc_getfsmap( - struct xfs_inode *ip, - struct fsmap_head __user *arg) -{ - struct xfs_fsmap_head xhead = {0}; - struct fsmap_head head; - struct fsmap *recs; - unsigned int count; - __u32 last_flags = 0; - bool done = false; - int error; - - if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) - return -EFAULT; - if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || - memchr_inv(head.fmh_keys[0].fmr_reserved, 0, - sizeof(head.fmh_keys[0].fmr_reserved)) || - memchr_inv(head.fmh_keys[1].fmr_reserved, 0, - sizeof(head.fmh_keys[1].fmr_reserved))) - return -EINVAL; - - /* - * Use an internal memory buffer so that we don't have to copy fsmap - * data to userspace while holding locks. Start by trying to allocate - * up to 128k for the buffer, but fall back to a single page if needed. - */ - count = min_t(unsigned int, head.fmh_count, - 131072 / sizeof(struct fsmap)); - recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL); - if (!recs) { - count = min_t(unsigned int, head.fmh_count, - PAGE_SIZE / sizeof(struct fsmap)); - recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL); - if (!recs) - return -ENOMEM; - } - - xhead.fmh_iflags = head.fmh_iflags; - xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]); - xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]); - - trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); - trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]); - - head.fmh_entries = 0; - do { - struct fsmap __user *user_recs; - struct fsmap *last_rec; - - user_recs = &arg->fmh_recs[head.fmh_entries]; - xhead.fmh_entries = 0; - xhead.fmh_count = min_t(unsigned int, count, - head.fmh_count - head.fmh_entries); - - /* Run query, record how many entries we got. */ - error = xfs_getfsmap(ip->i_mount, &xhead, recs); - switch (error) { - case 0: - /* - * There are no more records in the result set. Copy - * whatever we got to userspace and break out. - */ - done = true; - break; - case -ECANCELED: - /* - * The internal memory buffer is full. Copy whatever - * records we got to userspace and go again if we have - * not yet filled the userspace buffer. - */ - error = 0; - break; - default: - goto out_free; - } - head.fmh_entries += xhead.fmh_entries; - head.fmh_oflags = xhead.fmh_oflags; - - /* - * If the caller wanted a record count or there aren't any - * new records to return, we're done. - */ - if (head.fmh_count == 0 || xhead.fmh_entries == 0) - break; - - /* Copy all the records we got out to userspace. */ - if (copy_to_user(user_recs, recs, - xhead.fmh_entries * sizeof(struct fsmap))) { - error = -EFAULT; - goto out_free; - } - - /* Remember the last record flags we copied to userspace. */ - last_rec = &recs[xhead.fmh_entries - 1]; - last_flags = last_rec->fmr_flags; - - /* Set up the low key for the next iteration. */ - xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec); - trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); - } while (!done && head.fmh_entries < head.fmh_count); - - /* - * If there are no more records in the query result set and we're not - * in counting mode, mark the last record returned with the LAST flag. 
- */ - if (done && head.fmh_count > 0 && head.fmh_entries > 0) { - struct fsmap __user *user_rec; - - last_flags |= FMR_OF_LAST; - user_rec = &arg->fmh_recs[head.fmh_entries - 1]; - - if (copy_to_user(&user_rec->fmr_flags, &last_flags, - sizeof(last_flags))) { - error = -EFAULT; - goto out_free; - } - } - - /* copy back header */ - if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) { - error = -EFAULT; - goto out_free; - } - -out_free: - kvfree(recs); - return error; -} - int xfs_ioc_swapext( xfs_swapext_t *sxp) @@ -1507,6 +1388,10 @@ xfs_file_ioctl( case XFS_IOC_EXCHANGE_RANGE: return xfs_ioc_exchange_range(filp, arg); + case XFS_IOC_START_COMMIT: + return xfs_ioc_start_commit(filp, arg); + case XFS_IOC_COMMIT_RANGE: + return xfs_ioc_commit_range(filp, arg); default: return -ENOTTY; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 817ea7e0a8ab..26b2f5887b88 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3495,7 +3495,7 @@ xlog_force_shutdown( * If this log shutdown also sets the mount shutdown state, issue a * shutdown warning message. */ - if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) { + if (!xfs_set_shutdown(log->l_mp)) { xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR, "Filesystem has been shut down due to log error (0x%x).", shutdown_flags); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4423dd344239..1a74fe22672e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1336,7 +1336,7 @@ xlog_find_tail( * headers if we have a filesystem using non-persistent counters. */ if (clean) - set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate); + xfs_set_clean(log->l_mp); /* * Make sure that there are no blocks in front of the head diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 09eef1721ef4..460f93a9ce00 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -595,7 +595,7 @@ xfs_unmount_flush_inodes( xfs_extent_busy_wait_all(mp); flush_workqueue(xfs_discard_wq); - set_bit(XFS_OPSTATE_UNMOUNTING, &mp->m_opstate); + xfs_set_unmounting(mp); xfs_ail_push_all_sync(mp->m_ail); xfs_inodegc_stop(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index d0567dfbc036..96496f39f551 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -147,7 +147,7 @@ typedef struct xfs_mount { int m_logbufs; /* number of log buffers */ int m_logbsize; /* size of each log buffer */ uint m_rsumlevels; /* rt summary levels */ - uint m_rsumsize; /* size of rt summary, bytes */ + xfs_filblks_t m_rsumblocks; /* size of rt summary, FSBs */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_qflags; /* quota status flags */ uint64_t m_features; /* active filesystem features */ @@ -208,8 +208,7 @@ typedef struct xfs_mount { */ atomic64_t m_allocbt_blks; - struct radix_tree_root m_perag_tree; /* per-ag accounting info */ - spinlock_t m_perag_lock; /* lock for m_perag_tree */ + struct xarray m_perags; /* per-ag accounting info */ uint64_t m_resblks; /* total reserved blocks */ uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 7443debaffd6..d0f5b403bdbe 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -230,9 +230,8 @@ _xfs_mru_cache_clear_reap_list( __releases(mru->lock) __acquires(mru->lock) { struct xfs_mru_cache_elem *elem, *next; - struct list_head tmp; + LIST_HEAD(tmp); - INIT_LIST_HEAD(&tmp); list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) { 
/* Remove the element from the data store. */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 9490b913a4ab..7e2307921deb 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -799,7 +799,7 @@ xfs_qm_qino_alloc( }; xfs_ino_t ino; - error = xfs_dialloc(&tp, 0, S_IFREG, &ino); + error = xfs_dialloc(&tp, &args, &ino); if (!error) error = xfs_icreate(tp, ino, &args, ipp); if (error) { @@ -1539,6 +1539,43 @@ xfs_qm_mount_quotas( } /* + * Load the inode for a given type of quota, assuming that the sb fields have + * been sorted out. This is not true when switching quota types on a V4 + * filesystem, so do not use this function for that. + * + * Returns -ENOENT if the quota inode field is NULLFSINO; 0 and an inode on + * success; or a negative errno. + */ +int +xfs_qm_qino_load( + struct xfs_mount *mp, + xfs_dqtype_t type, + struct xfs_inode **ipp) +{ + xfs_ino_t ino = NULLFSINO; + + switch (type) { + case XFS_DQTYPE_USER: + ino = mp->m_sb.sb_uquotino; + break; + case XFS_DQTYPE_GROUP: + ino = mp->m_sb.sb_gquotino; + break; + case XFS_DQTYPE_PROJ: + ino = mp->m_sb.sb_pquotino; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + if (ino == NULLFSINO) + return -ENOENT; + + return xfs_iget(mp, NULL, ino, 0, 0, ipp); +} + +/* * This is called after the superblock has been read in and we're ready to * iget the quota inodes. */ @@ -1561,24 +1598,21 @@ xfs_qm_init_quotainos( if (XFS_IS_UQUOTA_ON(mp) && mp->m_sb.sb_uquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_uquotino > 0); - error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, - 0, 0, &uip); + error = xfs_qm_qino_load(mp, XFS_DQTYPE_USER, &uip); if (error) return error; } if (XFS_IS_GQUOTA_ON(mp) && mp->m_sb.sb_gquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_gquotino > 0); - error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, - 0, 0, &gip); + error = xfs_qm_qino_load(mp, XFS_DQTYPE_GROUP, &gip); if (error) goto error_rele; } if (XFS_IS_PQUOTA_ON(mp) && mp->m_sb.sb_pquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_pquotino > 0); - error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, - 0, 0, &pip); + error = xfs_qm_qino_load(mp, XFS_DQTYPE_PROJ, &pip); if (error) goto error_rele; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 6e09dfcd13e2..e919c7f62f57 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -184,4 +184,7 @@ xfs_get_defquota(struct xfs_quotainfo *qi, xfs_dqtype_t type) } } +int xfs_qm_qino_load(struct xfs_mount *mp, xfs_dqtype_t type, + struct xfs_inode **ipp); + #endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 392cb39cc10c..4eda50ae2d1c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -53,16 +53,15 @@ xfs_qm_scall_quotaoff( STATIC int xfs_qm_scall_trunc_qfile( struct xfs_mount *mp, - xfs_ino_t ino) + xfs_dqtype_t type) { struct xfs_inode *ip; struct xfs_trans *tp; int error; - if (ino == NULLFSINO) + error = xfs_qm_qino_load(mp, type, &ip); + if (error == -ENOENT) return 0; - - error = xfs_iget(mp, NULL, ino, 0, 0, &ip); if (error) return error; @@ -113,17 +112,17 @@ xfs_qm_scall_trunc_qfiles( } if (flags & XFS_QMOPT_UQUOTA) { - error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); + error = xfs_qm_scall_trunc_qfile(mp, XFS_DQTYPE_USER); if (error) return error; } if (flags & XFS_QMOPT_GQUOTA) { - error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + error = xfs_qm_scall_trunc_qfile(mp, XFS_DQTYPE_GROUP); if (error) return error; } if (flags & XFS_QMOPT_PQUOTA) - error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); + error = 
xfs_qm_scall_trunc_qfile(mp, XFS_DQTYPE_PROJ); return error; } diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 9c162e69976b..4c7f7ce4fd2f 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -16,24 +16,25 @@ #include "xfs_qm.h" -static void +static int xfs_qm_fill_state( struct qc_type_state *tstate, struct xfs_mount *mp, - struct xfs_inode *ip, - xfs_ino_t ino, - struct xfs_def_quota *defq) + xfs_dqtype_t type) { - bool tempqip = false; - - tstate->ino = ino; - if (!ip && ino == NULLFSINO) - return; - if (!ip) { - if (xfs_iget(mp, NULL, ino, 0, 0, &ip)) - return; - tempqip = true; + struct xfs_inode *ip; + struct xfs_def_quota *defq; + int error; + + error = xfs_qm_qino_load(mp, type, &ip); + if (error) { + tstate->ino = NULLFSINO; + return error != -ENOENT ? error : 0; } + + defq = xfs_get_defquota(mp->m_quotainfo, type); + + tstate->ino = ip->i_ino; tstate->flags |= QCI_SYSFILE; tstate->blocks = ip->i_nblocks; tstate->nextents = ip->i_df.if_nextents; @@ -43,8 +44,9 @@ xfs_qm_fill_state( tstate->spc_warnlimit = 0; tstate->ino_warnlimit = 0; tstate->rt_spc_warnlimit = 0; - if (tempqip) - xfs_irele(ip); + xfs_irele(ip); + + return 0; } /* @@ -56,8 +58,9 @@ xfs_fs_get_quota_state( struct super_block *sb, struct qc_state *state) { - struct xfs_mount *mp = XFS_M(sb); - struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_mount *mp = XFS_M(sb); + struct xfs_quotainfo *q = mp->m_quotainfo; + int error; memset(state, 0, sizeof(*state)); if (!XFS_IS_QUOTA_ON(mp)) @@ -76,12 +79,18 @@ xfs_fs_get_quota_state( if (XFS_IS_PQUOTA_ENFORCED(mp)) state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED; - xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip, - mp->m_sb.sb_uquotino, &q->qi_usr_default); - xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip, - mp->m_sb.sb_gquotino, &q->qi_grp_default); - xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip, - mp->m_sb.sb_pquotino, &q->qi_prj_default); + error = xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, + XFS_DQTYPE_USER); + if (error) + return error; + error = xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, + XFS_DQTYPE_GROUP); + if (error) + return error; + error = xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, + XFS_DQTYPE_PROJ); + if (error) + return error; return 0; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 0c3e96c621a6..3a2005a1e673 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -142,7 +142,7 @@ xfs_rtallocate_range( * We need to find the beginning and end of the extent so we can * properly update the summary. */ - error = xfs_rtfind_back(args, start, 0, &preblock); + error = xfs_rtfind_back(args, start, &preblock); if (error) return error; @@ -194,6 +194,17 @@ xfs_rtallocate_range( return xfs_rtmodify_range(args, start, len, 0); } +/* Reduce @rtxlen until it is a multiple of @prod. */ +static inline xfs_rtxlen_t +xfs_rtalloc_align_len( + xfs_rtxlen_t rtxlen, + xfs_rtxlen_t prod) +{ + if (unlikely(prod > 1)) + return rounddown(rtxlen, prod); + return rtxlen; +} + /* * Make sure we don't run off the end of the rt volume. Be careful that * adjusting maxlen downwards doesn't cause us to fail the alignment checks. 
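The new xfs_qm_qino_load() helper above maps a quota type to the corresponding superblock inode field and converts the NULLFSINO sentinel into -ENOENT, which lets callers such as xfs_qm_scall_trunc_qfile() and xfs_qm_fill_state() treat a missing quota inode as a soft condition instead of open-coding xfs_iget() calls. A minimal userspace sketch of that lookup pattern; the type names and qino_load() are simplified stand-ins, and -EINVAL replaces the kernel helper's -EFSCORRUPTED:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define NULLFSINO       ((uint64_t)-1)  /* sentinel: no inode assigned */

enum dqtype { DQTYPE_USER, DQTYPE_GROUP, DQTYPE_PROJ };

struct sb {
        uint64_t uquotino, gquotino, pquotino;
};

/* Map a quota type to its superblock inode field; -ENOENT if unset. */
static int qino_load(const struct sb *sb, enum dqtype type, uint64_t *ino)
{
        switch (type) {
        case DQTYPE_USER:
                *ino = sb->uquotino;
                break;
        case DQTYPE_GROUP:
                *ino = sb->gquotino;
                break;
        case DQTYPE_PROJ:
                *ino = sb->pquotino;
                break;
        default:
                return -EINVAL; /* the kernel helper returns -EFSCORRUPTED */
        }
        if (*ino == NULLFSINO)
                return -ENOENT; /* quota inode was never allocated */
        return 0;
}

int main(void)
{
        struct sb sb = { .uquotino = 128, .gquotino = NULLFSINO,
                         .pquotino = NULLFSINO };
        uint64_t ino;
        int error = qino_load(&sb, DQTYPE_GROUP, &ino);

        /* Callers treat -ENOENT as "nothing to do", like trunc_qfile. */
        printf("group quota: %s\n", error == -ENOENT ? "absent" : "present");
        return 0;
}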
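xfs_rtalloc_align_len() introduced above is rounddown() to a multiple of the allocation unit prod, and xfs_rtallocate_clamp_len() (reworked in the next hunk) applies it after trimming a request to the end of the rt volume; callers then bail out with -ENOSPC when the clamped length drops below minlen. A standalone sketch of that arithmetic, with simplified stand-in type and function names:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t rtxnum_t;      /* rt extent number */
typedef uint32_t rtxlen_t;      /* length in rt extents */

/* Reduce @len until it is a multiple of @prod. */
static rtxlen_t align_len(rtxlen_t len, rtxlen_t prod)
{
        return prod > 1 ? len - (len % prod) : len;
}

/* Trim @len so [@start, @start + len) stays inside @rextents. */
static rtxlen_t clamp_len(rtxnum_t rextents, rtxnum_t start, rtxlen_t len,
                          rtxlen_t prod)
{
        rtxnum_t end = start + len;

        if (end > rextents)
                end = rextents;
        return align_len((rtxlen_t)(end - start), prod);
}

int main(void)
{
        /* 1000-extent volume, request 64 extents at 950, prod = 8. */
        rtxlen_t len = clamp_len(1000, 950, 64, 8);

        /* 950..1000 leaves 50 extents, rounded down to 8's multiple: 48 */
        printf("clamped length: %u\n", (unsigned)len);
        return 0;
}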
@@ -208,7 +219,7 @@ xfs_rtallocate_clamp_len( xfs_rtxlen_t ret; ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx; - return rounddown(ret, prod); + return xfs_rtalloc_align_len(ret, prod); } /* @@ -229,39 +240,40 @@ xfs_rtallocate_extent_block( xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { struct xfs_mount *mp = args->mp; - xfs_rtxnum_t besti; /* best rtext found so far */ - xfs_rtxnum_t bestlen;/* best length found so far */ + xfs_rtxnum_t besti = -1; /* best rtext found so far */ xfs_rtxnum_t end; /* last rtext in chunk */ - int error; xfs_rtxnum_t i; /* current rtext trying */ xfs_rtxnum_t next; /* next rtext to try */ + xfs_rtxlen_t scanlen; /* number of free rtx to look for */ + xfs_rtxlen_t bestlen = 0; /* best length found so far */ int stat; /* status from internal calls */ + int error; /* - * Loop over all the extents starting in this bitmap block, - * looking for one that's long enough. + * Loop over all the extents starting in this bitmap block up to the + * end of the rt volume, looking for one that's long enough. */ - for (i = xfs_rbmblock_to_rtx(mp, bbno), besti = -1, bestlen = 0, - end = xfs_rbmblock_to_rtx(mp, bbno + 1) - 1; - i <= end; - i++) { + end = min(mp->m_sb.sb_rextents, xfs_rbmblock_to_rtx(mp, bbno + 1)) - 1; + for (i = xfs_rbmblock_to_rtx(mp, bbno); i <= end; i++) { /* Make sure we don't scan off the end of the rt volume. */ - maxlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod); + scanlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod); + if (scanlen < minlen) + break; /* - * See if there's a free extent of maxlen starting at i. + * See if there's a free extent of scanlen starting at i. * If it's not so then next will contain the first non-free. */ - error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat); + error = xfs_rtcheck_range(args, i, scanlen, 1, &next, &stat); if (error) return error; if (stat) { /* - * i for maxlen is all free, allocate and return that. + * i to scanlen is all free, allocate and return that. */ - bestlen = maxlen; - besti = i; - goto allocate; + *len = scanlen; + *rtx = i; + return 0; } /* @@ -289,38 +301,28 @@ xfs_rtallocate_extent_block( return error; } - /* - * Searched the whole thing & didn't find a maxlen free extent. - */ - if (minlen > maxlen || besti == -1) { - /* - * Allocation failed. Set *nextp to the next block to try. - */ - *nextp = next; - return -ENOSPC; - } + /* Searched the whole thing & didn't find a maxlen free extent. */ + if (besti == -1) + goto nospace; /* - * If size should be a multiple of prod, make that so. + * Ensure bestlen is a multiple of prod, but don't return a too-short + * extent. */ - if (prod > 1) { - xfs_rtxlen_t p; /* amount to trim length by */ - - div_u64_rem(bestlen, prod, &p); - if (p) - bestlen -= p; - } + bestlen = xfs_rtalloc_align_len(bestlen, prod); + if (bestlen < minlen) + goto nospace; /* - * Allocate besti for bestlen & return that. + * Pick besti for bestlen & return that. */ -allocate: - error = xfs_rtallocate_range(args, besti, bestlen); - if (error) - return error; *len = bestlen; *rtx = besti; return 0; +nospace: + /* Allocation failed. Set *nextp to the next block to try. 
*/ + *nextp = next; + return -ENOSPC; } /* @@ -339,45 +341,46 @@ xfs_rtallocate_extent_exact( xfs_rtxlen_t prod, /* extent product factor */ xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - int error; - xfs_rtxlen_t i; /* extent length trimmed due to prod */ - int isfree; /* extent is free */ + struct xfs_mount *mp = args->mp; xfs_rtxnum_t next; /* next rtext to try (dummy) */ + xfs_rtxlen_t alloclen; /* candidate length */ + xfs_rtxlen_t scanlen; /* number of free rtx to look for */ + int isfree; /* extent is free */ + int error; ASSERT(minlen % prod == 0); ASSERT(maxlen % prod == 0); - /* - * Check if the range in question (for maxlen) is free. - */ - error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree); + + /* Make sure we don't run off the end of the rt volume. */ + scanlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod); + if (scanlen < minlen) + return -ENOSPC; + + /* Check if the range in question (for scanlen) is free. */ + error = xfs_rtcheck_range(args, start, scanlen, 1, &next, &isfree); if (error) return error; - if (!isfree) { - /* - * If not, allocate what there is, if it's at least minlen. - */ - maxlen = next - start; - if (maxlen < minlen) - return -ENOSPC; - - /* - * Trim off tail of extent, if prod is specified. - */ - if (prod > 1 && (i = maxlen % prod)) { - maxlen -= i; - if (maxlen < minlen) - return -ENOSPC; - } + if (isfree) { + /* start to scanlen is all free; allocate it. */ + *len = scanlen; + *rtx = start; + return 0; } /* - * Allocate what we can and return it. + * If not, allocate what there is, if it's at least minlen. */ - error = xfs_rtallocate_range(args, start, maxlen); - if (error) - return error; - *len = maxlen; + alloclen = next - start; + if (alloclen < minlen) + return -ENOSPC; + + /* Ensure alloclen is a multiple of prod. */ + alloclen = xfs_rtalloc_align_len(alloclen, prod); + if (alloclen < minlen) + return -ENOSPC; + + *len = alloclen; *rtx = start; return 0; } @@ -416,11 +419,6 @@ xfs_rtallocate_extent_near( if (start >= mp->m_sb.sb_rextents) start = mp->m_sb.sb_rextents - 1; - /* Make sure we don't run off the end of the rt volume. */ - maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod); - if (maxlen < minlen) - return -ENOSPC; - /* * Try the exact allocation first. */ @@ -429,7 +427,6 @@ xfs_rtallocate_extent_near( if (error != -ENOSPC) return error; - bbno = xfs_rtx_to_rbmblock(mp, start); i = 0; j = -1; @@ -552,11 +549,11 @@ xfs_rtalloc_sumlevel( xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { xfs_fileoff_t i; /* bitmap block number */ + int error; for (i = 0; i < args->mp->m_sb.sb_rbmblocks; i++) { xfs_suminfo_t sum; /* summary information for extents */ xfs_rtxnum_t n; /* next rtext to be tried */ - int error; error = xfs_rtget_summary(args, l, i, &sum); if (error) @@ -652,141 +649,217 @@ xfs_rtallocate_extent_size( return -ENOSPC; } +static int +xfs_alloc_rsum_cache( + struct xfs_mount *mp, + xfs_extlen_t rbmblocks) +{ + /* + * The rsum cache is initialized to the maximum value, which is + * trivially an upper bound on the maximum level with any free extents. + */ + mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL); + if (!mp->m_rsum_cache) + return -ENOMEM; + memset(mp->m_rsum_cache, -1, rbmblocks); + return 0; +} + /* - * Allocate space to the bitmap or summary file, and zero it, for growfs. 
+ * If we changed the rt extent size (meaning there was no rt volume previously) + * and the root directory had EXTSZINHERIT and RTINHERIT set, it's possible + * that the extent size hint on the root directory is no longer congruent with + * the new rt extent size. Log the rootdir inode to fix this. */ -STATIC int -xfs_growfs_rt_alloc( - struct xfs_mount *mp, /* file system mount point */ - xfs_extlen_t oblocks, /* old count of blocks */ - xfs_extlen_t nblocks, /* new count of blocks */ - struct xfs_inode *ip) /* inode (bitmap/summary) */ +static int +xfs_growfs_rt_fixup_extsize( + struct xfs_mount *mp) { - xfs_fileoff_t bno; /* block number in file */ - struct xfs_buf *bp; /* temporary buffer for zeroing */ - xfs_daddr_t d; /* disk block address */ - int error; /* error return value */ - xfs_fsblock_t fsbno; /* filesystem block for bno */ - struct xfs_bmbt_irec map; /* block map output */ - int nmap; /* number of block maps */ - int resblks; /* space reservation */ - enum xfs_blft buf_type; + struct xfs_inode *ip = mp->m_rootip; struct xfs_trans *tp; + int error = 0; - if (ip == mp->m_rsumip) - buf_type = XFS_BLFT_RTSUMMARY_BUF; - else - buf_type = XFS_BLFT_RTBITMAP_BUF; + xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (!(ip->i_diflags & XFS_DIFLAG_RTINHERIT) || + !(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)) + goto out_iolock; + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_ichange, 0, 0, false, + &tp); + if (error) + goto out_iolock; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + +out_iolock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + +static int +xfs_growfs_rt_bmblock( + struct xfs_mount *mp, + xfs_rfsblock_t nrblocks, + xfs_agblock_t rextsize, + xfs_fileoff_t bmbno) +{ + struct xfs_inode *rbmip = mp->m_rbmip; + struct xfs_inode *rsumip = mp->m_rsumip; + struct xfs_rtalloc_args args = { + .mp = mp, + }; + struct xfs_rtalloc_args nargs = { + }; + struct xfs_mount *nmp; + xfs_rfsblock_t nrblocks_step; + xfs_rtbxlen_t freed_rtx; + int error; + + + nrblocks_step = (bmbno + 1) * NBBY * mp->m_sb.sb_blocksize * rextsize; + + nmp = nargs.mp = kmemdup(mp, sizeof(*mp), GFP_KERNEL); + if (!nmp) + return -ENOMEM; /* - * Allocate space to the file, as necessary. + * Calculate new sb and mount fields for this round. */ - while (oblocks < nblocks) { - resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); - /* - * Reserve space & log for one extent added to the file. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks, - 0, 0, &tp); - if (error) - return error; - /* - * Lock the inode. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + nmp->m_sb.sb_rextsize = rextsize; + xfs_mount_sb_set_rextsize(nmp, &nmp->m_sb); + nmp->m_sb.sb_rbmblocks = bmbno + 1; + nmp->m_sb.sb_rblocks = min(nrblocks, nrblocks_step); + nmp->m_sb.sb_rextents = xfs_rtb_to_rtx(nmp, nmp->m_sb.sb_rblocks); + nmp->m_sb.sb_rextslog = xfs_compute_rextslog(nmp->m_sb.sb_rextents); + nmp->m_rsumlevels = nmp->m_sb.sb_rextslog + 1; + nmp->m_rsumblocks = xfs_rtsummary_blockcount(mp, nmp->m_rsumlevels, + nmp->m_sb.sb_rbmblocks); - error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, - XFS_IEXT_ADD_NOSPLIT_CNT); - if (error) - goto out_trans_cancel; + /* + * Recompute the growfsrt reservation from the new rsumsize, so that the + * transaction below use the new, potentially larger value. 
+ * */ + xfs_trans_resv_calc(nmp, &nmp->m_resv); + error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, + &args.tp); + if (error) + goto out_free; + nargs.tp = args.tp; - /* - * Allocate blocks to the bitmap file. - */ - nmap = 1; - error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, - XFS_BMAPI_METADATA, 0, &map, &nmap); - if (error) - goto out_trans_cancel; - /* - * Free any blocks freed up in the transaction, then commit. - */ - error = xfs_trans_commit(tp); - if (error) - return error; - /* - * Now we need to clear the allocated blocks. - * Do this one block per transaction, to keep it simple. - */ - for (bno = map.br_startoff, fsbno = map.br_startblock; - bno < map.br_startoff + map.br_blockcount; - bno++, fsbno++) { - /* - * Reserve log for one block zeroing. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, - 0, 0, 0, &tp); - if (error) - return error; - /* - * Lock the bitmap inode. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - /* - * Get a buffer for the block. - */ - d = XFS_FSB_TO_DADDR(mp, fsbno); - error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize, 0, &bp); - if (error) - goto out_trans_cancel; + xfs_rtbitmap_lock(mp); + xfs_rtbitmap_trans_join(args.tp); - xfs_trans_buf_set_type(tp, bp, buf_type); - bp->b_ops = &xfs_rtbuf_ops; - memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); - xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); - /* - * Commit the transaction. - */ - error = xfs_trans_commit(tp); - if (error) - return error; - } - /* - * Go on to the next extent, if any. - */ - oblocks = map.br_startoff + map.br_blockcount; + /* + * Update the bitmap inode's size ondisk and incore. We need to update + * the incore size so that inode inactivation won't punch what it thinks + * are "posteof" blocks. + */ + rbmip->i_disk_size = nmp->m_sb.sb_rbmblocks * nmp->m_sb.sb_blocksize; + i_size_write(VFS_I(rbmip), rbmip->i_disk_size); + xfs_trans_log_inode(args.tp, rbmip, XFS_ILOG_CORE); + + /* + * Update the summary inode's size. We need to update the incore size + * so that inode inactivation won't punch what it thinks are "posteof" + * blocks. + */ + rsumip->i_disk_size = nmp->m_rsumblocks * nmp->m_sb.sb_blocksize; + i_size_write(VFS_I(rsumip), rsumip->i_disk_size); + xfs_trans_log_inode(args.tp, rsumip, XFS_ILOG_CORE); + + /* + * Copy summary data from old to new sizes when the real size (not + * block-aligned) changes. + */ + if (mp->m_sb.sb_rbmblocks != nmp->m_sb.sb_rbmblocks || + mp->m_rsumlevels != nmp->m_rsumlevels) { + error = xfs_rtcopy_summary(&args, &nargs); + if (error) + goto out_cancel; } - return 0; + /* + * Update superblock fields. + */ + if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) + xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE, + nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); + if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) + xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS, + nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); + if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) + xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS, + nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); + if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) + xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS, + nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); + if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) + xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG, + nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); -out_trans_cancel: - xfs_trans_cancel(tp); - return error; -} + /* + * Free the new extent. 
+ */ + freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents; + error = xfs_rtfree_range(&nargs, mp->m_sb.sb_rextents, freed_rtx); + xfs_rtbuf_cache_relse(&nargs); + if (error) + goto out_cancel; -static void -xfs_alloc_rsum_cache( - xfs_mount_t *mp, /* file system mount structure */ - xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */ -{ /* - * The rsum cache is initialized to the maximum value, which is - * trivially an upper bound on the maximum level with any free extents. - * We can continue without the cache if it couldn't be allocated. + * Mark more blocks free in the superblock. */ - mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL); - if (mp->m_rsum_cache) - memset(mp->m_rsum_cache, -1, rbmblocks); - else - xfs_warn(mp, "could not allocate realtime summary cache"); + xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_FREXTENTS, freed_rtx); + + /* + * Update the calculated values in the real mount structure. + */ + mp->m_rsumlevels = nmp->m_rsumlevels; + mp->m_rsumblocks = nmp->m_rsumblocks; + xfs_mount_sb_set_rextsize(mp, &mp->m_sb); + + /* + * Recompute the growfsrt reservation from the new rsumsize. + */ + xfs_trans_resv_calc(mp, &mp->m_resv); + + error = xfs_trans_commit(args.tp); + if (error) + goto out_free; + + /* + * Ensure the mount RT feature flag is now set. + */ + mp->m_features |= XFS_FEAT_REALTIME; + + kfree(nmp); + return 0; + +out_cancel: + xfs_trans_cancel(args.tp); +out_free: + kfree(nmp); + return error; } /* - * Visible (exported) functions. + * Calculate the last rbmblock currently used. + * + * This also deals with the case where there were no rtextents before. */ +static xfs_fileoff_t +xfs_last_rt_bmblock( + struct xfs_mount *mp) +{ + xfs_fileoff_t bmbno = mp->m_sb.sb_rbmblocks; + + /* Skip the current block if it is exactly full. */ + if (xfs_rtx_to_rbmword(mp, mp->m_sb.sb_rextents) != 0) + bmbno--; + return bmbno; +} /* * Grow the realtime area of the filesystem. @@ -799,21 +872,13 @@ xfs_growfs_rt( xfs_fileoff_t bmbno; /* bitmap block number */ struct xfs_buf *bp; /* temporary buffer */ int error; /* error return value */ - xfs_mount_t *nmp; /* new (fake) mount structure */ - xfs_rfsblock_t nrblocks; /* new number of realtime blocks */ xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ xfs_rtxnum_t nrextents; /* new number of realtime extents */ - uint8_t nrextslog; /* new log2 of sb_rextents */ xfs_extlen_t nrsumblocks; /* new number of summary blocks */ - uint nrsumlevels; /* new rt summary levels */ - uint nrsumsize; /* new size of rt summary, bytes */ - xfs_sb_t *nsbp; /* new superblock */ xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */ xfs_extlen_t rsumblocks; /* current number of rt summary blks */ - xfs_sb_t *sbp; /* old superblock */ uint8_t *rsum_cache; /* old summary cache */ - - sbp = &mp->m_sb; + xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -821,63 +886,69 @@ xfs_growfs_rt( /* Needs to have been mounted with an rt device. */ if (!XFS_IS_REALTIME_MOUNT(mp)) return -EINVAL; + + if (!mutex_trylock(&mp->m_growlock)) + return -EWOULDBLOCK; /* * Mount should fail if the rt bitmap/summary files don't load, but * we'll check anyway. */ + error = -EINVAL; if (!mp->m_rbmip || !mp->m_rsumip) - return -EINVAL; + goto out_unlock; /* Shrink not supported. */ - if (in->newblocks <= sbp->sb_rblocks) - return -EINVAL; - + if (in->newblocks <= mp->m_sb.sb_rblocks) + goto out_unlock; /* Can only change rt extent size when adding rt volume. 
*/ - if (sbp->sb_rblocks > 0 && in->extsize != sbp->sb_rextsize) - return -EINVAL; + if (mp->m_sb.sb_rblocks > 0 && in->extsize != mp->m_sb.sb_rextsize) + goto out_unlock; /* Range check the extent size. */ if (XFS_FSB_TO_B(mp, in->extsize) > XFS_MAX_RTEXTSIZE || XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE) - return -EINVAL; + goto out_unlock; /* Unsupported realtime features. */ + error = -EOPNOTSUPP; if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp)) - return -EOPNOTSUPP; + goto out_unlock; - nrblocks = in->newblocks; - error = xfs_sb_validate_fsb_count(sbp, nrblocks); + error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); if (error) - return error; + goto out_unlock; /* * Read in the last block of the device, make sure it exists. */ error = xfs_buf_read_uncached(mp->m_rtdev_targp, - XFS_FSB_TO_BB(mp, nrblocks - 1), + XFS_FSB_TO_BB(mp, in->newblocks - 1), XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); if (error) - return error; + goto out_unlock; xfs_buf_relse(bp); /* * Calculate new parameters. These are the final values to be reached. */ - nrextents = nrblocks; - do_div(nrextents, in->extsize); - if (!xfs_validate_rtextents(nrextents)) - return -EINVAL; + nrextents = div_u64(in->newblocks, in->extsize); + if (nrextents == 0) { + error = -EINVAL; + goto out_unlock; + } nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); - nrextslog = xfs_compute_rextslog(nrextents); - nrsumlevels = nrextslog + 1; - nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks); - nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + nrsumblocks = xfs_rtsummary_blockcount(mp, + xfs_compute_rextslog(nrextents) + 1, nrbmblocks); + /* * New summary size can't be more than half the size of * the log. This prevents us from getting a log overflow, * since we'll log basically the whole summary file at once. */ - if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) - return -EINVAL; + if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) { + error = -EINVAL; + goto out_unlock; + } + /* * Get the old block counts for bitmap and summary inodes. * These can't change since other growfs callers are locked out. @@ -887,165 +958,41 @@ xfs_growfs_rt( /* * Allocate space to the bitmap and summary files, as necessary. */ - error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip); + error = xfs_rtfile_initialize_blocks(mp->m_rbmip, rbmblocks, + nrbmblocks, NULL); if (error) - return error; - error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); + goto out_unlock; + error = xfs_rtfile_initialize_blocks(mp->m_rsumip, rsumblocks, + nrsumblocks, NULL); if (error) - return error; + goto out_unlock; rsum_cache = mp->m_rsum_cache; - if (nrbmblocks != sbp->sb_rbmblocks) - xfs_alloc_rsum_cache(mp, nrbmblocks); - - /* - * Allocate a new (fake) mount/sb. - */ - nmp = kmalloc(sizeof(*nmp), GFP_KERNEL | __GFP_NOFAIL); - /* - * Loop over the bitmap blocks. - * We will do everything one bitmap block at a time. - * Skip the current block if it is exactly full. - * This also deals with the case where there were no rtextents before. - */ - for (bmbno = sbp->sb_rbmblocks - - ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); - bmbno < nrbmblocks; - bmbno++) { - struct xfs_rtalloc_args args = { - .mp = mp, - }; - struct xfs_rtalloc_args nargs = { - .mp = nmp, - }; - struct xfs_trans *tp; - xfs_rfsblock_t nrblocks_step; - - *nmp = *mp; - nsbp = &nmp->m_sb; - /* - * Calculate new sb and mount fields for this round. 
- */ - nsbp->sb_rextsize = in->extsize; - nmp->m_rtxblklog = -1; /* don't use shift or masking */ - nsbp->sb_rbmblocks = bmbno + 1; - nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize * - nsbp->sb_rextsize; - nsbp->sb_rblocks = min(nrblocks, nrblocks_step); - nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks); - ASSERT(nsbp->sb_rextents != 0); - nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents); - nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; - nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, - nsbp->sb_rbmblocks); - nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); - /* recompute growfsrt reservation from new rsumsize */ - xfs_trans_resv_calc(nmp, &nmp->m_resv); - - /* - * Start a transaction, get the log reservation. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0, - &tp); + if (nrbmblocks != mp->m_sb.sb_rbmblocks) { + error = xfs_alloc_rsum_cache(mp, nrbmblocks); if (error) - break; - args.tp = tp; - nargs.tp = tp; - - /* - * Lock out other callers by grabbing the bitmap and summary - * inode locks and joining them to the transaction. - */ - xfs_rtbitmap_lock(tp, mp); - /* - * Update the bitmap inode's size ondisk and incore. We need - * to update the incore size so that inode inactivation won't - * punch what it thinks are "posteof" blocks. - */ - mp->m_rbmip->i_disk_size = - nsbp->sb_rbmblocks * nsbp->sb_blocksize; - i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_disk_size); - xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); - /* - * Update the summary inode's size. We need to update the - * incore size so that inode inactivation won't punch what it - * thinks are "posteof" blocks. - */ - mp->m_rsumip->i_disk_size = nmp->m_rsumsize; - i_size_write(VFS_I(mp->m_rsumip), mp->m_rsumip->i_disk_size); - xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE); - /* - * Copy summary data from old to new sizes. - * Do this when the real size (not block-aligned) changes. - */ - if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks || - mp->m_rsumlevels != nmp->m_rsumlevels) { - error = xfs_rtcopy_summary(&args, &nargs); - if (error) - goto error_cancel; - } - /* - * Update superblock fields. - */ - if (nsbp->sb_rextsize != sbp->sb_rextsize) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, - nsbp->sb_rextsize - sbp->sb_rextsize); - if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, - nsbp->sb_rbmblocks - sbp->sb_rbmblocks); - if (nsbp->sb_rblocks != sbp->sb_rblocks) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, - nsbp->sb_rblocks - sbp->sb_rblocks); - if (nsbp->sb_rextents != sbp->sb_rextents) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, - nsbp->sb_rextents - sbp->sb_rextents); - if (nsbp->sb_rextslog != sbp->sb_rextslog) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, - nsbp->sb_rextslog - sbp->sb_rextslog); - /* - * Free new extent. - */ - error = xfs_rtfree_range(&nargs, sbp->sb_rextents, - nsbp->sb_rextents - sbp->sb_rextents); - xfs_rtbuf_cache_relse(&nargs); - if (error) { -error_cancel: - xfs_trans_cancel(tp); - break; - } - /* - * Mark more blocks free in the superblock. - */ - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, - nsbp->sb_rextents - sbp->sb_rextents); - /* - * Update mp values into the real mp structure. 
- */ - mp->m_rsumlevels = nrsumlevels; - mp->m_rsumsize = nrsumsize; - /* recompute growfsrt reservation from new rsumsize */ - xfs_trans_resv_calc(mp, &mp->m_resv); + goto out_unlock; + } - error = xfs_trans_commit(tp); + /* Initialize the free space bitmap one bitmap block at a time. */ + for (bmbno = xfs_last_rt_bmblock(mp); bmbno < nrbmblocks; bmbno++) { + error = xfs_growfs_rt_bmblock(mp, in->newblocks, in->extsize, + bmbno); if (error) - break; + goto out_free; + } - /* Ensure the mount RT feature flag is now set. */ - mp->m_features |= XFS_FEAT_REALTIME; + if (old_rextsize != in->extsize) { + error = xfs_growfs_rt_fixup_extsize(mp); + if (error) + goto out_free; } - if (error) - goto out_free; /* Update secondary superblocks now the physical grow has completed */ error = xfs_update_secondary_sbs(mp); out_free: /* - * Free the fake mp structure. - */ - kfree(nmp); - - /* * If we had to allocate a new rsum_cache, we either need to free the * old one (if we succeeded) or free the new one and restore the old one * (if there was an error). @@ -1059,6 +1006,8 @@ out_free: } } +out_unlock: + mutex_unlock(&mp->m_growlock); return error; } @@ -1072,7 +1021,6 @@ xfs_rtmount_init( struct xfs_buf *bp; /* buffer for last block of subvolume */ struct xfs_sb *sbp; /* filesystem superblock copy in mount */ xfs_daddr_t d; /* address of last block of subvolume */ - unsigned int rsumblocks; int error; sbp = &mp->m_sb; @@ -1084,9 +1032,8 @@ xfs_rtmount_init( return -ENODEV; } mp->m_rsumlevels = sbp->sb_rextslog + 1; - rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels, + mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels, mp->m_sb.sb_rbmblocks); - mp->m_rsumsize = XFS_FSB_TO_B(mp, rsumblocks); mp->m_rbmip = mp->m_rsumip = NULL; /* * Check that the realtime section is an ok size. @@ -1216,7 +1163,9 @@ xfs_rtmount_inodes( if (error) goto out_rele_summary; - xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); + error = xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); + if (error) + goto out_rele_summary; return 0; out_rele_summary: @@ -1244,12 +1193,11 @@ xfs_rtunmount_inodes( * of rtextents and the fraction. * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... */ -static int +static xfs_rtxnum_t xfs_rtpick_extent( xfs_mount_t *mp, /* file system mount point */ xfs_trans_t *tp, /* transaction pointer */ - xfs_rtxlen_t len, /* allocation length (rtextents) */ - xfs_rtxnum_t *pick) /* result rt extent */ + xfs_rtxlen_t len) /* allocation length (rtextents) */ { xfs_rtxnum_t b; /* result rtext */ int log2; /* log of sequence number */ @@ -1280,8 +1228,7 @@ xfs_rtpick_extent( ts.tv_sec = seq + 1; inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); - *pick = b; - return 0; + return b; } static void @@ -1313,36 +1260,109 @@ xfs_rtalloc_align_minmax( *raminlen = newminlen; } -int -xfs_bmap_rtalloc( - struct xfs_bmalloca *ap) +static int +xfs_rtallocate( + struct xfs_trans *tp, + xfs_rtblock_t bno_hint, + xfs_rtxlen_t minlen, + xfs_rtxlen_t maxlen, + xfs_rtxlen_t prod, + bool wasdel, + bool initial_user_data, + bool *rtlocked, + xfs_rtblock_t *bno, + xfs_extlen_t *blen) +{ + struct xfs_rtalloc_args args = { + .mp = tp->t_mountp, + .tp = tp, + }; + xfs_rtxnum_t start = 0; + xfs_rtxnum_t rtx; + xfs_rtxlen_t len = 0; + int error = 0; + + /* + * Lock out modifications to both the RT bitmap and summary inodes. 
+ */ + if (!*rtlocked) { + xfs_rtbitmap_lock(args.mp); + xfs_rtbitmap_trans_join(tp); + *rtlocked = true; + } + + /* + * For an allocation to an empty file at offset 0, pick an extent that + * will space things out in the rt area. + */ + if (bno_hint) + start = xfs_rtb_to_rtx(args.mp, bno_hint); + else if (initial_user_data) + start = xfs_rtpick_extent(args.mp, tp, maxlen); + + if (start) { + error = xfs_rtallocate_extent_near(&args, start, minlen, maxlen, + &len, prod, &rtx); + /* + * If we can't allocate near a specific rt extent, try again + * without locality criteria. + */ + if (error == -ENOSPC) { + xfs_rtbuf_cache_relse(&args); + error = 0; + } + } + + if (!error) { + error = xfs_rtallocate_extent_size(&args, minlen, maxlen, &len, + prod, &rtx); + } + + if (error) + goto out_release; + + error = xfs_rtallocate_range(&args, rtx, len); + if (error) + goto out_release; + + xfs_trans_mod_sb(tp, wasdel ? + XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS, + -(long)len); + *bno = xfs_rtx_to_rtb(args.mp, rtx); + *blen = xfs_rtxlen_to_extlen(args.mp, len); + +out_release: + xfs_rtbuf_cache_relse(&args); + return error; +} + +static int +xfs_rtallocate_align( + struct xfs_bmalloca *ap, + xfs_rtxlen_t *ralen, + xfs_rtxlen_t *raminlen, + xfs_rtxlen_t *prod, + bool *noalign) { struct xfs_mount *mp = ap->ip->i_mount; xfs_fileoff_t orig_offset = ap->offset; - xfs_rtxnum_t start; /* allocation hint rtextent no */ - xfs_rtxnum_t rtx; /* actually allocated rtextent no */ - xfs_rtxlen_t prod = 0; /* product factor for allocators */ - xfs_extlen_t mod = 0; /* product factor for allocators */ - xfs_rtxlen_t ralen = 0; /* realtime allocation length */ - xfs_extlen_t align; /* minimum allocation alignment */ - xfs_extlen_t orig_length = ap->length; xfs_extlen_t minlen = mp->m_sb.sb_rextsize; - xfs_rtxlen_t raminlen; - bool rtlocked = false; - bool ignore_locality = false; - struct xfs_rtalloc_args args = { - .mp = mp, - .tp = ap->tp, - }; + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_extlen_t mod; /* product factor for allocators */ int error; - align = xfs_get_extsz_hint(ap->ip); - if (!align) - align = 1; -retry: - error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, - align, 1, ap->eof, 0, - ap->conv, &ap->offset, &ap->length); + if (*noalign) { + align = mp->m_sb.sb_rextsize; + } else { + align = xfs_get_extsz_hint(ap->ip); + if (!align) + align = 1; + if (align == mp->m_sb.sb_rextsize) + *noalign = true; + } + + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, + ap->eof, 0, ap->conv, &ap->offset, &ap->length); if (error) return error; ASSERT(ap->length); @@ -1366,59 +1386,55 @@ retry: * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't * adjust the starting point to match it. */ - ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); - raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); - ASSERT(raminlen > 0); - ASSERT(raminlen <= ralen); - - /* - * Lock out modifications to both the RT bitmap and summary inodes - */ - if (!rtlocked) { - xfs_rtbitmap_lock(ap->tp, mp); - rtlocked = true; - } - - if (ignore_locality) { - start = 0; - } else if (xfs_bmap_adjacent(ap)) { - start = xfs_rtb_to_rtx(mp, ap->blkno); - } else if (ap->datatype & XFS_ALLOC_INITIAL_USER_DATA) { - /* - * If it's an allocation to an empty file at offset 0, pick an - * extent that will space things out in the rt area. 
- */ - error = xfs_rtpick_extent(mp, ap->tp, ralen, &start); - if (error) - return error; - } else { - start = 0; - } + *ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); + *raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); + ASSERT(*raminlen > 0); + ASSERT(*raminlen <= *ralen); /* * Only bother calculating a real prod factor if offset & length are * perfectly aligned, otherwise it will just get us in trouble. */ div_u64_rem(ap->offset, align, &mod); - if (mod || ap->length % align) { - prod = 1; - } else { - prod = xfs_extlen_to_rtxlen(mp, align); - if (prod > 1) - xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod); - } + if (mod || ap->length % align) + *prod = 1; + else + *prod = xfs_extlen_to_rtxlen(mp, align); - if (start) { - error = xfs_rtallocate_extent_near(&args, start, raminlen, - ralen, &ralen, prod, &rtx); - } else { - error = xfs_rtallocate_extent_size(&args, raminlen, - ralen, &ralen, prod, &rtx); - } - xfs_rtbuf_cache_relse(&args); + if (*prod > 1) + xfs_rtalloc_align_minmax(raminlen, ralen, prod); + return 0; +} +int +xfs_bmap_rtalloc( + struct xfs_bmalloca *ap) +{ + xfs_fileoff_t orig_offset = ap->offset; + xfs_rtxlen_t prod = 0; /* product factor for allocators */ + xfs_rtxlen_t ralen = 0; /* realtime allocation length */ + xfs_rtblock_t bno_hint = NULLRTBLOCK; + xfs_extlen_t orig_length = ap->length; + xfs_rtxlen_t raminlen; + bool rtlocked = false; + bool noalign = false; + bool initial_user_data = + ap->datatype & XFS_ALLOC_INITIAL_USER_DATA; + int error; + +retry: + error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign); + if (error) + return error; + + if (xfs_bmap_adjacent(ap)) + bno_hint = ap->blkno; + + error = xfs_rtallocate(ap->tp, bno_hint, raminlen, ralen, prod, + ap->wasdel, initial_user_data, &rtlocked, + &ap->blkno, &ap->length); if (error == -ENOSPC) { - if (align > mp->m_sb.sb_rextsize) { + if (!noalign) { /* * We previously enlarged the request length to try to * satisfy an extent size hint. The allocator didn't @@ -1428,16 +1444,7 @@ retry: */ ap->offset = orig_offset; ap->length = orig_length; - minlen = align = mp->m_sb.sb_rextsize; - goto retry; - } - - if (!ignore_locality && start != 0) { - /* - * If we can't allocate near a specific rt extent, try - * again without locality criteria. - */ - ignore_locality = true; + noalign = true; goto retry; } @@ -1448,11 +1455,6 @@ retry: if (error) return error; - xfs_trans_mod_sb(ap->tp, ap->wasdel ? - XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS, - -(long)ralen); - ap->blkno = xfs_rtx_to_rtb(mp, rtx); - ap->length = xfs_rtxlen_to_extlen(mp, ralen); xfs_bmap_alloc_account(ap); return 0; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 27e9f749c4c7..26767745d9fd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -311,9 +311,9 @@ xfs_set_inode_alloc( * the allocator to accommodate the request. */ if (xfs_has_small_inums(mp) && ino > XFS_MAXINUMBER_32) - set_bit(XFS_OPSTATE_INODE32, &mp->m_opstate); + xfs_set_inode32(mp); else - clear_bit(XFS_OPSTATE_INODE32, &mp->m_opstate); + xfs_clear_inode32(mp); for (index = 0; index < agcount; index++) { struct xfs_perag *pag; @@ -1511,7 +1511,7 @@ xfs_fs_fill_super( * the newer fsopen/fsconfig API. 
*/ if (fc->sb_flags & SB_RDONLY) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + xfs_set_readonly(mp); if (fc->sb_flags & SB_DIRSYNC) mp->m_features |= XFS_FEAT_DIRSYNC; if (fc->sb_flags & SB_SYNCHRONOUS) @@ -1820,7 +1820,7 @@ xfs_remount_rw( return -EINVAL; } - clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + xfs_clear_readonly(mp); /* * If this is the first remount to writeable state we might have some @@ -1908,7 +1908,7 @@ xfs_remount_ro( xfs_save_resvblks(mp); xfs_log_clean(mp); - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + xfs_set_readonly(mp); return 0; } @@ -2009,8 +2009,7 @@ static int xfs_init_fs_context( return -ENOMEM; spin_lock_init(&mp->m_sb_lock); - INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); - spin_lock_init(&mp->m_perag_lock); + xa_init(&mp->m_perags); mutex_init(&mp->m_growlock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 77f19e2f66e0..4252b07cd251 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -165,7 +165,7 @@ xfs_symlink( /* * Allocate an inode for the symlink. */ - error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino); + error = xfs_dialloc(&tp, &args, &ino); if (!error) error = xfs_icreate(tp, ino, &args, &du.ip); if (error) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 180ce697305a..ee9f0b1f548d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -210,14 +210,14 @@ DEFINE_EVENT(xfs_perag_class, name, \ TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), \ TP_ARGS(pag, caller_ip)) DEFINE_PERAG_REF_EVENT(xfs_perag_get); -DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_hold); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_grab); -DEFINE_PERAG_REF_EVENT(xfs_perag_grab_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_grab_next_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_rele); DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); +DEFINE_PERAG_REF_EVENT(xfs_reclaim_inodes_count); TRACE_EVENT(xfs_inodegc_worker, TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), @@ -4926,7 +4926,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error); { XFS_EXCHANGE_RANGE_DRY_RUN, "DRY_RUN" }, \ { XFS_EXCHANGE_RANGE_FILE1_WRITTEN, "F1_WRITTEN" }, \ { __XFS_EXCHANGE_RANGE_UPD_CMTIME1, "CMTIME1" }, \ - { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" } + { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" }, \ + { __XFS_EXCHANGE_RANGE_CHECK_FRESH2, "FRESH2" } /* file exchange-range tracepoint class */ DECLARE_EVENT_CLASS(xfs_exchrange_class, @@ -4986,6 +4987,60 @@ DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep); DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush); DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings); +TRACE_EVENT(xfs_exchrange_freshness, + TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip2), + TP_ARGS(fxr, ip2), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ip2_ino) + __field(long long, ip2_mtime) + __field(long long, ip2_ctime) + __field(int, ip2_mtime_nsec) + __field(int, ip2_ctime_nsec) + + __field(xfs_ino_t, file2_ino) + __field(long long, file2_mtime) + __field(long long, file2_ctime) + __field(int, file2_mtime_nsec) + __field(int, file2_ctime_nsec) + ), + TP_fast_assign( + struct timespec64 ts64; + struct inode *inode2 = VFS_I(ip2); + + __entry->dev = inode2->i_sb->s_dev; + __entry->ip2_ino = ip2->i_ino; + + ts64 = inode_get_ctime(inode2); + __entry->ip2_ctime = ts64.tv_sec; + 
__entry->ip2_ctime_nsec = ts64.tv_nsec; + + ts64 = inode_get_mtime(inode2); + __entry->ip2_mtime = ts64.tv_sec; + __entry->ip2_mtime_nsec = ts64.tv_nsec; + + __entry->file2_ino = fxr->file2_ino; + __entry->file2_mtime = fxr->file2_mtime.tv_sec; + __entry->file2_ctime = fxr->file2_ctime.tv_sec; + __entry->file2_mtime_nsec = fxr->file2_mtime.tv_nsec; + __entry->file2_ctime_nsec = fxr->file2_ctime.tv_nsec; + ), + TP_printk("dev %d:%d " + "ino 0x%llx mtime %lld:%d ctime %lld:%d -> " + "file 0x%llx mtime %lld:%d ctime %lld:%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ip2_ino, + __entry->ip2_mtime, + __entry->ip2_mtime_nsec, + __entry->ip2_ctime, + __entry->ip2_ctime_nsec, + __entry->file2_ino, + __entry->file2_mtime, + __entry->file2_mtime_nsec, + __entry->file2_ctime, + __entry->file2_ctime_nsec) +); + TRACE_EVENT(xfs_exchmaps_overhead, TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks, unsigned long long rmapbt_blocks), diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 0fafcc9f3dbe..8ede9d099d1f 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -644,7 +644,12 @@ xfsaild( set_freezable(); while (1) { - if (tout) + /* + * Long waits of 50ms or more occur when we've run out of items + * to push, so we only want uninterruptible state if we're + * actually blocked on something. + */ + if (tout && tout <= 20) set_current_state(TASK_KILLABLE|TASK_FREEZABLE); else set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); |
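The reworked exact allocator in xfs_rtalloc.c above now only finds a free run and leaves the bitmap update to its caller (xfs_rtallocate() calls xfs_rtallocate_range() once a search succeeds). Minus the end-of-volume clamp, its decision logic boils down to: take the whole request if it is free, otherwise take the shorter run up to the first allocated extent when that run, aligned down to prod, still meets minlen. A toy-bitmap sketch of that logic; free_map, check_range(), and alloc_exact() are illustrative stand-ins:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define NEXTENTS 16

/* GNU range initializer: extents 0-9 free, the rest allocated. */
static bool free_map[NEXTENTS] = { [0 ... 9] = true };

/*
 * Like xfs_rtcheck_range(..., val=1, ...): is [start, start + len) all
 * free?  If not, *next is the first non-free extent.
 */
static bool check_range(int start, int len, int *next)
{
        for (int i = start; i < start + len; i++) {
                if (!free_map[i]) {
                        *next = i;
                        return false;
                }
        }
        return true;
}

static int rounddown(int len, int prod)
{
        return prod > 1 ? len - len % prod : len;
}

/* Decision logic of the reworked exact allocator (bitmap update omitted). */
static int alloc_exact(int start, int minlen, int maxlen, int prod, int *len)
{
        int next;

        if (check_range(start, maxlen, &next)) {
                *len = maxlen;          /* whole request is free */
                return 0;
        }
        *len = rounddown(next - start, prod);
        return *len >= minlen ? 0 : -ENOSPC;
}

int main(void)
{
        int len;
        int error = alloc_exact(4, 2, 8, 2, &len);

        /* extents 4-9 are free (run of 6); aligned to prod=2 -> len 6 */
        printf("error=%d len=%d\n", error, len);
        return 0;
}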
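xfs_growfs_rt_bmblock() above grows the rt volume one bitmap block per transaction: each bitmap block tracks NBBY * sb_blocksize rt extents, so nrblocks_step caps each round at (bmbno + 1) * NBBY * sb_blocksize * rextsize rt device blocks, and xfs_last_rt_bmblock() backs up one block so a partially used final bitmap block gets reprocessed. Worked numbers for that arithmetic; the values are arbitrary examples, and a whole-block modulo stands in for the kernel's word-offset check:

#include <stdint.h>
#include <stdio.h>

#define NBBY 8  /* bits per byte */

int main(void)
{
        uint32_t blocksize = 4096;      /* bytes per fs block */
        uint32_t rextsize = 4;          /* fs blocks per rt extent */
        uint64_t rextents = 100000;     /* current rt extents */
        uint64_t rbmblocks = 4;         /* current bitmap blocks */

        /* rt extents trackable per bitmap block: one bit each */
        uint64_t rtx_per_bmblock = (uint64_t)NBBY * blocksize;

        /* rt device blocks covered once bitmap block bmbno is full */
        uint64_t bmbno = 0;
        uint64_t nrblocks_step = (bmbno + 1) * rtx_per_bmblock * rextsize;

        /* Skip the current last bitmap block only if it is exactly full. */
        uint64_t first_bmbno = rbmblocks -
                        (rextents % rtx_per_bmblock == 0 ? 0 : 1);

        printf("rtx per bitmap block: %llu\n",
               (unsigned long long)rtx_per_bmblock);
        printf("blocks after bitmap block 0: %llu\n",
               (unsigned long long)nrblocks_step);
        printf("grow starts at bitmap block: %llu\n",
               (unsigned long long)first_bmbno);
        return 0;
}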
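xfs_rtpick_extent() above now returns its pick directly instead of through an out parameter. Per its comment, it spaces initial allocations across the rt volume by the fraction sequence 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ..., keyed by a sequence number kept in the bitmap inode's atime. The arithmetic below is reconstructed from the upstream implementation rather than from the hunk shown, so treat it as illustrative; highbit64() stands in for the kernel's xfs_highbit64():

#include <stdint.h>
#include <stdio.h>

static int highbit64(uint64_t v)
{
        return v ? 63 - __builtin_clzll(v) : -1;
}

/* Fraction sequence 0, 1/2, 1/4, 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, ... */
static uint64_t pick_extent(uint64_t rextents, uint64_t seq)
{
        int log2 = highbit64(seq);
        uint64_t resid;

        if (log2 < 0)
                return 0;
        resid = seq - (1ULL << log2);
        return (rextents * ((resid << 1) + 1)) >> (log2 + 1);
}

int main(void)
{
        /* For 1024 rt extents: 0, 512, 256, 768, 128, 384, 640, 896. */
        for (uint64_t seq = 0; seq < 8; seq++)
                printf("seq %llu -> extent %llu\n",
                       (unsigned long long)seq,
                       (unsigned long long)pick_extent(1024, seq));
        return 0;
}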
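The reworked xfs_bmap_rtalloc() above collapses its retry conditions into a single noalign flag: the first pass honors the inode's extent size hint, and on -ENOSPC it restores the original offset and length and retries with plain rt extent size alignment, while the retry without locality now happens inside xfs_rtallocate() itself. A schematic of the outer retry loop; struct alloc_req and the allocate() callback are hypothetical:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct alloc_req {
        long offset, length;    /* may be enlarged by the extsize hint */
        bool noalign;           /* second pass: minimal alignment only */
};

/* Hypothetical allocator: fails while the hint-enlarged request is used. */
static int allocate(const struct alloc_req *req)
{
        return req->noalign ? 0 : -ENOSPC;
}

static int rtalloc(struct alloc_req *req)
{
        long orig_offset = req->offset, orig_length = req->length;
        int error;

retry:
        error = allocate(req);
        if (error == -ENOSPC && !req->noalign) {
                /* Drop the extent size hint and try once more. */
                req->offset = orig_offset;
                req->length = orig_length;
                req->noalign = true;
                goto retry;
        }
        return error;
}

int main(void)
{
        struct alloc_req req = { .offset = 0, .length = 8 };

        printf("rtalloc -> %d (noalign=%d)\n", rtalloc(&req), req.noalign);
        return 0;
}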
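The closing xfs_trans_ail.c hunk keys the AIL daemon's sleep state off its own timeout: per the new comment, waits of 50ms or more mean it has run out of items to push, so only short waits (20ms or less) use the uninterruptible-but-killable state. A compact restatement of that predicate, dropping the TASK_FREEZABLE modifier for brevity; the enum is a stand-in for the kernel's task state bits:

#include <stdio.h>

enum task_state { TASK_KILLABLE, TASK_INTERRUPTIBLE };

/*
 * Waits of 50ms or more happen when the AIL has run out of items to
 * push; only short waits indicate we are blocked on something real.
 */
static enum task_state xfsaild_sleep_state(int tout_ms)
{
        return (tout_ms && tout_ms <= 20) ?
                        TASK_KILLABLE : TASK_INTERRUPTIBLE;
}

int main(void)
{
        printf("tout=10 -> %s\n", xfsaild_sleep_state(10) == TASK_KILLABLE ?
                        "killable" : "interruptible");
        printf("tout=50 -> %s\n", xfsaild_sleep_state(50) == TASK_KILLABLE ?
                        "killable" : "interruptible");
        printf("tout=0  -> %s\n", xfsaild_sleep_state(0) == TASK_KILLABLE ?
                        "killable" : "interruptible");
        return 0;
}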