Diffstat (limited to 'fs')
178 files changed, 10816 insertions, 7350 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 21c6332fa785..32dff7ba3dda 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -142,12 +142,6 @@ config BINFMT_ZFLAT help Support FLAT format compressed binaries -config BINFMT_SHARED_FLAT - bool "Enable shared FLAT support" - depends on BINFMT_FLAT - help - Support FLAT shared libraries - config HAVE_AOUT def_bool n diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 2fe402483ad5..30b066299d39 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -740,10 +740,22 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, { struct inode *inode = d_inode(path->dentry); struct afs_vnode *vnode = AFS_FS_I(inode); - int seq = 0; + struct key *key; + int ret, seq = 0; _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); + if (!(query_flags & AT_STATX_DONT_SYNC) && + !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) + return PTR_ERR(key); + ret = afs_validate(vnode, key); + key_put(key); + if (ret < 0) + return ret; + } + do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); generic_fillattr(&init_user_ns, inode, stat); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 626898150011..c26545d71d39 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -37,7 +37,6 @@ #include <linux/flat.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> -#include <linux/coredump.h> #include <asm/byteorder.h> #include <asm/unaligned.h> @@ -69,11 +68,7 @@ #define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ #define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ -#ifdef CONFIG_BINFMT_SHARED_FLAT -#define MAX_SHARED_LIBS (4) -#else -#define MAX_SHARED_LIBS (1) -#endif +#define MAX_SHARED_LIBS (1) #ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET #define DATA_START_OFFSET_WORDS (0) @@ -93,38 +88,13 @@ struct lib_info { } lib_list[MAX_SHARED_LIBS]; }; -#ifdef CONFIG_BINFMT_SHARED_FLAT -static int load_flat_shared_library(int id, struct lib_info *p); -#endif - static int load_flat_binary(struct linux_binprm *); -#ifdef CONFIG_COREDUMP -static int flat_core_dump(struct coredump_params *cprm); -#endif static struct linux_binfmt flat_format = { .module = THIS_MODULE, .load_binary = load_flat_binary, -#ifdef CONFIG_COREDUMP - .core_dump = flat_core_dump, - .min_coredump = PAGE_SIZE -#endif }; -/****************************************************************************/ -/* - * Routine writes a core dump image in the current directory. - * Currently only a stub-function. 
- */ - -#ifdef CONFIG_COREDUMP -static int flat_core_dump(struct coredump_params *cprm) -{ - pr_warn("Process %s:%d received signr %d and should have core dumped\n", - current->comm, current->pid, cprm->siginfo->si_signo); - return 1; -} -#endif /****************************************************************************/ /* @@ -329,51 +299,18 @@ out_free: /****************************************************************************/ static unsigned long -calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) +calc_reloc(unsigned long r, struct lib_info *p) { unsigned long addr; - int id; unsigned long start_brk; unsigned long start_data; unsigned long text_len; unsigned long start_code; -#ifdef CONFIG_BINFMT_SHARED_FLAT - if (r == 0) - id = curid; /* Relocs of 0 are always self referring */ - else { - id = (r >> 24) & 0xff; /* Find ID for this reloc */ - r &= 0x00ffffff; /* Trim ID off here */ - } - if (id >= MAX_SHARED_LIBS) { - pr_err("reference 0x%lx to shared library %d", r, id); - goto failed; - } - if (curid != id) { - if (internalp) { - pr_err("reloc address 0x%lx not in same module " - "(%d != %d)", r, curid, id); - goto failed; - } else if (!p->lib_list[id].loaded && - load_flat_shared_library(id, p) < 0) { - pr_err("failed to load library %d", id); - goto failed; - } - /* Check versioning information (i.e. time stamps) */ - if (p->lib_list[id].build_date && p->lib_list[curid].build_date && - p->lib_list[curid].build_date < p->lib_list[id].build_date) { - pr_err("library %d is younger than %d", id, curid); - goto failed; - } - } -#else - id = 0; -#endif - - start_brk = p->lib_list[id].start_brk; - start_data = p->lib_list[id].start_data; - start_code = p->lib_list[id].start_code; - text_len = p->lib_list[id].text_len; + start_brk = p->lib_list[0].start_brk; + start_data = p->lib_list[0].start_data; + start_code = p->lib_list[0].start_code; + text_len = p->lib_list[0].text_len; if (r > start_brk - start_data + text_len) { pr_err("reloc outside program 0x%lx (0 - 0x%lx/0x%lx)", @@ -440,8 +377,32 @@ static void old_reloc(unsigned long rl) /****************************************************************************/ +static inline u32 __user *skip_got_header(u32 __user *rp) +{ + if (IS_ENABLED(CONFIG_RISCV)) { + /* + * RISC-V has a 16 byte GOT PLT header for elf64-riscv + * and 8 byte GOT PLT header for elf32-riscv. + * Skip the whole GOT PLT header, since it is reserved + * for the dynamic linker (ld.so). 
+ */ + u32 rp_val0, rp_val1; + + if (get_user(rp_val0, rp)) + return rp; + if (get_user(rp_val1, rp + 1)) + return rp; + + if (rp_val0 == 0xffffffff && rp_val1 == 0xffffffff) + rp += 4; + else if (rp_val0 == 0xffffffff) + rp += 2; + } + return rp; +} + static int load_flat_file(struct linux_binprm *bprm, - struct lib_info *libinfo, int id, unsigned long *extra_stack) + struct lib_info *libinfo, unsigned long *extra_stack) { struct flat_hdr *hdr; unsigned long textpos, datapos, realdatastart; @@ -493,14 +454,6 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } - /* Don't allow old format executables to use shared libraries */ - if (rev == OLD_FLAT_VERSION && id != 0) { - pr_err("shared libraries are not available before rev 0x%lx\n", - FLAT_VERSION); - ret = -ENOEXEC; - goto err; - } - /* * fix up the flags for the older format, there were all kinds * of endian hacks, this only works for the simple cases @@ -551,15 +504,13 @@ static int load_flat_file(struct linux_binprm *bprm, } /* Flush all traces of the currently running executable */ - if (id == 0) { - ret = begin_new_exec(bprm); - if (ret) - goto err; + ret = begin_new_exec(bprm); + if (ret) + goto err; - /* OK, This is the point of no return */ - set_personality(PER_LINUX_32BIT); - setup_new_exec(bprm); - } + /* OK, This is the point of no return */ + set_personality(PER_LINUX_32BIT); + setup_new_exec(bprm); /* * calculate the extra space we need to map in @@ -739,42 +690,40 @@ static int load_flat_file(struct linux_binprm *bprm, text_len -= sizeof(struct flat_hdr); /* the real code len */ /* The main program needs a little extra setup in the task structure */ - if (id == 0) { - current->mm->start_code = start_code; - current->mm->end_code = end_code; - current->mm->start_data = datapos; - current->mm->end_data = datapos + data_len; - /* - * set up the brk stuff, uses any slack left in data/bss/stack - * allocation. We put the brk after the bss (between the bss - * and stack) like other platforms. - * Userspace code relies on the stack pointer starting out at - * an address right at the end of a page. - */ - current->mm->start_brk = datapos + data_len + bss_len; - current->mm->brk = (current->mm->start_brk + 3) & ~3; + current->mm->start_code = start_code; + current->mm->end_code = end_code; + current->mm->start_data = datapos; + current->mm->end_data = datapos + data_len; + /* + * set up the brk stuff, uses any slack left in data/bss/stack + * allocation. We put the brk after the bss (between the bss + * and stack) like other platforms. + * Userspace code relies on the stack pointer starting out at + * an address right at the end of a page. + */ + current->mm->start_brk = datapos + data_len + bss_len; + current->mm->brk = (current->mm->start_brk + 3) & ~3; #ifndef CONFIG_MMU - current->mm->context.end_brk = memp + memp_size - stack_len; + current->mm->context.end_brk = memp + memp_size - stack_len; #endif - } if (flags & FLAT_FLAG_KTRACE) { pr_info("Mapping is %lx, Entry point is %x, data_start is %x\n", textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); pr_info("%s %s: TEXT=%lx-%lx DATA=%lx-%lx BSS=%lx-%lx\n", - id ? 
"Lib" : "Load", bprm->filename, + "Load", bprm->filename, start_code, end_code, datapos, datapos + data_len, datapos + data_len, (datapos + data_len + bss_len + 3) & ~3); } /* Store the current module values into the global library structure */ - libinfo->lib_list[id].start_code = start_code; - libinfo->lib_list[id].start_data = datapos; - libinfo->lib_list[id].start_brk = datapos + data_len + bss_len; - libinfo->lib_list[id].text_len = text_len; - libinfo->lib_list[id].loaded = 1; - libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos; - libinfo->lib_list[id].build_date = ntohl(hdr->build_date); + libinfo->lib_list[0].start_code = start_code; + libinfo->lib_list[0].start_data = datapos; + libinfo->lib_list[0].start_brk = datapos + data_len + bss_len; + libinfo->lib_list[0].text_len = text_len; + libinfo->lib_list[0].loaded = 1; + libinfo->lib_list[0].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos; + libinfo->lib_list[0].build_date = ntohl(hdr->build_date); /* * We just load the allocations into some temporary memory to @@ -789,14 +738,15 @@ static int load_flat_file(struct linux_binprm *bprm, * image. */ if (flags & FLAT_FLAG_GOTPIC) { - for (rp = (u32 __user *)datapos; ; rp++) { + rp = skip_got_header((u32 __user *) datapos); + for (; ; rp++) { u32 addr, rp_val; if (get_user(rp_val, rp)) return -EFAULT; if (rp_val == 0xffffffff) break; if (rp_val) { - addr = calc_reloc(rp_val, libinfo, id, 0); + addr = calc_reloc(rp_val, libinfo); if (addr == RELOC_FAILED) { ret = -ENOEXEC; goto err; @@ -832,7 +782,7 @@ static int load_flat_file(struct linux_binprm *bprm, return -EFAULT; relval = ntohl(tmp); addr = flat_get_relocate_addr(relval); - rp = (u32 __user *)calc_reloc(addr, libinfo, id, 1); + rp = (u32 __user *)calc_reloc(addr, libinfo); if (rp == (u32 __user *)RELOC_FAILED) { ret = -ENOEXEC; goto err; @@ -855,7 +805,7 @@ static int load_flat_file(struct linux_binprm *bprm, */ addr = ntohl((__force __be32)addr); } - addr = calc_reloc(addr, libinfo, id, 0); + addr = calc_reloc(addr, libinfo); if (addr == RELOC_FAILED) { ret = -ENOEXEC; goto err; @@ -883,7 +833,7 @@ static int load_flat_file(struct linux_binprm *bprm, /* zero the BSS, BRK and stack areas */ if (clear_user((void __user *)(datapos + data_len), bss_len + (memp + memp_size - stack_len - /* end brk */ - libinfo->lib_list[id].start_brk) + /* start brk */ + libinfo->lib_list[0].start_brk) + /* start brk */ stack_len)) return -EFAULT; @@ -894,49 +844,6 @@ err: /****************************************************************************/ -#ifdef CONFIG_BINFMT_SHARED_FLAT - -/* - * Load a shared library into memory. The library gets its own data - * segment (including bss) but not argv/argc/environ. - */ - -static int load_flat_shared_library(int id, struct lib_info *libs) -{ - /* - * This is a fake bprm struct; only the members "buf", "file" and - * "filename" are actually used. 
- */ - struct linux_binprm bprm; - int res; - char buf[16]; - loff_t pos = 0; - - memset(&bprm, 0, sizeof(bprm)); - - /* Create the file name */ - sprintf(buf, "/lib/lib%d.so", id); - - /* Open the file up */ - bprm.filename = buf; - bprm.file = open_exec(bprm.filename); - res = PTR_ERR(bprm.file); - if (IS_ERR(bprm.file)) - return res; - - res = kernel_read(bprm.file, bprm.buf, BINPRM_BUF_SIZE, &pos); - - if (res >= 0) - res = load_flat_file(&bprm, libs, id, NULL); - - allow_write_access(bprm.file); - fput(bprm.file); - - return res; -} - -#endif /* CONFIG_BINFMT_SHARED_FLAT */ -/****************************************************************************/ /* * These are the functions used to load flat style executables and shared @@ -968,7 +875,7 @@ static int load_flat_binary(struct linux_binprm *bprm) stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ stack_len = ALIGN(stack_len, FLAT_STACK_ALIGN); - res = load_flat_file(bprm, &libinfo, 0, &stack_len); + res = load_flat_file(bprm, &libinfo, &stack_len); if (res < 0) return res; @@ -1013,20 +920,6 @@ static int load_flat_binary(struct linux_binprm *bprm) */ start_addr = libinfo.lib_list[0].entry; -#ifdef CONFIG_BINFMT_SHARED_FLAT - for (i = MAX_SHARED_LIBS-1; i > 0; i--) { - if (libinfo.lib_list[i].loaded) { - /* Push previos first to call address */ - unsigned long __user *sp; - current->mm->start_stack -= sizeof(unsigned long); - sp = (unsigned long __user *)current->mm->start_stack; - if (put_user(start_addr, sp)) - return -EFAULT; - start_addr = libinfo.lib_list[i].entry; - } - } -#endif - #ifdef FLAT_PLAT_INIT FLAT_PLAT_INIT(regs); #endif diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0a0d0eccee4e..548d6a5477b4 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -55,9 +55,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) return acl; } -static int __btrfs_set_acl(struct btrfs_trans_handle *trans, - struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, int type) +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type) { int ret, size = 0; const char *name; @@ -123,40 +122,8 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, if (ret) return ret; } - ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type); + ret = __btrfs_set_acl(NULL, inode, acl, type); if (ret) inode->i_mode = old_mode; return ret; } - -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) -{ - struct posix_acl *default_acl, *acl; - int ret = 0; - - /* this happens with subvols */ - if (!dir) - return 0; - - ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (ret) - return ret; - - if (default_acl) { - ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl, - ACL_TYPE_DEFAULT); - posix_acl_release(default_acl); - } - - if (acl) { - if (!ret) - ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl, - ACL_TYPE_ACCESS); - posix_acl_release(acl); - } - - if (!default_acl && !acl) - cache_no_acl(inode); - return ret; -} diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 43c89952b7d2..aac240430efe 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -15,13 +15,12 @@ enum { WORK_DONE_BIT, WORK_ORDER_DONE_BIT, - WORK_HIGH_PRIO_BIT, }; #define NO_THRESHOLD (-1) #define DFT_THRESHOLD (32) -struct __btrfs_workqueue { +struct btrfs_workqueue { struct workqueue_struct *normal_wq; /* File system this workqueue services */ 
@@ -48,12 +47,7 @@ struct __btrfs_workqueue { spinlock_t thres_lock; }; -struct btrfs_workqueue { - struct __btrfs_workqueue *normal; - struct __btrfs_workqueue *high; -}; - -struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq) +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq) { return wq->fs_info; } @@ -66,22 +60,22 @@ struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work) bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) { /* - * We could compare wq->normal->pending with num_online_cpus() + * We could compare wq->pending with num_online_cpus() * to support "thresh == NO_THRESHOLD" case, but it requires * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's * postpone it until someone needs the support of that case. */ - if (wq->normal->thresh == NO_THRESHOLD) + if (wq->thresh == NO_THRESHOLD) return false; - return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2; + return atomic_read(&wq->pending) > wq->thresh * 2; } -static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, - unsigned int flags, int limit_active, int thresh) +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, + int limit_active, int thresh) { - struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; @@ -105,12 +99,8 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, ret->thresh = thresh; } - if (flags & WQ_HIGHPRI) - ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags, - ret->current_active, name); - else - ret->normal_wq = alloc_workqueue("btrfs-%s", flags, - ret->current_active, name); + ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active, + name); if (!ret->normal_wq) { kfree(ret); return NULL; @@ -119,41 +109,7 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, INIT_LIST_HEAD(&ret->ordered_list); spin_lock_init(&ret->list_lock); spin_lock_init(&ret->thres_lock); - trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI); - return ret; -} - -static inline void -__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); - -struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, - const char *name, - unsigned int flags, - int limit_active, - int thresh) -{ - struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); - - if (!ret) - return NULL; - - ret->normal = __btrfs_alloc_workqueue(fs_info, name, - flags & ~WQ_HIGHPRI, - limit_active, thresh); - if (!ret->normal) { - kfree(ret); - return NULL; - } - - if (flags & WQ_HIGHPRI) { - ret->high = __btrfs_alloc_workqueue(fs_info, name, flags, - limit_active, thresh); - if (!ret->high) { - __btrfs_destroy_workqueue(ret->normal); - kfree(ret); - return NULL; - } - } + trace_btrfs_workqueue_alloc(ret, name); return ret; } @@ -162,7 +118,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, * This hook WILL be called in IRQ handler context, * so workqueue_set_max_active MUST NOT be called in this hook */ -static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) +static inline void thresh_queue_hook(struct btrfs_workqueue *wq) { if (wq->thresh == NO_THRESHOLD) return; @@ -174,7 +130,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) * This hook is called in kthread content. 
* So workqueue_set_max_active is called here. */ -static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) +static inline void thresh_exec_hook(struct btrfs_workqueue *wq) { int new_current_active; long pending; @@ -217,7 +173,7 @@ out: } } -static void run_ordered_work(struct __btrfs_workqueue *wq, +static void run_ordered_work(struct btrfs_workqueue *wq, struct btrfs_work *self) { struct list_head *list = &wq->ordered_list; @@ -305,7 +261,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) { struct btrfs_work *work = container_of(normal_work, struct btrfs_work, normal_work); - struct __btrfs_workqueue *wq; + struct btrfs_workqueue *wq = work->wq; int need_order = 0; /* @@ -318,7 +274,6 @@ static void btrfs_work_helper(struct work_struct *normal_work) */ if (work->ordered_func) need_order = 1; - wq = work->wq; trace_btrfs_work_sched(work); thresh_exec_hook(wq); @@ -350,8 +305,7 @@ void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, work->flags = 0; } -static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, - struct btrfs_work *work) +void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work) { unsigned long flags; @@ -366,54 +320,22 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, queue_work(wq->normal_wq, &work->normal_work); } -void btrfs_queue_work(struct btrfs_workqueue *wq, - struct btrfs_work *work) -{ - struct __btrfs_workqueue *dest_wq; - - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high) - dest_wq = wq->high; - else - dest_wq = wq->normal; - __btrfs_queue_work(dest_wq, work); -} - -static inline void -__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq) -{ - destroy_workqueue(wq->normal_wq); - trace_btrfs_workqueue_destroy(wq); - kfree(wq); -} - void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) { if (!wq) return; - if (wq->high) - __btrfs_destroy_workqueue(wq->high); - __btrfs_destroy_workqueue(wq->normal); + destroy_workqueue(wq->normal_wq); + trace_btrfs_workqueue_destroy(wq); kfree(wq); } void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active) { - if (!wq) - return; - wq->normal->limit_active = limit_active; - if (wq->high) - wq->high->limit_active = limit_active; -} - -void btrfs_set_work_high_priority(struct btrfs_work *work) -{ - set_bit(WORK_HIGH_PRIO_BIT, &work->flags); + if (wq) + wq->limit_active = limit_active; } void btrfs_flush_workqueue(struct btrfs_workqueue *wq) { - if (wq->high) - flush_workqueue(wq->high->normal_wq); - - flush_workqueue(wq->normal->normal_wq); + flush_workqueue(wq->normal_wq); } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 3204daa51b95..07960529b360 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -11,8 +11,6 @@ struct btrfs_fs_info; struct btrfs_workqueue; -/* Internal use only */ -struct __btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); typedef void (*btrfs_work_func_t)(struct work_struct *arg); @@ -25,7 +23,7 @@ struct btrfs_work { /* Don't touch things below */ struct work_struct normal_work; struct list_head ordered_list; - struct __btrfs_workqueue *wq; + struct btrfs_workqueue *wq; unsigned long flags; }; @@ -40,9 +38,8 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work); void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); -void btrfs_set_work_high_priority(struct btrfs_work *work); struct btrfs_fs_info * __pure 
btrfs_work_owner(const struct btrfs_work *work); -struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq); bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); void btrfs_flush_workqueue(struct btrfs_workqueue *wq); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 0dd6de994199..ede389f2602d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -168,11 +168,12 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, struct rb_node **p; struct rb_node *parent = NULL; struct btrfs_block_group *cache; + bool leftmost = true; ASSERT(block_group->length != 0); - spin_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_node; + write_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_root.rb_node; while (*p) { parent = *p; @@ -181,20 +182,18 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, p = &(*p)->rb_left; } else if (block_group->start > cache->start) { p = &(*p)->rb_right; + leftmost = false; } else { - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); return -EEXIST; } } rb_link_node(&block_group->cache_node, parent, p); - rb_insert_color(&block_group->cache_node, - &info->block_group_cache_tree); + rb_insert_color_cached(&block_group->cache_node, + &info->block_group_cache_tree, leftmost); - if (info->first_logical_byte > block_group->start) - info->first_logical_byte = block_group->start; - - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); return 0; } @@ -210,8 +209,8 @@ static struct btrfs_block_group *block_group_cache_tree_search( struct rb_node *n; u64 end, start; - spin_lock(&info->block_group_cache_lock); - n = info->block_group_cache_tree.rb_node; + read_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_root.rb_node; while (n) { cache = rb_entry(n, struct btrfs_block_group, cache_node); @@ -233,12 +232,9 @@ static struct btrfs_block_group *block_group_cache_tree_search( break; } } - if (ret) { + if (ret) btrfs_get_block_group(ret); - if (bytenr == 0 && info->first_logical_byte > ret->start) - info->first_logical_byte = ret->start; - } - spin_unlock(&info->block_group_cache_lock); + read_unlock(&info->block_group_cache_lock); return ret; } @@ -267,15 +263,15 @@ struct btrfs_block_group *btrfs_next_block_group( struct btrfs_fs_info *fs_info = cache->fs_info; struct rb_node *node; - spin_lock(&fs_info->block_group_cache_lock); + read_lock(&fs_info->block_group_cache_lock); /* If our block group was removed, we need a full search. */ if (RB_EMPTY_NODE(&cache->cache_node)) { const u64 next_bytenr = cache->start + cache->length; - spin_unlock(&fs_info->block_group_cache_lock); + read_unlock(&fs_info->block_group_cache_lock); btrfs_put_block_group(cache); - cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; + return btrfs_lookup_first_block_group(fs_info, next_bytenr); } node = rb_next(&cache->cache_node); btrfs_put_block_group(cache); @@ -284,46 +280,70 @@ struct btrfs_block_group *btrfs_next_block_group( btrfs_get_block_group(cache); } else cache = NULL; - spin_unlock(&fs_info->block_group_cache_lock); + read_unlock(&fs_info->block_group_cache_lock); return cache; } -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +/** + * Check if we can do a NOCOW write for a given extent. 
+ * + * @fs_info: The filesystem information object. + * @bytenr: Logical start address of the extent. + * + * Check if we can do a NOCOW write for the given extent, and increments the + * number of NOCOW writers in the block group that contains the extent, as long + * as the block group exists and it's currently not in read-only mode. + * + * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller + * is responsible for calling btrfs_dec_nocow_writers() later. + * + * Or NULL if we can not do a NOCOW write + */ +struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, + u64 bytenr) { struct btrfs_block_group *bg; - bool ret = true; + bool can_nocow = true; bg = btrfs_lookup_block_group(fs_info, bytenr); if (!bg) - return false; + return NULL; spin_lock(&bg->lock); if (bg->ro) - ret = false; + can_nocow = false; else atomic_inc(&bg->nocow_writers); spin_unlock(&bg->lock); - /* No put on block group, done by btrfs_dec_nocow_writers */ - if (!ret) + if (!can_nocow) { btrfs_put_block_group(bg); + return NULL; + } - return ret; + /* No put on block group, done by btrfs_dec_nocow_writers(). */ + return bg; } -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +/** + * Decrement the number of NOCOW writers in a block group. + * + * @bg: The block group. + * + * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), + * and on the block group returned by that call. Typically this is called after + * creating an ordered extent for a NOCOW write, to prevent races with scrub and + * relocation. + * + * After this call, the caller should not use the block group anymore. It it wants + * to use it, then it should get a reference on it before calling this function. + */ +void btrfs_dec_nocow_writers(struct btrfs_block_group *bg) { - struct btrfs_block_group *bg; - - bg = btrfs_lookup_block_group(fs_info, bytenr); - ASSERT(bg); if (atomic_dec_and_test(&bg->nocow_writers)) wake_up_var(&bg->nocow_writers); - /* - * Once for our lookup and once for the lookup done by a previous call - * to btrfs_inc_nocow_writers() - */ - btrfs_put_block_group(bg); + + /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). 
*/ btrfs_put_block_group(bg); } @@ -772,10 +792,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only cache->has_caching_ctl = 1; spin_unlock(&cache->lock); - spin_lock(&fs_info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); refcount_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); btrfs_get_block_group(cache); @@ -957,17 +977,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (ret) goto out; - spin_lock(&fs_info->block_group_cache_lock); - rb_erase(&block_group->cache_node, - &fs_info->block_group_cache_tree); + write_lock(&fs_info->block_group_cache_lock); + rb_erase_cached(&block_group->cache_node, + &fs_info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); /* Once for the block groups rbtree */ btrfs_put_block_group(block_group); - if (fs_info->first_logical_byte == block_group->start) - fs_info->first_logical_byte = (u64)-1; - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); /* @@ -992,7 +1010,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (block_group->cached == BTRFS_CACHE_STARTED) btrfs_wait_block_group_cache_done(block_group); if (block_group->has_caching_ctl) { - spin_lock(&fs_info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); if (!caching_ctl) { struct btrfs_caching_control *ctl; @@ -1006,7 +1024,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } if (caching_ctl) list_del_init(&caching_ctl->list); - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); if (caching_ctl) { /* Once for the caching bgs list and once for us. */ btrfs_put_caching_control(caching_ctl); @@ -1367,6 +1385,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) goto next; } + ret = btrfs_zone_finish(block_group); + if (ret < 0) { + btrfs_dec_block_group_ro(block_group); + if (ret == -EAGAIN) + ret = 0; + goto next; + } + /* * Want to do this before we do anything else so we can recover * properly if we fail to join the transaction. 
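The kernel-doc above changes the NOCOW helper contract: btrfs_inc_nocow_writers() now returns the block group (or NULL when a NOCOW write is not possible) and btrfs_dec_nocow_writers() takes that pointer directly instead of redoing the lookup. A minimal caller sketch follows; the function name try_nocow_write_sketch() and the fs_info/disk_bytenr parameters are placeholders for illustration only and are not taken from this patch.

static bool try_nocow_write_sketch(struct btrfs_fs_info *fs_info, u64 disk_bytenr)
{
	struct btrfs_block_group *bg;

	/* Returns NULL if the block group is missing or currently read-only. */
	bg = btrfs_inc_nocow_writers(fs_info, disk_bytenr);
	if (!bg)
		return false;	/* caller falls back to a COW write */

	/* ... create the ordered extent for the NOCOW write here ... */

	/* Drops the nocow_writers count and the reference taken by the lookup. */
	btrfs_dec_nocow_writers(bg);
	return true;
}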
@@ -1512,6 +1538,13 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, return bg1->used > bg2->used; } +static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) +{ + if (btrfs_is_zoned(fs_info)) + return btrfs_zoned_should_reclaim(fs_info); + return true; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1522,6 +1555,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; + if (!btrfs_should_reclaim(fs_info)) + return; + sb_start_write(fs_info->sb); if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { @@ -1692,35 +1728,13 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; struct btrfs_key found_key; - struct extent_buffer *leaf; - int slot; - - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; - - while (1) { - slot = path->slots[0]; - leaf = path->nodes[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - break; - } - btrfs_item_key_to_cpu(leaf, &found_key, slot); + btrfs_for_each_slot(root, key, &found_key, path, ret) { if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - ret = read_bg_from_eb(fs_info, &found_key, path); - break; + return read_bg_from_eb(fs_info, &found_key, path); } - - path->slots[0]++; } -out: return ret; } @@ -3220,6 +3234,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) return ret; } +static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, + u64 bytes_freed) +{ + const struct btrfs_space_info *space_info = bg->space_info; + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const u64 new_val = bg->used; + const u64 old_val = new_val + bytes_freed; + u64 thresh; + + if (reclaim_thresh == 0) + return false; + + thresh = div_factor_fine(bg->length, reclaim_thresh); + + /* + * If we were below the threshold before don't reclaim, we are likely a + * brand new block group and we don't want to relocate new block groups. 
+ */ + if (old_val < thresh) + return false; + if (new_val >= thresh) + return false; + return true; +} + int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { @@ -3242,6 +3281,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { + bool reclaim; + cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { ret = -ENOENT; @@ -3287,6 +3328,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; + + reclaim = should_reclaim_block_group(cache, num_bytes); spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -3313,6 +3356,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, if (!alloc && old_val == 0) { if (!btrfs_test_opt(info, DISCARD_ASYNC)) btrfs_mark_bg_unused(cache); + } else if (!alloc && reclaim) { + btrfs_mark_bg_to_reclaim(cache); } btrfs_put_block_group(cache); @@ -3957,14 +4002,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; - spin_lock(&info->block_group_cache_lock); + write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); btrfs_put_caching_control(caching_ctl); } - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); spin_lock(&info->unused_bgs_lock); while (!list_empty(&info->unused_bgs)) { @@ -3994,14 +4039,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->zone_active_bgs_lock); - spin_lock(&info->block_group_cache_lock); - while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + write_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group, cache_node); - rb_erase(&block_group->cache_node, - &info->block_group_cache_tree); + rb_erase_cached(&block_group->cache_node, + &info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); list_del(&block_group->list); @@ -4024,9 +4069,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) ASSERT(block_group->swap_extents == 0); btrfs_put_block_group(block_group); - spin_lock(&info->block_group_cache_lock); + write_lock(&info->block_group_cache_lock); } - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); btrfs_release_global_block_rsv(info); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index e8308f2ad07d..3ac668ace50a 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -212,6 +212,8 @@ struct btrfs_block_group { u64 meta_write_pointer; struct map_lookup *physical_map; struct list_head active_bg_list; + struct work_struct zone_finish_work; + struct extent_buffer *last_eb; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -254,8 +256,9 @@ void btrfs_put_block_group(struct btrfs_block_group *cache); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg); -bool 
btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, + u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 32131a5d321b..33811e896623 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -395,31 +395,6 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) return true; } -struct btrfs_dio_private { - struct inode *inode; - - /* - * Since DIO can use anonymous page, we cannot use page_offset() to - * grab the file offset, thus need a dedicated member for file offset. - */ - u64 file_offset; - u64 disk_bytenr; - /* Used for bio::bi_size */ - u32 bytes; - - /* - * References to this structure. There is one reference per in-flight - * bio plus one while we're still setting up. - */ - refcount_t refs; - - /* dio_bio came from fs/direct-io.c */ - struct bio *dio_bio; - - /* Array of checksums */ - u8 csums[]; -}; - /* * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two * separate u32s. These two functions convert between the two representations. diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index abac86a75840..5d20137b7b67 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1552,21 +1552,18 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -ENOMEM; block_ctx->datav = block_ctx->mem_to_free; block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); - for (i = 0; i < num_pages; i++) { - block_ctx->pagev[i] = alloc_page(GFP_NOFS); - if (!block_ctx->pagev[i]) - return -1; - } + ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev); + if (ret) + return ret; dev_bytenr = block_ctx->dev_bytenr; for (i = 0; i < num_pages;) { struct bio *bio; unsigned int j; - bio = btrfs_bio_alloc(num_pages - i); - bio_set_dev(bio, block_ctx->dev->bdev); + bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, + REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = dev_bytenr >> 9; - bio->bi_opf = REQ_OP_READ; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -2033,7 +2030,7 @@ continue_loop: static void btrfsic_bio_end_io(struct bio *bp) { - struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; + struct btrfsic_block *block = bp->bi_private; int iodone_w_error; /* mutex is not held! 
This is not save if IO is not yet completed @@ -2635,100 +2632,93 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) &btrfsic_dev_state_hashtable); } -static void __btrfsic_submit_bio(struct bio *bio) +static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) { - struct btrfsic_dev_state *dev_state; + unsigned int segs = bio_segments(bio); + u64 dev_bytenr = 512 * bio->bi_iter.bi_sector; + u64 cur_bytenr = dev_bytenr; + struct bvec_iter iter; + struct bio_vec bvec; + char **mapped_datav; + int bio_is_patched = 0; + int i = 0; + + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + pr_info( +"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + bio_op(bio), bio->bi_opf, segs, + bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - if (!btrfsic_is_initialized) + mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); + if (!mapped_datav) return; - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bio() is also called before - * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); - if (NULL != dev_state && - (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { - int i = 0; - u64 dev_bytenr; - u64 cur_bytenr; - struct bio_vec bvec; - struct bvec_iter iter; - int bio_is_patched; - char **mapped_datav; - unsigned int segs = bio_segments(bio); - - dev_bytenr = 512 * bio->bi_iter.bi_sector; - bio_is_patched = 0; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, segs, - bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - - mapped_datav = kmalloc_array(segs, - sizeof(*mapped_datav), GFP_NOFS); - if (!mapped_datav) - goto leave; - cur_bytenr = dev_bytenr; - - bio_for_each_segment(bvec, bio, iter) { - BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = page_address(bvec.bv_page); - i++; - - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) - pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec.bv_len, bvec.bv_offset); - cur_bytenr += bvec.bv_len; - } - btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_datav, segs, - bio, &bio_is_patched, - bio->bi_opf); - kfree(mapped_datav); - } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - pr_info( -"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->bdev); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_len != PAGE_SIZE); + mapped_datav[i] = page_address(bvec.bv_page); + i++; - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = bio->bi_opf; - block->orig_bio_private = bio->bi_private; - block->orig_bio_end_io = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - } + if (dev_state->state->print_mask & + 
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) + pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", + i, cur_bytenr, bvec.bv_len, bvec.bv_offset); + cur_bytenr += bvec.bv_len; } -leave: - mutex_unlock(&btrfsic_mutex); + + btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, + bio, &bio_is_patched, bio->bi_opf); + kfree(mapped_datav); } -void btrfsic_submit_bio(struct bio *bio) +static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) { - __btrfsic_submit_bio(bio); - submit_bio(bio); + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_bdev); + + if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = bio->bi_opf; + block->orig_bio_private = bio->bi_private; + block->orig_bio_end_io = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + } else if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) { + pr_info( +"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", + dev_state->bdev); + } } -int btrfsic_submit_bio_wait(struct bio *bio) +void btrfsic_check_bio(struct bio *bio) { - __btrfsic_submit_bio(bio); - return submit_bio_wait(bio); + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return; + + /* + * We can be called before btrfsic_mount, so there might not be a + * dev_state. + */ + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); + mutex_lock(&btrfsic_mutex); + if (dev_state) { + if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio)) + btrfsic_check_write_bio(bio, dev_state); + else if (bio->bi_opf & REQ_PREFLUSH) + btrfsic_check_flush_bio(bio, dev_state); + } + mutex_unlock(&btrfsic_mutex); } int btrfsic_mount(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index bcc730a06cb5..e4c8aed7996f 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -7,11 +7,9 @@ #define BTRFS_CHECK_INTEGRITY_H #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -void btrfsic_submit_bio(struct bio *bio); -int btrfsic_submit_bio_wait(struct bio *bio); +void btrfsic_check_bio(struct bio *bio); #else -#define btrfsic_submit_bio submit_bio -#define btrfsic_submit_bio_wait submit_bio_wait +static inline void btrfsic_check_bio(struct bio *bio) { } #endif int btrfsic_mount(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 19bf36d8ffea..f4564f32f6d9 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -425,7 +425,6 @@ out: } static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, - struct compressed_bio *cb, struct bio *bio, int mirror_num) { blk_status_t ret; @@ -604,7 +603,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, goto finish_cb; } - ret = submit_compressed_bio(fs_info, cb, bio, 0); + ret = submit_compressed_bio(fs_info, bio, 0); if (ret) goto finish_cb; bio = NULL; @@ -802,15 +801,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ 
-blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) +void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned int compressed_len; - unsigned int nr_pages; - unsigned int pg_index; struct bio *comp_bio = NULL; const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_byte = disk_bytenr; @@ -820,7 +817,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_start; struct extent_map *em; blk_status_t ret; - int faili = 0; + int ret2; + int i; u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; @@ -855,32 +853,26 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, em_len = em->len; em_start = em->start; - free_extent_map(em); - em = NULL; - cb->len = bio->bi_iter.bi_size; cb->compressed_len = compressed_len; - cb->compress_type = extent_compress_type(bio_flags); + cb->compress_type = em->compress_type; cb->orig_bio = bio; - nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), - GFP_NOFS); + free_extent_map(em); + em = NULL; + + cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); if (!cb->compressed_pages) { ret = BLK_STS_RESOURCE; - goto fail1; + goto fail; } - for (pg_index = 0; pg_index < nr_pages; pg_index++) { - cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS); - if (!cb->compressed_pages[pg_index]) { - faili = pg_index - 1; - ret = BLK_STS_RESOURCE; - goto fail2; - } + ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); + if (ret2) { + ret = BLK_STS_RESOURCE; + goto fail; } - faili = nr_pages - 1; - cb->nr_pages = nr_pages; add_ra_bio_pages(inode, em_start + em_len, cb); @@ -949,28 +941,29 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, fs_info->sectorsize); sums += fs_info->csum_size * nr_sectors; - ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num); + ret = submit_compressed_bio(fs_info, comp_bio, mirror_num); if (ret) goto finish_cb; comp_bio = NULL; } } - return BLK_STS_OK; + return; -fail2: - while (faili >= 0) { - __free_page(cb->compressed_pages[faili]); - faili--; +fail: + if (cb->compressed_pages) { + for (i = 0; i < cb->nr_pages; i++) { + if (cb->compressed_pages[i]) + __free_page(cb->compressed_pages[i]); + } } kfree(cb->compressed_pages); -fail1: kfree(cb); out: free_extent_map(em); bio->bi_status = ret; bio_endio(bio); - return ret; + return; finish_cb: if (comp_bio) { comp_bio->bi_status = ret; @@ -978,7 +971,7 @@ finish_cb: } /* All bytes of @cb is submitted, endio will free @cb */ if (cur_disk_byte == disk_bytenr + compressed_len) - return ret; + return; wait_var_event(cb, refcount_read(&cb->pending_sectors) == (disk_bytenr + compressed_len - cur_disk_byte) >> @@ -990,7 +983,6 @@ finish_cb: ASSERT(refcount_read(&cb->pending_sectors)); /* Now we are the only one referring @cb, can finish it safely. 
*/ finish_compressed_bio_read(cb); - return ret; } /* diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index ac5b20731d2a..2707404389a5 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -102,8 +102,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int write_flags, struct cgroup_subsys_state *blkcg_css, bool writeback); -blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); +void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0eecf98d0abb..6e556031a8f3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -16,6 +16,7 @@ #include "volumes.h" #include "qgroup.h" #include "tree-mod-log.h" +#include "tree-checker.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -342,7 +343,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, int level = btrfs_header_level(buf); ret = btrfs_set_disk_extent_flags(trans, buf, - new_flags, level, 0); + new_flags, level); if (ret) return ret; } @@ -1390,12 +1391,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } /* - * helper function for btrfs_search_slot. The goal is to find a block - * in cache without setting the path to blocking. If we find the block - * we return zero and the path is unchanged. + * Helper function for btrfs_search_slot() and other functions that do a search + * on a btree. The goal is to find a tree block in the cache (the radix tree at + * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read + * its pages from disk. * - * If we can't find the block, we set the path blocking and do some - * reada. -EAGAIN is returned and the search must be repeated. + * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the + * whole btree search, starting again from the current root node. */ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, @@ -1409,12 +1411,21 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, struct btrfs_key first_key; int ret; int parent_level; + bool unlock_up; + unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); + /* + * If we need to read an extent buffer from disk and we are holding locks + * on upper level nodes, we unlock all the upper nodes before reading the + * extent buffer, and then return -EAGAIN to the caller as it needs to + * restart the search. We don't release the lock on the current level + * because we need to walk this node to figure out which blocks to read. 
+ */ tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) @@ -1436,30 +1447,38 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, return 0; } + if (unlock_up) + btrfs_unlock_up_safe(p, level + 1); + /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); + ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key); if (ret) { free_extent_buffer(tmp); btrfs_release_path(p); return -EIO; } - *eb_ret = tmp; - return 0; + if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) { + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EUCLEAN; + } + + if (unlock_up) + ret = -EAGAIN; + + goto out; } - /* - * reduce lock contention at high levels - * of the btree by dropping locks before - * we read. Don't release the lock on the current - * level because we need to walk this node to figure - * out which blocks to read. - */ - btrfs_unlock_up_safe(p, level + 1); + if (unlock_up) { + btrfs_unlock_up_safe(p, level + 1); + ret = -EAGAIN; + } else { + ret = 0; + } if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - ret = -EAGAIN; tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, gen, parent_level - 1, &first_key); if (IS_ERR(tmp)) { @@ -1474,9 +1493,15 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, */ if (!extent_buffer_uptodate(tmp)) ret = -EIO; - free_extent_buffer(tmp); - btrfs_release_path(p); +out: + if (ret == 0) { + *eb_ret = tmp; + } else { + free_extent_buffer(tmp); + btrfs_release_path(p); + } + return ret; } @@ -2279,6 +2304,43 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, return ret; } +/** + * Search for a valid slot for the given path. + * + * @root: The root node of the tree. + * @key: Will contain a valid item if found. + * @path: The starting point to validate the slot. + * + * Return: 0 if the item is valid + * 1 if not found + * <0 if error. + */ +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) +{ + while (1) { + int ret; + const int slot = path->slots[0]; + const struct extent_buffer *leaf = path->nodes[0]; + + /* This is where we start walking the path. */ + if (slot >= btrfs_header_nritems(leaf)) { + /* + * If we've reached the last slot in this leaf we need + * to go to the next leaf and reset the path. + */ + ret = btrfs_next_leaf(root, path); + if (ret) + return ret; + continue; + } + /* Store the found, valid item in @key. */ + btrfs_item_key_to_cpu(leaf, key, slot); + break; + } + return 0; +} + /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. 
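The btrfs_get_next_valid_item() helper added above is the building block for the btrfs_for_each_slot() macro introduced in ctree.h further down in this diff. As a rough illustration only (the walk_all_items_sketch() function and its body are hypothetical, not part of the patch), a caller that wants to visit every item of a root would combine it with btrfs_search_slot() like this:

static int walk_all_items_sketch(struct btrfs_root *root)
{
	struct btrfs_key key = { 0 };	/* start from the smallest possible key */
	struct btrfs_key found_key;
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	while (ret >= 0) {
		ret = btrfs_get_next_valid_item(root, &found_key, path);
		if (ret)	/* 1: no more leaves, < 0: error */
			break;

		/* ... inspect the item at path->slots[0], keyed by found_key ... */

		path->slots[0]++;
	}

	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

This is essentially what btrfs_for_each_slot(root, &key, &found_key, path, ret) expands to; the conversion of find_first_block_group() earlier in this diff shows the macro form in use.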
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 077c95e9baa5..0e49b1a0c071 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -675,13 +675,13 @@ struct btrfs_fs_info { rwlock_t global_root_lock; struct rb_root global_root_tree; - spinlock_t fs_roots_radix_lock; - struct radix_tree_root fs_roots_radix; + /* The xarray that holds all the FS roots */ + spinlock_t fs_roots_lock; + struct xarray fs_roots; /* block group cache stuff */ - spinlock_t block_group_cache_lock; - u64 first_logical_byte; - struct rb_root block_group_cache_tree; + rwlock_t block_group_cache_lock; + struct rb_root_cached block_group_cache_tree; /* keep track of unallocated space */ atomic64_t free_chunk_space; @@ -848,12 +848,13 @@ struct btrfs_fs_info { * two */ struct btrfs_workqueue *workers; + struct btrfs_workqueue *hipri_workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; struct btrfs_workqueue *endio_workers; struct btrfs_workqueue *endio_meta_workers; struct btrfs_workqueue *endio_raid56_workers; - struct btrfs_workqueue *rmw_workers; + struct workqueue_struct *rmw_workers; struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; @@ -946,9 +947,9 @@ struct btrfs_fs_info { * running. */ refcount_t scrub_workers_refcnt; - struct btrfs_workqueue *scrub_workers; - struct btrfs_workqueue *scrub_wr_completion_workers; - struct btrfs_workqueue *scrub_parity_workers; + struct workqueue_struct *scrub_workers; + struct workqueue_struct *scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity_workers; struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -994,10 +995,10 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer radix tree */ + /* Extent buffer xarray */ spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; + struct xarray extent_buffers; /* next backup root to be overwritten */ int backup_root_index; @@ -1045,10 +1046,7 @@ struct btrfs_fs_info { * Zone size > 0 when in ZONED mode, otherwise it's used for a check * if the mode is enabled */ - union { - u64 zone_size; - u64 zoned; - }; + u64 zone_size; struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; @@ -1121,7 +1119,8 @@ enum { */ BTRFS_ROOT_SHAREABLE, BTRFS_ROOT_TRACK_DIRTY, - BTRFS_ROOT_IN_RADIX, + /* The root is tracked in fs_info::fs_roots */ + BTRFS_ROOT_REGISTERED, BTRFS_ROOT_ORPHAN_ITEM_INSERTED, BTRFS_ROOT_DEFRAG_RUNNING, BTRFS_ROOT_FORCE_COW, @@ -1225,10 +1224,10 @@ struct btrfs_root { struct rb_root inode_tree; /* - * radix tree that keeps track of delayed nodes of every inode, - * protected by inode_lock + * Xarray that keeps track of delayed nodes of every inode, protected + * by inode_lock */ - struct radix_tree_root delayed_nodes_tree; + struct xarray delayed_nodes; /* * right now this just gets used so that a root has its own devid * for stat. 
It may be used for more later @@ -2784,7 +2783,8 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict); + u64 objectid, u64 offset, u64 bytenr, bool strict, + struct btrfs_path *path); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -2811,8 +2811,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, - int level, int is_data); + struct extent_buffer *eb, u64 flags, int level); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, @@ -2892,7 +2891,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, - u64 disk_num_bytes); + u64 disk_num_bytes, bool noflush); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); @@ -3039,6 +3038,35 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path); +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path); + +/* + * Search in @root for a given @key, and store the slot found in @found_key. + * + * @root: The root node of the tree. + * @key: The key we are looking for. + * @found_key: Will hold the found item. + * @path: Holds the current slot/leaf. + * @iter_ret: Contains the value returned from btrfs_search_slot or + * btrfs_get_next_valid_item, whichever was executed last. + * + * The @iter_ret is an output variable that will contain the return value of + * btrfs_search_slot, if it encountered an error, or the value returned from + * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid + * slot was found, 1 if there were no more leaves, and <0 if there was an error. + * + * It's recommended to use a separate variable for iter_ret and then use it to + * set the function return value so there's no confusion of the 0/1/errno + * values stemming from btrfs_search_slot. 
+ */ +#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \ + for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \ + (iter_ret) >= 0 && \ + (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \ + (path)->slots[0]++ \ + ) + static inline int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *p, u64 time_seq) { @@ -3190,7 +3218,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); /* file-item.c */ -struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); @@ -3224,8 +3251,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ -blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); +void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); @@ -3255,10 +3282,28 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); -int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - struct btrfs_root *parent_root, - struct user_namespace *mnt_userns); +struct btrfs_new_inode_args { + /* Input */ + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool orphan; + bool subvol; + + /* + * Output from btrfs_new_inode_prepare(), input to + * btrfs_create_new_inode(). 
+ */ + struct posix_acl *default_acl; + struct posix_acl *acl; +}; +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items); +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args); +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, unsigned *bits); void btrfs_clear_delalloc_extent(struct inode *inode, @@ -3269,7 +3314,6 @@ void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); -int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); @@ -3314,9 +3358,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); + extern const struct dentry_operations btrfs_dentry_operations; -extern const struct iomap_ops btrfs_dio_iomap_ops; -extern const struct iomap_dio_ops btrfs_dio_ops; /* Inode locking type flags, by default the exclusive lock is taken */ #define BTRFS_ILOCK_SHARED (1U << 0) @@ -3328,6 +3372,7 @@ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3403,11 +3448,29 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } -#ifdef CONFIG_PRINTK +#ifdef CONFIG_PRINTK_INDEX + +#define btrfs_printk(fs_info, fmt, args...) \ +do { \ + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ + _btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + __printf(2, 3) __cold -void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#elif defined(CONFIG_PRINTK) + +#define btrfs_printk(fs_info, fmt, args...) \ + _btrfs_printk(fs_info, fmt, ##args) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + #else + #define btrfs_printk(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, fmt, ##args) #endif @@ -3658,12 +3721,25 @@ do { \ __LINE__, (errno)); \ } while (0) +#ifdef CONFIG_PRINTK_INDEX + #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ +do { \ + printk_index_subsys_emit( \ + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ + KERN_CRIT, fmt); \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ } while (0) +#else + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) 
\ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) + +#endif + #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ @@ -3816,15 +3892,16 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir); +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type); #else #define btrfs_get_acl NULL #define btrfs_set_acl NULL -static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) +static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, + int type) { - return 0; + return -EOPNOTSUPP; } #endif @@ -3929,7 +4006,7 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { - return fs_info->zoned != 0; + return fs_info->zone_size > 0; } static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index bd8267c4687d..36ab0859a263 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -289,7 +289,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, } int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, - u64 disk_num_bytes) + u64 disk_num_bytes, bool noflush) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -308,7 +308,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, * If we have a transaction open (can happen if we call truncate_block * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 
*/ - if (btrfs_is_free_space_inode(inode)) { + if (noflush || btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; } else { if (current->journal_info) @@ -333,7 +333,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, */ calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, &meta_reserve, &qgroup_reserve); - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, + noflush); if (ret) return ret; ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); @@ -456,7 +457,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; - ret = btrfs_delalloc_reserve_metadata(inode, len, len); + ret = btrfs_delalloc_reserve_metadata(inode, len, len, false); if (ret < 0) { btrfs_free_reserved_data_space(inode, *reserved, start, len); extent_changeset_free(*reserved); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 748bf6b0d860..66779ab3ed4a 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -78,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( } spin_lock(&root->inode_lock); - node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + node = xa_load(&root->delayed_nodes, ino); if (node) { if (btrfs_inode->delayed_node) { @@ -90,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( /* * It's possible that we're racing into the middle of removing - * this node from the radix tree. In this case, the refcount + * this node from the xarray. In this case, the refcount * was zero and it should never go back to one. Just return - * NULL like it was never in the radix at all; our release + * NULL like it was never in the xarray at all; our release * function is in the process of removing it. * * Some implementations of refcount_inc refuse to bump the @@ -100,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * here, refcount_inc() may decide to just WARN_ONCE() instead * of actually bumping the refcount. * - * If this node is properly in the radix, we want to bump the + * If this node is properly in the xarray, we want to bump the * refcount twice, once for the inode and once for this get * operation. 
*/ @@ -128,36 +128,30 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( u64 ino = btrfs_ino(btrfs_inode); int ret; -again: - node = btrfs_get_delayed_node(btrfs_inode); - if (node) - return node; + do { + node = btrfs_get_delayed_node(btrfs_inode); + if (node) + return node; - node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); - if (!node) - return ERR_PTR(-ENOMEM); - btrfs_init_delayed_node(node, root, ino); + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); - /* cached in the btrfs inode and can be accessed */ - refcount_set(&node->refs, 2); + /* Cached in the inode and can be accessed */ + refcount_set(&node->refs, 2); - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - kmem_cache_free(delayed_node_cache, node); - return ERR_PTR(ret); - } - - spin_lock(&root->inode_lock); - ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); - if (ret == -EEXIST) { - spin_unlock(&root->inode_lock); - kmem_cache_free(delayed_node_cache, node); - radix_tree_preload_end(); - goto again; - } + spin_lock(&root->inode_lock); + ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS); + if (ret) { + spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); + if (ret != -EBUSY) + return ERR_PTR(ret); + } + } while (ret); btrfs_inode->delayed_node = node; spin_unlock(&root->inode_lock); - radix_tree_preload_end(); return node; } @@ -276,8 +270,7 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. */ ASSERT(refcount_read(&delayed_node->refs) == 0); - radix_tree_delete(&root->delayed_nodes_tree, - delayed_node->inode_id); + xa_erase(&root->delayed_nodes, delayed_node->inode_id); spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, delayed_node); } @@ -1870,34 +1863,35 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { - u64 inode_id = 0; + unsigned long index = 0; + struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_node *delayed_nodes[8]; - int i, n; while (1) { + int n = 0; + spin_lock(&root->inode_lock); - n = radix_tree_gang_lookup(&root->delayed_nodes_tree, - (void **)delayed_nodes, inode_id, - ARRAY_SIZE(delayed_nodes)); - if (!n) { + if (xa_empty(&root->delayed_nodes)) { spin_unlock(&root->inode_lock); - break; + return; } - inode_id = delayed_nodes[n - 1]->inode_id + 1; - for (i = 0; i < n; i++) { + xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) { /* * Don't increase refs in case the node is dead and * about to be removed from the tree in the loop below */ - if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) - delayed_nodes[i] = NULL; + if (refcount_inc_not_zero(&delayed_node->refs)) { + delayed_nodes[n] = delayed_node; + n++; + } + if (n >= ARRAY_SIZE(delayed_nodes)) + break; } + index++; spin_unlock(&root->inode_lock); - for (i = 0; i < n; i++) { - if (!delayed_nodes[i]) - continue; + for (int i = 0; i < n; i++) { __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 4176df149d04..99f37fca2e96 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -930,7 +930,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); - 
BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; @@ -1103,8 +1102,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return -ENOMEM; init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, - false); + BTRFS_UPDATE_DELAYED_HEAD, false, false); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 91a3aabad150..d6304b690ec4 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -58,7 +58,6 @@ struct btrfs_delayed_extent_op { u8 level; bool update_key; bool update_flags; - bool is_data; u64 flags_to_set; }; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f26202621989..a7dd6ba25e99 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -474,6 +474,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_block_group *cache; struct btrfs_trans_handle *trans; + int iter_ret = 0; int ret = 0; u64 chunk_offset; @@ -524,29 +525,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto free_path; - if (ret > 0) { - if (path->slots[0] >= - btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto free_path; - if (ret > 0) { - ret = 0; - goto free_path; - } - } else { - ret = 0; - } - } - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; - - btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != src_dev->devid) break; @@ -557,30 +537,23 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, if (found_key.offset < key.offset) break; - dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); cache = btrfs_lookup_block_group(fs_info, chunk_offset); if (!cache) - goto skip; + continue; spin_lock(&cache->lock); cache->to_copy = 1; spin_unlock(&cache->lock); btrfs_put_block_group(cache); - -skip: - ret = btrfs_next_item(root, path); - if (ret != 0) { - if (ret > 0) - ret = 0; - break; - } } + if (iter_ret < 0) + ret = iter_ret; -free_path: btrfs_free_path(path); unlock: mutex_unlock(&fs_info->chunk_mutex); @@ -881,6 +854,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *tgt_device; struct btrfs_device *src_device; struct btrfs_root *root = fs_info->tree_root; @@ -930,12 +904,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, WARN_ON(ret); /* Prevent write_all_supers() during the finishing procedure */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); /* Prevent new chunks being allocated on the source device */ mutex_lock(&fs_info->chunk_mutex); if (!list_empty(&src_device->post_commit_list)) { - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); 
mutex_unlock(&fs_info->chunk_mutex); } else { break; @@ -972,7 +946,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(tgt_device); @@ -1001,8 +975,8 @@ error: btrfs_assign_next_active_device(src_device, tgt_device); - list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); - fs_info->fs_devices->rw_devices++; + list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); + fs_devices->rw_devices++; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); @@ -1025,7 +999,7 @@ error: * belong to this filesystem. */ mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); /* replace the sysfs entry */ btrfs_sysfs_remove_device(src_device); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 3b532bab0755..72fb2c518a2b 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -325,36 +325,15 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const char *name, int name_len) { - struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - u32 nritems; int ret; key.objectid = dirid; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ERR_PTR(ret); - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - - while (1) { - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_for_each_slot(root, &key, &key, path, ret) { if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; @@ -362,10 +341,12 @@ btrfs_search_dir_index_item(struct btrfs_root *root, name, name_len); if (di) return di; - - path->slots[0]++; } - return NULL; + /* Adjust return code if the key was not found in the next leaf. 
*/ + if (ret > 0) + ret = 0; + + return ERR_PTR(ret); } struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 31c3f592e587..14f8a90df321 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -5,7 +5,6 @@ #include <linux/fs.h> #include <linux/blkdev.h> -#include <linux/radix-tree.h> #include <linux/writeback.h> #include <linux/workqueue.h> #include <linux/kthread.h> @@ -374,9 +373,9 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, * @level: expected level, mandatory check * @first_key: expected key of first slot, skip check if NULL */ -static int btree_read_extent_buffer_pages(struct extent_buffer *eb, - u64 parent_transid, int level, - struct btrfs_key *first_key) +int btrfs_read_extent_buffer(struct extent_buffer *eb, + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; @@ -486,7 +485,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, fs_info->nodesize); - /* A dirty eb shouldn't disappear from buffer_radix */ + /* A dirty eb shouldn't disappear from extent_buffers */ if (WARN_ON(!eb)) return -EUCLEAN; @@ -519,7 +518,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec u64 found_start; struct extent_buffer *eb; - if (fs_info->sectorsize < PAGE_SIZE) + if (fs_info->nodesize < PAGE_SIZE) return csum_dirty_subpage_buffers(fs_info, bvec); eb = (struct extent_buffer *)page->private; @@ -704,7 +703,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ASSERT(page->private); - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return validate_subpage_buffer(page, start, end, mirror); eb = (struct extent_buffer *)page->private; @@ -850,8 +849,7 @@ static void run_one_async_free(struct btrfs_work *work) } blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 dio_file_offset, + int mirror_num, u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; @@ -874,9 +872,9 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, async->status = 0; if (op_is_sync(bio->bi_opf)) - btrfs_set_work_high_priority(&async->work); - - btrfs_queue_work(fs_info->workers, &async->work); + btrfs_queue_work(fs_info->hipri_workers, &async->work); + else + btrfs_queue_work(fs_info->workers, &async->work); return 0; } @@ -920,8 +918,7 @@ static bool should_async_write(struct btrfs_fs_info *fs_info, return true; } -blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) +void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); blk_status_t ret; @@ -933,31 +930,25 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, */ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_METADATA); - if (ret) - goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (!ret) + ret = btrfs_map_bio(fs_info, bio, mirror_num); } else if (!should_async_write(fs_info, BTRFS_I(inode))) { ret = btree_csum_one_bio(bio); - if (ret) - goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (!ret) + ret 
= btrfs_map_bio(fs_info, bio, mirror_num); } else { /* * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - 0, btree_submit_bio_start); + btree_submit_bio_start); } - if (ret) - goto out_w_error; - return 0; - -out_w_error: - bio->bi_status = ret; - bio_endio(bio); - return ret; + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + } } #ifdef CONFIG_MIGRATION @@ -1118,12 +1109,15 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, if (IS_ERR(buf)) return buf; - ret = btree_read_extent_buffer_pages(buf, parent_transid, - level, first_key); + ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } + if (btrfs_check_eb_owner(buf, owner_root)) { + free_extent_buffer_stale(buf); + return ERR_PTR(-EUCLEAN); + } return buf; } @@ -1164,7 +1158,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; - INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); + xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); btrfs_init_root_block_rsv(root); @@ -1216,9 +1210,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&root->leak_list); - spin_lock(&fs_info->fs_roots_radix_lock); + spin_lock(&fs_info->fs_roots_lock); list_add_tail(&root->leak_list, &fs_info->allocated_roots); - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); #endif } @@ -1563,6 +1557,23 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, ret = -EIO; goto fail; } + + /* + * For real fs, and not log/reloc trees, root owner must + * match its root node owner + */ + if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + root->root_key.objectid != btrfs_header_owner(root->node)) { + btrfs_crit(fs_info, +"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", + root->root_key.objectid, root->node->start, + btrfs_header_owner(root->node), + root->root_key.objectid); + ret = -EUCLEAN; + goto fail; + } root->commit_root = btrfs_root_node(root); return root; fail: @@ -1648,12 +1659,11 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, { struct btrfs_root *root; - spin_lock(&fs_info->fs_roots_radix_lock); - root = radix_tree_lookup(&fs_info->fs_roots_radix, - (unsigned long)root_id); + spin_lock(&fs_info->fs_roots_lock); + root = xa_load(&fs_info->fs_roots, (unsigned long)root_id); if (root) root = btrfs_grab_root(root); - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); return root; } @@ -1695,20 +1705,14 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, { int ret; - ret = radix_tree_preload(GFP_NOFS); - if (ret) - return ret; - - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - root); + spin_lock(&fs_info->fs_roots_lock); + ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid, + root, GFP_NOFS); if (ret == 0) { btrfs_grab_root(root); - set_bit(BTRFS_ROOT_IN_RADIX, &root->state); + 
set_bit(BTRFS_ROOT_REGISTERED, &root->state); } - spin_unlock(&fs_info->fs_roots_radix_lock); - radix_tree_preload_end(); + spin_unlock(&fs_info->fs_roots_lock); return ret; } @@ -1964,7 +1968,7 @@ static void end_workqueue_fn(struct btrfs_work *work) static int cleaner_kthread(void *arg) { - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg; + struct btrfs_fs_info *fs_info = arg; int again; while (1) { @@ -2266,10 +2270,12 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); + btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); btrfs_destroy_workqueue(fs_info->endio_workers); btrfs_destroy_workqueue(fs_info->endio_raid56_workers); - btrfs_destroy_workqueue(fs_info->rmw_workers); + if (fs_info->rmw_workers) + destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); @@ -2336,9 +2342,9 @@ void btrfs_put_root(struct btrfs_root *root) btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG - spin_lock(&root->fs_info->fs_roots_radix_lock); + spin_lock(&root->fs_info->fs_roots_lock); list_del_init(&root->leak_list); - spin_unlock(&root->fs_info->fs_roots_radix_lock); + spin_unlock(&root->fs_info->fs_roots_lock); #endif kfree(root); } @@ -2346,28 +2352,21 @@ void btrfs_put_root(struct btrfs_root *root) void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { - int ret; - struct btrfs_root *gang[8]; - int i; + struct btrfs_root *root; + unsigned long index = 0; while (!list_empty(&fs_info->dead_roots)) { - gang[0] = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); - list_del(&gang[0]->root_list); + root = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&root->root_list); - if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) - btrfs_drop_and_free_fs_root(fs_info, gang[0]); - btrfs_put_root(gang[0]); + if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) + btrfs_drop_and_free_fs_root(fs_info, root); + btrfs_put_root(root); } - while (1) { - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, 0, - ARRAY_SIZE(gang)); - if (!ret) - break; - for (i = 0; i < ret; i++) - btrfs_drop_and_free_fs_root(fs_info, gang[i]); + xa_for_each(&fs_info->fs_roots, index, root) { + btrfs_drop_and_free_fs_root(fs_info, root); } } @@ -2444,7 +2443,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = - btrfs_alloc_workqueue(fs_info, "worker", + btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); + fs_info->hipri_workers = + btrfs_alloc_workqueue(fs_info, "worker-high", flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = @@ -2476,8 +2477,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->endio_raid56_workers = btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, max_active, 4); - fs_info->rmw_workers = - btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); + fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); @@ -2492,8 +2492,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) 
fs_info->discard_ctl.discard_workers = alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); - if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->flush_workers && + if (!(fs_info->workers && fs_info->hipri_workers && + fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && @@ -2815,12 +2815,14 @@ static int validate_super(struct btrfs_fs_info *fs_info, } /* - * For 4K page size, we only support 4K sector size. - * For 64K page size, we support 64K and 4K sector sizes. + * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * + * We can support 16K sectorsize with 64K page size without problem, + * but such sectorsize/pagesize combination doesn't make much sense. + * 4K will be our future standard, PAGE_SIZE is supported from the very + * beginning. */ - if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || - (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && - sectorsize != SZ_64K))) { + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -3132,8 +3134,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC); + xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -3141,7 +3143,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->fs_roots_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); @@ -3209,9 +3211,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); - spin_lock_init(&fs_info->block_group_cache_lock); - fs_info->block_group_cache_tree = RB_ROOT; - fs_info->first_logical_byte = (u64)-1; + rwlock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree = RB_ROOT_CACHED; extent_io_tree_init(fs_info, &fs_info->excluded_extents, IO_TREE_FS_EXCLUDED_EXTENTS, NULL); @@ -3295,7 +3296,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block static int btrfs_uuid_rescan_kthread(void *data) { - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + struct btrfs_fs_info *fs_info = data; int ret; /* @@ -3373,7 +3374,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) /* * btrfs_find_orphan_roots() is responsible for finding all the dead * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load - * them into the fs_info->fs_roots_radix tree. This must be done before + * them into the fs_info->fs_roots. This must be done before * calling btrfs_orphan_cleanup() on the tree root. 
If we don't do it * first, then btrfs_orphan_cleanup() will delete a dead root's orphan * item before the root's tree is deleted - this means that if we unmount @@ -3611,7 +3612,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ~BTRFS_FEATURE_INCOMPAT_SUPP; if (features) { btrfs_err(fs_info, - "cannot mount because of unsupported optional features (%llx)", + "cannot mount because of unsupported optional features (0x%llx)", features); err = -EINVAL; goto fail_alloc; @@ -3649,7 +3650,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!sb_rdonly(sb) && features) { btrfs_err(fs_info, - "cannot mount read-write because of unsupported optional features (%llx)", + "cannot mount read-write because of unsupported optional features (0x%llx)", features); err = -EINVAL; goto fail_alloc; @@ -3672,14 +3673,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - if (btrfs_super_incompat_flags(fs_info->super_copy) & - BTRFS_FEATURE_INCOMPAT_RAID56) { - btrfs_err(fs_info, - "RAID56 is not yet supported for sector size %u with page size %lu", - sectorsize, PAGE_SIZE); - err = -EINVAL; - goto fail_alloc; - } subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); if (!subpage_info) goto fail_alloc; @@ -4157,7 +4150,8 @@ static int write_dev_supers(struct btrfs_device *device, if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); if (btrfs_advance_sb_log(device, i)) errors++; @@ -4238,6 +4232,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) */ static void btrfs_end_empty_barrier(struct bio *bio) { + bio_uninit(bio); complete(bio->bi_private); } @@ -4247,7 +4242,7 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static void write_dev_flush(struct btrfs_device *device) { - struct bio *bio = device->flush_bio; + struct bio *bio = &device->flush_bio; #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* @@ -4260,17 +4255,18 @@ static void write_dev_flush(struct btrfs_device *device) * of simplicity, since this is a debug tool and not meant for use in * non-debug builds. 
*/ - struct request_queue *q = bdev_get_queue(device->bdev); - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (!bdev_write_cache(device->bdev)) return; #endif - bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); + bio_init(bio, device->bdev, NULL, 0, + REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } @@ -4279,7 +4275,7 @@ static void write_dev_flush(struct btrfs_device *device) */ static blk_status_t wait_dev_flush(struct btrfs_device *device) { - struct bio *bio = device->flush_bio; + struct bio *bio = &device->flush_bio; if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return BLK_STS_OK; @@ -4503,12 +4499,11 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, { bool drop_ref = false; - spin_lock(&fs_info->fs_roots_radix_lock); - radix_tree_delete(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid); - if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) + spin_lock(&fs_info->fs_roots_lock); + xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state)) drop_ref = true; - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); @@ -4524,50 +4519,48 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { - u64 root_objectid = 0; - struct btrfs_root *gang[8]; - int i = 0; + struct btrfs_root *roots[8]; + unsigned long index = 0; + int i; int err = 0; - unsigned int ret = 0; + int grabbed; while (1) { - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, root_objectid, - ARRAY_SIZE(gang)); - if (!ret) { - spin_unlock(&fs_info->fs_roots_radix_lock); - break; + struct btrfs_root *root; + + spin_lock(&fs_info->fs_roots_lock); + if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) { + spin_unlock(&fs_info->fs_roots_lock); + return err; } - root_objectid = gang[ret - 1]->root_key.objectid + 1; - for (i = 0; i < ret; i++) { - /* Avoid to grab roots in dead_roots */ - if (btrfs_root_refs(&gang[i]->root_item) == 0) { - gang[i] = NULL; - continue; - } - /* grab all the search result for later use */ - gang[i] = btrfs_grab_root(gang[i]); + grabbed = 0; + xa_for_each_start(&fs_info->fs_roots, index, root, index) { + /* Avoid grabbing roots in dead_roots */ + if (btrfs_root_refs(&root->root_item) > 0) + roots[grabbed++] = btrfs_grab_root(root); + if (grabbed >= ARRAY_SIZE(roots)) + break; } - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); - for (i = 0; i < ret; i++) { - if (!gang[i]) + for (i = 0; i < grabbed; i++) { + if (!roots[i]) continue; - root_objectid = gang[i]->root_key.objectid; - err = btrfs_orphan_cleanup(gang[i]); + index = roots[i]->root_key.objectid; + err = btrfs_orphan_cleanup(roots[i]); if (err) - break; - btrfs_put_root(gang[i]); + goto out; + btrfs_put_root(roots[i]); } - root_objectid++; + index++; } - /* release the uncleaned roots due to error */ - for (; i < ret; i++) { - if (gang[i]) - btrfs_put_root(gang[i]); +out: + /* Release the roots that remain uncleaned due to error */ + for (; i < grabbed; i++) { + if (roots[i]) + 
btrfs_put_root(roots[i]); } return err; } @@ -4862,13 +4855,6 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) __btrfs_btree_balance_dirty(fs_info, 0); } -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, - struct btrfs_key *first_key) -{ - return btree_read_extent_buffer_pages(buf, parent_transid, - level, first_key); -} - static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { /* cleanup FS via transaction */ @@ -4884,31 +4870,28 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) { - struct btrfs_root *gang[8]; - u64 root_objectid = 0; - int ret; - - spin_lock(&fs_info->fs_roots_radix_lock); - while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, root_objectid, - ARRAY_SIZE(gang))) != 0) { - int i; + unsigned long index = 0; + int grabbed = 0; + struct btrfs_root *roots[8]; - for (i = 0; i < ret; i++) - gang[i] = btrfs_grab_root(gang[i]); - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_lock(&fs_info->fs_roots_lock); + while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index, + ULONG_MAX, 8, XA_PRESENT))) { + for (int i = 0; i < grabbed; i++) + roots[i] = btrfs_grab_root(roots[i]); + spin_unlock(&fs_info->fs_roots_lock); - for (i = 0; i < ret; i++) { - if (!gang[i]) + for (int i = 0; i < grabbed; i++) { + if (!roots[i]) continue; - root_objectid = gang[i]->root_key.objectid; - btrfs_free_log(NULL, gang[i]); - btrfs_put_root(gang[i]); + index = roots[i]->root_key.objectid; + btrfs_free_log(NULL, roots[i]); + btrfs_put_root(roots[i]); } - root_objectid++; - spin_lock(&fs_info->fs_roots_radix_lock); + index++; + spin_lock(&fs_info->fs_roots_lock); } - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); btrfs_free_log_root_tree(NULL, fs_info); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2e10514ecda8..4ee8c42c9f78 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -87,8 +87,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); +void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -120,13 +119,12 @@ void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, - struct btrfs_key *first_key); +int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, + int level, struct btrfs_key *first_key); blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 dio_file_offset, + int mirror_num, u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6aa92f84f465..0867c5cd6e01 100644 --- 
a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -895,7 +895,13 @@ again: err = -ENOENT; while (1) { if (ptr >= end) { - WARN_ON(ptr > end); + if (ptr > end) { + err = -EUCLEAN; + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(fs_info, +"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", + path->slots[0], root_objectid, owner, offset, parent); + } break; } iref = (struct btrfs_extent_inline_ref *)ptr; @@ -1239,7 +1245,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (size) { ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += size; else if (ret != -EOPNOTSUPP) @@ -1256,7 +1262,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (bytes_left) { ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += bytes_left; } @@ -1291,7 +1297,7 @@ static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, &discarded); discarded += src_disc; - } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) { + } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); } else { ret = 0; @@ -1577,12 +1583,12 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; - int metadata = !extent_op->is_data; + int metadata = 1; if (TRANS_ABORTED(trans)) return 0; - if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) metadata = 0; path = btrfs_alloc_path(); @@ -2180,7 +2186,7 @@ out: int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags, - int level, int is_data) + int level) { struct btrfs_delayed_extent_op *extent_op; int ret; @@ -2192,7 +2198,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_flags = true; extent_op->update_key = false; - extent_op->is_data = is_data ? 
true : false; extent_op->level = level; ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); @@ -2357,15 +2362,10 @@ out: } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr, bool strict) + u64 bytenr, bool strict, struct btrfs_path *path) { - struct btrfs_path *path; int ret; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - do { ret = check_committed_ref(root, path, objectid, offset, bytenr, strict); @@ -2376,7 +2376,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, } while (ret == -EAGAIN); out: - btrfs_free_path(path); + btrfs_release_path(path); if (btrfs_is_data_reloc_root(root)) WARN_ON(ret > 0); return ret; @@ -2497,24 +2497,21 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) return ret; } -static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) +static u64 first_logical_byte(struct btrfs_fs_info *fs_info) { - struct btrfs_block_group *cache; - u64 bytenr; - - spin_lock(&fs_info->block_group_cache_lock); - bytenr = fs_info->first_logical_byte; - spin_unlock(&fs_info->block_group_cache_lock); - - if (bytenr < (u64)-1) - return bytenr; + struct rb_node *leftmost; + u64 bytenr = 0; - cache = btrfs_lookup_first_block_group(fs_info, search_start); - if (!cache) - return 0; + read_lock(&fs_info->block_group_cache_lock); + /* Get the block group with the lowest logical start address. */ + leftmost = rb_first_cached(&fs_info->block_group_cache_tree); + if (leftmost) { + struct btrfs_block_group *bg; - bytenr = cache->start; - btrfs_put_block_group(cache); + bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); + bytenr = bg->start; + } + read_unlock(&fs_info->block_group_cache_lock); return bytenr; } @@ -3803,8 +3800,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, /* Check RO and no space case before trying to activate it */ spin_lock(&block_group->lock); - if (block_group->ro || - block_group->alloc_offset == block_group->zone_capacity) { + if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. @@ -4272,7 +4268,7 @@ static noinline int find_free_extent(struct btrfs_root *root, return ret; ffe_ctl->search_start = max(ffe_ctl->search_start, - first_logical_byte(fs_info, 0)); + first_logical_byte(fs_info)); ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); if (ffe_ctl->search_start == ffe_ctl->hint_byte) { block_group = btrfs_lookup_block_group(fs_info, @@ -4959,7 +4955,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_key = skinny_metadata ? 
false : true; extent_op->update_flags = true; - extent_op->is_data = false; extent_op->level = level; btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, @@ -5144,7 +5139,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, eb, flag, - btrfs_header_level(eb), 0); + btrfs_header_level(eb)); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -5818,7 +5813,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_qgroup_convert_reserved_meta(root, INT_MAX); btrfs_qgroup_free_meta_all_pertrans(root); - if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) + if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) btrfs_add_dropped_root(trans, root); else btrfs_put_root(root); @@ -5987,7 +5982,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) *trimmed = 0; /* Discard not supported = nothing to do. */ - if (!blk_queue_discard(bdev_get_queue(device->bdev))) + if (!bdev_max_discard_sectors(device->bdev)) return 0; /* Not writable = nothing to do. */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 33c19f51d79b..588c7c606a2c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/page-flags.h> +#include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/swap.h> @@ -28,6 +29,7 @@ #include "subpage.h" #include "zoned.h" #include "block-group.h" +#include "compression.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -75,6 +77,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) if (!fs_info->allocated_ebs.next) return; + WARN_ON(!list_empty(&fs_info->allocated_ebs)); spin_lock_irqsave(&fs_info->eb_leak_lock, flags); while (!list_empty(&fs_info->allocated_ebs)) { eb = list_first_entry(&fs_info->allocated_ebs, @@ -135,6 +138,17 @@ struct tree_entry { struct rb_node rb_node; }; +/* + * Structure to record info about the bio being assembled, and other info like + * how many bytes are there before stripe/ordered extent boundary. + */ +struct btrfs_bio_ctrl { + struct bio *bio; + enum btrfs_compression_type compress_type; + u32 len_to_stripe_boundary; + u32 len_to_oe_boundary; +}; + struct extent_page_data { struct btrfs_bio_ctrl bio_ctrl; /* tells writepage not to lock the state bits for this range @@ -164,24 +178,27 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, return ret; } -int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) +static void submit_one_bio(struct bio *bio, int mirror_num, + enum btrfs_compression_type compress_type) { - blk_status_t ret = 0; struct extent_io_tree *tree = bio->bi_private; bio->bi_private = NULL; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); + if (is_data_inode(tree->private_data)) - ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, - bio_flags); + btrfs_submit_data_bio(tree->private_data, bio, mirror_num, + compress_type); else - ret = btrfs_submit_metadata_bio(tree->private_data, bio, - mirror_num, bio_flags); - - return blk_status_to_errno(ret); + btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num); + /* + * Above submission hooks will handle the error by ending the bio, + * which will do the cleanup properly. 
So here we should not return + * any error, or the caller of submit_extent_page() will do cleanup + * again, causing problems. + */ } /* Cleanup unsubmitted bios */ @@ -202,13 +219,12 @@ static void end_write_bio(struct extent_page_data *epd, int ret) * Return 0 if everything is OK. * Return <0 for error. */ -static int __must_check flush_write_bio(struct extent_page_data *epd) +static void flush_write_bio(struct extent_page_data *epd) { - int ret = 0; struct bio *bio = epd->bio_ctrl.bio; if (bio) { - ret = submit_one_bio(bio, 0, 0); + submit_one_bio(bio, 0, 0); /* * Clean up of epd->bio is handled by its endio function. * And endio is either triggered by successful bio execution @@ -218,7 +234,6 @@ static int __must_check flush_write_bio(struct extent_page_data *epd) */ epd->bio_ctrl.bio = NULL; } - return ret; } int __init extent_state_cache_init(void) @@ -2303,12 +2318,13 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num) { - struct bio *bio; struct btrfs_device *dev; + struct bio_vec bvec; + struct bio bio; u64 map_length = 0; u64 sector; struct btrfs_io_context *bioc = NULL; - int ret; + int ret = 0; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); @@ -2316,8 +2332,6 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (btrfs_repair_one_zone(fs_info, logical)) return 0; - bio = btrfs_bio_alloc(1); - bio->bi_iter.bi_size = 0; map_length = length; /* @@ -2335,52 +2349,50 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, */ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &map_length, &bioc, 0); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; - } + if (ret) + goto out_counter_dec; ASSERT(bioc->mirror_num == 1); } else { ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, mirror_num); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; - } + if (ret) + goto out_counter_dec; BUG_ON(mirror_num != bioc->mirror_num); } sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - bio->bi_iter.bi_sector = sector; dev = bioc->stripes[bioc->mirror_num - 1].dev; btrfs_put_bioc(bioc); + if (!dev || !dev->bdev || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; + ret = -EIO; + goto out_counter_dec; } - bio_set_dev(bio, dev->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; - bio_add_page(bio, page, length, pg_offset); - if (btrfsic_submit_bio_wait(bio)) { + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = sector; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { /* try to remap that extent elsewhere? 
*/ - btrfs_bio_counter_dec(fs_info); - bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - return -EIO; + goto out_bio_uninit; } btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", ino, start, rcu_str_deref(dev->name), sector); + ret = 0; + +out_bio_uninit: + bio_uninit(&bio); +out_counter_dec: btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return 0; + return ret; } int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) @@ -2527,7 +2539,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode failrec->start = start; failrec->len = sectorsize; failrec->this_mirror = 0; - failrec->bio_flags = 0; + failrec->compress_type = BTRFS_COMPRESS_NONE; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, failrec->len); @@ -2551,8 +2563,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode logical = em->block_start + logical; if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, em->compress_type); + failrec->compress_type = em->compress_type; } btrfs_debug(fs_info, @@ -2684,7 +2695,7 @@ int btrfs_repair_one_sector(struct inode *inode, * will be handled by the endio on the repair_bio, so we can't return an * error here. */ - submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags); + submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->compress_type); return BLK_STS_OK; } @@ -2710,18 +2721,19 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_page_set_error(fs_info, page, start, len); } - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) unlock_page(page); else btrfs_subpage_end_reader(fs_info, page, start, len); } -static blk_status_t submit_read_repair(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - unsigned int error_bitmap, - submit_bio_hook_t *submit_bio_hook) +static blk_status_t submit_data_read_repair(struct inode *inode, + struct bio *failed_bio, + u32 bio_offset, struct page *page, + unsigned int pgoff, + u64 start, u64 end, + int failed_mirror, + unsigned int error_bitmap) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; @@ -2731,6 +2743,9 @@ static blk_status_t submit_read_repair(struct inode *inode, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + /* This repair is only for data */ + ASSERT(is_data_inode(inode)); + /* We're here because we had some read errors or csum mismatch */ ASSERT(error_bitmap); @@ -2759,7 +2774,7 @@ static blk_status_t submit_read_repair(struct inode *inode, ret = btrfs_repair_one_sector(inode, failed_bio, bio_offset + offset, page, pgoff + offset, start + offset, - failed_mirror, submit_bio_hook); + failed_mirror, btrfs_submit_data_bio); if (!ret) { /* * We have submitted the read repair, the page release @@ -2943,7 +2958,7 @@ update: static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) { ASSERT(PageLocked(page)); - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page)); @@ -2951,7 +2966,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) } /* - * Find extent buffer for a givne bytenr. 
+ * Find extent buffer for a given bytenr. * * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking * in endio context. @@ -2965,16 +2980,14 @@ static struct extent_buffer *find_extent_buffer_readpage( * For regular sectorsize, we can use page->private to grab extent * buffer */ - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { ASSERT(PagePrivate(page) && page->private); return (struct extent_buffer *)page->private; } - /* For subpage case, we need to lookup buffer radix tree */ - rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - bytenr >> fs_info->sectorsize_bits); - rcu_read_unlock(); + /* For subpage case, we need to lookup extent buffer xarray */ + eb = xa_load(&fs_info->extent_buffers, + bytenr >> fs_info->sectorsize_bits); ASSERT(eb); return eb; } @@ -3077,13 +3090,13 @@ static void end_bio_extent_readpage(struct bio *bio) goto readpage_ok; /* - * btrfs_submit_read_repair() will handle all the good + * submit_data_read_repair() will handle all the good * and bad sectors, we just continue to the next bvec. */ - submit_read_repair(inode, bio, bio_offset, page, - start - page_offset(page), start, - end, mirror, error_bitmap, - btrfs_submit_data_bio); + submit_data_read_repair(inode, bio, bio_offset, page, + start - page_offset(page), + start, end, mirror, + error_bitmap); ASSERT(bio_offset + len > bio_offset); bio_offset += len; @@ -3132,6 +3145,42 @@ readpage_ok: bio_put(bio); } +/** + * Populate every free slot in a provided array with pages. + * + * @nr_pages: number of pages to allocate + * @page_array: the array to fill with pages; any existing non-null entries in + * the array will be skipped + * + * Return: 0 if all pages were able to be allocated; + * -ENOMEM otherwise, and the caller is responsible for freeing all + * non-null page pointers in the array. + */ +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) +{ + unsigned int allocated; + + for (allocated = 0; allocated < nr_pages;) { + unsigned int last = allocated; + + allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); + + if (allocated == nr_pages) + return 0; + + /* + * During this iteration, no page could be allocated, even + * though alloc_pages_bulk_array() falls back to alloc_page() + * if it could not bulk-allocate. So we must be out of memory. + */ + if (allocated == last) + return -ENOMEM; + + memalloc_retry_wait(GFP_NOFS); + } + return 0; +} + /* * Initialize the members up to but not including 'bio'. 
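A minimal userspace sketch of the retry-until-no-progress pattern the new btrfs_alloc_page_array() helper above relies on: keep calling the bulk allocator, and only give up when a full pass fills nothing extra (treated as out of memory). bulk_alloc() and alloc_array() are invented stand-ins for alloc_pages_bulk_array() and the helper itself, not kernel APIs, and malloc() stands in for page allocation.

#include <stdlib.h>

/* Fill every NULL slot; return how many slots are populated afterwards. */
static unsigned int bulk_alloc(unsigned int nr, void **array)
{
	unsigned int filled = 0;

	for (unsigned int i = 0; i < nr; i++) {
		if (!array[i])
			array[i] = malloc(4096);	/* may fail, slot stays NULL */
		if (array[i])
			filled++;
	}
	return filled;
}

/* 0 on success, -1 if a whole pass made no forward progress. */
static int alloc_array(unsigned int nr, void **array)
{
	unsigned int allocated = 0;

	while (allocated < nr) {
		unsigned int last = allocated;

		allocated = bulk_alloc(nr, array);
		if (allocated == nr)
			return 0;
		/* No progress at all: assume we are out of memory. */
		if (allocated == last)
			return -1;
		/* Partial progress: the kernel helper backs off briefly here. */
	}
	return 0;
}

On failure the caller still owns whatever was allocated, mirroring the kernel helper's contract that non-NULL entries must be freed by the caller.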
Use after allocating a * new bio by bio_alloc_bioset as it does not initialize the bytes outside of @@ -3157,13 +3206,13 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) return bio; } -struct bio *btrfs_bio_clone(struct bio *bio) +struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio) { struct btrfs_bio *bbio; struct bio *new; /* Bio allocation backed by a bioset does not fail */ - new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset); + new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset); bbio = btrfs_bio(new); btrfs_bio_init(bbio); bbio->iter = bio->bi_iter; @@ -3198,7 +3247,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) * a contiguous page to the previous one * @size: portion of page that we want to write * @pg_offset: starting offset in the page - * @bio_flags: flags of the current bio to see if we can merge them + * @compress_type: compression type of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. * @@ -3210,7 +3259,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, unsigned int size, unsigned int pg_offset, - unsigned long bio_flags) + enum btrfs_compression_type compress_type) { struct bio *bio = bio_ctrl->bio; u32 bio_size = bio->bi_iter.bi_size; @@ -3222,10 +3271,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, ASSERT(bio); /* The limit should be calculated when bio_ctrl->bio is allocated */ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); - if (bio_ctrl->bio_flags != bio_flags) + if (bio_ctrl->compress_type != compress_type) return 0; - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; @@ -3268,7 +3317,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, * The split happens for real compressed bio, which happens in * btrfs_submit_compressed_read/write(). */ - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) { + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { bio_ctrl->len_to_oe_boundary = U32_MAX; bio_ctrl->len_to_stripe_boundary = U32_MAX; return 0; @@ -3311,7 +3360,7 @@ static int alloc_new_bio(struct btrfs_inode *inode, unsigned int opf, bio_end_io_t end_io_func, u64 disk_bytenr, u32 offset, u64 file_offset, - unsigned long bio_flags) + enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; @@ -3322,12 +3371,12 @@ static int alloc_new_bio(struct btrfs_inode *inode, * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. 
*/ - if (bio_flags & EXTENT_BIO_COMPRESSED) + if (compress_type != BTRFS_COMPRESS_NONE) bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; bio_ctrl->bio = bio; - bio_ctrl->bio_flags = bio_flags; + bio_ctrl->compress_type = compress_type; bio->bi_end_io = end_io_func; bio->bi_private = &inode->io_tree; bio->bi_opf = opf; @@ -3386,7 +3435,7 @@ error: * @end_io_func: end_io callback for new bio * @mirror_num: desired mirror to read/write * @prev_bio_flags: flags of previous bio to see if we can merge the current one - * @bio_flags: flags of the current bio to see if we can merge them + * @compress_type: compress type for current bio */ static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, @@ -3395,7 +3444,7 @@ static int submit_extent_page(unsigned int opf, size_t size, unsigned long pg_offset, bio_end_io_t end_io_func, int mirror_num, - unsigned long bio_flags, + enum btrfs_compression_type compress_type, bool force_bio_submit) { int ret = 0; @@ -3407,10 +3456,8 @@ static int submit_extent_page(unsigned int opf, ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); if (force_bio_submit && bio_ctrl->bio) { - ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags); + submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); bio_ctrl->bio = NULL; - if (ret < 0) - return ret; } while (cur < pg_offset + size) { @@ -3422,7 +3469,7 @@ static int submit_extent_page(unsigned int opf, ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, end_io_func, disk_bytenr, offset, page_offset(page) + cur, - bio_flags); + compress_type); if (ret < 0) return ret; } @@ -3430,14 +3477,14 @@ static int submit_extent_page(unsigned int opf, * We must go through btrfs_bio_add_page() to ensure each * page range won't cross various boundaries. 
*/ - if (bio_flags & EXTENT_BIO_COMPRESSED) + if (compress_type != BTRFS_COMPRESS_NONE) added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, size - offset, pg_offset + offset, - bio_flags); + compress_type); else added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr + offset, size - offset, - pg_offset + offset, bio_flags); + pg_offset + offset, compress_type); /* Metadata page range should never be split */ if (!is_data_inode(&inode->vfs_inode)) @@ -3451,11 +3498,8 @@ static int submit_extent_page(unsigned int opf, if (added < size - offset) { /* The bio should contain some page(s) */ ASSERT(bio_ctrl->bio->bi_iter.bi_size); - ret = submit_one_bio(bio_ctrl->bio, mirror_num, - bio_ctrl->bio_flags); + submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); bio_ctrl->bio = NULL; - if (ret < 0) - return ret; } cur += added; } @@ -3478,7 +3522,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, if (page->mapping) lockdep_assert_held(&page->mapping->private_lock); - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { if (!PagePrivate(page)) attach_page_private(page, eb); else @@ -3513,7 +3557,7 @@ int set_page_extent_mapped(struct page *page) fs_info = btrfs_sb(page->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) + if (btrfs_is_subpage(fs_info, page)) return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); @@ -3530,7 +3574,7 @@ void clear_page_extent_mapped(struct page *page) return; fs_info = btrfs_sb(page->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) + if (btrfs_is_subpage(fs_info, page)) return btrfs_detach_subpage(fs_info, page); detach_page_private(page); @@ -3569,7 +3613,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, unsigned int read_flags, u64 *prev_em_start) { @@ -3638,16 +3682,13 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - this_bio_flag |= EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&this_bio_flag, - em->compress_type); - } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = em->compress_type; iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); - if (this_bio_flag & EXTENT_BIO_COMPRESSED) + if (this_bio_flag != BTRFS_COMPRESS_NONE) disk_bytenr = em->block_start; else disk_bytenr = em->block_start + extent_offset; @@ -3743,8 +3784,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, this_bio_flag, force_bio_submit); if (ret) { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, false, cur, iosize); + /* + * We have to unlock the remaining range, or the page + * will never be unlocked. 
+ */ + unlock_extent(tree, cur, end); + end_page_read(page, false, cur, end + 1 - cur); goto out; } cur = cur + iosize; @@ -3754,6 +3799,26 @@ out: return ret; } +int btrfs_readpage(struct file *file, struct page *page) +{ + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + /* + * If btrfs_do_readpage() failed we will want to submit the assembled + * bio to do the cleanup. + */ + if (bio_ctrl.bio) + submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); + return ret; +} + static inline void contiguous_readpages(struct page *pages[], int nr_pages, u64 start, u64 end, struct extent_map **em_cached, @@ -3772,12 +3837,6 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, } } -static void update_nr_written(struct writeback_control *wbc, - unsigned long nr_written) -{ - wbc->nr_to_write -= nr_written; -} - /* * helper for __extent_writepage, doing all of the delayed allocation setup. * @@ -3877,7 +3936,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, * For regular sector size == page size case, since one page only * contains one sector, we return the page offset directly. */ - if (fs_info->sectorsize == PAGE_SIZE) { + if (!btrfs_is_subpage(fs_info, page)) { *start = page_offset(page); *end = page_offset(page) + PAGE_SIZE; return; @@ -3920,10 +3979,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u64 extent_offset; u64 block_start; struct extent_map *em; + int saved_ret = 0; int ret = 0; int nr = 0; u32 opf = REQ_OP_WRITE; const unsigned int write_flags = wbc_to_write_flags(wbc); + bool has_error = false; bool compressed; ret = btrfs_writepage_cow_fixup(page); @@ -3938,7 +3999,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * we don't want to touch the inode after unlocking the page, * so we update the mapping writeback index now */ - update_nr_written(wbc, 1); + wbc->nr_to_write--; while (cur <= end) { u64 disk_bytenr; @@ -3973,6 +4034,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (IS_ERR(em)) { btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); + has_error = true; + if (!saved_ret) + saved_ret = ret; break; } @@ -4036,6 +4100,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, end_bio_extent_writepage, 0, 0, false); if (ret) { + has_error = true; + if (!saved_ret) + saved_ret = ret; + btrfs_page_set_error(fs_info, page, cur, iosize); if (PageWriteback(page)) btrfs_page_clear_writeback(fs_info, page, cur, @@ -4049,8 +4117,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * If we finish without problem, we should not only clear page dirty, * but also empty subpage dirty bits */ - if (!ret) + if (!has_error) btrfs_page_assert_not_dirty(fs_info, page); + else + ret = saved_ret; *nr_ret = nr; return ret; } @@ -4181,9 +4251,6 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) static void end_extent_buffer_writeback(struct extent_buffer *eb) { - if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags)) - btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len); - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, 
EXTENT_BUFFER_WRITEBACK); @@ -4203,14 +4270,12 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb struct extent_page_data *epd) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i, num_pages, failed_page_nr; + int i, num_pages; int flush = 0; int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - ret = flush_write_bio(epd); - if (ret < 0) - return ret; + flush_write_bio(epd); flush = 1; btrfs_tree_lock(eb); } @@ -4220,9 +4285,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!epd->sync_io) return 0; if (!flush) { - ret = flush_write_bio(epd); - if (ret < 0) - return ret; + flush_write_bio(epd); flush = 1; } while (1) { @@ -4260,7 +4323,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb * Subpage metadata doesn't use page locking at all, so we can skip * the page locking. */ - if (!ret || fs_info->sectorsize < PAGE_SIZE) + if (!ret || fs_info->nodesize < PAGE_SIZE) return ret; num_pages = num_extent_pages(eb); @@ -4269,14 +4332,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - int err; - - err = flush_write_bio(epd); - if (err < 0) { - ret = err; - failed_page_nr = i; - goto err_unlock; - } + flush_write_bio(epd); flush = 1; } lock_page(p); @@ -4284,25 +4340,6 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb } return ret; -err_unlock: - /* Unlock already locked pages */ - for (i = 0; i < failed_page_nr; i++) - unlock_page(eb->pages[i]); - /* - * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. - * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can - * be made and undo everything done before. - */ - btrfs_tree_lock(eb); - spin_lock(&eb->refs_lock); - set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - end_extent_buffer_writeback(eb); - spin_unlock(&eb->refs_lock); - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, - fs_info->dirty_metadata_batch); - btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - btrfs_tree_unlock(eb); - return ret; } static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) @@ -4397,8 +4434,8 @@ static struct extent_buffer *find_extent_buffer_nolock( struct extent_buffer *eb; rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits); + eb = xa_load(&fs_info->extent_buffers, + start >> fs_info->sectorsize_bits); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); return eb; @@ -4420,7 +4457,7 @@ static void end_bio_subpage_eb_writepage(struct bio *bio) struct bvec_iter_all iter_all; fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); - ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(fs_info->nodesize < PAGE_SIZE); ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -4572,7 +4609,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, * dirty anymore, we have submitted a page. Update nr_written in wbc. 
*/ if (no_dirty_ebs) - update_nr_written(wbc, 1); + wbc->nr_to_write--; return ret; } @@ -4608,7 +4645,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, break; } disk_bytenr += PAGE_SIZE; - update_nr_written(wbc, 1); + wbc->nr_to_write--; unlock_page(p); } @@ -4747,7 +4784,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!PagePrivate(page)) return 0; - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc, epd); spin_lock(&mapping->private_lock); @@ -4803,8 +4840,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, /* * Implies write in zoned mode. Mark the last eb in a block group. */ - if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) - set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); + btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); } ret = write_one_eb(eb, wbc, epd); @@ -4923,13 +4959,19 @@ retry: * if the fs already has error. */ if (!BTRFS_FS_ERROR(fs_info)) { - ret = flush_write_bio(&epd); + flush_write_bio(&epd); } else { ret = -EROFS; end_write_bio(&epd, ret); } out: btrfs_zoned_meta_io_unlock(fs_info); + /* + * We can get ret > 0 from submit_extent_page() indicating how many ebs + * were submitted. Reset it to 0 to avoid false alerts for the caller. + */ + if (ret > 0) + ret = 0; return ret; } @@ -5031,8 +5073,7 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - ret = flush_write_bio(epd); - BUG_ON(ret < 0); + flush_write_bio(epd); lock_page(page); } @@ -5042,10 +5083,8 @@ retry: } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) { - ret = flush_write_bio(epd); - BUG_ON(ret < 0); - } + if (PageWriteback(page)) + flush_write_bio(epd); wait_on_page_writeback(page); } @@ -5085,9 +5124,8 @@ retry: * page in our current bio, and thus deadlock, so flush the * write bio here. 
*/ - ret = flush_write_bio(epd); - if (!ret) - goto retry; + flush_write_bio(epd); + goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) @@ -5113,8 +5151,7 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) return ret; } - ret = flush_write_bio(&epd); - ASSERT(ret <= 0); + flush_write_bio(&epd); return ret; } @@ -5176,7 +5213,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) } if (!found_error) - ret = flush_write_bio(&epd); + flush_write_bio(&epd); else end_write_bio(&epd, ret); @@ -5209,7 +5246,7 @@ int extent_writepages(struct address_space *mapping, end_write_bio(&epd, ret); return ret; } - ret = flush_write_bio(&epd); + flush_write_bio(&epd); return ret; } @@ -5232,10 +5269,8 @@ void extent_readahead(struct readahead_control *rac) if (em_cached) free_extent_map(em_cached); - if (bio_ctrl.bio) { - if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags)) - return; - } + if (bio_ctrl.bio) + submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); } /* @@ -5804,7 +5839,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag return; } - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -5911,9 +5946,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { int i; - struct page *p; struct extent_buffer *new; int num_pages = num_extent_pages(src); + int ret; new = __alloc_extent_buffer(src->fs_info, src->start, src->len); if (new == NULL) @@ -5926,22 +5961,23 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + memset(new->pages, 0, sizeof(*new->pages) * num_pages); + ret = btrfs_alloc_page_array(num_pages, new->pages); + if (ret) { + btrfs_release_extent_buffer(new); + return NULL; + } + for (i = 0; i < num_pages; i++) { int ret; + struct page *p = new->pages[i]; - p = alloc_page(GFP_NOFS); - if (!p) { - btrfs_release_extent_buffer(new); - return NULL; - } ret = attach_extent_buffer_page(new, p, NULL); if (ret < 0) { - put_page(p); btrfs_release_extent_buffer(new); return NULL; } WARN_ON(PageDirty(p)); - new->pages[i] = p; copy_page(page_address(p), page_address(src->pages[i])); } set_extent_buffer_uptodate(new); @@ -5955,31 +5991,36 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; int num_pages; int i; + int ret; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; num_pages = num_extent_pages(eb); + ret = btrfs_alloc_page_array(num_pages, eb->pages); + if (ret) + goto err; + for (i = 0; i < num_pages; i++) { - int ret; + struct page *p = eb->pages[i]; - eb->pages[i] = alloc_page(GFP_NOFS); - if (!eb->pages[i]) - goto err; - ret = attach_extent_buffer_page(eb, eb->pages[i], NULL); + ret = attach_extent_buffer_page(eb, p, NULL); if (ret < 0) goto err; } + set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; err: - for (; i > 0; i--) { - detach_extent_buffer_page(eb, eb->pages[i - 1]); - __free_page(eb->pages[i - 1]); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) { + detach_extent_buffer_page(eb, eb->pages[i]); + __free_page(eb->pages[i]); + } } __free_extent_buffer(eb); return NULL; @@ -6086,24 +6127,22 @@ 
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, if (!eb) return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; -again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; - } - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) + + do { + ret = xa_insert(&fs_info->extent_buffers, + start >> fs_info->sectorsize_bits, + eb, GFP_NOFS); + if (ret == -ENOMEM) { + exists = ERR_PTR(ret); goto free_eb; - else - goto again; - } + } + if (ret == -EBUSY) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + } + } while (ret); + check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6124,7 +6163,7 @@ static struct extent_buffer *grab_extent_buffer( * don't try to insert two ebs for the same bytenr. So here we always * return NULL and just continue. */ - if (fs_info->sectorsize < PAGE_SIZE) + if (fs_info->nodesize < PAGE_SIZE) return NULL; /* Page not yet attached to an extent buffer */ @@ -6146,6 +6185,30 @@ static struct extent_buffer *grab_extent_buffer( return NULL; } +static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +{ + if (!IS_ALIGNED(start, fs_info->sectorsize)) { + btrfs_err(fs_info, "bad tree block start %llu", start); + return -EINVAL; + } + + if (fs_info->nodesize < PAGE_SIZE && + offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { + btrfs_err(fs_info, + "tree block crosses page boundary, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + if (fs_info->nodesize >= PAGE_SIZE && + !IS_ALIGNED(start, PAGE_SIZE)) { + btrfs_err(fs_info, + "tree block is not page aligned, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + return 0; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { @@ -6160,10 +6223,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int uptodate = 1; int ret; - if (!IS_ALIGNED(start, fs_info->sectorsize)) { - btrfs_err(fs_info, "bad tree block start %llu", start); + if (check_eb_alignment(fs_info, start)) return ERR_PTR(-EINVAL); - } #if BITS_PER_LONG == 32 if (start >= MAX_LFS_FILESIZE) { @@ -6176,14 +6237,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_warn_32bit_limit(fs_info); #endif - if (fs_info->sectorsize < PAGE_SIZE && - offset_in_page(start) + len > PAGE_SIZE) { - btrfs_err(fs_info, - "tree block crosses page boundary, start %llu nodesize %lu", - start, len); - return ERR_PTR(-EINVAL); - } - eb = find_extent_buffer(fs_info, start); if (eb) return eb; @@ -6213,7 +6266,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * page, but it may change in the future for 16K page size * support, so we still preallocate the memory in the loop. 
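The hunk above replaces radix_tree_preload()/radix_tree_insert() with an xa_insert() loop: -EBUSY means another task already inserted an entry at the same index, so it is looked up and reused, while -ENOMEM aborts. A minimal sketch of that pattern, assuming a kernel build context; insert_or_reuse() and struct my_obj are made-up names used only for illustration.

#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/xarray.h>

struct my_obj;

static struct my_obj *insert_or_reuse(struct xarray *xa, unsigned long index,
				      struct my_obj *new)
{
	struct my_obj *existing;
	int ret;

	do {
		ret = xa_insert(xa, index, new, GFP_NOFS);
		if (ret == -ENOMEM)
			return ERR_PTR(-ENOMEM);
		if (ret == -EBUSY) {
			/* Lost the race: reuse the winner's object. */
			existing = xa_load(xa, index);
			if (existing)
				return existing;
			/* The entry was removed again, retry the insert. */
		}
	} while (ret);

	/* Our object is now published at @index. */
	return new;
}

When the existing entry is returned, the caller is expected to drop its own object, which is what the free_eb paths in the hunk above do.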
*/ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE) { prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); @@ -6264,25 +6317,22 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); -again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; - } - - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) + + do { + ret = xa_insert(&fs_info->extent_buffers, + start >> fs_info->sectorsize_bits, + eb, GFP_NOFS); + if (ret == -ENOMEM) { + exists = ERR_PTR(ret); goto free_eb; - else - goto again; - } + } + if (ret == -EBUSY) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + } + } while (ret); + /* add one reference for the tree */ check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6327,10 +6377,8 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); - spin_lock(&fs_info->buffer_lock); - radix_tree_delete(&fs_info->buffer_radix, - eb->start >> fs_info->sectorsize_bits); - spin_unlock(&fs_info->buffer_lock); + xa_erase(&fs_info->extent_buffers, + eb->start >> fs_info->sectorsize_bits); } else { spin_unlock(&eb->refs_lock); } @@ -6432,7 +6480,7 @@ void clear_extent_buffer_dirty(const struct extent_buffer *eb) int num_pages; struct page *page; - if (eb->fs_info->sectorsize < PAGE_SIZE) + if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); num_pages = num_extent_pages(eb); @@ -6464,7 +6512,7 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->sectorsize < PAGE_SIZE; + bool subpage = eb->fs_info->nodesize < PAGE_SIZE; /* * For subpage case, we can have other extent buffers in the @@ -6504,9 +6552,18 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - if (page) - btrfs_page_clear_uptodate(fs_info, page, - eb->start, eb->len); + if (!page) + continue; + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. + */ + if (fs_info->nodesize >= PAGE_SIZE) + ClearPageUptodate(page); + else + btrfs_subpage_clear_uptodate(fs_info, page, eb->start, + eb->len); } } @@ -6521,7 +6578,16 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len); + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. 
+ */ + if (fs_info->nodesize >= PAGE_SIZE) + SetPageUptodate(page); + else + btrfs_subpage_set_uptodate(fs_info, page, eb->start, + eb->len); } } @@ -6577,12 +6643,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, atomic_dec(&eb->io_pages); } if (bio_ctrl.bio) { - int tmp; - - tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0); + submit_one_bio(bio_ctrl.bio, mirror_num, 0); bio_ctrl.bio = NULL; - if (tmp < 0) - return tmp; } if (ret || wait != WAIT_COMPLETE) return ret; @@ -6616,7 +6678,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) return -EIO; - if (eb->fs_info->sectorsize < PAGE_SIZE) + if (eb->fs_info->nodesize < PAGE_SIZE) return read_extent_buffer_subpage(eb, wait, mirror_num); num_pages = num_extent_pages(eb); @@ -6695,10 +6757,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } if (bio_ctrl.bio) { - err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags); + submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.compress_type); bio_ctrl.bio = NULL; - if (err) - return err; } if (ret || wait != WAIT_COMPLETE) @@ -6871,7 +6931,7 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, * would have !PageUptodate && !PageError, as we clear PageError before * reading. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE) { bool uptodate, error; uptodate = btrfs_subpage_test_uptodate(fs_info, page, @@ -6973,7 +7033,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, ASSERT(dst->len == src->len); - if (dst->fs_info->sectorsize == PAGE_SIZE) { + if (dst->fs_info->nodesize >= PAGE_SIZE) { num_pages = num_extent_pages(dst); for (i = 0; i < num_pages; i++) copy_page(page_address(dst->pages[i]), @@ -6982,7 +7042,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, size_t src_offset = get_eb_offset_in_page(src, 0); size_t dst_offset = get_eb_offset_in_page(dst, 0); - ASSERT(src->fs_info->sectorsize < PAGE_SIZE); + ASSERT(src->fs_info->nodesize < PAGE_SIZE); memcpy(page_address(dst->pages[0]) + dst_offset, page_address(src->pages[0]) + src_offset, src->len); @@ -7263,42 +7323,25 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } -#define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) { - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; - struct extent_buffer *found = NULL; + struct extent_buffer *eb; + unsigned long index; u64 page_start = page_offset(page); - u64 cur = page_start; ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - while (cur < page_start + PAGE_SIZE) { - int ret; - int i; - - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, - (void **)gang, cur >> fs_info->sectorsize_bits, - min_t(unsigned int, GANG_LOOKUP_SIZE, - PAGE_SIZE / fs_info->nodesize)); - if (ret == 0) - goto out; - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) - goto out; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - goto out; - } - } - cur = gang[ret - 1]->start + gang[ret - 1]->len; + xa_for_each_start(&fs_info->extent_buffers, index, eb, + page_start >> fs_info->sectorsize_bits) { + if (in_range(eb->start, page_start, PAGE_SIZE)) + return eb; + else if (eb->start >= page_start + PAGE_SIZE) + /* Already beyond page end */ + return NULL; } -out: - 
return found; + return NULL; } static int try_release_subpage_extent_buffer(struct page *page) @@ -7375,7 +7418,7 @@ int try_release_extent_buffer(struct page *page) { struct extent_buffer *eb; - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return try_release_subpage_extent_buffer(page); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 151e9da5da2d..956fa434df43 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,15 +7,9 @@ #include <linux/refcount.h> #include <linux/fiemap.h> #include <linux/btrfs_tree.h> +#include "compression.h" #include "ulist.h" -/* - * flags for bio submission. The high bits indicate the compression - * type for this bio - */ -#define EXTENT_BIO_COMPRESSED 1 -#define EXTENT_BIO_FLAG_SHIFT 16 - enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, @@ -32,7 +26,6 @@ enum { /* write IO error */ EXTENT_BUFFER_WRITE_ERR, EXTENT_BUFFER_NO_CHECK, - EXTENT_BUFFER_ZONE_FINISH, }; /* these are flags for __process_pages_contig */ @@ -71,9 +64,9 @@ struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; -typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, +typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio, int mirror_num, - unsigned long bio_flags); + enum btrfs_compression_type compress_type); typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, struct bio *bio, u64 dio_file_offset); @@ -103,17 +96,6 @@ struct extent_buffer { }; /* - * Structure to record info about the bio being assembled, and other info like - * how many bytes are there before stripe/ordered extent boundary. - */ -struct btrfs_bio_ctrl { - struct bio *bio; - unsigned long bio_flags; - u32 len_to_stripe_boundary; - u32 len_to_oe_boundary; -}; - -/* * Structure to record how many bytes and which ranges are set/cleared */ struct extent_changeset { @@ -158,17 +140,6 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -static inline void extent_set_compress_type(unsigned long *bio_flags, - int compress_type) -{ - *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; -} - -static inline int extent_compress_type(unsigned long bio_flags) -{ - return bio_flags >> EXTENT_BIO_FLAG_SHIFT; -} - struct extent_map_tree; typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, @@ -178,11 +149,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags); -int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - unsigned int read_flags, u64 *prev_em_start); +int btrfs_readpage(struct file *file, struct page *page); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end); int extent_writepages(struct address_space *mapping, @@ -277,8 +244,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); + +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); -struct bio 
*btrfs_bio_clone(struct bio *bio); +struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); @@ -297,7 +266,7 @@ struct io_failure_record { u64 start; u64 len; u64 logical; - unsigned long bio_flags; + enum btrfs_compression_type compress_type; int this_mirror; int failed_mirror; }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 380054c94e4b..46c2baa8fdf5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1460,8 +1460,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, return ret; } -static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes, bool nowait) +/* + * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) + * + * @pos: File offset. + * @write_bytes: The length to write, will be updated to the nocow writeable + * range. + * + * This function will flush ordered extents in the range to ensure proper + * nocow checks. + * + * Return: + * > 0 If we can nocow, and updates @write_bytes. + * 0 If we can't do a nocow write. + * -EAGAIN If we can't do a nocow write because snapshoting of the inode's + * root is in progress. + * < 0 If an error happened. + * + * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. + */ +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -1472,7 +1491,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return 0; - if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock)) + if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); @@ -1480,71 +1499,21 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, fs_info->sectorsize) - 1; num_bytes = lockend - lockstart + 1; - if (nowait) { - struct btrfs_ordered_extent *ordered; - - if (!try_lock_extent(&inode->io_tree, lockstart, lockend)) - return -EAGAIN; - - ordered = btrfs_lookup_ordered_range(inode, lockstart, - num_bytes); - if (ordered) { - btrfs_put_ordered_extent(ordered); - ret = -EAGAIN; - goto out_unlock; - } - } else { - btrfs_lock_and_flush_ordered_range(inode, lockstart, - lockend, NULL); - } - + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, NULL, NULL, NULL, false); if (ret <= 0) { ret = 0; - if (!nowait) - btrfs_drew_write_unlock(&root->snapshot_lock); + btrfs_drew_write_unlock(&root->snapshot_lock); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); } -out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend); return ret; } -static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) -{ - return check_can_nocow(inode, pos, write_bytes, true); -} - -/* - * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) - * - * @pos: File offset - * @write_bytes: The length to write, will be updated to the nocow writeable - * range - * - * This function will flush ordered extents in the range to ensure proper - * nocow checks. 
- * - * Return: - * >0 and update @write_bytes if we can do nocow write - * 0 if we can't do nocow write - * -EAGAIN if we can't get the needed lock or there are ordered extents - * for * (nowait == true) case - * <0 if other error happened - * - * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock(). - */ -int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) -{ - return check_can_nocow(inode, pos, write_bytes, false); -} - void btrfs_check_nocow_unlock(struct btrfs_inode *inode) { btrfs_drew_write_unlock(&inode->root->snapshot_lock); @@ -1579,20 +1548,15 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, loff_t oldsize; loff_t start_pos; - if (iocb->ki_flags & IOCB_NOWAIT) { - size_t nocow_bytes = count; - - /* We will allocate space in case nodatacow is not set, so bail */ - if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0) - return -EAGAIN; - /* - * There are holes in the range or parts of the range that must - * be COWed (shared extents, RO block groups, etc), so just bail - * out. - */ - if (nocow_bytes < count) - return -EAGAIN; - } + /* + * Quickly bail out on NOWAIT writes if we don't have the nodatacow or + * prealloc flags, as without those flags we always have to COW. We will + * later check if we can really COW into the target range (using + * can_nocow_extent() at btrfs_get_blocks_direct_write()). + */ + if ((iocb->ki_flags & IOCB_NOWAIT) && + !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + return -EAGAIN; current->backing_dev_info = inode_to_bdi(inode); ret = file_remove_privs(file); @@ -1720,7 +1684,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), reserve_bytes, - reserve_bytes); + reserve_bytes, false); if (ret) { if (!only_release_metadata) btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -1965,8 +1929,7 @@ relock: */ again: from->nofault = true; - err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, written); + err = btrfs_dio_rw(iocb, from, written); from->nofault = false; /* No increment (+=) because iomap returns a cumulative value. */ @@ -2570,10 +2533,10 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) return ret; } -static int btrfs_punch_hole_lock_range(struct inode *inode, - const u64 lockstart, - const u64 lockend, - struct extent_state **cached_state) +static void btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, + const u64 lockend, + struct extent_state **cached_state) { /* * For subpage case, if the range is not at page boundary, we could @@ -2587,40 +2550,29 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; while (1) { - struct btrfs_ordered_extent *ordered; - int ret; - truncate_pagecache_range(inode, lockstart, lockend); lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), - lockend); - /* - * We need to make sure we have no ordered extents in this range - * and nobody raced in and read a page in this range, if we did - * we need to try again. 
+ * We can't have ordered extents in the range, nor dirty/writeback + * pages, because we have locked the inode's VFS lock in exclusive + * mode, we have locked the inode's i_mmap_lock in exclusive mode, + * we have flushed all delalloc in the range and we have waited + * for any ordered extents in the range to complete. + * We can race with anyone reading pages from this range, so after + * locking the range check if we have pages in the range, and if + * we do, unlock the range and retry. */ - if ((!ordered || - (ordered->file_offset + ordered->num_bytes <= lockstart || - ordered->file_offset > lockend)) && - !filemap_range_has_page(inode->i_mapping, - page_lockstart, page_lockend)) { - if (ordered) - btrfs_put_ordered_extent(ordered); + if (!filemap_range_has_page(inode->i_mapping, page_lockstart, + page_lockend)) break; - } - if (ordered) - btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); - ret = btrfs_wait_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (ret) - return ret; } - return 0; + + btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); } static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, @@ -2976,11 +2928,12 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) bool truncated_block = false; bool updated_inode = false; + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) - return ret; + goto out_only_mutex; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); ino_size = round_up(inode->i_size, fs_info->sectorsize); ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) @@ -3072,10 +3025,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) goto out_only_mutex; } - ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, - &cached_state); - if (ret) - goto out_only_mutex; + btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); path = btrfs_alloc_path(); if (!path) { @@ -3237,8 +3187,6 @@ static int btrfs_zero_range(struct inode *inode, u64 bytes_to_reserve = 0; bool space_reserved = false; - inode_dio_wait(inode); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, alloc_end - alloc_start); if (IS_ERR(em)) { @@ -3368,10 +3316,8 @@ reserve_space: if (ret < 0) goto out; space_reserved = true; - ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, - &cached_state); - if (ret) - goto out; + btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { @@ -3417,6 +3363,9 @@ static long btrfs_fallocate(struct file *file, int mode, u64 alloc_hint = 0; u64 locked_end; u64 actual_end = 0; + u64 data_space_needed = 0; + u64 data_space_reserved = 0; + u64 qgroup_reserved = 0; struct extent_map *em; int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); int ret; @@ -3437,18 +3386,6 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_PUNCH_HOLE) return btrfs_punch_hole(file, offset, len); - /* - * Only trigger disk allocation, don't trigger qgroup reserve - * - * For qgroup space, it will be checked later. 
- */ - if (!(mode & FALLOC_FL_ZERO_RANGE)) { - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), - alloc_end - alloc_start); - if (ret < 0) - return ret; - } - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { @@ -3485,8 +3422,12 @@ static long btrfs_fallocate(struct file *file, int mode, } /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. + * We have locked the inode at the VFS level (in exclusive mode) and we + * have locked the i_mmap_lock lock (in exclusive mode). Now before + * locking the file range, flush all dealloc in the range and wait for + * all ordered extents in the range to complete. After this we can lock + * the file range and, due to the previous locking we did, we know there + * can't be more delalloc or ordered extents in the range. */ ret = btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); @@ -3500,38 +3441,10 @@ static long btrfs_fallocate(struct file *file, int mode, } locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), - locked_end); - - if (ordered && - ordered->file_offset + ordered->num_bytes > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - ret = btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - if (ret) - goto out; - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } + btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); @@ -3548,48 +3461,64 @@ static long btrfs_fallocate(struct file *file, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = add_falloc_range(&reserve_list, cur_offset, - last_byte - cur_offset); + const u64 range_len = last_byte - cur_offset; + + ret = add_falloc_range(&reserve_list, cur_offset, range_len); if (ret < 0) { free_extent_map(em); break; } ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), - &data_reserved, cur_offset, - last_byte - cur_offset); + &data_reserved, cur_offset, range_len); if (ret < 0) { - cur_offset = last_byte; free_extent_map(em); break; } - } else { - /* - * Do not need to reserve unwritten extent for this - * range, free reserved data space first, otherwise - * it'll result in false ENOSPC error. - */ - btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, cur_offset, - last_byte - cur_offset); + qgroup_reserved += range_len; + data_space_needed += range_len; } free_extent_map(em); cur_offset = last_byte; } + if (!ret && data_space_needed > 0) { + /* + * We are safe to reserve space here as we can't have delalloc + * in the range, see above. + */ + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + data_space_needed); + if (!ret) + data_space_reserved = data_space_needed; + } + /* * If ret is still 0, means we're OK to fallocate. 
* Or just cleanup the list and exit. */ list_for_each_entry_safe(range, tmp, &reserve_list, list) { - if (!ret) + if (!ret) { ret = btrfs_prealloc_file_range(inode, mode, range->start, range->len, i_blocksize(inode), offset + len, &alloc_hint); - else + /* + * btrfs_prealloc_file_range() releases space even + * if it returns an error. + */ + data_space_reserved -= range->len; + qgroup_reserved -= range->len; + } else if (data_space_reserved > 0) { btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, range->start, - range->len); + data_reserved, range->start, + range->len); + data_space_reserved -= range->len; + qgroup_reserved -= range->len; + } else if (qgroup_reserved > 0) { + btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, + range->start, range->len); + qgroup_reserved -= range->len; + } list_del(&range->list); kfree(range); } @@ -3606,10 +3535,6 @@ out_unlock: &cached_state); out: btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); - /* Let go of our reservation. */ - if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) - btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, - cur_offset, alloc_end - cur_offset); extent_changeset_free(data_reserved); return ret; } @@ -3767,8 +3692,7 @@ again: */ pagefault_disable(); to->nofault = true; - ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, read); + ret = btrfs_dio_rw(iocb, to, read); to->nofault = false; pagefault_enable(); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 01a408db5683..f7adee6fa05e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2630,16 +2630,19 @@ out: static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 bytenr, u64 size, bool used) { - struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *sinfo = block_group->space_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; - const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold); + int bg_reclaim_threshold = 0; bool initial = (size == block_group->length); u64 reclaimable_unusable; WARN_ON(!initial && offset + size > block_group->zone_capacity); + if (!initial) + bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); + spin_lock(&ctl->tree_lock); if (!used) to_free = size; @@ -4069,7 +4072,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, btrfs_info(fs_info, "cleaning free space cache v1"); - node = rb_first(&fs_info->block_group_cache_tree); + node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = btrfs_remove_free_space_inode(trans, NULL, block_group); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 0ae54d8c10d6..1bf89aa67216 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1178,7 +1178,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) goto abort; } - node = rb_first(&fs_info->block_group_cache_tree); + node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { block_group = rb_entry(node, struct btrfs_block_group, cache_node); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 95c499b8424e..da13bd0d10f1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -64,8 +64,36 @@ struct btrfs_iget_args { struct btrfs_dio_data { ssize_t submitted; struct extent_changeset *data_reserved; + bool 
data_space_reserved; + bool nocow_done; }; +struct btrfs_dio_private { + struct inode *inode; + + /* + * Since DIO can use anonymous page, we cannot use page_offset() to + * grab the file offset, thus need a dedicated member for file offset. + */ + u64 file_offset; + /* Used for bio::bi_size */ + u32 bytes; + + /* + * References to this structure. There is one reference per in-flight + * bio plus one while we're still setting up. + */ + refcount_t refs; + + /* Array of checksums */ + u8 *csums; + + /* This must be last */ + struct bio bio; +}; + +static struct bio_set btrfs_dio_bioset; + struct btrfs_rename_ctx { /* Output field. Stores the index number of the old directory entry. */ u64 index; @@ -222,15 +250,25 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, static int btrfs_dirty_inode(struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir, - const struct qstr *qstr) + struct btrfs_new_inode_args *args) { int err; - err = btrfs_init_acl(trans, inode, dir); - if (!err) - err = btrfs_xattr_security_init(trans, inode, dir, qstr); - return err; + if (args->default_acl) { + err = __btrfs_set_acl(trans, args->inode, args->default_acl, + ACL_TYPE_DEFAULT); + if (err) + return err; + } + if (args->acl) { + err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); + if (err) + return err; + } + if (!args->default_acl && !args->acl) + cache_no_acl(args->inode); + return btrfs_xattr_security_init(trans, args->inode, args->dir, + &args->dentry->d_name); } /* @@ -1607,6 +1645,141 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, nr_written, 1); } +struct can_nocow_file_extent_args { + /* Input fields. */ + + /* Start file offset of the range we want to NOCOW. */ + u64 start; + /* End file offset (inclusive) of the range we want to NOCOW. */ + u64 end; + bool writeback_path; + bool strict; + /* + * Free the path passed to can_nocow_file_extent() once it's not needed + * anymore. + */ + bool free_path; + + /* Output fields. Only set when can_nocow_file_extent() returns 1. */ + + u64 disk_bytenr; + u64 disk_num_bytes; + u64 extent_offset; + /* Number of bytes that can be written to in NOCOW mode. */ + u64 num_bytes; +}; + +/* + * Check if we can NOCOW the file extent that the path points to. + * This function may return with the path released, so the caller should check + * if path->nodes[0] is NULL or not if it needs to use the path afterwards. + * + * Returns: < 0 on error + * 0 if we can not NOCOW + * 1 if we can NOCOW + */ +static int can_nocow_file_extent(struct btrfs_path *path, + struct btrfs_key *key, + struct btrfs_inode *inode, + struct can_nocow_file_extent_args *args) +{ + const bool is_freespace_inode = btrfs_is_free_space_inode(inode); + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + u64 extent_end; + u8 extent_type; + int can_nocow = 0; + int ret = 0; + + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + goto out; + + /* Can't access these fields unless we know it's not an inline extent. 
*/ + args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + args->extent_offset = btrfs_file_extent_offset(leaf, fi); + + if (!(inode->flags & BTRFS_INODE_NODATACOW) && + extent_type == BTRFS_FILE_EXTENT_REG) + goto out; + + /* + * If the extent was created before the generation where the last snapshot + * for its subvolume was created, then this implies the extent is shared, + * hence we must COW. + */ + if (!args->strict && + btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + + /* An explicit hole, must COW. */ + if (args->disk_bytenr == 0) + goto out; + + /* Compressed/encrypted/encoded extents must be COWed. */ + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out; + + extent_end = btrfs_file_extent_end(path); + + /* + * The following checks can be expensive, as they need to take other + * locks and do btree or rbtree searches, so release the path to avoid + * blocking other tasks for too long. + */ + btrfs_release_path(path); + + ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), + key->offset - args->extent_offset, + args->disk_bytenr, false, path); + WARN_ON_ONCE(ret > 0 && is_freespace_inode); + if (ret != 0) + goto out; + + if (args->free_path) { + /* + * We don't need the path anymore, plus through the + * csum_exist_in_range() call below we will end up allocating + * another path. So free the path to avoid unnecessary extra + * memory usage. + */ + btrfs_free_path(path); + path = NULL; + } + + /* If there are pending snapshots for this root, we must COW. */ + if (args->writeback_path && !is_freespace_inode && + atomic_read(&root->snapshot_force_cow)) + goto out; + + args->disk_bytenr += args->extent_offset; + args->disk_bytenr += args->start - key->offset; + args->num_bytes = min(args->end + 1, extent_end) - args->start; + + /* + * Force COW if csums exist in the range. This ensures that csums for a + * given extent are either valid or do not exist. + */ + ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes); + WARN_ON_ONCE(ret > 0 && is_freespace_inode); + if (ret != 0) + goto out; + + can_nocow = 1; + out: + if (args->free_path && path) + btrfs_free_path(path); + + return ret < 0 ? ret : can_nocow; +} + /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. 
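The new can_nocow_file_extent() helper above gathers the NOCOW eligibility checks that run_delalloc_nocow() previously open-coded. The standalone sketch below only restates those checks as a predicate over plain flags; the struct and function names are invented for illustration, and the real helper also releases the btree path and performs the cross-reference and csum lookups that make the last two answers expensive.

/* Invented summary of the conditions under which a NOCOW write is allowed. */
struct extent_facts {
	int is_inline;			/* BTRFS_FILE_EXTENT_INLINE */
	int is_prealloc;		/* BTRFS_FILE_EXTENT_PREALLOC */
	int inode_nodatacow;		/* inode has NODATACOW set */
	int newer_than_last_snapshot;	/* generation > last snapshot of the root */
	int is_hole;			/* disk_bytenr == 0 */
	int is_compressed_or_encoded;	/* compression/encryption/other encoding */
	int is_cross_referenced;	/* shared, per btrfs_cross_ref_exist() */
	int snapshot_in_progress;	/* root->snapshot_force_cow elevated */
	int has_csums_in_range;		/* csum_exist_in_range() found checksums */
};

static int may_nocow(const struct extent_facts *f)
{
	if (f->is_inline)
		return 0;
	if (!f->inode_nodatacow && !f->is_prealloc)
		return 0;	/* regular extents need the NODATACOW flag */
	if (!f->newer_than_last_snapshot)
		return 0;	/* extent predates the last snapshot: shared */
	if (f->is_hole || f->is_compressed_or_encoded)
		return 0;
	if (f->is_cross_referenced || f->snapshot_in_progress)
		return 0;
	if (f->has_csums_in_range)
		return 0;	/* keep csums either fully valid or absent */
	return 1;
}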
@@ -1627,11 +1800,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, u64 cur_offset = start; int ret; bool check_prev = true; - const bool freespace_inode = btrfs_is_free_space_inode(inode); u64 ino = btrfs_ino(inode); + struct btrfs_block_group *bg; bool nocow = false; - u64 disk_bytenr = 0; - const bool force = inode->flags & BTRFS_INODE_NODATACOW; + struct can_nocow_file_extent_args nocow_args = { 0 }; path = btrfs_alloc_path(); if (!path) { @@ -1644,15 +1816,16 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, return -ENOMEM; } + nocow_args.end = end; + nocow_args.writeback_path = true; + while (1) { struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; u64 extent_end; - u64 extent_offset; - u64 num_bytes = 0; - u64 disk_num_bytes; u64 ram_bytes; + u64 nocow_end; int extent_type; nocow = false; @@ -1728,116 +1901,38 @@ next_slot: fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); - + /* If this is triggered then we have a memory corruption. */ + ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES); + if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) { + ret = -EUCLEAN; + goto error; + } ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); - if (extent_type == BTRFS_FILE_EXTENT_REG || - extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - extent_offset = btrfs_file_extent_offset(leaf, fi); - extent_end = found_key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - disk_num_bytes = - btrfs_file_extent_disk_num_bytes(leaf, fi); - /* - * If the extent we got ends before our current offset, - * skip to the next extent. - */ - if (extent_end <= cur_offset) { - path->slots[0]++; - goto next_slot; - } - /* Skip holes */ - if (disk_bytenr == 0) - goto out_check; - /* Skip compressed/encrypted/encoded extents */ - if (btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - goto out_check; - /* - * If extent is created before the last volume's snapshot - * this implies the extent is shared, hence we can't do - * nocow. This is the same check as in - * btrfs_cross_ref_exist but without calling - * btrfs_search_slot. - */ - if (!freespace_inode && - btrfs_file_extent_generation(leaf, fi) <= - btrfs_root_last_snapshot(&root->root_item)) - goto out_check; - if (extent_type == BTRFS_FILE_EXTENT_REG && !force) - goto out_check; + extent_end = btrfs_file_extent_end(path); - /* - * The following checks can be expensive, as they need to - * take other locks and do btree or rbtree searches, so - * release the path to avoid blocking other tasks for too - * long. - */ - btrfs_release_path(path); + /* + * If the extent we got ends before our current offset, skip to + * the next extent. + */ + if (extent_end <= cur_offset) { + path->slots[0]++; + goto next_slot; + } - ret = btrfs_cross_ref_exist(root, ino, - found_key.offset - - extent_offset, disk_bytenr, false); - if (ret) { - /* - * ret could be -EIO if the above fails to read - * metadata. 
- */ - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; - goto error; - } + nocow_args.start = cur_offset; + ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); + if (ret < 0) { + if (cow_start != (u64)-1) + cur_offset = cow_start; + goto error; + } else if (ret == 0) { + goto out_check; + } - WARN_ON_ONCE(freespace_inode); - goto out_check; - } - disk_bytenr += extent_offset; - disk_bytenr += cur_offset - found_key.offset; - num_bytes = min(end + 1, extent_end) - cur_offset; - /* - * If there are pending snapshots for this root, we - * fall into common COW way - */ - if (!freespace_inode && atomic_read(&root->snapshot_force_cow)) - goto out_check; - /* - * force cow if csum exists in the range. - * this ensure that csum for a given extent are - * either valid or do not exist. - */ - ret = csum_exist_in_range(fs_info, disk_bytenr, - num_bytes); - if (ret) { - /* - * ret could be -EIO if the above fails to read - * metadata. - */ - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; - goto error; - } - WARN_ON_ONCE(freespace_inode); - goto out_check; - } - /* If the extent's block group is RO, we must COW */ - if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) - goto out_check; + ret = 0; + bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + if (bg) nocow = true; - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - extent_end = found_key.offset + ram_bytes; - extent_end = ALIGN(extent_end, fs_info->sectorsize); - /* Skip extents outside of our requested range */ - if (extent_end <= start) { - path->slots[0]++; - goto next_slot; - } - } else { - /* If this triggers then we have a memory corruption */ - BUG(); - } out_check: /* * If nocow is false then record the beginning of the range @@ -1869,15 +1964,17 @@ out_check: cow_start = (u64)-1; } + nocow_end = cur_offset + nocow_args.num_bytes - 1; + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 orig_start = found_key.offset - extent_offset; + u64 orig_start = found_key.offset - nocow_args.extent_offset; struct extent_map *em; - em = create_io_em(inode, cur_offset, num_bytes, + em = create_io_em(inode, cur_offset, nocow_args.num_bytes, orig_start, - disk_bytenr, /* block_start */ - num_bytes, /* block_len */ - disk_num_bytes, /* orig_block_len */ + nocow_args.disk_bytenr, /* block_start */ + nocow_args.num_bytes, /* block_len */ + nocow_args.disk_num_bytes, /* orig_block_len */ ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { @@ -1886,20 +1983,23 @@ out_check: } free_extent_map(em); ret = btrfs_add_ordered_extent(inode, - cur_offset, num_bytes, num_bytes, - disk_bytenr, num_bytes, 0, + cur_offset, nocow_args.num_bytes, + nocow_args.num_bytes, + nocow_args.disk_bytenr, + nocow_args.num_bytes, 0, 1 << BTRFS_ORDERED_PREALLOC, BTRFS_COMPRESS_NONE); if (ret) { btrfs_drop_extent_cache(inode, cur_offset, - cur_offset + num_bytes - 1, - 0); + nocow_end, 0); goto error; } } else { ret = btrfs_add_ordered_extent(inode, cur_offset, - num_bytes, num_bytes, - disk_bytenr, num_bytes, + nocow_args.num_bytes, + nocow_args.num_bytes, + nocow_args.disk_bytenr, + nocow_args.num_bytes, 0, 1 << BTRFS_ORDERED_NOCOW, BTRFS_COMPRESS_NONE); @@ -1907,9 +2007,10 @@ out_check: goto error; } - if (nocow) - btrfs_dec_nocow_writers(fs_info, disk_bytenr); - nocow = false; + if (nocow) { + btrfs_dec_nocow_writers(bg); + nocow = false; + } if (btrfs_is_data_reloc_root(root)) /* @@ -1918,10 +2019,9 @@ out_check: * from freeing metadata of created ordered extent. 
*/ ret = btrfs_reloc_clone_csums(inode, cur_offset, - num_bytes); + nocow_args.num_bytes); - extent_clear_unlock_delalloc(inode, cur_offset, - cur_offset + num_bytes - 1, + extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, @@ -1954,7 +2054,7 @@ out_check: error: if (nocow) - btrfs_dec_nocow_writers(fs_info, disk_bytenr); + btrfs_dec_nocow_writers(bg); if (ret && cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, @@ -2498,9 +2598,8 @@ out: * * c-3) otherwise: async submit */ -blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) - +void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2529,16 +2628,14 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, if (ret) goto out; - if (bio_flags & EXTENT_BIO_COMPRESSED) { + if (compress_type != BTRFS_COMPRESS_NONE) { /* * btrfs_submit_compressed_read will handle completing * the bio if there were any errors, so just return * here. */ - ret = btrfs_submit_compressed_read(inode, bio, - mirror_num, - bio_flags); - goto out_no_endio; + btrfs_submit_compressed_read(inode, bio, mirror_num); + return; } else { /* * Lookup bio sums does extra checks around whether we @@ -2555,7 +2652,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, if (btrfs_is_data_reloc_root(root)) goto mapit; /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, + ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btrfs_submit_bio_start); goto out; } else if (!skip_sum) { @@ -2572,8 +2669,6 @@ out: bio->bi_status = ret; bio_endio(bio); } -out_no_endio: - return ret; } /* @@ -3264,11 +3359,11 @@ static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, shash->tfm = fs_info->csum_shash; crypto_shash_digest(shash, kaddr + pgoff, len, csum); + kunmap_atomic(kaddr); if (memcmp(csum, csum_expected, csum_size)) goto zeroit; - kunmap_atomic(kaddr); return 0; zeroit: btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, @@ -3276,9 +3371,7 @@ zeroit: if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); - memset(kaddr + pgoff, 1, len); - flush_dcache_page(page); - kunmap_atomic(kaddr); + memzero_page(page, pgoff, len); return -EIO; } @@ -3483,6 +3576,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) u64 last_objectid = 0; int ret = 0, nr_unlink = 0; + /* Bail out if the cleanup is already running. */ if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; @@ -3565,17 +3659,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * * btrfs_find_orphan_roots() ran before us, which has * found all deleted roots and loaded them into - * fs_info->fs_roots_radix. So here we can find if an + * fs_info->fs_roots. So here we can find if an * orphan item corresponds to a deleted root by looking - * up the root from that radix tree. + * up the root from that xarray. 
*/ - spin_lock(&fs_info->fs_roots_radix_lock); - dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, - (unsigned long)found_key.objectid); + spin_lock(&fs_info->fs_roots_lock); + dead_root = xa_load(&fs_info->fs_roots, + (unsigned long)found_key.objectid); if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) is_dead_root = 1; - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); if (is_dead_root) { /* prevent this orphan from being found again */ @@ -3815,7 +3909,7 @@ cache_index: * cache. * * This is required for both inode re-read from disk and delayed inode - * in delayed_nodes_tree. + * in the delayed_nodes xarray. */ if (BTRFS_I(inode)->last_trans == fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, @@ -4199,8 +4293,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) * 1 for the dir index * 1 for the inode ref * 1 for the inode + * 1 for the parent inode */ - return btrfs_start_transaction_fallback_global_rsv(root, 5); + return btrfs_start_transaction_fallback_global_rsv(root, 6); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -4693,7 +4788,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } } - ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize); + ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false); if (ret < 0) { if (!only_release_metadata) btrfs_free_reserved_data_space(inode, data_reserved, @@ -5780,8 +5875,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct list_head ins_list; struct list_head del_list; int ret; - struct extent_buffer *leaf; - int slot; char *name_ptr; int name_len; int entries = 0; @@ -5808,35 +5901,19 @@ again: key.offset = ctx->pos; key.objectid = btrfs_ino(BTRFS_I(inode)); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, ret) { struct dir_entry *entry; - - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto err; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &found_key, slot); + struct extent_buffer *leaf = path->nodes[0]; if (found_key.objectid != key.objectid) break; if (found_key.type != BTRFS_DIR_INDEX_KEY) break; if (found_key.offset < ctx->pos) - goto next; + continue; if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) - goto next; - di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + continue; + di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); name_len = btrfs_dir_name_len(leaf, di); if ((total_len + sizeof(struct dir_entry) + name_len) >= PAGE_SIZE) { @@ -5863,9 +5940,11 @@ again: entries++; addr += sizeof(struct dir_entry) + name_len; total_len += sizeof(struct dir_entry) + name_len; -next: - path->slots[0]++; } + /* Catch error encountered during iteration */ + if (ret < 0) + goto err; + btrfs_release_path(path); ret = btrfs_filldir(private->filldir_buf, entries, ctx); @@ -6053,6 +6132,57 @@ static int btrfs_insert_inode_locked(struct inode *inode) btrfs_find_actor, &args); } +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items) +{ + struct inode *dir = args->dir; + struct inode *inode = args->inode; + int ret; + + ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); + if (ret) + return ret; + + /* 1 to add 
inode item */ + *trans_num_items = 1; + /* 1 to add compression property */ + if (BTRFS_I(dir)->prop_compress) + (*trans_num_items)++; + /* 1 to add default ACL xattr */ + if (args->default_acl) + (*trans_num_items)++; + /* 1 to add access ACL xattr */ + if (args->acl) + (*trans_num_items)++; +#ifdef CONFIG_SECURITY + /* 1 to add LSM xattr */ + if (dir->i_security) + (*trans_num_items)++; +#endif + if (args->orphan) { + /* 1 to add orphan item */ + (*trans_num_items)++; + } else { + /* + * 1 to add dir item + * 1 to add dir index + * 1 to update parent inode item + * + * No need for 1 unit for the inode ref item because it is + * inserted in a batch together with the inode item at + * btrfs_create_new_inode(). + */ + *trans_num_items += 3; + } + return 0; +} + +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) +{ + posix_acl_release(args->acl); + posix_acl_release(args->default_acl); +} + /* * Inherit flags from the parent inode. * @@ -6062,9 +6192,6 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { unsigned int flags; - if (!dir) - return; - flags = BTRFS_I(dir)->flags; if (flags & BTRFS_INODE_NOCOMPRESS) { @@ -6084,76 +6211,86 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) btrfs_sync_inode_flags_to_i_flags(inode); } -static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct user_namespace *mnt_userns, - struct inode *dir, - const char *name, int name_len, - u64 ref_objectid, u64 objectid, - umode_t mode, u64 *index) +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct inode *dir = args->dir; + struct inode *inode = args->inode; + const char *name = args->orphan ? NULL : args->dentry->d_name.name; + int name_len = args->orphan ? 0 : args->dentry->d_name.len; + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_root *root; struct btrfs_inode_item *inode_item; struct btrfs_key *location; struct btrfs_path *path; + u64 objectid; struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; struct btrfs_item_batch batch; unsigned long ptr; - unsigned int nofs_flag; int ret; path = btrfs_alloc_path(); if (!path) - return ERR_PTR(-ENOMEM); - - nofs_flag = memalloc_nofs_save(); - inode = new_inode(fs_info->sb); - memalloc_nofs_restore(nofs_flag); - if (!inode) { - btrfs_free_path(path); - return ERR_PTR(-ENOMEM); - } + return -ENOMEM; - /* - * O_TMPFILE, set link count to 0, so that after this point, - * we fill in an inode item with the correct link count. - */ - if (!name) - set_nlink(inode, 0); + if (!args->subvol) + BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); + root = BTRFS_I(inode)->root; - /* - * we have to initialize this early, so we can reclaim the inode - * number if we fail afterwards in this function. - */ + ret = btrfs_get_free_objectid(root, &objectid); + if (ret) + goto out; inode->i_ino = objectid; - if (dir && name) { + if (args->orphan) { + /* + * O_TMPFILE, set link count to 0, so that after this point, we + * fill in an inode item with the correct link count. 
+ */ + set_nlink(inode, 0); + } else { trace_btrfs_inode_request(dir); - ret = btrfs_set_inode_index(BTRFS_I(dir), index); - if (ret) { - btrfs_free_path(path); - iput(inode); - return ERR_PTR(ret); - } - } else if (dir) { - *index = 0; + ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index); + if (ret) + goto out; } - /* - * index_cnt is ignored for everything but a dir, - * btrfs_set_inode_index_count has an explanation for the magic - * number - */ - BTRFS_I(inode)->index_cnt = 2; - BTRFS_I(inode)->dir_index = *index; - BTRFS_I(inode)->root = btrfs_grab_root(root); + /* index_cnt is ignored for everything but a dir. */ + BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; /* + * Subvolumes don't inherit flags from their parent directory. + * Originally this was probably by accident, but we probably can't + * change it now without compatibility issues. + */ + if (!args->subvol) + btrfs_inherit_iflags(inode, dir); + + if (S_ISREG(inode->i_mode)) { + if (btrfs_test_opt(fs_info, NODATASUM)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + if (btrfs_test_opt(fs_info, NODATACOW)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; + } + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + location->type = BTRFS_INODE_ITEM_KEY; + + ret = btrfs_insert_inode_locked(inode); + if (ret < 0) { + if (!args->orphan) + BTRFS_I(dir)->index_cnt--; + goto out; + } + + /* * We could have gotten an inode number from somebody who was fsynced * and then removed in this same transaction, so let's just set full * sync since it will be a full sync anyway and this will blow away the @@ -6167,7 +6304,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[0] = sizeof(struct btrfs_inode_item); - if (name) { + if (!args->orphan) { /* * Start new inodes with an inode_ref. This is slightly more * efficient for small numbers of hard links since they will @@ -6176,64 +6313,95 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ key[1].objectid = objectid; key[1].type = BTRFS_INODE_REF_KEY; - key[1].offset = ref_objectid; - - sizes[1] = name_len + sizeof(*ref); - } - - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - location->type = BTRFS_INODE_ITEM_KEY; - - ret = btrfs_insert_inode_locked(inode); - if (ret < 0) { - iput(inode); - goto fail; + if (args->subvol) { + key[1].offset = objectid; + sizes[1] = 2 + sizeof(*ref); + } else { + key[1].offset = btrfs_ino(BTRFS_I(dir)); + sizes[1] = name_len + sizeof(*ref); + } } batch.keys = &key[0]; batch.data_sizes = &sizes[0]; - batch.total_data_size = sizes[0] + (name ? sizes[1] : 0); - batch.nr = name ? 2 : 1; + batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); + batch.nr = args->orphan ? 1 : 2; ret = btrfs_insert_empty_items(trans, root, path, &batch); - if (ret != 0) - goto fail_unlock; - - inode_init_owner(mnt_userns, inode, dir, mode); - inode_set_bytes(inode, 0); + if (ret != 0) { + btrfs_abort_transaction(trans, ret); + goto discard; + } inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; + /* + * We're going to fill the inode item now, so at this point the inode + * must be fully initialized. 
+ */ + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); - if (name) { + if (!args->orphan) { ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path->nodes[0], ref, *index); ptr = (unsigned long)(ref + 1); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + if (args->subvol) { + btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2); + btrfs_set_inode_ref_index(path->nodes[0], ref, 0); + write_extent_buffer(path->nodes[0], "..", ptr, 2); + } else { + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, + BTRFS_I(inode)->dir_index); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + } } btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); + btrfs_release_path(path); - btrfs_inherit_iflags(inode, dir); + if (args->subvol) { + struct inode *parent; - if (S_ISREG(mode)) { - if (btrfs_test_opt(fs_info, NODATASUM)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(fs_info, NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM; + /* + * Subvolumes inherit properties from their parent subvolume, + * not the directory they were created in. + */ + parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, + BTRFS_I(dir)->root); + if (IS_ERR(parent)) { + ret = PTR_ERR(parent); + } else { + ret = btrfs_inode_inherit_props(trans, inode, parent); + iput(parent); + } + } else { + ret = btrfs_inode_inherit_props(trans, inode, dir); + } + if (ret) { + btrfs_err(fs_info, + "error inheriting props for ino %llu (root %llu): %d", + btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, + ret); + } + + /* + * Subvolumes don't inherit ACLs or get passed to the LSM. This is + * probably a bug. + */ + if (!args->subvol) { + ret = btrfs_init_inode_security(trans, args); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } } inode_tree_add(inode); @@ -6243,21 +6411,30 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_update_root_times(trans, root); - ret = btrfs_inode_inherit_props(trans, inode, dir); - if (ret) - btrfs_err(fs_info, - "error inheriting props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); + if (args->orphan) { + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + } else { + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, + name_len, 0, BTRFS_I(inode)->dir_index); + } + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } - return inode; + ret = 0; + goto out; -fail_unlock: +discard: + /* + * discard_new_inode() calls iput(), but the caller owns the reference + * to the inode. 
+ */ + ihold(inode); discard_new_inode(inode); -fail: - if (dir && name) - BTRFS_I(dir)->index_cnt--; +out: btrfs_free_path(path); - return ERR_PTR(ret); + return ret; } /* @@ -6349,147 +6526,71 @@ fail_dir_item: return ret; } -static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct btrfs_inode *dir, struct dentry *dentry, - struct btrfs_inode *inode, int backref, u64 index) -{ - int err = btrfs_add_link(trans, dir, inode, - dentry->d_name.name, dentry->d_name.len, - backref, index); - if (err > 0) - err = -EEXIST; - return err; -} - -static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t rdev) +static int btrfs_create_common(struct inode *dir, struct dentry *dentry, + struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .inode = inode, + }; + unsigned int trans_num_items; + struct btrfs_trans_handle *trans; int err; - u64 objectid; - u64 index = 0; - /* - * 2 for inode item and ref - * 2 for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - err = btrfs_get_free_objectid(root, &objectid); + err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (err) - goto out_unlock; + goto out_inode; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_unlock; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_new_inode_args; } - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. 
- */ - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), - 0, index); - if (err) - goto out_unlock; - - btrfs_update_inode(trans, root, BTRFS_I(inode)); - d_instantiate_new(dentry, inode); + err = btrfs_create_new_inode(trans, &new_inode_args); + if (!err) + d_instantiate_new(dentry, inode); -out_unlock: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (err) + iput(inode); return err; } -static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, bool excl) +static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; - int err; - u64 objectid; - u64 index = 0; + struct inode *inode; - /* - * 2 for inode item and ref - * 2 for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, mode); + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + return btrfs_create_common(dir, dentry, inode); +} - err = btrfs_get_free_objectid(root, &objectid); - if (err) - goto out_unlock; +static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + struct inode *inode; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_unlock; - } - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. 
- */ + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (err) - goto out_unlock; - - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), - 0, index); - if (err) - goto out_unlock; - - d_instantiate_new(dentry, inode); - -out_unlock: - btrfs_end_transaction(trans); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } - btrfs_btree_balance_dirty(fs_info); - return err; + return btrfs_create_common(dir, dentry, inode); } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, @@ -6535,8 +6636,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), - 1, index); + err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + dentry->d_name.name, dentry->d_name.len, 1, index); if (err) { drop_inode = 1; @@ -6573,66 +6674,15 @@ fail: static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct inode *inode = NULL; - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - int err = 0; - u64 objectid = 0; - u64 index = 0; - - /* - * 2 items for inode and ref - * 2 items for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - err = btrfs_get_free_objectid(root, &objectid); - if (err) - goto out_fail; - - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, - S_IFDIR | mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_fail; - } + struct inode *inode; - /* these must be set before we unlock the inode */ + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_fail; - - btrfs_i_size_write(BTRFS_I(inode), 0); - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (err) - goto out_fail; - - err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - dentry->d_name.name, - dentry->d_name.len, 0, index); - if (err) - goto out_fail; - - d_instantiate_new(dentry, inode); - -out_fail: - btrfs_end_transaction(trans); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } - btrfs_btree_balance_dirty(fs_info); - return err; + return btrfs_create_common(dir, dentry, inode); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -7141,6 +7191,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *ram_bytes, bool strict) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; struct extent_buffer *leaf; @@ -7148,13 +7199,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 
offset, u64 *len, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; - u64 disk_bytenr; - u64 backref_offset; - u64 extent_end; - u64 num_bytes; - int slot; int found_type; - bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); path = btrfs_alloc_path(); if (!path) @@ -7165,18 +7210,17 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (ret < 0) goto out; - slot = path->slots[0]; if (ret == 1) { - if (slot == 0) { + if (path->slots[0] == 0) { /* can't find the item, must cow */ ret = 0; goto out; } - slot--; + path->slots[0]--; } ret = 0; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != btrfs_ino(BTRFS_I(inode)) || key.type != BTRFS_EXTENT_DATA_KEY) { /* not our file or wrong item type, must cow */ @@ -7188,55 +7232,38 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, goto out; } - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, fi); - if (found_type != BTRFS_FILE_EXTENT_REG && - found_type != BTRFS_FILE_EXTENT_PREALLOC) { - /* not a regular extent, must cow */ - goto out; - } - - if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) + if (btrfs_file_extent_end(path) <= offset) goto out; - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (extent_end <= offset) - goto out; + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (ram_bytes) + *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - if (disk_bytenr == 0) - goto out; + nocow_args.start = offset; + nocow_args.end = offset + *len - 1; + nocow_args.strict = strict; + nocow_args.free_path = true; - if (btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - goto out; + ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + /* can_nocow_file_extent() has freed the path. */ + path = NULL; - /* - * Do the same check as in btrfs_cross_ref_exist but without the - * unnecessary search. - */ - if (!strict && - (btrfs_file_extent_generation(leaf, fi) <= - btrfs_root_last_snapshot(&root->root_item))) + if (ret != 1) { + /* Treat errors as not being able to NOCOW. 
*/ + ret = 0; goto out; - - backref_offset = btrfs_file_extent_offset(leaf, fi); - - if (orig_start) { - *orig_start = key.offset - backref_offset; - *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); - *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); } - if (btrfs_extent_readonly(fs_info, disk_bytenr)) + ret = 0; + if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) goto out; - num_bytes = min(offset + *len, extent_end) - offset; - if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; - range_end = round_up(offset + num_bytes, + range_end = round_up(offset + nocow_args.num_bytes, root->fs_info->sectorsize) - 1; ret = test_range_bit(io_tree, offset, range_end, EXTENT_DELALLOC, 0, NULL); @@ -7246,36 +7273,12 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } } - btrfs_release_path(path); - - /* - * look for other files referencing this extent, if we - * find any we must cow - */ - - ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), - key.offset - backref_offset, disk_bytenr, - strict); - if (ret) { - ret = 0; - goto out; - } + if (orig_start) + *orig_start = key.offset - nocow_args.extent_offset; + if (orig_block_len) + *orig_block_len = nocow_args.disk_num_bytes; - /* - * adjust disk_bytenr and num_bytes to cover just the bytes - * in this extent we are about to write. If there - * are any csums in that range we have to cow in order - * to keep the csums correct - */ - disk_bytenr += backref_offset; - disk_bytenr += offset - key.offset; - if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes)) - goto out; - /* - * all of the above have passed, it is safe to overwrite this extent - * without cow - */ - *len = num_bytes; + *len = nocow_args.num_bytes; ret = 1; out: btrfs_free_path(path); @@ -7283,14 +7286,22 @@ out: } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, bool writing) + struct extent_state **cached_state, + unsigned int iomap_flags) { + const bool writing = (iomap_flags & IOMAP_WRITE); + const bool nowait = (iomap_flags & IOMAP_NOWAIT); + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; int ret = 0; while (1) { - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + if (nowait) { + if (!try_lock_extent(io_tree, lockstart, lockend)) + return -EAGAIN; + } else { + lock_extent_bits(io_tree, lockstart, lockend, cached_state); + } /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure there's no ordered @@ -7311,10 +7322,14 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, lockstart, lockend))) break; - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + unlock_extent_cached(io_tree, lockstart, lockend, cached_state); if (ordered) { + if (nowait) { + btrfs_put_ordered_extent(ordered); + ret = -EAGAIN; + break; + } /* * If we are doing a DIO read and the ordered extent we * found is for a buffered write, we can not wait for it @@ -7334,7 +7349,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) btrfs_start_ordered_extent(ordered, 1); else - ret = -ENOTBLK; + ret = nowait ? 
-EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); } else { /* @@ -7350,7 +7365,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * ordered extent to complete while holding a lock on * that page. */ - ret = -ENOTBLK; + ret = nowait ? -EAGAIN : -ENOTBLK; } if (ret) @@ -7424,12 +7439,15 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, static int btrfs_get_blocks_direct_write(struct extent_map **map, struct inode *inode, struct btrfs_dio_data *dio_data, - u64 start, u64 len) + u64 start, u64 len, + unsigned int iomap_flags) { + const bool nowait = (iomap_flags & IOMAP_NOWAIT); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = *map; int type; u64 block_start, orig_start, orig_block_len, ram_bytes; + struct btrfs_block_group *bg; bool can_nocow = false; bool space_reserved = false; u64 prev_len; @@ -7455,9 +7473,11 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes, false) == 1 && - btrfs_inc_nocow_writers(fs_info, block_start)) - can_nocow = true; + &orig_block_len, &ram_bytes, false) == 1) { + bg = btrfs_inc_nocow_writers(fs_info, block_start); + if (bg) + can_nocow = true; + } } prev_len = len; @@ -7465,12 +7485,15 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, struct extent_map *em2; /* We can NOCOW, so only need to reserve metadata space. */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + nowait); if (ret < 0) { /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; - btrfs_dec_nocow_writers(fs_info, block_start); + btrfs_dec_nocow_writers(bg); + if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) + ret = -EAGAIN; goto out; } space_reserved = true; @@ -7479,7 +7502,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, orig_start, block_start, len, orig_block_len, ram_bytes, type); - btrfs_dec_nocow_writers(fs_info, block_start); + btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); *map = em = em2; @@ -7489,15 +7512,29 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, ret = PTR_ERR(em2); goto out; } + + dio_data->nocow_done = true; } else { /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; - /* We have to COW, so need to reserve metadata and data space. */ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, len); + if (nowait) + return -EAGAIN; + + /* + * If we could not allocate data space before locking the file + * range and we can't do a NOCOW write, then we have to fail. + */ + if (!dio_data->data_space_reserved) + return -ENOSPC; + + /* + * We have to COW and we have already reserved data space before, + * so now we reserve only metadata. 
+ */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + false); if (ret < 0) goto out; space_reserved = true; @@ -7510,10 +7547,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, *map = em; len = min(len, em->len - (start - em->start)); if (len < prev_len) - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, - start + len, prev_len - len, - true); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + prev_len - len, true); } /* @@ -7531,15 +7566,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, out: if (ret && space_reserved) { btrfs_delalloc_release_extents(BTRFS_I(inode), len); - if (can_nocow) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); - } else { - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, - start, len, true); - extent_changeset_free(dio_data->data_reserved); - dio_data->data_reserved = NULL; - } + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); } return ret; } @@ -7548,14 +7575,16 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; - struct btrfs_dio_data *dio_data = NULL; + struct btrfs_dio_data *dio_data = iter->private; u64 lockstart, lockend; const bool write = !!(flags & IOMAP_WRITE); int ret = 0; u64 len = length; + const u64 data_alloc_len = length; bool unlock_extents = false; if (!write) @@ -7565,34 +7594,67 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, lockend = start + len - 1; /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so we - * need to flush the dirty pages again to make absolutely sure that any - * outstanding dirty pages are on disk. + * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't + * enough if we've written compressed pages to this area, so we need to + * flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk - the first flush only starts + * compression on the data, while keeping the pages locked, so by the + * time the second flush returns we know bios for the compressed pages + * were submitted and finished, and the pages no longer under writeback. + * + * If we have a NOWAIT request and we have any pages in the range that + * are locked, likely due to compression still in progress, we don't want + * to block on page locks. We also don't want to block on pages marked as + * dirty or under writeback (same as for the non-compression case). + * iomap_dio_rw() did the same check, but after that and before we got + * here, mmap'ed writes may have happened or buffered reads started + * (readpage() and readahead(), which lock pages), as we haven't locked + * the file range yet. 
*/ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) { - ret = filemap_fdatawrite_range(inode->i_mapping, start, - start + length - 1); - if (ret) - return ret; + if (flags & IOMAP_NOWAIT) { + if (filemap_range_needs_writeback(inode->i_mapping, + lockstart, lockend)) + return -EAGAIN; + } else { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; + } } - dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); - if (!dio_data) - return -ENOMEM; - - iomap->private = dio_data; + memset(dio_data, 0, sizeof(*dio_data)); + /* + * We always try to allocate data space and must do it before locking + * the file range, to avoid deadlocks with concurrent writes to the same + * range if the range has several extents and the writes don't expand the + * current i_size (the inode lock is taken in shared mode). If we fail to + * allocate data space here we continue and later, after locking the + * file range, we fail with ENOSPC only if we figure out we can not do a + * NOCOW write. + */ + if (write && !(flags & IOMAP_NOWAIT)) { + ret = btrfs_check_data_free_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, data_alloc_len); + if (!ret) + dio_data->data_space_reserved = true; + else if (ret && !(BTRFS_I(inode)->flags & + (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + goto err; + } /* * If this errors out it's because we couldn't invalidate pagecache for - * this range and we need to fallback to buffered. + * this range and we need to fallback to buffered IO, or we are doing a + * NOWAIT read/write and we need to block. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { - ret = -ENOTBLK; + ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); + if (ret < 0) goto err; - } em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { @@ -7652,12 +7714,30 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, len); + start, len, flags); if (ret < 0) goto unlock_err; unlock_extents = true; /* Recalc len in case the new em is smaller than requested */ len = min(len, em->len - (start - em->start)); + if (dio_data->data_space_reserved) { + u64 release_offset; + u64 release_len = 0; + + if (dio_data->nocow_done) { + release_offset = start; + release_len = data_alloc_len; + } else if (len < data_alloc_len) { + release_offset = start + len; + release_len = data_alloc_len - len; + } + + if (release_len > 0) + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + release_offset, + release_len); + } } else { /* * We need to unlock only the end area that we aren't using. 
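The write branch of btrfs_dio_iomap_begin() above now reserves data space for the full requested length (data_alloc_len) before locking the file range, then trims that reservation once btrfs_get_blocks_direct_write() has settled the real extent length. Because the collapsed hunk is hard to read, the trimming logic is repeated below with comments; it mirrors the lines added above and introduces no new behavior.

	/* After btrfs_get_blocks_direct_write(), len reflects the real extent. */
	if (dio_data->data_space_reserved) {
		u64 release_offset;
		u64 release_len = 0;

		if (dio_data->nocow_done) {
			/* NOCOW write: none of the data reservation is needed. */
			release_offset = start;
			release_len = data_alloc_len;
		} else if (len < data_alloc_len) {
			/* COW write shorter than requested: give back the tail. */
			release_offset = start + len;
			release_len = data_alloc_len - len;
		}

		if (release_len > 0)
			btrfs_free_reserved_data_space(BTRFS_I(inode),
						       dio_data->data_reserved,
						       release_offset, release_len);
	}
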
@@ -7702,7 +7782,12 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - kfree(dio_data); + if (dio_data->data_space_reserved) { + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + start, data_alloc_len); + extent_changeset_free(dio_data->data_reserved); + } return ret; } @@ -7710,15 +7795,16 @@ err: static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned int flags, struct iomap *iomap) { - int ret = 0; - struct btrfs_dio_data *dio_data = iomap->private; + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); + struct btrfs_dio_data *dio_data = iter->private; size_t submitted = dio_data->submitted; const bool write = !!(flags & IOMAP_WRITE); + int ret = 0; if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); - goto out; + return 0; } if (submitted < length) { @@ -7735,10 +7821,6 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (write) extent_changeset_free(dio_data->data_reserved); -out: - kfree(dio_data); - iomap->private = NULL; - return ret; } @@ -7751,40 +7833,36 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) if (!refcount_dec_and_test(&dip->refs)) return; - if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { + if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { __endio_write_update_ordered(BTRFS_I(dip->inode), dip->file_offset, dip->bytes, - !dip->dio_bio->bi_status); + !dip->bio.bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1); } - bio_endio(dip->dio_bio); - kfree(dip); + kfree(dip->csums); + bio_endio(&dip->bio); } -static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + enum btrfs_compression_type compress_type) { struct btrfs_dio_private *dip = bio->bi_private; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - blk_status_t ret; BUG_ON(bio_op(bio) == REQ_OP_WRITE); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - return ret; + if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA)) + return; refcount_inc(&dip->refs); - ret = btrfs_map_bio(fs_info, bio, mirror_num); - if (ret) + if (btrfs_map_bio(fs_info, bio, mirror_num)) refcount_dec(&dip->refs); - return ret; } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, @@ -7869,7 +7947,7 @@ static void btrfs_end_dio_bio(struct bio *bio) err = btrfs_check_read_dio_bio(dip, bbio, !err); if (err) - dip->dio_bio->bi_status = err; + dip->bio.bi_status = err; btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); @@ -7899,7 +7977,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, goto map; if (write && async_submit) { - ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset, + ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset, btrfs_submit_bio_start_direct_io); goto err; } else if (write) { @@ -7924,50 +8002,16 @@ err: return ret; } -/* - * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked - * or ordered extents whether or not we submit any bios. 
- */ -static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, - struct inode *inode, - loff_t file_offset) -{ - const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - size_t dip_size; - struct btrfs_dio_private *dip; - - dip_size = sizeof(*dip); - if (!write && csum) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - size_t nblocks; - - nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; - dip_size += fs_info->csum_size * nblocks; - } - - dip = kzalloc(dip_size, GFP_NOFS); - if (!dip) - return NULL; - - dip->inode = inode; - dip->file_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; - dip->dio_bio = dio_bio; - refcount_set(&dip->refs, 1); - return dip; -} - static void btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { + struct btrfs_dio_private *dip = + container_of(dio_bio, struct btrfs_dio_private, bio); struct inode *inode = iter->inode; const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK); - struct btrfs_dio_private *dip; struct bio *bio; u64 start_sector; int async_submit = 0; @@ -7978,27 +8022,28 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, int ret; blk_status_t status; struct btrfs_io_geometry geom; - struct btrfs_dio_data *dio_data = iter->iomap.private; + struct btrfs_dio_data *dio_data = iter->private; struct extent_map *em = NULL; - dip = btrfs_create_dio_private(dio_bio, inode, file_offset); - if (!dip) { - if (!write) { - unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, - file_offset + dio_bio->bi_iter.bi_size - 1); - } - dio_bio->bi_status = BLK_STS_RESOURCE; - bio_endio(dio_bio); - return; - } + dip->inode = inode; + dip->file_offset = file_offset; + dip->bytes = dio_bio->bi_iter.bi_size; + refcount_set(&dip->refs, 1); + dip->csums = NULL; + + if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + unsigned int nr_sectors = + (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - if (!write) { /* * Load the csums up front to reduce csum tree searches and * contention when submitting bios. - * - * If we have csums disabled this will do nothing. 
*/ + status = BLK_STS_RESOURCE; + dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); + if (!dip) + goto out_err; + status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); if (status != BLK_STS_OK) goto out_err; @@ -8088,19 +8133,28 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, out_err_em: free_extent_map(em); out_err: - dip->dio_bio->bi_status = status; + dio_bio->bi_status = status; btrfs_dio_private_put(dip); } -const struct iomap_ops btrfs_dio_iomap_ops = { +static const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, }; -const struct iomap_dio_ops btrfs_dio_ops = { +static const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_submit_direct, + .bio_set = &btrfs_dio_bioset, }; +ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) +{ + struct btrfs_dio_data data; + + return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); +} + static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -8113,27 +8167,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } -int btrfs_readpage(struct file *file, struct page *page) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; - int ret; - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); - if (bio_ctrl.bio) { - int ret2; - - ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); - if (ret == 0) - ret = ret2; - } - return ret; -} - static int btrfs_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; @@ -8182,7 +8215,7 @@ static void wait_subpage_spinlock(struct page *page) struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); struct btrfs_subpage *subpage; - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->private); @@ -8764,46 +8797,23 @@ out: return ret; } -/* - * create a new subvolume directory/inode (helper for the ioctl). 
- */ -int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - struct btrfs_root *parent_root, - struct user_namespace *mnt_userns) +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir) { struct inode *inode; - int err; - u64 index = 0; - u64 ino; - - err = btrfs_get_free_objectid(new_root, &ino); - if (err < 0) - return err; - - inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2, - ino, ino, - S_IFDIR | (~current_umask() & S_IRWXUGO), - &index); - if (IS_ERR(inode)) - return PTR_ERR(inode); - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; - - set_nlink(inode, 1); - btrfs_i_size_write(BTRFS_I(inode), 0); - unlock_new_inode(inode); - - err = btrfs_subvol_inherit_props(trans, new_root, parent_root); - if (err) - btrfs_err(new_root->fs_info, - "error inheriting subvolume %llu properties: %d", - new_root->root_key.objectid, err); - err = btrfs_update_inode(trans, new_root, BTRFS_I(inode)); - - iput(inode); - return err; + inode = new_inode(dir->i_sb); + if (inode) { + /* + * Subvolumes don't inherit the sgid bit or the parent's gid if + * the parent's sgid bit is set. This is probably a bug. + */ + inode_init_owner(mnt_userns, inode, NULL, + S_IFDIR | (~current_umask() & S_IRWXUGO)); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + } + return inode; } struct inode *btrfs_alloc_inode(struct super_block *sb) @@ -8943,7 +8953,7 @@ int btrfs_drop_inode(struct inode *inode) static void init_once(void *foo) { - struct btrfs_inode *ei = (struct btrfs_inode *) foo; + struct btrfs_inode *ei = foo; inode_init_once(&ei->vfs_inode); } @@ -8955,6 +8965,7 @@ void __cold btrfs_destroy_cachep(void) * destroy cache. */ rcu_barrier(); + bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); kmem_cache_destroy(btrfs_trans_handle_cachep); kmem_cache_destroy(btrfs_path_cachep); @@ -8995,6 +9006,11 @@ int __init btrfs_init_cachep(void) if (!btrfs_free_space_bitmap_cachep) goto fail; + if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_dio_private, bio), + BIOSET_NEED_BVECS)) + goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -9050,6 +9066,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_trans_handle *trans; + unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; @@ -9081,14 +9098,37 @@ static int btrfs_rename_exchange(struct inode *old_dir, down_read(&fs_info->subvol_sem); /* - * We want to reserve the absolute worst case amount of items. So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal - * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items - * should cover the worst case number of items we'll modify. + * For each inode: + * 1 to remove old dir item + * 1 to remove old dir index + * 1 to add new dir item + * 1 to add new dir index + * 1 to update parent inode + * + * If the parents are the same, we only need to account for one */ - trans = btrfs_start_transaction(root, 12); + trans_num_items = (old_dir == new_dir ? 
9 : 10); + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* + * 1 to remove old root ref + * 1 to remove old root backref + * 1 to add new root ref + * 1 to add new root backref + */ + trans_num_items += 4; + } else { + /* + * 1 to update inode item + * 1 to remove old inode ref + * 1 to add new inode ref + */ + trans_num_items += 3; + } + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + trans_num_items += 4; + else + trans_num_items += 3; + trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; @@ -9255,56 +9295,19 @@ out_notrans: return ret; } -static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct user_namespace *mnt_userns, - struct inode *dir, - struct dentry *dentry) +static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns, + struct inode *dir) { - int ret; struct inode *inode; - u64 objectid; - u64 index; - - ret = btrfs_get_free_objectid(root, &objectid); - if (ret) - return ret; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, - dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), - objectid, - S_IFCHR | WHITEOUT_MODE, - &index); - - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - return ret; + inode = new_inode(dir->i_sb); + if (inode) { + inode_init_owner(mnt_userns, inode, dir, + S_IFCHR | WHITEOUT_MODE); + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); } - - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, - WHITEOUT_DEV); - - ret = btrfs_init_inode_security(trans, inode, dir, - &dentry->d_name); - if (ret) - goto out; - - ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, - BTRFS_I(inode), 0, index); - if (ret) - goto out; - - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); -out: - unlock_new_inode(inode); - if (ret) - inode_dec_link_count(inode); - iput(inode); - - return ret; + return inode; } static int btrfs_rename(struct user_namespace *mnt_userns, @@ -9313,6 +9316,10 @@ static int btrfs_rename(struct user_namespace *mnt_userns, unsigned int flags) { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_new_inode_args whiteout_args = { + .dir = old_dir, + .dentry = old_dentry, + }; struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; @@ -9367,23 +9374,56 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) filemap_flush(old_inode->i_mapping); - /* close the racy window with snapshot create/destroy ioctl */ - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + if (flags & RENAME_WHITEOUT) { + whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir); + if (!whiteout_args.inode) + return -ENOMEM; + ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); + if (ret) + goto out_whiteout_inode; + } else { + /* 1 to update the old parent inode. */ + trans_num_items = 1; + } + + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* Close the race window with snapshot create/destroy ioctl */ down_read(&fs_info->subvol_sem); + /* + * 1 to remove old root ref + * 1 to remove old root backref + * 1 to add new root ref + * 1 to add new root backref + */ + trans_num_items += 4; + } else { + /* + * 1 to update inode + * 1 to remove old inode ref + * 1 to add new inode ref + */ + trans_num_items += 3; + } /* - * We want to reserve the absolute worst case amount of items. 
So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume they are normal - * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items - * should cover the worst case number of items we'll modify. - * If our rename has the whiteout flag, we need more 5 units for the - * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item - * when selinux is enabled). + * 1 to remove old dir item + * 1 to remove old dir index + * 1 to add new dir item + * 1 to add new dir index */ - trans_num_items = 11; - if (flags & RENAME_WHITEOUT) + trans_num_items += 4; + /* 1 to update new parent inode if it's not the same as the old parent */ + if (new_dir != old_dir) + trans_num_items++; + if (new_inode) { + /* + * 1 to update inode + * 1 to remove inode ref + * 1 to remove dir item + * 1 to remove dir index + * 1 to possibly add orphan item + */ trans_num_items += 5; + } trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -9479,12 +9519,14 @@ static int btrfs_rename(struct user_namespace *mnt_userns, rename_ctx.index, new_dentry->d_parent); if (flags & RENAME_WHITEOUT) { - ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, - old_dir, old_dentry); - + ret = btrfs_create_new_inode(trans, &whiteout_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; + } else { + unlock_new_inode(whiteout_args.inode); + iput(whiteout_args.inode); + whiteout_args.inode = NULL; } } out_fail: @@ -9493,7 +9535,11 @@ out_fail: out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); - + if (flags & RENAME_WHITEOUT) + btrfs_new_inode_args_destroy(&whiteout_args); +out_whiteout_inode: + if (flags & RENAME_WHITEOUT) + iput(whiteout_args.inode); return ret; } @@ -9712,10 +9758,13 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct btrfs_key key; - struct inode *inode = NULL; + struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + }; + unsigned int trans_num_items; int err; - u64 objectid; - u64 index = 0; int name_len; int datasize; unsigned long ptr; @@ -9726,49 +9775,40 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return -ENAMETOOLONG; - /* - * 2 items for inode item and ref - * 2 items for dir items - * 1 item for updating parent inode item - * 1 item for the inline extent item - * 1 item for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 7); - if (IS_ERR(trans)) - return PTR_ERR(trans); + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO); + inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &btrfs_aops; + btrfs_i_size_write(BTRFS_I(inode), name_len); + inode_set_bytes(inode, name_len); - err = btrfs_get_free_objectid(root, &objectid); + new_inode_args.inode = inode; + err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (err) - goto out_unlock; + goto out_inode; + /* 1 additional item for the inline extent */ + trans_num_items++; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, - 
S_IFLNK | S_IRWXUGO, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_unlock; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_new_inode_args; } - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. - */ - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; - inode->i_mapping->a_ops = &btrfs_aops; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + err = btrfs_create_new_inode(trans, &new_inode_args); if (err) - goto out_unlock; + goto out; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; - goto out_unlock; + btrfs_abort_transaction(trans, err); + discard_new_inode(inode); + inode = NULL; + goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); key.offset = 0; @@ -9777,8 +9817,11 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, err = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (err) { + btrfs_abort_transaction(trans, err); btrfs_free_path(path); - goto out_unlock; + discard_new_inode(inode); + inode = NULL; + goto out; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -9796,31 +9839,16 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode_set_bytes(inode, name_len); - btrfs_i_size_write(BTRFS_I(inode), name_len); - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); - /* - * Last step, add directory indexes for our symlink inode. This is the - * last step to avoid extra cleanup of these indexes if an error happens - * elsewhere above. 
- */ - if (!err) - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, - BTRFS_I(inode), 0, index); - if (err) - goto out_unlock; - d_instantiate_new(dentry, inode); - -out_unlock: + err = 0; +out: btrfs_end_transaction(trans); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } btrfs_btree_balance_dirty(fs_info); +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (err) + iput(inode); return err; } @@ -10071,62 +10099,58 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; - u64 objectid; - u64 index; - int ret = 0; - - /* - * 5 units required for adding orphan entry - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_get_free_objectid(root, &objectid); - if (ret) - goto out; - - inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0, - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - inode = NULL; - goto out; - } + struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .orphan = true, + }; + unsigned int trans_num_items; + int ret; + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; - inode->i_mapping->a_ops = &btrfs_aops; - ret = btrfs_init_inode_security(trans, inode, dir, NULL); + new_inode_args.inode = inode; + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (ret) - goto out; + goto out_inode; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (ret) - goto out; - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (ret) - goto out; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_new_inode_args; + } + + ret = btrfs_create_new_inode(trans, &new_inode_args); /* - * We set number of links to 0 in btrfs_new_inode(), and here we set - * it to 1 because d_tmpfile() will issue a warning if the count is 0, - * through: + * We set number of links to 0 in btrfs_create_new_inode(), and here we + * set it to 1 because d_tmpfile() will issue a warning if the count is + * 0, through: * * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() */ set_nlink(inode, 1); - d_tmpfile(dentry, inode); - unlock_new_inode(inode); - mark_inode_dirty(inode); -out: + + if (!ret) { + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + mark_inode_dirty(inode); + } + btrfs_end_transaction(trans); - if (ret && inode) - discard_new_inode(inode); btrfs_btree_balance_dirty(fs_info); +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (ret) + iput(inode); return ret; } @@ -10458,13 +10482,11 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) return -ENOMEM; - for (i = 0; i < nr_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); - if (!pages[i]) { - ret = -ENOMEM; - goto out; + ret = btrfs_alloc_page_array(nr_pages, pages); + if (ret) { + ret = -ENOMEM; + goto out; } - } ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, disk_io_size, pages); @@ -10800,7 +10822,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct 
iov_iter *from, ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); if (ret) goto out_free_data_space; - ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes); + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes, + false); if (ret) goto out_qgroup_free_data; @@ -11309,6 +11332,41 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode, spin_unlock(&inode->lock); } +/** + * Verify that there are no ordered extents for a given file range. + * + * @inode: The target inode. + * @start: Start offset of the file range, should be sector size aligned. + * @end: End offset (inclusive) of the file range, its value +1 should be + * sector size aligned. + * + * This should typically be used for cases where we locked an inode's VFS lock in + * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode, + * we have flushed all delalloc in the range, we have waited for all ordered + * extents in the range to complete and finally we have locked the file range in + * the inode's io_tree. + */ +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = inode->root; + struct btrfs_ordered_extent *ordered; + + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start); + if (ordered) { + btrfs_err(root->fs_info, +"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", + start, end, btrfs_ino(inode), root->root_key.objectid, + ordered->file_offset, + ordered->file_offset + ordered->num_bytes - 1); + btrfs_put_ordered_extent(ordered); + } + + ASSERT(ordered == NULL); +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index be6c24577dbe..43b6f23bbd89 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -468,7 +468,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_device *device; - struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; @@ -498,14 +497,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { - if (!device->bdev) + if (!device->bdev || !bdev_max_discard_sectors(device->bdev)) continue; - q = bdev_get_queue(device->bdev); - if (blk_queue_discard(q)) { - num_devices++; - minlen = min_t(u64, q->limits.discard_granularity, - minlen); - } + num_devices++; + minlen = min_t(u64, bdev_discard_granularity(device->bdev), + minlen); } rcu_read_unlock(); @@ -544,9 +540,35 @@ int __pure btrfs_is_empty_uuid(u8 *uuid) return 1; } +/* + * Calculate the number of transaction items to reserve for creating a subvolume + * or snapshot, not including the inode, directory entries, or parent directory. + */ +static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) +{ + /* + * 1 to add root block + * 1 to add root item + * 1 to add root ref + * 1 to add root backref + * 1 to add UUID item + * 1 to add qgroup info + * 1 to add qgroup limit + * + * Ideally the last two would only be accounted if qgroups are enabled, + * but that can change between now and the time we would insert them. 
+ */ + unsigned int num_items = 7; + + if (inherit) { + /* 2 to add qgroup relations for each inherited qgroup */ + num_items += 2 * inherit->num_qgroups; + } + return num_items; +} + static noinline int create_subvol(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, - const char *name, int namelen, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -559,11 +581,15 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; struct timespec64 cur_time = current_time(dir); - struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .subvol = true, + }; + unsigned int trans_num_items; int ret; - dev_t anon_dev = 0; + dev_t anon_dev; u64 objectid; - u64 index = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -571,11 +597,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) - goto fail_free; - - ret = get_anon_bdev(&anon_dev); - if (ret < 0) - goto fail_free; + goto out_root_item; /* * Don't create subvolume whose level is not zero. Or qgroup will be @@ -583,36 +605,47 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, */ if (btrfs_qgroup_level(objectid)) { ret = -ENOSPC; - goto fail_free; + goto out_root_item; + } + + ret = get_anon_bdev(&anon_dev); + if (ret < 0) + goto out_root_item; + + new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir); + if (!new_inode_args.inode) { + ret = -ENOMEM; + goto out_anon_dev; } + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) + goto out_inode; + trans_num_items += create_subvol_num_items(inherit); btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); - /* - * The same as the snapshot creation, please see the comment - * of create_snapshot(). - */ - ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, + trans_num_items, false); if (ret) - goto fail_free; + goto out_new_inode_args; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_subvolume_release_metadata(root, &block_rsv); - goto fail_free; + goto out_new_inode_args; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); if (ret) - goto fail; + goto out; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); - goto fail; + goto out; } btrfs_mark_buffer_dirty(leaf); @@ -667,75 +700,46 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, btrfs_tree_unlock(leaf); btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); - goto fail; + goto out; } free_extent_buffer(leaf); leaf = NULL; - key.offset = (u64)-1; new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); if (IS_ERR(new_root)) { - free_anon_bdev(anon_dev); ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - /* Freeing will be done in btrfs_put_root() of new_root */ + /* anon_dev is owned by new_root now. */ anon_dev = 0; + BTRFS_I(new_inode_args.inode)->root = new_root; + /* ... and new_root is owned by new_inode_args.inode now. 
*/ ret = btrfs_record_root_in_trans(trans, new_root); if (ret) { - btrfs_put_root(new_root); btrfs_abort_transaction(trans, ret); - goto fail; - } - - ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns); - btrfs_put_root(new_root); - if (ret) { - /* We potentially lose an unused inode item here */ - btrfs_abort_transaction(trans, ret); - goto fail; - } - - /* - * insert the directory item - */ - ret = btrfs_set_inode_index(BTRFS_I(dir), &index); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - - ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, - BTRFS_FT_DIR, index); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + ret = btrfs_uuid_tree_add(trans, root_item->uuid, + BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, - btrfs_ino(BTRFS_I(dir)), index, name, namelen); + ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - ret = btrfs_uuid_tree_add(trans, root_item->uuid, - BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) - btrfs_abort_transaction(trans, ret); + d_instantiate_new(dentry, new_inode_args.inode); + new_inode_args.inode = NULL; -fail: - kfree(root_item); +out: trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); @@ -744,18 +748,14 @@ fail: btrfs_end_transaction(trans); else ret = btrfs_commit_transaction(trans); - - if (!ret) { - inode = btrfs_lookup_dentry(dir, dentry); - if (IS_ERR(inode)) - return PTR_ERR(inode); - d_instantiate(dentry, inode); - } - return ret; - -fail_free: +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + iput(new_inode_args.inode); +out_anon_dev: if (anon_dev) free_anon_bdev(anon_dev); +out_root_item: kfree(root_item); return ret; } @@ -767,6 +767,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; + unsigned int trans_num_items; struct btrfs_trans_handle *trans; int ret; @@ -804,16 +805,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); /* - * 1 - parent dir inode - * 2 - dir entries - * 1 - root item - * 2 - root ref/backref - * 1 - root of snapshot - * 1 - UUID item + * 1 to add dir item + * 1 to add dir index + * 1 to update parent inode item */ + trans_num_items = create_subvol_num_items(inherit) + 3; ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, 8, - false); + &pending_snapshot->block_rsv, + trans_num_items, false); if (ret) goto free_pending; @@ -983,7 +982,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (snap_src) error = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit); + error = create_subvol(mnt_userns, dir, dentry, inherit); if (!error) fsnotify_mkdir(dir, dentry); @@ -1417,8 +1416,19 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (!em) break; - /* Skip hole/inline/preallocated extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE || + /* + * If 
the file extent is an inlined one, we may still want to + * defrag it (fallthrough) if it will cause a regular extent. + * This is for users who want to convert inline extents to + * regular ones through max_inline= mount option. + */ + if (em->block_start == EXTENT_MAP_INLINE && + em->len <= inode->root->fs_info->max_inline) + goto next; + + /* Skip hole/delalloc/preallocated extents */ + if (em->block_start == EXTENT_MAP_HOLE || + em->block_start == EXTENT_MAP_DELALLOC || test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) goto next; @@ -1477,6 +1487,15 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (em->len >= get_extent_max_capacity(em)) goto next; + /* + * Normally there are no more extents after an inline one, thus + * @next_mergeable will normally be false and not defragged. + * So if an inline extent passed all above checks, just add it + * for defrag, and be converted to regular extents. + */ + if (em->block_start == EXTENT_MAP_INLINE) + goto add; + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, extent_thresh, newer_than, locked); if (!next_mergeable) { @@ -2565,7 +2584,12 @@ static noinline int search_ioctl(struct inode *inode, while (1) { ret = -EFAULT; - if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset)) + /* + * Ensure that the whole user buffer is faulted in at sub-page + * granularity, otherwise the loop may live-lock. + */ + if (fault_in_subpage_writeable(ubuf + sk_offset, + *buf_size - sk_offset)) break; ret = btrfs_search_forward(root, &key, path, sk->min_transid); @@ -2593,7 +2617,7 @@ err: static noinline int btrfs_ioctl_tree_search(struct inode *inode, void __user *argp) { - struct btrfs_ioctl_search_args __user *uargs; + struct btrfs_ioctl_search_args __user *uargs = argp; struct btrfs_ioctl_search_key sk; int ret; size_t buf_size; @@ -2601,8 +2625,6 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - uargs = (struct btrfs_ioctl_search_args __user *)argp; - if (copy_from_user(&sk, &uargs->key, sizeof(sk))) return -EFAULT; @@ -2625,7 +2647,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, void __user *argp) { - struct btrfs_ioctl_search_args_v2 __user *uarg; + struct btrfs_ioctl_search_args_v2 __user *uarg = argp; struct btrfs_ioctl_search_args_v2 args; int ret; size_t buf_size; @@ -2635,7 +2657,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, return -EPERM; /* copy search header and buffer size */ - uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; if (copy_from_user(&args, uarg, sizeof(args))) return -EFAULT; @@ -4343,10 +4364,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bool need_unlock; /* for mut. excl. 
ops lock */ int ret; - if (!arg) - btrfs_warn(fs_info, - "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18"); - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4354,6 +4371,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (ret) return ret; + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + bargs = NULL; + goto out; + } + again: if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { mutex_lock(&fs_info->balance_mutex); @@ -4401,59 +4425,42 @@ again: } locked: - - if (arg) { - bargs = memdup_user(arg, sizeof(*bargs)); - if (IS_ERR(bargs)) { - ret = PTR_ERR(bargs); + if (bargs->flags & BTRFS_BALANCE_RESUME) { + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; goto out_unlock; } - if (bargs->flags & BTRFS_BALANCE_RESUME) { - if (!fs_info->balance_ctl) { - ret = -ENOTCONN; - goto out_bargs; - } + bctl = fs_info->balance_ctl; + spin_lock(&fs_info->balance_lock); + bctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); - bctl = fs_info->balance_ctl; - spin_lock(&fs_info->balance_lock); - bctl->flags |= BTRFS_BALANCE_RESUME; - spin_unlock(&fs_info->balance_lock); - btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); + goto do_balance; + } - goto do_balance; - } - } else { - bargs = NULL; + if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { + ret = -EINVAL; + goto out_unlock; } if (fs_info->balance_ctl) { ret = -EINPROGRESS; - goto out_bargs; + goto out_unlock; } bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); if (!bctl) { ret = -ENOMEM; - goto out_bargs; - } - - if (arg) { - memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); - memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); - memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); - - bctl->flags = bargs->flags; - } else { - /* balance everything - no filters */ - bctl->flags |= BTRFS_BALANCE_TYPE_MASK; + goto out_unlock; } - if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { - ret = -EINVAL; - goto out_bctl; - } + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); + bctl->flags = bargs->flags; do_balance: /* * Ownership of bctl and exclusive operation goes to btrfs_balance. 
@@ -4466,21 +4473,19 @@ do_balance: ret = btrfs_balance(fs_info, bctl, bargs); bctl = NULL; - if ((ret == 0 || ret == -ECANCELED) && arg) { + if (ret == 0 || ret == -ECANCELED) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; } -out_bctl: kfree(bctl); -out_bargs: - kfree(bargs); out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); + kfree(bargs); return ret; } diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 1b31481f9e72..a2ec8ecae8de 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -380,9 +380,8 @@ static struct prop_handler prop_handlers[] = { }, }; -static int inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - struct inode *parent) +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *parent) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -457,41 +456,6 @@ static int inherit_props(struct btrfs_trans_handle *trans, return 0; } -int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - struct inode *dir) -{ - if (!dir) - return 0; - - return inherit_props(trans, inode, dir); -} - -int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *parent_root) -{ - struct super_block *sb = root->fs_info->sb; - struct inode *parent_inode, *child_inode; - int ret; - - parent_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, parent_root); - if (IS_ERR(parent_inode)) - return PTR_ERR(parent_inode); - - child_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, root); - if (IS_ERR(child_inode)) { - iput(parent_inode); - return PTR_ERR(child_inode); - } - - ret = inherit_props(trans, child_inode, parent_inode); - iput(child_inode); - iput(parent_inode); - - return ret; -} - void __init btrfs_props_init(void) { int i; diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 59bea741cfcf..ca9dd3df129b 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -23,8 +23,4 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); -int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *parent_root); - #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1866b1f0da01..db723c0026bd 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2290,7 +2290,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, return 0; if (!extent_buffer_uptodate(root_eb)) { - ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL); + ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL); if (ret) goto out; } @@ -3939,12 +3939,13 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, } int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce) + enum btrfs_qgroup_rsv_type type, bool enforce, + bool noflush) { int ret; ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); - if (ret <= 0 && ret != -EDQUOT) + if ((ret <= 0 && ret != -EDQUOT) || noflush) return ret; ret = try_flush_qgroup(root); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 880e9df0dac1..0c4dd2a9af96 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -364,19 +364,23 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum 
btrfs_qgroup_rsv_type type, bool enforce); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce); + enum btrfs_qgroup_rsv_type type, bool enforce, + bool noflush); /* Reserve metadata space for pertrans and prealloc type */ static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root, int num_bytes, bool enforce) { return __btrfs_qgroup_reserve_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PERTRANS, enforce); + BTRFS_QGROUP_RSV_META_PERTRANS, + enforce, false); } static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, - int num_bytes, bool enforce) + int num_bytes, bool enforce, + bool noflush) { return __btrfs_qgroup_reserve_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PREALLOC, enforce); + BTRFS_QGROUP_RSV_META_PREALLOC, + enforce, noflush); } void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0e239a4c3b26..a5b623ee6fac 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -52,6 +52,17 @@ struct btrfs_stripe_hash_table { struct btrfs_stripe_hash table[]; }; +/* + * A bvec like structure to present a sector inside a page. + * + * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. + */ +struct sector_ptr { + struct page *page; + unsigned int pgoff:24; + unsigned int uptodate:8; +}; + enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, @@ -77,7 +88,7 @@ struct btrfs_raid_bio { /* * for scheduling work in the helper threads */ - struct btrfs_work work; + struct work_struct work; /* * bio list and bio_list_lock are used @@ -101,15 +112,6 @@ struct btrfs_raid_bio { */ unsigned long flags; - /* size of each individual stripe on disk */ - int stripe_len; - - /* number of data stripes (no p/q) */ - int nr_data; - - int real_stripes; - - int stripe_npages; /* * set if we're doing a parity rebuild * for a read from higher up, which is handled @@ -118,18 +120,35 @@ struct btrfs_raid_bio { */ enum btrfs_rbio_ops operation; - /* first bad stripe */ - int faila; + /* Size of each individual stripe on disk */ + u32 stripe_len; - /* second bad stripe (for raid6 use) */ - int failb; + /* How many pages there are for the full stripe including P/Q */ + u16 nr_pages; - int scrubp; - /* - * number of pages needed to represent the full - * stripe - */ - int nr_pages; + /* How many sectors there are for the full stripe including P/Q */ + u16 nr_sectors; + + /* Number of data stripes (no p/q) */ + u8 nr_data; + + /* Numer of all stripes (including P/Q) */ + u8 real_stripes; + + /* How many pages there are for each stripe */ + u8 stripe_npages; + + /* How many sectors there are for each stripe */ + u8 stripe_nsectors; + + /* First bad stripe, -1 means no corruption */ + s8 faila; + + /* Second bad stripe (for RAID6 use) */ + s8 failb; + + /* Stripe number that we're scrubbing */ + u8 scrubp; /* * size of all the bios in the bio_list. This @@ -156,28 +175,29 @@ struct btrfs_raid_bio { */ struct page **stripe_pages; - /* - * pointers to the pages in the bio_list. Stored - * here for faster lookup - */ - struct page **bio_pages; + /* Pointers to the sectors in the bio_list, for faster lookup */ + struct sector_ptr *bio_sectors; /* - * bitmap to record which horizontal stripe has data + * For subpage support, we need to map each sector to above + * stripe_pages. 
*/ + struct sector_ptr *stripe_sectors; + + /* Bitmap to record which horizontal stripe has data */ unsigned long *dbitmap; /* allocated with real_stripes-many pointers for finish_*() calls */ void **finish_pointers; - /* allocated with stripe_npages-many bits for finish_*() calls */ + /* Allocated with stripe_nsectors-many bits for finish_*() calls */ unsigned long *finish_pbitmap; }; static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); -static void rmw_work(struct btrfs_work *work); -static void read_rebuild_work(struct btrfs_work *work); +static void rmw_work(struct work_struct *work); +static void read_rebuild_work(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void __free_raid_bio(struct btrfs_raid_bio *rbio); @@ -186,12 +206,12 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); -static void scrub_parity_work(struct btrfs_work *work); +static void scrub_parity_work(struct work_struct *work); -static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) +static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { - btrfs_init_work(&rbio->work, work_func, NULL, NULL); - btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); + INIT_WORK(&rbio->work, work_func); + queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); } /* @@ -239,7 +259,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) /* * caching an rbio means to copy anything from the - * bio_pages array into the stripe_pages array. We + * bio_sectors array into the stripe_pages array. We * use the page uptodate bit in the stripe cache array * to indicate if it has valid data * @@ -255,12 +275,18 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) if (ret) return; - for (i = 0; i < rbio->nr_pages; i++) { - if (!rbio->bio_pages[i]) + for (i = 0; i < rbio->nr_sectors; i++) { + /* Some range not covered by bio (partial write), skip it */ + if (!rbio->bio_sectors[i].page) continue; - copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]); - SetPageUptodate(rbio->stripe_pages[i]); + ASSERT(rbio->stripe_sectors[i].page); + memcpy_page(rbio->stripe_sectors[i].page, + rbio->stripe_sectors[i].pgoff, + rbio->bio_sectors[i].page, + rbio->bio_sectors[i].pgoff, + rbio->bioc->fs_info->sectorsize); + rbio->stripe_sectors[i].uptodate = 1; } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); } @@ -283,9 +309,50 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio) return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } +static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, + unsigned int page_nr) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int i; + + ASSERT(page_nr < rbio->nr_pages); + + for (i = sectors_per_page * page_nr; + i < sectors_per_page * page_nr + sectors_per_page; + i++) { + if (!rbio->stripe_sectors[i].uptodate) + return false; + } + return true; +} + /* - * stealing an rbio means taking all the uptodate pages from the stripe - * array in the source rbio and putting them into the destination rbio + * Update the stripe_sectors[] array to use correct page and pgoff + * + * Should be called every time any page pointer in stripes_pages[] got modified. 
+ */ +static void index_stripe_sectors(struct btrfs_raid_bio *rbio) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + u32 offset; + int i; + + for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { + int page_index = offset >> PAGE_SHIFT; + + ASSERT(page_index < rbio->nr_pages); + rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; + rbio->stripe_sectors[i].pgoff = offset_in_page(offset); + } +} + +/* + * Stealing an rbio means taking all the uptodate pages from the stripe array + * in the source rbio and putting them into the destination rbio. + * + * This will also update the involved stripe_sectors[] which are referring to + * the old pages. */ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { @@ -298,9 +365,8 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) for (i = 0; i < dest->nr_pages; i++) { s = src->stripe_pages[i]; - if (!s || !PageUptodate(s)) { + if (!s || !full_page_sectors_uptodate(src, i)) continue; - } d = dest->stripe_pages[i]; if (d) @@ -309,6 +375,8 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) dest->stripe_pages[i] = s; src->stripe_pages[i] = NULL; } + index_stripe_sectors(dest); + index_stripe_sectors(src); } /* @@ -600,39 +668,39 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } -static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, - int index) +static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) { - return stripe * rbio->stripe_npages + index; + ASSERT(stripe_nr < rbio->real_stripes); + ASSERT(sector_nr < rbio->stripe_nsectors); + + return stripe_nr * rbio->stripe_nsectors + sector_nr; } -/* - * these are just the pages from the rbio array, not from anything - * the FS sent down to us - */ -static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, - int index) +/* Return a sector from rbio->stripe_sectors, not from the bio list */ +static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) { - return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; + return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, + sector_nr)]; } -/* - * helper to index into the pstripe - */ -static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) +/* Grab a sector inside P stripe */ +static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr) { - return rbio_stripe_page(rbio, rbio->nr_data, index); + return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); } -/* - * helper to index into the qstripe, returns null - * if there is no qstripe - */ -static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) +/* Grab a sector inside Q stripe, return NULL if not RAID6 */ +static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr) { if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; - return rbio_stripe_page(rbio, rbio->nr_data + 1, index); + return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); } /* @@ -911,47 +979,43 @@ static void raid_write_end_io(struct bio *bio) rbio_orig_end_io(rbio, err); } -/* - * the read/modify/write code wants to use the original bio for - * any pages it included, and then use the rbio for everything - * else. 
This function decides if a given index (stripe number) - * and page number in that stripe fall inside the original bio - * or the rbio. - * - * if you set bio_list_only, you'll get a NULL back for any ranges - * that are outside the bio_list +/** + * Get a sector pointer specified by its @stripe_nr and @sector_nr * - * This doesn't take any refs on anything, you get a bare page pointer - * and the caller must bump refs as required. + * @rbio: The raid bio + * @stripe_nr: Stripe number, valid range [0, real_stripe) + * @sector_nr: Sector number inside the stripe, + * valid range [0, stripe_nsectors) + * @bio_list_only: Whether to use sectors inside the bio list only. * - * You must call index_rbio_pages once before you can trust - * the answers from this function. + * The read/modify/write code wants to reuse the original bio page as much + * as possible, and only use stripe_sectors as fallback. */ -static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, - int index, int pagenr, int bio_list_only) +static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, + bool bio_list_only) { - int chunk_page; - struct page *p = NULL; + struct sector_ptr *sector; + int index; + + ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); + ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); - chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; + index = stripe_nr * rbio->stripe_nsectors + sector_nr; + ASSERT(index >= 0 && index < rbio->nr_sectors); spin_lock_irq(&rbio->bio_list_lock); - p = rbio->bio_pages[chunk_page]; + sector = &rbio->bio_sectors[index]; + if (sector->page || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (!sector->page) + sector = NULL; + spin_unlock_irq(&rbio->bio_list_lock); + return sector; + } spin_unlock_irq(&rbio->bio_list_lock); - if (p || bio_list_only) - return p; - - return rbio->stripe_pages[chunk_page]; -} - -/* - * number of pages we need for the entire stripe across all the - * drives - */ -static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) -{ - return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes; + return &rbio->stripe_sectors[index]; } /* @@ -960,22 +1024,28 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, struct btrfs_io_context *bioc, - u64 stripe_len) + u32 stripe_len) { + const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; + const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT; + const unsigned int num_pages = stripe_npages * real_stripes; + const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits; + const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; int nr_data = 0; - int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; - int num_pages = rbio_nr_pages(stripe_len, real_stripes); - int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); void *p; + ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE)); + /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ + ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + rbio = kzalloc(sizeof(*rbio) + sizeof(*rbio->stripe_pages) * num_pages + - sizeof(*rbio->bio_pages) * num_pages + + sizeof(*rbio->bio_sectors) * num_sectors + + sizeof(*rbio->stripe_sectors) * num_sectors + sizeof(*rbio->finish_pointers) * real_stripes + - sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) + - 
sizeof(*rbio->finish_pbitmap) * - BITS_TO_LONGS(stripe_npages), + sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) + + sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -988,8 +1058,10 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->bioc = bioc; rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; + rbio->nr_sectors = num_sectors; rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; + rbio->stripe_nsectors = stripe_nsectors; rbio->faila = -1; rbio->failb = -1; refcount_set(&rbio->refs, 1); @@ -997,8 +1069,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, atomic_set(&rbio->stripes_pending, 0); /* - * the stripe_pages, bio_pages, etc arrays point to the extra - * memory we allocated past the end of the rbio + * The stripe_pages, bio_sectors, etc arrays point to the extra memory + * we allocated past the end of the rbio. */ p = rbio + 1; #define CONSUME_ALLOC(ptr, count) do { \ @@ -1006,10 +1078,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ } while (0) CONSUME_ALLOC(rbio->stripe_pages, num_pages); - CONSUME_ALLOC(rbio->bio_pages, num_pages); + CONSUME_ALLOC(rbio->bio_sectors, num_sectors); + CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); CONSUME_ALLOC(rbio->finish_pointers, real_stripes); - CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages)); - CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages)); + CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors)); + CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors)); #undef CONSUME_ALLOC if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) @@ -1026,59 +1099,63 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, /* allocate pages for all the stripes in the bio, including parity */ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { - int i; - struct page *page; + int ret; - for (i = 0; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) - continue; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[i] = page; - } + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); + if (ret < 0) + return ret; + /* Mapping all sectors */ + index_stripe_sectors(rbio); return 0; } /* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { - int i; - struct page *page; + const int data_pages = rbio->nr_data * rbio->stripe_npages; + int ret; - i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); + ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, + rbio->stripe_pages + data_pages); + if (ret < 0) + return ret; - for (; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) - continue; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[i] = page; - } + index_stripe_sectors(rbio); return 0; } /* - * add a single page from a specific stripe into our list of bios for IO - * this will try to merge into existing bios if possible, and returns - * zero if all went well. + * Add a single sector @sector into our list of bios for IO. + * + * Return 0 if everything went well. + * Return <0 for error. 
*/ -static int rbio_add_io_page(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct page *page, - int stripe_nr, - unsigned long page_index, - unsigned long bio_max_len) -{ +static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct sector_ptr *sector, + unsigned int stripe_nr, + unsigned int sector_nr, + unsigned long bio_max_len, + unsigned int opf) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio *last = bio_list->tail; int ret; struct bio *bio; struct btrfs_io_stripe *stripe; u64 disk_start; + /* + * Note: here stripe_nr has taken device replace into consideration, + * thus it can be larger than rbio->real_stripe. + * So here we check against bioc->num_stripes, not rbio->real_stripes. + */ + ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); + ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); + ASSERT(sector->page); + stripe = &rbio->bioc->stripes[stripe_nr]; - disk_start = stripe->physical + (page_index << PAGE_SHIFT); + disk_start = stripe->physical + sector_nr * sectorsize; /* if the device is missing, just fail this stripe */ if (!stripe->dev->bdev) @@ -1095,20 +1172,20 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, page, PAGE_SIZE, 0); - if (ret == PAGE_SIZE) + ret = bio_add_page(last, sector->page, sectorsize, + sector->pgoff); + if (ret == sectorsize) return 0; } } /* put a new bio on the list */ - bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); - btrfs_bio(bio)->device = stripe->dev; - bio->bi_iter.bi_size = 0; - bio_set_dev(bio, stripe->dev->bdev); + bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL), + opf, GFP_NOFS); bio->bi_iter.bi_sector = disk_start >> 9; + bio->bi_private = rbio; - bio_add_page(bio, page, PAGE_SIZE, 0); + bio_add_page(bio, sector->page, sectorsize, sector->pgoff); bio_list_add(bio_list, bio); return 0; } @@ -1130,6 +1207,32 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) } } +static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + struct bio_vec bvec; + struct bvec_iter iter; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->raid_map[0]; + + if (bio_flagged(bio, BIO_CLONED)) + bio->bi_iter = btrfs_bio(bio)->iter; + + bio_for_each_segment(bvec, bio, iter) { + u32 bvec_offset; + + for (bvec_offset = 0; bvec_offset < bvec.bv_len; + bvec_offset += sectorsize, offset += sectorsize) { + int index = offset / sectorsize; + struct sector_ptr *sector = &rbio->bio_sectors[index]; + + sector->page = bvec.bv_page; + sector->pgoff = bvec.bv_offset + bvec_offset; + ASSERT(sector->pgoff < PAGE_SIZE); + } + } +} + /* * helper function to walk our bio list and populate the bio_pages array with * the result. 
This seems expensive, but it is faster than constantly @@ -1141,28 +1244,11 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; - u64 start; - unsigned long stripe_offset; - unsigned long page_index; spin_lock_irq(&rbio->bio_list_lock); - bio_list_for_each(bio, &rbio->bio_list) { - struct bio_vec bvec; - struct bvec_iter iter; - int i = 0; + bio_list_for_each(bio, &rbio->bio_list) + index_one_bio(rbio, bio); - start = bio->bi_iter.bi_sector << 9; - stripe_offset = start - rbio->bioc->raid_map[0]; - page_index = stripe_offset >> PAGE_SHIFT; - - if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_bio(bio)->iter; - - bio_for_each_segment(bvec, bio, iter) { - rbio->bio_pages[page_index + i] = bvec.bv_page; - i++; - } - } spin_unlock_irq(&rbio->bio_list_lock); } @@ -1177,10 +1263,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; + const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; int stripe; - int pagenr; + int sectornr; bool has_qstripe; struct bio_list bio_list; struct bio *bio; @@ -1224,35 +1311,37 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *p; - /* first collect one page from each data stripe */ + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + + /* First collect one sector from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { - p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap_local_page(p); + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; } - /* then add the parity stripe */ - p = rbio_pstripe_page(rbio, pagenr); - SetPageUptodate(p); - pointers[stripe++] = kmap_local_page(p); + /* Then add the parity stripe */ + sector = rbio_pstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; if (has_qstripe) { - /* - * raid6, add the qstripe and call the - * library function to fill in our p/q + * RAID6, add the qstripe and call the library function + * to fill in our p/q */ - p = rbio_qstripe_page(rbio, pagenr); - SetPageUptodate(p); - pointers[stripe++] = kmap_local_page(p); + sector = rbio_qstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + + sector->pgoff; - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { /* raid5 */ - copy_page(pointers[nr_data], pointers[0]); - run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); + memcpy(pointers[nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, nr_data - 1, sectorsize); } for (stripe = stripe - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); @@ -1264,18 +1353,20 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) * everything else. 
*/ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + if (stripe < rbio->nr_data) { - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (!page) + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) continue; } else { - page = rbio_stripe_page(rbio, stripe, pagenr); + sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_page(rbio, &bio_list, - page, stripe, pagenr, rbio->stripe_len); + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, rbio->stripe_len, + REQ_OP_WRITE); if (ret) goto cleanup; } @@ -1288,19 +1379,21 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) if (!bioc->tgtdev_map[stripe]) continue; - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + if (stripe < rbio->nr_data) { - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (!page) + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) continue; } else { - page = rbio_stripe_page(rbio, stripe, pagenr); + sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_page(rbio, &bio_list, page, + ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->bioc->tgtdev_map[stripe], - pagenr, rbio->stripe_len); + sectornr, rbio->stripe_len, + REQ_OP_WRITE); if (ret) goto cleanup; } @@ -1311,9 +1404,7 @@ write_data: BUG_ON(atomic_read(&rbio->stripes_pending) == 0); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - bio->bi_opf = REQ_OP_WRITE; submit_bio(bio); } @@ -1417,18 +1508,48 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio, } /* + * For subpage case, we can no longer set page Uptodate directly for + * stripe_pages[], thus we need to locate the sector. + */ +static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, + struct page *page, + unsigned int pgoff) +{ + int i; + + for (i = 0; i < rbio->nr_sectors; i++) { + struct sector_ptr *sector = &rbio->stripe_sectors[i]; + + if (sector->page == page && sector->pgoff == pgoff) + return sector; + } + return NULL; +} + +/* * this sets each page in the bio uptodate. 
It should only be used on private * rbio pages, nothing that comes in from the higher layers */ -static void set_bio_pages_uptodate(struct bio *bio) +static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio_vec *bvec; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) - SetPageUptodate(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, iter_all) { + struct sector_ptr *sector; + int pgoff; + + for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; + pgoff += sectorsize) { + sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); + ASSERT(sector); + if (sector) + sector->uptodate = 1; + } + } } /* @@ -1446,7 +1567,7 @@ static void raid_rmw_end_io(struct bio *bio) if (bio->bi_status) fail_bio_stripe(rbio, bio); else - set_bio_pages_uptodate(bio); + set_bio_pages_uptodate(rbio, bio); bio_put(bio); @@ -1478,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int pagenr; + int sectornr; int stripe; struct bio *bio; @@ -1496,28 +1617,30 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + /* - * we want to find all the pages missing from - * the rbio and read them from the disk. If - * page_in_rbio finds a page in the bio list - * we don't need to read it off the stripe. + * We want to find all the sectors missing from the + * rbio and read them from the disk. If * sector_in_rbio() + * finds a page in the bio list we don't need to read + * it off the stripe. */ - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (page) + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) continue; - page = rbio_stripe_page(rbio, stripe, pagenr); + sector = rbio_stripe_sector(rbio, stripe, sectornr); /* - * the bio cache may have handed us an uptodate - * page. If so, be happy and use it + * The bio cache may have handed us an uptodate page. + * If so, be happy and use it. 
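[Editor's illustration] With subpage support, one stripe page can hold several sectors, so set_bio_pages_uptodate() above no longer flips a page flag; it walks each bvec in sectorsize steps, looks up the matching sector by (page, pgoff) via find_stripe_sector(), and sets that sector's uptodate bit. A stripped-down sketch of that lookup follows, using a stand-in struct that mirrors only the fields visible in this patch; demo_sector and demo_find_sector are hypothetical names.

#include <stddef.h>

/* Stand-in for the sector_ptr fields this patch uses. */
struct demo_sector {
	const void *page;	/* which page backs this sector          */
	unsigned int pgoff;	/* byte offset of the sector in the page  */
	unsigned int uptodate:1;
};

/* Linear (page, pgoff) lookup, as in find_stripe_sector() above. */
static struct demo_sector *demo_find_sector(struct demo_sector *sectors,
					    int nr_sectors,
					    const void *page,
					    unsigned int pgoff)
{
	for (int i = 0; i < nr_sectors; i++) {
		if (sectors[i].page == page && sectors[i].pgoff == pgoff)
			return &sectors[i];
	}
	return NULL;	/* no tracked sector matches this (page, pgoff) */
}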
*/ - if (PageUptodate(page)) + if (sector->uptodate) continue; - ret = rbio_add_io_page(rbio, &bio_list, page, - stripe, pagenr, rbio->stripe_len); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, rbio->stripe_len, + REQ_OP_READ); if (ret) goto cleanup; } @@ -1540,9 +1663,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid_rmw_end_io; - bio->bi_opf = REQ_OP_READ; btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); @@ -1624,7 +1745,7 @@ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; - struct btrfs_work work; + struct work_struct work; }; /* @@ -1692,7 +1813,7 @@ static void run_plug(struct btrfs_plug_cb *plug) * if the unplug comes from schedule, we have to push the * work off to a helper thread */ -static void unplug_work(struct btrfs_work *work) +static void unplug_work(struct work_struct *work) { struct btrfs_plug_cb *plug; plug = container_of(work, struct btrfs_plug_cb, work); @@ -1705,9 +1826,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - btrfs_init_work(&plug->work, unplug_work, NULL, NULL); - btrfs_queue_work(plug->info->rmw_workers, - &plug->work); + INIT_WORK(&plug->work, unplug_work); + queue_work(plug->info->rmw_workers, &plug->work); return; } run_plug(plug); @@ -1716,8 +1836,7 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) /* * our main entry point for writes from the rest of the FS. */ -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len) +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; @@ -1772,14 +1891,18 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, */ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) { - int pagenr, stripe; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + int sectornr, stripe; void **pointers; void **unmap_array; int faila = -1, failb = -1; - struct page *page; blk_status_t err; int i; + /* + * This array stores the pointer for each sector, thus it has the extra + * pgoff value added from each sector + */ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { err = BLK_STS_RESOURCE; @@ -1808,43 +1931,44 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + /* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub. 
*/ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(pagenr, rbio->dbitmap)) + !test_bit(sectornr, rbio->dbitmap)) continue; /* - * Setup our array of pointers with pages from each stripe + * Setup our array of pointers with sectors from each stripe * * NOTE: store a duplicate array of pointers to preserve the * pointer order */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { /* - * if we're rebuilding a read, we have to use + * If we're rebuilding a read, we have to use * pages from the bio list */ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && (stripe == faila || stripe == failb)) { - page = page_in_rbio(rbio, stripe, pagenr, 0); + sector = sector_in_rbio(rbio, stripe, sectornr, 0); } else { - page = rbio_stripe_page(rbio, stripe, pagenr); + sector = rbio_stripe_sector(rbio, stripe, sectornr); } - pointers[stripe] = kmap_local_page(page); + ASSERT(sector->page); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; unmap_array[stripe] = pointers[stripe]; } - /* all raid6 handling here */ + /* All raid6 handling here */ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { - /* - * single failure, rebuild from parity raid5 - * style - */ + /* Single failure, rebuild from parity raid5 style */ if (failb < 0) { if (faila == rbio->nr_data) { /* @@ -1887,10 +2011,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { raid6_datap_recov(rbio->real_stripes, - PAGE_SIZE, faila, pointers); + sectorsize, faila, pointers); } else { raid6_2data_recov(rbio->real_stripes, - PAGE_SIZE, faila, failb, + sectorsize, faila, failb, pointers); } } else { @@ -1900,7 +2024,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) BUG_ON(failb != -1); pstripe: /* Copy parity block into failed block to start with */ - copy_page(pointers[faila], pointers[rbio->nr_data]); + memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); /* rearrange the pointer array */ p = pointers[faila]; @@ -1909,7 +2033,7 @@ pstripe: pointers[rbio->nr_data - 1] = p; /* xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); + run_xor(pointers, rbio->nr_data - 1, sectorsize); } /* if we're doing this rebuild as part of an rmw, go through * and set all of our private rbio pages in the @@ -1918,14 +2042,14 @@ pstripe: * other endio functions will fiddle the uptodate bits */ if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < rbio->stripe_npages; i++) { + for (i = 0; i < rbio->stripe_nsectors; i++) { if (faila != -1) { - page = rbio_stripe_page(rbio, faila, i); - SetPageUptodate(page); + sector = rbio_stripe_sector(rbio, faila, i); + sector->uptodate = 1; } if (failb != -1) { - page = rbio_stripe_page(rbio, failb, i); - SetPageUptodate(page); + sector = rbio_stripe_sector(rbio, failb, i); + sector->uptodate = 1; } } } @@ -1998,7 +2122,7 @@ static void raid_recover_end_io(struct bio *bio) if (bio->bi_status) fail_bio_stripe(rbio, bio); else - set_bio_pages_uptodate(bio); + set_bio_pages_uptodate(rbio, bio); bio_put(bio); if (!atomic_dec_and_test(&rbio->stripes_pending)) @@ -2023,7 +2147,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int pagenr; + int sectornr; int stripe; struct bio *bio; @@ -2046,20 +2170,20 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) continue; } - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *p; + 
for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; /* * the rmw code may have already read this * page in */ - p = rbio_stripe_page(rbio, stripe, pagenr); - if (PageUptodate(p)) + sector = rbio_stripe_sector(rbio, stripe, sectornr); + if (sector->uptodate) continue; - ret = rbio_add_io_page(rbio, &bio_list, - rbio_stripe_page(rbio, stripe, pagenr), - stripe, pagenr, rbio->stripe_len); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, rbio->stripe_len, + REQ_OP_READ); if (ret < 0) goto cleanup; } @@ -2086,9 +2210,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid_recover_end_io; - bio->bi_opf = REQ_OP_READ; btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); @@ -2115,7 +2237,7 @@ cleanup: * of the drive. */ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len, int mirror_num, int generic_io) + u32 stripe_len, int mirror_num, int generic_io) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; @@ -2193,7 +2315,7 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, } -static void rmw_work(struct btrfs_work *work) +static void rmw_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; @@ -2201,7 +2323,7 @@ static void rmw_work(struct btrfs_work *work) raid56_rmw_stripe(rbio); } -static void read_rebuild_work(struct btrfs_work *work) +static void read_rebuild_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; @@ -2221,7 +2343,7 @@ static void read_rebuild_work(struct btrfs_work *work) struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len, struct btrfs_device *scrub_dev, + u32 stripe_len, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_fs_info *fs_info = bioc->fs_info; @@ -2252,9 +2374,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, } ASSERT(i < rbio->real_stripes); - /* Now we just support the sectorsize equals to page size */ - ASSERT(fs_info->sectorsize == PAGE_SIZE); - ASSERT(rbio->stripe_npages == stripe_nsectors); bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); /* @@ -2268,17 +2387,19 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, /* Used for both parity scrub and missing. 
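[Editor's illustration] The single-failure recovery path in the hunks above rebuilds a lost data sector RAID5-style: the parity sector is copied into the failed slot and the surviving data sectors are XORed on top, which works because parity is the XOR of all data sectors. A tiny worked example of that identity, with made-up byte values, is below.

#include <assert.h>
#include <stdint.h>

/*
 * Worked example of the identity the recovery code above relies on:
 * parity is the XOR of the data stripes, so one lost stripe is
 * recovered by XORing parity with the surviving stripes.
 */
int main(void)
{
	uint8_t d0 = 0x0f, d1 = 0x33, d2 = 0x55;
	uint8_t p = d0 ^ d1 ^ d2;		/* parity == 0x69 */
	uint8_t rebuilt_d1 = p ^ d0 ^ d2;	/* pretend d1 was lost */

	assert(rebuilt_d1 == d1);
	return 0;
}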
*/ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - u64 logical) + unsigned int pgoff, u64 logical) { + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; int stripe_offset; int index; ASSERT(logical >= rbio->bioc->raid_map[0]); - ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] + + ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + rbio->stripe_len * rbio->nr_data); stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); - index = stripe_offset >> PAGE_SHIFT; - rbio->bio_pages[index] = page; + index = stripe_offset / sectorsize; + rbio->bio_sectors[index].page = page; + rbio->bio_sectors[index].pgoff = pgoff; } /* @@ -2287,14 +2408,16 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, */ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { - int i; - int bit; - int index; - struct page *page; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + int stripe; + int sectornr; + + for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + struct page *page; + int index = (stripe * rbio->stripe_nsectors + sectornr) * + sectorsize >> PAGE_SHIFT; - for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { - for (i = 0; i < rbio->real_stripes; i++) { - index = i * rbio->stripe_npages + bit; if (rbio->stripe_pages[index]) continue; @@ -2304,6 +2427,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) rbio->stripe_pages[index] = page; } } + index_stripe_sectors(rbio); return 0; } @@ -2311,14 +2435,15 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { struct btrfs_io_context *bioc = rbio->bioc; + const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; unsigned long *pbitmap = rbio->finish_pbitmap; int nr_data = rbio->nr_data; int stripe; - int pagenr; + int sectornr; bool has_qstripe; - struct page *p_page = NULL; - struct page *q_page = NULL; + struct sector_ptr p_sector = { 0 }; + struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; struct bio *bio; int is_replace = 0; @@ -2335,7 +2460,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { is_replace = 1; - bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); + bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors); } /* @@ -2348,54 +2473,59 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (!need_check) goto writeback; - p_page = alloc_page(GFP_NOFS); - if (!p_page) + p_sector.page = alloc_page(GFP_NOFS); + if (!p_sector.page) goto cleanup; - SetPageUptodate(p_page); + p_sector.pgoff = 0; + p_sector.uptodate = 1; if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ - q_page = alloc_page(GFP_NOFS); - if (!q_page) { - __free_page(p_page); + q_sector.page = alloc_page(GFP_NOFS); + if (!q_sector.page) { + __free_page(p_sector.page); + p_sector.page = NULL; goto cleanup; } - SetPageUptodate(q_page); - pointers[rbio->real_stripes - 1] = kmap_local_page(q_page); + q_sector.pgoff = 0; + q_sector.uptodate = 1; + pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); } atomic_set(&rbio->error, 0); /* Map the parity stripe just once */ - pointers[nr_data] = kmap_local_page(p_page); + pointers[nr_data] = kmap_local_page(p_sector.page); - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *p; + 
for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; void *parity; + /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { - p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap_local_page(p); + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; } if (has_qstripe) { /* RAID6, call the library function to fill in our P/Q */ - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { /* raid5 */ - copy_page(pointers[nr_data], pointers[0]); - run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); + memcpy(pointers[nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, nr_data - 1, sectorsize); } /* Check scrubbing parity and repair it */ - p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - parity = kmap_local_page(p); - if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) - copy_page(parity, pointers[rbio->scrubp]); + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + parity = kmap_local_page(sector->page) + sector->pgoff; + if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) + memcpy(parity, pointers[rbio->scrubp], sectorsize); else /* Parity is right, needn't writeback */ - bitmap_clear(rbio->dbitmap, pagenr, 1); + bitmap_clear(rbio->dbitmap, sectornr, 1); kunmap_local(parity); for (stripe = nr_data - 1; stripe >= 0; stripe--) @@ -2403,10 +2533,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, } kunmap_local(pointers[nr_data]); - __free_page(p_page); - if (q_page) { + __free_page(p_sector.page); + p_sector.page = NULL; + if (q_sector.page) { kunmap_local(pointers[rbio->real_stripes - 1]); - __free_page(q_page); + __free_page(q_sector.page); + q_sector.page = NULL; } writeback: @@ -2415,12 +2547,12 @@ writeback: * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. 
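[Editor's illustration] In the parity-scrub hunk above, every sector set in dbitmap has its expected parity recomputed from the data stripes, compared against the on-disk copy with memcmp() over sectorsize bytes, and rewritten only when the two differ; a matching sector clears its dbitmap bit so it is skipped at writeback. A minimal sketch of that check-and-repair step follows; verify_parity_sector() is a hypothetical helper, not a btrfs function.

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/*
 * Illustrative only: compare freshly computed parity against the
 * on-disk copy and repair it in place when they differ.  Returns true
 * when the sector was wrong and now needs to be written back.
 */
static bool verify_parity_sector(void *on_disk, const void *computed,
				 size_t sectorsize)
{
	if (memcmp(on_disk, computed, sectorsize) == 0)
		return false;	/* parity is right, no writeback needed */

	memcpy(on_disk, computed, sectorsize);
	return true;
}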
*/ - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *page; + for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; - page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - ret = rbio_add_io_page(rbio, &bio_list, - page, rbio->scrubp, pagenr, rbio->stripe_len); + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, + sectornr, rbio->stripe_len, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2428,13 +2560,13 @@ writeback: if (!is_replace) goto submit_write; - for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { - struct page *page; + for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; - page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - ret = rbio_add_io_page(rbio, &bio_list, page, + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, bioc->tgtdev_map[rbio->scrubp], - pagenr, rbio->stripe_len); + sectornr, rbio->stripe_len, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2450,9 +2582,7 @@ submit_write: atomic_set(&rbio->stripes_pending, nr_data); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - bio->bi_opf = REQ_OP_WRITE; submit_bio(bio); } @@ -2548,7 +2678,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio) if (bio->bi_status) fail_bio_stripe(rbio, bio); else - set_bio_pages_uptodate(bio); + set_bio_pages_uptodate(rbio, bio); bio_put(bio); @@ -2568,7 +2698,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int pagenr; + int sectornr; int stripe; struct bio *bio; @@ -2584,28 +2714,29 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) * stripe */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *page; + for_each_set_bit(sectornr , rbio->dbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; /* - * we want to find all the pages missing from - * the rbio and read them from the disk. If - * page_in_rbio finds a page in the bio list - * we don't need to read it off the stripe. + * We want to find all the sectors missing from the + * rbio and read them from the disk. If * sector_in_rbio() + * finds a sector in the bio list we don't need to read + * it off the stripe. */ - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (page) + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) continue; - page = rbio_stripe_page(rbio, stripe, pagenr); + sector = rbio_stripe_sector(rbio, stripe, sectornr); /* - * the bio cache may have handed us an uptodate - * page. If so, be happy and use it + * The bio cache may have handed us an uptodate sector. + * If so, be happy and use it. 
*/ - if (PageUptodate(page)) + if (sector->uptodate) continue; - ret = rbio_add_io_page(rbio, &bio_list, page, - stripe, pagenr, rbio->stripe_len); + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, rbio->stripe_len, + REQ_OP_READ); if (ret) goto cleanup; } @@ -2628,9 +2759,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid56_parity_scrub_end_io; - bio->bi_opf = REQ_OP_READ; btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); @@ -2651,7 +2780,7 @@ finish: validate_rbio_for_parity_scrub(rbio); } -static void scrub_parity_work(struct btrfs_work *work) +static void scrub_parity_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 72c00fc284b5..aaad08aefd7d 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -31,15 +31,14 @@ struct btrfs_raid_bio; struct btrfs_device; int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len, int mirror_num, int generic_io); -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len); + u32 stripe_len, int mirror_num, int generic_io); +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - u64 logical); + unsigned int pgoff, u64 logical); struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, - struct btrfs_io_context *bioc, u64 stripe_len, + struct btrfs_io_context *bioc, u32 stripe_len, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 998e3f180d90..c39f8b3a5a4a 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -614,14 +614,23 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, struct inode *inode2, u64 loff2, u64 len) { + u64 range1_end = loff1 + len - 1; + u64 range2_end = loff2 + len - 1; + if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); + swap(range1_end, range2_end); } else if (inode1 == inode2 && loff2 < loff1) { swap(loff1, loff2); + swap(range1_end, range2_end); } - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); + + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end); + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end); + + btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end); + btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end); } static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) @@ -771,7 +780,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; - bool same_inode = inode_out == inode_in; u64 wb_len; int ret; @@ -810,15 +818,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, wb_len = ALIGN(*len, bs); /* - * Since we don't lock ranges, wait for ongoing lockless dio writes (as - * any in progress could create its ordered extents after we wait for - * existing 
ordered extents below). - */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - /* * Workaround to make sure NOCOW buffered write reach disk as NOCOW. * * Btrfs' back references do not have a block level granularity, they diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index fdc2c4b411f0..edddd93d2118 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -362,7 +362,7 @@ struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr) rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); - root = (struct btrfs_root *)node->data; + root = node->data; } spin_unlock(&rc->reloc_root_tree.lock); return btrfs_grab_root(root); @@ -2997,7 +2997,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, /* Reserve metadata for this range */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - clamped_len, clamped_len); + clamped_len, clamped_len, + false); if (ret) goto release_page; @@ -3845,8 +3846,7 @@ out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (err) { - if (inode) - iput(inode); + iput(inode); inode = ERR_PTR(err); } return inode; @@ -3977,6 +3977,17 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) if (!bg) return -ENOENT; + /* + * Relocation of a data block group creates ordered extents. Without + * sb_start_write(), we can freeze the filesystem while unfinished + * ordered extents are left. Such ordered extents can cause a deadlock + * e.g. when syncfs() is waiting for their completion but they can't + * finish because they block when joining a transaction, due to the + * fact that the freeze locks are being held in write mode. + */ + if (bg->flags & BTRFS_BLOCK_GROUP_DATA) + ASSERT(sb_write_started(fs_info->sb)); + if (btrfs_pinned_by_swapfile(fs_info, bg)) { btrfs_put_block_group(bg); return -ETXTBSY; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index ca7426ef61c8..a64b26b16904 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -509,7 +509,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, /* One for parent inode, two for dir entries */ qgroup_num_bytes = 3 * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_prealloc(root, - qgroup_num_bytes, true); + qgroup_num_bytes, true, + false); if (ret) return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8cd713d37ad2..e7b0323e6efd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -45,14 +45,14 @@ struct scrub_ctx; * operations. The first one configures an upper limit for the number * of (dynamically allocated) pages that are added to a bio. */ -#define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */ -#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */ +#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */ /* * The following value times PAGE_SIZE needs to be large enough to match the * largest node/leaf/sector size that shall be supported. 
*/ -#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) +#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) struct scrub_recover { refcount_t refs; @@ -60,7 +60,7 @@ struct scrub_recover { u64 map_length; }; -struct scrub_page { +struct scrub_sector { struct scrub_block *sblock; struct page *page; struct btrfs_device *dev; @@ -87,16 +87,16 @@ struct scrub_bio { blk_status_t status; u64 logical; u64 physical; - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; - int page_count; + struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO]; + int sector_count; int next_free; - struct btrfs_work work; + struct work_struct work; }; struct scrub_block { - struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; - int page_count; - atomic_t outstanding_pages; + struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK]; + int sector_count; + atomic_t outstanding_sectors; refcount_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; struct scrub_parity *sparity; @@ -110,7 +110,7 @@ struct scrub_block { /* It is for the data with checksum */ unsigned int data_corrected:1; }; - struct btrfs_work work; + struct work_struct work; }; /* Used for the chunks with parity stripe such RAID5/6 */ @@ -129,10 +129,10 @@ struct scrub_parity { refcount_t refs; - struct list_head spages; + struct list_head sectors_list; /* Work of parity check and repair */ - struct btrfs_work work; + struct work_struct work; /* Mark the parity blocks which have data */ unsigned long *dbitmap; @@ -158,7 +158,7 @@ struct scrub_ctx { struct list_head csum_list; atomic_t cancel_req; int readonly; - int pages_per_bio; + int sectors_per_bio; /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; @@ -212,43 +212,43 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, static void scrub_recheck_block_checksum(struct scrub_block *sblock); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good); -static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, +static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, - int page_num, int force_write); + int sector_num, int force_write); static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); -static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, - int page_num); +static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, + int sector_num); static int scrub_checksum_data(struct scrub_block *sblock); static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); -static void scrub_page_get(struct scrub_page *spage); -static void scrub_page_put(struct scrub_page *spage); +static void scrub_sector_get(struct scrub_sector *sector); +static void scrub_sector_put(struct scrub_sector *sector); static void scrub_parity_get(struct scrub_parity *sparity); static void scrub_parity_put(struct scrub_parity *sparity); -static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, - u64 physical_for_dev_replace); +static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, + u64 physical_for_dev_replace); static void 
scrub_bio_end_io(struct bio *bio); -static void scrub_bio_end_io_worker(struct btrfs_work *work); +static void scrub_bio_end_io_worker(struct work_struct *work); static void scrub_block_complete(struct scrub_block *sblock); -static void scrub_remap_extent(struct btrfs_fs_info *fs_info, - u64 extent_logical, u32 extent_len, - u64 *extent_physical, - struct btrfs_device **extent_dev, - int *extent_mirror_num); -static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, - struct scrub_page *spage); +static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, + u64 extent_logical, u32 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num); +static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_sector *sector); static void scrub_wr_submit(struct scrub_ctx *sctx); static void scrub_wr_bio_end_io(struct bio *bio); -static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static void scrub_wr_bio_end_io_worker(struct work_struct *work); static void scrub_put_ctx(struct scrub_ctx *sctx); -static inline int scrub_is_page_on_raid56(struct scrub_page *spage) +static inline int scrub_is_page_on_raid56(struct scrub_sector *sector) { - return spage->recover && - (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); + return sector->recover && + (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); } static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -535,9 +535,9 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; - for (i = 0; i < sbio->page_count; i++) { - WARN_ON(!sbio->pagev[i]->page); - scrub_block_put(sbio->pagev[i]->sblock); + for (i = 0; i < sbio->sector_count; i++) { + WARN_ON(!sbio->sectors[i]->page); + scrub_block_put(sbio->sectors[i]->sblock); } bio_put(sbio->bio); } @@ -572,7 +572,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( goto nomem; refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_bio = SCRUB_PAGES_PER_BIO; + sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO; sctx->curr = -1; sctx->fs_info = fs_info; INIT_LIST_HEAD(&sctx->csum_list); @@ -586,9 +586,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sbio->index = i; sbio->sctx = sctx; - sbio->page_count = 0; - btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL, - NULL); + sbio->sector_count = 0; + INIT_WORK(&sbio->work, scrub_bio_end_io_worker); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -728,16 +727,16 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) u8 ref_level = 0; int ret; - WARN_ON(sblock->page_count < 1); - dev = sblock->pagev[0]->dev; + WARN_ON(sblock->sector_count < 1); + dev = sblock->sectors[0]->dev; fs_info = sblock->sctx->fs_info; path = btrfs_alloc_path(); if (!path) return; - swarn.physical = sblock->pagev[0]->physical; - swarn.logical = sblock->pagev[0]->logical; + swarn.physical = sblock->sectors[0]->physical; + swarn.logical = sblock->sectors[0]->logical; swarn.errstr = errstr; swarn.dev = NULL; @@ -798,8 +797,8 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, /* * scrub_handle_errored_block gets called when either verification of the - * pages failed or the bio failed to read, e.g. with EIO. In the latter - * case, this function handles all pages in the bio, even though only one + * sectors failed or the bio failed to read, e.g. with EIO. 
In the latter + * case, this function handles all sectors in the bio, even though only one * may be bad. * The goal of this function is to repair the errored block by using the * contents of one of the mirrors. @@ -817,16 +816,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) struct scrub_block *sblock_bad; int ret; int mirror_index; - int page_num; + int sector_num; int success; bool full_stripe_locked; unsigned int nofs_flag; static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - BUG_ON(sblock_to_check->page_count < 1); + BUG_ON(sblock_to_check->sector_count < 1); fs_info = sctx->fs_info; - if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { /* * if we find an error in a super block, we just report it. * They will get written with the next transaction commit @@ -837,13 +836,13 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_unlock(&sctx->stat_lock); return 0; } - logical = sblock_to_check->pagev[0]->logical; - BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); - failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; - is_metadata = !(sblock_to_check->pagev[0]->flags & + logical = sblock_to_check->sectors[0]->logical; + BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1); + failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1; + is_metadata = !(sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock_to_check->pagev[0]->have_csum; - dev = sblock_to_check->pagev[0]->dev; + have_csum = sblock_to_check->sectors[0]->have_csum; + dev = sblock_to_check->sectors[0]->dev; if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) return 0; @@ -854,7 +853,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * might be waiting the scrub task to pause (which needs to wait for all * the worker tasks to complete before pausing). * We do allocations in the workers through insert_full_stripe_lock() - * and scrub_add_page_to_wr_bio(), which happens down the call chain of + * and scrub_add_sector_to_wr_bio(), which happens down the call chain of * this function. */ nofs_flag = memalloc_nofs_save(); @@ -918,7 +917,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto out; } - /* setup the context, map the logical blocks and alloc the pages */ + /* Setup the context, map the logical blocks and alloc the sectors */ ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); @@ -937,7 +936,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) { /* - * the error disappeared after reading page by page, or + * The error disappeared after reading sector by sector, or * the area was part of a huge bio and other parts of the * bio caused I/O errors, or the block layer merged several * read requests into one and the error is caused by a @@ -998,10 +997,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * that is known to contain an error is rewritten. Afterwards * the block is known to be corrected. 
* If a mirror is found which is completely correct, and no - * checksum is present, only those pages are rewritten that had + * checksum is present, only those sectors are rewritten that had * an I/O error in the block to be repaired, since it cannot be - * determined, which copy of the other pages is better (and it - * could happen otherwise that a correct page would be + * determined, which copy of the other sectors is better (and it + * could happen otherwise that a correct sector would be * overwritten by a bad one). */ for (mirror_index = 0; ;mirror_index++) { @@ -1011,25 +1010,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) continue; /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */ - if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) { + if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) { if (mirror_index >= BTRFS_MAX_MIRRORS) break; - if (!sblocks_for_recheck[mirror_index].page_count) + if (!sblocks_for_recheck[mirror_index].sector_count) break; sblock_other = sblocks_for_recheck + mirror_index; } else { - struct scrub_recover *r = sblock_bad->pagev[0]->recover; + struct scrub_recover *r = sblock_bad->sectors[0]->recover; int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; if (mirror_index >= max_allowed) break; - if (!sblocks_for_recheck[1].page_count) + if (!sblocks_for_recheck[1].sector_count) break; ASSERT(failed_mirror_index == 0); sblock_other = sblocks_for_recheck + 1; - sblock_other->pagev[0]->mirror_num = 1 + mirror_index; + sblock_other->sectors[0]->mirror_num = 1 + mirror_index; } /* build and submit the bios, check checksums */ @@ -1078,16 +1077,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * area are unreadable. */ success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; - page_num++) { - struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; + for (sector_num = 0; sector_num < sblock_bad->sector_count; + sector_num++) { + struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; struct scrub_block *sblock_other = NULL; - /* skip no-io-error page in scrub */ - if (!spage_bad->io_error && !sctx->is_dev_replace) + /* Skip no-io-error sectors in scrub */ + if (!sector_bad->io_error && !sctx->is_dev_replace) continue; - if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) { + if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) { /* * In case of dev replace, if raid56 rebuild process * didn't work out correct data, then copy the content @@ -1096,14 +1095,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * sblock_for_recheck array to target device. */ sblock_other = NULL; - } else if (spage_bad->io_error) { - /* try to find no-io-error page in mirrors */ + } else if (sector_bad->io_error) { + /* Try to find no-io-error sector in mirrors */ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; + sblocks_for_recheck[mirror_index].sector_count > 0; mirror_index++) { if (!sblocks_for_recheck[mirror_index]. - pagev[page_num]->io_error) { + sectors[sector_num]->io_error) { sblock_other = sblocks_for_recheck + mirror_index; break; @@ -1115,27 +1114,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (sctx->is_dev_replace) { /* - * did not find a mirror to fetch the page - * from. 
scrub_write_page_to_dev_replace() - * handles this case (page->io_error), by - * filling the block with zeros before - * submitting the write request + * Did not find a mirror to fetch the sector from. + * scrub_write_sector_to_dev_replace() handles this + * case (sector->io_error), by filling the block with + * zeros before submitting the write request */ if (!sblock_other) sblock_other = sblock_bad; - if (scrub_write_page_to_dev_replace(sblock_other, - page_num) != 0) { + if (scrub_write_sector_to_dev_replace(sblock_other, + sector_num) != 0) { atomic64_inc( &fs_info->dev_replace.num_write_errors); success = 0; } } else if (sblock_other) { - ret = scrub_repair_page_from_good_copy(sblock_bad, - sblock_other, - page_num, 0); + ret = scrub_repair_sector_from_good_copy(sblock_bad, + sblock_other, + sector_num, 0); if (0 == ret) - spage_bad->io_error = 0; + sector_bad->io_error = 0; else success = 0; } @@ -1186,18 +1184,16 @@ out: struct scrub_block *sblock = sblocks_for_recheck + mirror_index; struct scrub_recover *recover; - int page_index; + int i; - for (page_index = 0; page_index < sblock->page_count; - page_index++) { - sblock->pagev[page_index]->sblock = NULL; - recover = sblock->pagev[page_index]->recover; + for (i = 0; i < sblock->sector_count; i++) { + sblock->sectors[i]->sblock = NULL; + recover = sblock->sectors[i]->recover; if (recover) { scrub_put_recover(fs_info, recover); - sblock->pagev[page_index]->recover = - NULL; + sblock->sectors[i]->recover = NULL; } - scrub_page_put(sblock->pagev[page_index]); + scrub_sector_put(sblock->sectors[i]); } } kfree(sblocks_for_recheck); @@ -1255,26 +1251,25 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, { struct scrub_ctx *sctx = original_sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = original_sblock->page_count * fs_info->sectorsize; - u64 logical = original_sblock->pagev[0]->logical; - u64 generation = original_sblock->pagev[0]->generation; - u64 flags = original_sblock->pagev[0]->flags; - u64 have_csum = original_sblock->pagev[0]->have_csum; + u64 length = original_sblock->sector_count << fs_info->sectorsize_bits; + u64 logical = original_sblock->sectors[0]->logical; + u64 generation = original_sblock->sectors[0]->generation; + u64 flags = original_sblock->sectors[0]->flags; + u64 have_csum = original_sblock->sectors[0]->have_csum; struct scrub_recover *recover; struct btrfs_io_context *bioc; u64 sublen; u64 mapped_length; u64 stripe_offset; int stripe_index; - int page_index = 0; + int sector_index = 0; int mirror_index; int nmirrors; int ret; /* - * note: the two members refs and outstanding_pages - * are not used (and not set) in the blocks that are used for - * the recheck procedure + * Note: the two members refs and outstanding_sectors are not used (and + * not set) in the blocks that are used for the recheck procedure. 
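[Editor's illustration] The repair loop in scrub_handle_errored_block() above decides, sector by sector, where to copy good data from: errored sectors of the bad copy are matched against the same sector index in each rechecked mirror, and the first mirror whose sector read cleanly wins. A compact sketch of that selection is below; the demo_* types and function are hypothetical stand-ins, not btrfs structures.

#include <stdbool.h>

/* Stand-ins for the per-sector state this patch tracks. */
struct demo_sector_state {
	bool io_error;
};

struct demo_block {
	int sector_count;
	struct demo_sector_state sectors[16];
};

/*
 * Illustrative only: for one errored sector, pick the first mirror
 * whose matching sector read without an I/O error.  Returns the mirror
 * index to copy from, or -1 if no mirror qualifies.
 */
static int demo_pick_good_mirror(const struct demo_block *mirrors,
				 int nr_mirrors, int sector_num)
{
	for (int m = 0; m < nr_mirrors; m++) {
		if (sector_num < mirrors[m].sector_count &&
		    !mirrors[m].sectors[sector_num].io_error)
			return m;
	}
	return -1;
}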
*/ while (length > 0) { @@ -1306,20 +1301,20 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, recover->bioc = bioc; recover->map_length = mapped_length; - ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK); nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { struct scrub_block *sblock; - struct scrub_page *spage; + struct scrub_sector *sector; sblock = sblocks_for_recheck + mirror_index; sblock->sctx = sctx; - spage = kzalloc(sizeof(*spage), GFP_NOFS); - if (!spage) { + sector = kzalloc(sizeof(*sector), GFP_NOFS); + if (!sector) { leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -1327,16 +1322,16 @@ leave_nomem: scrub_put_recover(fs_info, recover); return -ENOMEM; } - scrub_page_get(spage); - sblock->pagev[page_index] = spage; - spage->sblock = sblock; - spage->flags = flags; - spage->generation = generation; - spage->logical = logical; - spage->have_csum = have_csum; + scrub_sector_get(sector); + sblock->sectors[sector_index] = sector; + sector->sblock = sblock; + sector->flags = flags; + sector->generation = generation; + sector->logical = logical; + sector->have_csum = have_csum; if (have_csum) - memcpy(spage->csum, - original_sblock->pagev[0]->csum, + memcpy(sector->csum, + original_sblock->sectors[0]->csum, sctx->fs_info->csum_size); scrub_stripe_index_and_offset(logical, @@ -1348,28 +1343,28 @@ leave_nomem: mirror_index, &stripe_index, &stripe_offset); - spage->physical = bioc->stripes[stripe_index].physical + + sector->physical = bioc->stripes[stripe_index].physical + stripe_offset; - spage->dev = bioc->stripes[stripe_index].dev; + sector->dev = bioc->stripes[stripe_index].dev; - BUG_ON(page_index >= original_sblock->page_count); - spage->physical_for_dev_replace = - original_sblock->pagev[page_index]-> + BUG_ON(sector_index >= original_sblock->sector_count); + sector->physical_for_dev_replace = + original_sblock->sectors[sector_index]-> physical_for_dev_replace; - /* for missing devices, dev->bdev is NULL */ - spage->mirror_num = mirror_index + 1; - sblock->page_count++; - spage->page = alloc_page(GFP_NOFS); - if (!spage->page) + /* For missing devices, dev->bdev is NULL */ + sector->mirror_num = mirror_index + 1; + sblock->sector_count++; + sector->page = alloc_page(GFP_NOFS); + if (!sector->page) goto leave_nomem; scrub_get_recover(recover); - spage->recover = recover; + sector->recover = recover; } scrub_put_recover(fs_info, recover); length -= sublen; logical += sublen; - page_index++; + sector_index++; } return 0; @@ -1382,19 +1377,19 @@ static void scrub_bio_wait_endio(struct bio *bio) static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, struct bio *bio, - struct scrub_page *spage) + struct scrub_sector *sector) { DECLARE_COMPLETION_ONSTACK(done); int ret; int mirror_num; - bio->bi_iter.bi_sector = spage->logical >> 9; + bio->bi_iter.bi_sector = sector->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - mirror_num = spage->sblock->pagev[0]->mirror_num; - ret = raid56_parity_recover(bio, spage->recover->bioc, - spage->recover->map_length, + mirror_num = sector->sblock->sectors[0]->mirror_num; + ret = raid56_parity_recover(bio, sector->recover->bioc, + sector->recover->map_length, mirror_num, 0); if (ret) return ret; @@ -1406,26 +1401,25 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, static void 
scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, struct scrub_block *sblock) { - struct scrub_page *first_page = sblock->pagev[0]; + struct scrub_sector *first_sector = sblock->sectors[0]; struct bio *bio; - int page_num; + int i; - /* All pages in sblock belong to the same stripe on the same device. */ - ASSERT(first_page->dev); - if (!first_page->dev->bdev) + /* All sectors in sblock belong to the same stripe on the same device. */ + ASSERT(first_sector->dev); + if (!first_sector->dev->bdev) goto out; - bio = btrfs_bio_alloc(BIO_MAX_VECS); - bio_set_dev(bio, first_page->dev->bdev); + bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); - for (page_num = 0; page_num < sblock->page_count; page_num++) { - struct scrub_page *spage = sblock->pagev[page_num]; + for (i = 0; i < sblock->sector_count; i++) { + struct scrub_sector *sector = sblock->sectors[i]; - WARN_ON(!spage->page); - bio_add_page(bio, spage->page, PAGE_SIZE, 0); + WARN_ON(!sector->page); + bio_add_page(bio, sector->page, PAGE_SIZE, 0); } - if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) { + if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) { bio_put(bio); goto out; } @@ -1436,65 +1430,63 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, return; out: - for (page_num = 0; page_num < sblock->page_count; page_num++) - sblock->pagev[page_num]->io_error = 1; + for (i = 0; i < sblock->sector_count; i++) + sblock->sectors[i]->io_error = 1; sblock->no_io_error_seen = 0; } /* - * this function will check the on disk data for checksum errors, header - * errors and read I/O errors. If any I/O errors happen, the exact pages - * which are errored are marked as being bad. The goal is to enable scrub - * to take those pages that are not errored from all the mirrors so that - * the pages that are errored in the just handled mirror can be repaired. + * This function will check the on disk data for checksum errors, header errors + * and read I/O errors. If any I/O errors happen, the exact sectors which are + * errored are marked as being bad. The goal is to enable scrub to take those + * sectors that are not errored from all the mirrors so that the sectors that + * are errored in the just handled mirror can be repaired. 
*/ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int retry_failed_mirror) { - int page_num; + int i; sblock->no_io_error_seen = 1; /* short cut for raid56 */ - if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0])) + if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0])) return scrub_recheck_block_on_raid56(fs_info, sblock); - for (page_num = 0; page_num < sblock->page_count; page_num++) { - struct bio *bio; - struct scrub_page *spage = sblock->pagev[page_num]; + for (i = 0; i < sblock->sector_count; i++) { + struct scrub_sector *sector = sblock->sectors[i]; + struct bio bio; + struct bio_vec bvec; - if (spage->dev->bdev == NULL) { - spage->io_error = 1; + if (sector->dev->bdev == NULL) { + sector->io_error = 1; sblock->no_io_error_seen = 0; continue; } - WARN_ON(!spage->page); - bio = btrfs_bio_alloc(1); - bio_set_dev(bio, spage->dev->bdev); - - bio_add_page(bio, spage->page, fs_info->sectorsize, 0); - bio->bi_iter.bi_sector = spage->physical >> 9; - bio->bi_opf = REQ_OP_READ; + WARN_ON(!sector->page); + bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ); + bio_add_page(&bio, sector->page, fs_info->sectorsize, 0); + bio.bi_iter.bi_sector = sector->physical >> 9; - if (btrfsic_submit_bio_wait(bio)) { - spage->io_error = 1; + btrfsic_check_bio(&bio); + if (submit_bio_wait(&bio)) { + sector->io_error = 1; sblock->no_io_error_seen = 0; } - bio_put(bio); + bio_uninit(&bio); } if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); } -static inline int scrub_check_fsid(u8 fsid[], - struct scrub_page *spage) +static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector) { - struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices; + struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices; int ret; ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); @@ -1507,7 +1499,7 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock) sblock->checksum_error = 0; sblock->generation_error = 0; - if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA) + if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA) scrub_checksum_data(sblock); else scrub_checksum_tree_block(sblock); @@ -1516,15 +1508,14 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock) static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good) { - int page_num; + int i; int ret = 0; - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + for (i = 0; i < sblock_bad->sector_count; i++) { int ret_sub; - ret_sub = scrub_repair_page_from_good_copy(sblock_bad, - sblock_good, - page_num, 1); + ret_sub = scrub_repair_sector_from_good_copy(sblock_bad, + sblock_good, i, 1); if (ret_sub) ret = ret_sub; } @@ -1532,47 +1523,43 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, return ret; } -static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int page_num, int force_write) +static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int sector_num, int force_write) { - struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; - struct scrub_page *spage_good = sblock_good->pagev[page_num]; + struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; + struct scrub_sector *sector_good = sblock_good->sectors[sector_num]; struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; const u32 
sectorsize = fs_info->sectorsize; - BUG_ON(spage_bad->page == NULL); - BUG_ON(spage_good->page == NULL); + BUG_ON(sector_bad->page == NULL); + BUG_ON(sector_good->page == NULL); if (force_write || sblock_bad->header_error || - sblock_bad->checksum_error || spage_bad->io_error) { - struct bio *bio; + sblock_bad->checksum_error || sector_bad->io_error) { + struct bio bio; + struct bio_vec bvec; int ret; - if (!spage_bad->dev->bdev) { + if (!sector_bad->dev->bdev) { btrfs_warn_rl(fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); return -EIO; } - bio = btrfs_bio_alloc(1); - bio_set_dev(bio, spage_bad->dev->bdev); - bio->bi_iter.bi_sector = spage_bad->physical >> 9; - bio->bi_opf = REQ_OP_WRITE; + bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); + bio.bi_iter.bi_sector = sector_bad->physical >> 9; + __bio_add_page(&bio, sector_good->page, sectorsize, 0); - ret = bio_add_page(bio, spage_good->page, sectorsize, 0); - if (ret != sectorsize) { - bio_put(bio); - return -EIO; - } + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + bio_uninit(&bio); - if (btrfsic_submit_bio_wait(bio)) { - btrfs_dev_stat_inc_and_print(spage_bad->dev, + if (ret) { + btrfs_dev_stat_inc_and_print(sector_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); atomic64_inc(&fs_info->dev_replace.num_write_errors); - bio_put(bio); return -EIO; } - bio_put(bio); } return 0; @@ -1581,7 +1568,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) { struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - int page_num; + int i; /* * This block is used for the check of the parity on the source device, @@ -1590,25 +1577,24 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) if (sblock->sparity) return; - for (page_num = 0; page_num < sblock->page_count; page_num++) { + for (i = 0; i < sblock->sector_count; i++) { int ret; - ret = scrub_write_page_to_dev_replace(sblock, page_num); + ret = scrub_write_sector_to_dev_replace(sblock, i); if (ret) atomic64_inc(&fs_info->dev_replace.num_write_errors); } } -static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, - int page_num) +static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num) { - struct scrub_page *spage = sblock->pagev[page_num]; + struct scrub_sector *sector = sblock->sectors[sector_num]; - BUG_ON(spage->page == NULL); - if (spage->io_error) - clear_page(page_address(spage->page)); + BUG_ON(sector->page == NULL); + if (sector->io_error) + clear_page(page_address(sector->page)); - return scrub_add_page_to_wr_bio(sblock->sctx, spage); + return scrub_add_sector_to_wr_bio(sblock->sctx, sector); } static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) @@ -1633,8 +1619,8 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) return ret; } -static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, - struct scrub_page *spage) +static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_sector *sector) { struct scrub_bio *sbio; int ret; @@ -1650,45 +1636,38 @@ again: return -ENOMEM; } sctx->wr_curr_bio->sctx = sctx; - sctx->wr_curr_bio->page_count = 0; + sctx->wr_curr_bio->sector_count = 0; } sbio = sctx->wr_curr_bio; - if (sbio->page_count == 0) { - struct bio *bio; - - ret = fill_writer_pointer_gap(sctx, - spage->physical_for_dev_replace); + if (sbio->sector_count == 0) { + ret = fill_writer_pointer_gap(sctx, 
sector->physical_for_dev_replace); if (ret) { mutex_unlock(&sctx->wr_lock); return ret; } - sbio->physical = spage->physical_for_dev_replace; - sbio->logical = spage->logical; + sbio->physical = sector->physical_for_dev_replace; + sbio->logical = sector->logical; sbio->dev = sctx->wr_tgtdev; - bio = sbio->bio; - if (!bio) { - bio = btrfs_bio_alloc(sctx->pages_per_bio); - sbio->bio = bio; + if (!sbio->bio) { + sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, + REQ_OP_WRITE, GFP_NOFS); } - - bio->bi_private = sbio; - bio->bi_end_io = scrub_wr_bio_end_io; - bio_set_dev(bio, sbio->dev->bdev); - bio->bi_iter.bi_sector = sbio->physical >> 9; - bio->bi_opf = REQ_OP_WRITE; + sbio->bio->bi_private = sbio; + sbio->bio->bi_end_io = scrub_wr_bio_end_io; + sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * sectorsize != - spage->physical_for_dev_replace || - sbio->logical + sbio->page_count * sectorsize != - spage->logical) { + } else if (sbio->physical + sbio->sector_count * sectorsize != + sector->physical_for_dev_replace || + sbio->logical + sbio->sector_count * sectorsize != + sector->logical) { scrub_wr_submit(sctx); goto again; } - ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); if (ret != sectorsize) { - if (sbio->page_count < 1) { + if (sbio->sector_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; mutex_unlock(&sctx->wr_lock); @@ -1698,10 +1677,10 @@ again: goto again; } - sbio->pagev[sbio->page_count] = spage; - scrub_page_get(spage); - sbio->page_count++; - if (sbio->page_count == sctx->pages_per_bio) + sbio->sectors[sbio->sector_count] = sector; + scrub_sector_get(sector); + sbio->sector_count++; + if (sbio->sector_count == sctx->sectors_per_bio) scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); @@ -1717,16 +1696,16 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) sbio = sctx->wr_curr_bio; sctx->wr_curr_bio = NULL; - WARN_ON(!sbio->bio->bi_bdev); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. 
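[Editor's illustration] The scrub_add_sector_to_wr_bio() hunk above batches sectors into the current write bio only while each new sector continues the run both physically and logically (start + sector_count * sectorsize); anything discontiguous flushes the bio and starts a new one, and a full bio (sectors_per_bio) is flushed as well. A minimal sketch of that contiguity test follows; the demo_* names are hypothetical.

#include <stdbool.h>
#include <stdint.h>

/* Stand-in for the scrub_bio fields the contiguity check uses. */
struct demo_wr_bio {
	uint64_t physical;	/* start of the run on the target device */
	uint64_t logical;	/* start of the run in logical address space */
	int sector_count;	/* sectors already queued in this bio */
};

/*
 * Illustrative only: a sector may join the current bio only if it
 * continues the run both physically and logically, as checked above.
 */
static bool demo_sector_is_contiguous(const struct demo_wr_bio *sbio,
				      uint64_t physical, uint64_t logical,
				      uint32_t sectorsize)
{
	uint64_t queued = (uint64_t)sbio->sector_count * sectorsize;

	return sbio->physical + queued == physical &&
	       sbio->logical + queued == logical;
}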
Then the block layer * orders the requests before sending them to the driver which * doubled the write performance on spinning disks when measured * with Linux 3.5 */ - btrfsic_submit_bio(sbio->bio); + btrfsic_check_bio(sbio->bio); + submit_bio(sbio->bio); if (btrfs_is_zoned(sctx->fs_info)) - sctx->write_pointer = sbio->physical + sbio->page_count * + sctx->write_pointer = sbio->physical + sbio->sector_count * sctx->fs_info->sectorsize; } @@ -1738,31 +1717,31 @@ static void scrub_wr_bio_end_io(struct bio *bio) sbio->status = bio->bi_status; sbio->bio = bio; - btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); - btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); + INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker); + queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } -static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) +static void scrub_wr_bio_end_io_worker(struct work_struct *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); struct scrub_ctx *sctx = sbio->sctx; int i; - ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); + ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; + for (i = 0; i < sbio->sector_count; i++) { + struct scrub_sector *sector = sbio->sectors[i]; - spage->io_error = 1; + sector->io_error = 1; atomic64_inc(&dev_replace->num_write_errors); } } - for (i = 0; i < sbio->page_count; i++) - scrub_page_put(sbio->pagev[i]); + for (i = 0; i < sbio->sector_count; i++) + scrub_sector_put(sbio->sectors[i]); bio_put(sbio->bio); kfree(sbio); @@ -1786,8 +1765,8 @@ static int scrub_checksum(struct scrub_block *sblock) sblock->generation_error = 0; sblock->checksum_error = 0; - WARN_ON(sblock->page_count < 1); - flags = sblock->pagev[0]->flags; + WARN_ON(sblock->sector_count < 1); + flags = sblock->sectors[0]->flags; ret = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) ret = scrub_checksum_data(sblock); @@ -1809,26 +1788,26 @@ static int scrub_checksum_data(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 csum[BTRFS_CSUM_SIZE]; - struct scrub_page *spage; + struct scrub_sector *sector; char *kaddr; - BUG_ON(sblock->page_count < 1); - spage = sblock->pagev[0]; - if (!spage->have_csum) + BUG_ON(sblock->sector_count < 1); + sector = sblock->sectors[0]; + if (!sector->have_csum) return 0; - kaddr = page_address(spage->page); + kaddr = page_address(sector->page); shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); /* - * In scrub_pages() and scrub_pages_for_parity() we ensure each spage + * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector * only contains one sector of data. 
*/ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - if (memcmp(csum, spage->csum, fs_info->csum_size)) + if (memcmp(csum, sector->csum, fs_info->csum_size)) sblock->checksum_error = 1; return sblock->checksum_error; } @@ -1849,16 +1828,16 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) const u32 sectorsize = sctx->fs_info->sectorsize; const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits; int i; - struct scrub_page *spage; + struct scrub_sector *sector; char *kaddr; - BUG_ON(sblock->page_count < 1); + BUG_ON(sblock->sector_count < 1); - /* Each member in pagev is just one block, not a full page */ - ASSERT(sblock->page_count == num_sectors); + /* Each member in sectors is just one sector */ + ASSERT(sblock->sector_count == num_sectors); - spage = sblock->pagev[0]; - kaddr = page_address(spage->page); + sector = sblock->sectors[0]; + kaddr = page_address(sector->page); h = (struct btrfs_header *)kaddr; memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); @@ -1867,15 +1846,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (spage->logical != btrfs_stack_header_bytenr(h)) + if (sector->logical != btrfs_stack_header_bytenr(h)) sblock->header_error = 1; - if (spage->generation != btrfs_stack_header_generation(h)) { + if (sector->generation != btrfs_stack_header_generation(h)) { sblock->header_error = 1; sblock->generation_error = 1; } - if (!scrub_check_fsid(h->fsid, spage)) + if (!scrub_check_fsid(h->fsid, sector)) sblock->header_error = 1; if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, @@ -1888,7 +1867,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) sectorsize - BTRFS_CSUM_SIZE); for (i = 1; i < num_sectors; i++) { - kaddr = page_address(sblock->pagev[i]->page); + kaddr = page_address(sblock->sectors[i]->page); crypto_shash_update(shash, kaddr, sectorsize); } @@ -1906,23 +1885,23 @@ static int scrub_checksum_super(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; - struct scrub_page *spage; + struct scrub_sector *sector; char *kaddr; int fail_gen = 0; int fail_cor = 0; - BUG_ON(sblock->page_count < 1); - spage = sblock->pagev[0]; - kaddr = page_address(spage->page); + BUG_ON(sblock->sector_count < 1); + sector = sblock->sectors[0]; + kaddr = page_address(sector->page); s = (struct btrfs_super_block *)kaddr; - if (spage->logical != btrfs_super_bytenr(s)) + if (sector->logical != btrfs_super_bytenr(s)) ++fail_cor; - if (spage->generation != btrfs_super_generation(s)) + if (sector->generation != btrfs_super_generation(s)) ++fail_gen; - if (!scrub_check_fsid(s->fsid, spage)) + if (!scrub_check_fsid(s->fsid, sector)) ++fail_cor; shash->tfm = fs_info->csum_shash; @@ -1943,10 +1922,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); if (fail_cor) - btrfs_dev_stat_inc_and_print(spage->dev, + btrfs_dev_stat_inc_and_print(sector->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(spage->dev, + btrfs_dev_stat_inc_and_print(sector->dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -1966,23 +1945,23 @@ static void scrub_block_put(struct scrub_block *sblock) if (sblock->sparity) scrub_parity_put(sblock->sparity); - for (i = 0; i < sblock->page_count; i++) - scrub_page_put(sblock->pagev[i]); + for (i = 0; i < sblock->sector_count; 
i++) + scrub_sector_put(sblock->sectors[i]); kfree(sblock); } } -static void scrub_page_get(struct scrub_page *spage) +static void scrub_sector_get(struct scrub_sector *sector) { - atomic_inc(&spage->refs); + atomic_inc(&sector->refs); } -static void scrub_page_put(struct scrub_page *spage) +static void scrub_sector_put(struct scrub_sector *sector) { - if (atomic_dec_and_test(&spage->refs)) { - if (spage->page) - __free_page(spage->page); - kfree(spage); + if (atomic_dec_and_test(&sector->refs)) { + if (sector->page) + __free_page(sector->page); + kfree(sector); } } @@ -2057,13 +2036,14 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); - btrfsic_submit_bio(sbio->bio); + btrfsic_check_bio(sbio->bio); + submit_bio(sbio->bio); } -static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, - struct scrub_page *spage) +static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_sector *sector) { - struct scrub_block *sblock = spage->sblock; + struct scrub_block *sblock = sector->sblock; struct scrub_bio *sbio; const u32 sectorsize = sctx->fs_info->sectorsize; int ret; @@ -2078,7 +2058,7 @@ again: if (sctx->curr != -1) { sctx->first_free = sctx->bios[sctx->curr]->next_free; sctx->bios[sctx->curr]->next_free = -1; - sctx->bios[sctx->curr]->page_count = 0; + sctx->bios[sctx->curr]->sector_count = 0; spin_unlock(&sctx->list_lock); } else { spin_unlock(&sctx->list_lock); @@ -2086,37 +2066,31 @@ again: } } sbio = sctx->bios[sctx->curr]; - if (sbio->page_count == 0) { - struct bio *bio; - - sbio->physical = spage->physical; - sbio->logical = spage->logical; - sbio->dev = spage->dev; - bio = sbio->bio; - if (!bio) { - bio = btrfs_bio_alloc(sctx->pages_per_bio); - sbio->bio = bio; + if (sbio->sector_count == 0) { + sbio->physical = sector->physical; + sbio->logical = sector->logical; + sbio->dev = sector->dev; + if (!sbio->bio) { + sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, + REQ_OP_READ, GFP_NOFS); } - - bio->bi_private = sbio; - bio->bi_end_io = scrub_bio_end_io; - bio_set_dev(bio, sbio->dev->bdev); - bio->bi_iter.bi_sector = sbio->physical >> 9; - bio->bi_opf = REQ_OP_READ; + sbio->bio->bi_private = sbio; + sbio->bio->bi_end_io = scrub_bio_end_io; + sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * sectorsize != - spage->physical || - sbio->logical + sbio->page_count * sectorsize != - spage->logical || - sbio->dev != spage->dev) { + } else if (sbio->physical + sbio->sector_count * sectorsize != + sector->physical || + sbio->logical + sbio->sector_count * sectorsize != + sector->logical || + sbio->dev != sector->dev) { scrub_submit(sctx); goto again; } - sbio->pagev[sbio->page_count] = spage; - ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + sbio->sectors[sbio->sector_count] = sector; + ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); if (ret != sectorsize) { - if (sbio->page_count < 1) { + if (sbio->sector_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; return -EIO; @@ -2126,9 +2100,9 @@ again: } scrub_block_get(sblock); /* one for the page added to the bio */ - atomic_inc(&sblock->outstanding_pages); - sbio->page_count++; - if (sbio->page_count == sctx->pages_per_bio) + atomic_inc(&sblock->outstanding_sectors); + sbio->sector_count++; + if (sbio->sector_count == sctx->sectors_per_bio) scrub_submit(sctx); return 0; @@ -2144,10 +2118,10 @@ static void scrub_missing_raid56_end_io(struct bio *bio) 
bio_put(bio); - btrfs_queue_work(fs_info->scrub_workers, &sblock->work); + queue_work(fs_info->scrub_workers, &sblock->work); } -static void scrub_missing_raid56_worker(struct btrfs_work *work) +static void scrub_missing_raid56_worker(struct work_struct *work) { struct scrub_block *sblock = container_of(work, struct scrub_block, work); struct scrub_ctx *sctx = sblock->sctx; @@ -2155,8 +2129,8 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) u64 logical; struct btrfs_device *dev; - logical = sblock->pagev[0]->logical; - dev = sblock->pagev[0]->dev; + logical = sblock->sectors[0]->logical; + dev = sblock->sectors[0]->dev; if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); @@ -2193,8 +2167,8 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = sblock->page_count * PAGE_SIZE; - u64 logical = sblock->pagev[0]->logical; + u64 length = sblock->sector_count << fs_info->sectorsize_bits; + u64 logical = sblock->sectors[0]->logical; struct btrfs_io_context *bioc = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; @@ -2213,12 +2187,12 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) * We shouldn't be scrubbing a missing device. Even for dev * replace, we should only get here for RAID 5/6. We either * managed to mount something with no mirrors remaining or - * there's a bug in scrub_remap_extent()/btrfs_map_block(). + * there's a bug in scrub_find_good_copy()/btrfs_map_block(). */ goto bioc_out; } - bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = logical >> 9; bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; @@ -2227,13 +2201,17 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) if (!rbio) goto rbio_out; - for (i = 0; i < sblock->page_count; i++) { - struct scrub_page *spage = sblock->pagev[i]; + for (i = 0; i < sblock->sector_count; i++) { + struct scrub_sector *sector = sblock->sectors[i]; - raid56_add_scrub_pages(rbio, spage->page, spage->logical); + /* + * For now, our scrub is still one page per sector, so pgoff + * is always 0. + */ + raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical); } - btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL); + INIT_WORK(&sblock->work, scrub_missing_raid56_worker); scrub_block_get(sblock); scrub_pending_bio_inc(sctx); raid56_submit_missing_rbio(rbio); @@ -2249,7 +2227,7 @@ bioc_out: spin_unlock(&sctx->stat_lock); } -static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, +static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, u64 physical_for_dev_replace) @@ -2273,7 +2251,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, sblock->no_io_error_seen = 1; for (index = 0; len > 0; index++) { - struct scrub_page *spage; + struct scrub_sector *sector; /* * Here we will allocate one page for one sector to scrub. 
* This is fine if PAGE_SIZE == sectorsize, but will cost @@ -2281,8 +2259,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, */ u32 l = min(sectorsize, len); - spage = kzalloc(sizeof(*spage), GFP_KERNEL); - if (!spage) { + sector = kzalloc(sizeof(*sector), GFP_KERNEL); + if (!sector) { leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2290,26 +2268,26 @@ leave_nomem: scrub_block_put(sblock); return -ENOMEM; } - ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); - scrub_page_get(spage); - sblock->pagev[index] = spage; - spage->sblock = sblock; - spage->dev = dev; - spage->flags = flags; - spage->generation = gen; - spage->logical = logical; - spage->physical = physical; - spage->physical_for_dev_replace = physical_for_dev_replace; - spage->mirror_num = mirror_num; + ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); + scrub_sector_get(sector); + sblock->sectors[index] = sector; + sector->sblock = sblock; + sector->dev = dev; + sector->flags = flags; + sector->generation = gen; + sector->logical = logical; + sector->physical = physical; + sector->physical_for_dev_replace = physical_for_dev_replace; + sector->mirror_num = mirror_num; if (csum) { - spage->have_csum = 1; - memcpy(spage->csum, csum, sctx->fs_info->csum_size); + sector->have_csum = 1; + memcpy(sector->csum, csum, sctx->fs_info->csum_size); } else { - spage->have_csum = 0; + sector->have_csum = 0; } - sblock->page_count++; - spage->page = alloc_page(GFP_KERNEL); - if (!spage->page) + sblock->sector_count++; + sector->page = alloc_page(GFP_KERNEL); + if (!sector->page) goto leave_nomem; len -= l; logical += l; @@ -2317,7 +2295,7 @@ leave_nomem: physical_for_dev_replace += l; } - WARN_ON(sblock->page_count == 0); + WARN_ON(sblock->sector_count == 0); if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { /* * This case should only be hit for RAID 5/6 device replace. 
See @@ -2325,11 +2303,11 @@ leave_nomem: */ scrub_missing_raid56_pages(sblock); } else { - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev[index]; + for (index = 0; index < sblock->sector_count; index++) { + struct scrub_sector *sector = sblock->sectors[index]; int ret; - ret = scrub_add_page_to_rd_bio(sctx, spage); + ret = scrub_add_sector_to_rd_bio(sctx, sector); if (ret) { scrub_block_put(sblock); return ret; @@ -2353,31 +2331,31 @@ static void scrub_bio_end_io(struct bio *bio) sbio->status = bio->bi_status; sbio->bio = bio; - btrfs_queue_work(fs_info->scrub_workers, &sbio->work); + queue_work(fs_info->scrub_workers, &sbio->work); } -static void scrub_bio_end_io_worker(struct btrfs_work *work) +static void scrub_bio_end_io_worker(struct work_struct *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); struct scrub_ctx *sctx = sbio->sctx; int i; - ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); + ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); if (sbio->status) { - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; + for (i = 0; i < sbio->sector_count; i++) { + struct scrub_sector *sector = sbio->sectors[i]; - spage->io_error = 1; - spage->sblock->no_io_error_seen = 0; + sector->io_error = 1; + sector->sblock->no_io_error_seen = 0; } } - /* now complete the scrub_block items that have all pages completed */ - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; - struct scrub_block *sblock = spage->sblock; + /* Now complete the scrub_block items that have all pages completed */ + for (i = 0; i < sbio->sector_count; i++) { + struct scrub_sector *sector = sbio->sectors[i]; + struct scrub_block *sblock = sector->sblock; - if (atomic_dec_and_test(&sblock->outstanding_pages)) + if (atomic_dec_and_test(&sblock->outstanding_sectors)) scrub_block_complete(sblock); scrub_block_put(sblock); } @@ -2456,8 +2434,8 @@ static void scrub_block_complete(struct scrub_block *sblock) } if (sblock->sparity && corrupted && !sblock->data_corrected) { - u64 start = sblock->pagev[0]->logical; - u64 end = sblock->pagev[sblock->page_count - 1]->logical + + u64 start = sblock->sectors[0]->logical; + u64 end = sblock->sectors[sblock->sector_count - 1]->logical + sblock->sctx->fs_info->sectorsize; ASSERT(end - start <= U32_MAX); @@ -2532,8 +2510,11 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u64 physical_for_dev_replace) + u64 gen, int mirror_num) { + struct btrfs_device *src_dev = dev; + u64 src_physical = physical; + int src_mirror = mirror_num; int ret; u8 csum[BTRFS_CSUM_SIZE]; u32 blocksize; @@ -2561,6 +2542,18 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, WARN_ON(1); } + /* + * For dev-replace case, we can have @dev being a missing device. + * Regular scrub will avoid its execution on missing device at all, + * as that would trigger tons of read error. + * + * Reading from missing device will cause read error counts to + * increase unnecessarily. + * So here we change the read source to a good mirror. 
+ */ + if (sctx->is_dev_replace && !dev->bdev) + scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical, + &src_dev, &src_mirror); while (len) { u32 l = min(len, blocksize); int have_csum = 0; @@ -2571,20 +2564,20 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, if (have_csum == 0) ++sctx->stat.no_csum; } - ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, - mirror_num, have_csum ? csum : NULL, - physical_for_dev_replace); + ret = scrub_sectors(sctx, logical, l, src_physical, src_dev, + flags, gen, src_mirror, + have_csum ? csum : NULL, physical); if (ret) return ret; len -= l; logical += l; physical += l; - physical_for_dev_replace += l; + src_physical += l; } return 0; } -static int scrub_pages_for_parity(struct scrub_parity *sparity, +static int scrub_sectors_for_parity(struct scrub_parity *sparity, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum) @@ -2613,10 +2606,10 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, scrub_parity_get(sparity); for (index = 0; len > 0; index++) { - struct scrub_page *spage; + struct scrub_sector *sector; - spage = kzalloc(sizeof(*spage), GFP_KERNEL); - if (!spage) { + sector = kzalloc(sizeof(*sector), GFP_KERNEL); + if (!sector) { leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2624,29 +2617,29 @@ leave_nomem: scrub_block_put(sblock); return -ENOMEM; } - ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); /* For scrub block */ - scrub_page_get(spage); - sblock->pagev[index] = spage; + scrub_sector_get(sector); + sblock->sectors[index] = sector; /* For scrub parity */ - scrub_page_get(spage); - list_add_tail(&spage->list, &sparity->spages); - spage->sblock = sblock; - spage->dev = dev; - spage->flags = flags; - spage->generation = gen; - spage->logical = logical; - spage->physical = physical; - spage->mirror_num = mirror_num; + scrub_sector_get(sector); + list_add_tail(&sector->list, &sparity->sectors_list); + sector->sblock = sblock; + sector->dev = dev; + sector->flags = flags; + sector->generation = gen; + sector->logical = logical; + sector->physical = physical; + sector->mirror_num = mirror_num; if (csum) { - spage->have_csum = 1; - memcpy(spage->csum, csum, sctx->fs_info->csum_size); + sector->have_csum = 1; + memcpy(sector->csum, csum, sctx->fs_info->csum_size); } else { - spage->have_csum = 0; + sector->have_csum = 0; } - sblock->page_count++; - spage->page = alloc_page(GFP_KERNEL); - if (!spage->page) + sblock->sector_count++; + sector->page = alloc_page(GFP_KERNEL); + if (!sector->page) goto leave_nomem; @@ -2656,19 +2649,19 @@ leave_nomem: physical += sectorsize; } - WARN_ON(sblock->page_count == 0); - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev[index]; + WARN_ON(sblock->sector_count == 0); + for (index = 0; index < sblock->sector_count; index++) { + struct scrub_sector *sector = sblock->sectors[index]; int ret; - ret = scrub_add_page_to_rd_bio(sctx, spage); + ret = scrub_add_sector_to_rd_bio(sctx, sector); if (ret) { scrub_block_put(sblock); return ret; } } - /* last one frees, either here or in bio completion for last page */ + /* Last one frees, either here or in bio completion for last sector */ scrub_block_put(sblock); return 0; } @@ -2707,7 +2700,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, if (have_csum == 0) goto skip; } - ret = scrub_pages_for_parity(sparity, logical, l, 
physical, dev, + ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev, flags, gen, mirror_num, have_csum ? csum : NULL); if (ret) @@ -2767,7 +2760,7 @@ static int get_raid56_logic_offset(u64 physical, int num, static void scrub_free_parity(struct scrub_parity *sparity) { struct scrub_ctx *sctx = sparity->sctx; - struct scrub_page *curr, *next; + struct scrub_sector *curr, *next; int nbits; nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); @@ -2778,15 +2771,15 @@ static void scrub_free_parity(struct scrub_parity *sparity) spin_unlock(&sctx->stat_lock); } - list_for_each_entry_safe(curr, next, &sparity->spages, list) { + list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) { list_del_init(&curr->list); - scrub_page_put(curr); + scrub_sector_put(curr); } kfree(sparity); } -static void scrub_parity_bio_endio_worker(struct btrfs_work *work) +static void scrub_parity_bio_endio_worker(struct work_struct *work) { struct scrub_parity *sparity = container_of(work, struct scrub_parity, work); @@ -2798,7 +2791,7 @@ static void scrub_parity_bio_endio_worker(struct btrfs_work *work) static void scrub_parity_bio_endio(struct bio *bio) { - struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; + struct scrub_parity *sparity = bio->bi_private; struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; if (bio->bi_status) @@ -2807,9 +2800,8 @@ static void scrub_parity_bio_endio(struct bio *bio) bio_put(bio); - btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL, - NULL); - btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work); + INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker); + queue_work(fs_info->scrub_parity_workers, &sparity->work); } static void scrub_parity_check_and_repair(struct scrub_parity *sparity) @@ -2834,7 +2826,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) if (ret || !bioc || !bioc->raid_map) goto bioc_out; - bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = sparity->logic_start >> 9; bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; @@ -2882,6 +2874,251 @@ static void scrub_parity_put(struct scrub_parity *sparity) scrub_parity_check_and_repair(sparity); } +/* + * Return 0 if the extent item range covers any byte of the range. + * Return <0 if the extent item is before @search_start. + * Return >0 if the extent item is after @start_start + @search_len. + */ +static int compare_extent_item_range(struct btrfs_path *path, + u64 search_start, u64 search_len) +{ + struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info; + u64 len; + struct btrfs_key key; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY); + if (key.type == BTRFS_METADATA_ITEM_KEY) + len = fs_info->nodesize; + else + len = key.offset; + + if (key.objectid + len <= search_start) + return -1; + if (key.objectid >= search_start + search_len) + return 1; + return 0; +} + +/* + * Locate one extent item which covers any byte in range + * [@search_start, @search_start + @search_length) + * + * If the path is not initialized, we will initialize the search by doing + * a btrfs_search_slot(). + * If the path is already initialized, we will use the path as the initial + * slot, to avoid duplicated btrfs_search_slot() calls. + * + * NOTE: If an extent item starts before @search_start, we will still + * return the extent item. 
This is for data extent crossing stripe boundary. + * + * Return 0 if we found such extent item, and @path will point to the extent item. + * Return >0 if no such extent item can be found, and @path will be released. + * Return <0 if hit fatal error, and @path will be released. + */ +static int find_first_extent_item(struct btrfs_root *extent_root, + struct btrfs_path *path, + u64 search_start, u64 search_len) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct btrfs_key key; + int ret; + + /* Continue using the existing path */ + if (path->nodes[0]) + goto search_forward; + + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = search_start; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + ASSERT(ret > 0); + /* + * Here we intentionally pass 0 as @min_objectid, as there could be + * an extent item starting before @search_start. + */ + ret = btrfs_previous_extent_item(extent_root, path, 0); + if (ret < 0) + return ret; + /* + * No matter whether we have found an extent item, the next loop will + * properly do every check on the key. + */ +search_forward: + while (true) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid >= search_start + search_len) + break; + if (key.type != BTRFS_METADATA_ITEM_KEY && + key.type != BTRFS_EXTENT_ITEM_KEY) + goto next; + + ret = compare_extent_item_range(path, search_start, search_len); + if (ret == 0) + return ret; + if (ret > 0) + break; +next: + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(extent_root, path); + if (ret) { + /* Either no more item or fatal error */ + btrfs_release_path(path); + return ret; + } + } + } + btrfs_release_path(path); + return 1; +} + +static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, + u64 *size_ret, u64 *flags_ret, u64 *generation_ret) +{ + struct btrfs_key key; + struct btrfs_extent_item *ei; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || + key.type == BTRFS_EXTENT_ITEM_KEY); + *extent_start_ret = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY) + *size_ret = path->nodes[0]->fs_info->nodesize; + else + *size_ret = key.offset; + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); + *flags_ret = btrfs_extent_flags(path->nodes[0], ei); + *generation_ret = btrfs_extent_generation(path->nodes[0], ei); +} + +static bool does_range_cross_boundary(u64 extent_start, u64 extent_len, + u64 boundary_start, u64 boudary_len) +{ + return (extent_start < boundary_start && + extent_start + extent_len > boundary_start) || + (extent_start < boundary_start + boudary_len && + extent_start + extent_len > boundary_start + boudary_len); +} + +static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, + struct scrub_parity *sparity, + struct map_lookup *map, + struct btrfs_device *sdev, + struct btrfs_path *path, + u64 logical) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical); + u64 cur_logical = logical; + int ret; + + ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); + + /* Path must not be populated */ + ASSERT(!path->nodes[0]); + + while (cur_logical < logical + map->stripe_len) { + struct 
btrfs_io_context *bioc = NULL; + struct btrfs_device *extent_dev; + u64 extent_start; + u64 extent_size; + u64 mapped_length; + u64 extent_flags; + u64 extent_gen; + u64 extent_physical; + u64 extent_mirror_num; + + ret = find_first_extent_item(extent_root, path, cur_logical, + logical + map->stripe_len - cur_logical); + /* No more extent item in this data stripe */ + if (ret > 0) { + ret = 0; + break; + } + if (ret < 0) + break; + get_extent_info(path, &extent_start, &extent_size, &extent_flags, + &extent_gen); + + /* Metadata should not cross stripe boundaries */ + if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + does_range_cross_boundary(extent_start, extent_size, + logical, map->stripe_len)) { + btrfs_err(fs_info, + "scrub: tree block %llu spanning stripes, ignored. logical=%llu", + extent_start, logical); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + cur_logical += extent_size; + continue; + } + + /* Skip hole range which doesn't have any extent */ + cur_logical = max(extent_start, cur_logical); + + /* Truncate the range inside this data stripe */ + extent_size = min(extent_start + extent_size, + logical + map->stripe_len) - cur_logical; + extent_start = cur_logical; + ASSERT(extent_size <= U32_MAX); + + scrub_parity_mark_sectors_data(sparity, extent_start, extent_size); + + mapped_length = extent_size; + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start, + &mapped_length, &bioc, 0); + if (!ret && (!bioc || mapped_length < extent_size)) + ret = -EIO; + if (ret) { + btrfs_put_bioc(bioc); + scrub_parity_mark_sectors_error(sparity, extent_start, + extent_size); + break; + } + extent_physical = bioc->stripes[0].physical; + extent_mirror_num = bioc->mirror_num; + extent_dev = bioc->stripes[0].dev; + btrfs_put_bioc(bioc); + + ret = btrfs_lookup_csums_range(csum_root, extent_start, + extent_start + extent_size - 1, + &sctx->csum_list, 1); + if (ret) { + scrub_parity_mark_sectors_error(sparity, extent_start, + extent_size); + break; + } + + ret = scrub_extent_for_parity(sparity, extent_start, + extent_size, extent_physical, + extent_dev, extent_flags, + extent_gen, extent_mirror_num); + scrub_free_csums(sctx); + + if (ret) { + scrub_parity_mark_sectors_error(sparity, extent_start, + extent_size); + break; + } + + cond_resched(); + cur_logical += extent_size; + } + btrfs_release_path(path); + return ret; +} + static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *sdev, @@ -2889,28 +3126,12 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, u64 logic_end) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start); - struct btrfs_root *csum_root; - struct btrfs_extent_item *extent; - struct btrfs_io_context *bioc = NULL; struct btrfs_path *path; - u64 flags; + u64 cur_logical; int ret; - int slot; - struct extent_buffer *l; - struct btrfs_key key; - u64 generation; - u64 extent_logical; - u64 extent_physical; - /* Check the comment in scrub_stripe() for why u32 is enough here */ - u32 extent_len; - u64 mapped_length; - struct btrfs_device *extent_dev; struct scrub_parity *sparity; int nsectors; int bitmap_len; - int extent_mirror_num; - int stop_loop = 0; path = btrfs_alloc_path(); if (!path) { @@ -2943,178 +3164,19 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->logic_start = logic_start; sparity->logic_end = logic_end; 
refcount_set(&sparity->refs, 1); - INIT_LIST_HEAD(&sparity->spages); + INIT_LIST_HEAD(&sparity->sectors_list); sparity->dbitmap = sparity->bitmap; sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; ret = 0; - while (logic_start < logic_end) { - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) - key.type = BTRFS_METADATA_ITEM_KEY; - else - key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logic_start; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + for (cur_logical = logic_start; cur_logical < logic_end; + cur_logical += map->stripe_len) { + ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map, + sdev, path, cur_logical); if (ret < 0) - goto out; - - if (ret > 0) { - ret = btrfs_previous_extent_item(root, path, 0); - if (ret < 0) - goto out; - if (ret > 0) { - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, - path, 0, 0); - if (ret < 0) - goto out; - } - } - - stop_loop = 0; - while (1) { - u64 bytes; - - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - - stop_loop = 1; - break; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - - if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = fs_info->nodesize; - else - bytes = key.offset; - - if (key.objectid + bytes <= logic_start) - goto next; - - if (key.objectid >= logic_end) { - stop_loop = 1; - break; - } - - while (key.objectid >= logic_start + map->stripe_len) - logic_start += map->stripe_len; - - extent = btrfs_item_ptr(l, slot, - struct btrfs_extent_item); - flags = btrfs_extent_flags(l, extent); - generation = btrfs_extent_generation(l, extent); - - if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && - (key.objectid < logic_start || - key.objectid + bytes > - logic_start + map->stripe_len)) { - btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. 
logical=%llu", - key.objectid, logic_start); - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - goto next; - } -again: - extent_logical = key.objectid; - ASSERT(bytes <= U32_MAX); - extent_len = bytes; - - if (extent_logical < logic_start) { - extent_len -= logic_start - extent_logical; - extent_logical = logic_start; - } - - if (extent_logical + extent_len > - logic_start + map->stripe_len) - extent_len = logic_start + map->stripe_len - - extent_logical; - - scrub_parity_mark_sectors_data(sparity, extent_logical, - extent_len); - - mapped_length = extent_len; - bioc = NULL; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, - extent_logical, &mapped_length, &bioc, - 0); - if (!ret) { - if (!bioc || mapped_length < extent_len) - ret = -EIO; - } - if (ret) { - btrfs_put_bioc(bioc); - goto out; - } - extent_physical = bioc->stripes[0].physical; - extent_mirror_num = bioc->mirror_num; - extent_dev = bioc->stripes[0].dev; - btrfs_put_bioc(bioc); - - csum_root = btrfs_csum_root(fs_info, extent_logical); - ret = btrfs_lookup_csums_range(csum_root, - extent_logical, - extent_logical + extent_len - 1, - &sctx->csum_list, 1); - if (ret) - goto out; - - ret = scrub_extent_for_parity(sparity, extent_logical, - extent_len, - extent_physical, - extent_dev, flags, - generation, - extent_mirror_num); - - scrub_free_csums(sctx); - - if (ret) - goto out; - - if (extent_logical + extent_len < - key.objectid + bytes) { - logic_start += map->stripe_len; - - if (logic_start >= logic_end) { - stop_loop = 1; - break; - } - - if (logic_start < key.objectid + bytes) { - cond_resched(); - goto again; - } - } -next: - path->slots[0]++; - } - - btrfs_release_path(path); - - if (stop_loop) break; - - logic_start += map->stripe_len; - } -out: - if (ret < 0) { - ASSERT(logic_end - logic_start <= U32_MAX); - scrub_parity_mark_sectors_error(sparity, logic_start, - logic_end - logic_start); } + scrub_parity_put(sparity); scrub_submit(sctx); mutex_lock(&sctx->wr_lock); @@ -3165,6 +3227,206 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, return ret; } +/* + * Scrub one range which can only has simple mirror based profile. + * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in + * RAID0/RAID10). + * + * Since we may need to handle a subset of block group, we need @logical_start + * and @logical_length parameter. + */ +static int scrub_simple_mirror(struct scrub_ctx *sctx, + struct btrfs_root *extent_root, + struct btrfs_root *csum_root, + struct btrfs_block_group *bg, + struct map_lookup *map, + u64 logical_start, u64 logical_length, + struct btrfs_device *device, + u64 physical, int mirror_num) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + const u64 logical_end = logical_start + logical_length; + /* An artificial limit, inherit from old scrub behavior */ + const u32 max_length = SZ_64K; + struct btrfs_path path = { 0 }; + u64 cur_logical = logical_start; + int ret; + + /* The range must be inside the bg */ + ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); + + path.search_commit_root = 1; + path.skip_locking = 1; + /* Go through each extent items inside the logical range */ + while (cur_logical < logical_end) { + u64 extent_start; + u64 extent_len; + u64 extent_flags; + u64 extent_gen; + u64 scrub_len; + + /* Canceled? */ + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sctx->cancel_req)) { + ret = -ECANCELED; + break; + } + /* Paused? 
*/ + if (atomic_read(&fs_info->scrub_pause_req)) { + /* Push queued extents */ + sctx->flush_all_writes = true; + scrub_submit(sctx); + mutex_lock(&sctx->wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_lock); + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + sctx->flush_all_writes = false; + scrub_blocked_if_needed(fs_info); + } + /* Block group removed? */ + spin_lock(&bg->lock); + if (bg->removed) { + spin_unlock(&bg->lock); + ret = 0; + break; + } + spin_unlock(&bg->lock); + + ret = find_first_extent_item(extent_root, &path, cur_logical, + logical_end - cur_logical); + if (ret > 0) { + /* No more extent, just update the accounting */ + sctx->stat.last_physical = physical + logical_length; + ret = 0; + break; + } + if (ret < 0) + break; + get_extent_info(&path, &extent_start, &extent_len, + &extent_flags, &extent_gen); + /* Skip hole range which doesn't have any extent */ + cur_logical = max(extent_start, cur_logical); + + /* + * Scrub len has three limits: + * - Extent size limit + * - Scrub range limit + * This is especially imporatant for RAID0/RAID10 to reuse + * this function + * - Max scrub size limit + */ + scrub_len = min(min(extent_start + extent_len, + logical_end), cur_logical + max_length) - + cur_logical; + + if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { + ret = btrfs_lookup_csums_range(csum_root, cur_logical, + cur_logical + scrub_len - 1, + &sctx->csum_list, 1); + if (ret) + break; + } + if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + does_range_cross_boundary(extent_start, extent_len, + logical_start, logical_length)) { + btrfs_err(fs_info, +"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)", + extent_start, logical_start, logical_end); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + cur_logical += scrub_len; + continue; + } + ret = scrub_extent(sctx, map, cur_logical, scrub_len, + cur_logical - logical_start + physical, + device, extent_flags, extent_gen, + mirror_num); + scrub_free_csums(sctx); + if (ret) + break; + if (sctx->is_dev_replace) + sync_replace_for_zoned(sctx); + cur_logical += scrub_len; + /* Don't hold CPU for too long time */ + cond_resched(); + } + btrfs_release_path(&path); + return ret; +} + +/* Calculate the full stripe length for simple stripe based profiles */ +static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + + return map->num_stripes / map->sub_stripes * map->stripe_len; +} + +/* Get the logical bytenr for the stripe */ +static u64 simple_stripe_get_logical(struct map_lookup *map, + struct btrfs_block_group *bg, + int stripe_index) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + ASSERT(stripe_index < map->num_stripes); + + /* + * (stripe_index / sub_stripes) gives how many data stripes we need to + * skip. + */ + return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start; +} + +/* Get the mirror number for the stripe */ +static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + ASSERT(stripe_index < map->num_stripes); + + /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... 
*/ + return stripe_index % map->sub_stripes + 1; +} + +static int scrub_simple_stripe(struct scrub_ctx *sctx, + struct btrfs_root *extent_root, + struct btrfs_root *csum_root, + struct btrfs_block_group *bg, + struct map_lookup *map, + struct btrfs_device *device, + int stripe_index) +{ + const u64 logical_increment = simple_stripe_full_stripe_len(map); + const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); + const u64 orig_physical = map->stripes[stripe_index].physical; + const int mirror_num = simple_stripe_mirror_num(map, stripe_index); + u64 cur_logical = orig_logical; + u64 cur_physical = orig_physical; + int ret = 0; + + while (cur_logical < bg->start + bg->length) { + /* + * Inside each stripe, RAID0 is just SINGLE, and RAID10 is + * just RAID1, so we can reuse scrub_simple_mirror() to scrub + * this stripe. + */ + ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map, + cur_logical, map->stripe_len, device, + cur_physical, mirror_num); + if (ret) + return ret; + /* Skip to next stripe which belongs to the target device */ + cur_logical += logical_increment; + /* For physical offset, we just go to next stripe */ + cur_physical += map->stripe_len; + } + return ret; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, struct map_lookup *map, @@ -3175,59 +3437,22 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root; struct btrfs_root *csum_root; - struct btrfs_extent_item *extent; struct blk_plug plug; + const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; - u64 flags; int ret; - int slot; - u64 nstripes; - struct extent_buffer *l; - u64 physical; + u64 physical = map->stripes[stripe_index].physical; + const u64 physical_end = physical + dev_extent_len; u64 logical; u64 logic_end; - u64 physical_end; - u64 generation; - int mirror_num; - struct btrfs_key key; + /* The logical increment after finishing one stripe */ u64 increment; + /* Offset inside the chunk */ u64 offset; - u64 extent_logical; - u64 extent_physical; - /* - * Unlike chunk length, extent length should never go beyond - * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here. 
- */ - u32 extent_len; u64 stripe_logical; u64 stripe_end; - struct btrfs_device *extent_dev; - int extent_mirror_num; int stop_loop = 0; - physical = map->stripes[stripe_index].physical; - offset = 0; - nstripes = div64_u64(dev_extent_len, map->stripe_len); - mirror_num = 1; - increment = map->stripe_len; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - offset = map->stripe_len * stripe_index; - increment = map->stripe_len * map->num_stripes; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - int factor = map->num_stripes / map->sub_stripes; - offset = map->stripe_len * (stripe_index / map->sub_stripes); - increment = map->stripe_len * factor; - mirror_num = stripe_index % map->sub_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - mirror_num = stripe_index % map->num_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - mirror_num = stripe_index % map->num_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical, stripe_index, map, &offset, - NULL); - increment = map->stripe_len * nr_data_stripes(map); - } - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3241,21 +3466,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, path->skip_locking = 1; path->reada = READA_FORWARD; - logical = chunk_logical + offset; - physical_end = physical + nstripes * map->stripe_len; - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical_end, stripe_index, - map, &logic_end, NULL); - logic_end += chunk_logical; - } else { - logic_end = logical + increment * nstripes; - } wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); scrub_blocked_if_needed(fs_info); - root = btrfs_extent_root(fs_info, logical); - csum_root = btrfs_csum_root(fs_info, logical); + root = btrfs_extent_root(fs_info, bg->start); + csum_root = btrfs_csum_root(fs_info, bg->start); /* * collect all data csums for the stripe to avoid seeking during @@ -3272,241 +3488,83 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } /* - * now find all extents for each stripe and scrub them + * There used to be a big double loop to handle all profiles using the + * same routine, which grows larger and more gross over time. + * + * So here we handle each profile differently, so simpler profiles + * have simpler scrubbing function. */ - ret = 0; - while (physical < physical_end) { - /* - * canceled? - */ - if (atomic_read(&fs_info->scrub_cancel_req) || - atomic_read(&sctx->cancel_req)) { - ret = -ECANCELED; - goto out; - } + if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID56_MASK))) { /* - * check to see if we have to pause + * Above check rules out all complex profile, the remaining + * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple + * mirrored duplication without stripe. + * + * Only @physical and @mirror_num needs to calculated using + * @stripe_index. 
*/ - if (atomic_read(&fs_info->scrub_pause_req)) { - /* push queued extents */ - sctx->flush_all_writes = true; - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - wait_event(sctx->list_wait, - atomic_read(&sctx->bios_in_flight) == 0); - sctx->flush_all_writes = false; - scrub_blocked_if_needed(fs_info); - } - - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = get_raid56_logic_offset(physical, stripe_index, - map, &logical, - &stripe_logical); - logical += chunk_logical; - if (ret) { - /* it is parity strip */ - stripe_logical += chunk_logical; - stripe_end = stripe_logical + increment; - ret = scrub_raid56_parity(sctx, map, scrub_dev, - stripe_logical, - stripe_end); - if (ret) - goto out; - goto skip; - } - } - - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) - key.type = BTRFS_METADATA_ITEM_KEY; - else - key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logical; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = btrfs_previous_extent_item(root, path, 0); - if (ret < 0) - goto out; - if (ret > 0) { - /* there's no smaller item, so stick with the - * larger one */ - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, - path, 0, 0); - if (ret < 0) - goto out; - } - } - - stop_loop = 0; - while (1) { - u64 bytes; - - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - - stop_loop = 1; - break; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - - if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = fs_info->nodesize; - else - bytes = key.offset; - - if (key.objectid + bytes <= logical) - goto next; - - if (key.objectid >= logical + map->stripe_len) { - /* out of this device extent */ - if (key.objectid >= logic_end) - stop_loop = 1; - break; - } - - /* - * If our block group was removed in the meanwhile, just - * stop scrubbing since there is no point in continuing. - * Continuing would prevent reusing its device extents - * for new block groups for a long time. - */ - spin_lock(&bg->lock); - if (bg->removed) { - spin_unlock(&bg->lock); - ret = 0; - goto out; - } - spin_unlock(&bg->lock); - - extent = btrfs_item_ptr(l, slot, - struct btrfs_extent_item); - flags = btrfs_extent_flags(l, extent); - generation = btrfs_extent_generation(l, extent); - - if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && - (key.objectid < logical || - key.objectid + bytes > - logical + map->stripe_len)) { - btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. 
logical=%llu", - key.objectid, logical); - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - goto next; - } - -again: - extent_logical = key.objectid; - ASSERT(bytes <= U32_MAX); - extent_len = bytes; - - /* - * trim extent to this stripe - */ - if (extent_logical < logical) { - extent_len -= logical - extent_logical; - extent_logical = logical; - } - if (extent_logical + extent_len > - logical + map->stripe_len) { - extent_len = logical + map->stripe_len - - extent_logical; - } + ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, + bg->start, bg->length, scrub_dev, + map->stripes[stripe_index].physical, + stripe_index + 1); + offset = 0; + goto out; + } + if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + ret = scrub_simple_stripe(sctx, root, csum_root, bg, map, + scrub_dev, stripe_index); + offset = map->stripe_len * (stripe_index / map->sub_stripes); + goto out; + } - extent_physical = extent_logical - logical + physical; - extent_dev = scrub_dev; - extent_mirror_num = mirror_num; - if (sctx->is_dev_replace) - scrub_remap_extent(fs_info, extent_logical, - extent_len, &extent_physical, - &extent_dev, - &extent_mirror_num); - - if (flags & BTRFS_EXTENT_FLAG_DATA) { - ret = btrfs_lookup_csums_range(csum_root, - extent_logical, - extent_logical + extent_len - 1, - &sctx->csum_list, 1); - if (ret) - goto out; - } + /* Only RAID56 goes through the old code */ + ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); + ret = 0; - ret = scrub_extent(sctx, map, extent_logical, extent_len, - extent_physical, extent_dev, flags, - generation, extent_mirror_num, - extent_logical - logical + physical); + /* Calculate the logical end of the stripe */ + get_raid56_logic_offset(physical_end, stripe_index, + map, &logic_end, NULL); + logic_end += chunk_logical; - scrub_free_csums(sctx); + /* Initialize @offset in case we need to go to out: label */ + get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); + increment = map->stripe_len * nr_data_stripes(map); + /* + * Due to the rotation, for RAID56 it's better to iterate each stripe + * using their physical offset. + */ + while (physical < physical_end) { + ret = get_raid56_logic_offset(physical, stripe_index, map, + &logical, &stripe_logical); + logical += chunk_logical; + if (ret) { + /* it is parity strip */ + stripe_logical += chunk_logical; + stripe_end = stripe_logical + increment; + ret = scrub_raid56_parity(sctx, map, scrub_dev, + stripe_logical, + stripe_end); if (ret) goto out; + goto next; + } - if (sctx->is_dev_replace) - sync_replace_for_zoned(sctx); - - if (extent_logical + extent_len < - key.objectid + bytes) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* - * loop until we find next data stripe - * or we have finished all stripes. - */ -loop: - physical += map->stripe_len; - ret = get_raid56_logic_offset(physical, - stripe_index, map, - &logical, &stripe_logical); - logical += chunk_logical; - - if (ret && physical < physical_end) { - stripe_logical += chunk_logical; - stripe_end = stripe_logical + - increment; - ret = scrub_raid56_parity(sctx, - map, scrub_dev, - stripe_logical, - stripe_end); - if (ret) - goto out; - goto loop; - } - } else { - physical += map->stripe_len; - logical += increment; - } - if (logical < key.objectid + bytes) { - cond_resched(); - goto again; - } - - if (physical >= physical_end) { - stop_loop = 1; - break; - } - } + /* + * Now we're at a data stripe, scrub each extents in the range. 
+ * + * At this stage, if we ignore the repair part, inside each data + * stripe it is no different than SINGLE profile. + * We can reuse scrub_simple_mirror() here, as the repair part + * is still based on @mirror_num. + */ + ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, + logical, map->stripe_len, + scrub_dev, physical, 1); + if (ret < 0) + goto out; next: - path->slots[0]++; - } - btrfs_release_path(path); -skip: logical += increment; physical += map->stripe_len; spin_lock(&sctx->stat_lock); @@ -3964,9 +4022,9 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (!btrfs_check_super_location(scrub_dev, bytenr)) continue; - ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, - scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, - NULL, bytenr); + ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, + NULL, bytenr); if (ret) return ret; } @@ -3979,22 +4037,23 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) { if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, &fs_info->scrub_lock)) { - struct btrfs_workqueue *scrub_workers = NULL; - struct btrfs_workqueue *scrub_wr_comp = NULL; - struct btrfs_workqueue *scrub_parity = NULL; - - scrub_workers = fs_info->scrub_workers; - scrub_wr_comp = fs_info->scrub_wr_completion_workers; - scrub_parity = fs_info->scrub_parity_workers; + struct workqueue_struct *scrub_workers = fs_info->scrub_workers; + struct workqueue_struct *scrub_wr_comp = + fs_info->scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity = + fs_info->scrub_parity_workers; fs_info->scrub_workers = NULL; fs_info->scrub_wr_completion_workers = NULL; fs_info->scrub_parity_workers = NULL; mutex_unlock(&fs_info->scrub_lock); - btrfs_destroy_workqueue(scrub_workers); - btrfs_destroy_workqueue(scrub_wr_comp); - btrfs_destroy_workqueue(scrub_parity); + if (scrub_workers) + destroy_workqueue(scrub_workers); + if (scrub_wr_comp) + destroy_workqueue(scrub_wr_comp); + if (scrub_parity) + destroy_workqueue(scrub_parity); } } @@ -4004,9 +4063,9 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { - struct btrfs_workqueue *scrub_workers = NULL; - struct btrfs_workqueue *scrub_wr_comp = NULL; - struct btrfs_workqueue *scrub_parity = NULL; + struct workqueue_struct *scrub_workers = NULL; + struct workqueue_struct *scrub_wr_comp = NULL; + struct workqueue_struct *scrub_parity = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; int ret = -ENOMEM; @@ -4014,18 +4073,16 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, - is_dev_replace ? 1 : max_active, 4); + scrub_workers = alloc_workqueue("btrfs-scrub", flags, + is_dev_replace ? 
1 : max_active); if (!scrub_workers) goto fail_scrub_workers; - scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, - max_active, 2); + scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active); if (!scrub_wr_comp) goto fail_scrub_wr_completion_workers; - scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags, - max_active, 2); + scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active); if (!scrub_parity) goto fail_scrub_parity_workers; @@ -4046,11 +4103,11 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->scrub_lock); ret = 0; - btrfs_destroy_workqueue(scrub_parity); + destroy_workqueue(scrub_parity); fail_scrub_parity_workers: - btrfs_destroy_workqueue(scrub_wr_comp); + destroy_workqueue(scrub_wr_comp); fail_scrub_wr_completion_workers: - btrfs_destroy_workqueue(scrub_workers); + destroy_workqueue(scrub_workers); fail_scrub_workers: return ret; } @@ -4082,18 +4139,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } if (fs_info->nodesize > - PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || - fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { + SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits || + fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) { /* - * would exhaust the array bounds of pagev member in + * Would exhaust the array bounds of sectorv member in * struct scrub_block */ btrfs_err(fs_info, - "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", - fs_info->nodesize, - SCRUB_MAX_PAGES_PER_BLOCK, - fs_info->sectorsize, - SCRUB_MAX_PAGES_PER_BLOCK); +"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails", + fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK, + fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK); return -EINVAL; } @@ -4161,7 +4216,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, /* * In order to avoid deadlock with reclaim when there is a transaction * trying to pause scrub, make sure we use GFP_NOFS for all the - * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity() + * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity() * invoked by our callees. The pausing request is done when the * transaction commit starts, and it blocks the transaction until scrub * is paused (done at specific points at scrub_stripe() or right above @@ -4295,11 +4350,11 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, return dev ? (sctx ? 
0 : -ENOTCONN) : -ENODEV; } -static void scrub_remap_extent(struct btrfs_fs_info *fs_info, - u64 extent_logical, u32 extent_len, - u64 *extent_physical, - struct btrfs_device **extent_dev, - int *extent_mirror_num) +static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, + u64 extent_logical, u32 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num) { u64 mapped_length; struct btrfs_io_context *bioc = NULL; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7d1642937274..5a05beabf0c3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -10,7 +10,6 @@ #include <linux/mount.h> #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> -#include <linux/radix-tree.h> #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/compat.h> @@ -128,11 +127,18 @@ struct send_ctx { struct list_head new_refs; struct list_head deleted_refs; - struct radix_tree_root name_cache; + struct xarray name_cache; struct list_head name_cache_list; int name_cache_size; + /* + * The inode we are currently processing. It's not NULL only when we + * need to issue write commands for data extents from this inode. + */ + struct inode *cur_inode; struct file_ra_state ra; + u64 page_cache_clear_start; + bool clean_page_cache; /* * We process inodes by their increasing order, so if before an @@ -262,14 +268,13 @@ struct orphan_dir_info { struct name_cache_entry { struct list_head list; /* - * radix_tree has only 32bit entries but we need to handle 64bit inums. - * We use the lower 32bit of the 64bit inum to store it in the tree. If - * more then one inum would fall into the same entry, we use radix_list - * to store the additional entries. radix_list is also used to store - * entries where two entries have the same inum but different - * generations. + * On 32bit kernels, xarray has only 32bit indices, but we need to + * handle 64bit inums. We use the lower 32bit of the 64bit inum to store + * it in the tree. If more than one inum would fall into the same entry, + * we use inum_aliases to store the additional entries. inum_aliases is + * also used to store entries with the same inum but different generations. */ - struct list_head radix_list; + struct list_head inum_aliases; u64 ino; u64 gen; u64 parent_ino; @@ -2019,9 +2024,9 @@ out: } /* - * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit, + * Insert a name cache entry. On 32bit kernels the xarray index is 32bit, * so we need to do some special handling in case we have clashes. This function - * takes care of this with the help of name_cache_entry::radix_list. + * takes care of this with the help of name_cache_entry::inum_aliases. * In case of error, nce is kfreed. 
*/ static int name_cache_insert(struct send_ctx *sctx, @@ -2030,8 +2035,7 @@ static int name_cache_insert(struct send_ctx *sctx, int ret = 0; struct list_head *nce_head; - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); + nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); if (!nce_head) { @@ -2040,14 +2044,14 @@ static int name_cache_insert(struct send_ctx *sctx, } INIT_LIST_HEAD(nce_head); - ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); + ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL); if (ret < 0) { kfree(nce_head); kfree(nce); return ret; } } - list_add_tail(&nce->radix_list, nce_head); + list_add_tail(&nce->inum_aliases, nce_head); list_add_tail(&nce->list, &sctx->name_cache_list); sctx->name_cache_size++; @@ -2059,15 +2063,14 @@ static void name_cache_delete(struct send_ctx *sctx, { struct list_head *nce_head; - nce_head = radix_tree_lookup(&sctx->name_cache, - (unsigned long)nce->ino); + nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); if (!nce_head) { btrfs_err(sctx->send_root->fs_info, "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", nce->ino, sctx->name_cache_size); } - list_del(&nce->radix_list); + list_del(&nce->inum_aliases); list_del(&nce->list); sctx->name_cache_size--; @@ -2075,7 +2078,7 @@ static void name_cache_delete(struct send_ctx *sctx, * We may not get to the final release of nce_head if the lookup fails */ if (nce_head && list_empty(nce_head)) { - radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); + xa_erase(&sctx->name_cache, (unsigned long)nce->ino); kfree(nce_head); } } @@ -2086,11 +2089,11 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, struct list_head *nce_head; struct name_cache_entry *cur; - nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); + nce_head = xa_load(&sctx->name_cache, (unsigned long)ino); if (!nce_head) return NULL; - list_for_each_entry(cur, nce_head, radix_list) { + list_for_each_entry(cur, nce_head, inum_aliases) { if (cur->ino == ino && cur->gen == gen) return cur; } @@ -2675,61 +2678,43 @@ out: static int did_create_dir(struct send_ctx *sctx, u64 dir) { int ret = 0; + int iter_ret = 0; struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_key di_key; - struct extent_buffer *eb; struct btrfs_dir_item *di; - int slot; path = alloc_path_for_send(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); - if (ret < 0) - goto out; - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(sctx->send_root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } + btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) { + struct extent_buffer *eb = path->nodes[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; - goto out; + break; } - di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(eb, di, &di_key); if (di_key.type != BTRFS_ROOT_ITEM_KEY && di_key.objectid < sctx->send_progress) { ret = 1; - goto out; + 
break; } - - path->slots[0]++; } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; -out: btrfs_free_path(path); return ret; } @@ -2933,6 +2918,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, u64 send_progress) { int ret = 0; + int iter_ret = 0; struct btrfs_root *root = sctx->parent_root; struct btrfs_path *path; struct btrfs_key key; @@ -2959,23 +2945,9 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (odi) key.offset = odi->last_dir_index_offset; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct waiting_dir_move *dm; - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - continue; - } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); if (found_key.objectid != key.objectid || found_key.type != key.type) break; @@ -3010,8 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, ret = 0; goto out; } - - path->slots[0]++; + } + if (iter_ret < 0) { + ret = iter_ret; + goto out; } free_orphan_dir_info(sctx, odi); @@ -3579,7 +3553,7 @@ static int check_ino_in_path(struct btrfs_root *root, } /* - * Check if ino ino1 is an ancestor of inode ino2 in the given root for any + * Check if inode ino1 is an ancestor of inode ino2 in the given root for any * possible path (in case ino2 is not a directory and has multiple hard links). * Return 1 if true, 0 if false and < 0 on error. */ @@ -3591,6 +3565,7 @@ static int is_ancestor(struct btrfs_root *root, { bool free_fs_path = false; int ret = 0; + int iter_ret = 0; struct btrfs_path *path = NULL; struct btrfs_key key; @@ -3611,26 +3586,12 @@ static int is_ancestor(struct btrfs_root *root, key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (true) { + btrfs_for_each_slot(root, &key, &key, path, iter_ret) { struct extent_buffer *leaf = path->nodes[0]; int slot = path->slots[0]; u32 cur_offset = 0; u32 item_size; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != ino2) break; if (key.type != BTRFS_INODE_REF_KEY && @@ -3668,10 +3629,12 @@ static int is_ancestor(struct btrfs_root *root, if (ret) goto out; } - path->slots[0]++; } ret = 0; - out: + if (iter_ret < 0) + ret = iter_ret; + +out: btrfs_free_path(path); if (free_fs_path) fs_path_free(fs_path); @@ -4551,13 +4514,12 @@ out: static int process_all_refs(struct send_ctx *sctx, enum btrfs_compare_tree_result cmd) { - int ret; + int ret = 0; + int iter_ret = 0; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - struct extent_buffer *eb; - int slot; iterate_inode_ref_t cb; int pending_move = 0; @@ -4581,24 +4543,7 @@ static int process_all_refs(struct send_ctx *sctx, key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - continue; - } - - 
btrfs_item_key_to_cpu(eb, &found_key, slot); - + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || (found_key.type != BTRFS_INODE_REF_KEY && found_key.type != BTRFS_INODE_EXTREF_KEY)) @@ -4607,8 +4552,11 @@ static int process_all_refs(struct send_ctx *sctx, ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); if (ret < 0) goto out; - - path->slots[0]++; + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto out; } btrfs_release_path(path); @@ -4870,13 +4818,12 @@ out: static int process_all_new_xattrs(struct send_ctx *sctx) { - int ret; + int ret = 0; + int iter_ret = 0; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - struct extent_buffer *eb; - int slot; path = alloc_path_for_send(); if (!path) @@ -4887,39 +4834,21 @@ static int process_all_new_xattrs(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } - - btrfs_item_key_to_cpu(eb, &found_key, slot); + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; - goto out; + break; } ret = iterate_dir_item(root, path, __process_new_xattr, sctx); if (ret < 0) - goto out; - - path->slots[0]++; + break; } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; -out: btrfs_free_path(path); return ret; } @@ -4946,7 +4875,6 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; struct page *page; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; @@ -4957,37 +4885,30 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) if (ret) return ret; - inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); - last_index = (offset + len - 1) >> PAGE_SHIFT; - /* initial readahead */ - memset(&sctx->ra, 0, sizeof(struct file_ra_state)); - file_ra_state_init(&sctx->ra, inode->i_mapping); - while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_lock_page(inode->i_mapping, index); + page = find_lock_page(sctx->cur_inode->i_mapping, index); if (!page) { - page_cache_sync_readahead(inode->i_mapping, &sctx->ra, - NULL, index, last_index + 1 - index); + page_cache_sync_readahead(sctx->cur_inode->i_mapping, + &sctx->ra, NULL, index, + last_index + 1 - index); - page = find_or_create_page(inode->i_mapping, index, - GFP_KERNEL); + page = find_or_create_page(sctx->cur_inode->i_mapping, + index, GFP_KERNEL); if (!page) { ret = -ENOMEM; break; } } - if (PageReadahead(page)) { - page_cache_async_readahead(inode->i_mapping, &sctx->ra, - NULL, page, index, last_index + 1 - index); - } + if (PageReadahead(page)) + page_cache_async_readahead(sctx->cur_inode->i_mapping, + &sctx->ra, NULL, page, index, + last_index + 1 - index); if (!PageUptodate(page)) { btrfs_readpage(NULL, page); @@ -5013,7 +4934,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) len -= cur_len; 
sctx->send_size += cur_len; } - iput(inode); + return ret; } @@ -5220,12 +5141,49 @@ static int send_extent_data(struct send_ctx *sctx, const u64 offset, const u64 len) { + const u64 end = offset + len; u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); + if (sctx->cur_inode == NULL) { + struct btrfs_root *root = sctx->send_root; + + sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(sctx->cur_inode)) { + int err = PTR_ERR(sctx->cur_inode); + + sctx->cur_inode = NULL; + return err; + } + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); + file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); + + /* + * It's very likely there are no pages from this inode in the page + * cache, so after reading extents and sending their data, we clean + * the page cache to avoid trashing the page cache (adding pressure + * to the page cache and forcing eviction of other data more useful + * for applications). + * + * We decide if we should clean the page cache simply by checking + * if the inode's mapping nrpages is 0 when we first open it, and + * not by using something like filemap_range_has_page() before + * reading an extent because when we ask the readahead code to + * read a given file range, it may (and almost always does) read + * pages from beyond that range (see the documentation for + * page_cache_sync_readahead()), so it would not be reliable, + * because after reading the first extent future calls to + * filemap_range_has_page() would return true because the readahead + * on the previous extent resulted in reading pages of the current + * extent as well. + */ + sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0); + sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE); + } + while (sent < len) { u64 size = min(len - sent, read_size); int ret; @@ -5235,6 +5193,37 @@ static int send_extent_data(struct send_ctx *sctx, return ret; sent += size; } + + if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + /* + * Always operate only on ranges that are a multiple of the page + * size. This is not only to prevent zeroing parts of a page in + * the case of subpage sector size, but also to guarantee we evict + * pages, as passing a range that is smaller than page size does + * not evict the respective page (only zeroes part of its content). + * + * Always start from the end offset of the last range cleared. + * This is because the readahead code may (and very often does) + * reads pages beyond the range we request for readahead. So if + * we have an extent layout like this: + * + * [ extent A ] [ extent B ] [ extent C ] + * + * When we ask page_cache_sync_readahead() to read extent A, it + * may also trigger reads for pages of extent B. If we are doing + * an incremental send and extent B has not changed between the + * parent and send snapshots, some or all of its pages may end + * up being read and placed in the page cache. So when truncating + * the page cache we always start from the end offset of the + * previously processed extent up to the end of the current + * extent. 
+ */ + truncate_inode_pages_range(&sctx->cur_inode->i_data, + sctx->page_cache_clear_start, + end - 1); + sctx->page_cache_clear_start = end; + } + return 0; } @@ -5965,13 +5954,12 @@ out: static int process_all_extents(struct send_ctx *sctx) { - int ret; + int ret = 0; + int iter_ret = 0; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - struct extent_buffer *eb; - int slot; root = sctx->send_root; path = alloc_path_for_send(); @@ -5981,41 +5969,21 @@ static int process_all_extents(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } - - btrfs_item_key_to_cpu(eb, &found_key, slot); - + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; - goto out; + break; } ret = process_extent(sctx, path, &found_key); if (ret < 0) - goto out; - - path->slots[0]++; + break; } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; -out: btrfs_free_path(path); return ret; } @@ -6205,8 +6173,11 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx) { LIST_HEAD(deleted_refs); struct btrfs_path *path; + struct btrfs_root *root = sctx->parent_root; struct btrfs_key key; + struct btrfs_key found_key; struct parent_paths_ctx ctx; + int iter_ret = 0; int ret; path = alloc_path_for_send(); @@ -6216,39 +6187,26 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx) key.objectid = sctx->cur_ino; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); - if (ret < 0) - goto out; ctx.refs = &deleted_refs; ctx.sctx = sctx; - while (true) { - struct extent_buffer *eb = path->nodes[0]; - int slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(sctx->parent_root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.objectid != sctx->cur_ino) + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + if (found_key.objectid != key.objectid) break; - if (key.type != BTRFS_INODE_REF_KEY && - key.type != BTRFS_INODE_EXTREF_KEY) + if (found_key.type != key.type && + found_key.type != BTRFS_INODE_EXTREF_KEY) break; - ret = iterate_inode_ref(sctx->parent_root, path, &key, 1, + ret = iterate_inode_ref(root, path, &found_key, 1, record_parent_ref, &ctx); if (ret < 0) goto out; - - path->slots[0]++; + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto out; } while (!list_empty(&deleted_refs)) { @@ -6270,6 +6228,30 @@ out: return ret; } +static void close_current_inode(struct send_ctx *sctx) +{ + u64 i_size; + + if (sctx->cur_inode == NULL) + return; + + i_size = i_size_read(sctx->cur_inode); + + /* + * If we are doing an incremental send, we may have extents between the + * last processed extent and the i_size that have not been processed + * because they haven't changed but we may have read some of their pages + * through readahead, see the comments at send_extent_data(). 
+ */ + if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size) + truncate_inode_pages_range(&sctx->cur_inode->i_data, + sctx->page_cache_clear_start, + round_up(i_size, PAGE_SIZE) - 1); + + iput(sctx->cur_inode); + sctx->cur_inode = NULL; +} + static int changed_inode(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { @@ -6280,6 +6262,8 @@ static int changed_inode(struct send_ctx *sctx, u64 left_gen = 0; u64 right_gen = 0; + close_current_inode(sctx); + sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; sctx->cur_inode_last_extent = (u64)-1; @@ -7534,7 +7518,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); - INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); + xa_init_flags(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); sctx->flags = arg->flags; @@ -7766,6 +7750,8 @@ out: name_cache_free(sctx); + close_current_inode(sctx); + kfree(sctx); } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index b87931a458eb..2dd8754cb990 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -181,6 +181,12 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) found->full = 0; } +/* + * Block groups with more than this value (percents) of unusable space will be + * scheduled for background reclaim. + */ +#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) + static int create_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -203,6 +209,9 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; + if (btrfs_is_zoned(info)) + space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; + ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) return ret; @@ -519,7 +528,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; } - trans = (struct btrfs_trans_handle *)current->journal_info; + trans = current->journal_info; /* * If we are doing more ordered than delalloc we need to just wait on diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index d841fed73492..c096695598c1 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -3,6 +3,8 @@ #ifndef BTRFS_SPACE_INFO_H #define BTRFS_SPACE_INFO_H +#include "volumes.h" + struct btrfs_space_info { spinlock_t lock; @@ -24,6 +26,12 @@ struct btrfs_space_info { the space info if we had an ENOSPC in the allocator. */ + /* + * Once a block group drops below this threshold (percents) we'll + * schedule it for reclaim. + */ + int bg_reclaim_threshold; + int clamp; /* Used to scale our threshold for preemptive flushing. The value is >> clamp, so turns out to be a 2^clamp divisor. */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index ef7ae20d2b77..a105b291444f 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -63,6 +63,29 @@ * This means a slightly higher tree locking latency. */ +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) +{ + if (fs_info->sectorsize >= PAGE_SIZE) + return false; + + /* + * Only data pages (either through DIO or compression) can have no + * mapping. And if page->mapping->host is data inode, it's subpage. + * As we have ruled our sectorsize >= PAGE_SIZE case already. 
+ */ + if (!page->mapping || !page->mapping->host || + is_data_inode(page->mapping->host)) + return true; + + /* + * Now the only remaining case is metadata, which we only go subpage + * routine if nodesize < PAGE_SIZE. + */ + if (fs_info->nodesize < PAGE_SIZE) + return true; + return false; +} + void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) { unsigned int cur = 0; @@ -107,7 +130,7 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, ASSERT(PageLocked(page)); /* Either not subpage, or the page already has private attached */ - if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) + if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page)) return 0; subpage = btrfs_alloc_subpage(fs_info, type); @@ -124,10 +147,10 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct btrfs_subpage *subpage; /* Either not subpage, or already detached */ - if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page)) + if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page)) return; - subpage = (struct btrfs_subpage *)detach_page_private(page); + subpage = detach_page_private(page); ASSERT(subpage); btrfs_free_subpage(subpage); } @@ -175,7 +198,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage; - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->mapping); @@ -190,7 +213,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage; - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->mapping); @@ -319,7 +342,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { lock_page(page); return 0; } @@ -336,7 +359,7 @@ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) return unlock_page(page); btrfs_subpage_clamp_range(page, &start, &len); if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) @@ -620,7 +643,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ set_page_func(page); \ return; \ } \ @@ -629,7 +652,7 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ clear_page_func(page); \ return; \ } \ @@ -638,14 +661,14 @@ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + if (unlikely(!fs_info) || 
!btrfs_is_subpage(fs_info, page)) \ return test_page_func(page); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ } \ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ set_page_func(page); \ return; \ } \ @@ -655,7 +678,7 @@ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ clear_page_func(page); \ return; \ } \ @@ -665,7 +688,7 @@ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ return test_page_func(page); \ btrfs_subpage_clamp_range(page, &start, &len); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ @@ -694,7 +717,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, return; ASSERT(!PageDirty(page)); - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->private); @@ -722,8 +745,8 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, struct btrfs_subpage *subpage; ASSERT(PageLocked(page)); - /* For regular page size case, we just unlock the page */ - if (fs_info->sectorsize == PAGE_SIZE) + /* For non-subpage case, we just unlock the page */ + if (!btrfs_is_subpage(fs_info, page)) return unlock_page(page); ASSERT(PagePrivate(page) && page->private); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 7accb5c40d33..0e80ad336904 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -74,6 +74,8 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page); + void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b228efe8ab6e..b1fdc6a26c76 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -261,7 +261,7 @@ static struct ratelimit_state printk_limits[] = { RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), }; -void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; @@ -292,10 +292,10 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, . 
char statestr[STATE_STRING_BUF_LEN]; btrfs_state_to_string(fs_info, statestr); - printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, fs_info->sb->s_id, statestr, &vaf); } else { - printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); } } @@ -1903,6 +1903,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, old_pool_size, new_pool_size); btrfs_workqueue_set_max(fs_info->workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); @@ -1912,8 +1913,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, - new_pool_size); } static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index ba78ca5aabbb..92a1fa8e3da6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -394,11 +394,9 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, { ssize_t ret = 0; - /* 4K sector size is also supported with 64K page size */ - if (PAGE_SIZE == SZ_64K) + /* An artificial limit to only support 4K and PAGE_SIZE */ + if (PAGE_SIZE > SZ_4K) ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); - - /* Only sectorsize == PAGE_SIZE is now supported */ ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); return ret; @@ -722,6 +720,42 @@ SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + ssize_t ret; + + ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); + + return ret; +} + +static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int thresh; + int ret; + + ret = kstrtoint(buf, 10, &thresh); + if (ret) + return ret; + + if (thresh < 0 || thresh > 100) + return -EINVAL; + + WRITE_ONCE(space_info->bg_reclaim_threshold, thresh); + + return len; +} + +BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, + btrfs_sinfo_bg_reclaim_threshold_show, + btrfs_sinfo_bg_reclaim_threshold_store); + /* * Allocation information about block group types. 
* @@ -738,6 +772,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), + BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), NULL, }; ATTRIBUTE_GROUPS(space_info); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index d8e56edd6991..1591bfa55bcc 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -150,8 +150,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { - struct radix_tree_iter iter; - void **slot; + unsigned long index; + struct extent_buffer *eb; struct btrfs_device *dev, *tmp; if (!fs_info) @@ -163,25 +163,9 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) test_mnt->mnt_sb->s_fs_info = NULL; - spin_lock(&fs_info->buffer_lock); - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { - struct extent_buffer *eb; - - eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); - if (!eb) - continue; - /* Shouldn't happen but that kind of thinking creates CVE's */ - if (radix_tree_exception(eb)) { - if (radix_tree_deref_retry(eb)) - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - spin_unlock(&fs_info->buffer_lock); + xa_for_each(&fs_info->extent_buffers, index, eb) { free_extent_buffer_stale(eb); - spin_lock(&fs_info->buffer_lock); } - spin_unlock(&fs_info->buffer_lock); btrfs_mapping_tree_free(&fs_info->mapping_tree); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, @@ -202,7 +186,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) if (!root) return; /* Will be freed by btrfs_free_fs_roots */ - if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) + if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state))) return; btrfs_global_root_delete(root); btrfs_put_root(root); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b008c5110958..06c0a958d114 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,7 +23,7 @@ #include "space-info.h" #include "zoned.h" -#define BTRFS_ROOT_TRANS_TAG 0 +#define BTRFS_ROOT_TRANS_TAG XA_MARK_0 /* * Transaction states and transitions @@ -221,7 +221,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) * the caching thread will re-start it's search from 3, and thus find * the hole from [4,6) to add to the free space cache. 
*/ - spin_lock(&fs_info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { struct btrfs_block_group *cache = caching_ctl->block_group; @@ -234,7 +234,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) cache->last_byte_to_unpin = caching_ctl->progress; } } - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); up_write(&fs_info->commit_root_sem); } @@ -437,15 +437,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, */ smp_wmb(); - spin_lock(&fs_info->fs_roots_radix_lock); + spin_lock(&fs_info->fs_roots_lock); if (root->last_trans == trans->transid && !force) { - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); return 0; } - radix_tree_tag_set(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_radix_lock); + xa_set_mark(&fs_info->fs_roots, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_lock); root->last_trans = trans->transid; /* this is pretty tricky. We don't want to @@ -487,11 +487,9 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, spin_unlock(&cur_trans->dropped_roots_lock); /* Make sure we don't try to update the root at commit time */ - spin_lock(&fs_info->fs_roots_radix_lock); - radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_radix_lock); + xa_clear_mark(&fs_info->fs_roots, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); } int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, @@ -1404,9 +1402,8 @@ void btrfs_add_dead_root(struct btrfs_root *root) static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *gang[8]; - int i; - int ret; + struct btrfs_root *root; + unsigned long index; /* * At this point no one can be using this transaction to modify any tree @@ -1414,57 +1411,46 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) */ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); - spin_lock(&fs_info->fs_roots_radix_lock); - while (1) { - ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, - (void **)gang, 0, - ARRAY_SIZE(gang), - BTRFS_ROOT_TRANS_TAG); - if (ret == 0) - break; - for (i = 0; i < ret; i++) { - struct btrfs_root *root = gang[i]; - int ret2; - - /* - * At this point we can neither have tasks logging inodes - * from a root nor trying to commit a log tree. 
- */ - ASSERT(atomic_read(&root->log_writers) == 0); - ASSERT(atomic_read(&root->log_commit[0]) == 0); - ASSERT(atomic_read(&root->log_commit[1]) == 0); - - radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_radix_lock); - - btrfs_free_log(trans, root); - ret2 = btrfs_update_reloc_root(trans, root); - if (ret2) - return ret2; - - /* see comments in should_cow_block() */ - clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); - smp_mb__after_atomic(); - - if (root->commit_root != root->node) { - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); - btrfs_set_root_node(&root->root_item, - root->node); - } + spin_lock(&fs_info->fs_roots_lock); + xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) { + int ret; + + /* + * At this point we can neither have tasks logging inodes + * from a root nor trying to commit a log tree. + */ + ASSERT(atomic_read(&root->log_writers) == 0); + ASSERT(atomic_read(&root->log_commit[0]) == 0); + ASSERT(atomic_read(&root->log_commit[1]) == 0); + + xa_clear_mark(&fs_info->fs_roots, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_lock); - ret2 = btrfs_update_root(trans, fs_info->tree_root, - &root->root_key, - &root->root_item); - if (ret2) - return ret2; - spin_lock(&fs_info->fs_roots_radix_lock); - btrfs_qgroup_free_meta_all_pertrans(root); + btrfs_free_log(trans, root); + ret = btrfs_update_reloc_root(trans, root); + if (ret) + return ret; + + /* See comments in should_cow_block() */ + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); + + if (root->commit_root != root->node) { + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); + btrfs_set_root_node(&root->root_item, root->node); } + + ret = btrfs_update_root(trans, fs_info->tree_root, + &root->root_key, &root->root_item); + if (ret) + return ret; + spin_lock(&fs_info->fs_roots_lock); + btrfs_qgroup_free_meta_all_pertrans(root); } - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); return 0; } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index e56c0107eea3..9e0e0ae2288c 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1855,3 +1855,58 @@ out: return ret; } ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); + +int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) +{ + const bool is_subvol = is_fstree(root_owner); + const u64 eb_owner = btrfs_header_owner(eb); + + /* + * Skip dummy fs, as selftests don't create unique ebs for each dummy + * root. + */ + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state)) + return 0; + /* + * There are several call sites (backref walking, qgroup, and data + * reloc) passing 0 as @root_owner, as they are not holding the + * tree root. In that case, we can not do a reliable ownership check, + * so just exit. + */ + if (root_owner == 0) + return 0; + /* + * These trees use key.offset as their owner, our callers don't have + * the extra capacity to pass key.offset here. So we just skip them. 
+ */ + if (root_owner == BTRFS_TREE_LOG_OBJECTID || + root_owner == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!is_subvol) { + /* For non-subvolume trees, the eb owner should match root owner */ + if (unlikely(root_owner != eb_owner)) { + btrfs_crit(eb->fs_info, +"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root_owner, btrfs_header_bytenr(eb), eb_owner, + root_owner); + return -EUCLEAN; + } + return 0; + } + + /* + * For subvolume trees, owners can mismatch, but they should all belong + * to subvolume trees. + */ + if (unlikely(is_subvol != is_fstree(eb_owner))) { + btrfs_crit(eb->fs_info, +"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root_owner, btrfs_header_bytenr(eb), eb_owner, + BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID); + return -EUCLEAN; + } + return 0; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 32fecc9dc1dd..ece497e26558 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -25,5 +25,6 @@ int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); +int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e65633686378..370388fadf96 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -333,7 +333,7 @@ static int process_one_buffer(struct btrfs_root *log, * pin down any logged extents, so we have to read the block. */ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = btrfs_read_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, gen, level, NULL); if (ret) return ret; } @@ -894,8 +894,7 @@ update_inode: btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out: - if (inode) - iput(inode); + iput(inode); return ret; } @@ -2575,7 +2574,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, int i; int ret; - ret = btrfs_read_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, gen, level, NULL); if (ret) return ret; @@ -2786,7 +2785,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - ret = btrfs_read_buffer(next, ptr_gen, + ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); if (ret) { free_extent_buffer(next); @@ -2815,7 +2814,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); if (ret) { free_extent_buffer(next); return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a8cc736731fd..9c20049d1fec 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -164,24 +164,12 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { */ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) { - if (flags & BTRFS_BLOCK_GROUP_RAID10) - return BTRFS_RAID_RAID10; - else if (flags & BTRFS_BLOCK_GROUP_RAID1) - return BTRFS_RAID_RAID1; - else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) - return BTRFS_RAID_RAID1C3; - else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) - return BTRFS_RAID_RAID1C4; - else if (flags & 
BTRFS_BLOCK_GROUP_DUP) - return BTRFS_RAID_DUP; - else if (flags & BTRFS_BLOCK_GROUP_RAID0) - return BTRFS_RAID_RAID0; - else if (flags & BTRFS_BLOCK_GROUP_RAID5) - return BTRFS_RAID_RAID5; - else if (flags & BTRFS_BLOCK_GROUP_RAID6) - return BTRFS_RAID_RAID6; - - return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ + const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); + + if (!profile) + return BTRFS_RAID_SINGLE; + + return BTRFS_BG_FLAG_TO_INDEX(profile); } const char *btrfs_bg_type_to_raid_name(u64 flags) @@ -405,7 +393,6 @@ void btrfs_free_device(struct btrfs_device *device) WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state); - bio_put(device->flush_bio); btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -643,7 +630,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!blk_queue_nonrot(bdev_get_queue(bdev))) + if (!bdev_nonrot(bdev)) fs_devices->rotating = true; device->bdev = bdev; @@ -2706,7 +2693,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!blk_queue_nonrot(bdev_get_queue(bdev))) + if (!bdev_nonrot(bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); @@ -4063,13 +4050,6 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) return true; - if (fs_info->sectorsize < PAGE_SIZE && - bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { - btrfs_err(fs_info, - "RAID56 is not yet supported for sectorsize %u with page size %lu", - fs_info->sectorsize, PAGE_SIZE); - return false; - } /* Profile is valid and does not have bits outside of the allowed set */ if (alloc_profile_is_valid(bargs->target, 1) && (bargs->target & ~allowed) == 0) @@ -6313,7 +6293,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, u64 offset; u64 stripe_offset; u64 stripe_nr; - u64 stripe_len; + u32 stripe_len; u64 raid56_full_stripe_start = (u64)-1; int data_stripes; @@ -6324,19 +6304,13 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, offset = logical - em->start; /* Len of a stripe in a chunk */ stripe_len = map->stripe_len; - /* Stripe where this block falls in */ - stripe_nr = div64_u64(offset, stripe_len); - /* Offset of stripe in the chunk */ - stripe_offset = stripe_nr * stripe_len; - if (offset < stripe_offset) { - btrfs_crit(fs_info, -"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", - stripe_offset, offset, em->start, logical, stripe_len); - return -EINVAL; - } + /* + * Stripe_nr is where this block falls in + * stripe_offset is the offset of this block in its stripe. + */ + stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); + ASSERT(stripe_offset < U32_MAX); - /* stripe_offset is the offset of this block in its stripe */ - stripe_offset = offset - stripe_offset; data_stripes = nr_data_stripes(map); /* Only stripe based profiles needs to check against stripe length. 
*/ @@ -6738,11 +6712,11 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, bio->bi_iter.bi_size); - bio_set_dev(bio, dev->bdev); btrfs_bio_counter_inc_noblocked(fs_info); - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); } static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) @@ -6824,10 +6798,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, continue; } - if (dev_nr < total_devs - 1) - bio = btrfs_bio_clone(first_bio); - else + if (dev_nr < total_devs - 1) { + bio = btrfs_bio_clone(dev->bdev, first_bio); + } else { bio = first_bio; + bio_set_dev(bio, dev->bdev); + } submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); } @@ -6949,16 +6925,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (!dev) return ERR_PTR(-ENOMEM); - /* - * Preallocate a bio that's always going to be used for flushing device - * barriers and matches the device lifespan - */ - dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); - if (!dev->flush_bio) { - kfree(dev); - return ERR_PTR(-ENOMEM); - } - INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->post_commit_list); @@ -7370,7 +7336,6 @@ static int read_one_dev(struct extent_buffer *leaf, int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root = fs_info->tree_root; struct btrfs_super_block *super_copy = fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; @@ -7386,30 +7351,16 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); + /* - * This will create extent buffer of nodesize, superblock size is - * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will - * overallocate but we can keep it as-is, only the first page is used. + * We allocated a dummy extent, just to use extent buffer accessors. + * There will be unused space after BTRFS_SUPER_INFO_SIZE, but + * that's fine, we will not go beyond system chunk array anyway. */ - sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, - root->root_key.objectid, 0); - if (IS_ERR(sb)) - return PTR_ERR(sb); + sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); + if (!sb) + return -ENOMEM; set_extent_buffer_uptodate(sb); - /* - * The sb extent buffer is artificial and just used to read the system array. - * set_extent_buffer_uptodate() call does not properly mark all it's - * pages up-to-date when the page is larger: extent does not cover the - * whole page and consequently check_page_uptodate does not find all - * the page's extents up-to-date (the hole beyond sb), - * write_extent_buffer then triggers a WARN_ON. - * - * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, - * but sb spans only this function. Add an explicit SetPageUptodate call - * to silence the warning eg. on PowerPC 64. 
- */ - if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) - SetPageUptodate(sb->pages[0]); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); @@ -7572,6 +7523,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) struct btrfs_key found_key; int ret; int slot; + int iter_ret = 0; u64 total_dev = 0; u64 last_ra_node = 0; @@ -7615,30 +7567,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.offset = 0; key.type = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto error; - while (1) { - struct extent_buffer *node; + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + struct extent_buffer *node = path->nodes[1]; leaf = path->nodes[0]; slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto error; - break; - } - node = path->nodes[1]; + if (node) { if (last_ra_node != node->start) { readahead_tree_node_children(node); last_ra_node = node->start; } } - btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.type == BTRFS_DEV_ITEM_KEY) { struct btrfs_dev_item *dev_item; dev_item = btrfs_item_ptr(leaf, slot, @@ -7663,7 +7603,11 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) if (ret) goto error; } - path->slots[0]++; + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto error; } /* @@ -7671,12 +7615,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * do another round of validation checks. */ if (total_dev != fs_info->fs_devices->total_devices) { - btrfs_err(fs_info, - "super_num_devices %llu mismatch with num_devices %llu found here", + btrfs_warn(fs_info, +"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", btrfs_super_num_devices(fs_info->super_copy), total_dev); - ret = -EINVAL; - goto error; + fs_info->fs_devices->total_devices = total_dev; + btrfs_set_super_num_devices(fs_info->super_copy, total_dev); } if (btrfs_super_total_bytes(fs_info->super_copy) < fs_info->fs_devices->total_rw_bytes) { @@ -8288,7 +8232,7 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) static int relocating_repair_kthread(void *data) { - struct btrfs_block_group *cache = (struct btrfs_block_group *)data; + struct btrfs_block_group *cache = data; struct btrfs_fs_info *fs_info = cache->fs_info; u64 target; int ret = 0; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f3e28f11cfb6..6721002000ee 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -17,17 +17,51 @@ extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K +/* Used by sanity check for btrfs_raid_types. */ +#define const_ffs(n) (__builtin_ctzll(n) + 1) + +/* + * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires + * RAID0 always to be the lowest profile bit. + * Although it's part of on-disk format and should never change, do extra + * compile-time sanity checks. 
+ */ +static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < + const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); +static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) > + ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); + +/* ilog2() can handle both constants and variables */ +#define BTRFS_BG_FLAG_TO_INDEX(profile) \ + ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1)) + +enum btrfs_raid_types { + /* SINGLE is the special one as it doesn't have on-disk bit. */ + BTRFS_RAID_SINGLE = 0, + + BTRFS_RAID_RAID0 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0), + BTRFS_RAID_RAID1 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1), + BTRFS_RAID_DUP = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP), + BTRFS_RAID_RAID10 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10), + BTRFS_RAID_RAID5 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5), + BTRFS_RAID_RAID6 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6), + BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3), + BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4), + + BTRFS_NR_RAID_TYPES +}; + struct btrfs_io_geometry { /* remaining bytes before crossing a stripe */ u64 len; /* offset of logical address in chunk */ u64 offset; /* length of single IO stripe */ - u64 stripe_len; + u32 stripe_len; + /* offset of address in stripe */ + u32 stripe_offset; /* number of stripe where address falls */ u64 stripe_nr; - /* offset of address in stripe */ - u64 stripe_offset; /* offset of raid56 stripe into the chunk */ u64 raid56_stripe_offset; }; @@ -121,8 +155,8 @@ struct btrfs_device { /* bytes used on the current transaction */ u64 commit_bytes_used; - /* for sending down flush barriers */ - struct bio *flush_bio; + /* Bio used for flushing device barriers */ + struct bio flush_bio; struct completion flush_wait; /* per-device scrub information */ @@ -430,7 +464,7 @@ struct map_lookup { u64 type; int io_align; int io_width; - u64 stripe_len; + u32 stripe_len; int num_stripes; int sub_stripes; int verified_stripes; /* For mount time dev extent verification */ diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 85691dc2232f..7421abcf325a 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -272,10 +272,12 @@ out: ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { + struct btrfs_key found_key; struct btrfs_key key; struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; + int iter_ret = 0; int ret = 0; size_t total_size = 0, size_left = size; @@ -294,44 +296,23 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) path->reada = READA_FORWARD; /* search for our xattrs */ - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *leaf; int slot; struct btrfs_dir_item *di; - struct btrfs_key found_key; u32 item_size; u32 cur; leaf = path->nodes[0]; slot = path->slots[0]; - /* this is where we start walking through the path */ - if (slot >= btrfs_header_nritems(leaf)) { - /* - * if we've reached the last slot in this leaf we need - * to go to the next leaf and reset everything - */ - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto err; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &found_key, slot); - /* check to make sure this item is what we want */ if (found_key.objectid != key.objectid) break; if (found_key.type > BTRFS_XATTR_ITEM_KEY) break; 
if (found_key.type < BTRFS_XATTR_ITEM_KEY) - goto next_item; + continue; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); item_size = btrfs_item_size(leaf, slot); @@ -351,8 +332,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) goto next; if (!buffer || (name_len + 1) > size_left) { - ret = -ERANGE; - goto err; + iter_ret = -ERANGE; + break; } read_extent_buffer(leaf, buffer, name_ptr, name_len); @@ -364,12 +345,13 @@ next: cur += this_len; di = (struct btrfs_dir_item *)((char *)di + this_len); } -next_item: - path->slots[0]++; } - ret = total_size; -err: + if (iter_ret < 0) + ret = iter_ret; + else + ret = total_size; + btrfs_free_path(path); return ret; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d31b0eda210f..11237a913bee 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -51,11 +51,13 @@ #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) /* - * Maximum supported zone size. Currently, SMR disks have a zone size of - * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not - * expect the zone size to become larger than 8GiB in the near future. + * Minimum / maximum supported zone size. Currently, SMR disks have a zone + * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. + * We do not expect the zone size to become larger than 8GiB or smaller than + * 4MiB in the near future. */ #define BTRFS_MAX_ZONE_SIZE SZ_8G +#define BTRFS_MIN_ZONE_SIZE SZ_4M #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) @@ -350,7 +352,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; - struct request_queue *queue = bdev_get_queue(bdev); unsigned int max_active_zones; unsigned int nactive; sector_t nr_sectors; @@ -402,6 +403,13 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); ret = -EINVAL; goto out; + } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { + btrfs_err_in_rcu(fs_info, + "zoned: %s: zone size %llu smaller than supported minimum %u", + rcu_str_deref(device->name), + zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); + ret = -EINVAL; + goto out; } nr_sectors = bdev_nr_sectors(bdev); @@ -410,7 +418,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; - max_active_zones = queue_max_active_zones(queue); + max_active_zones = bdev_max_active_zones(bdev); if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { btrfs_err_in_rcu(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", @@ -1836,7 +1844,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) } /* No space left */ - if (block_group->alloc_offset == block_group->zone_capacity) { + if (btrfs_zoned_bg_is_full(block_group)) { ret = false; goto out_unlock; } @@ -1873,20 +1881,14 @@ out_unlock: return ret; } -int btrfs_zone_finish(struct btrfs_block_group *block_group) +static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct map_lookup *map; - struct btrfs_device *device; - u64 physical; + bool need_zone_finish; int ret = 0; int i; - if (!btrfs_is_zoned(fs_info)) - return 0; - - map = block_group->physical_map; - spin_lock(&block_group->lock); 
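The new BTRFS_MIN_ZONE_SIZE check above sits next to the existing BTRFS_MAX_ZONE_SIZE check in btrfs_get_dev_zone_info(). A minimal userspace sketch of that validation, with SZ_4M/SZ_8G standing in for the kernel size constants:

/* Sketch of the zone-size sanity check added above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_4M (4ULL * 1024 * 1024)
#define SZ_8G (8ULL * 1024 * 1024 * 1024)

static bool zone_size_supported(uint64_t zone_size)
{
	if (zone_size > SZ_8G) {
		fprintf(stderr, "zone size %llu larger than supported maximum\n",
			(unsigned long long)zone_size);
		return false;
	}
	if (zone_size < SZ_4M) {
		fprintf(stderr, "zone size %llu smaller than supported minimum\n",
			(unsigned long long)zone_size);
		return false;
	}
	return true;
}

int main(void)
{
	printf("256MiB: %d\n", zone_size_supported(256ULL << 20)); /* typical SMR zone */
	printf("1MiB:   %d\n", zone_size_supported(1ULL << 20));   /* now rejected */
	return 0;
}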
if (!block_group->zone_is_active) { spin_unlock(&block_group->lock); @@ -1896,40 +1898,56 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) /* Check if we have unwritten allocated space */ if ((block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && - block_group->alloc_offset > block_group->meta_write_pointer) { + block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { spin_unlock(&block_group->lock); return -EAGAIN; } - spin_unlock(&block_group->lock); - - ret = btrfs_inc_block_group_ro(block_group, false); - if (ret) - return ret; - - /* Ensure all writes in this block group finish */ - btrfs_wait_block_group_reservations(block_group); - /* No need to wait for NOCOW writers. Zoned mode does not allow that. */ - btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, - block_group->length); - - spin_lock(&block_group->lock); /* - * Bail out if someone already deactivated the block group, or - * allocated space is left in the block group. + * If we are sure that the block group is full (= no more room left for + * new allocation) and the IO for the last usable block is completed, we + * don't need to wait for the other IOs. This holds because we ensure + * the sequential IO submissions using the ZONE_APPEND command for data + * and block_group->meta_write_pointer for metadata. */ - if (!block_group->zone_is_active) { + if (!fully_written) { spin_unlock(&block_group->lock); - btrfs_dec_block_group_ro(block_group); - return 0; - } - if (block_group->reserved) { - spin_unlock(&block_group->lock); - btrfs_dec_block_group_ro(block_group); - return -EAGAIN; + ret = btrfs_inc_block_group_ro(block_group, false); + if (ret) + return ret; + + /* Ensure all writes in this block group finish */ + btrfs_wait_block_group_reservations(block_group); + /* No need to wait for NOCOW writers. Zoned mode does not allow that */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, + block_group->length); + + spin_lock(&block_group->lock); + + /* + * Bail out if someone already deactivated the block group, or + * allocated space is left in the block group. + */ + if (!block_group->zone_is_active) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return 0; + } + + if (block_group->reserved) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return -EAGAIN; + } } + /* + * The block group is not fully allocated, so not fully written yet. We + * need to send ZONE_FINISH command to free up an active zone. 
+ */ + need_zone_finish = !btrfs_zoned_bg_is_full(block_group); + block_group->zone_is_active = 0; block_group->alloc_offset = block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; @@ -1937,24 +1955,29 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); + map = block_group->physical_map; for (i = 0; i < map->num_stripes; i++) { - device = map->stripes[i].dev; - physical = map->stripes[i].physical; + struct btrfs_device *device = map->stripes[i].dev; + const u64 physical = map->stripes[i].physical; if (device->zone_info->max_active_zones == 0) continue; - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, - physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, - GFP_NOFS); + if (need_zone_finish) { + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + device->zone_info->zone_size >> SECTOR_SHIFT, + GFP_NOFS); - if (ret) - return ret; + if (ret) + return ret; + } btrfs_dev_clear_active_zone(device, physical); } - btrfs_dec_block_group_ro(block_group); + + if (!fully_written) + btrfs_dec_block_group_ro(block_group); spin_lock(&fs_info->zone_active_bgs_lock); ASSERT(!list_empty(&block_group->active_bg_list)); @@ -1967,6 +1990,14 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) return 0; } +int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + if (!btrfs_is_zoned(block_group->fs_info)) + return 0; + + return do_zone_finish(block_group, false); +} + bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) { struct btrfs_fs_info *fs_info = fs_devices->fs_info; @@ -1998,9 +2029,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; - struct map_lookup *map; - struct btrfs_device *device; - u64 physical; + u64 min_alloc_bytes; if (!btrfs_is_zoned(fs_info)) return; @@ -2008,42 +2037,52 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len block_group = btrfs_lookup_block_group(fs_info, logical); ASSERT(block_group); - if (logical + length < block_group->start + block_group->zone_capacity) - goto out; - - spin_lock(&block_group->lock); + /* No MIXED_BG on zoned btrfs. */ + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) + min_alloc_bytes = fs_info->sectorsize; + else + min_alloc_bytes = fs_info->nodesize; - if (!block_group->zone_is_active) { - spin_unlock(&block_group->lock); + /* Bail out if we can allocate more data from this block group. 
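do_zone_finish() above only issues REQ_OP_ZONE_FINISH when the block group still has unallocated room: a zone whose write pointer has reached its capacity transitions to the FULL state on its own, so no explicit finish is needed. A small sketch of that decision, with a stand-in struct for the relevant block group fields:

/* Sketch of the need_zone_finish decision in do_zone_finish() above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct bg_state {
	uint64_t alloc_offset;   /* bytes already allocated in the zone */
	uint64_t zone_capacity;  /* usable bytes in the zone */
};

/* Mirrors btrfs_zoned_bg_is_full(). */
static bool bg_is_full(const struct bg_state *bg)
{
	return bg->alloc_offset == bg->zone_capacity;
}

/*
 * A ZONE_FINISH command is only needed when allocation stopped short of the
 * zone capacity; a fully written zone becomes FULL without it.
 */
static bool need_zone_finish(const struct bg_state *bg)
{
	return !bg_is_full(bg);
}

int main(void)
{
	struct bg_state partly = { .alloc_offset = 16 << 20, .zone_capacity = 256 << 20 };
	struct bg_state full   = { .alloc_offset = 256 << 20, .zone_capacity = 256 << 20 };

	printf("partly written: finish needed = %d\n", need_zone_finish(&partly));
	printf("fully written:  finish needed = %d\n", need_zone_finish(&full));
	return 0;
}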
*/ + if (logical + length + min_alloc_bytes <= + block_group->start + block_group->zone_capacity) goto out; - } - block_group->zone_is_active = 0; - /* We should have consumed all the free space */ - ASSERT(block_group->alloc_offset == block_group->zone_capacity); - ASSERT(block_group->free_space_ctl->free_space == 0); - btrfs_clear_treelog_bg(block_group); - btrfs_clear_data_reloc_bg(block_group); - spin_unlock(&block_group->lock); + do_zone_finish(block_group, true); - map = block_group->physical_map; - device = map->stripes[0].dev; - physical = map->stripes[0].physical; +out: + btrfs_put_block_group(block_group); +} - if (!device->zone_info->max_active_zones) - goto out; +static void btrfs_zone_finish_endio_workfn(struct work_struct *work) +{ + struct btrfs_block_group *bg = + container_of(work, struct btrfs_block_group, zone_finish_work); - btrfs_dev_clear_active_zone(device, physical); + wait_on_extent_buffer_writeback(bg->last_eb); + free_extent_buffer(bg->last_eb); + btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); + btrfs_put_block_group(bg); +} - spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(!list_empty(&block_group->active_bg_list)); - list_del_init(&block_group->active_bg_list); - spin_unlock(&fs_info->zone_active_bgs_lock); +void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) +{ + if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) + return; - btrfs_put_block_group(block_group); + if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { + btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing", + bg->start); + return; + } -out: - btrfs_put_block_group(block_group); + /* For the work */ + btrfs_get_block_group(bg); + atomic_inc(&eb->refs); + bg->last_eb = eb; + INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); + queue_work(system_unbound_wq, &bg->zone_finish_work); } void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) @@ -2073,3 +2112,30 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) } mutex_unlock(&fs_devices->device_list_mutex); } + +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 used = 0; + u64 total = 0; + u64 factor; + + ASSERT(btrfs_is_zoned(fs_info)); + + if (fs_info->bg_reclaim_threshold == 0) + return false; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + + total += device->disk_total_bytes; + used += device->bytes_used; + } + mutex_unlock(&fs_devices->device_list_mutex); + + factor = div64_u64(used * 100, total); + return factor >= fs_info->bg_reclaim_threshold; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 6dee76248cb4..bb1a189e11f9 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -10,11 +10,7 @@ #include "block-group.h" #include "btrfs_inode.h" -/* - * Block groups with more than this value (percents) of unusable space will be - * scheduled for background reclaim. 
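btrfs_zoned_should_reclaim() above compares overall device usage, expressed as a percentage, against fs_info->bg_reclaim_threshold. The arithmetic in isolation (the total == 0 guard is only there to keep the standalone sketch safe):

/* Userspace sketch of the btrfs_zoned_should_reclaim() calculation above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool should_reclaim(uint64_t used, uint64_t total, uint64_t threshold)
{
	uint64_t factor;

	if (threshold == 0 || total == 0)
		return false;

	factor = used * 100 / total;   /* div64_u64() in the kernel */
	return factor >= threshold;
}

int main(void)
{
	/* With the default threshold of 75%, 80 of 100 GiB used triggers reclaim. */
	printf("%d\n", should_reclaim(80ULL << 30, 100ULL << 30, 75));
	printf("%d\n", should_reclaim(50ULL << 30, 100ULL << 30, 75));
	return 0;
}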
- */ -#define BTRFS_DEFAULT_RECLAIM_THRESH 75 +#define BTRFS_DEFAULT_RECLAIM_THRESH (75) struct btrfs_zoned_device_info { /* @@ -76,8 +72,11 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length); +void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -233,9 +232,17 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { } +static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) { } + static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } + +static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +{ + return false; +} #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -370,4 +377,10 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock); } +static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg) +{ + ASSERT(btrfs_is_zoned(bg->fs_info)); + return (bg->alloc_offset == bg->zone_capacity); +} + #endif diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index fc42dd0badd7..0fe31a6f6e68 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -93,22 +93,26 @@ static inline struct workspace *list_to_workspace(struct list_head *list) void zstd_free_workspace(struct list_head *ws); struct list_head *zstd_alloc_workspace(unsigned int level); -/* - * zstd_reclaim_timer_fn - reclaim timer + +/** + * Timer callback to free unused workspaces. + * * @t: timer * * This scans the lru_list and attempts to reclaim any workspace that hasn't * been used for ZSTD_BTRFS_RECLAIM_JIFFIES. + * + * The context is softirq and does not need the _bh locking primitives. */ static void zstd_reclaim_timer_fn(struct timer_list *timer) { unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; - spin_lock_bh(&wsm.lock); + spin_lock(&wsm.lock); if (list_empty(&wsm.lru_list)) { - spin_unlock_bh(&wsm.lock); + spin_unlock(&wsm.lock); return; } @@ -137,7 +141,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) if (!list_empty(&wsm.lru_list)) mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); - spin_unlock_bh(&wsm.lock); + spin_unlock(&wsm.lock); } /* diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index 719faeeda168..8df715640a48 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -26,3 +26,15 @@ config CACHEFILES_ERROR_INJECTION help This permits error injection to be enabled in cachefiles whilst a cache is in service. + +config CACHEFILES_ONDEMAND + bool "Support for on-demand read" + depends on CACHEFILES + default n + help + This permits userspace to enable the cachefiles on-demand read mode. 
+ In this mode, when a cache miss occurs, responsibility for fetching + the data lies with the cachefiles backend instead of with the netfs + and is delegated to userspace. + + If unsure, say N. diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile index 16d811f1a2fa..c37a7a9af10b 100644 --- a/fs/cachefiles/Makefile +++ b/fs/cachefiles/Makefile @@ -16,5 +16,6 @@ cachefiles-y := \ xattr.o cachefiles-$(CONFIG_CACHEFILES_ERROR_INJECTION) += error_inject.o +cachefiles-$(CONFIG_CACHEFILES_ONDEMAND) += ondemand.o obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 7ac04ee2c0a0..aa4efcabb5e3 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -75,6 +75,9 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { { "inuse", cachefiles_daemon_inuse }, { "secctx", cachefiles_daemon_secctx }, { "tag", cachefiles_daemon_tag }, +#ifdef CONFIG_CACHEFILES_ONDEMAND + { "copen", cachefiles_ondemand_copen }, +#endif { "", NULL } }; @@ -108,6 +111,9 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) INIT_LIST_HEAD(&cache->volumes); INIT_LIST_HEAD(&cache->object_list); spin_lock_init(&cache->object_list_lock); + refcount_set(&cache->unbind_pincount, 1); + xa_init_flags(&cache->reqs, XA_FLAGS_ALLOC); + xa_init_flags(&cache->ondemand_ids, XA_FLAGS_ALLOC1); /* set default caching limits * - limit at 1% free space and/or free files @@ -126,6 +132,53 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file) return 0; } +static void cachefiles_flush_reqs(struct cachefiles_cache *cache) +{ + struct xarray *xa = &cache->reqs; + struct cachefiles_req *req; + unsigned long index; + + /* + * Make sure the following two operations won't be reordered. + * 1) set CACHEFILES_DEAD bit + * 2) flush requests in the xarray + * Otherwise the request may be enqueued after xarray has been + * flushed, leaving the orphan request never being completed. + * + * CPU 1 CPU 2 + * ===== ===== + * flush requests in the xarray + * test CACHEFILES_DEAD bit + * enqueue the request + * set CACHEFILES_DEAD bit + */ + smp_mb(); + + xa_lock(xa); + xa_for_each(xa, index, req) { + req->error = -EIO; + complete(&req->done); + } + xa_unlock(xa); + + xa_destroy(&cache->reqs); + xa_destroy(&cache->ondemand_ids); +} + +void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache) +{ + if (refcount_dec_and_test(&cache->unbind_pincount)) { + cachefiles_daemon_unbind(cache); + cachefiles_open = 0; + kfree(cache); + } +} + +void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache) +{ + refcount_inc(&cache->unbind_pincount); +} + /* * Release a cache. */ @@ -139,36 +192,27 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file) set_bit(CACHEFILES_DEAD, &cache->flags); - cachefiles_daemon_unbind(cache); + if (cachefiles_in_ondemand_mode(cache)) + cachefiles_flush_reqs(cache); /* clean up the control file interface */ cache->cachefilesd = NULL; file->private_data = NULL; - cachefiles_open = 0; - kfree(cache); + cachefiles_put_unbind_pincount(cache); _leave(""); return 0; } -/* - * Read the cache state. 
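The unbind_pincount introduced in daemon.c above keeps the cachefiles_cache structure alive until both the daemon has closed /dev/cachefiles and every on-demand anon fd has been released; whichever put drops the last reference performs the unbind and frees the cache. A sketch of that pattern, with C11 stdatomic standing in for the kernel's refcount_t:

/* Sketch of the unbind pincount pattern used above. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct cache {
	atomic_int unbind_pincount;   /* starts at 1 when the daemon opens */
};

static void cache_teardown(struct cache *cache)
{
	/* cachefiles_daemon_unbind() + kfree() in the real code */
	printf("unbinding and freeing cache\n");
	free(cache);
}

static void get_unbind_pincount(struct cache *cache)
{
	atomic_fetch_add(&cache->unbind_pincount, 1);
}

static void put_unbind_pincount(struct cache *cache)
{
	if (atomic_fetch_sub(&cache->unbind_pincount, 1) == 1)
		cache_teardown(cache);
}

int main(void)
{
	struct cache *cache = malloc(sizeof(*cache));

	atomic_init(&cache->unbind_pincount, 1);  /* daemon open */
	get_unbind_pincount(cache);               /* anon fd created */
	put_unbind_pincount(cache);               /* daemon release */
	put_unbind_pincount(cache);               /* anon fd release: teardown */
	return 0;
}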
- */ -static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, - size_t buflen, loff_t *pos) +static ssize_t cachefiles_do_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen) { - struct cachefiles_cache *cache = file->private_data; unsigned long long b_released; unsigned f_released; char buffer[256]; int n; - //_enter(",,%zu,", buflen); - - if (!test_bit(CACHEFILES_READY, &cache->flags)) - return 0; - /* check how much space the cache has */ cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check); @@ -207,6 +251,25 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, } /* + * Read the cache state. + */ +static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, + size_t buflen, loff_t *pos) +{ + struct cachefiles_cache *cache = file->private_data; + + //_enter(",,%zu,", buflen); + + if (!test_bit(CACHEFILES_READY, &cache->flags)) + return 0; + + if (cachefiles_in_ondemand_mode(cache)) + return cachefiles_ondemand_daemon_read(cache, _buffer, buflen); + else + return cachefiles_do_daemon_read(cache, _buffer, buflen); +} + +/* * Take a command from cachefilesd, parse it and act on it. */ static ssize_t cachefiles_daemon_write(struct file *file, @@ -297,8 +360,13 @@ static __poll_t cachefiles_daemon_poll(struct file *file, poll_wait(file, &cache->daemon_pollwq, poll); mask = 0; - if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) - mask |= EPOLLIN; + if (cachefiles_in_ondemand_mode(cache)) { + if (!xa_empty(&cache->reqs)) + mask |= EPOLLIN; + } else { + if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) + mask |= EPOLLIN; + } if (test_bit(CACHEFILES_CULLING, &cache->flags)) mask |= EPOLLOUT; @@ -687,11 +755,6 @@ static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) cache->brun_percent >= 100) return -ERANGE; - if (*args) { - pr_err("'bind' command doesn't take an argument\n"); - return -EINVAL; - } - if (!cache->rootdirname) { pr_err("No cache directory specified\n"); return -EINVAL; @@ -703,6 +766,18 @@ static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) return -EBUSY; } + if (IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND)) { + if (!strcmp(args, "ondemand")) { + set_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags); + } else if (*args) { + pr_err("Invalid argument to the 'bind' command\n"); + return -EINVAL; + } + } else if (*args) { + pr_err("'bind' command doesn't take an argument\n"); + return -EINVAL; + } + /* Make sure we have copies of the tag string */ if (!cache->tag) { /* diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index ae93cee9d25d..a69073a1d3f0 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -362,6 +362,8 @@ static void cachefiles_withdraw_cookie(struct fscache_cookie *cookie) spin_unlock(&cache->object_list_lock); } + cachefiles_ondemand_clean_object(object); + if (object->file) { cachefiles_begin_secure(cache, &saved_cred); cachefiles_clean_up_object(object, cache); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index c793d33b0224..6cba2c6de2f9 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -15,6 +15,8 @@ #include <linux/fscache-cache.h> #include <linux/cred.h> #include <linux/security.h> +#include <linux/xarray.h> +#include <linux/cachefiles.h> #define CACHEFILES_DIO_BLOCK_SIZE 4096 @@ -58,8 +60,13 @@ struct cachefiles_object { enum cachefiles_content content_info:8; /* Info about content presence */ unsigned long flags; #define 
CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */ +#ifdef CONFIG_CACHEFILES_ONDEMAND + int ondemand_id; +#endif }; +#define CACHEFILES_ONDEMAND_ID_CLOSED -1 + /* * Cache files cache definition */ @@ -98,11 +105,31 @@ struct cachefiles_cache { #define CACHEFILES_DEAD 1 /* T if cache dead */ #define CACHEFILES_CULLING 2 /* T if cull engaged */ #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ +#define CACHEFILES_ONDEMAND_MODE 4 /* T if in on-demand read mode */ char *rootdirname; /* name of cache root directory */ char *secctx; /* LSM security context */ char *tag; /* cache binding tag */ + refcount_t unbind_pincount;/* refcount to do daemon unbind */ + struct xarray reqs; /* xarray of pending on-demand requests */ + struct xarray ondemand_ids; /* xarray for ondemand_id allocation */ + u32 ondemand_id_next; +}; + +static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) +{ + return IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND) && + test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags); +} + +struct cachefiles_req { + struct cachefiles_object *object; + struct completion done; + int error; + struct cachefiles_msg msg; }; +#define CACHEFILES_REQ_NEW XA_MARK_1 + #include <trace/events/cachefiles.h> static inline @@ -145,6 +172,8 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache, * daemon.c */ extern const struct file_operations cachefiles_daemon_fops; +extern void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache); +extern void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache); /* * error_inject.c @@ -201,6 +230,16 @@ extern void cachefiles_put_object(struct cachefiles_object *object, */ extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres, enum fscache_want_state want_state); +extern int __cachefiles_prepare_write(struct cachefiles_object *object, + struct file *file, + loff_t *_start, size_t *_len, + bool no_space_allocated_yet); +extern int __cachefiles_write(struct cachefiles_object *object, + struct file *file, + loff_t start_pos, + struct iov_iter *iter, + netfs_io_terminated_t term_func, + void *term_func_priv); /* * key.c @@ -241,6 +280,45 @@ extern bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, struct cachefiles_object *object); /* + * ondemand.c + */ +#ifdef CONFIG_CACHEFILES_ONDEMAND +extern ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen); + +extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache, + char *args); + +extern int cachefiles_ondemand_init_object(struct cachefiles_object *object); +extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object); + +extern int cachefiles_ondemand_read(struct cachefiles_object *object, + loff_t pos, size_t len); + +#else +static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen) +{ + return -EOPNOTSUPP; +} + +static inline int cachefiles_ondemand_init_object(struct cachefiles_object *object) +{ + return 0; +} + +static inline void cachefiles_ondemand_clean_object(struct cachefiles_object *object) +{ +} + +static inline int cachefiles_ondemand_read(struct cachefiles_object *object, + loff_t pos, size_t len) +{ + return -EOPNOTSUPP; +} +#endif + +/* * security.c */ extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 9dc81e781f2b..000a28f46e59 100644 --- a/fs/cachefiles/io.c +++ 
b/fs/cachefiles/io.c @@ -277,36 +277,33 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) /* * Initiate a write to the cache. */ -static int cachefiles_write(struct netfs_cache_resources *cres, - loff_t start_pos, - struct iov_iter *iter, - netfs_io_terminated_t term_func, - void *term_func_priv) +int __cachefiles_write(struct cachefiles_object *object, + struct file *file, + loff_t start_pos, + struct iov_iter *iter, + netfs_io_terminated_t term_func, + void *term_func_priv) { - struct cachefiles_object *object; struct cachefiles_cache *cache; struct cachefiles_kiocb *ki; struct inode *inode; - struct file *file; unsigned int old_nofs; - ssize_t ret = -ENOBUFS; + ssize_t ret; size_t len = iov_iter_count(iter); - if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) - goto presubmission_error; fscache_count_write(); - object = cachefiles_cres_object(cres); cache = object->volume->cache; - file = cachefiles_cres_file(cres); _enter("%pD,%li,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, i_size_read(file_inode(file))); - ret = -ENOMEM; ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); - if (!ki) - goto presubmission_error; + if (!ki) { + if (term_func) + term_func(term_func_priv, -ENOMEM, false); + return -ENOMEM; + } refcount_set(&ki->ki_refcnt, 2); ki->iocb.ki_filp = file; @@ -314,7 +311,6 @@ static int cachefiles_write(struct netfs_cache_resources *cres, ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE; ki->iocb.ki_ioprio = get_current_ioprio(); ki->object = object; - ki->inval_counter = cres->inval_counter; ki->start = start_pos; ki->len = len; ki->term_func = term_func; @@ -369,11 +365,24 @@ in_progress: cachefiles_put_kiocb(ki); _leave(" = %zd", ret); return ret; +} -presubmission_error: - if (term_func) - term_func(term_func_priv, ret, false); - return ret; +static int cachefiles_write(struct netfs_cache_resources *cres, + loff_t start_pos, + struct iov_iter *iter, + netfs_io_terminated_t term_func, + void *term_func_priv) +{ + if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) { + if (term_func) + term_func(term_func_priv, -ENOBUFS, false); + return -ENOBUFS; + } + + return __cachefiles_write(cachefiles_cres_object(cres), + cachefiles_cres_file(cres), + start_pos, iter, + term_func, term_func_priv); } /* @@ -394,6 +403,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER; loff_t off, to; ino_t ino = file ? file_inode(file)->i_ino : 0; + int rc; _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size); @@ -406,7 +416,8 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) { __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); why = cachefiles_trace_read_no_data; - goto out_no_object; + if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) + goto out_no_object; } /* The object and the file may be being created in the background. 
*/ @@ -423,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * object = cachefiles_cres_object(cres); cache = object->volume->cache; cachefiles_begin_secure(cache, &saved_cred); - +retry: off = cachefiles_inject_read_error(); if (off == 0) off = vfs_llseek(file, subreq->start, SEEK_DATA); @@ -474,6 +485,15 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * download_and_store: __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); + if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) { + rc = cachefiles_ondemand_read(object, subreq->start, + subreq->len); + if (!rc) { + __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags); + goto retry; + } + ret = NETFS_INVALID_READ; + } out: cachefiles_end_secure(cache, saved_cred); out_no_object: @@ -484,13 +504,12 @@ out_no_object: /* * Prepare for a write to occur. */ -static int __cachefiles_prepare_write(struct netfs_cache_resources *cres, - loff_t *_start, size_t *_len, loff_t i_size, - bool no_space_allocated_yet) +int __cachefiles_prepare_write(struct cachefiles_object *object, + struct file *file, + loff_t *_start, size_t *_len, + bool no_space_allocated_yet) { - struct cachefiles_object *object = cachefiles_cres_object(cres); struct cachefiles_cache *cache = object->volume->cache; - struct file *file = cachefiles_cres_file(cres); loff_t start = *_start, pos; size_t len = *_len, down; int ret; @@ -577,7 +596,8 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres, } cachefiles_begin_secure(cache, &saved_cred); - ret = __cachefiles_prepare_write(cres, _start, _len, i_size, + ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres), + _start, _len, no_space_allocated_yet); cachefiles_end_secure(cache, saved_cred); return ret; diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index ca9f3e4ec4b3..facf2ebe464b 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -452,10 +452,9 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; struct file *file; struct path path; - uint64_t ni_size = object->cookie->object_size; + uint64_t ni_size; long ret; - ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); cachefiles_begin_secure(cache, &saved_cred); @@ -481,6 +480,15 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) goto out_dput; } + ret = cachefiles_ondemand_init_object(object); + if (ret < 0) { + file = ERR_PTR(ret); + goto out_unuse; + } + + ni_size = object->cookie->object_size; + ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); + if (ni_size > 0) { trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size, cachefiles_trunc_expand_tmpfile); @@ -586,6 +594,10 @@ static bool cachefiles_open_file(struct cachefiles_object *object, } _debug("file -> %pd positive", dentry); + ret = cachefiles_ondemand_init_object(object); + if (ret < 0) + goto error_fput; + ret = cachefiles_check_auxdata(object, file); if (ret < 0) goto check_failed; diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c new file mode 100644 index 000000000000..a41ae6efc545 --- /dev/null +++ b/fs/cachefiles/ondemand.c @@ -0,0 +1,503 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/fdtable.h> +#include <linux/anon_inodes.h> +#include <linux/uio.h> +#include "internal.h" + +static int cachefiles_ondemand_fd_release(struct inode *inode, + struct file *file) +{ + struct cachefiles_object *object = 
file->private_data; + struct cachefiles_cache *cache = object->volume->cache; + int object_id = object->ondemand_id; + struct cachefiles_req *req; + XA_STATE(xas, &cache->reqs, 0); + + xa_lock(&cache->reqs); + object->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; + + /* + * Flush all pending READ requests since their completion depends on + * anon_fd. + */ + xas_for_each(&xas, req, ULONG_MAX) { + if (req->msg.opcode == CACHEFILES_OP_READ) { + req->error = -EIO; + complete(&req->done); + xas_store(&xas, NULL); + } + } + xa_unlock(&cache->reqs); + + xa_erase(&cache->ondemand_ids, object_id); + trace_cachefiles_ondemand_fd_release(object, object_id); + cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd); + cachefiles_put_unbind_pincount(cache); + return 0; +} + +static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, + struct iov_iter *iter) +{ + struct cachefiles_object *object = kiocb->ki_filp->private_data; + struct cachefiles_cache *cache = object->volume->cache; + struct file *file = object->file; + size_t len = iter->count; + loff_t pos = kiocb->ki_pos; + const struct cred *saved_cred; + int ret; + + if (!file) + return -ENOBUFS; + + cachefiles_begin_secure(cache, &saved_cred); + ret = __cachefiles_prepare_write(object, file, &pos, &len, true); + cachefiles_end_secure(cache, saved_cred); + if (ret < 0) + return ret; + + trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len); + ret = __cachefiles_write(object, file, pos, iter, NULL, NULL); + if (!ret) + ret = len; + + return ret; +} + +static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos, + int whence) +{ + struct cachefiles_object *object = filp->private_data; + struct file *file = object->file; + + if (!file) + return -ENOBUFS; + + return vfs_llseek(file, pos, whence); +} + +static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct cachefiles_object *object = filp->private_data; + struct cachefiles_cache *cache = object->volume->cache; + struct cachefiles_req *req; + unsigned long id; + + if (ioctl != CACHEFILES_IOC_READ_COMPLETE) + return -EINVAL; + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return -EOPNOTSUPP; + + id = arg; + req = xa_erase(&cache->reqs, id); + if (!req) + return -EINVAL; + + trace_cachefiles_ondemand_cread(object, id); + complete(&req->done); + return 0; +} + +static const struct file_operations cachefiles_ondemand_fd_fops = { + .owner = THIS_MODULE, + .release = cachefiles_ondemand_fd_release, + .write_iter = cachefiles_ondemand_fd_write_iter, + .llseek = cachefiles_ondemand_fd_llseek, + .unlocked_ioctl = cachefiles_ondemand_fd_ioctl, +}; + +/* + * OPEN request Completion (copen) + * - command: "copen <id>,<cache_size>" + * <cache_size> indicates the object size if >=0, error code if negative + */ +int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) +{ + struct cachefiles_req *req; + struct fscache_cookie *cookie; + char *pid, *psize; + unsigned long id; + long size; + int ret; + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return -EOPNOTSUPP; + + if (!*args) { + pr_err("Empty id specified\n"); + return -EINVAL; + } + + pid = args; + psize = strchr(args, ','); + if (!psize) { + pr_err("Cache size is not specified\n"); + return -EINVAL; + } + + *psize = 0; + psize++; + + ret = kstrtoul(pid, 0, &id); + if (ret) + return ret; + + req = xa_erase(&cache->reqs, id); + if (!req) + return -EINVAL; + + /* fail OPEN request if copen format is invalid */ + ret 
= kstrtol(psize, 0, &size); + if (ret) { + req->error = ret; + goto out; + } + + /* fail OPEN request if daemon reports an error */ + if (size < 0) { + if (!IS_ERR_VALUE(size)) + size = -EINVAL; + req->error = size; + goto out; + } + + cookie = req->object->cookie; + cookie->object_size = size; + if (size) + clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); + else + set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); + trace_cachefiles_ondemand_copen(req->object, id, size); + +out: + complete(&req->done); + return ret; +} + +static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct cachefiles_open *load; + struct file *file; + u32 object_id; + int ret, fd; + + object = cachefiles_grab_object(req->object, + cachefiles_obj_get_ondemand_fd); + cache = object->volume->cache; + + ret = xa_alloc_cyclic(&cache->ondemand_ids, &object_id, NULL, + XA_LIMIT(1, INT_MAX), + &cache->ondemand_id_next, GFP_KERNEL); + if (ret < 0) + goto err; + + fd = get_unused_fd_flags(O_WRONLY); + if (fd < 0) { + ret = fd; + goto err_free_id; + } + + file = anon_inode_getfile("[cachefiles]", &cachefiles_ondemand_fd_fops, + object, O_WRONLY); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto err_put_fd; + } + + file->f_mode |= FMODE_PWRITE | FMODE_LSEEK; + fd_install(fd, file); + + load = (void *)req->msg.data; + load->fd = fd; + req->msg.object_id = object_id; + object->ondemand_id = object_id; + + cachefiles_get_unbind_pincount(cache); + trace_cachefiles_ondemand_open(object, &req->msg, load); + return 0; + +err_put_fd: + put_unused_fd(fd); +err_free_id: + xa_erase(&cache->ondemand_ids, object_id); +err: + cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd); + return ret; +} + +ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, + char __user *_buffer, size_t buflen) +{ + struct cachefiles_req *req; + struct cachefiles_msg *msg; + unsigned long id = 0; + size_t n; + int ret = 0; + XA_STATE(xas, &cache->reqs, 0); + + /* + * Search for a request that has not ever been processed, to prevent + * requests from being processed repeatedly. 
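From userspace, the on-demand flow above ("bind ondemand", cachefiles_ondemand_daemon_read() handing out request messages, and the "copen <id>,<size>" reply) looks roughly like the loop below. This is a minimal sketch, assuming the uapi definitions in <linux/cachefiles.h> that accompany this code; error handling is omitted, and the cache directory, tag, and object size are placeholders.

/* Hypothetical skeleton of an on-demand cachefiles daemon. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/cachefiles.h>

static void handle_open(int devfd, struct cachefiles_msg *msg)
{
	struct cachefiles_open *load = (void *)msg->data;
	char reply[64];
	int n;

	/*
	 * load->fd is the anon fd installed by cachefiles_ondemand_get_fd();
	 * keep it for later writes.  Completing the OPEN request means writing
	 * "copen <msg_id>,<size>" back, where <size> is the object size (or a
	 * negative errno).  The size here is a placeholder.
	 */
	printf("object: anon fd %u\n", load->fd);
	n = snprintf(reply, sizeof(reply), "copen %u,%lld", msg->msg_id, 1048576LL);
	write(devfd, reply, n);
}

int main(void)
{
	char buf[4096];		/* assumed large enough for any request message */
	int devfd = open("/dev/cachefiles", O_RDWR);

	if (devfd < 0)
		return 1;

	write(devfd, "dir /var/cache/fscache", strlen("dir /var/cache/fscache"));
	write(devfd, "tag mycache", strlen("tag mycache"));
	write(devfd, "bind ondemand", strlen("bind ondemand"));

	for (;;) {
		struct pollfd pfd = { .fd = devfd, .events = POLLIN };
		struct cachefiles_msg *msg = (void *)buf;

		poll(&pfd, 1, -1);
		if (read(devfd, buf, sizeof(buf)) <= 0)
			continue;

		if (msg->opcode == CACHEFILES_OP_OPEN)
			handle_open(devfd, msg);
		/*
		 * CACHEFILES_OP_READ: fetch the bytes, pwrite() them into the
		 * anon fd at the requested offset, then complete the request
		 * with ioctl(anon_fd, CACHEFILES_IOC_READ_COMPLETE, msg->msg_id).
		 * CACHEFILES_OP_CLOSE: close the anon fd; no reply is needed.
		 */
	}
	return 0;
}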
+ */ + xa_lock(&cache->reqs); + req = xas_find_marked(&xas, UINT_MAX, CACHEFILES_REQ_NEW); + if (!req) { + xa_unlock(&cache->reqs); + return 0; + } + + msg = &req->msg; + n = msg->len; + + if (n > buflen) { + xa_unlock(&cache->reqs); + return -EMSGSIZE; + } + + xas_clear_mark(&xas, CACHEFILES_REQ_NEW); + xa_unlock(&cache->reqs); + + id = xas.xa_index; + msg->msg_id = id; + + if (msg->opcode == CACHEFILES_OP_OPEN) { + ret = cachefiles_ondemand_get_fd(req); + if (ret) + goto error; + } + + if (copy_to_user(_buffer, msg, n) != 0) { + ret = -EFAULT; + goto err_put_fd; + } + + /* CLOSE request has no reply */ + if (msg->opcode == CACHEFILES_OP_CLOSE) { + xa_erase(&cache->reqs, id); + complete(&req->done); + } + + return n; + +err_put_fd: + if (msg->opcode == CACHEFILES_OP_OPEN) + close_fd(((struct cachefiles_open *)msg->data)->fd); +error: + xa_erase(&cache->reqs, id); + req->error = ret; + complete(&req->done); + return ret; +} + +typedef int (*init_req_fn)(struct cachefiles_req *req, void *private); + +static int cachefiles_ondemand_send_req(struct cachefiles_object *object, + enum cachefiles_opcode opcode, + size_t data_len, + init_req_fn init_req, + void *private) +{ + struct cachefiles_cache *cache = object->volume->cache; + struct cachefiles_req *req; + XA_STATE(xas, &cache->reqs, 0); + int ret; + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return 0; + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) + return -EIO; + + req = kzalloc(sizeof(*req) + data_len, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->object = object; + init_completion(&req->done); + req->msg.opcode = opcode; + req->msg.len = sizeof(struct cachefiles_msg) + data_len; + + ret = init_req(req, private); + if (ret) + goto out; + + do { + /* + * Stop enqueuing the request when daemon is dying. The + * following two operations need to be atomic as a whole. + * 1) check cache state, and + * 2) enqueue request if cache is alive. + * Otherwise the request may be enqueued after xarray has been + * flushed, leaving the orphan request never being completed. + * + * CPU 1 CPU 2 + * ===== ===== + * test CACHEFILES_DEAD bit + * set CACHEFILES_DEAD bit + * flush requests in the xarray + * enqueue the request + */ + xas_lock(&xas); + + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + xas_unlock(&xas); + ret = -EIO; + goto out; + } + + /* coupled with the barrier in cachefiles_flush_reqs() */ + smp_mb(); + + if (opcode != CACHEFILES_OP_OPEN && object->ondemand_id <= 0) { + WARN_ON_ONCE(object->ondemand_id == 0); + xas_unlock(&xas); + ret = -EIO; + goto out; + } + + xas.xa_index = 0; + xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK); + if (xas.xa_node == XAS_RESTART) + xas_set_err(&xas, -EBUSY); + xas_store(&xas, req); + xas_clear_mark(&xas, XA_FREE_MARK); + xas_set_mark(&xas, CACHEFILES_REQ_NEW); + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + ret = xas_error(&xas); + if (ret) + goto out; + + wake_up_all(&cache->daemon_pollwq); + wait_for_completion(&req->done); + ret = req->error; +out: + kfree(req); + return ret; +} + +static int cachefiles_ondemand_init_open_req(struct cachefiles_req *req, + void *private) +{ + struct cachefiles_object *object = req->object; + struct fscache_cookie *cookie = object->cookie; + struct fscache_volume *volume = object->volume->vcookie; + struct cachefiles_open *load = (void *)req->msg.data; + size_t volume_key_size, cookie_key_size; + void *volume_key, *cookie_key; + + /* + * Volume key is a NUL-terminated string. 
key[0] stores strlen() of the + * string, followed by the content of the string (excluding '\0'). + */ + volume_key_size = volume->key[0] + 1; + volume_key = volume->key + 1; + + /* Cookie key is binary data, which is netfs specific. */ + cookie_key_size = cookie->key_len; + cookie_key = fscache_get_key(cookie); + + if (!(object->cookie->advice & FSCACHE_ADV_WANT_CACHE_SIZE)) { + pr_err("WANT_CACHE_SIZE is needed for on-demand mode\n"); + return -EINVAL; + } + + load->volume_key_size = volume_key_size; + load->cookie_key_size = cookie_key_size; + memcpy(load->data, volume_key, volume_key_size); + memcpy(load->data + volume_key_size, cookie_key, cookie_key_size); + + return 0; +} + +static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req, + void *private) +{ + struct cachefiles_object *object = req->object; + int object_id = object->ondemand_id; + + /* + * It's possible that object id is still 0 if the cookie looking up + * phase failed before OPEN request has ever been sent. Also avoid + * sending CLOSE request for CACHEFILES_ONDEMAND_ID_CLOSED, which means + * anon_fd has already been closed. + */ + if (object_id <= 0) + return -ENOENT; + + req->msg.object_id = object_id; + trace_cachefiles_ondemand_close(object, &req->msg); + return 0; +} + +struct cachefiles_read_ctx { + loff_t off; + size_t len; +}; + +static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req, + void *private) +{ + struct cachefiles_object *object = req->object; + struct cachefiles_read *load = (void *)req->msg.data; + struct cachefiles_read_ctx *read_ctx = private; + int object_id = object->ondemand_id; + + /* Stop enqueuing requests when daemon has closed anon_fd. */ + if (object_id <= 0) { + WARN_ON_ONCE(object_id == 0); + pr_info_once("READ: anonymous fd closed prematurely.\n"); + return -EIO; + } + + req->msg.object_id = object_id; + load->off = read_ctx->off; + load->len = read_ctx->len; + trace_cachefiles_ondemand_read(object, &req->msg, load); + return 0; +} + +int cachefiles_ondemand_init_object(struct cachefiles_object *object) +{ + struct fscache_cookie *cookie = object->cookie; + struct fscache_volume *volume = object->volume->vcookie; + size_t volume_key_size, cookie_key_size, data_len; + + /* + * CacheFiles will firstly check the cache file under the root cache + * directory. If the coherency check failed, it will fallback to + * creating a new tmpfile as the cache file. Reuse the previously + * allocated object ID if any. 
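cachefiles_ondemand_init_open_req() above packs the OPEN payload as a volume key followed by the netfs-specific binary cookie key, with the two lengths carried in struct cachefiles_open. A sketch of how the daemon side might pick that payload apart, again assuming the uapi header; treating the volume key as a printable NUL-terminated string follows the comment above.

/* Pull apart msg->data for a CACHEFILES_OP_OPEN request (sketch). */
#include <stdio.h>
#include <linux/cachefiles.h>

void dump_open_payload(struct cachefiles_msg *msg)
{
	struct cachefiles_open *load = (void *)msg->data;
	const char *volume_key = (const char *)load->data;
	const unsigned char *cookie_key = load->data + load->volume_key_size;
	unsigned int i;

	/* The volume key is copied NUL-terminated; the cookie key is raw bytes. */
	printf("anon fd %u, volume '%s', cookie key (%u bytes):",
	       load->fd, volume_key, load->cookie_key_size);
	for (i = 0; i < load->cookie_key_size; i++)
		printf(" %02x", cookie_key[i]);
	printf("\n");
}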
+ */ + if (object->ondemand_id > 0) + return 0; + + volume_key_size = volume->key[0] + 1; + cookie_key_size = cookie->key_len; + data_len = sizeof(struct cachefiles_open) + + volume_key_size + cookie_key_size; + + return cachefiles_ondemand_send_req(object, CACHEFILES_OP_OPEN, + data_len, cachefiles_ondemand_init_open_req, NULL); +} + +void cachefiles_ondemand_clean_object(struct cachefiles_object *object) +{ + cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0, + cachefiles_ondemand_init_close_req, NULL); +} + +int cachefiles_ondemand_read(struct cachefiles_object *object, + loff_t pos, size_t len) +{ + struct cachefiles_read_ctx read_ctx = {pos, len}; + + return cachefiles_ondemand_send_req(object, CACHEFILES_OP_READ, + sizeof(struct cachefiles_read), + cachefiles_ondemand_init_read_req, &read_ctx); +} diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index aa25bffd4823..b6edcf89a429 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -85,7 +85,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) if (folio_test_dirty(folio)) { dout("%p dirty_folio %p idx %lu -- already dirty\n", mapping->host, folio, folio->index); - BUG_ON(!folio_get_private(folio)); + VM_BUG_ON_FOLIO(!folio_test_private(folio), folio); return false; } @@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) * Reference snap context in folio->private. Also set * PagePrivate so that we get invalidate_folio callback. */ - BUG_ON(folio_get_private(folio)); + VM_BUG_ON_FOLIO(folio_test_private(folio), folio); folio_attach_private(folio, snapc); return ceph_fscache_dirty_folio(mapping, folio); @@ -150,7 +150,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset, } WARN_ON(!folio_test_locked(folio)); - if (folio_get_private(folio)) { + if (folio_test_private(folio)) { dout("%p invalidate_folio idx %lu full dirty page\n", inode, folio->index); @@ -729,8 +729,11 @@ static void writepages_finish(struct ceph_osd_request *req) /* clean all pages */ for (i = 0; i < req->r_num_ops; i++) { - if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) + if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) { + pr_warn("%s incorrect op %d req %p index %d tid %llu\n", + __func__, req->r_ops[i].op, req, i, req->r_tid); break; + } osd_data = osd_req_op_extent_osd_data(req, i); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6c9e837aa1d3..8c8226c0feac 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -629,9 +629,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, iinfo.change_attr = 1; ceph_encode_timespec64(&iinfo.btime, &now); - iinfo.xattr_len = ARRAY_SIZE(xattr_buf); - iinfo.xattr_data = xattr_buf; - memset(iinfo.xattr_data, 0, iinfo.xattr_len); + if (req->r_pagelist) { + iinfo.xattr_len = req->r_pagelist->length; + iinfo.xattr_data = req->r_pagelist->mapped_tail; + } else { + /* fake it */ + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); + iinfo.xattr_data = xattr_buf; + memset(iinfo.xattr_data, 0, iinfo.xattr_len); + } in.ino = cpu_to_le64(vino.ino); in.snapid = cpu_to_le64(CEPH_NOSNAP); @@ -743,6 +749,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_security_init_secctx(dentry, mode, &as_ctx); if (err < 0) goto out_ctx; + /* Async create can't handle more than a page of xattrs */ + if (as_ctx.pagelist && + !list_is_singular(&as_ctx.pagelist->head)) + try_async = false; } else if (!d_in_lookup(dentry)) { /* If it's not being looked up, it's negative 
*/ return -ENOENT; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 526a4c1bed99..e78be66bbf01 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -113,7 +113,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, if (WARN_ON_ONCE(len <= 0)) return -EINVAL; - if (WARN_ON_ONCE(len % FS_CRYPTO_BLOCK_SIZE != 0)) + if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0)) return -EINVAL; fscrypt_generate_iv(&iv, lblk_num, ci); @@ -213,8 +213,8 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks); * fscrypt_encrypt_block_inplace() - Encrypt a filesystem block in-place * @inode: The inode to which this block belongs * @page: The page containing the block to encrypt - * @len: Size of block to encrypt. Doesn't need to be a multiple of the - * fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE. + * @len: Size of block to encrypt. This must be a multiple of + * FSCRYPT_CONTENTS_ALIGNMENT. * @offs: Byte offset within @page at which the block to encrypt begins * @lblk_num: Filesystem logical block number of the block, i.e. the 0-based * number of the block within the file @@ -283,8 +283,8 @@ EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks); * fscrypt_decrypt_block_inplace() - Decrypt a filesystem block in-place * @inode: The inode to which this block belongs * @page: The page containing the block to decrypt - * @len: Size of block to decrypt. Doesn't need to be a multiple of the - * fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE. + * @len: Size of block to decrypt. This must be a multiple of + * FSCRYPT_CONTENTS_ALIGNMENT. * @offs: Byte offset within @page at which the block to decrypt begins * @lblk_num: Filesystem logical block number of the block, i.e. the 0-based * number of the block within the file diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index a9be4bc74a94..14e0ef5e9a20 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -19,6 +19,13 @@ #include "fscrypt_private.h" /* + * The minimum message length (input and output length), in bytes, for all + * filenames encryption modes. Filenames shorter than this will be zero-padded + * before being encrypted. 
+ */ +#define FSCRYPT_FNAME_MIN_MSG_LEN 16 + +/* * struct fscrypt_nokey_name - identifier for directory entry when key is absent * * When userspace lists an encrypted directory without access to the key, the @@ -267,7 +274,7 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, if (orig_len > max_len) return false; - encrypted_len = max(orig_len, (u32)FS_CRYPTO_BLOCK_SIZE); + encrypted_len = max_t(u32, orig_len, FSCRYPT_FNAME_MIN_MSG_LEN); encrypted_len = round_up(encrypted_len, padding); *encrypted_len_ret = min(encrypted_len, max_len); return true; @@ -350,7 +357,7 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, return 0; } - if (iname->len < FS_CRYPTO_BLOCK_SIZE) + if (iname->len < FSCRYPT_FNAME_MIN_MSG_LEN) return -EUCLEAN; if (fscrypt_has_encryption_key(inode)) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 5b0a9e6478b5..6b4c8094cc7b 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -545,8 +545,8 @@ struct key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); -int fscrypt_add_test_dummy_key(struct super_block *sb, - struct fscrypt_key_specifier *key_spec); +int fscrypt_get_test_dummy_key_identifier( + u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); @@ -561,7 +561,9 @@ struct fscrypt_mode { int keysize; /* key size in bytes */ int security_strength; /* security strength in bytes */ int ivsize; /* IV size in bytes */ - int logged_impl_name; + int logged_cryptoapi_impl; + int logged_blk_crypto_native; + int logged_blk_crypto_fallback; enum blk_crypto_mode_num blk_crypto_mode; }; @@ -621,6 +623,8 @@ int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci); bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2); +int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, + struct fscrypt_key_specifier *key_spec); bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode); int fscrypt_policy_from_context(union fscrypt_policy *policy_u, diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 93c2ca858092..90f3e68f166e 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -12,7 +12,7 @@ * provides the key and IV to use. */ -#include <linux/blk-crypto.h> +#include <linux/blk-crypto-profile.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/sched/mm.h> @@ -64,6 +64,35 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) return DIV_ROUND_UP(lblk_bits, 8); } +/* + * Log a message when starting to use blk-crypto (native) or blk-crypto-fallback + * for an encryption mode for the first time. This is the blk-crypto + * counterpart to the message logged when starting to use the crypto API for the + * first time. A limitation is that these messages don't convey which specific + * filesystems or files are using each implementation. However, *usually* + * systems use just one implementation per mode, which makes these messages + * helpful for debugging problems where the "wrong" implementation is used. 
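fscrypt_fname_encrypted_size() above now pads short names up to FSCRYPT_FNAME_MIN_MSG_LEN rather than FS_CRYPTO_BLOCK_SIZE, rounds up to the policy's padding, and clamps to the filesystem's maximum name length. The same computation as a standalone sketch:

/* Userspace sketch of the fscrypt_fname_encrypted_size() rounding above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FSCRYPT_FNAME_MIN_MSG_LEN 16

static bool fname_encrypted_size(uint32_t orig_len, uint32_t max_len,
				 uint32_t padding, uint32_t *out_len)
{
	uint32_t len;

	if (orig_len > max_len)
		return false;
	len = orig_len < FSCRYPT_FNAME_MIN_MSG_LEN ?
		FSCRYPT_FNAME_MIN_MSG_LEN : orig_len;
	len = (len + padding - 1) / padding * padding;   /* round_up() */
	*out_len = len < max_len ? len : max_len;
	return true;
}

int main(void)
{
	uint32_t out;

	/* A 3-byte name under a pad-32 policy becomes a 32-byte ciphertext. */
	if (fname_encrypted_size(3, 255, 32, &out))
		printf("%u\n", out);
	return 0;
}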
+ */ +static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, + struct request_queue **devs, + int num_devs, + const struct blk_crypto_config *cfg) +{ + int i; + + for (i = 0; i < num_devs; i++) { + if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || + __blk_crypto_cfg_supported(devs[i]->crypto_profile, cfg)) { + if (!xchg(&mode->logged_blk_crypto_native, 1)) + pr_info("fscrypt: %s using blk-crypto (native)\n", + mode->friendly_name); + } else if (!xchg(&mode->logged_blk_crypto_fallback, 1)) { + pr_info("fscrypt: %s using blk-crypto-fallback\n", + mode->friendly_name); + } + } +} + /* Enable inline encryption for this file if supported. */ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) { @@ -117,6 +146,8 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) goto out_free_devs; } + fscrypt_log_blk_crypto_impl(ci->ci_mode, devs, num_devs, &crypto_cfg); + ci->ci_inlinecrypt = true; out_free_devs: kfree(devs); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 0b3ffbb4faf4..caee9f8620dd 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -688,28 +688,68 @@ out_wipe_secret: } EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); -/* - * Add the key for '-o test_dummy_encryption' to the filesystem keyring. - * - * Use a per-boot random key to prevent people from misusing this option. - */ -int fscrypt_add_test_dummy_key(struct super_block *sb, - struct fscrypt_key_specifier *key_spec) +static void +fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret) { static u8 test_key[FSCRYPT_MAX_KEY_SIZE]; + + get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE); + + memset(secret, 0, sizeof(*secret)); + secret->size = FSCRYPT_MAX_KEY_SIZE; + memcpy(secret->raw, test_key, FSCRYPT_MAX_KEY_SIZE); +} + +int fscrypt_get_test_dummy_key_identifier( + u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) +{ struct fscrypt_master_key_secret secret; int err; - get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE); + fscrypt_get_test_dummy_secret(&secret); - memset(&secret, 0, sizeof(secret)); - secret.size = FSCRYPT_MAX_KEY_SIZE; - memcpy(secret.raw, test_key, FSCRYPT_MAX_KEY_SIZE); + err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size); + if (err) + goto out; + err = fscrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER, + NULL, 0, key_identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); +out: + wipe_master_key_secret(&secret); + return err; +} + +/** + * fscrypt_add_test_dummy_key() - add the test dummy encryption key + * @sb: the filesystem instance to add the key to + * @dummy_policy: the encryption policy for test_dummy_encryption + * + * If needed, add the key for the test_dummy_encryption mount option to the + * filesystem. To prevent misuse of this mount option, a per-boot random key is + * used instead of a hardcoded one. This makes it so that any encrypted files + * created using this option won't be accessible after a reboot. 
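fscrypt_log_blk_crypto_impl() above, like the renamed logged_cryptoapi_impl path, uses xchg() so that only the first caller to flip a per-mode flag emits the log line. A tiny sketch of that log-once idiom, using a single global flag where the kernel keeps one per mode:

/* Sketch of the xchg()-based "log once" pattern used above. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int logged_blk_crypto_native;

static void log_impl_once(const char *mode_name)
{
	if (!atomic_exchange(&logged_blk_crypto_native, 1))
		printf("fscrypt: %s using blk-crypto (native)\n", mode_name);
}

int main(void)
{
	log_impl_once("AES-256-XTS");   /* logs */
	log_impl_once("AES-256-XTS");   /* silent */
	return 0;
}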
+ * + * Return: 0 on success, -errno on failure + */ +int fscrypt_add_test_dummy_key(struct super_block *sb, + const struct fscrypt_dummy_policy *dummy_policy) +{ + const union fscrypt_policy *policy = dummy_policy->policy; + struct fscrypt_key_specifier key_spec; + struct fscrypt_master_key_secret secret; + int err; - err = add_master_key(sb, &secret, key_spec); + if (!policy) + return 0; + err = fscrypt_policy_to_key_spec(policy, &key_spec); + if (err) + return err; + fscrypt_get_test_dummy_secret(&secret); + err = add_master_key(sb, &secret, &key_spec); wipe_master_key_secret(&secret); return err; } +EXPORT_SYMBOL_GPL(fscrypt_add_test_dummy_key); /* * Verify that the current user has added a master key with the given identifier diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index eede186b04ce..c35711896bd4 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -94,7 +94,7 @@ fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, mode->cipher_str, PTR_ERR(tfm)); return tfm; } - if (!xchg(&mode->logged_impl_name, 1)) { + if (!xchg(&mode->logged_cryptoapi_impl, 1)) { /* * fscrypt performance can vary greatly depending on which * crypto algorithm implementation is used. Help people debug @@ -425,23 +425,9 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, if (err) return err; - switch (ci->ci_policy.version) { - case FSCRYPT_POLICY_V1: - mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; - memcpy(mk_spec.u.descriptor, - ci->ci_policy.v1.master_key_descriptor, - FSCRYPT_KEY_DESCRIPTOR_SIZE); - break; - case FSCRYPT_POLICY_V2: - mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; - memcpy(mk_spec.u.identifier, - ci->ci_policy.v2.master_key_identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - break; - default: - WARN_ON(1); - return -EINVAL; - } + err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec); + if (err) + return err; key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); if (IS_ERR(key)) { diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ed3d623724cd..5f858cee1e3b 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,6 +10,7 @@ * Modified by Eric Biggers, 2019 for v2 policy support. */ +#include <linux/fs_context.h> #include <linux/random.h> #include <linux/seq_file.h> #include <linux/string.h> @@ -32,6 +33,26 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); } +int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, + struct fscrypt_key_specifier *key_spec) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + key_spec->type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; + memcpy(key_spec->u.descriptor, policy->v1.master_key_descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + return 0; + case FSCRYPT_POLICY_V2: + key_spec->type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + memcpy(key_spec->u.identifier, policy->v2.master_key_identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + return 0; + default: + WARN_ON(1); + return -EINVAL; + } +} + static const union fscrypt_policy * fscrypt_get_dummy_policy(struct super_block *sb) { @@ -704,73 +725,45 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) EXPORT_SYMBOL_GPL(fscrypt_set_context); /** - * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption' - * @sb: the filesystem on which test_dummy_encryption is being specified - * @arg: the argument to the test_dummy_encryption option. May be NULL. 
- * @dummy_policy: the filesystem's current dummy policy (input/output, see - * below) - * - * Handle the test_dummy_encryption mount option by creating a dummy encryption - * policy, saving it in @dummy_policy, and adding the corresponding dummy - * encryption key to the filesystem. If the @dummy_policy is already set, then - * instead validate that it matches @arg. Don't support changing it via - * remount, as that is difficult to do safely. + * fscrypt_parse_test_dummy_encryption() - parse the test_dummy_encryption mount option + * @param: the mount option + * @dummy_policy: (input/output) the place to write the dummy policy that will + * result from parsing the option. Zero-initialize this. If a policy is + * already set here (due to test_dummy_encryption being given multiple + * times), then this function will verify that the policies are the same. * - * Return: 0 on success (dummy policy set, or the same policy is already set); - * -EEXIST if a different dummy policy is already set; - * or another -errno value. + * Return: 0 on success; -EINVAL if the argument is invalid; -EEXIST if the + * argument conflicts with one already specified; or -ENOMEM. */ -int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, - struct fscrypt_dummy_policy *dummy_policy) +int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, + struct fscrypt_dummy_policy *dummy_policy) { - struct fscrypt_key_specifier key_spec = { 0 }; - int version; - union fscrypt_policy *policy = NULL; + const char *arg = "v2"; + union fscrypt_policy *policy; int err; - if (!arg) - arg = "v2"; - - if (!strcmp(arg, "v1")) { - version = FSCRYPT_POLICY_V1; - key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; - memset(key_spec.u.descriptor, 0x42, - FSCRYPT_KEY_DESCRIPTOR_SIZE); - } else if (!strcmp(arg, "v2")) { - version = FSCRYPT_POLICY_V2; - key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; - /* key_spec.u.identifier gets filled in when adding the key */ - } else { - err = -EINVAL; - goto out; - } + if (param->type == fs_value_is_string && *param->string) + arg = param->string; policy = kzalloc(sizeof(*policy), GFP_KERNEL); - if (!policy) { - err = -ENOMEM; - goto out; - } - - err = fscrypt_add_test_dummy_key(sb, &key_spec); - if (err) - goto out; + if (!policy) + return -ENOMEM; - policy->version = version; - switch (policy->version) { - case FSCRYPT_POLICY_V1: + if (!strcmp(arg, "v1")) { + policy->version = FSCRYPT_POLICY_V1; policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor, + memset(policy->v1.master_key_descriptor, 0x42, FSCRYPT_KEY_DESCRIPTOR_SIZE); - break; - case FSCRYPT_POLICY_V2: + } else if (!strcmp(arg, "v2")) { + policy->version = FSCRYPT_POLICY_V2; policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memcpy(policy->v2.master_key_identifier, key_spec.u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - break; - default: - WARN_ON(1); + err = fscrypt_get_test_dummy_key_identifier( + policy->v2.master_key_identifier); + if (err) + goto out; + } else { err = -EINVAL; goto out; } @@ -789,6 +782,37 @@ out: kfree(policy); return err; } +EXPORT_SYMBOL_GPL(fscrypt_parse_test_dummy_encryption); + +/** + * fscrypt_dummy_policies_equal() - check whether two dummy policies are equal + * @p1: the first test dummy policy (may be unset) + * @p2: the second test dummy policy (may be 
unset) + * + * Return: %true if the dummy policies are both set and equal, or both unset. + */ +bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, + const struct fscrypt_dummy_policy *p2) +{ + if (!p1->policy && !p2->policy) + return true; + if (!p1->policy || !p2->policy) + return false; + return fscrypt_policies_equal(p1->policy, p2->policy); +} +EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal); + +/* Deprecated, do not use */ +int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, + struct fscrypt_dummy_policy *dummy_policy) +{ + struct fs_parameter param = { + .type = fs_value_is_string, + .string = arg ? (char *)arg : "", + }; + return fscrypt_parse_test_dummy_encryption(¶m, dummy_policy) ?: + fscrypt_add_test_dummy_key(sb, dummy_policy); +} EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); /** diff --git a/fs/direct-io.c b/fs/direct-io.c index aef06e607b40..840752006f60 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1115,11 +1115,10 @@ static inline int drop_refcount(struct dio *dio) * individual fields and will generate much worse code. This is important * for the whole file. */ -static inline ssize_t -do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, struct iov_iter *iter, - get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, int flags) +ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, struct iov_iter *iter, + get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) { unsigned i_blkbits = READ_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; @@ -1334,29 +1333,6 @@ fail_dio: kmem_cache_free(dio_cache, dio); return retval; } - -ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, struct iov_iter *iter, - get_block_t get_block, - dio_iodone_t end_io, dio_submit_t submit_io, - int flags) -{ - /* - * The block device state is needed in the end to finally - * submit everything. Since it's likely to be cache cold - * prefetch it here as first thing to hide some of the - * latency. - * - * Attempt to prefetch the pieces we likely need later. 
- */ - prefetch(&bdev->bd_disk->part_tbl); - prefetch(bdev->bd_disk->queue); - prefetch((char *)bdev->bd_disk->queue + SMP_CACHE_BYTES); - - return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block, - end_io, submit_io, flags); -} - EXPORT_SYMBOL(__blockdev_direct_IO); static __init int dio_init(void) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index b6692f81ec83..fb1981654bb2 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -101,7 +101,7 @@ int dlm_recover_directory(struct dlm_ls *ls) */ b = ls->ls_recover_buf->rc_buf; - left = ls->ls_recover_buf->rc_header.h_length; + left = le16_to_cpu(ls->ls_recover_buf->rc_header.h_length); left -= sizeof(struct dlm_rcom); for (;;) { diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 74a9590a4dd5..776c3ed519f0 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -379,15 +379,15 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) #define DLM_FIN 5 struct dlm_header { - uint32_t h_version; + __le32 h_version; union { /* for DLM_MSG and DLM_RCOM */ - uint32_t h_lockspace; + __le32 h_lockspace; /* for DLM_ACK and DLM_OPTS */ - uint32_t h_seq; + __le32 h_seq; } u; - uint32_t h_nodeid; /* nodeid of sender */ - uint16_t h_length; + __le32 h_nodeid; /* nodeid of sender */ + __le16 h_length; uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */ uint8_t h_pad; }; @@ -409,24 +409,24 @@ struct dlm_header { struct dlm_message { struct dlm_header m_header; - uint32_t m_type; /* DLM_MSG_ */ - uint32_t m_nodeid; - uint32_t m_pid; - uint32_t m_lkid; /* lkid on sender */ - uint32_t m_remid; /* lkid on receiver */ - uint32_t m_parent_lkid; - uint32_t m_parent_remid; - uint32_t m_exflags; - uint32_t m_sbflags; - uint32_t m_flags; - uint32_t m_lvbseq; - uint32_t m_hash; - int m_status; - int m_grmode; - int m_rqmode; - int m_bastmode; - int m_asts; - int m_result; /* 0 or -EXXX */ + __le32 m_type; /* DLM_MSG_ */ + __le32 m_nodeid; + __le32 m_pid; + __le32 m_lkid; /* lkid on sender */ + __le32 m_remid; /* lkid on receiver */ + __le32 m_parent_lkid; + __le32 m_parent_remid; + __le32 m_exflags; + __le32 m_sbflags; + __le32 m_flags; + __le32 m_lvbseq; + __le32 m_hash; + __le32 m_status; + __le32 m_grmode; + __le32 m_rqmode; + __le32 m_bastmode; + __le32 m_asts; + __le32 m_result; /* 0 or -EXXX */ char m_extra[]; /* name or lvb */ }; @@ -451,18 +451,18 @@ struct dlm_message { struct dlm_rcom { struct dlm_header rc_header; - uint32_t rc_type; /* DLM_RCOM_ */ - int rc_result; /* multi-purpose */ - uint64_t rc_id; /* match reply with request */ - uint64_t rc_seq; /* sender's ls_recover_seq */ - uint64_t rc_seq_reply; /* remote ls_recover_seq */ + __le32 rc_type; /* DLM_RCOM_ */ + __le32 rc_result; /* multi-purpose */ + __le64 rc_id; /* match reply with request */ + __le64 rc_seq; /* sender's ls_recover_seq */ + __le64 rc_seq_reply; /* remote ls_recover_seq */ char rc_buf[]; }; struct dlm_opt_header { - uint16_t t_type; - uint16_t t_length; - uint32_t t_pad; + __le16 t_type; + __le16 t_length; + __le32 t_pad; /* need to be 8 byte aligned */ char t_value[]; }; @@ -472,8 +472,8 @@ struct dlm_opts { struct dlm_header o_header; uint8_t o_nextcmd; uint8_t o_pad; - uint16_t o_optlen; - uint32_t o_pad2; + __le16 o_optlen; + __le32 o_pad2; char o_opts[]; }; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index bdb51d209ba2..226822f49d30 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -350,10 +350,12 @@ static void put_rsb(struct dlm_rsb *r) { struct dlm_ls *ls = r->res_ls; uint32_t bucket = r->res_bucket; + int rv; - spin_lock(&ls->ls_rsbtbl[bucket].lock); - 
kref_put(&r->res_ref, toss_rsb); - spin_unlock(&ls->ls_rsbtbl[bucket].lock); + rv = kref_put_lock(&r->res_ref, toss_rsb, + &ls->ls_rsbtbl[bucket].lock); + if (rv) + spin_unlock(&ls->ls_rsbtbl[bucket].lock); } void dlm_put_rsb(struct dlm_rsb *r) @@ -602,7 +604,6 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len, */ kref_get(&r->res_ref); - error = 0; goto out_unlock; @@ -880,6 +881,88 @@ static int validate_master_nodeid(struct dlm_ls *ls, struct dlm_rsb *r, } } +static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_nodeid, + int from_nodeid, bool toss_list, unsigned int flags, + int *r_nodeid, int *result) +{ + int fix_master = (flags & DLM_LU_RECOVER_MASTER); + int from_master = (flags & DLM_LU_RECOVER_DIR); + + if (r->res_dir_nodeid != our_nodeid) { + /* should not happen, but may as well fix it and carry on */ + log_error(ls, "%s res_dir %d our %d %s", __func__, + r->res_dir_nodeid, our_nodeid, r->res_name); + r->res_dir_nodeid = our_nodeid; + } + + if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) { + /* Recovery uses this function to set a new master when + * the previous master failed. Setting NEW_MASTER will + * force dlm_recover_masters to call recover_master on this + * rsb even though the res_nodeid is no longer removed. + */ + + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + rsb_set_flag(r, RSB_NEW_MASTER); + + if (toss_list) { + /* I don't think we should ever find it on toss list. */ + log_error(ls, "%s fix_master on toss", __func__); + dlm_dump_rsb(r); + } + } + + if (from_master && (r->res_master_nodeid != from_nodeid)) { + /* this will happen if from_nodeid became master during + * a previous recovery cycle, and we aborted the previous + * cycle before recovering this master value + */ + + log_limit(ls, "%s from_master %d master_nodeid %d res_nodeid %d first %x %s", + __func__, from_nodeid, r->res_master_nodeid, + r->res_nodeid, r->res_first_lkid, r->res_name); + + if (r->res_master_nodeid == our_nodeid) { + log_error(ls, "from_master %d our_master", from_nodeid); + dlm_dump_rsb(r); + goto ret_assign; + } + + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + rsb_set_flag(r, RSB_NEW_MASTER); + } + + if (!r->res_master_nodeid) { + /* this will happen if recovery happens while we're looking + * up the master for this rsb + */ + + log_debug(ls, "%s master 0 to %d first %x %s", __func__, + from_nodeid, r->res_first_lkid, r->res_name); + r->res_master_nodeid = from_nodeid; + r->res_nodeid = from_nodeid; + } + + if (!from_master && !fix_master && + (r->res_master_nodeid == from_nodeid)) { + /* this can happen when the master sends remove, the dir node + * finds the rsb on the keep list and ignores the remove, + * and the former master sends a lookup + */ + + log_limit(ls, "%s from master %d flags %x first %x %s", + __func__, from_nodeid, flags, r->res_first_lkid, + r->res_name); + } + + ret_assign: + *r_nodeid = r->res_master_nodeid; + if (result) + *result = DLM_LU_MATCH; +} + /* * We're the dir node for this res and another node wants to know the * master nodeid. 
During normal operation (non recovery) this is only @@ -914,10 +997,8 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len, { struct dlm_rsb *r = NULL; uint32_t hash, b; - int from_master = (flags & DLM_LU_RECOVER_DIR); - int fix_master = (flags & DLM_LU_RECOVER_MASTER); int our_nodeid = dlm_our_nodeid(); - int dir_nodeid, error, toss_list = 0; + int dir_nodeid, error; if (len > DLM_RESNAME_MAXLEN) return -EINVAL; @@ -949,12 +1030,21 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len, error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); if (!error) { /* because the rsb is active, we need to lock_rsb before - checking/changing re_master_nodeid */ + * checking/changing re_master_nodeid + */ hold_rsb(r); spin_unlock(&ls->ls_rsbtbl[b].lock); lock_rsb(r); - goto found; + + __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, false, + flags, r_nodeid, result); + + /* the rsb was active */ + unlock_rsb(r); + put_rsb(r); + + return 0; } error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); @@ -962,90 +1052,16 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len, goto not_found; /* because the rsb is inactive (on toss list), it's not refcounted - and lock_rsb is not used, but is protected by the rsbtbl lock */ - - toss_list = 1; - found: - if (r->res_dir_nodeid != our_nodeid) { - /* should not happen, but may as well fix it and carry on */ - log_error(ls, "dlm_master_lookup res_dir %d our %d %s", - r->res_dir_nodeid, our_nodeid, r->res_name); - r->res_dir_nodeid = our_nodeid; - } - - if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) { - /* Recovery uses this function to set a new master when - the previous master failed. Setting NEW_MASTER will - force dlm_recover_masters to call recover_master on this - rsb even though the res_nodeid is no longer removed. */ - - r->res_master_nodeid = from_nodeid; - r->res_nodeid = from_nodeid; - rsb_set_flag(r, RSB_NEW_MASTER); - - if (toss_list) { - /* I don't think we should ever find it on toss list. 
*/ - log_error(ls, "dlm_master_lookup fix_master on toss"); - dlm_dump_rsb(r); - } - } - - if (from_master && (r->res_master_nodeid != from_nodeid)) { - /* this will happen if from_nodeid became master during - a previous recovery cycle, and we aborted the previous - cycle before recovering this master value */ - - log_limit(ls, "dlm_master_lookup from_master %d " - "master_nodeid %d res_nodeid %d first %x %s", - from_nodeid, r->res_master_nodeid, r->res_nodeid, - r->res_first_lkid, r->res_name); - - if (r->res_master_nodeid == our_nodeid) { - log_error(ls, "from_master %d our_master", from_nodeid); - dlm_dump_rsb(r); - goto out_found; - } - - r->res_master_nodeid = from_nodeid; - r->res_nodeid = from_nodeid; - rsb_set_flag(r, RSB_NEW_MASTER); - } - - if (!r->res_master_nodeid) { - /* this will happen if recovery happens while we're looking - up the master for this rsb */ - - log_debug(ls, "dlm_master_lookup master 0 to %d first %x %s", - from_nodeid, r->res_first_lkid, r->res_name); - r->res_master_nodeid = from_nodeid; - r->res_nodeid = from_nodeid; - } - - if (!from_master && !fix_master && - (r->res_master_nodeid == from_nodeid)) { - /* this can happen when the master sends remove, the dir node - finds the rsb on the keep list and ignores the remove, - and the former master sends a lookup */ + * and lock_rsb is not used, but is protected by the rsbtbl lock + */ - log_limit(ls, "dlm_master_lookup from master %d flags %x " - "first %x %s", from_nodeid, flags, - r->res_first_lkid, r->res_name); - } + __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, true, flags, + r_nodeid, result); - out_found: - *r_nodeid = r->res_master_nodeid; - if (result) - *result = DLM_LU_MATCH; + r->res_toss_time = jiffies; + /* the rsb was inactive (on toss list) */ + spin_unlock(&ls->ls_rsbtbl[b].lock); - if (toss_list) { - r->res_toss_time = jiffies; - /* the rsb was inactive (on toss list) */ - spin_unlock(&ls->ls_rsbtbl[b].lock); - } else { - /* the rsb was active */ - unlock_rsb(r); - put_rsb(r); - } return 0; not_found: @@ -1076,7 +1092,6 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len, if (result) *result = DLM_LU_ADD; *r_nodeid = from_nodeid; - error = 0; out_unlock: spin_unlock(&ls->ls_rsbtbl[b].lock); return error; @@ -1253,9 +1268,11 @@ static void kill_lkb(struct kref *kref) static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) { uint32_t lkid = lkb->lkb_id; + int rv; - spin_lock(&ls->ls_lkbidr_spin); - if (kref_put(&lkb->lkb_ref, kill_lkb)) { + rv = kref_put_lock(&lkb->lkb_ref, kill_lkb, + &ls->ls_lkbidr_spin); + if (rv) { idr_remove(&ls->ls_lkbidr, lkid); spin_unlock(&ls->ls_lkbidr_spin); @@ -1265,11 +1282,9 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) if (lkb->lkb_lvbptr && is_master_copy(lkb)) dlm_free_lvb(lkb->lkb_lvbptr); dlm_free_lkb(lkb); - return 1; - } else { - spin_unlock(&ls->ls_lkbidr_spin); - return 0; } + + return rv; } int dlm_put_lkb(struct dlm_lkb *lkb) @@ -1306,13 +1321,17 @@ static inline void unhold_lkb(struct dlm_lkb *lkb) static void lkb_add_ordered(struct list_head *new, struct list_head *head, int mode) { - struct dlm_lkb *lkb = NULL; + struct dlm_lkb *lkb = NULL, *iter; - list_for_each_entry(lkb, head, lkb_statequeue) - if (lkb->lkb_rqmode < mode) + list_for_each_entry(iter, head, lkb_statequeue) + if (iter->lkb_rqmode < mode) { + lkb = iter; + list_add_tail(new, &iter->lkb_statequeue); break; + } - __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue); + if (!lkb) + list_add_tail(new, head); } /* 
add/remove lkb to rsb's grant/convert/wait queue */ @@ -1559,6 +1578,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, lkb->lkb_wait_type = 0; lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; lkb->lkb_wait_count--; + unhold_lkb(lkb); goto out_del; } @@ -1571,8 +1591,8 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, } log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait", - lkb->lkb_id, ms ? ms->m_header.h_nodeid : 0, lkb->lkb_remid, - mstype, lkb->lkb_flags); + lkb->lkb_id, ms ? le32_to_cpu(ms->m_header.h_nodeid) : 0, + lkb->lkb_remid, mstype, lkb->lkb_flags); return -1; out_del: @@ -1585,6 +1605,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, log_error(ls, "remwait error %x reply %d wait_type %d overlap", lkb->lkb_id, mstype, lkb->lkb_wait_type); lkb->lkb_wait_count--; + unhold_lkb(lkb); lkb->lkb_wait_type = 0; } @@ -1617,10 +1638,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) struct dlm_ls *ls = lkb->lkb_resource->res_ls; int error; - if (ms->m_flags != DLM_IFL_STUB_MS) + if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS)) mutex_lock(&ls->ls_waiters_mutex); - error = _remove_from_waiters(lkb, ms->m_type, ms); - if (ms->m_flags != DLM_IFL_STUB_MS) + error = _remove_from_waiters(lkb, le32_to_cpu(ms->m_type), ms); + if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS)) mutex_unlock(&ls->ls_waiters_mutex); return error; } @@ -1795,7 +1816,6 @@ static void shrink_bucket(struct dlm_ls *ls, int b) memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); spin_unlock(&ls->ls_remove_spin); spin_unlock(&ls->ls_rsbtbl[b].lock); - wake_up(&ls->ls_remove_wait); send_remove(r); @@ -1804,6 +1824,7 @@ static void shrink_bucket(struct dlm_ls *ls, int b) ls->ls_remove_len = 0; memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); spin_unlock(&ls->ls_remove_spin); + wake_up(&ls->ls_remove_wait); dlm_free_rsb(r); } @@ -1866,7 +1887,7 @@ static void del_timeout(struct dlm_lkb *lkb) void dlm_scan_timeout(struct dlm_ls *ls) { struct dlm_rsb *r; - struct dlm_lkb *lkb; + struct dlm_lkb *lkb = NULL, *iter; int do_cancel, do_warn; s64 wait_us; @@ -1877,27 +1898,28 @@ void dlm_scan_timeout(struct dlm_ls *ls) do_cancel = 0; do_warn = 0; mutex_lock(&ls->ls_timeout_mutex); - list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { + list_for_each_entry(iter, &ls->ls_timeout, lkb_time_list) { wait_us = ktime_to_us(ktime_sub(ktime_get(), - lkb->lkb_timestamp)); + iter->lkb_timestamp)); - if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && - wait_us >= (lkb->lkb_timeout_cs * 10000)) + if ((iter->lkb_exflags & DLM_LKF_TIMEOUT) && + wait_us >= (iter->lkb_timeout_cs * 10000)) do_cancel = 1; - if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && + if ((iter->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && wait_us >= dlm_config.ci_timewarn_cs * 10000) do_warn = 1; if (!do_cancel && !do_warn) continue; - hold_lkb(lkb); + hold_lkb(iter); + lkb = iter; break; } mutex_unlock(&ls->ls_timeout_mutex); - if (!do_cancel && !do_warn) + if (!lkb) break; r = lkb->lkb_resource; @@ -2051,7 +2073,7 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, if (len > r->res_ls->ls_lvblen) len = r->res_ls->ls_lvblen; memcpy(lkb->lkb_lvbptr, ms->m_extra, len); - lkb->lkb_lvbseq = ms->m_lvbseq; + lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq); } } @@ -2182,10 +2204,10 @@ static void munge_demoted(struct dlm_lkb *lkb) static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms) { - if (ms->m_type != DLM_MSG_REQUEST_REPLY && - ms->m_type != 
DLM_MSG_GRANT) { + if (ms->m_type != cpu_to_le32(DLM_MSG_REQUEST_REPLY) && + ms->m_type != cpu_to_le32(DLM_MSG_GRANT)) { log_print("munge_altmode %x invalid reply type %d", - lkb->lkb_id, ms->m_type); + lkb->lkb_id, le32_to_cpu(ms->m_type)); return; } @@ -2912,7 +2934,8 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, if (lkb->lkb_status != DLM_LKSTS_GRANTED) goto out; - if (lkb->lkb_wait_type) + /* lock not allowed if there's any op in progress */ + if (lkb->lkb_wait_type || lkb->lkb_wait_count) goto out; if (is_overlap(lkb)) @@ -3563,13 +3586,13 @@ static int _create_message(struct dlm_ls *ls, int mb_len, ms = (struct dlm_message *) mb; - ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - ms->m_header.u.h_lockspace = ls->ls_global_id; - ms->m_header.h_nodeid = dlm_our_nodeid(); - ms->m_header.h_length = mb_len; + ms->m_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + ms->m_header.u.h_lockspace = cpu_to_le32(ls->ls_global_id); + ms->m_header.h_nodeid = cpu_to_le32(dlm_our_nodeid()); + ms->m_header.h_length = cpu_to_le16(mb_len); ms->m_header.h_cmd = DLM_MSG; - ms->m_type = mstype; + ms->m_type = cpu_to_le32(mstype); *mh_ret = mh; *ms_ret = ms; @@ -3608,7 +3631,6 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms) { - dlm_message_out(ms); dlm_midcomms_commit_mhandle(mh); return 0; } @@ -3616,40 +3638,40 @@ static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms) static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb, struct dlm_message *ms) { - ms->m_nodeid = lkb->lkb_nodeid; - ms->m_pid = lkb->lkb_ownpid; - ms->m_lkid = lkb->lkb_id; - ms->m_remid = lkb->lkb_remid; - ms->m_exflags = lkb->lkb_exflags; - ms->m_sbflags = lkb->lkb_sbflags; - ms->m_flags = lkb->lkb_flags; - ms->m_lvbseq = lkb->lkb_lvbseq; - ms->m_status = lkb->lkb_status; - ms->m_grmode = lkb->lkb_grmode; - ms->m_rqmode = lkb->lkb_rqmode; - ms->m_hash = r->res_hash; + ms->m_nodeid = cpu_to_le32(lkb->lkb_nodeid); + ms->m_pid = cpu_to_le32(lkb->lkb_ownpid); + ms->m_lkid = cpu_to_le32(lkb->lkb_id); + ms->m_remid = cpu_to_le32(lkb->lkb_remid); + ms->m_exflags = cpu_to_le32(lkb->lkb_exflags); + ms->m_sbflags = cpu_to_le32(lkb->lkb_sbflags); + ms->m_flags = cpu_to_le32(lkb->lkb_flags); + ms->m_lvbseq = cpu_to_le32(lkb->lkb_lvbseq); + ms->m_status = cpu_to_le32(lkb->lkb_status); + ms->m_grmode = cpu_to_le32(lkb->lkb_grmode); + ms->m_rqmode = cpu_to_le32(lkb->lkb_rqmode); + ms->m_hash = cpu_to_le32(r->res_hash); /* m_result and m_bastmode are set from function args, not from lkb fields */ if (lkb->lkb_bastfn) - ms->m_asts |= DLM_CB_BAST; + ms->m_asts |= cpu_to_le32(DLM_CB_BAST); if (lkb->lkb_astfn) - ms->m_asts |= DLM_CB_CAST; + ms->m_asts |= cpu_to_le32(DLM_CB_CAST); /* compare with switch in create_message; send_remove() doesn't use send_args() */ switch (ms->m_type) { - case DLM_MSG_REQUEST: - case DLM_MSG_LOOKUP: + case cpu_to_le32(DLM_MSG_REQUEST): + case cpu_to_le32(DLM_MSG_LOOKUP): memcpy(ms->m_extra, r->res_name, r->res_length); break; - case DLM_MSG_CONVERT: - case DLM_MSG_UNLOCK: - case DLM_MSG_REQUEST_REPLY: - case DLM_MSG_CONVERT_REPLY: - case DLM_MSG_GRANT: + case cpu_to_le32(DLM_MSG_CONVERT): + case cpu_to_le32(DLM_MSG_UNLOCK): + case cpu_to_le32(DLM_MSG_REQUEST_REPLY): + case cpu_to_le32(DLM_MSG_CONVERT_REPLY): + case cpu_to_le32(DLM_MSG_GRANT): if (!lkb->lkb_lvbptr) break; memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); @@ -3699,8 +3721,8 
@@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) /* down conversions go without a reply from the master */ if (!error && down_conversion(lkb)) { remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY); - r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS; - r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; + r->res_ls->ls_stub_ms.m_flags = cpu_to_le32(DLM_IFL_STUB_MS); + r->res_ls->ls_stub_ms.m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY); r->res_ls->ls_stub_ms.m_result = 0; __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); } @@ -3757,7 +3779,7 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) send_args(r, lkb, ms); - ms->m_bastmode = mode; + ms->m_bastmode = cpu_to_le32(mode); error = send_message(mh, ms); out: @@ -3805,7 +3827,7 @@ static int send_remove(struct dlm_rsb *r) goto out; memcpy(ms->m_extra, r->res_name, r->res_length); - ms->m_hash = r->res_hash; + ms->m_hash = cpu_to_le32(r->res_hash); error = send_message(mh, ms); out: @@ -3827,7 +3849,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, send_args(r, lkb, ms); - ms->m_result = rv; + ms->m_result = cpu_to_le32(to_dlm_errno(rv)); error = send_message(mh, ms); out: @@ -3860,15 +3882,15 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, struct dlm_rsb *r = &ls->ls_stub_rsb; struct dlm_message *ms; struct dlm_mhandle *mh; - int error, nodeid = ms_in->m_header.h_nodeid; + int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid); error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh); if (error) goto out; ms->m_lkid = ms_in->m_lkid; - ms->m_result = rv; - ms->m_nodeid = ret_nodeid; + ms->m_result = cpu_to_le32(to_dlm_errno(rv)); + ms->m_nodeid = cpu_to_le32(ret_nodeid); error = send_message(mh, ms); out: @@ -3881,25 +3903,26 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) { - lkb->lkb_exflags = ms->m_exflags; - lkb->lkb_sbflags = ms->m_sbflags; + lkb->lkb_exflags = le32_to_cpu(ms->m_exflags); + lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags); lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | - (ms->m_flags & 0x0000FFFF); + (le32_to_cpu(ms->m_flags) & 0x0000FFFF); } static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms) { - if (ms->m_flags == DLM_IFL_STUB_MS) + if (ms->m_flags == cpu_to_le32(DLM_IFL_STUB_MS)) return; - lkb->lkb_sbflags = ms->m_sbflags; + lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags); lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | - (ms->m_flags & 0x0000FFFF); + (le32_to_cpu(ms->m_flags) & 0x0000FFFF); } static int receive_extralen(struct dlm_message *ms) { - return (ms->m_header.h_length - sizeof(struct dlm_message)); + return (le16_to_cpu(ms->m_header.h_length) - + sizeof(struct dlm_message)); } static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, @@ -3933,14 +3956,14 @@ static void fake_astfn(void *astparam) static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_message *ms) { - lkb->lkb_nodeid = ms->m_header.h_nodeid; - lkb->lkb_ownpid = ms->m_pid; - lkb->lkb_remid = ms->m_lkid; + lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid); + lkb->lkb_ownpid = le32_to_cpu(ms->m_pid); + lkb->lkb_remid = le32_to_cpu(ms->m_lkid); lkb->lkb_grmode = DLM_LOCK_IV; - lkb->lkb_rqmode = ms->m_rqmode; + lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode); - lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL; - lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? 
&fake_astfn : NULL; + lkb->lkb_bastfn = (ms->m_asts & cpu_to_le32(DLM_CB_BAST)) ? &fake_bastfn : NULL; + lkb->lkb_astfn = (ms->m_asts & cpu_to_le32(DLM_CB_CAST)) ? &fake_astfn : NULL; if (lkb->lkb_exflags & DLM_LKF_VALBLK) { /* lkb was just created so there won't be an lvb yet */ @@ -3961,8 +3984,8 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, if (receive_lvb(ls, lkb, ms)) return -ENOMEM; - lkb->lkb_rqmode = ms->m_rqmode; - lkb->lkb_lvbseq = ms->m_lvbseq; + lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode); + lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq); return 0; } @@ -3981,8 +4004,8 @@ static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb = &ls->ls_stub_lkb; - lkb->lkb_nodeid = ms->m_header.h_nodeid; - lkb->lkb_remid = ms->m_lkid; + lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid); + lkb->lkb_remid = le32_to_cpu(ms->m_lkid); } /* This is called after the rsb is locked so that we can safely inspect @@ -3990,11 +4013,12 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms) static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) { - int from = ms->m_header.h_nodeid; + int from = le32_to_cpu(ms->m_header.h_nodeid); int error = 0; /* currently mixing of user/kernel locks are not supported */ - if (ms->m_flags & DLM_IFL_USER && ~lkb->lkb_flags & DLM_IFL_USER) { + if (ms->m_flags & cpu_to_le32(DLM_IFL_USER) && + ~lkb->lkb_flags & DLM_IFL_USER) { log_error(lkb->lkb_resource->res_ls, "got user dlm message for a kernel lock"); error = -EINVAL; @@ -4002,23 +4026,23 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) } switch (ms->m_type) { - case DLM_MSG_CONVERT: - case DLM_MSG_UNLOCK: - case DLM_MSG_CANCEL: + case cpu_to_le32(DLM_MSG_CONVERT): + case cpu_to_le32(DLM_MSG_UNLOCK): + case cpu_to_le32(DLM_MSG_CANCEL): if (!is_master_copy(lkb) || lkb->lkb_nodeid != from) error = -EINVAL; break; - case DLM_MSG_CONVERT_REPLY: - case DLM_MSG_UNLOCK_REPLY: - case DLM_MSG_CANCEL_REPLY: - case DLM_MSG_GRANT: - case DLM_MSG_BAST: + case cpu_to_le32(DLM_MSG_CONVERT_REPLY): + case cpu_to_le32(DLM_MSG_UNLOCK_REPLY): + case cpu_to_le32(DLM_MSG_CANCEL_REPLY): + case cpu_to_le32(DLM_MSG_GRANT): + case cpu_to_le32(DLM_MSG_BAST): if (!is_process_copy(lkb) || lkb->lkb_nodeid != from) error = -EINVAL; break; - case DLM_MSG_REQUEST_REPLY: + case cpu_to_le32(DLM_MSG_REQUEST_REPLY): if (!is_process_copy(lkb)) error = -EINVAL; else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from) @@ -4033,8 +4057,8 @@ out: if (error) log_error(lkb->lkb_resource->res_ls, "ignore invalid message %d from %d %x %x %x %d", - ms->m_type, from, lkb->lkb_id, lkb->lkb_remid, - lkb->lkb_flags, lkb->lkb_nodeid); + le32_to_cpu(ms->m_type), from, lkb->lkb_id, + lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_nodeid); return error; } @@ -4079,22 +4103,23 @@ static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); spin_unlock(&ls->ls_remove_spin); spin_unlock(&ls->ls_rsbtbl[b].lock); - wake_up(&ls->ls_remove_wait); rv = _create_message(ls, sizeof(struct dlm_message) + len, dir_nodeid, DLM_MSG_REMOVE, &ms, &mh); if (rv) - return; + goto out; memcpy(ms->m_extra, name, len); - ms->m_hash = hash; + ms->m_hash = cpu_to_le32(hash); send_message(mh, ms); +out: spin_lock(&ls->ls_remove_spin); ls->ls_remove_len = 0; memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); spin_unlock(&ls->ls_remove_spin); + 
wake_up(&ls->ls_remove_wait); } static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) @@ -4104,7 +4129,7 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) int from_nodeid; int error, namelen = 0; - from_nodeid = ms->m_header.h_nodeid; + from_nodeid = le32_to_cpu(ms->m_header.h_nodeid); error = create_lkb(ls, &lkb); if (error) @@ -4177,7 +4202,7 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) if (error != -ENOTBLK) { log_limit(ls, "receive_request %x from %d %d", - ms->m_lkid, from_nodeid, error); + le32_to_cpu(ms->m_lkid), from_nodeid, error); } if (namelen && error == -EBADR) { @@ -4196,15 +4221,16 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) struct dlm_rsb *r; int error, reply = 1; - error = find_lkb(ls, ms->m_remid, &lkb); + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); if (error) goto fail; - if (lkb->lkb_remid != ms->m_lkid) { + if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) { log_error(ls, "receive_convert %x remid %x recover_seq %llu " "remote %d %x", lkb->lkb_id, lkb->lkb_remid, (unsigned long long)lkb->lkb_recover_seq, - ms->m_header.h_nodeid, ms->m_lkid); + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid)); error = -ENOENT; dlm_put_lkb(lkb); goto fail; @@ -4251,14 +4277,15 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) struct dlm_rsb *r; int error; - error = find_lkb(ls, ms->m_remid, &lkb); + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); if (error) goto fail; - if (lkb->lkb_remid != ms->m_lkid) { + if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) { log_error(ls, "receive_unlock %x remid %x remote %d %x", lkb->lkb_id, lkb->lkb_remid, - ms->m_header.h_nodeid, ms->m_lkid); + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid)); error = -ENOENT; dlm_put_lkb(lkb); goto fail; @@ -4302,7 +4329,7 @@ static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) struct dlm_rsb *r; int error; - error = find_lkb(ls, ms->m_remid, &lkb); + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); if (error) goto fail; @@ -4338,7 +4365,7 @@ static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms) struct dlm_rsb *r; int error; - error = find_lkb(ls, ms->m_remid, &lkb); + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); if (error) return error; @@ -4369,7 +4396,7 @@ static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms) struct dlm_rsb *r; int error; - error = find_lkb(ls, ms->m_remid, &lkb); + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); if (error) return error; @@ -4382,8 +4409,8 @@ static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms) if (error) goto out; - queue_bast(r, lkb, ms->m_bastmode); - lkb->lkb_highbast = ms->m_bastmode; + queue_bast(r, lkb, le32_to_cpu(ms->m_bastmode)); + lkb->lkb_highbast = le32_to_cpu(ms->m_bastmode); out: unlock_rsb(r); put_rsb(r); @@ -4395,7 +4422,7 @@ static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) { int len, error, ret_nodeid, from_nodeid, our_nodeid; - from_nodeid = ms->m_header.h_nodeid; + from_nodeid = le32_to_cpu(ms->m_header.h_nodeid); our_nodeid = dlm_our_nodeid(); len = receive_extralen(ms); @@ -4418,7 +4445,7 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms) uint32_t hash, b; int rv, len, dir_nodeid, from_nodeid; - from_nodeid = ms->m_header.h_nodeid; + from_nodeid = le32_to_cpu(ms->m_header.h_nodeid); len = receive_extralen(ms); @@ -4428,7 +4455,7 @@ static void receive_remove(struct dlm_ls *ls, struct 
dlm_message *ms) return; } - dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash); + dir_nodeid = dlm_hash2nodeid(ls, le32_to_cpu(ms->m_hash)); if (dir_nodeid != dlm_our_nodeid()) { log_error(ls, "receive_remove from %d bad nodeid %d", from_nodeid, dir_nodeid); @@ -4501,7 +4528,7 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms) static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms) { - do_purge(ls, ms->m_nodeid, ms->m_pid); + do_purge(ls, le32_to_cpu(ms->m_nodeid), le32_to_cpu(ms->m_pid)); } static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) @@ -4509,9 +4536,9 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) struct dlm_lkb *lkb; struct dlm_rsb *r; int error, mstype, result; - int from_nodeid = ms->m_header.h_nodeid; + int from_nodeid = le32_to_cpu(ms->m_header.h_nodeid); - error = find_lkb(ls, ms->m_remid, &lkb); + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); if (error) return error; @@ -4527,7 +4554,8 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); if (error) { log_error(ls, "receive_request_reply %x remote %d %x result %d", - lkb->lkb_id, from_nodeid, ms->m_lkid, ms->m_result); + lkb->lkb_id, from_nodeid, le32_to_cpu(ms->m_lkid), + from_dlm_errno(le32_to_cpu(ms->m_result))); dlm_dump_rsb(r); goto out; } @@ -4541,7 +4569,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) } /* this is the value returned from do_request() on the master */ - result = ms->m_result; + result = from_dlm_errno(le32_to_cpu(ms->m_result)); switch (result) { case -EAGAIN: @@ -4555,7 +4583,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) case 0: /* request was queued or granted on remote master */ receive_flags_reply(lkb, ms); - lkb->lkb_remid = ms->m_lkid; + lkb->lkb_remid = le32_to_cpu(ms->m_lkid); if (is_altmode(lkb)) munge_altmode(lkb, ms); if (result) { @@ -4628,7 +4656,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, struct dlm_message *ms) { /* this is the value returned from do_convert() on the master */ - switch (ms->m_result) { + switch (from_dlm_errno(le32_to_cpu(ms->m_result))) { case -EAGAIN: /* convert would block (be queued) on remote master */ queue_cast(r, lkb, -EAGAIN); @@ -4661,8 +4689,9 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, default: log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d", - lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid, - ms->m_result); + lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid), + from_dlm_errno(le32_to_cpu(ms->m_result))); dlm_print_rsb(r); dlm_print_lkb(lkb); } @@ -4696,7 +4725,7 @@ static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) struct dlm_lkb *lkb; int error; - error = find_lkb(ls, ms->m_remid, &lkb); + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); if (error) return error; @@ -4724,7 +4753,7 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) /* this is the value returned from do_unlock() on the master */ - switch (ms->m_result) { + switch (from_dlm_errno(le32_to_cpu(ms->m_result))) { case -DLM_EUNLOCK: receive_flags_reply(lkb, ms); remove_lock_pc(r, lkb); @@ -4734,7 +4763,7 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) break; default: log_error(r->res_ls, "receive_unlock_reply %x error %d", - lkb->lkb_id, ms->m_result); 
+ lkb->lkb_id, from_dlm_errno(le32_to_cpu(ms->m_result))); } out: unlock_rsb(r); @@ -4746,7 +4775,7 @@ static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) struct dlm_lkb *lkb; int error; - error = find_lkb(ls, ms->m_remid, &lkb); + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); if (error) return error; @@ -4774,7 +4803,7 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) /* this is the value returned from do_cancel() on the master */ - switch (ms->m_result) { + switch (from_dlm_errno(le32_to_cpu(ms->m_result))) { case -DLM_ECANCEL: receive_flags_reply(lkb, ms); revert_lock_pc(r, lkb); @@ -4784,7 +4813,8 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) break; default: log_error(r->res_ls, "receive_cancel_reply %x error %d", - lkb->lkb_id, ms->m_result); + lkb->lkb_id, + from_dlm_errno(le32_to_cpu(ms->m_result))); } out: unlock_rsb(r); @@ -4796,7 +4826,7 @@ static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) struct dlm_lkb *lkb; int error; - error = find_lkb(ls, ms->m_remid, &lkb); + error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb); if (error) return error; @@ -4812,9 +4842,10 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) int error, ret_nodeid; int do_lookup_list = 0; - error = find_lkb(ls, ms->m_lkid, &lkb); + error = find_lkb(ls, le32_to_cpu(ms->m_lkid), &lkb); if (error) { - log_error(ls, "receive_lookup_reply no lkid %x", ms->m_lkid); + log_error(ls, "%s no lkid %x", __func__, + le32_to_cpu(ms->m_lkid)); return; } @@ -4829,7 +4860,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) if (error) goto out; - ret_nodeid = ms->m_nodeid; + ret_nodeid = le32_to_cpu(ms->m_nodeid); /* We sometimes receive a request from the dir node for this rsb before we've received the dir node's loookup_reply for it. 
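(For reference, nearly every dlm hunk above and below applies the same byte-order discipline, so it may help to see the pattern in isolation. The sketch below is illustrative only and is not part of the patch; the struct and function names are invented. On-wire fields are declared __le32/__le16, senders convert with cpu_to_le*(), receivers convert with le*_to_cpu(), and switch statements compare the raw field against cpu_to_le32(CONSTANT) so the stored value is never converted twice.)

#include <linux/types.h>
#include <asm/byteorder.h>

/* hypothetical wire header, loosely mirroring the dlm_header idea */
struct demo_wire_hdr {
	__le32 d_nodeid;	/* kept little-endian as it appears on the wire */
	__le16 d_length;
};

static void demo_fill_hdr(struct demo_wire_hdr *h, u32 nodeid, u16 len)
{
	h->d_nodeid = cpu_to_le32(nodeid);	/* host order -> wire order */
	h->d_length = cpu_to_le16(len);
}

static u32 demo_read_nodeid(const struct demo_wire_hdr *h)
{
	return le32_to_cpu(h->d_nodeid);	/* wire order -> host order */
}

(Because the fields are typed __le32/__le16 rather than uint32_t/uint16_t, a sparse build such as "make C=1" can warn whenever one of the conversion helpers is skipped, which is the point of the dlm_internal.h type changes earlier in this diff.)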
@@ -4841,8 +4872,8 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) /* This should never happen */ log_error(ls, "receive_lookup_reply %x from %d ret %d " "master %d dir %d our %d first %x %s", - lkb->lkb_id, ms->m_header.h_nodeid, ret_nodeid, - r->res_master_nodeid, r->res_dir_nodeid, + lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid), + ret_nodeid, r->res_master_nodeid, r->res_dir_nodeid, dlm_our_nodeid(), r->res_first_lkid, r->res_name); } @@ -4854,7 +4885,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) } else if (ret_nodeid == -1) { /* the remote node doesn't believe it's the dir node */ log_error(ls, "receive_lookup_reply %x from %d bad ret_nodeid", - lkb->lkb_id, ms->m_header.h_nodeid); + lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid)); r->res_master_nodeid = 0; r->res_nodeid = -1; lkb->lkb_nodeid = -1; @@ -4888,10 +4919,12 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, { int error = 0, noent = 0; - if (!dlm_is_member(ls, ms->m_header.h_nodeid)) { + if (!dlm_is_member(ls, le32_to_cpu(ms->m_header.h_nodeid))) { log_limit(ls, "receive %d from non-member %d %x %x %d", - ms->m_type, ms->m_header.h_nodeid, ms->m_lkid, - ms->m_remid, ms->m_result); + le32_to_cpu(ms->m_type), + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid), + from_dlm_errno(le32_to_cpu(ms->m_result))); return; } @@ -4899,77 +4932,78 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, /* messages sent to a master node */ - case DLM_MSG_REQUEST: + case cpu_to_le32(DLM_MSG_REQUEST): error = receive_request(ls, ms); break; - case DLM_MSG_CONVERT: + case cpu_to_le32(DLM_MSG_CONVERT): error = receive_convert(ls, ms); break; - case DLM_MSG_UNLOCK: + case cpu_to_le32(DLM_MSG_UNLOCK): error = receive_unlock(ls, ms); break; - case DLM_MSG_CANCEL: + case cpu_to_le32(DLM_MSG_CANCEL): noent = 1; error = receive_cancel(ls, ms); break; /* messages sent from a master node (replies to above) */ - case DLM_MSG_REQUEST_REPLY: + case cpu_to_le32(DLM_MSG_REQUEST_REPLY): error = receive_request_reply(ls, ms); break; - case DLM_MSG_CONVERT_REPLY: + case cpu_to_le32(DLM_MSG_CONVERT_REPLY): error = receive_convert_reply(ls, ms); break; - case DLM_MSG_UNLOCK_REPLY: + case cpu_to_le32(DLM_MSG_UNLOCK_REPLY): error = receive_unlock_reply(ls, ms); break; - case DLM_MSG_CANCEL_REPLY: + case cpu_to_le32(DLM_MSG_CANCEL_REPLY): error = receive_cancel_reply(ls, ms); break; /* messages sent from a master node (only two types of async msg) */ - case DLM_MSG_GRANT: + case cpu_to_le32(DLM_MSG_GRANT): noent = 1; error = receive_grant(ls, ms); break; - case DLM_MSG_BAST: + case cpu_to_le32(DLM_MSG_BAST): noent = 1; error = receive_bast(ls, ms); break; /* messages sent to a dir node */ - case DLM_MSG_LOOKUP: + case cpu_to_le32(DLM_MSG_LOOKUP): receive_lookup(ls, ms); break; - case DLM_MSG_REMOVE: + case cpu_to_le32(DLM_MSG_REMOVE): receive_remove(ls, ms); break; /* messages sent from a dir node (remove has no reply) */ - case DLM_MSG_LOOKUP_REPLY: + case cpu_to_le32(DLM_MSG_LOOKUP_REPLY): receive_lookup_reply(ls, ms); break; /* other messages */ - case DLM_MSG_PURGE: + case cpu_to_le32(DLM_MSG_PURGE): receive_purge(ls, ms); break; default: - log_error(ls, "unknown message type %d", ms->m_type); + log_error(ls, "unknown message type %d", + le32_to_cpu(ms->m_type)); } /* @@ -4985,22 +5019,26 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, if (error == -ENOENT && noent) { log_debug(ls, 
"receive %d no %x remote %d %x saved_seq %u", - ms->m_type, ms->m_remid, ms->m_header.h_nodeid, - ms->m_lkid, saved_seq); + le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid), + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid), saved_seq); } else if (error == -ENOENT) { log_error(ls, "receive %d no %x remote %d %x saved_seq %u", - ms->m_type, ms->m_remid, ms->m_header.h_nodeid, - ms->m_lkid, saved_seq); + le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid), + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid), saved_seq); - if (ms->m_type == DLM_MSG_CONVERT) - dlm_dump_rsb_hash(ls, ms->m_hash); + if (ms->m_type == cpu_to_le32(DLM_MSG_CONVERT)) + dlm_dump_rsb_hash(ls, le32_to_cpu(ms->m_hash)); } if (error == -EINVAL) { log_error(ls, "receive %d inval from %d lkid %x remid %x " "saved_seq %u", - ms->m_type, ms->m_header.h_nodeid, - ms->m_lkid, ms->m_remid, saved_seq); + le32_to_cpu(ms->m_type), + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid), + saved_seq); } } @@ -5021,7 +5059,7 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, lockspace generation before we left. */ if (!ls->ls_generation) { log_limit(ls, "receive %d from %d ignore old gen", - ms->m_type, nodeid); + le32_to_cpu(ms->m_type), nodeid); return; } @@ -5054,30 +5092,30 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) switch (hd->h_cmd) { case DLM_MSG: - dlm_message_in(&p->message); - type = p->message.m_type; + type = le32_to_cpu(p->message.m_type); break; case DLM_RCOM: - dlm_rcom_in(&p->rcom); - type = p->rcom.rc_type; + type = le32_to_cpu(p->rcom.rc_type); break; default: log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid); return; } - if (hd->h_nodeid != nodeid) { + if (le32_to_cpu(hd->h_nodeid) != nodeid) { log_print("invalid h_nodeid %d from %d lockspace %x", - hd->h_nodeid, nodeid, hd->u.h_lockspace); + le32_to_cpu(hd->h_nodeid), nodeid, + le32_to_cpu(hd->u.h_lockspace)); return; } - ls = dlm_find_lockspace_global(hd->u.h_lockspace); + ls = dlm_find_lockspace_global(le32_to_cpu(hd->u.h_lockspace)); if (!ls) { if (dlm_config.ci_log_debug) { printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace " "%u from %d cmd %d type %d\n", - hd->u.h_lockspace, nodeid, hd->h_cmd, type); + le32_to_cpu(hd->u.h_lockspace), nodeid, + hd->h_cmd, type); } if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) @@ -5104,10 +5142,10 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, if (middle_conversion(lkb)) { hold_lkb(lkb); memset(ms_stub, 0, sizeof(struct dlm_message)); - ms_stub->m_flags = DLM_IFL_STUB_MS; - ms_stub->m_type = DLM_MSG_CONVERT_REPLY; - ms_stub->m_result = -EINPROGRESS; - ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS); + ms_stub->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY); + ms_stub->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS)); + ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); _receive_convert_reply(lkb, ms_stub); /* Same special case as in receive_rcom_lock_args() */ @@ -5226,10 +5264,10 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) case DLM_MSG_UNLOCK: hold_lkb(lkb); memset(ms_stub, 0, sizeof(struct dlm_message)); - ms_stub->m_flags = DLM_IFL_STUB_MS; - ms_stub->m_type = DLM_MSG_UNLOCK_REPLY; - ms_stub->m_result = stub_unlock_result; - ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS); + ms_stub->m_type = cpu_to_le32(DLM_MSG_UNLOCK_REPLY); + ms_stub->m_result = 
cpu_to_le32(to_dlm_errno(stub_unlock_result)); + ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); _receive_unlock_reply(lkb, ms_stub); dlm_put_lkb(lkb); break; @@ -5237,10 +5275,10 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) case DLM_MSG_CANCEL: hold_lkb(lkb); memset(ms_stub, 0, sizeof(struct dlm_message)); - ms_stub->m_flags = DLM_IFL_STUB_MS; - ms_stub->m_type = DLM_MSG_CANCEL_REPLY; - ms_stub->m_result = stub_cancel_result; - ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS); + ms_stub->m_type = cpu_to_le32(DLM_MSG_CANCEL_REPLY); + ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_cancel_result)); + ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); _receive_cancel_reply(lkb, ms_stub); dlm_put_lkb(lkb); break; @@ -5257,21 +5295,18 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) { - struct dlm_lkb *lkb; - int found = 0; + struct dlm_lkb *lkb = NULL, *iter; mutex_lock(&ls->ls_waiters_mutex); - list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (lkb->lkb_flags & DLM_IFL_RESEND) { - hold_lkb(lkb); - found = 1; + list_for_each_entry(iter, &ls->ls_waiters, lkb_wait_reply) { + if (iter->lkb_flags & DLM_IFL_RESEND) { + hold_lkb(iter); + lkb = iter; break; } } mutex_unlock(&ls->ls_waiters_mutex); - if (!found) - lkb = NULL; return lkb; } @@ -5331,11 +5366,16 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; lkb->lkb_wait_type = 0; - lkb->lkb_wait_count = 0; + /* drop all wait_count references we still + * hold a reference for this iteration. + */ + while (lkb->lkb_wait_count) { + lkb->lkb_wait_count--; + unhold_lkb(lkb); + } mutex_lock(&ls->ls_waiters_mutex); list_del_init(&lkb->lkb_wait_reply); mutex_unlock(&ls->ls_waiters_mutex); - unhold_lkb(lkb); /* for waiters list */ if (oc || ou) { /* do an unlock or cancel instead of resending */ @@ -5605,7 +5645,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, { struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; - lkb->lkb_nodeid = rc->rc_header.h_nodeid; + lkb->lkb_nodeid = le32_to_cpu(rc->rc_header.h_nodeid); lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid); lkb->lkb_remid = le32_to_cpu(rl->rl_lkid); lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags); @@ -5620,8 +5660,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? 
&fake_astfn : NULL; if (lkb->lkb_exflags & DLM_LKF_VALBLK) { - int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - - sizeof(struct rcom_lock); + int lvblen = le16_to_cpu(rc->rc_header.h_length) - + sizeof(struct dlm_rcom) - sizeof(struct rcom_lock); if (lvblen > ls->ls_lvblen) return -EINVAL; lkb->lkb_lvbptr = dlm_allocate_lvb(ls); @@ -5657,7 +5697,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) struct dlm_rsb *r; struct dlm_lkb *lkb; uint32_t remid = 0; - int from_nodeid = rc->rc_header.h_nodeid; + int from_nodeid = le32_to_cpu(rc->rc_header.h_nodeid); int error; if (rl->rl_parent_lkid) { @@ -5707,7 +5747,6 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) attach_lkb(r, lkb); add_lkb(r, lkb, rl->rl_status); - error = 0; ls->ls_recover_locks_in++; if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) @@ -5747,7 +5786,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) error = find_lkb(ls, lkid, &lkb); if (error) { log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d", - lkid, rc->rc_header.h_nodeid, remid, result); + lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid, + result); return error; } @@ -5757,7 +5797,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) if (!is_process_copy(lkb)) { log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d", - lkid, rc->rc_header.h_nodeid, remid, result); + lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid, + result); dlm_dump_rsb(r); unlock_rsb(r); put_rsb(r); @@ -5772,7 +5813,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) a barrier between recover_masters and recover_locks. */ log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d", - lkid, rc->rc_header.h_nodeid, remid, result); + lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid, + result); dlm_send_rcom_lock(r, lkb); goto out; @@ -5782,7 +5824,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) break; default: log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk", - lkid, rc->rc_header.h_nodeid, remid, result); + lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid, + result); } /* an ack for dlm_recover_locks() which waits for replies from @@ -5925,37 +5968,36 @@ int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs, uint32_t *lkid) { - struct dlm_lkb *lkb; + struct dlm_lkb *lkb = NULL, *iter; struct dlm_user_args *ua; int found_other_mode = 0; - int found = 0; int rv = 0; mutex_lock(&ls->ls_orphans_mutex); - list_for_each_entry(lkb, &ls->ls_orphans, lkb_ownqueue) { - if (lkb->lkb_resource->res_length != namelen) + list_for_each_entry(iter, &ls->ls_orphans, lkb_ownqueue) { + if (iter->lkb_resource->res_length != namelen) continue; - if (memcmp(lkb->lkb_resource->res_name, name, namelen)) + if (memcmp(iter->lkb_resource->res_name, name, namelen)) continue; - if (lkb->lkb_grmode != mode) { + if (iter->lkb_grmode != mode) { found_other_mode = 1; continue; } - found = 1; - list_del_init(&lkb->lkb_ownqueue); - lkb->lkb_flags &= ~DLM_IFL_ORPHAN; - *lkid = lkb->lkb_id; + lkb = iter; + list_del_init(&iter->lkb_ownqueue); + iter->lkb_flags &= ~DLM_IFL_ORPHAN; + *lkid = iter->lkb_id; break; } mutex_unlock(&ls->ls_orphans_mutex); - if (!found && found_other_mode) { + if (!lkb && found_other_mode) { rv = -EAGAIN; goto out; } - if (!found) { + if (!lkb) { rv = -ENOENT; goto out; } @@ -6307,8 +6349,8 @@ static int 
send_purge(struct dlm_ls *ls, int nodeid, int pid) DLM_MSG_PURGE, &ms, &mh); if (error) return error; - ms->m_nodeid = nodeid; - ms->m_pid = pid; + ms->m_nodeid = cpu_to_le32(nodeid); + ms->m_pid = cpu_to_le32(pid); return send_message(mh, ms); } diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 0d3833a124a3..19ed41a5da93 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -922,3 +922,15 @@ void dlm_stop_lockspaces(void) log_print("dlm user daemon left %d lockspaces", count); } +void dlm_stop_lockspaces_check(void) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (WARN_ON(!rwsem_is_locked(&ls->ls_in_recovery) || + !dlm_locking_stopped(ls))) + break; + } + spin_unlock(&lslist_lock); +} diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h index a78d853b9342..306fc4f4ea15 100644 --- a/fs/dlm/lockspace.h +++ b/fs/dlm/lockspace.h @@ -19,6 +19,7 @@ struct dlm_ls *dlm_find_lockspace_local(void *id); struct dlm_ls *dlm_find_lockspace_device(int minor); void dlm_put_lockspace(struct dlm_ls *ls); void dlm_stop_lockspaces(void); +void dlm_stop_lockspaces_check(void); #endif /* __LOCKSPACE_DOT_H__ */ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index e284d696c1fd..19e82f08c0e0 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1303,6 +1303,10 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, return msg; } +/* avoid false positive for nodes_srcu, unlock happens in + * dlm_lowcomms_commit_msg which is a must call if success + */ +#ifndef __CHECKER__ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, char **ppc, void (*cb)(void *data), void *data) @@ -1336,6 +1340,7 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, msg->idx = idx; return msg; } +#endif static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg) { @@ -1362,11 +1367,16 @@ out: return; } +/* avoid false positive for nodes_srcu, lock was happen in + * dlm_lowcomms_new_msg + */ +#ifndef __CHECKER__ void dlm_lowcomms_commit_msg(struct dlm_msg *msg) { _dlm_lowcomms_commit_msg(msg); srcu_read_unlock(&connections_srcu, msg->idx); } +#endif void dlm_lowcomms_put_msg(struct dlm_msg *msg) { @@ -1789,7 +1799,7 @@ static int dlm_listen_for_all(void) SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) { log_print("Can't create comms socket: %d", result); - goto out; + return result; } sock_set_mark(sock->sk, dlm_config.ci_mark); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 61f906e705db..98084e0cfccf 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -20,7 +20,7 @@ int dlm_slots_version(struct dlm_header *h) { - if ((h->h_version & 0x0000FFFF) < DLM_HEADER_SLOTS) + if ((le32_to_cpu(h->h_version) & 0x0000FFFF) < DLM_HEADER_SLOTS) return 0; return 1; } @@ -120,18 +120,13 @@ int dlm_slots_copy_in(struct dlm_ls *ls) ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); - for (i = 0, ro = ro0; i < num_slots; i++, ro++) { - ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid); - ro->ro_slot = le16_to_cpu(ro->ro_slot); - } - log_slots(ls, gen, num_slots, ro0, NULL, 0); list_for_each_entry(memb, &ls->ls_nodes, list) { for (i = 0, ro = ro0; i < num_slots; i++, ro++) { - if (ro->ro_nodeid != memb->nodeid) + if (le32_to_cpu(ro->ro_nodeid) != memb->nodeid) continue; - memb->slot = ro->ro_slot; + memb->slot = le16_to_cpu(ro->ro_slot); memb->slot_prev = memb->slot; break; } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 3635e42b0669..6489bc22ad61 100644 --- 
a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -135,6 +135,7 @@ #include <net/tcp.h> #include "dlm_internal.h" +#include "lockspace.h" #include "lowcomms.h" #include "config.h" #include "memory.h" @@ -380,13 +381,12 @@ static int dlm_send_ack(int nodeid, uint32_t seq) m_header = (struct dlm_header *)ppc; - m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - m_header->h_nodeid = dlm_our_nodeid(); - m_header->h_length = mb_len; + m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid()); + m_header->h_length = cpu_to_le16(mb_len); m_header->h_cmd = DLM_ACK; - m_header->u.h_seq = seq; + m_header->u.h_seq = cpu_to_le32(seq); - header_out(m_header); dlm_lowcomms_commit_msg(msg); dlm_lowcomms_put_msg(msg); @@ -409,13 +409,11 @@ static int dlm_send_fin(struct midcomms_node *node, m_header = (struct dlm_header *)ppc; - m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - m_header->h_nodeid = dlm_our_nodeid(); - m_header->h_length = mb_len; + m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid()); + m_header->h_length = cpu_to_le16(mb_len); m_header->h_cmd = DLM_FIN; - header_out(m_header); - pr_debug("sending fin msg to node %d\n", node->nodeid); dlm_midcomms_commit_mhandle(mh); set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); @@ -574,14 +572,14 @@ dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p, return NULL; } - switch (le32_to_cpu(p->rcom.rc_type)) { - case DLM_RCOM_NAMES: + switch (p->rcom.rc_type) { + case cpu_to_le32(DLM_RCOM_NAMES): fallthrough; - case DLM_RCOM_NAMES_REPLY: + case cpu_to_le32(DLM_RCOM_NAMES_REPLY): fallthrough; - case DLM_RCOM_STATUS: + case cpu_to_le32(DLM_RCOM_STATUS): fallthrough; - case DLM_RCOM_STATUS_REPLY: + case cpu_to_le32(DLM_RCOM_STATUS_REPLY): node = nodeid2node(nodeid, 0); if (node) { spin_lock(&node->state_lock); @@ -741,14 +739,14 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) * * length already checked. 
*/ - switch (le32_to_cpu(p->rcom.rc_type)) { - case DLM_RCOM_NAMES: + switch (p->rcom.rc_type) { + case cpu_to_le32(DLM_RCOM_NAMES): fallthrough; - case DLM_RCOM_NAMES_REPLY: + case cpu_to_le32(DLM_RCOM_NAMES_REPLY): fallthrough; - case DLM_RCOM_STATUS: + case cpu_to_le32(DLM_RCOM_STATUS): fallthrough; - case DLM_RCOM_STATUS_REPLY: + case cpu_to_le32(DLM_RCOM_STATUS_REPLY): break; default: log_print("unsupported rcom type received: %u, will skip this message from node %d", @@ -1020,11 +1018,10 @@ static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len, uint32_t seq) { opts->o_header.h_cmd = DLM_OPTS; - opts->o_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - opts->o_header.h_nodeid = dlm_our_nodeid(); - opts->o_header.h_length = DLM_MIDCOMMS_OPT_LEN + inner_len; - opts->o_header.u.h_seq = seq; - header_out(&opts->o_header); + opts->o_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + opts->o_header.h_nodeid = cpu_to_le32(dlm_our_nodeid()); + opts->o_header.h_length = cpu_to_le16(DLM_MIDCOMMS_OPT_LEN + inner_len); + opts->o_header.u.h_seq = cpu_to_le32(seq); } static void midcomms_new_msg_cb(void *data) @@ -1062,6 +1059,10 @@ static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int node return msg; } +/* avoid false positive for nodes_srcu, unlock happens in + * dlm_midcomms_commit_mhandle which is a must call if success + */ +#ifndef __CHECKER__ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc) { @@ -1127,6 +1128,7 @@ err: srcu_read_unlock(&nodes_srcu, idx); return NULL; } +#endif static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) { @@ -1136,6 +1138,10 @@ static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) dlm_lowcomms_commit_msg(mh->msg); } +/* avoid false positive for nodes_srcu, lock was happen in + * dlm_midcomms_get_mhandle + */ +#ifndef __CHECKER__ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) { switch (mh->node->version) { @@ -1157,6 +1163,7 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) break; } } +#endif int dlm_midcomms_start(void) { @@ -1406,6 +1413,8 @@ int dlm_midcomms_close(int nodeid) if (nodeid == dlm_our_nodeid()) return 0; + dlm_stop_lockspaces_check(); + idx = srcu_read_lock(&nodes_srcu); /* Abort pending close/remove operation */ node = nodeid2node(nodeid, 0); @@ -1455,7 +1464,7 @@ static void midcomms_new_rawmsg_cb(void *data) switch (h->h_cmd) { case DLM_OPTS: if (!h->u.h_seq) - h->u.h_seq = rd->node->seq_send++; + h->u.h_seq = cpu_to_le32(rd->node->seq_send++); break; default: break; diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index c38b2b8ffd1d..0993eebf2060 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -13,26 +13,26 @@ #include "dlm_internal.h" #include "lockspace.h" -static spinlock_t ops_lock; -static struct list_head send_list; -static struct list_head recv_list; -static wait_queue_head_t send_wq; -static wait_queue_head_t recv_wq; +static DEFINE_SPINLOCK(ops_lock); +static LIST_HEAD(send_list); +static LIST_HEAD(recv_list); +static DECLARE_WAIT_QUEUE_HEAD(send_wq); +static DECLARE_WAIT_QUEUE_HEAD(recv_wq); -struct plock_op { - struct list_head list; - int done; - struct dlm_plock_info info; -}; - -struct plock_xop { - struct plock_op xop; - int (*callback)(struct file_lock *fl, int result); +struct plock_async_data { void *fl; void *file; struct file_lock flc; + int (*callback)(struct file_lock *fl, int result); }; +struct plock_op { + struct list_head list; + int done; + struct dlm_plock_info 
info; + /* if set indicates async handling */ + struct plock_async_data *data; +}; static inline void set_version(struct dlm_plock_info *info) { @@ -58,10 +58,15 @@ static int check_version(struct dlm_plock_info *info) return 0; } +static void dlm_release_plock_op(struct plock_op *op) +{ + kfree(op->data); + kfree(op); +} + static void send_op(struct plock_op *op) { set_version(&op->info); - INIT_LIST_HEAD(&op->list); spin_lock(&ops_lock); list_add_tail(&op->list, &send_list); spin_unlock(&ops_lock); @@ -101,22 +106,21 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number, int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, int cmd, struct file_lock *fl) { + struct plock_async_data *op_data; struct dlm_ls *ls; struct plock_op *op; - struct plock_xop *xop; int rv; ls = dlm_find_lockspace_local(lockspace); if (!ls) return -EINVAL; - xop = kzalloc(sizeof(*xop), GFP_NOFS); - if (!xop) { + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) { rv = -ENOMEM; goto out; } - op = &xop->xop; op->info.optype = DLM_PLOCK_OP_LOCK; op->info.pid = fl->fl_pid; op->info.ex = (fl->fl_type == F_WRLCK); @@ -125,46 +129,49 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; + /* async handling */ if (fl->fl_lmops && fl->fl_lmops->lm_grant) { + op_data = kzalloc(sizeof(*op_data), GFP_NOFS); + if (!op_data) { + dlm_release_plock_op(op); + rv = -ENOMEM; + goto out; + } + /* fl_owner is lockd which doesn't distinguish processes on the nfs client */ op->info.owner = (__u64) fl->fl_pid; - xop->callback = fl->fl_lmops->lm_grant; - locks_init_lock(&xop->flc); - locks_copy_lock(&xop->flc, fl); - xop->fl = fl; - xop->file = file; + op_data->callback = fl->fl_lmops->lm_grant; + locks_init_lock(&op_data->flc); + locks_copy_lock(&op_data->flc, fl); + op_data->fl = fl; + op_data->file = file; + + op->data = op_data; + + send_op(op); + rv = FILE_LOCK_DEFERRED; + goto out; } else { op->info.owner = (__u64)(long) fl->fl_owner; - xop->callback = NULL; } send_op(op); - if (xop->callback == NULL) { - rv = wait_event_interruptible(recv_wq, (op->done != 0)); - if (rv == -ERESTARTSYS) { - log_debug(ls, "dlm_posix_lock: wait killed %llx", - (unsigned long long)number); - spin_lock(&ops_lock); - list_del(&op->list); - spin_unlock(&ops_lock); - kfree(xop); - do_unlock_close(ls, number, file, fl); - goto out; - } - } else { - rv = FILE_LOCK_DEFERRED; + rv = wait_event_interruptible(recv_wq, (op->done != 0)); + if (rv == -ERESTARTSYS) { + spin_lock(&ops_lock); + list_del(&op->list); + spin_unlock(&ops_lock); + log_print("%s: wait interrupted %x %llx, op removed", + __func__, ls->ls_global_id, + (unsigned long long)number); + dlm_release_plock_op(op); + do_unlock_close(ls, number, file, fl); goto out; } - spin_lock(&ops_lock); - if (!list_empty(&op->list)) { - log_error(ls, "dlm_posix_lock: op on list %llx", - (unsigned long long)number); - list_del(&op->list); - } - spin_unlock(&ops_lock); + WARN_ON(!list_empty(&op->list)); rv = op->info.rv; @@ -174,7 +181,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, (unsigned long long)number); } - kfree(xop); + dlm_release_plock_op(op); out: dlm_put_lockspace(ls); return rv; @@ -184,26 +191,20 @@ EXPORT_SYMBOL_GPL(dlm_posix_lock); /* Returns failure iff a successful lock operation should be canceled */ static int dlm_plock_callback(struct plock_op *op) { + struct plock_async_data *op_data = op->data; struct file *file; struct 
file_lock *fl; struct file_lock *flc; int (*notify)(struct file_lock *fl, int result) = NULL; - struct plock_xop *xop = (struct plock_xop *)op; int rv = 0; - spin_lock(&ops_lock); - if (!list_empty(&op->list)) { - log_print("dlm_plock_callback: op on list %llx", - (unsigned long long)op->info.number); - list_del(&op->list); - } - spin_unlock(&ops_lock); + WARN_ON(!list_empty(&op->list)); /* check if the following 2 are still valid or make a copy */ - file = xop->file; - flc = &xop->flc; - fl = xop->fl; - notify = xop->callback; + file = op_data->file; + flc = &op_data->flc; + fl = op_data->fl; + notify = op_data->callback; if (op->info.rv) { notify(fl, op->info.rv); @@ -234,7 +235,7 @@ static int dlm_plock_callback(struct plock_op *op) } out: - kfree(xop); + dlm_release_plock_op(op); return rv; } @@ -290,13 +291,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); wait_event(recv_wq, (op->done != 0)); - spin_lock(&ops_lock); - if (!list_empty(&op->list)) { - log_error(ls, "dlm_posix_unlock: op on list %llx", - (unsigned long long)number); - list_del(&op->list); - } - spin_unlock(&ops_lock); + WARN_ON(!list_empty(&op->list)); rv = op->info.rv; @@ -304,7 +299,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, rv = 0; out_free: - kfree(op); + dlm_release_plock_op(op); out: dlm_put_lockspace(ls); fl->fl_flags = fl_flags; @@ -344,13 +339,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); wait_event(recv_wq, (op->done != 0)); - spin_lock(&ops_lock); - if (!list_empty(&op->list)) { - log_error(ls, "dlm_posix_get: op on list %llx", - (unsigned long long)number); - list_del(&op->list); - } - spin_unlock(&ops_lock); + WARN_ON(!list_empty(&op->list)); /* info.rv from userspace is 1 for conflict, 0 for no-conflict, -ENOENT if there are no locks on the file */ @@ -370,7 +359,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, rv = 0; } - kfree(op); + dlm_release_plock_op(op); out: dlm_put_lockspace(ls); return rv; @@ -406,7 +395,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, (the process did not make an unlock call). 
*/ if (op->info.flags & DLM_PLOCK_FL_CLOSE) - kfree(op); + dlm_release_plock_op(op); if (copy_to_user(u, &info, sizeof(info))) return -EFAULT; @@ -418,9 +407,9 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, static ssize_t dev_write(struct file *file, const char __user *u, size_t count, loff_t *ppos) { + struct plock_op *op = NULL, *iter; struct dlm_plock_info info; - struct plock_op *op; - int found = 0, do_callback = 0; + int do_callback = 0; if (count != sizeof(info)) return -EINVAL; @@ -432,31 +421,30 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, return -EINVAL; spin_lock(&ops_lock); - list_for_each_entry(op, &recv_list, list) { - if (op->info.fsid == info.fsid && - op->info.number == info.number && - op->info.owner == info.owner) { - struct plock_xop *xop = (struct plock_xop *)op; - list_del_init(&op->list); - memcpy(&op->info, &info, sizeof(info)); - if (xop->callback) + list_for_each_entry(iter, &recv_list, list) { + if (iter->info.fsid == info.fsid && + iter->info.number == info.number && + iter->info.owner == info.owner) { + list_del_init(&iter->list); + memcpy(&iter->info, &info, sizeof(info)); + if (iter->data) do_callback = 1; else - op->done = 1; - found = 1; + iter->done = 1; + op = iter; break; } } spin_unlock(&ops_lock); - if (found) { + if (op) { if (do_callback) dlm_plock_callback(op); else wake_up(&recv_wq); } else - log_print("dev_write no op %x %llx", info.fsid, - (unsigned long long)info.number); + log_print("%s: no op %x %llx - may got interrupted?", __func__, + info.fsid, (unsigned long long)info.number); return count; } @@ -492,12 +480,6 @@ int dlm_plock_init(void) { int rv; - spin_lock_init(&ops_lock); - INIT_LIST_HEAD(&send_list); - INIT_LIST_HEAD(&recv_list); - init_waitqueue_head(&send_wq); - init_waitqueue_head(&recv_wq); - rv = misc_register(&plock_dev_misc); if (rv) log_print("dlm_plock_init: misc_register failed %d", rv); diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 5821b777a1a7..f19860315043 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -34,16 +34,16 @@ static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, rc = (struct dlm_rcom *) mb; - rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - rc->rc_header.u.h_lockspace = ls->ls_global_id; - rc->rc_header.h_nodeid = dlm_our_nodeid(); - rc->rc_header.h_length = mb_len; + rc->rc_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + rc->rc_header.u.h_lockspace = cpu_to_le32(ls->ls_global_id); + rc->rc_header.h_nodeid = cpu_to_le32(dlm_our_nodeid()); + rc->rc_header.h_length = cpu_to_le16(mb_len); rc->rc_header.h_cmd = DLM_RCOM; - rc->rc_type = type; + rc->rc_type = cpu_to_le32(type); spin_lock(&ls->ls_recover_lock); - rc->rc_seq = ls->ls_recover_seq; + rc->rc_seq = cpu_to_le64(ls->ls_recover_seq); spin_unlock(&ls->ls_recover_lock); *rc_ret = rc; @@ -91,13 +91,11 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc) { - dlm_rcom_out(rc); dlm_midcomms_commit_mhandle(mh); } static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc) { - dlm_rcom_out(rc); dlm_lowcomms_commit_msg(msg); dlm_lowcomms_put_msg(msg); } @@ -127,10 +125,10 @@ static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) { struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; - if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) { + if ((le32_to_cpu(rc->rc_header.h_version) & 
0xFFFF0000) != DLM_HEADER_MAJOR) { log_error(ls, "version mismatch: %x nodeid %d: %x", DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid, - rc->rc_header.h_version); + le32_to_cpu(rc->rc_header.h_version)); return -EPROTO; } @@ -145,10 +143,10 @@ static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) return 0; } -static void allow_sync_reply(struct dlm_ls *ls, uint64_t *new_seq) +static void allow_sync_reply(struct dlm_ls *ls, __le64 *new_seq) { spin_lock(&ls->ls_rcom_spin); - *new_seq = ++ls->ls_rcom_seq; + *new_seq = cpu_to_le64(++ls->ls_rcom_seq); set_bit(LSFL_RCOM_WAIT, &ls->ls_flags); spin_unlock(&ls->ls_rcom_spin); } @@ -182,7 +180,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) if (nodeid == dlm_our_nodeid()) { rc = ls->ls_recover_buf; - rc->rc_result = dlm_recover_status(ls); + rc->rc_result = cpu_to_le32(dlm_recover_status(ls)); goto out; } @@ -208,7 +206,7 @@ retry: rc = ls->ls_recover_buf; - if (rc->rc_result == -ESRCH) { + if (rc->rc_result == cpu_to_le32(-ESRCH)) { /* we pretend the remote lockspace exists with 0 status */ log_debug(ls, "remote node %d not ready", nodeid); rc->rc_result = 0; @@ -227,7 +225,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) struct dlm_rcom *rc; struct rcom_status *rs; uint32_t status; - int nodeid = rc_in->rc_header.h_nodeid; + int nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); int len = sizeof(struct rcom_config); struct dlm_msg *msg; int num_slots = 0; @@ -259,7 +257,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; - rc->rc_result = status; + rc->rc_result = cpu_to_le32(status); set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots); @@ -287,14 +285,16 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) { spin_lock(&ls->ls_rcom_spin); if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) || - rc_in->rc_id != ls->ls_rcom_seq) { + le64_to_cpu(rc_in->rc_id) != ls->ls_rcom_seq) { log_debug(ls, "reject reply %d from %d seq %llx expect %llx", - rc_in->rc_type, rc_in->rc_header.h_nodeid, - (unsigned long long)rc_in->rc_id, + le32_to_cpu(rc_in->rc_type), + le32_to_cpu(rc_in->rc_header.h_nodeid), + (unsigned long long)le64_to_cpu(rc_in->rc_id), (unsigned long long)ls->ls_rcom_seq); goto out; } - memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length); + memcpy(ls->ls_recover_buf, rc_in, + le16_to_cpu(rc_in->rc_header.h_length)); set_bit(LSFL_RCOM_READY, &ls->ls_flags); clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags); wake_up(&ls->ls_wait_general); @@ -336,8 +336,9 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) int error, inlen, outlen, nodeid; struct dlm_msg *msg; - nodeid = rc_in->rc_header.h_nodeid; - inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); + nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); + inlen = le16_to_cpu(rc_in->rc_header.h_length) - + sizeof(struct dlm_rcom); outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom); error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, @@ -364,7 +365,7 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) if (error) goto out; memcpy(rc->rc_buf, r->res_name, r->res_length); - rc->rc_id = (unsigned long) r->res_id; + rc->rc_id = cpu_to_le64(r->res_id); send_rcom(mh, rc); out: @@ -375,11 +376,12 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; struct dlm_mhandle *mh; - int error, ret_nodeid, 
nodeid = rc_in->rc_header.h_nodeid; - int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); + int error, ret_nodeid, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); + int len = le16_to_cpu(rc_in->rc_header.h_length) - + sizeof(struct dlm_rcom); /* Old code would send this special id to trigger a debug dump. */ - if (rc_in->rc_id == 0xFFFFFFFF) { + if (rc_in->rc_id == cpu_to_le64(0xFFFFFFFF)) { log_error(ls, "receive_rcom_lookup dump from %d", nodeid); dlm_dump_rsb_name(ls, rc_in->rc_buf, len); return; @@ -393,7 +395,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL); if (error) ret_nodeid = error; - rc->rc_result = ret_nodeid; + rc->rc_result = cpu_to_le32(ret_nodeid); rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; @@ -452,7 +454,7 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) rl = (struct rcom_lock *) rc->rc_buf; pack_rcom_lock(r, lkb, rl); - rc->rc_id = (unsigned long) r; + rc->rc_id = cpu_to_le64((uintptr_t)r); send_rcom(mh, rc); out: @@ -464,7 +466,7 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; struct dlm_mhandle *mh; - int error, nodeid = rc_in->rc_header.h_nodeid; + int error, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); dlm_recover_master_copy(ls, rc_in); @@ -500,21 +502,20 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rc = (struct dlm_rcom *) mb; - rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + rc->rc_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); rc->rc_header.u.h_lockspace = rc_in->rc_header.u.h_lockspace; - rc->rc_header.h_nodeid = dlm_our_nodeid(); - rc->rc_header.h_length = mb_len; + rc->rc_header.h_nodeid = cpu_to_le32(dlm_our_nodeid()); + rc->rc_header.h_length = cpu_to_le16(mb_len); rc->rc_header.h_cmd = DLM_RCOM; - rc->rc_type = DLM_RCOM_STATUS_REPLY; + rc->rc_type = cpu_to_le32(DLM_RCOM_STATUS_REPLY); rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; - rc->rc_result = -ESRCH; + rc->rc_result = cpu_to_le32(-ESRCH); rf = (struct rcom_config *) rc->rc_buf; rf->rf_lvblen = cpu_to_le32(~0U); - dlm_rcom_out(rc); dlm_midcomms_commit_mhandle(mh); return 0; @@ -573,27 +574,27 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) uint64_t seq; switch (rc->rc_type) { - case DLM_RCOM_STATUS_REPLY: + case cpu_to_le32(DLM_RCOM_STATUS_REPLY): reply = 1; break; - case DLM_RCOM_NAMES: + case cpu_to_le32(DLM_RCOM_NAMES): names = 1; break; - case DLM_RCOM_NAMES_REPLY: + case cpu_to_le32(DLM_RCOM_NAMES_REPLY): names = 1; reply = 1; break; - case DLM_RCOM_LOOKUP: + case cpu_to_le32(DLM_RCOM_LOOKUP): lookup = 1; break; - case DLM_RCOM_LOOKUP_REPLY: + case cpu_to_le32(DLM_RCOM_LOOKUP_REPLY): lookup = 1; reply = 1; break; - case DLM_RCOM_LOCK: + case cpu_to_le32(DLM_RCOM_LOCK): lock = 1; break; - case DLM_RCOM_LOCK_REPLY: + case cpu_to_le32(DLM_RCOM_LOCK_REPLY): lock = 1; reply = 1; break; @@ -605,10 +606,10 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) seq = ls->ls_recover_seq; spin_unlock(&ls->ls_recover_lock); - if (stop && (rc->rc_type != DLM_RCOM_STATUS)) + if (stop && (rc->rc_type != cpu_to_le32(DLM_RCOM_STATUS))) goto ignore; - if (reply && (rc->rc_seq_reply != seq)) + if (reply && (le64_to_cpu(rc->rc_seq_reply) != seq)) goto ignore; if (!(status & DLM_RS_NODES) && (names || lookup || lock)) @@ -618,59 +619,60 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) goto 
ignore; switch (rc->rc_type) { - case DLM_RCOM_STATUS: + case cpu_to_le32(DLM_RCOM_STATUS): receive_rcom_status(ls, rc); break; - case DLM_RCOM_NAMES: + case cpu_to_le32(DLM_RCOM_NAMES): receive_rcom_names(ls, rc); break; - case DLM_RCOM_LOOKUP: + case cpu_to_le32(DLM_RCOM_LOOKUP): receive_rcom_lookup(ls, rc); break; - case DLM_RCOM_LOCK: - if (rc->rc_header.h_length < lock_size) + case cpu_to_le32(DLM_RCOM_LOCK): + if (le16_to_cpu(rc->rc_header.h_length) < lock_size) goto Eshort; receive_rcom_lock(ls, rc); break; - case DLM_RCOM_STATUS_REPLY: + case cpu_to_le32(DLM_RCOM_STATUS_REPLY): receive_sync_reply(ls, rc); break; - case DLM_RCOM_NAMES_REPLY: + case cpu_to_le32(DLM_RCOM_NAMES_REPLY): receive_sync_reply(ls, rc); break; - case DLM_RCOM_LOOKUP_REPLY: + case cpu_to_le32(DLM_RCOM_LOOKUP_REPLY): receive_rcom_lookup_reply(ls, rc); break; - case DLM_RCOM_LOCK_REPLY: - if (rc->rc_header.h_length < lock_size) + case cpu_to_le32(DLM_RCOM_LOCK_REPLY): + if (le16_to_cpu(rc->rc_header.h_length) < lock_size) goto Eshort; dlm_recover_process_copy(ls, rc); break; default: - log_error(ls, "receive_rcom bad type %d", rc->rc_type); + log_error(ls, "receive_rcom bad type %d", + le32_to_cpu(rc->rc_type)); } return; ignore: log_limit(ls, "dlm_receive_rcom ignore msg %d " "from %d %llu %llu recover seq %llu sts %x gen %u", - rc->rc_type, + le32_to_cpu(rc->rc_type), nodeid, - (unsigned long long)rc->rc_seq, - (unsigned long long)rc->rc_seq_reply, + (unsigned long long)le64_to_cpu(rc->rc_seq), + (unsigned long long)le64_to_cpu(rc->rc_seq_reply), (unsigned long long)seq, status, ls->ls_generation); return; Eshort: log_error(ls, "recovery message %d from %d is too short", - rc->rc_type, nodeid); + le32_to_cpu(rc->rc_type), nodeid); } diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 8928e99dfd47..ccff1791803f 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -114,7 +114,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, if (save_slots) dlm_slot_save(ls, rc, memb); - if (rc->rc_result & wait_status) + if (le32_to_cpu(rc->rc_result) & wait_status) break; if (delay < 1000) delay += 20; @@ -141,7 +141,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, if (error) break; - if (rc->rc_result & wait_status) + if (le32_to_cpu(rc->rc_result) & wait_status) break; if (delay < 1000) delay += 20; @@ -568,14 +568,14 @@ int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) struct dlm_rsb *r; int ret_nodeid, new_master; - r = recover_idr_find(ls, rc->rc_id); + r = recover_idr_find(ls, le64_to_cpu(rc->rc_id)); if (!r) { log_error(ls, "dlm_recover_master_reply no id %llx", - (unsigned long long)rc->rc_id); + (unsigned long long)le64_to_cpu(rc->rc_id)); goto out; } - ret_nodeid = rc->rc_result; + ret_nodeid = le32_to_cpu(rc->rc_result); if (ret_nodeid == dlm_our_nodeid()) new_master = 0; @@ -732,10 +732,9 @@ void dlm_recovered_lock(struct dlm_rsb *r) static void recover_lvb(struct dlm_rsb *r) { - struct dlm_lkb *lkb, *high_lkb = NULL; + struct dlm_lkb *big_lkb = NULL, *iter, *high_lkb = NULL; uint32_t high_seq = 0; int lock_lvb_exists = 0; - int big_lock_exists = 0; int lvblen = r->res_ls->ls_lvblen; if (!rsb_flag(r, RSB_NEW_MASTER2) && @@ -751,37 +750,37 @@ static void recover_lvb(struct dlm_rsb *r) /* we are the new master, so figure out if VALNOTVALID should be set, and set the rsb lvb from the best lkb available. 
*/ - list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { - if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + list_for_each_entry(iter, &r->res_grantqueue, lkb_statequeue) { + if (!(iter->lkb_exflags & DLM_LKF_VALBLK)) continue; lock_lvb_exists = 1; - if (lkb->lkb_grmode > DLM_LOCK_CR) { - big_lock_exists = 1; + if (iter->lkb_grmode > DLM_LOCK_CR) { + big_lkb = iter; goto setflag; } - if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) { - high_lkb = lkb; - high_seq = lkb->lkb_lvbseq; + if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) { + high_lkb = iter; + high_seq = iter->lkb_lvbseq; } } - list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { - if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) + list_for_each_entry(iter, &r->res_convertqueue, lkb_statequeue) { + if (!(iter->lkb_exflags & DLM_LKF_VALBLK)) continue; lock_lvb_exists = 1; - if (lkb->lkb_grmode > DLM_LOCK_CR) { - big_lock_exists = 1; + if (iter->lkb_grmode > DLM_LOCK_CR) { + big_lkb = iter; goto setflag; } - if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) { - high_lkb = lkb; - high_seq = lkb->lkb_lvbseq; + if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) { + high_lkb = iter; + high_seq = iter->lkb_lvbseq; } } @@ -790,7 +789,7 @@ static void recover_lvb(struct dlm_rsb *r) goto out; /* lvb is invalidated if only NL/CR locks remain */ - if (!big_lock_exists) + if (!big_lkb) rsb_set_flag(r, RSB_VALNOTVALID); if (!r->res_lvbptr) { @@ -799,9 +798,9 @@ static void recover_lvb(struct dlm_rsb *r) goto out; } - if (big_lock_exists) { - r->res_lvbseq = lkb->lkb_lvbseq; - memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen); + if (big_lkb) { + r->res_lvbseq = big_lkb->lkb_lvbseq; + memcpy(r->res_lvbptr, big_lkb->lkb_lvbptr, lvblen); } else if (high_lkb) { r->res_lvbseq = high_lkb->lkb_lvbseq; memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen); diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index ccb5307c21e9..036a9a0078f6 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -14,6 +14,7 @@ #include "dir.h" #include "config.h" #include "requestqueue.h" +#include "util.h" struct rq_entry { struct list_head list; @@ -32,7 +33,8 @@ struct rq_entry { void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) { struct rq_entry *e; - int length = ms->m_header.h_length - sizeof(struct dlm_message); + int length = le16_to_cpu(ms->m_header.h_length) - + sizeof(struct dlm_message); e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS); if (!e) { @@ -42,7 +44,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF; e->nodeid = nodeid; - memcpy(&e->request, ms, ms->m_header.h_length); + memcpy(&e->request, ms, le16_to_cpu(ms->m_header.h_length)); atomic_inc(&ls->ls_requestqueue_cnt); mutex_lock(&ls->ls_requestqueue_mutex); @@ -82,8 +84,10 @@ int dlm_process_requestqueue(struct dlm_ls *ls) log_limit(ls, "dlm_process_requestqueue msg %d from %d " "lkid %x remid %x result %d seq %u", - ms->m_type, ms->m_header.h_nodeid, - ms->m_lkid, ms->m_remid, ms->m_result, + le32_to_cpu(ms->m_type), + le32_to_cpu(ms->m_header.h_nodeid), + le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid), + from_dlm_errno(le32_to_cpu(ms->m_result)), e->recover_seq); dlm_receive_message_saved(ls, &e->request, e->recover_seq); @@ -124,7 +128,7 @@ void dlm_wait_requestqueue(struct dlm_ls *ls) static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) { - uint32_t type = ms->m_type; + __le32 type = ms->m_type; /* the ls is being cleaned up 
and freed by release_lockspace */ if (!atomic_read(&ls->ls_count)) @@ -136,9 +140,9 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) /* directory operations are always purged because the directory is always rebuilt during recovery and the lookups resent */ - if (type == DLM_MSG_REMOVE || - type == DLM_MSG_LOOKUP || - type == DLM_MSG_LOOKUP_REPLY) + if (type == cpu_to_le32(DLM_MSG_REMOVE) || + type == cpu_to_le32(DLM_MSG_LOOKUP) || + type == cpu_to_le32(DLM_MSG_LOOKUP_REPLY)) return 1; if (!dlm_no_directory(ls)) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index e5cefa90b1ce..1060b24f18d4 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -108,11 +108,11 @@ static void compat_input(struct dlm_write_request *kb, kb->i.lock.parent = kb32->i.lock.parent; kb->i.lock.xid = kb32->i.lock.xid; kb->i.lock.timeout = kb32->i.lock.timeout; - kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam; - kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr; - kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam; - kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; - kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; + kb->i.lock.castparam = (__user void *)(long)kb32->i.lock.castparam; + kb->i.lock.castaddr = (__user void *)(long)kb32->i.lock.castaddr; + kb->i.lock.bastparam = (__user void *)(long)kb32->i.lock.bastparam; + kb->i.lock.bastaddr = (__user void *)(long)kb32->i.lock.bastaddr; + kb->i.lock.lksb = (__user void *)(long)kb32->i.lock.lksb; memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); memcpy(kb->i.lock.name, kb32->i.lock.name, namelen); } @@ -127,9 +127,9 @@ static void compat_output(struct dlm_lock_result *res, res32->version[1] = res->version[1]; res32->version[2] = res->version[2]; - res32->user_astaddr = (__u32)(long)res->user_astaddr; - res32->user_astparam = (__u32)(long)res->user_astparam; - res32->user_lksb = (__u32)(long)res->user_lksb; + res32->user_astaddr = (__u32)(__force long)res->user_astaddr; + res32->user_astparam = (__u32)(__force long)res->user_astparam; + res32->user_lksb = (__u32)(__force long)res->user_lksb; res32->bast_mode = res->bast_mode; res32->lvb_offset = res->lvb_offset; diff --git a/fs/dlm/util.c b/fs/dlm/util.c index 58acbcc2081a..f2bc401f312f 100644 --- a/fs/dlm/util.c +++ b/fs/dlm/util.c @@ -20,28 +20,10 @@ #define DLM_ERRNO_ETIMEDOUT 110 #define DLM_ERRNO_EINPROGRESS 115 -void header_out(struct dlm_header *hd) -{ - hd->h_version = cpu_to_le32(hd->h_version); - /* does it for others u32 in union as well */ - hd->u.h_lockspace = cpu_to_le32(hd->u.h_lockspace); - hd->h_nodeid = cpu_to_le32(hd->h_nodeid); - hd->h_length = cpu_to_le16(hd->h_length); -} - -void header_in(struct dlm_header *hd) -{ - hd->h_version = le32_to_cpu(hd->h_version); - /* does it for others u32 in union as well */ - hd->u.h_lockspace = le32_to_cpu(hd->u.h_lockspace); - hd->h_nodeid = le32_to_cpu(hd->h_nodeid); - hd->h_length = le16_to_cpu(hd->h_length); -} - /* higher errno values are inconsistent across architectures, so select one set of values for on the wire */ -static int to_dlm_errno(int err) +int to_dlm_errno(int err) { switch (err) { case -EDEADLK: @@ -62,7 +44,7 @@ static int to_dlm_errno(int err) return err; } -static int from_dlm_errno(int err) +int from_dlm_errno(int err) { switch (err) { case -DLM_ERRNO_EDEADLK: @@ -82,73 +64,3 @@ static int from_dlm_errno(int err) } return err; } - -void dlm_message_out(struct dlm_message *ms) -{ - header_out(&ms->m_header); - - ms->m_type = cpu_to_le32(ms->m_type); - ms->m_nodeid = 
cpu_to_le32(ms->m_nodeid); - ms->m_pid = cpu_to_le32(ms->m_pid); - ms->m_lkid = cpu_to_le32(ms->m_lkid); - ms->m_remid = cpu_to_le32(ms->m_remid); - ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid); - ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid); - ms->m_exflags = cpu_to_le32(ms->m_exflags); - ms->m_sbflags = cpu_to_le32(ms->m_sbflags); - ms->m_flags = cpu_to_le32(ms->m_flags); - ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq); - ms->m_hash = cpu_to_le32(ms->m_hash); - ms->m_status = cpu_to_le32(ms->m_status); - ms->m_grmode = cpu_to_le32(ms->m_grmode); - ms->m_rqmode = cpu_to_le32(ms->m_rqmode); - ms->m_bastmode = cpu_to_le32(ms->m_bastmode); - ms->m_asts = cpu_to_le32(ms->m_asts); - ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result)); -} - -void dlm_message_in(struct dlm_message *ms) -{ - header_in(&ms->m_header); - - ms->m_type = le32_to_cpu(ms->m_type); - ms->m_nodeid = le32_to_cpu(ms->m_nodeid); - ms->m_pid = le32_to_cpu(ms->m_pid); - ms->m_lkid = le32_to_cpu(ms->m_lkid); - ms->m_remid = le32_to_cpu(ms->m_remid); - ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid); - ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid); - ms->m_exflags = le32_to_cpu(ms->m_exflags); - ms->m_sbflags = le32_to_cpu(ms->m_sbflags); - ms->m_flags = le32_to_cpu(ms->m_flags); - ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq); - ms->m_hash = le32_to_cpu(ms->m_hash); - ms->m_status = le32_to_cpu(ms->m_status); - ms->m_grmode = le32_to_cpu(ms->m_grmode); - ms->m_rqmode = le32_to_cpu(ms->m_rqmode); - ms->m_bastmode = le32_to_cpu(ms->m_bastmode); - ms->m_asts = le32_to_cpu(ms->m_asts); - ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result)); -} - -void dlm_rcom_out(struct dlm_rcom *rc) -{ - header_out(&rc->rc_header); - - rc->rc_type = cpu_to_le32(rc->rc_type); - rc->rc_result = cpu_to_le32(rc->rc_result); - rc->rc_id = cpu_to_le64(rc->rc_id); - rc->rc_seq = cpu_to_le64(rc->rc_seq); - rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); -} - -void dlm_rcom_in(struct dlm_rcom *rc) -{ - header_in(&rc->rc_header); - - rc->rc_type = le32_to_cpu(rc->rc_type); - rc->rc_result = le32_to_cpu(rc->rc_result); - rc->rc_id = le64_to_cpu(rc->rc_id); - rc->rc_seq = le64_to_cpu(rc->rc_seq); - rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); -} diff --git a/fs/dlm/util.h b/fs/dlm/util.h index d46f23c7a6a0..b6a4b8adca8d 100644 --- a/fs/dlm/util.h +++ b/fs/dlm/util.h @@ -11,12 +11,8 @@ #ifndef __UTIL_DOT_H__ #define __UTIL_DOT_H__ -void dlm_message_out(struct dlm_message *ms); -void dlm_message_in(struct dlm_message *ms); -void dlm_rcom_out(struct dlm_rcom *rc); -void dlm_rcom_in(struct dlm_rcom *rc); -void header_out(struct dlm_header *hd); -void header_in(struct dlm_header *hd); +int to_dlm_errno(int err); +int from_dlm_errno(int err); #endif diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index f57255ab88ed..85490370e0ca 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -98,3 +98,13 @@ config EROFS_FS_ZIP_LZMA systems will be readable without selecting this option. If unsure, say N. + +config EROFS_FS_ONDEMAND + bool "EROFS fscache-based on-demand read support" + depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y) + default n + help + This permits EROFS to use fscache-backed data blobs with on-demand + read support. + + If unsure, say N. 
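The fs/dlm hunks above remove the wholesale dlm_message_out()/dlm_message_in() and dlm_rcom_out()/dlm_rcom_in() byte-swap passes and instead keep on-the-wire header fields in little-endian form, converting with cpu_to_le32()/le16_to_cpu() only at the point where a host-order value is actually consumed (and switching on cpu_to_le32() constants so the swap is folded at compile time). Below is a minimal user-space sketch of that pattern, not the kernel code itself: it assumes glibc's <endian.h> htole32()/le32toh() helpers as stand-ins for the kernel's cpu_to_le32()/le32_to_cpu(), and the struct and field names are purely illustrative.

#define _DEFAULT_SOURCE              /* for htole32()/le32toh() in <endian.h> */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

/* illustrative wire header; the names only loosely mirror struct dlm_header */
struct wire_header {
	uint32_t h_version;          /* stored little-endian */
	uint32_t h_nodeid;           /* stored little-endian */
	uint16_t h_length;           /* stored little-endian */
};

/* fill the header directly in wire order, as the patched senders now do */
static void fill_header(struct wire_header *h, uint32_t version,
			uint32_t nodeid, uint16_t length)
{
	h->h_version = htole32(version);
	h->h_nodeid  = htole32(nodeid);
	h->h_length  = htole16(length);
}

int main(void)
{
	struct wire_header h;

	fill_header(&h, (3u << 16) | 2u, 7, sizeof(h));

	/* convert back to host order only where the value is consumed */
	printf("from node %u, length %u bytes\n",
	       (unsigned int)le32toh(h.h_nodeid),
	       (unsigned int)le16toh(h.h_length));
	return 0;
}

One advantage of keeping the stored fields in wire order, which the diff leans on, is that a buffered message can be re-sent or compared without another conversion pass; in the kernel the fields are additionally typed __le32/__le16 so sparse flags any accidental host-order access.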
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 8a3317e38e5a..99bbc597a3e9 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -5,3 +5,4 @@ erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o sysfs.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o +erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 780db1e5f4b7..252f4ee977d5 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -6,6 +6,7 @@ */ #include "internal.h" #include <linux/prefetch.h> +#include <linux/sched/mm.h> #include <linux/dax.h> #include <trace/events/erofs.h> @@ -35,14 +36,20 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode, erofs_off_t offset = blknr_to_addr(blkaddr); pgoff_t index = offset >> PAGE_SHIFT; struct page *page = buf->page; + struct folio *folio; + unsigned int nofs_flag; if (!page || page->index != index) { erofs_put_metabuf(buf); - page = read_cache_page_gfp(mapping, index, - mapping_gfp_constraint(mapping, ~__GFP_FS)); - if (IS_ERR(page)) - return page; + + nofs_flag = memalloc_nofs_save(); + folio = read_cache_folio(mapping, index, NULL, NULL); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(folio)) + return folio; + /* should already be PageUptodate, no need to lock page */ + page = folio_file_page(folio, index); buf->page = page; } if (buf->kmap_type == EROFS_NO_KMAP) { @@ -63,6 +70,10 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode, void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, erofs_blk_t blkaddr, enum erofs_kmap_type type) { + if (erofs_is_fscache_mode(sb)) + return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode, + blkaddr, type); + return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type); } @@ -110,8 +121,8 @@ static int erofs_map_blocks_flatmode(struct inode *inode, return 0; } -static int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags) +int erofs_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, int flags) { struct super_block *sb = inode->i_sb; struct erofs_inode *vi = EROFS_I(inode); @@ -199,6 +210,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) map->m_bdev = sb->s_bdev; map->m_daxdev = EROFS_SB(sb)->dax_dev; map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; + map->m_fscache = EROFS_SB(sb)->s_fscache; if (map->m_deviceid) { down_read(&devs->rwsem); @@ -210,6 +222,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) map->m_bdev = dif->bdev; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; + map->m_fscache = dif->fscache; up_read(&devs->rwsem); } else if (devs->extra_devices) { down_read(&devs->rwsem); @@ -227,6 +240,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) map->m_bdev = dif->bdev; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; + map->m_fscache = dif->fscache; break; } } @@ -385,7 +399,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (!err) return iomap_dio_rw(iocb, to, &erofs_iomap_ops, - NULL, 0, 0); + NULL, 0, NULL, 0); if (err < 0) return err; } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 3efa686c7644..6dca1900c733 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -46,8 +46,6 @@ int z_erofs_load_lz4_config(struct super_block *sb, erofs_err(sb, "too large lz4 pclusterblks %u", 
sbi->lz4.max_pclusterblks); return -EINVAL; - } else if (sbi->lz4.max_pclusterblks >= 2) { - erofs_info(sb, "EXPERIMENTAL big pcluster feature in use. Use at your own risk!"); } } else { distance = le16_to_cpu(dsb->u1.lz4_max_distance); @@ -322,6 +320,7 @@ static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; const unsigned int righthalf = min_t(unsigned int, rq->outputsize, PAGE_SIZE - rq->pageofs_out); + const unsigned int lefthalf = rq->outputsize - righthalf; unsigned char *src, *dst; if (nrpages_out > 2) { @@ -344,10 +343,10 @@ static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, if (nrpages_out == 2) { DBG_BUGON(!rq->out[1]); if (rq->out[1] == *rq->in) { - memmove(src, src + righthalf, rq->pageofs_out); + memmove(src, src + righthalf, lefthalf); } else { dst = kmap_atomic(rq->out[1]); - memcpy(dst, src + righthalf, rq->pageofs_out); + memcpy(dst, src + righthalf, lefthalf); kunmap_atomic(dst); } } diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 1238ca104f09..2b48373f690b 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -37,12 +37,9 @@ #define EROFS_SB_EXTSLOT_SIZE 16 struct erofs_deviceslot { - union { - u8 uuid[16]; /* used for device manager later */ - u8 userdata[64]; /* digest(sha256), etc. */ - } u; - __le32 blocks; /* total fs blocks of this device */ - __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ + u8 tag[64]; /* digest(sha256), etc. */ + __le32 blocks; /* total fs blocks of this device */ + __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ u8 reserved[56]; }; #define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) @@ -58,8 +55,8 @@ struct erofs_super_block { __le16 root_nid; /* nid of root directory */ __le64 inos; /* total valid ino # (== f_files - f_favail) */ - __le64 build_time; /* inode v1 time derivation */ - __le32 build_time_nsec; /* inode v1 time derivation in nano scale */ + __le64 build_time; /* compact inode time derivation */ + __le32 build_time_nsec; /* compact inode time derivation in ns scale */ __le32 blocks; /* used for statfs */ __le32 meta_blkaddr; /* start block address of metadata area */ __le32 xattr_blkaddr; /* start block address of shared xattr area */ @@ -79,15 +76,15 @@ struct erofs_super_block { /* * erofs inode datalayout (i_format in on-disk inode): - * 0 - inode plain without inline data A: + * 0 - uncompressed flat inode without tail-packing inline data: * inode, [xattrs], ... | ... | no-holed data - * 1 - inode VLE compression B (legacy): - * inode, [xattrs], extents ... | ... - * 2 - inode plain with inline data C: - * inode, [xattrs], last_inline_data, ... | ... | no-holed data - * 3 - inode compression D: + * 1 - compressed inode with non-compact indexes: + * inode, [xattrs], [map_header], extents ... | ... + * 2 - uncompressed flat inode with tail-packing inline data: + * inode, [xattrs], tailpacking data, ... | ... | no-holed data + * 3 - compressed inode with compact indexes: * inode, [xattrs], map_header, extents ... | ... - * 4 - inode chunk-based E: + * 4 - chunk-based inode with (optional) multi-device support: * inode, [xattrs], chunk indexes ... | ... 
* 5~7 - reserved */ @@ -106,7 +103,7 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode) datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY; } -/* bit definitions of inode i_advise */ +/* bit definitions of inode i_format */ #define EROFS_I_VERSION_BITS 1 #define EROFS_I_DATALAYOUT_BITS 3 @@ -140,8 +137,9 @@ struct erofs_inode_compact { __le32 i_size; __le32 i_reserved; union { - /* file total compressed blocks for data mapping 1 */ + /* total compressed blocks for compressed inodes */ __le32 compressed_blocks; + /* block address for uncompressed flat inodes */ __le32 raw_blkaddr; /* for device files, used to indicate old/new device # */ @@ -156,9 +154,9 @@ struct erofs_inode_compact { __le32 i_reserved2; }; -/* 32 bytes on-disk inode */ +/* 32-byte on-disk inode */ #define EROFS_INODE_LAYOUT_COMPACT 0 -/* 64 bytes on-disk inode */ +/* 64-byte on-disk inode */ #define EROFS_INODE_LAYOUT_EXTENDED 1 /* 64-byte complete form of an ondisk inode */ @@ -171,8 +169,9 @@ struct erofs_inode_extended { __le16 i_reserved; __le64 i_size; union { - /* file total compressed blocks for data mapping 1 */ + /* total compressed blocks for compressed inodes */ __le32 compressed_blocks; + /* block address for uncompressed flat inodes */ __le32 raw_blkaddr; /* for device files, used to indicate old/new device # */ @@ -365,17 +364,16 @@ enum { struct z_erofs_vle_decompressed_index { __le16 di_advise; - /* where to decompress in the head cluster */ + /* where to decompress in the head lcluster */ __le16 di_clusterofs; union { - /* for the head cluster */ + /* for the HEAD lclusters */ __le32 blkaddr; /* - * for the rest clusters - * eg. for 4k page-sized cluster, maximum 4K*64k = 256M) - * [0] - pointing to the head cluster - * [1] - pointing to the tail cluster + * for the NONHEAD lclusters + * [0] - distance to its HEAD lcluster + * [1] - distance to the next HEAD lcluster */ __le16 delta[2]; } di_u; diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c new file mode 100644 index 000000000000..7e4417167d0b --- /dev/null +++ b/fs/erofs/fscache.c @@ -0,0 +1,521 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022, Alibaba Cloud + */ +#include <linux/fscache.h> +#include "internal.h" + +static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping, + loff_t start, size_t len) +{ + struct netfs_io_request *rreq; + + rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL); + if (!rreq) + return ERR_PTR(-ENOMEM); + + rreq->start = start; + rreq->len = len; + rreq->mapping = mapping; + INIT_LIST_HEAD(&rreq->subrequests); + refcount_set(&rreq->ref, 1); + return rreq; +} + +static void erofs_fscache_put_request(struct netfs_io_request *rreq) +{ + if (!refcount_dec_and_test(&rreq->ref)) + return; + if (rreq->cache_resources.ops) + rreq->cache_resources.ops->end_operation(&rreq->cache_resources); + kfree(rreq); +} + +static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq) +{ + if (!refcount_dec_and_test(&subreq->ref)) + return; + erofs_fscache_put_request(subreq->rreq); + kfree(subreq); +} + +static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + + while (!list_empty(&rreq->subrequests)) { + subreq = list_first_entry(&rreq->subrequests, + struct netfs_io_subrequest, rreq_link); + list_del(&subreq->rreq_link); + erofs_fscache_put_subrequest(subreq); + } +} + +static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest 
*subreq; + struct folio *folio; + unsigned int iopos = 0; + pgoff_t start_page = rreq->start / PAGE_SIZE; + pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + bool subreq_failed = false; + + XA_STATE(xas, &rreq->mapping->i_pages, start_page); + + subreq = list_first_entry(&rreq->subrequests, + struct netfs_io_subrequest, rreq_link); + subreq_failed = (subreq->error < 0); + + rcu_read_lock(); + xas_for_each(&xas, folio, last_page) { + unsigned int pgpos = + (folio_index(folio) - start_page) * PAGE_SIZE; + unsigned int pgend = pgpos + folio_size(folio); + bool pg_failed = false; + + for (;;) { + if (!subreq) { + pg_failed = true; + break; + } + + pg_failed |= subreq_failed; + if (pgend < iopos + subreq->len) + break; + + iopos += subreq->len; + if (!list_is_last(&subreq->rreq_link, + &rreq->subrequests)) { + subreq = list_next_entry(subreq, rreq_link); + subreq_failed = (subreq->error < 0); + } else { + subreq = NULL; + subreq_failed = false; + } + if (pgend == iopos) + break; + } + + if (!pg_failed) + folio_mark_uptodate(folio); + + folio_unlock(folio); + } + rcu_read_unlock(); +} + +static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq) +{ + erofs_fscache_rreq_unlock_folios(rreq); + erofs_fscache_clear_subrequests(rreq); + erofs_fscache_put_request(rreq); +} + +static void erofc_fscache_subreq_complete(void *priv, + ssize_t transferred_or_error, bool was_async) +{ + struct netfs_io_subrequest *subreq = priv; + struct netfs_io_request *rreq = subreq->rreq; + + if (IS_ERR_VALUE(transferred_or_error)) + subreq->error = transferred_or_error; + + if (atomic_dec_and_test(&rreq->nr_outstanding)) + erofs_fscache_rreq_complete(rreq); + + erofs_fscache_put_subrequest(subreq); +} + +/* + * Read data from fscache and fill the read data into page cache described by + * @rreq, which shall be both aligned with PAGE_SIZE. @pstart describes + * the start physical address in the cache file. 
+ */ +static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, + struct netfs_io_request *rreq, loff_t pstart) +{ + enum netfs_io_source source; + struct super_block *sb = rreq->mapping->host->i_sb; + struct netfs_io_subrequest *subreq; + struct netfs_cache_resources *cres = &rreq->cache_resources; + struct iov_iter iter; + loff_t start = rreq->start; + size_t len = rreq->len; + size_t done = 0; + int ret; + + atomic_set(&rreq->nr_outstanding, 1); + + ret = fscache_begin_read_operation(cres, cookie); + if (ret) + goto out; + + while (done < len) { + subreq = kzalloc(sizeof(struct netfs_io_subrequest), + GFP_KERNEL); + if (subreq) { + INIT_LIST_HEAD(&subreq->rreq_link); + refcount_set(&subreq->ref, 2); + subreq->rreq = rreq; + refcount_inc(&rreq->ref); + } else { + ret = -ENOMEM; + goto out; + } + + subreq->start = pstart + done; + subreq->len = len - done; + subreq->flags = 1 << NETFS_SREQ_ONDEMAND; + + list_add_tail(&subreq->rreq_link, &rreq->subrequests); + + source = cres->ops->prepare_read(subreq, LLONG_MAX); + if (WARN_ON(subreq->len == 0)) + source = NETFS_INVALID_READ; + if (source != NETFS_READ_FROM_CACHE) { + erofs_err(sb, "failed to fscache prepare_read (source %d)", + source); + ret = -EIO; + subreq->error = ret; + erofs_fscache_put_subrequest(subreq); + goto out; + } + + atomic_inc(&rreq->nr_outstanding); + + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, + start + done, subreq->len); + + ret = fscache_read(cres, subreq->start, &iter, + NETFS_READ_HOLE_FAIL, + erofc_fscache_subreq_complete, subreq); + if (ret == -EIOCBQUEUED) + ret = 0; + if (ret) { + erofs_err(sb, "failed to fscache_read (ret %d)", ret); + goto out; + } + + done += subreq->len; + } +out: + if (atomic_dec_and_test(&rreq->nr_outstanding)) + erofs_fscache_rreq_complete(rreq); + + return ret; +} + +static int erofs_fscache_meta_readpage(struct file *data, struct page *page) +{ + int ret; + struct folio *folio = page_folio(page); + struct super_block *sb = folio_mapping(folio)->host->i_sb; + struct netfs_io_request *rreq; + struct erofs_map_dev mdev = { + .m_deviceid = 0, + .m_pa = folio_pos(folio), + }; + + ret = erofs_map_dev(sb, &mdev); + if (ret) + goto out; + + rreq = erofs_fscache_alloc_request(folio_mapping(folio), + folio_pos(folio), folio_size(folio)); + if (IS_ERR(rreq)) + goto out; + + return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + rreq, mdev.m_pa); +out: + folio_unlock(folio); + return ret; +} + +static int erofs_fscache_readpage_inline(struct folio *folio, + struct erofs_map_blocks *map) +{ + struct super_block *sb = folio_mapping(folio)->host->i_sb; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + erofs_blk_t blknr; + size_t offset, len; + void *src, *dst; + + /* For tail packing layout, the offset may be non-zero. 
*/ + offset = erofs_blkoff(map->m_pa); + blknr = erofs_blknr(map->m_pa); + len = map->m_llen; + + src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); + if (IS_ERR(src)) + return PTR_ERR(src); + + dst = kmap_local_folio(folio, 0); + memcpy(dst, src + offset, len); + memset(dst + len, 0, PAGE_SIZE - len); + kunmap_local(dst); + + erofs_put_metabuf(&buf); + return 0; +} + +static int erofs_fscache_readpage(struct file *file, struct page *page) +{ + struct folio *folio = page_folio(page); + struct inode *inode = folio_mapping(folio)->host; + struct super_block *sb = inode->i_sb; + struct erofs_map_blocks map; + struct erofs_map_dev mdev; + struct netfs_io_request *rreq; + erofs_off_t pos; + loff_t pstart; + int ret; + + DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ); + + pos = folio_pos(folio); + map.m_la = pos; + + ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (ret) + goto out_unlock; + + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + folio_zero_range(folio, 0, folio_size(folio)); + goto out_uptodate; + } + + if (map.m_flags & EROFS_MAP_META) { + ret = erofs_fscache_readpage_inline(folio, &map); + goto out_uptodate; + } + + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + + ret = erofs_map_dev(sb, &mdev); + if (ret) + goto out_unlock; + + + rreq = erofs_fscache_alloc_request(folio_mapping(folio), + folio_pos(folio), folio_size(folio)); + if (IS_ERR(rreq)) + goto out_unlock; + + pstart = mdev.m_pa + (pos - map.m_la); + return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + rreq, pstart); + +out_uptodate: + if (!ret) + folio_mark_uptodate(folio); +out_unlock: + folio_unlock(folio); + return ret; +} + +static void erofs_fscache_advance_folios(struct readahead_control *rac, + size_t len, bool unlock) +{ + while (len) { + struct folio *folio = readahead_folio(rac); + len -= folio_size(folio); + if (unlock) { + folio_mark_uptodate(folio); + folio_unlock(folio); + } + } +} + +static void erofs_fscache_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct super_block *sb = inode->i_sb; + size_t len, count, done = 0; + erofs_off_t pos; + loff_t start, offset; + int ret; + + if (!readahead_count(rac)) + return; + + start = readahead_pos(rac); + len = readahead_length(rac); + + do { + struct erofs_map_blocks map; + struct erofs_map_dev mdev; + struct netfs_io_request *rreq; + + pos = start + done; + map.m_la = pos; + + ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (ret) + return; + + offset = start + done; + count = min_t(size_t, map.m_llen - (pos - map.m_la), + len - done); + + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + struct iov_iter iter; + + iov_iter_xarray(&iter, READ, &rac->mapping->i_pages, + offset, count); + iov_iter_zero(count, &iter); + + erofs_fscache_advance_folios(rac, count, true); + ret = count; + continue; + } + + if (map.m_flags & EROFS_MAP_META) { + struct folio *folio = readahead_folio(rac); + + ret = erofs_fscache_readpage_inline(folio, &map); + if (!ret) { + folio_mark_uptodate(folio); + ret = folio_size(folio); + } + + folio_unlock(folio); + continue; + } + + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + ret = erofs_map_dev(sb, &mdev); + if (ret) + return; + + rreq = erofs_fscache_alloc_request(rac->mapping, offset, count); + if (IS_ERR(rreq)) + return; + /* + * Drop the ref of folios here. Unlock them in + * rreq_unlock_folios() when rreq complete. 
+ */ + erofs_fscache_advance_folios(rac, count, false); + ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + rreq, mdev.m_pa + (pos - map.m_la)); + if (!ret) + ret = count; + } while (ret > 0 && ((done += ret) < len)); +} + +static const struct address_space_operations erofs_fscache_meta_aops = { + .readpage = erofs_fscache_meta_readpage, +}; + +const struct address_space_operations erofs_fscache_access_aops = { + .readpage = erofs_fscache_readpage, + .readahead = erofs_fscache_readahead, +}; + +int erofs_fscache_register_cookie(struct super_block *sb, + struct erofs_fscache **fscache, + char *name, bool need_inode) +{ + struct fscache_volume *volume = EROFS_SB(sb)->volume; + struct erofs_fscache *ctx; + struct fscache_cookie *cookie; + int ret; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE, + name, strlen(name), NULL, 0, 0); + if (!cookie) { + erofs_err(sb, "failed to get cookie for %s", name); + ret = -EINVAL; + goto err; + } + + fscache_use_cookie(cookie, false); + ctx->cookie = cookie; + + if (need_inode) { + struct inode *const inode = new_inode(sb); + + if (!inode) { + erofs_err(sb, "failed to get anon inode for %s", name); + ret = -ENOMEM; + goto err_cookie; + } + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &erofs_fscache_meta_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + ctx->inode = inode; + } + + *fscache = ctx; + return 0; + +err_cookie: + fscache_unuse_cookie(ctx->cookie, NULL, NULL); + fscache_relinquish_cookie(ctx->cookie, false); + ctx->cookie = NULL; +err: + kfree(ctx); + return ret; +} + +void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache) +{ + struct erofs_fscache *ctx = *fscache; + + if (!ctx) + return; + + fscache_unuse_cookie(ctx->cookie, NULL, NULL); + fscache_relinquish_cookie(ctx->cookie, false); + ctx->cookie = NULL; + + iput(ctx->inode); + ctx->inode = NULL; + + kfree(ctx); + *fscache = NULL; +} + +int erofs_fscache_register_fs(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct fscache_volume *volume; + char *name; + int ret = 0; + + name = kasprintf(GFP_KERNEL, "erofs,%s", sbi->opt.fsid); + if (!name) + return -ENOMEM; + + volume = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR_OR_NULL(volume)) { + erofs_err(sb, "failed to register volume for %s", name); + ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP; + volume = NULL; + } + + sbi->volume = volume; + kfree(name); + return ret; +} + +void erofs_fscache_unregister_fs(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + + fscache_relinquish_volume(sbi->volume, NULL, false); + sbi->volume = NULL; +} diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index e8b37ba5e9ad..bcc8335b46b3 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -8,11 +8,6 @@ #include <trace/events/erofs.h> -/* - * if inode is successfully read, return its inode page (or sometimes - * the inode payload page if it's an extended inode) in order to fill - * inline data if possible. 
- */ static void *erofs_read_inode(struct erofs_buf *buf, struct inode *inode, unsigned int *ofs) { @@ -297,6 +292,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir) goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; +#ifdef CONFIG_EROFS_FS_ONDEMAND + if (erofs_is_fscache_mode(inode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; +#endif out_unlock: erofs_put_metabuf(&buf); @@ -370,7 +369,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(mnt_userns, inode, stat); return 0; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 5298c4ee277d..cfee49d33b95 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -49,6 +49,7 @@ typedef u32 erofs_blk_t; struct erofs_device_info { char *path; + struct erofs_fscache *fscache; struct block_device *bdev; struct dax_device *dax_dev; u64 dax_part_off; @@ -74,6 +75,7 @@ struct erofs_mount_opts { unsigned int max_sync_decompress_pages; #endif unsigned int mount_opt; + char *fsid; }; struct erofs_dev_context { @@ -96,6 +98,11 @@ struct erofs_sb_lz4_info { u16 max_pclusterblks; }; +struct erofs_fscache { + struct fscache_cookie *cookie; + struct inode *inode; +}; + struct erofs_sb_info { struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP @@ -146,6 +153,10 @@ struct erofs_sb_info { /* sysfs support */ struct kobject s_kobj; /* /sys/fs/erofs/<devname> */ struct completion s_kobj_unregister; + + /* fscache support */ + struct fscache_volume *volume; + struct erofs_fscache *s_fscache; }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) @@ -161,6 +172,11 @@ struct erofs_sb_info { #define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) #define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) +static inline bool erofs_is_fscache_mode(struct super_block *sb) +{ + return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev; +} + enum { EROFS_ZIP_CACHE_DISABLED, EROFS_ZIP_CACHE_READAHEAD, @@ -381,31 +397,6 @@ extern const struct super_operations erofs_sops; extern const struct address_space_operations erofs_raw_access_aops; extern const struct address_space_operations z_erofs_aops; -/* - * Logical to physical block mapping - * - * Different with other file systems, it is used for 2 access modes: - * - * 1) RAW access mode: - * - * Users pass a valid (m_lblk, m_lofs -- usually 0) pair, - * and get the valid m_pblk, m_pofs and the longest m_len(in bytes). - * - * Note that m_lblk in the RAW access mode refers to the number of - * the compressed ondisk block rather than the uncompressed - * in-memory block for the compressed file. - * - * m_pofs equals to m_lofs except for the inline data page. - * - * 2) Normal access mode: - * - * If the inode is not compressed, it has no difference with - * the RAW access mode. However, if the inode is compressed, - * users should pass a valid (m_lblk, m_lofs) pair, and get - * the needed m_pblk, m_pofs, m_len to get the compressed data - * and the updated m_lblk, m_lofs which indicates the start - * of the corresponding uncompressed data in the file. 
- */ enum { BH_Encoded = BH_PrivateStart, BH_FullMapped, @@ -467,6 +458,7 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, #endif /* !CONFIG_EROFS_FS_ZIP */ struct erofs_map_dev { + struct erofs_fscache *m_fscache; struct block_device *m_bdev; struct dax_device *m_daxdev; u64 m_dax_part_off; @@ -486,6 +478,8 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); +int erofs_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, int flags); /* inode.c */ static inline unsigned long erofs_inode_hash(erofs_nid_t nid) @@ -509,7 +503,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, /* namei.c */ extern const struct inode_operations erofs_dir_iops; -int erofs_namei(struct inode *dir, struct qstr *name, +int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, unsigned int *d_type); /* dir.c */ @@ -611,6 +605,36 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP */ +/* fscache.c */ +#ifdef CONFIG_EROFS_FS_ONDEMAND +int erofs_fscache_register_fs(struct super_block *sb); +void erofs_fscache_unregister_fs(struct super_block *sb); + +int erofs_fscache_register_cookie(struct super_block *sb, + struct erofs_fscache **fscache, + char *name, bool need_inode); +void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache); + +extern const struct address_space_operations erofs_fscache_access_aops; +#else +static inline int erofs_fscache_register_fs(struct super_block *sb) +{ + return 0; +} +static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} + +static inline int erofs_fscache_register_cookie(struct super_block *sb, + struct erofs_fscache **fscache, + char *name, bool need_inode) +{ + return -EOPNOTSUPP; +} + +static inline void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache) +{ +} +#endif + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index 554efa363317..fd75506799c4 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -165,9 +165,8 @@ out: /* free if the candidate is valid */ return candidate; } -int erofs_namei(struct inode *dir, - struct qstr *name, - erofs_nid_t *nid, unsigned int *d_type) +int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, + unsigned int *d_type) { int ndirents; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 0c4b41130c2f..c6f5fa4ab244 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -13,6 +13,7 @@ #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/dax.h> +#include <linux/exportfs.h> #include "xattr.h" #define CREATE_TRACE_POINTS @@ -219,7 +220,52 @@ static int erofs_load_compr_cfgs(struct super_block *sb, } #endif -static int erofs_init_devices(struct super_block *sb, +static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, + struct erofs_device_info *dif, erofs_off_t *pos) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_deviceslot *dis; + struct block_device *bdev; + void *ptr; + int ret; + + ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + dis = ptr + erofs_blkoff(*pos); + + if (!dif->path) { + if (!dis->tag[0]) { + 
erofs_err(sb, "empty device tag @ pos %llu", *pos); + return -EINVAL; + } + dif->path = kmemdup_nul(dis->tag, sizeof(dis->tag), GFP_KERNEL); + if (!dif->path) + return -ENOMEM; + } + + if (erofs_is_fscache_mode(sb)) { + ret = erofs_fscache_register_cookie(sb, &dif->fscache, + dif->path, false); + if (ret) + return ret; + } else { + bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, + sb->s_type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + dif->bdev = bdev; + dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off); + } + + dif->blocks = le32_to_cpu(dis->blocks); + dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); + sbi->total_blocks += dif->blocks; + *pos += EROFS_DEVT_SLOT_SIZE; + return 0; +} + +static int erofs_scan_devices(struct super_block *sb, struct erofs_super_block *dsb) { struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -227,8 +273,6 @@ static int erofs_init_devices(struct super_block *sb, erofs_off_t pos; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_device_info *dif; - struct erofs_deviceslot *dis; - void *ptr; int id, err = 0; sbi->total_blocks = sbi->primarydevice_blocks; @@ -237,7 +281,8 @@ static int erofs_init_devices(struct super_block *sb, else ondisk_extradevs = le16_to_cpu(dsb->extra_devices); - if (ondisk_extradevs != sbi->devs->extra_devices) { + if (sbi->devs->extra_devices && + ondisk_extradevs != sbi->devs->extra_devices) { erofs_err(sb, "extra devices don't match (ondisk %u, given %u)", ondisk_extradevs, sbi->devs->extra_devices); return -EINVAL; @@ -248,30 +293,31 @@ static int erofs_init_devices(struct super_block *sb, sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; down_read(&sbi->devs->rwsem); - idr_for_each_entry(&sbi->devs->tree, dif, id) { - struct block_device *bdev; - - ptr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), - EROFS_KMAP); - if (IS_ERR(ptr)) { - err = PTR_ERR(ptr); - break; + if (sbi->devs->extra_devices) { + idr_for_each_entry(&sbi->devs->tree, dif, id) { + err = erofs_init_device(&buf, sb, dif, &pos); + if (err) + break; } - dis = ptr + erofs_blkoff(pos); - - bdev = blkdev_get_by_path(dif->path, - FMODE_READ | FMODE_EXCL, - sb->s_type); - if (IS_ERR(bdev)) { - err = PTR_ERR(bdev); - break; + } else { + for (id = 0; id < ondisk_extradevs; id++) { + dif = kzalloc(sizeof(*dif), GFP_KERNEL); + if (!dif) { + err = -ENOMEM; + break; + } + + err = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL); + if (err < 0) { + kfree(dif); + break; + } + ++sbi->devs->extra_devices; + + err = erofs_init_device(&buf, sb, dif, &pos); + if (err) + break; } - dif->bdev = bdev; - dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off); - dif->blocks = le32_to_cpu(dis->blocks); - dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); - sbi->total_blocks += dif->blocks; - pos += EROFS_DEVT_SLOT_SIZE; } up_read(&sbi->devs->rwsem); erofs_put_metabuf(&buf); @@ -358,10 +404,12 @@ static int erofs_read_superblock(struct super_block *sb) goto out; /* handle multiple devices */ - ret = erofs_init_devices(sb, dsb); + ret = erofs_scan_devices(sb, dsb); if (erofs_sb_has_ztailpacking(sbi)) erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); + if (erofs_is_fscache_mode(sb)) + erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. 
Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; @@ -390,6 +438,7 @@ enum { Opt_dax, Opt_dax_enum, Opt_device, + Opt_fsid, Opt_err }; @@ -414,6 +463,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), fsparam_string("device", Opt_device), + fsparam_string("fsid", Opt_fsid), {} }; @@ -509,6 +559,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, } ++ctx->devs->extra_devices; break; + case Opt_fsid: +#ifdef CONFIG_EROFS_FS_ONDEMAND + kfree(ctx->opt.fsid); + ctx->opt.fsid = kstrdup(param->string, GFP_KERNEL); + if (!ctx->opt.fsid) + return -ENOMEM; +#else + errorfc(fc, "fsid option not supported"); +#endif + break; default: return -ENOPARAM; } @@ -577,6 +637,44 @@ static int erofs_init_managed_cache(struct super_block *sb) static int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif +static struct inode *erofs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + return erofs_iget(sb, ino, false); +} + +static struct dentry *erofs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + erofs_nfs_get_inode); +} + +static struct dentry *erofs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + erofs_nfs_get_inode); +} + +static struct dentry *erofs_get_parent(struct dentry *child) +{ + erofs_nid_t nid; + unsigned int d_type; + int err; + + err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type); + if (err) + return ERR_PTR(err); + return d_obtain_alias(erofs_iget(child->d_sb, nid, d_type == FT_DIR)); +} + +static const struct export_operations erofs_export_ops = { + .fh_to_dentry = erofs_fh_to_dentry, + .fh_to_parent = erofs_fh_to_parent, + .get_parent = erofs_get_parent, +}; + static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; @@ -585,11 +683,9 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) int err; sb->s_magic = EROFS_SUPER_MAGIC; - - if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) { - erofs_err(sb, "failed to set erofs blksize"); - return -EINVAL; - } + sb->s_flags |= SB_RDONLY | SB_NOATIME; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_op = &erofs_sops; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -597,10 +693,36 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_fs_info = sbi; sbi->opt = ctx->opt; - sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dax_part_off); + ctx->opt.fsid = NULL; sbi->devs = ctx->devs; ctx->devs = NULL; + if (erofs_is_fscache_mode(sb)) { + sb->s_blocksize = EROFS_BLKSIZ; + sb->s_blocksize_bits = LOG_BLOCK_SIZE; + + err = erofs_fscache_register_fs(sb); + if (err) + return err; + + err = erofs_fscache_register_cookie(sb, &sbi->s_fscache, + sbi->opt.fsid, true); + if (err) + return err; + + err = super_setup_bdi(sb); + if (err) + return err; + } else { + if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) { + erofs_err(sb, "failed to set erofs blksize"); + return -EINVAL; + } + + sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, + &sbi->dax_part_off); + } + err = erofs_read_superblock(sb); if (err) return err; @@ -613,12 +735,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) clear_opt(&sbi->opt, DAX_ALWAYS); } } - sb->s_flags |= SB_RDONLY | SB_NOATIME; - sb->s_maxbytes = 
MAX_LFS_FILESIZE; - sb->s_time_gran = 1; - sb->s_op = &erofs_sops; + sb->s_time_gran = 1; sb->s_xattr = erofs_xattr_handlers; + sb->s_export_op = &erofs_export_ops; if (test_opt(&sbi->opt, POSIX_ACL)) sb->s_flags |= SB_POSIXACL; @@ -661,6 +781,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) static int erofs_fc_get_tree(struct fs_context *fc) { + struct erofs_fs_context *ctx = fc->fs_private; + + if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->opt.fsid) + return get_tree_nodev(fc, erofs_fc_fill_super); + return get_tree_bdev(fc, erofs_fc_fill_super); } @@ -690,6 +815,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data) fs_put_dax(dif->dax_dev); if (dif->bdev) blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); + erofs_fscache_unregister_cookie(&dif->fscache); kfree(dif->path); kfree(dif); return 0; @@ -709,6 +835,7 @@ static void erofs_fc_free(struct fs_context *fc) struct erofs_fs_context *ctx = fc->fs_private; erofs_free_dev_context(ctx->devs); + kfree(ctx->opt.fsid); kfree(ctx); } @@ -749,7 +876,10 @@ static void erofs_kill_sb(struct super_block *sb) WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); - kill_block_super(sb); + if (erofs_is_fscache_mode(sb)) + generic_shutdown_super(sb); + else + kill_block_super(sb); sbi = EROFS_SB(sb); if (!sbi) @@ -757,6 +887,9 @@ static void erofs_kill_sb(struct super_block *sb) erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev); + erofs_fscache_unregister_cookie(&sbi->s_fscache); + erofs_fscache_unregister_fs(sb); + kfree(sbi->opt.fsid); kfree(sbi); sb->s_fs_info = NULL; } @@ -774,6 +907,7 @@ static void erofs_put_super(struct super_block *sb) iput(sbi->managed_cache); sbi->managed_cache = NULL; #endif + erofs_fscache_unregister_cookie(&sbi->s_fscache); } static struct file_system_type erofs_fs_type = { @@ -781,7 +915,7 @@ static struct file_system_type erofs_fs_type = { .name = "erofs", .init_fs_context = erofs_init_fs_context, .kill_sb = erofs_kill_sb, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("erofs"); @@ -857,7 +991,10 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + u64 id = 0; + + if (!erofs_is_fscache_mode(sb)) + id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; buf->f_bsize = EROFS_BLKSIZ; @@ -902,6 +1039,10 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); +#ifdef CONFIG_EROFS_FS_ONDEMAND + if (opt->fsid) + seq_printf(seq, ",fsid=%s", opt->fsid); +#endif return 0; } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index f3babf1e6608..c1383e508bbe 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -205,8 +205,8 @@ int erofs_register_sysfs(struct super_block *sb) sbi->s_kobj.kset = &erofs_root; init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, - "%s", sb->s_id); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", + erofs_is_fscache_mode(sb) ? 
sbi->opt.fsid : sb->s_id); if (err) goto put_sb_kobj; return 0; diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index 03f142307174..9f42f25fab92 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -148,7 +148,9 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync) struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); - WARN_ON(clu < EXFAT_FIRST_CLUSTER); + if (!is_valid_cluster(sbi, clu)) + return -EINVAL; + ent_idx = CLUSTER_TO_BITMAP_ENT(clu); i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); @@ -166,7 +168,9 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_mount_options *opts = &sbi->options; - WARN_ON(clu < EXFAT_FIRST_CLUSTER); + if (!is_valid_cluster(sbi, clu)) + return; + ent_idx = CLUSTER_TO_BITMAP_ENT(clu); i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index c6800b880920..4a7a2308eb72 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -203,6 +203,7 @@ struct exfat_mount_options { /* on error: continue, panic, remount-ro */ enum exfat_error_mode errors; unsigned utf8:1, /* Use of UTF-8 character set */ + sys_tz:1, /* Use local timezone */ discard:1, /* Issue discard requests on deletions */ keep_last_dots:1; /* Keep trailing periods in paths */ int time_offset; /* Offset of timestamps from UTC (in minutes) */ @@ -381,6 +382,12 @@ static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi, EXFAT_RESERVED_CLUSTERS; } +static inline bool is_valid_cluster(struct exfat_sb_info *sbi, + unsigned int clus) +{ + return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters; +} + /* super.c */ int exfat_set_volume_dirty(struct super_block *sb); int exfat_clear_volume_dirty(struct super_block *sb); diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index a3464e56a7e1..9de6a6b844c9 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <asm/unaligned.h> #include <linux/buffer_head.h> +#include <linux/blkdev.h> #include "exfat_raw.h" #include "exfat_fs.h" @@ -81,12 +82,6 @@ int exfat_ent_set(struct super_block *sb, unsigned int loc, return 0; } -static inline bool is_valid_cluster(struct exfat_sb_info *sbi, - unsigned int clus) -{ - return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters; -} - int exfat_ent_get(struct super_block *sb, unsigned int loc, unsigned int *content) { @@ -274,10 +269,9 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu) { struct super_block *sb = dir->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct buffer_head *bhs[MAX_BUF_PER_PAGE]; - int nr_bhs = MAX_BUF_PER_PAGE; + struct buffer_head *bh; sector_t blknr, last_blknr; - int err, i, n; + int i; blknr = exfat_cluster_to_sector(sbi, clu); last_blknr = blknr + sbi->sect_per_clus; @@ -291,30 +285,23 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu) } /* Zeroing the unused blocks on this cluster */ - while (blknr < last_blknr) { - for (n = 0; n < nr_bhs && blknr < last_blknr; n++, blknr++) { - bhs[n] = sb_getblk(sb, blknr); - if (!bhs[n]) { - err = -ENOMEM; - goto release_bhs; - } - memset(bhs[n]->b_data, 0, sb->s_blocksize); - } - - err = exfat_update_bhs(bhs, n, IS_DIRSYNC(dir)); - if (err) - goto release_bhs; + for (i = blknr; i < last_blknr; i++) { + bh = sb_getblk(sb, i); + if (!bh) + return -ENOMEM; - for (i = 0; i < n; i++) - 
brelse(bhs[i]); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + brelse(bh); } - return 0; -release_bhs: - exfat_err(sb, "failed zeroed sect %llu\n", (unsigned long long)blknr); - for (i = 0; i < n; i++) - bforget(bhs[i]); - return err; + if (IS_DIRSYNC(dir)) + return sync_blockdev_range(sb->s_bdev, + EXFAT_BLK_TO_B(blknr, sb), + EXFAT_BLK_TO_B(last_blknr, sb) - 1); + + return 0; } int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 2f5130059236..20d4e47f57ab 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -351,21 +351,20 @@ out: static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg) { - struct request_queue *q = bdev_get_queue(inode->i_sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(inode->i_sb->s_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; range.minlen = max_t(unsigned int, range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(inode->i_sb->s_bdev)); ret = exfat_trim_fs(inode, &range); if (ret < 0) diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c index d5bd8e6d9741..9380e0188b55 100644 --- a/fs/exfat/misc.c +++ b/fs/exfat/misc.c @@ -74,6 +74,13 @@ static void exfat_adjust_tz(struct timespec64 *ts, u8 tz_off) ts->tv_sec += TIMEZONE_SEC(0x80 - tz_off); } +static inline int exfat_tz_offset(struct exfat_sb_info *sbi) +{ + if (sbi->options.sys_tz) + return -sys_tz.tz_minuteswest; + return sbi->options.time_offset; +} + /* Convert a EXFAT time/date pair to a UNIX date (seconds since 1 1 70). */ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 tz, __le16 time, __le16 date, u8 time_cs) @@ -96,8 +103,7 @@ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, /* Adjust timezone to UTC0. */ exfat_adjust_tz(ts, tz & ~EXFAT_TZ_VALID); else - /* Convert from local time to UTC using time_offset. */ - ts->tv_sec -= sbi->options.time_offset * SECS_PER_MIN; + ts->tv_sec -= exfat_tz_offset(sbi) * SECS_PER_MIN; } /* Convert linear UNIX date to a EXFAT time/date pair. */ diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index a02a04a993bf..76acc3721951 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -1080,6 +1080,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, exfat_remove_entries(inode, p_dir, oldentry, 0, num_old_entries); + ei->dir = *p_dir; ei->entry = newentry; } else { if (exfat_get_entry_type(epold) == TYPE_FILE) { @@ -1167,28 +1168,6 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, return 0; } -static void exfat_update_parent_info(struct exfat_inode_info *ei, - struct inode *parent_inode) -{ - struct exfat_sb_info *sbi = EXFAT_SB(parent_inode->i_sb); - struct exfat_inode_info *parent_ei = EXFAT_I(parent_inode); - loff_t parent_isize = i_size_read(parent_inode); - - /* - * the problem that struct exfat_inode_info caches wrong parent info. - * - * because of flag-mismatch of ei->dir, - * there is abnormal traversing cluster chain. 
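/*
 * Illustrative sketch, not part of the patch: what the new exfat "sys_tz"
 * option changes in fs/exfat/misc.c above.  For on-disk timestamps that
 * carry no valid timezone field, the local->UTC correction now comes from
 * the kernel's sys_tz (minutes west of UTC) instead of only the fixed
 * time_offset= mount option.  SKETCH_SECS_PER_MIN stands in for
 * SECS_PER_MIN.
 */
#define SKETCH_SECS_PER_MIN 60

struct sketch_exfat_opts {
        int sys_tz;          /* "sys_tz" mount option given */
        int time_offset;     /* "time_offset=" value, offset from UTC in minutes */
};

static long long sketch_entry_time_to_utc(long long local_secs,
                                          const struct sketch_exfat_opts *opts,
                                          int tz_minuteswest)
{
        int offset = opts->sys_tz ? -tz_minuteswest : opts->time_offset;

        /* undo the local-time bias stored by Windows-style exFAT drivers */
        return local_secs - (long long)offset * SKETCH_SECS_PER_MIN;
}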
- */ - if (unlikely(parent_ei->flags != ei->dir.flags || - parent_isize != EXFAT_CLU_TO_B(ei->dir.size, sbi) || - parent_ei->start_clu != ei->dir.dir)) { - exfat_chain_set(&ei->dir, parent_ei->start_clu, - EXFAT_B_TO_CLU_ROUND_UP(parent_isize, sbi), - parent_ei->flags); - } -} - /* rename or move a old file into a new file */ static int __exfat_rename(struct inode *old_parent_inode, struct exfat_inode_info *ei, struct inode *new_parent_inode, @@ -1219,8 +1198,6 @@ static int __exfat_rename(struct inode *old_parent_inode, return -ENOENT; } - exfat_update_parent_info(ei, old_parent_inode); - exfat_chain_dup(&olddir, &ei->dir); dentry = ei->entry; @@ -1241,8 +1218,6 @@ static int __exfat_rename(struct inode *old_parent_inode, goto out; } - exfat_update_parent_info(new_ei, new_parent_inode); - p_dir = &(new_ei->dir); new_entry = new_ei->entry; ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh); diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 8ca21e7917d1..6a4dfe9f31ee 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -170,7 +170,9 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",discard"); if (opts->keep_last_dots) seq_puts(m, ",keep_last_dots"); - if (opts->time_offset) + if (opts->sys_tz) + seq_puts(m, ",sys_tz"); + else if (opts->time_offset) seq_printf(m, ",time_offset=%d", opts->time_offset); return 0; } @@ -214,6 +216,7 @@ enum { Opt_errors, Opt_discard, Opt_keep_last_dots, + Opt_sys_tz, Opt_time_offset, /* Deprecated options */ @@ -241,6 +244,7 @@ static const struct fs_parameter_spec exfat_parameters[] = { fsparam_enum("errors", Opt_errors, exfat_param_enums), fsparam_flag("discard", Opt_discard), fsparam_flag("keep_last_dots", Opt_keep_last_dots), + fsparam_flag("sys_tz", Opt_sys_tz), fsparam_s32("time_offset", Opt_time_offset), __fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated, NULL), @@ -298,6 +302,9 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_keep_last_dots: opts->keep_last_dots = 1; break; + case Opt_sys_tz: + opts->sys_tz = 1; + break; case Opt_time_offset: /* * Make the limit 24 just in case someone invents something @@ -627,13 +634,9 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) if (opts->allow_utime == (unsigned short)-1) opts->allow_utime = ~opts->fs_dmask & 0022; - if (opts->discard) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - - if (!blk_queue_discard(q)) { - exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard"); - opts->discard = 0; - } + if (opts->discard && !bdev_max_discard_sectors(sb->s_bdev)) { + exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard"); + opts->discard = 0; } sb->s_flags |= SB_NODIRATIME; diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 7d89142e1421..72206a292676 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -17,3 +17,4 @@ ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o ext4-inode-test-objs += inode-test.o obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o ext4-$(CONFIG_FS_VERITY) += verity.o +ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c new file mode 100644 index 000000000000..e20ac0654b3f --- /dev/null +++ b/fs/ext4/crypto.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/quotaops.h> +#include <linux/uuid.h> + +#include "ext4.h" +#include "xattr.h" +#include "ext4_jbd2.h" + +static void ext4_fname_from_fscrypt_name(struct ext4_filename 
*dst, + const struct fscrypt_name *src) +{ + memset(dst, 0, sizeof(*dst)); + + dst->usr_fname = src->usr_fname; + dst->disk_name = src->disk_name; + dst->hinfo.hash = src->hash; + dst->hinfo.minor_hash = src->minor_hash; + dst->crypto_buf = src->crypto_buf; +} + +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_setup_filename(dir, iname, lookup, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + +#if IS_ENABLED(CONFIG_UNICODE) + err = ext4_fname_setup_ci_filename(dir, iname, fname); +#endif + return err; +} + +int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_prepare_lookup(dir, dentry, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + +#if IS_ENABLED(CONFIG_UNICODE) + err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); +#endif + return err; +} + +void ext4_fname_free_filename(struct ext4_filename *fname) +{ + struct fscrypt_name name; + + name.crypto_buf = fname->crypto_buf; + fscrypt_free_filename(&name); + + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; + +#if IS_ENABLED(CONFIG_UNICODE) + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +#endif +} + +static bool uuid_is_zero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return false; + return true; +} + +int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err, err2; + handle_t *handle; + + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + + if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { + err = mnt_want_write_file(filp); + if (err) + return err; + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto pwsalt_err_exit; + } + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); + if (err) + goto pwsalt_err_journal; + lock_buffer(sbi->s_sbh); + generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); +pwsalt_err_journal: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; +pwsalt_err_exit: + mnt_drop_write_file(filp); + if (err) + return err; + } + + if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16)) + return -EFAULT; + return 0; +} + +static int ext4_get_context(struct inode *inode, void *ctx, size_t len) +{ + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); +} + +static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + handle_t *handle = fs_data; + int res, res2, credits, retries = 0; + + /* + * Encrypting the root directory is not allowed because e2fsck expects + * lost+found to exist and be unencrypted, and encrypting the root + * directory would imply encrypting the lost+found directory as well as + * the filename "lost+found" itself. 
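/*
 * Illustrative sketch, not part of the patch: the lazy-salt behaviour that
 * ext4_ioctl_get_encryption_pwsalt() (moved above into crypto.c) gives to
 * FS_IOC_GET_ENCRYPTION_PWSALT -- generate the 16-byte salt the first time
 * it is asked for, then hand the same value back forever.  The kernel does
 * the generation with generate_random_uuid() under a journal handle;
 * getrandom() stands in for that here.
 */
#include <stdbool.h>
#include <string.h>
#include <sys/random.h>

static bool sketch_salt_is_zero(const unsigned char s[16])
{
        static const unsigned char zero[16];

        return memcmp(s, zero, 16) == 0;        /* mirrors uuid_is_zero() */
}

static int sketch_get_pwsalt(unsigned char salt[16], unsigned char out[16])
{
        if (sketch_salt_is_zero(salt) &&
            getrandom(salt, 16, 0) != 16)       /* kernel: journalled superblock update */
                return -1;

        memcpy(out, salt, 16);                  /* kernel: copy_to_user() */
        return 0;
}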
+ */ + if (inode->i_ino == EXT4_ROOT_INO) + return -EPERM; + + if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) + return -EINVAL; + + if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) + return -EOPNOTSUPP; + + res = ext4_convert_inline_data(inode); + if (res) + return res; + + /* + * If a journal handle was specified, then the encryption context is + * being set on a new inode via inheritance and is part of a larger + * transaction to create the inode. Otherwise the encryption context is + * being set on an existing inode in its own transaction. Only in the + * latter case should the "retry on ENOSPC" logic be used. + */ + + if (handle) { + res = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + /* + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled + */ + ext4_set_inode_flags(inode, false); + } + return res; + } + + res = dquot_initialize(inode); + if (res) + return res; +retry: + res = ext4_xattr_set_credits(inode, len, false /* is_create */, + &credits); + if (res) + return res; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + /* + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled + */ + ext4_set_inode_flags(inode, false); + res = ext4_mark_inode_dirty(handle, inode); + if (res) + EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); + } + res2 = ext4_journal_stop(handle); + + if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (!res) + res = res2; + return res; +} + +static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) +{ + return EXT4_SB(sb)->s_dummy_enc_policy.policy; +} + +static bool ext4_has_stable_inodes(struct super_block *sb) +{ + return ext4_has_feature_stable_inodes(sb); +} + +static void ext4_get_ino_and_lblk_bits(struct super_block *sb, + int *ino_bits_ret, int *lblk_bits_ret) +{ + *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count); + *lblk_bits_ret = 8 * sizeof(ext4_lblk_t); +} + +const struct fscrypt_operations ext4_cryptops = { + .key_prefix = "ext4:", + .get_context = ext4_get_context, + .set_context = ext4_set_context, + .get_dummy_policy = ext4_get_dummy_policy, + .empty_dir = ext4_empty_dir, + .has_stable_inodes = ext4_has_stable_inodes, + .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, +}; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index a6bb86f52b9a..3985f8c33f95 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -412,7 +412,7 @@ struct fname { }; /* - * This functoin implements a non-recursive way of freeing all of the + * This function implements a non-recursive way of freeing all of the * nodes in the red-black tree. */ static void free_rb_tree_fname(struct rb_root *root) @@ -515,7 +515,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, /* * This is a helper function for ext4_dx_readdir. It calls filldir - * for all entres on the fname linked list. (Normally there is only + * for all entries on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) 
*/ static int call_filldir(struct file *file, struct dir_context *ctx, @@ -648,7 +648,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, unsigned int offset = 0; char *top; - de = (struct ext4_dir_entry_2 *)buf; + de = buf; top = buf + buf_size; while ((char *) de < top) { if (ext4_check_dir_entry(dir, NULL, de, bh, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a743b1e3b89e..d5cea9c2e2a2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -673,6 +673,8 @@ enum { /* Caller will submit data before dropping transaction handle. This * allows jbd2 to avoid submitting data before commit. */ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + /* Caller is in the atomic contex, find extent if it has been cached */ +#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 /* * The bit position of these flags must not overlap with any of the @@ -1440,12 +1442,6 @@ struct ext4_super_block { #ifdef __KERNEL__ -#ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL) -#else -#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) -#endif - /* Number of quota types we support */ #define EXT4_MAXQUOTAS 3 @@ -2731,74 +2727,20 @@ extern int ext4_fname_setup_ci_filename(struct inode *dir, struct ext4_filename *fname); #endif +/* ext4 encryption related stuff goes here crypto.c */ #ifdef CONFIG_FS_ENCRYPTION -static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst, - const struct fscrypt_name *src) -{ - memset(dst, 0, sizeof(*dst)); - - dst->usr_fname = src->usr_fname; - dst->disk_name = src->disk_name; - dst->hinfo.hash = src->hash; - dst->hinfo.minor_hash = src->minor_hash; - dst->crypto_buf = src->crypto_buf; -} - -static inline int ext4_fname_setup_filename(struct inode *dir, - const struct qstr *iname, - int lookup, - struct ext4_filename *fname) -{ - struct fscrypt_name name; - int err; - - err = fscrypt_setup_filename(dir, iname, lookup, &name); - if (err) - return err; - - ext4_fname_from_fscrypt_name(fname, &name); - -#if IS_ENABLED(CONFIG_UNICODE) - err = ext4_fname_setup_ci_filename(dir, iname, fname); -#endif - return err; -} +extern const struct fscrypt_operations ext4_cryptops; -static inline int ext4_fname_prepare_lookup(struct inode *dir, - struct dentry *dentry, - struct ext4_filename *fname) -{ - struct fscrypt_name name; - int err; +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname); - err = fscrypt_prepare_lookup(dir, dentry, &name); - if (err) - return err; +int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct ext4_filename *fname); - ext4_fname_from_fscrypt_name(fname, &name); +void ext4_fname_free_filename(struct ext4_filename *fname); -#if IS_ENABLED(CONFIG_UNICODE) - err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); -#endif - return err; -} - -static inline void ext4_fname_free_filename(struct ext4_filename *fname) -{ - struct fscrypt_name name; +int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg); - name.crypto_buf = fname->crypto_buf; - fscrypt_free_filename(&name); - - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; - -#if IS_ENABLED(CONFIG_UNICODE) - kfree(fname->cf_name.name); - fname->cf_name.name = NULL; -#endif -} #else /* !CONFIG_FS_ENCRYPTION */ static inline int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, @@ -2831,6 +2773,12 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) 
fname->cf_name.name = NULL; #endif } + +static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, + void __user *arg) +{ + return -EOPNOTSUPP; +} #endif /* !CONFIG_FS_ENCRYPTION */ /* dir.c */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e473fde6b64b..c148bb97b527 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -372,7 +372,7 @@ static int ext4_valid_extent_entries(struct inode *inode, { unsigned short entries; ext4_lblk_t lblock = 0; - ext4_lblk_t prev = 0; + ext4_lblk_t cur = 0; if (eh->eh_entries == 0) return 1; @@ -396,11 +396,11 @@ static int ext4_valid_extent_entries(struct inode *inode, /* Check for overlapping extents */ lblock = le32_to_cpu(ext->ee_block); - if ((lblock <= prev) && prev) { + if (lblock < cur) { *pblk = ext4_ext_pblock(ext); return 0; } - prev = lblock + ext4_ext_get_actual_len(ext) - 1; + cur = lblock + ext4_ext_get_actual_len(ext); ext++; entries--; } @@ -420,13 +420,13 @@ static int ext4_valid_extent_entries(struct inode *inode, /* Check for overlapping index extents */ lblock = le32_to_cpu(ext_idx->ei_block); - if ((lblock <= prev) && prev) { + if (lblock < cur) { *pblk = ext4_idx_pblock(ext_idx); return 0; } ext_idx++; entries--; - prev = lblock; + cur = lblock + 1; } } return 1; @@ -4693,15 +4693,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; + inode_lock(inode); + ret = ext4_convert_inline_data(inode); + inode_unlock(inode); + if (ret) + goto exit; + if (mode & FALLOC_FL_PUNCH_HOLE) { ret = ext4_punch_hole(file, offset, len); goto exit; } - ret = ext4_convert_inline_data(inode); - if (ret) - goto exit; - if (mode & FALLOC_FL_COLLAPSE_RANGE) { ret = ext4_collapse_range(file, offset, len); goto exit; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 3d72565ec6e8..795a60ad1897 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -970,7 +970,7 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) /* Submit data for all the fast commit inodes */ static int ext4_fc_submit_inode_data_all(journal_t *journal) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; @@ -1004,7 +1004,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) /* Wait for completion of data for all the fast commit inodes */ static int ext4_fc_wait_inode_data_all(journal_t *journal) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *pos, *n; int ret = 0; @@ -1031,7 +1031,7 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) __acquires(&sbi->s_fc_lock) __releases(&sbi->s_fc_lock) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; @@ -1093,7 +1093,7 @@ lock_and_exit: static int ext4_fc_perform_commit(journal_t *journal) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; @@ -1198,7 +1198,7 @@ static void ext4_fc_update_stats(struct super_block *sb, int status, */ int ext4_fc_commit(journal_t 
*journal, tid_t commit_tid) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); @@ -1659,8 +1659,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, set_nlink(inode, 1); ext4_mark_inode_dirty(NULL, inode); out: - if (inode) - iput(inode); + iput(inode); return ret; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6feb07e3e1eb..109d07629f81 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -76,7 +76,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) return generic_file_read_iter(iocb, to); } - ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0); + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); @@ -565,7 +565,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0, - 0); + NULL, 0); if (ret == -ENOTBLK) ret = 0; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 9c076262770d..513762c087a9 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1083,14 +1083,14 @@ static void ext4_update_final_de(void *de_buf, int old_size, int new_size) void *limit; int de_len; - de = (struct ext4_dir_entry_2 *)de_buf; + de = de_buf; if (old_size) { limit = de_buf + old_size; do { prev_de = de; de_len = ext4_rec_len_from_disk(de->rec_len, old_size); de_buf += de_len; - de = (struct ext4_dir_entry_2 *)de_buf; + de = de_buf; } while (de_buf < limit); prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - @@ -1155,7 +1155,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, * First create "." and ".." and then copy the dir information * back to the block. */ - de = (struct ext4_dir_entry_2 *)target; + de = target; de = ext4_init_dot_dotdot(inode, de, inode->i_sb->s_blocksize, csum_size, le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); @@ -2005,6 +2005,18 @@ int ext4_convert_inline_data(struct inode *inode) if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; + } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + /* + * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is + * cleared. This means we are in the middle of moving of + * inline data to delay allocated block. Just force writeout + * here to finish conversion. 
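/*
 * Illustrative sketch, not part of the patch: the overlap check that
 * ext4_valid_extent_entries() switches to in the fs/ext4/extents.c hunk
 * further up.  Instead of remembering the last block of the previous
 * extent ("prev", which never triggered for extents touching block 0), it
 * tracks "cur", the first logical block the next extent may start at.
 */
struct sketch_extent { unsigned int ee_block, ee_len; };

static int sketch_extents_valid(const struct sketch_extent *ext, unsigned int nr)
{
        unsigned int cur = 0;   /* next extent must start at or after this block */
        unsigned int i;

        for (i = 0; i < nr; i++) {
                if (ext[i].ee_block < cur)
                        return 0;               /* overlaps the previous extent */
                cur = ext[i].ee_block + ext[i].ee_len;
        }
        return 1;
}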
+ */ + error = filemap_flush(inode->i_mapping); + if (error) + return error; + if (!ext4_has_inline_data(inode)) + return 0; } needed_blocks = ext4_writepage_trans_blocks(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 646ece9b3455..7555cbe77148 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -41,7 +41,6 @@ #include <linux/bitops.h> #include <linux/iomap.h> #include <linux/iversion.h> -#include <linux/dax.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -199,8 +198,7 @@ void ext4_evict_inode(struct inode *inode) */ if (inode->i_ino != EXT4_JOURNAL_INO && ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && - inode->i_data.nrpages) { + S_ISREG(inode->i_mode) && inode->i_data.nrpages) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; @@ -545,12 +543,21 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, } else { BUG(); } + + if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) + return retval; #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif goto found; } + /* + * In the query cache no-wait mode, nothing we can do more if we + * cannot find extent in the cache. + */ + if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) + return 0; /* * Try to see if we can get the block without requesting a new @@ -837,10 +844,12 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, struct ext4_map_blocks map; struct buffer_head *bh; int create = map_flags & EXT4_GET_BLOCKS_CREATE; + bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; int err; ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || handle != NULL || create == 0); + ASSERT(create == 0 || !nowait); map.m_lblk = block; map.m_len = 1; @@ -851,6 +860,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, if (err < 0) return ERR_PTR(err); + if (nowait) + return sb_find_get_block(inode->i_sb, map.m_pblk); + bh = sb_getblk(inode->i_sb, map.m_pblk); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); @@ -2944,8 +2956,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_SHIFT; - if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) || - ext4_verity_in_progress(inode)) { + if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); @@ -3967,15 +3978,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length, 0); - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - if (ext4_has_inline_data(inode)) { - filemap_invalidate_lock(mapping); - ret = ext4_convert_inline_data(inode); - filemap_invalidate_unlock(mapping); - if (ret) - return ret; - } - /* * Write out all dirty pages to avoid race conditions * Then release them. 
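/*
 * Illustrative sketch, not part of the patch: the contract of the new
 * EXT4_GET_BLOCKS_CACHED_NOWAIT flag used in the fs/ext4/inode.c hunks
 * above.  In nowait mode the mapping may only be answered from the
 * in-memory extent status cache; on a miss the caller gets 0 mapped blocks
 * instead of any blocking lookup, and ext4_getblk() likewise falls back to
 * sb_find_get_block(), which returns NULL rather than reading the block.
 */
struct sketch_map { unsigned int lblk, len, pblk; };

/* cache_lookup() stands in for the extent status cache probe; it must not sleep */
static int sketch_map_blocks_nowait(struct sketch_map *map,
                                    int (*cache_lookup)(struct sketch_map *map))
{
        if (cache_lookup(map))
                return map->len;        /* cached extent: answer immediately */
        return 0;                       /* miss: no disk I/O in atomic context */
}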
@@ -4991,7 +4993,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; - ext4_set_aops(inode); } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; @@ -4999,9 +5000,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, sizeof(ei->i_data) - 1); } else { inode->i_op = &ext4_symlink_inode_operations; - ext4_set_aops(inode); } - inode_nohighmem(inode); } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; @@ -5398,6 +5397,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; loff_t oldsize = inode->i_size; + loff_t old_disksize; int shrink = (attr->ia_size < inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { @@ -5469,6 +5469,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode->i_sb->s_blocksize_bits); down_write(&EXT4_I(inode)->i_data_sem); + old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); if (!error) @@ -5480,6 +5481,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, */ if (!error) i_size_write(inode, attr->ia_size); + else + EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ba44fa1be70a..cb01c1da0f9d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -16,7 +16,6 @@ #include <linux/file.h> #include <linux/quotaops.h> #include <linux/random.h> -#include <linux/uuid.h> #include <linux/uaccess.h> #include <linux/delay.h> #include <linux/iversion.h> @@ -504,18 +503,6 @@ journal_err_out: return err; } -#ifdef CONFIG_FS_ENCRYPTION -static int uuid_is_zero(__u8 u[16]) -{ - int i; - - for (i = 0; i < 16; i++) - if (u[i]) - return 0; - return 1; -} -#endif - /* * If immutable is set and we are not clearing it, we're not allowed to change * anything else in the inode. 
Don't error out if we're only trying to set @@ -1044,7 +1031,6 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) __u32 flags = 0; unsigned int flush_flags = 0; struct super_block *sb = file_inode(filp)->i_sb; - struct request_queue *q; if (copy_from_user(&flags, (__u32 __user *)arg, sizeof(__u32))) @@ -1065,10 +1051,8 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) return -EINVAL; - q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev); - if (!q) - return -ENXIO; - if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev)) return -EOPNOTSUPP; if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) @@ -1393,14 +1377,13 @@ resizefs_out: case FITRIM: { - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; /* @@ -1432,51 +1415,9 @@ resizefs_out: return -EOPNOTSUPP; return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); - case FS_IOC_GET_ENCRYPTION_PWSALT: { -#ifdef CONFIG_FS_ENCRYPTION - int err, err2; - struct ext4_sb_info *sbi = EXT4_SB(sb); - handle_t *handle; + case FS_IOC_GET_ENCRYPTION_PWSALT: + return ext4_ioctl_get_encryption_pwsalt(filp, (void __user *)arg); - if (!ext4_has_feature_encrypt(sb)) - return -EOPNOTSUPP; - if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { - err = mnt_want_write_file(filp); - if (err) - return err; - handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto pwsalt_err_exit; - } - err = ext4_journal_get_write_access(handle, sb, - sbi->s_sbh, - EXT4_JTR_NONE); - if (err) - goto pwsalt_err_journal; - lock_buffer(sbi->s_sbh); - generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); - ext4_superblock_csum_set(sb); - unlock_buffer(sbi->s_sbh); - err = ext4_handle_dirty_metadata(handle, NULL, - sbi->s_sbh); - pwsalt_err_journal: - err2 = ext4_journal_stop(handle); - if (err2 && !err) - err = err2; - pwsalt_err_exit: - mnt_drop_write_file(filp); - if (err) - return err; - } - if (copy_to_user((void __user *) arg, - sbi->s_es->s_encrypt_pw_salt, 16)) - return -EFAULT; - return 0; -#else - return -EOPNOTSUPP; -#endif - } case FS_IOC_GET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 252c168454c7..9f12f29bc346 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -695,13 +695,10 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, for (i = 0; i < max; i++) { if (mb_test_bit(i, buddy)) { - /* only single bit in buddy2 may be 1 */ + /* only single bit in buddy2 may be 0 */ if (!mb_test_bit(i << 1, buddy2)) { MB_CHECK_ASSERT( mb_test_bit((i<<1)+1, buddy2)); - } else if (!mb_test_bit((i << 1) + 1, buddy2)) { - MB_CHECK_ASSERT( - mb_test_bit(i << 1, buddy2)); } continue; } @@ -2919,7 +2916,7 @@ const struct seq_operations ext4_mb_seq_groups_ops = { int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) { - struct super_block *sb = (struct super_block *)seq->private; + struct super_block *sb = seq->private; struct ext4_sb_info *sbi = EXT4_SB(sb); seq_puts(seq, "mballoc:\n"); @@ -3498,7 +3495,7 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&lg->lg_prealloc_lock); } - if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) + if 
(bdev_nonrot(sb->s_bdev)) sbi->s_mb_max_linear_groups = 0; else sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; @@ -3629,7 +3626,7 @@ static inline int ext4_issue_discard(struct super_block *sb, return __blkdev_issue_discard(sb->s_bdev, (sector_t)discard_block << (sb->s_blocksize_bits - 9), (sector_t)count << (sb->s_blocksize_bits - 9), - GFP_NOFS, 0, biop); + GFP_NOFS, biop); } else return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } @@ -6398,6 +6395,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count + * @set_trimmed: set the trimmed flag if at least one block is trimmed * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy @@ -6407,7 +6405,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks) + ext4_grpblk_t minblocks, bool set_trimmed) { struct ext4_buddy e4b; int ret; @@ -6426,7 +6424,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || minblocks < EXT4_SB(sb)->s_last_trim_minblks) { ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); - if (ret >= 0) + if (ret >= 0 && set_trimmed) EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); } else { ret = 0; @@ -6455,7 +6453,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; @@ -6463,6 +6461,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); + bool whole_group, eof = false; int ret = 0; start = range->start >> sb->s_blocksize_bits; @@ -6475,14 +6474,16 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) range->len < sb->s_blocksize) return -EINVAL; /* No point to try to trim less than discard granularity */ - if (range->minlen < q->limits.discard_granularity) { + if (range->minlen < discard_granularity) { minlen = EXT4_NUM_B2C(EXT4_SB(sb), - q->limits.discard_granularity >> sb->s_blocksize_bits); + discard_granularity >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } - if (end >= max_blks) + if (end >= max_blks - 1) { end = max_blks - 1; + eof = true; + } if (end <= first_data_blk) goto out; if (start < first_data_blk) @@ -6496,6 +6497,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; + whole_group = true; for (group = first_group; group <= last_group; group++) { grp = ext4_get_group_info(sb, group); @@ -6512,12 +6514,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ - if (group == last_group) + if (group == last_group) { end = last_cluster; - + whole_group = eof ? 
true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; + } if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - end, minlen); + end, minlen, whole_group); if (cnt < 0) { ret = cnt; break; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index cebea4270817..79d05e464c43 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -127,7 +127,7 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, */ static int kmmpd(void *data) { - struct super_block *sb = (struct super_block *) data; + struct super_block *sb = data; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh; struct mmp_struct *mmp; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 767b4bfe39c3..47d0ca4c795b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -277,9 +277,9 @@ static struct dx_frame *dx_probe(struct ext4_filename *fname, struct dx_hash_info *hinfo, struct dx_frame *frame); static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, - unsigned blocksize, struct dx_hash_info *hinfo, - struct dx_map_entry map[]); +static int dx_make_map(struct inode *dir, struct buffer_head *bh, + struct dx_hash_info *hinfo, + struct dx_map_entry *map_tail); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, char *to, struct dx_map_entry *offsets, @@ -777,12 +777,14 @@ static struct dx_frame * dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in) { - unsigned count, indirect; + unsigned count, indirect, level, i; struct dx_entry *at, *entries, *p, *q, *m; struct dx_root *root; struct dx_frame *frame = frame_in; struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; + ext4_lblk_t block; + ext4_lblk_t blocks[EXT4_HTREE_LEVEL]; memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); frame->bh = ext4_read_dirblock(dir, 0, INDEX); @@ -854,6 +856,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, } dxtrace(printk("Look up %x", hash)); + level = 0; + blocks[0] = 0; while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { @@ -882,15 +886,27 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, dx_get_block(at))); frame->entries = entries; frame->at = at; - if (!indirect--) + + block = dx_get_block(at); + for (i = 0; i <= level; i++) { + if (blocks[i] == block) { + ext4_warning_inode(dir, + "dx entry: tree cycle block %u points back to block %u", + blocks[level], block); + goto fail; + } + } + if (++level > indirect) return frame; + blocks[level] = block; frame++; - frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + frame->bh = ext4_read_dirblock(dir, block, INDEX); if (IS_ERR(frame->bh)) { ret_err = (struct dx_frame *) frame->bh; frame->bh = NULL; goto fail; } + entries = ((struct dx_node *) frame->bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit(dir)) { @@ -1249,15 +1265,23 @@ static inline int search_dirblock(struct buffer_head *bh, * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. 
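/*
 * Illustrative sketch, not part of the patch: the loop-detection idea added
 * to dx_probe() above.  The htree descent is bounded by EXT4_HTREE_LEVEL,
 * so remembering the block visited at each level and comparing the next
 * block against all of them is enough to reject a corrupted index whose
 * pointers form a cycle.  SKETCH_HTREE_LEVEL stands in for EXT4_HTREE_LEVEL.
 */
#define SKETCH_HTREE_LEVEL 3

static int sketch_descend_is_cycle(const unsigned int visited[SKETCH_HTREE_LEVEL],
                                   int level, unsigned int next_block)
{
        int i;

        for (i = 0; i <= level; i++)
                if (visited[i] == next_block)
                        return 1;       /* tree cycle: corrupted index */
        return 0;
}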
*/ -static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, - unsigned blocksize, struct dx_hash_info *hinfo, +static int dx_make_map(struct inode *dir, struct buffer_head *bh, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; - char *base = (char *) de; + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data; + unsigned int buflen = bh->b_size; + char *base = bh->b_data; struct dx_hash_info h = *hinfo; - while ((char *) de < base + blocksize) { + if (ext4_has_metadata_csum(dir->i_sb)) + buflen -= sizeof(struct ext4_dir_entry_tail); + + while ((char *) de < base + buflen) { + if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen, + ((char *)de) - base)) + return -EFSCORRUPTED; if (de->name_len && de->inode) { if (ext4_hash_in_dirent(dir)) h.hash = EXT4_DIRENT_HASH(de); @@ -1270,8 +1294,7 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, count++; cond_resched(); } - /* XXX: do we need to check rec_len == 0 case? -Chris */ - de = ext4_next_entry(de, blocksize); + de = ext4_next_entry(de, dir->i_sb->s_blocksize); } return count; } @@ -1943,8 +1966,11 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); - count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1, - blocksize, hinfo, map); + count = dx_make_map(dir, *bh, hinfo, map); + if (count < 0) { + err = count; + goto journal_error; + } map -= count; dx_sort_map(map, count); /* Ensure that neither split block is over half full */ @@ -2031,7 +2057,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, unsigned int offset = 0; char *top; - de = (struct ext4_dir_entry_2 *)buf; + de = buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, @@ -2587,7 +2613,7 @@ int ext4_generic_delete_entry(struct inode *dir, i = 0; pde = NULL; - de = (struct ext4_dir_entry_2 *)entry_buf; + de = entry_buf; while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, entry_buf, buf_size, i)) @@ -3249,6 +3275,32 @@ out_trace: return retval; } +static int ext4_init_symlink_block(handle_t *handle, struct inode *inode, + struct fscrypt_str *disk_link) +{ + struct buffer_head *bh; + char *kaddr; + int err = 0; + + bh = ext4_bread(handle, inode, 0, EXT4_GET_BLOCKS_CREATE); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); + if (err) + goto out; + + kaddr = (char *)bh->b_data; + memcpy(kaddr, disk_link->name, disk_link->len); + inode->i_size = disk_link->len - 1; + EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_handle_dirty_metadata(handle, inode, bh); +out: + brelse(bh); + return err; +} + static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, const char *symname) { @@ -3257,6 +3309,7 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, int err, len = strlen(symname); int credits; struct fscrypt_str disk_link; + int retries = 0; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) return -EIO; @@ -3270,26 +3323,15 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - if ((disk_link.len > EXT4_N_BLOCKS * 4)) { - /* - * For non-fast symlinks, we just allocate inode and put it on - * orphan list in the first transaction => we need bitmap, - * 
group descriptor, sb, inode block, quota blocks, and - * possibly selinux xattr blocks. - */ - credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + - EXT4_XATTR_TRANS_BLOCKS; - } else { - /* - * Fast symlink. We have to add entry to directory - * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS), - * allocate new inode (bitmap, group descriptor, inode block, - * quota blocks, sb is already counted in previous macros). - */ - credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; - } - + /* + * EXT4_INDEX_EXTRA_TRANS_BLOCKS for addition of entry into the + * directory. +3 for inode, inode bitmap, group descriptor allocation. + * EXT4_DATA_TRANS_BLOCKS for the data block allocation and + * modification. + */ + credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; +retry: inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -3297,7 +3339,8 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) { if (handle) ext4_journal_stop(handle); - return PTR_ERR(inode); + err = PTR_ERR(inode); + goto out_retry; } if (IS_ENCRYPTED(inode)) { @@ -3305,75 +3348,44 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto err_drop_inode; inode->i_op = &ext4_encrypted_symlink_inode_operations; + } else { + if ((disk_link.len > EXT4_N_BLOCKS * 4)) { + inode->i_op = &ext4_symlink_inode_operations; + } else { + inode->i_op = &ext4_fast_symlink_inode_operations; + inode->i_link = (char *)&EXT4_I(inode)->i_data; + } } if ((disk_link.len > EXT4_N_BLOCKS * 4)) { - if (!IS_ENCRYPTED(inode)) - inode->i_op = &ext4_symlink_inode_operations; - inode_nohighmem(inode); - ext4_set_aops(inode); - /* - * We cannot call page_symlink() with transaction started - * because it calls into ext4_write_begin() which can wait - * for transaction commit if we are running out of space - * and thus we deadlock. So we have to stop transaction now - * and restart it when symlink contents is written. - * - * To keep fs consistent in case of crash, we have to put inode - * to orphan list in the mean time. 
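The reworked ext4_symlink() above no longer juggles the orphan list across two transactions; it writes the symlink block inside the running handle and, as the out_retry label further down in this hunk shows, simply restarts the whole operation when the allocator reports ENOSPC. A generic sketch of that bounded retry pattern, where try_once() and the retry limit are invented stand-ins rather than ext4 code:

#include <errno.h>
#include <stdio.h>

#define MAX_RETRIES 3

/* Stand-in for the fallible operation; pretend the first two tries hit ENOSPC. */
static int try_once(int attempt)
{
    return attempt < 2 ? -ENOSPC : 0;
}

/* Loosely mimics the shape of ext4_should_retry_alloc(): retry only for
 * ENOSPC and only a bounded number of times. */
static int should_retry(int err, int *retries)
{
    return err == -ENOSPC && (*retries)++ < MAX_RETRIES;
}

static int create_symlink(void)
{
    int retries = 0, attempt = 0, err;

retry:
    err = try_once(attempt++);
    if (err && should_retry(err, &retries))
        goto retry;
    return err;
}

int main(void)
{
    printf("create_symlink() -> %d\n", create_symlink());
    return 0;
}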
- */ - drop_nlink(inode); - err = ext4_orphan_add(handle, inode); - if (handle) - ext4_journal_stop(handle); - handle = NULL; - if (err) - goto err_drop_inode; - err = __page_symlink(inode, disk_link.name, disk_link.len, 1); - if (err) - goto err_drop_inode; - /* - * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS - * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified - */ - handle = ext4_journal_start(dir, EXT4_HT_DIR, - EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - handle = NULL; - goto err_drop_inode; - } - set_nlink(inode, 1); - err = ext4_orphan_del(handle, inode); + /* alloc symlink block and fill it */ + err = ext4_init_symlink_block(handle, inode, &disk_link); if (err) goto err_drop_inode; } else { /* clear the extent format for fast symlink */ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); - if (!IS_ENCRYPTED(inode)) { - inode->i_op = &ext4_fast_symlink_inode_operations; - inode->i_link = (char *)&EXT4_I(inode)->i_data; - } memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name, disk_link.len); inode->i_size = disk_link.len - 1; + EXT4_I(inode)->i_disksize = inode->i_size; } - EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_add_nondir(handle, dentry, &inode); if (handle) ext4_journal_stop(handle); - if (inode) - iput(inode); - goto out_free_encrypted_link; + iput(inode); + goto out_retry; err_drop_inode: - if (handle) - ext4_journal_stop(handle); clear_nlink(inode); + ext4_orphan_add(handle, inode); unlock_new_inode(inode); + if (handle) + ext4_journal_stop(handle); iput(inode); -out_free_encrypted_link: +out_retry: + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; if (disk_link.name != (unsigned char *)symname) kfree(disk_link.name); return err; @@ -3455,6 +3467,9 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, struct buffer_head *bh; if (!ext4_has_inline_data(inode)) { + struct ext4_dir_entry_2 *de; + unsigned int offset; + /* The first directory block must not be a hole, so * treat it as DIRENT_HTREE */ @@ -3463,9 +3478,30 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, *retval = PTR_ERR(bh); return NULL; } - *parent_de = ext4_next_entry( - (struct ext4_dir_entry_2 *)bh->b_data, - inode->i_sb->s_blocksize); + + de = (struct ext4_dir_entry_2 *) bh->b_data; + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, + bh->b_size, 0) || + le32_to_cpu(de->inode) != inode->i_ino || + strcmp(".", de->name)) { + EXT4_ERROR_INODE(inode, "directory missing '.'"); + brelse(bh); + *retval = -EFSCORRUPTED; + return NULL; + } + offset = ext4_rec_len_from_disk(de->rec_len, + inode->i_sb->s_blocksize); + de = ext4_next_entry(de, inode->i_sb->s_blocksize); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, + bh->b_size, offset) || + le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + EXT4_ERROR_INODE(inode, "directory missing '..'"); + brelse(bh); + *retval = -EFSCORRUPTED; + return NULL; + } + *parent_de = de; + return bh; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1466fbdbc8e3..450c918d68fc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1211,6 +1211,9 @@ static void ext4_put_super(struct super_block *sb) */ ext4_unregister_sysfs(sb); + if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) + ext4_msg(sb, KERN_INFO, "unmounting filesystem."); + ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); @@ -1397,7 +1400,7 @@ static void ext4_destroy_inode(struct 
inode *inode) static void init_once(void *foo) { - struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; + struct ext4_inode_info *ei = foo; INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); @@ -1492,128 +1495,6 @@ static int ext4_nfs_commit_metadata(struct inode *inode) return ext4_write_inode(inode, &wbc); } -#ifdef CONFIG_FS_ENCRYPTION -static int ext4_get_context(struct inode *inode, void *ctx, size_t len) -{ - return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); -} - -static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, - void *fs_data) -{ - handle_t *handle = fs_data; - int res, res2, credits, retries = 0; - - /* - * Encrypting the root directory is not allowed because e2fsck expects - * lost+found to exist and be unencrypted, and encrypting the root - * directory would imply encrypting the lost+found directory as well as - * the filename "lost+found" itself. - */ - if (inode->i_ino == EXT4_ROOT_INO) - return -EPERM; - - if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) - return -EINVAL; - - if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) - return -EOPNOTSUPP; - - res = ext4_convert_inline_data(inode); - if (res) - return res; - - /* - * If a journal handle was specified, then the encryption context is - * being set on a new inode via inheritance and is part of a larger - * transaction to create the inode. Otherwise the encryption context is - * being set on an existing inode in its own transaction. Only in the - * latter case should the "retry on ENOSPC" logic be used. - */ - - if (handle) { - res = ext4_xattr_set_handle(handle, inode, - EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - ctx, len, 0); - if (!res) { - ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - ext4_clear_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA); - /* - * Update inode->i_flags - S_ENCRYPTED will be enabled, - * S_DAX may be disabled - */ - ext4_set_inode_flags(inode, false); - } - return res; - } - - res = dquot_initialize(inode); - if (res) - return res; -retry: - res = ext4_xattr_set_credits(inode, len, false /* is_create */, - &credits); - if (res) - return res; - - handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - ctx, len, 0); - if (!res) { - ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - /* - * Update inode->i_flags - S_ENCRYPTED will be enabled, - * S_DAX may be disabled - */ - ext4_set_inode_flags(inode, false); - res = ext4_mark_inode_dirty(handle, inode); - if (res) - EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); - } - res2 = ext4_journal_stop(handle); - - if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - if (!res) - res = res2; - return res; -} - -static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) -{ - return EXT4_SB(sb)->s_dummy_enc_policy.policy; -} - -static bool ext4_has_stable_inodes(struct super_block *sb) -{ - return ext4_has_feature_stable_inodes(sb); -} - -static void ext4_get_ino_and_lblk_bits(struct super_block *sb, - int *ino_bits_ret, int *lblk_bits_ret) -{ - *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count); - *lblk_bits_ret = 8 * sizeof(ext4_lblk_t); -} - -static const struct fscrypt_operations ext4_cryptops = { - .key_prefix = "ext4:", - .get_context = ext4_get_context, - .set_context = 
ext4_set_context, - .get_dummy_policy = ext4_get_dummy_policy, - .empty_dir = ext4_empty_dir, - .has_stable_inodes = ext4_has_stable_inodes, - .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, -}; -#endif - #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) @@ -1867,7 +1748,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { }; #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) -#define DEFAULT_MB_OPTIMIZE_SCAN (-1) static const char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" @@ -1913,6 +1793,7 @@ static const struct mount_opts { MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, + {Opt_commit, 0, MOPT_NO_EXT2}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, @@ -2427,11 +2308,12 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size, GFP_KERNEL); + return 0; #else ext4_msg(NULL, KERN_WARNING, - "Test dummy encryption mount option ignored"); + "test_dummy_encryption option not supported"); + return -EINVAL; #endif - return 0; case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX @@ -2625,8 +2507,10 @@ parse_failed: ret = ext4_apply_options(fc, sb); out_free: - kfree(s_ctx); - kfree(fc); + if (fc) { + ext4_fc_free(fc); + kfree(fc); + } kfree(s_mount_opts); return ret; } @@ -2786,12 +2670,44 @@ err_jquota_specified: #endif } +static int ext4_check_test_dummy_encryption(const struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_FS_ENCRYPTION + const struct ext4_fs_context *ctx = fc->fs_private; + const struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!(ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION)) + return 0; + + if (!ext4_has_feature_encrypt(sb)) { + ext4_msg(NULL, KERN_WARNING, + "test_dummy_encryption requires encrypt feature"); + return -EINVAL; + } + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + !sbi->s_dummy_enc_policy.policy) { + ext4_msg(NULL, KERN_WARNING, + "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } +#endif /* CONFIG_FS_ENCRYPTION */ + return 0; +} + static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; + int err; if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { ext4_msg(NULL, KERN_ERR, @@ -2821,20 +2737,9 @@ static int ext4_check_opt_consistency(struct fs_context *fc, "for blocksize < PAGE_SIZE"); } -#ifdef CONFIG_FS_ENCRYPTION - /* - * This mount option is just for testing, and it's not worthwhile to - * implement the extra complexity (e.g. RCU protection) that would be - * needed to allow it to be set or changed during remount. We do allow - * it to be specified during remount, but only if there is no change. 
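The new ext4_check_test_dummy_encryption() helper above centralises two constraints: the option requires the encrypt feature, and on remount it may only be repeated, never newly enabled. A compact sketch of that shape, using invented feature/state flags rather than the real ext4 superblock fields:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fs_state {
    bool has_encrypt_feature;   /* stand-in for ext4_has_feature_encrypt() */
    bool dummy_policy_active;   /* stand-in for sbi->s_dummy_enc_policy.policy */
};

struct mount_ctx {
    bool wants_dummy_encryption;
    bool is_remount;
};

static int check_test_dummy_encryption(const struct mount_ctx *ctx,
                                       const struct fs_state *fs)
{
    if (!ctx->wants_dummy_encryption)
        return 0;
    if (!fs->has_encrypt_feature)
        return -EINVAL;                 /* option requires the feature */
    if (ctx->is_remount && !fs->dummy_policy_active)
        return -EINVAL;                 /* cannot be turned on by remount */
    return 0;
}

int main(void)
{
    struct fs_state fs = { .has_encrypt_feature = true, .dummy_policy_active = false };
    struct mount_ctx remount = { .wants_dummy_encryption = true, .is_remount = true };

    printf("remount with new dummy policy -> %d\n",
           check_test_dummy_encryption(&remount, &fs));
    return 0;
}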
- */ - if ((ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION) && - is_remount && !sbi->s_dummy_enc_policy.policy) { - ext4_msg(NULL, KERN_WARNING, - "Can't set test_dummy_encryption on remount"); - return -1; - } -#endif + err = ext4_check_test_dummy_encryption(fc, sb); + if (err) + return err; if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) { if (!sbi->s_journal) { @@ -3837,7 +3742,7 @@ static struct task_struct *ext4_lazyinit_task; */ static int ext4_lazyinit_thread(void *arg) { - struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; + struct ext4_lazy_init *eli = arg; struct list_head *pos, *n; struct ext4_li_request *elr; unsigned long next_wakeup, cur; @@ -4409,7 +4314,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) int silent = fc->sb_flags & SB_SILENT; /* Set defaults for the variables that will be set during parsing */ - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) + ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = @@ -4886,7 +4792,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_inodes_per_block; sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); sbi->s_sbh = bh; - sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); @@ -5279,12 +5185,6 @@ no_journal: goto failed_mount_wq; } - if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) && - !ext4_has_feature_encrypt(sb)) { - ext4_set_feature_encrypt(sb); - ext4_commit_super(sb); - } - /* * Get the # of file system overhead blocks from the * superblock if present. 
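Both __ext4_fill_super() and __ext4_remount() now consult EXT4_SPEC_JOURNAL_IOPRIO before installing a default, so an explicitly parsed journal_ioprio is no longer clobbered. The idea, sketched with an invented spec bitmap and default value:

#include <stdio.h>

#define SPEC_JOURNAL_IOPRIO    (1u << 0)   /* "option was supplied explicitly" */
#define DEFAULT_JOURNAL_IOPRIO 3

struct parse_ctx {
    unsigned int spec;          /* bitmap of explicitly supplied options */
    int journal_ioprio;
};

/* Record an explicit option and remember that it was supplied. */
static void parse_journal_ioprio(struct parse_ctx *ctx, int value)
{
    ctx->journal_ioprio = value;
    ctx->spec |= SPEC_JOURNAL_IOPRIO;
}

/* Apply the default only when the user did not ask for something else. */
static void apply_defaults(struct parse_ctx *ctx)
{
    if (!(ctx->spec & SPEC_JOURNAL_IOPRIO))
        ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
}

int main(void)
{
    struct parse_ctx ctx = { 0 };

    parse_journal_ioprio(&ctx, 7);
    apply_defaults(&ctx);
    printf("journal_ioprio = %d\n", ctx.journal_ioprio);   /* stays 7 */
    return 0;
}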
@@ -5474,13 +5374,9 @@ no_journal: goto failed_mount9; } - if (test_opt(sb, DISCARD)) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) - ext4_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } + if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) + ext4_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but the device does not support discard"); if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -6276,7 +6172,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) char *to_free[EXT4_MAXQUOTAS]; #endif - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; /* Store the original options */ old_sb_flags = sb->s_flags; @@ -6302,9 +6197,14 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } else old_opts.s_qf_names[i] = NULL; #endif - if (sbi->s_journal && sbi->s_journal->j_task->io_context) - ctx->journal_ioprio = - sbi->s_journal->j_task->io_context->ioprio; + if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) { + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + ctx->journal_ioprio = + sbi->s_journal->j_task->io_context->ioprio; + else + ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + + } ext4_apply_options(fc, sb); @@ -6445,7 +6345,8 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) if (err) goto restore_opts; } - sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_mount_state = (le16_to_cpu(es->s_state) & + ~EXT4_FC_REPLAY); err = ext4_setup_super(sb, es, 0); if (err) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 69109746e6e2..d281f5bcc526 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -27,7 +27,7 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *cpage = NULL; + struct buffer_head *bh = NULL; const void *caddr; unsigned int max_size; const char *paddr; @@ -39,16 +39,19 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, caddr = EXT4_I(inode)->i_data; max_size = sizeof(EXT4_I(inode)->i_data); } else { - cpage = read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(cpage)) - return ERR_CAST(cpage); - caddr = page_address(cpage); + bh = ext4_bread(NULL, inode, 0, 0); + if (IS_ERR(bh)) + return ERR_CAST(bh); + if (!bh) { + EXT4_ERROR_INODE(inode, "bad symlink."); + return ERR_PTR(-EFSCORRUPTED); + } + caddr = bh->b_data; max_size = inode->i_sb->s_blocksize; } paddr = fscrypt_get_symlink(inode, caddr, max_size, done); - if (cpage) - put_page(cpage); + brelse(bh); return paddr; } @@ -62,6 +65,38 @@ static int ext4_encrypted_symlink_getattr(struct user_namespace *mnt_userns, return fscrypt_symlink_getattr(path, stat); } +static void ext4_free_link(void *bh) +{ + brelse(bh); +} + +static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + struct buffer_head *bh; + + if (!dentry) { + bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); + if (IS_ERR(bh)) + return ERR_CAST(bh); + if (!bh || !ext4_buffer_uptodate(bh)) + return ERR_PTR(-ECHILD); + } else { + bh = ext4_bread(NULL, inode, 0, 0); + if (IS_ERR(bh)) + return ERR_CAST(bh); + if (!bh) { + EXT4_ERROR_INODE(inode, "bad symlink."); + return ERR_PTR(-EFSCORRUPTED); + } + } + + set_delayed_call(callback, ext4_free_link, bh); + nd_terminate_link(bh->b_data, inode->i_size, + inode->i_sb->s_blocksize - 1); + return 
bh->b_data; +} + const struct inode_operations ext4_encrypted_symlink_inode_operations = { .get_link = ext4_encrypted_get_link, .setattr = ext4_setattr, @@ -70,7 +105,7 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = { }; const struct inode_operations ext4_symlink_inode_operations = { - .get_link = page_get_link, + .get_link = ext4_get_link, .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8c570de21ed5..2b2b3c87e45e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4372,8 +4372,7 @@ static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) static inline bool f2fs_bdev_support_discard(struct block_device *bdev) { - return blk_queue_discard(bdev_get_queue(bdev)) || - bdev_is_zoned(bdev); + return bdev_max_discard_sectors(bdev) || bdev_is_zoned(bdev); } static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5b89af0f27f0..100637b1adb3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2285,7 +2285,6 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret; @@ -2304,7 +2303,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) return ret; range.minlen = max((unsigned int)range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(sb->s_bdev)); ret = f2fs_trim_fs(F2FS_SB(sb), &range); mnt_drop_write_file(filp); if (ret < 0) @@ -3686,18 +3685,18 @@ out: static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, pgoff_t off, block_t block, block_t len, u32 flags) { - struct request_queue *q = bdev_get_queue(bdev); sector_t sector = SECTOR_FROM_BLOCK(block); sector_t nr_sects = SECTOR_FROM_BLOCK(len); int ret = 0; - if (!q) - return -ENXIO; - - if (flags & F2FS_TRIM_FILE_DISCARD) - ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS, - blk_queue_secure_erase(q) ? 
- BLKDEV_DISCARD_SECURE : 0); + if (flags & F2FS_TRIM_FILE_DISCARD) { + if (bdev_max_secure_erase_sectors(bdev)) + ret = blkdev_issue_secure_erase(bdev, sector, nr_sects, + GFP_NOFS); + else + ret = blkdev_issue_discard(bdev, sector, nr_sects, + GFP_NOFS); + } if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) { if (IS_ENCRYPTED(inode)) @@ -4309,7 +4308,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) */ inc_page_count(sbi, F2FS_DIO_READ); dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops, - &f2fs_iomap_dio_read_ops, 0, 0); + &f2fs_iomap_dio_read_ops, 0, NULL, 0); if (IS_ERR_OR_NULL(dio)) { ret = PTR_ERR_OR_ZERO(dio); if (ret != -EIOCBQUEUED) @@ -4527,7 +4526,7 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, if (pos + count > inode->i_size) dio_flags |= IOMAP_DIO_FORCE_WAIT; dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops, - &f2fs_iomap_dio_write_ops, dio_flags, 0); + &f2fs_iomap_dio_write_ops, dio_flags, NULL, 0); if (IS_ERR_OR_NULL(dio)) { ret = PTR_ERR_OR_ZERO(dio); if (ret == -ENOTBLK) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bd9731cdec56..7225ce09f3ab 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1196,9 +1196,8 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, unsigned int *issued) { struct block_device *bdev = dc->bdev; - struct request_queue *q = bdev_get_queue(bdev); unsigned int max_discard_blocks = - SECTOR_TO_BLOCK(q->limits.max_discard_sectors); + SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); @@ -1245,7 +1244,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), - GFP_NOFS, 0, &bio); + GFP_NOFS, &bio); submit: if (err) { spin_lock_irqsave(&dc->lock, flags); @@ -1375,9 +1374,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct discard_cmd *dc; struct discard_info di = {0}; struct rb_node **insert_p = NULL, *insert_parent = NULL; - struct request_queue *q = bdev_get_queue(bdev); unsigned int max_discard_blocks = - SECTOR_TO_BLOCK(q->limits.max_discard_sectors); + SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); block_t end = lstart + len; dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, diff --git a/fs/fat/file.c b/fs/fat/file.c index a5a309fcc7fa..bf91f977debe 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -127,13 +127,12 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg) struct super_block *sb = inode->i_sb; struct fstrim_range __user *user_range; struct fstrim_range range; - struct request_queue *q = bdev_get_queue(sb->s_bdev); int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; user_range = (struct fstrim_range __user *)arg; @@ -141,7 +140,7 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg) return -EFAULT; range.minlen = max_t(unsigned int, range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(sb->s_bdev)); err = fat_trim_fs(inode, &range); if (err < 0) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index bf6051bdf1d1..3d1afb95a925 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1872,13 +1872,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, goto out_fail; } - if (sbi->options.discard) { - 
struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) - fat_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } + if (sbi->options.discard && !bdev_max_discard_sectors(sb->s_bdev)) + fat_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but the device does not support discard"); fat_set_state(sb, 1, 0); return 0; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 591fe9cf1659..a1074a26e784 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1712,6 +1712,10 @@ static int writeback_single_inode(struct inode *inode, */ if (!(inode->i_state & I_DIRTY_ALL)) inode_cgwb_move_to_attached(inode, wb); + else if (!(inode->i_state & I_SYNC_QUEUED) && + (inode->i_state & I_DIRTY)) + redirty_tail_locked(inode, wb); + spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -1775,11 +1779,12 @@ static long writeback_sb_inodes(struct super_block *sb, }; unsigned long start_time = jiffies; long write_chunk; - long wrote = 0; /* count both pages and inodes */ + long total_wrote = 0; /* count both pages and inodes */ while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct bdi_writeback *tmp_wb; + long wrote; if (inode->i_sb != sb) { if (work->sb) { @@ -1855,7 +1860,9 @@ static long writeback_sb_inodes(struct super_block *sb, wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; - wrote += write_chunk - wbc.nr_to_write; + wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; + wrote = wrote < 0 ? 0 : wrote; + total_wrote += wrote; if (need_resched()) { /* @@ -1877,7 +1884,7 @@ static long writeback_sb_inodes(struct super_block *sb, tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) - wrote++; + total_wrote++; requeue_inode(inode, tmp_wb, &wbc); inode_sync_complete(inode); spin_unlock(&inode->i_lock); @@ -1891,14 +1898,14 @@ static long writeback_sb_inodes(struct super_block *sb, * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. */ - if (wrote) { + if (total_wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } - return wrote; + return total_wrote; } static long __writeback_inodes_wb(struct bdi_writeback *wb, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 39080b2d6cf8..b6697333bb2b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1153,13 +1153,12 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (length != written && (iomap->flags & IOMAP_F_NEW)) { /* Deallocate blocks that were just allocated. */ - loff_t blockmask = i_blocksize(inode) - 1; - loff_t end = (pos + length) & ~blockmask; + loff_t hstart = round_up(pos + written, i_blocksize(inode)); + loff_t hend = iomap->offset + iomap->length; - pos = (pos + written + blockmask) & ~blockmask; - if (pos < end) { - truncate_pagecache_range(inode, pos, end - 1); - punch_hole(ip, pos, end - pos); + if (hstart < hend) { + truncate_pagecache_range(inode, hstart, hend - 1); + punch_hole(ip, hstart, hend - hstart); } } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 48f01323c37c..2cceb193dcd8 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -770,30 +770,27 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, return ret ? 
ret : ret1; } -static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, +static inline bool should_fault_in_pages(struct iov_iter *i, + struct kiocb *iocb, size_t *prev_count, size_t *window_size) { size_t count = iov_iter_count(i); size_t size, offs; - if (likely(!count)) - return false; - if (ret <= 0 && ret != -EFAULT) + if (!count) return false; if (!iter_is_iovec(i)) return false; size = PAGE_SIZE; - offs = offset_in_page(i->iov[0].iov_base + i->iov_offset); + offs = offset_in_page(iocb->ki_pos); if (*prev_count != count || !*window_size) { size_t nr_dirtied; - size = ALIGN(offs + count, PAGE_SIZE); - size = min_t(size_t, size, SZ_1M); nr_dirtied = max(current->nr_dirtied_pause - current->nr_dirtied, 8); - size = min(size, nr_dirtied << PAGE_SHIFT); + size = min_t(size_t, SZ_1M, nr_dirtied << PAGE_SHIFT); } *prev_count = count; @@ -807,7 +804,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, struct file *file = iocb->ki_filp; struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); size_t prev_count = 0, window_size = 0; - size_t written = 0; + size_t read = 0; ssize_t ret; /* @@ -835,35 +832,33 @@ retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; -retry_under_glock: pagefault_disable(); to->nofault = true; ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, - IOMAP_DIO_PARTIAL, written); + IOMAP_DIO_PARTIAL, NULL, read); to->nofault = false; pagefault_enable(); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; + /* No increment (+=) because iomap_dio_rw returns a cumulative value. */ if (ret > 0) - written = ret; - - if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { - size_t leftover; + read = ret; - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_writeable(to, window_size); - gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { - if (gfs2_holder_queued(gh)) - goto retry_under_glock; + if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + window_size -= fault_in_iov_iter_writeable(to, window_size); + if (window_size) goto retry; - } } +out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); + /* User space doesn't expect partial success. */ if (ret < 0) return ret; - return written; + return read; } static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, @@ -873,7 +868,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, struct inode *inode = file->f_mapping->host; struct gfs2_inode *ip = GFS2_I(inode); size_t prev_count = 0, window_size = 0; - size_t read = 0; + size_t written = 0; ssize_t ret; /* @@ -901,39 +896,37 @@ retry: goto out_uninit; /* Silently fall back to buffered I/O when writing beyond EOF */ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) - goto out; -retry_under_glock: + goto out_unlock; from->nofault = true; ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, - IOMAP_DIO_PARTIAL, read); + IOMAP_DIO_PARTIAL, NULL, written); from->nofault = false; - - if (ret == -ENOTBLK) - ret = 0; + if (ret <= 0) { + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EFAULT) + goto out_unlock; + } + /* No increment (+=) because iomap_dio_rw returns a cumulative value. 
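The gfs2 read/write paths in these hunks replace the retry_under_glock dance with a simpler loop: attempt the I/O with page faults disabled (or nowait), and when it stops on an unfaulted user buffer, drop the glock, fault in a window of the buffer, and retry from the top. A standalone sketch of that control flow with stubbed lock and fault-in helpers (none of these are gfs2 functions):

#include <errno.h>
#include <stdio.h>
#include <sys/types.h>

/* Stubs standing in for gfs2_glock_nq/dq and fault_in_iov_iter_*(). */
static void lock_take(void) { }
static void lock_drop(void) { }

static int faults_remaining = 2;

/* Pretend the I/O fails with -EFAULT until the buffer has been faulted in. */
static ssize_t do_io_nofault(size_t count)
{
    if (faults_remaining > 0)
        return -EFAULT;
    return (ssize_t)count;            /* everything transferred */
}

/* Pretend each fault-in pass makes progress. */
static size_t fault_in_window(size_t window)
{
    if (faults_remaining > 0)
        faults_remaining--;
    return window;                    /* bytes successfully faulted in */
}

static ssize_t transfer(size_t count)
{
    size_t done = 0;
    ssize_t ret;

retry:
    lock_take();
    ret = do_io_nofault(count - done);
    if (ret > 0)
        done += (size_t)ret;
    lock_drop();

    if (ret == -EFAULT && done < count) {
        /* Fault the user pages in without holding the lock, then retry. */
        if (fault_in_window(count - done))
            goto retry;
    }
    return done ? (ssize_t)done : ret;
}

int main(void)
{
    printf("transferred %zd bytes\n", transfer(4096));
    return 0;
}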
*/ if (ret > 0) - read = ret; - - if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { - size_t leftover; + written = ret; - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_readable(from, window_size); - gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { - if (gfs2_holder_queued(gh)) - goto retry_under_glock; + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + window_size -= fault_in_iov_iter_readable(from, window_size); + if (window_size) goto retry; - } } -out: +out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); + /* User space doesn't expect partial success. */ if (ret < 0) return ret; - return read; + return written; } static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -941,7 +934,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct gfs2_inode *ip; struct gfs2_holder gh; size_t prev_count = 0, window_size = 0; - size_t written = 0; + size_t read = 0; ssize_t ret; /* @@ -962,7 +955,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (ret >= 0) { if (!iov_iter_count(to)) return ret; - written = ret; + read = ret; } else if (ret != -EFAULT) { if (ret != -EAGAIN) return ret; @@ -975,30 +968,26 @@ retry: ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; -retry_under_glock: pagefault_disable(); ret = generic_file_read_iter(iocb, to); pagefault_enable(); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; if (ret > 0) - written += ret; - - if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { - size_t leftover; + read += ret; - gfs2_holder_allow_demote(&gh); - leftover = fault_in_iov_iter_writeable(to, window_size); - gfs2_holder_disallow_demote(&gh); - if (leftover != window_size) { - if (gfs2_holder_queued(&gh)) - goto retry_under_glock; + if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(&gh); + window_size -= fault_in_iov_iter_writeable(to, window_size); + if (window_size) goto retry; - } } +out_unlock: if (gfs2_holder_queued(&gh)) gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - return written ? written : ret; + return read ? 
read : ret; } static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, @@ -1012,7 +1001,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct gfs2_holder *statfs_gh = NULL; size_t prev_count = 0, window_size = 0; size_t orig_count = iov_iter_count(from); - size_t read = 0; + size_t written = 0; ssize_t ret; /* @@ -1030,10 +1019,18 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh); retry: + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + window_size -= fault_in_iov_iter_readable(from, window_size); + if (!window_size) { + ret = -EFAULT; + goto out_uninit; + } + from->count = min(from->count, window_size); + } ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; -retry_under_glock: + if (inode == sdp->sd_rindex) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -1050,25 +1047,19 @@ retry_under_glock: current->backing_dev_info = NULL; if (ret > 0) { iocb->ki_pos += ret; - read += ret; + written += ret; } if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); - from->count = orig_count - read; - if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { - size_t leftover; - - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_readable(from, window_size); - gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { - from->count = min(from->count, window_size - leftover); - if (gfs2_holder_queued(gh)) - goto retry_under_glock; - goto retry; - } + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; + + from->count = orig_count - written; + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + goto retry; } out_unlock: if (gfs2_holder_queued(gh)) @@ -1077,8 +1068,8 @@ out_uninit: gfs2_holder_uninit(gh); if (statfs_gh) kfree(statfs_gh); - from->count = orig_count - read; - return read ? read : ret; + from->count = orig_count - written; + return written ? 
written : ret; } /** diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 630c6550eacf..c992d53013d3 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -127,9 +127,11 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu) struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); kfree(gl->gl_lksb.sb_lvbptr); - if (gl->gl_ops->go_flags & GLOF_ASPACE) - kmem_cache_free(gfs2_glock_aspace_cachep, gl); - else + if (gl->gl_ops->go_flags & GLOF_ASPACE) { + struct gfs2_glock_aspace *gla = + container_of(gl, struct gfs2_glock_aspace, glock); + kmem_cache_free(gfs2_glock_aspace_cachep, gla); + } else kmem_cache_free(gfs2_glock_cachep, gl); } @@ -1159,7 +1161,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, .ln_sbd = sdp }; struct gfs2_glock *gl, *tmp; struct address_space *mapping; - struct kmem_cache *cachep; int ret = 0; gl = find_insert_glock(&name, NULL); @@ -1170,20 +1171,24 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, if (!create) return -ENOENT; - if (glops->go_flags & GLOF_ASPACE) - cachep = gfs2_glock_aspace_cachep; - else - cachep = gfs2_glock_cachep; - gl = kmem_cache_alloc(cachep, GFP_NOFS); - if (!gl) - return -ENOMEM; - + if (glops->go_flags & GLOF_ASPACE) { + struct gfs2_glock_aspace *gla = + kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_NOFS); + if (!gla) + return -ENOMEM; + gl = &gla->glock; + } else { + gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_NOFS); + if (!gl) + return -ENOMEM; + } memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); + gl->gl_ops = glops; if (glops->go_flags & GLOF_LVB) { gl->gl_lksb.sb_lvbptr = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); if (!gl->gl_lksb.sb_lvbptr) { - kmem_cache_free(cachep, gl); + gfs2_glock_dealloc(&gl->gl_rcu); return -ENOMEM; } } @@ -1197,7 +1202,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; - gl->gl_ops = glops; gl->gl_dstamp = 0; preempt_disable(); /* We use the global stats to estimate the initial per-glock stats */ @@ -1234,8 +1238,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, *glp = tmp; out_free: - kfree(gl->gl_lksb.sb_lvbptr); - kmem_cache_free(cachep, gl); + gfs2_glock_dealloc(&gl->gl_rcu); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_glock_wait); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 4f8642301801..c0ae9100a0bc 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -138,6 +138,11 @@ struct lm_lockops { const match_table_t *lm_tokens; }; +struct gfs2_glock_aspace { + struct gfs2_glock glock; + struct address_space mapping; +}; + extern struct workqueue_struct *gfs2_delete_workqueue; static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { @@ -179,8 +184,11 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl) static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) { - if (gl->gl_ops->go_flags & GLOF_ASPACE) - return (struct address_space *)(gl + 1); + if (gl->gl_ops->go_flags & GLOF_ASPACE) { + struct gfs2_glock_aspace *gla = + container_of(gl, struct gfs2_glock_aspace, glock); + return &gla->mapping; + } return NULL; } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 28d0eb23e18e..244187e3e70f 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -62,11 +62,10 @@ static void gfs2_init_glock_once(void *foo) static void gfs2_init_gl_aspace_once(void *foo) { - struct gfs2_glock *gl = foo; - struct address_space *mapping = (struct address_space *)(gl + 1); + struct gfs2_glock_aspace *gla 
= foo; - gfs2_init_glock_once(gl); - address_space_init_once(mapping); + gfs2_init_glock_once(&gla->glock); + address_space_init_once(&gla->mapping); } /** @@ -104,8 +103,7 @@ static int __init init_gfs2_fs(void) goto fail_cachep1; gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)", - sizeof(struct gfs2_glock) + - sizeof(struct address_space), + sizeof(struct gfs2_glock_aspace), 0, 0, gfs2_init_gl_aspace_once); if (!gfs2_glock_aspace_cachep) diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 21880d72081a..d0a58cdd433a 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -40,9 +40,11 @@ extern const struct address_space_operations gfs2_rgrp_aops; static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) { struct inode *inode = mapping->host; - if (mapping->a_ops == &gfs2_meta_aops) - return (((struct gfs2_glock *)mapping) - 1)->gl_name.ln_sbd; - else if (mapping->a_ops == &gfs2_rgrp_aops) + if (mapping->a_ops == &gfs2_meta_aops) { + struct gfs2_glock_aspace *gla = + container_of(mapping, struct gfs2_glock_aspace, mapping); + return gla->glock.gl_name.ln_sbd; + } else if (mapping->a_ops == &gfs2_rgrp_aops) return container_of(mapping, struct gfs2_sbd, sd_aspace); else return inode->i_sb->s_fs_info; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index be0997e24d60..59d727a4ae2c 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -365,11 +365,12 @@ static void slot_put(struct gfs2_quota_data *qd) static int bh_get(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; - struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + struct inode *inode = sdp->sd_qc_inode; + struct gfs2_inode *ip = GFS2_I(inode); unsigned int block, offset; struct buffer_head *bh; + struct iomap iomap = { }; int error; - struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; mutex_lock(&sdp->sd_quota_mutex); @@ -381,11 +382,17 @@ static int bh_get(struct gfs2_quota_data *qd) block = qd->qd_slot / sdp->sd_qc_per_block; offset = qd->qd_slot % sdp->sd_qc_per_block; - bh_map.b_size = BIT(ip->i_inode.i_blkbits); - error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); + error = gfs2_iomap_get(inode, + (loff_t)block << inode->i_blkbits, + i_blocksize(inode), &iomap); if (error) goto fail; - error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, 0, &bh); + error = -ENOENT; + if (iomap.type != IOMAP_MAPPED) + goto fail; + + error = gfs2_meta_read(ip->i_gl, iomap.addr >> inode->i_blkbits, + DIO_WAIT, 0, &bh); if (error) goto fail; error = -EIO; @@ -443,9 +450,8 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) { - struct gfs2_quota_data *qd = NULL; + struct gfs2_quota_data *qd = NULL, *iter; int error; - int found = 0; *qdp = NULL; @@ -454,15 +460,13 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) spin_lock(&qd_lock); - list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - found = qd_check_sync(sdp, qd, &sdp->sd_quota_sync_gen); - if (found) + list_for_each_entry(iter, &sdp->sd_quota_list, qd_list) { + if (qd_check_sync(sdp, iter, &sdp->sd_quota_sync_gen)) { + qd = iter; break; + } } - if (!found) - qd = NULL; - spin_unlock(&qd_lock); if (qd) { @@ -531,34 +535,42 @@ static void qdsb_put(struct gfs2_quota_data *qd) */ int gfs2_qa_get(struct gfs2_inode *ip) { - int error = 0; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; - 
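The glock.c/glock.h/meta_io.h hunks above stop relying on "(gl + 1)" pointer arithmetic and instead embed the glock and its address_space in an explicit gfs2_glock_aspace struct, recovering one member from the other via container_of(). A userspace illustration of that pattern with toy types:

#include <stddef.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's container_of(). */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct glock   { int number; };
struct mapping { const char *name; };

/* Analogous to struct gfs2_glock_aspace: both members live in one allocation. */
struct glock_aspace {
    struct glock   glock;
    struct mapping mapping;
};

static struct mapping *glock2mapping(struct glock *gl)
{
    return &container_of(gl, struct glock_aspace, glock)->mapping;
}

static struct glock *mapping2glock(struct mapping *m)
{
    return &container_of(m, struct glock_aspace, mapping)->glock;
}

int main(void)
{
    struct glock_aspace gla = { .glock = { 42 }, .mapping = { "meta" } };

    printf("%s belongs to glock %d\n",
           glock2mapping(&gla.glock)->name,
           mapping2glock(&gla.mapping)->number);
    return 0;
}

Compared with casting past the end of the glock allocation, the wrapper struct lets the compiler check the layout and makes both directions of the conversion explicit.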
down_write(&ip->i_rw_mutex); + spin_lock(&inode->i_lock); if (ip->i_qadata == NULL) { - ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS); - if (!ip->i_qadata) { - error = -ENOMEM; - goto out; - } + struct gfs2_qadata *tmp; + + spin_unlock(&inode->i_lock); + tmp = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS); + if (!tmp) + return -ENOMEM; + + spin_lock(&inode->i_lock); + if (ip->i_qadata == NULL) + ip->i_qadata = tmp; + else + kmem_cache_free(gfs2_qadata_cachep, tmp); } ip->i_qadata->qa_ref++; -out: - up_write(&ip->i_rw_mutex); - return error; + spin_unlock(&inode->i_lock); + return 0; } void gfs2_qa_put(struct gfs2_inode *ip) { - down_write(&ip->i_rw_mutex); + struct inode *inode = &ip->i_inode; + + spin_lock(&inode->i_lock); if (ip->i_qadata && --ip->i_qadata->qa_ref == 0) { kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata); ip->i_qadata = NULL; } - up_write(&ip->i_rw_mutex); + spin_unlock(&inode->i_lock); } int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 016ed1b2ca1d..2bb085a72e8e 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -55,17 +55,16 @@ int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) { struct list_head *head = &jd->jd_revoke_list; - struct gfs2_revoke_replay *rr; - int found = 0; + struct gfs2_revoke_replay *rr = NULL, *iter; - list_for_each_entry(rr, head, rr_list) { - if (rr->rr_blkno == blkno) { - found = 1; + list_for_each_entry(iter, head, rr_list) { + if (iter->rr_blkno == blkno) { + rr = iter; break; } } - if (found) { + if (rr) { rr->rr_where = where; return 0; } @@ -83,18 +82,17 @@ int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) { - struct gfs2_revoke_replay *rr; + struct gfs2_revoke_replay *rr = NULL, *iter; int wrap, a, b, revoke; - int found = 0; - list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) { - if (rr->rr_blkno == blkno) { - found = 1; + list_for_each_entry(iter, &jd->jd_revoke_list, rr_list) { + if (iter->rr_blkno == blkno) { + rr = iter; break; } } - if (!found) + if (!rr) return 0; wrap = (rr->rr_where < jd->jd_replay_tail); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 801ad9f4f2be..8a63870eef5a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1315,7 +1315,7 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, u64 blk; sector_t start = 0; sector_t nr_blks = 0; - int rv; + int rv = -EIO; unsigned int x; u32 trimmed = 0; u8 diff; @@ -1371,7 +1371,7 @@ fail: if (sdp->sd_args.ar_discard) fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv); sdp->sd_args.ar_discard = 0; - return -EIO; + return rv; } /** @@ -1386,7 +1386,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) { struct inode *inode = file_inode(filp); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev); + struct block_device *bdev = sdp->sd_vfs->s_bdev; struct buffer_head *bh; struct gfs2_rgrpd *rgd; struct gfs2_rgrpd *rgd_end; @@ -1405,7 +1405,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) return -EROFS; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (copy_from_user(&r, argp, sizeof(r))) @@ -1418,8 +1418,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) start = r.start >> 
bs_shift; end = start + (r.len >> bs_shift); minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize); - minlen = max_t(u64, minlen, - q->limits.discard_granularity) >> bs_shift; + minlen = max_t(u64, minlen, bdev_discard_granularity(bdev)) >> bs_shift; if (end <= start || minlen > sdp->sd_max_rg_data) return -EINVAL; diff --git a/fs/internal.h b/fs/internal.h index 08503dc68d2b..9a6c233ee7f1 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -191,3 +191,32 @@ long splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags); + +/* + * fs/xattr.c: + */ +struct xattr_name { + char name[XATTR_NAME_MAX + 1]; +}; + +struct xattr_ctx { + /* Value of attribute */ + union { + const void __user *cvalue; + void __user *value; + }; + void *kvalue; + size_t size; + /* Attribute name */ + struct xattr_name *kname; + unsigned int flags; +}; + + +ssize_t do_getxattr(struct user_namespace *mnt_userns, + struct dentry *d, + struct xattr_ctx *ctx); + +int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); +int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct xattr_ctx *ctx); diff --git a/fs/io-wq.c b/fs/io-wq.c index 32aeb2c581c5..824623bcf1a5 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe, static bool io_wq_worker_wake(struct io_worker *worker, void *data) { - set_notify_signal(worker->task); + __set_notify_signal(worker->task); wake_up_process(worker->task); return false; } @@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker, { if (work && match->fn(work, match->data)) { work->flags |= IO_WQ_WORK_CANCEL; - set_notify_signal(worker->task); + __set_notify_signal(worker->task); return true; } diff --git a/fs/io-wq.h b/fs/io-wq.h index dbecd27656c7..ba6eee76d028 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) struct io_wq_work { struct io_wq_work_node list; unsigned flags; + int cancel_seq; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index 91de361ea9ab..9f1c682d7caf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -80,6 +80,7 @@ #include <linux/io_uring.h> #include <linux/audit.h> #include <linux/security.h> +#include <linux/xattr.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -94,7 +95,7 @@ #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 /* only define max */ -#define IORING_MAX_FIXED_FILES (1U << 15) +#define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -113,6 +114,11 @@ #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA) +#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ + IO_REQ_CLEAN_FLAGS) + +#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) + #define IO_TCTX_REFS_CACHE_NR (1U << 10) struct io_uring { @@ -166,7 +172,7 @@ struct io_rings { * The application needs a full memory barrier before checking * for IORING_SQ_NEED_WAKEUP after updating the sq tail. 
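Across the ext4, f2fs, fat and gfs2 hunks in this series, open-coded request_queue checks give way to bdev_max_discard_sectors() and bdev_discard_granularity(). A standalone sketch of the resulting FITRIM preamble; the two helpers here are greatly simplified stand-ins (the real ones take a struct block_device *):

#include <errno.h>
#include <stdio.h>

/* Simplified stand-ins for the new block-layer helpers. */
static unsigned int bdev_max_discard_sectors(void) { return 8192; }
static unsigned int bdev_discard_granularity(void) { return 4096; } /* bytes */

/* FITRIM-style validation: refuse if the device cannot discard at all,
 * and round the requested minimum extent up to the discard granularity. */
static int prepare_trim(unsigned long long *minlen, unsigned int blocksize)
{
    if (!bdev_max_discard_sectors())
        return -EOPNOTSUPP;
    if (*minlen < bdev_discard_granularity())
        *minlen = bdev_discard_granularity();
    *minlen /= blocksize;               /* convert to filesystem blocks */
    return 0;
}

int main(void)
{
    unsigned long long minlen = 512;    /* caller asked for 512 bytes */
    int ret = prepare_trim(&minlen, 1024);

    printf("ret=%d minlen=%llu blocks\n", ret, minlen);
    return 0;
}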
*/ - u32 sq_flags; + atomic_t sq_flags; /* * Runtime CQ flags * @@ -198,13 +204,6 @@ struct io_rings { struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; -enum io_uring_cmd_flags { - IO_URING_F_COMPLETE_DEFER = 1, - IO_URING_F_UNLOCKED = 2, - /* int's last bit, sign checks are usually faster than a bit test */ - IO_URING_F_NONBLOCK = INT_MIN, -}; - struct io_mapped_ubuf { u64 ubuf; u64 ubuf_end; @@ -216,10 +215,27 @@ struct io_mapped_ubuf { struct io_ring_ctx; struct io_overflow_cqe { - struct io_uring_cqe cqe; struct list_head list; + struct io_uring_cqe cqe; }; +/* + * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 + * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we + * can't safely always dereference the file when the task has exited and ring + * cleanup is done. If a file is tracked and part of SCM, then unix gc on + * process exit may reap it before __io_sqe_files_unregister() is run. + */ +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#if defined(CONFIG_64BIT) +#define FFS_SCM 0x4UL +#else +#define IO_URING_SCM_ALL +#define FFS_SCM 0x0UL +#endif +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) + struct io_fixed_file { /* file * with additional FFS_* flags */ unsigned long file_ptr; @@ -237,6 +253,8 @@ struct io_rsrc_put { struct io_file_table { struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; }; struct io_rsrc_node { @@ -261,10 +279,26 @@ struct io_rsrc_data { bool quiesce; }; +#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) struct io_buffer_list { - struct list_head list; - struct list_head buf_list; + /* + * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, + * then these are classic provided buffers and ->buf_list is used. 
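The reworked io_buffer_list above can reference a ring of provided buffers, indexed by the head counter and power-of-two mask added just below. A minimal sketch of that indexing scheme; the entry layout here is invented, not struct io_uring_buf, and the shared-with-userspace tail handling is omitted:

#include <stdio.h>

struct buf_entry {
    void *addr;
    unsigned int len;
    unsigned short bid;       /* buffer id handed back to the consumer */
};

struct buf_ring {
    struct buf_entry *entries;
    unsigned int nr_entries;  /* power of two */
    unsigned int mask;        /* nr_entries - 1 */
    unsigned int head;        /* consumer position, monotonically increasing */
};

/* Pick the next provided buffer: the monotonically increasing head is
 * masked to find the slot, so no modulo and no explicit wrap handling. */
static struct buf_entry *ring_next(struct buf_ring *br)
{
    return &br->entries[br->head++ & br->mask];
}

int main(void)
{
    static char mem[4][64];
    struct buf_entry entries[4];
    struct buf_ring br = { entries, 4, 3, 0 };

    for (unsigned short i = 0; i < 4; i++)
        entries[i] = (struct buf_entry){ mem[i], sizeof(mem[i]), i };

    for (int i = 0; i < 6; i++)          /* wraps after four picks */
        printf("pick %d -> bid %u\n", i, ring_next(&br)->bid);
    return 0;
}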
+ */ + union { + struct list_head buf_list; + struct { + struct page **buf_pages; + struct io_uring_buf_ring *buf_ring; + }; + }; __u16 bgid; + + /* below is for ring provided buffers */ + __u16 buf_nr_pages; + __u16 nr_entries; + __u32 head; + __u32 mask; }; struct io_buffer { @@ -337,7 +371,7 @@ struct io_ev_fd { struct rcu_head rcu; }; -#define IO_BUFFERS_HASH_BITS 5 +#define BGID_ARRAY 64 struct io_ring_ctx { /* const or read-mostly hot data */ @@ -346,6 +380,7 @@ struct io_ring_ctx { struct io_rings *rings; unsigned int flags; + enum task_work_notify_mode notify_method; unsigned int compat: 1; unsigned int drain_next: 1; unsigned int restricted: 1; @@ -353,6 +388,7 @@ struct io_ring_ctx { unsigned int drain_active: 1; unsigned int drain_disabled: 1; unsigned int has_evfd: 1; + unsigned int syscall_iopoll: 1; } ____cacheline_aligned_in_smp; /* submission data */ @@ -382,17 +418,21 @@ struct io_ring_ctx { */ struct io_rsrc_node *rsrc_node; int rsrc_cached_refs; + atomic_t cancel_seq; struct io_file_table file_table; unsigned nr_user_files; unsigned nr_user_bufs; struct io_mapped_ubuf **user_bufs; struct io_submit_state submit_state; + + struct io_buffer_list *io_bl; + struct xarray io_bl_xa; + struct list_head io_buffers_cache; + struct list_head timeout_list; struct list_head ltimeout_list; struct list_head cq_overflow_list; - struct list_head *io_buffers; - struct list_head io_buffers_cache; struct list_head apoll_cache; struct xarray personalities; u32 pers_next; @@ -409,9 +449,16 @@ struct io_ring_ctx { struct wait_queue_head sqo_sq_wait; struct list_head sqd_list; - unsigned long check_cq_overflow; + unsigned long check_cq; struct { + /* + * We cache a range of free CQEs we can use, once exhausted it + * should go through a slower range setup, see __io_get_cqe() + */ + struct io_uring_cqe *cqe_cached; + struct io_uring_cqe *cqe_sentinel; + unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; @@ -497,7 +544,7 @@ struct io_uring_task { spinlock_t task_lock; struct io_wq_work_list task_list; - struct io_wq_work_list prior_task_list; + struct io_wq_work_list prio_task_list; struct callback_head task_work; struct file **registered_rings; bool task_running; @@ -546,6 +593,16 @@ struct io_accept { unsigned long nofile; }; +struct io_socket { + struct file *file; + int domain; + int type; + int protocol; + int flags; + u32 file_slot; + unsigned long nofile; +}; + struct io_sync { struct file *file; loff_t len; @@ -557,6 +614,8 @@ struct io_sync { struct io_cancel { struct file *file; u64 addr; + u32 flags; + s32 fd; }; struct io_timeout { @@ -585,7 +644,7 @@ struct io_rw { struct kiocb kiocb; u64 addr; u32 len; - u32 flags; + rwf_t flags; }; struct io_connect { @@ -602,9 +661,9 @@ struct io_sr_msg { void __user *buf; }; int msg_flags; - int bgid; size_t len; size_t done_io; + unsigned int flags; }; struct io_open { @@ -722,6 +781,12 @@ struct io_msg { u32 len; }; +struct io_nop { + struct file *file; + u64 extra1; + u64 extra2; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -748,6 +813,12 @@ struct io_async_rw { struct wait_page_queue wpq; }; +struct io_xattr { + struct file *file; + struct xattr_ctx ctx; + struct filename *filename; +}; + enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, @@ -766,6 +837,7 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, + REQ_F_BUFFER_RING_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_CREDS_BIT, @@ -776,6 +848,7 @@ enum { 
REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, REQ_F_PARTIAL_IO_BIT, + REQ_F_APOLL_MULTISHOT_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -816,6 +889,8 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + /* buffer selected from ring, needs commit */ + REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ @@ -840,6 +915,8 @@ enum { REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), /* request has already done partial IO */ REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + /* fast poll multishot mode */ + REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), }; struct async_poll { @@ -862,6 +939,21 @@ enum { IORING_RSRC_BUFFER = 1, }; +struct io_cqe { + __u64 user_data; + __s32 res; + /* fd initially, then cflags for completion */ + union { + __u32 flags; + int fd; + }; +}; + +enum { + IO_CHECK_CQ_OVERFLOW_BIT, + IO_CHECK_CQ_DROPPED_BIT, +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -897,46 +989,65 @@ struct io_kiocb { struct io_symlink symlink; struct io_hardlink hardlink; struct io_msg msg; + struct io_xattr xattr; + struct io_socket sock; + struct io_nop nop; + struct io_uring_cmd uring_cmd; }; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; + /* + * Can be either a fixed buffer index, or used with provided buffers. + * For the latter, before issue it points to the buffer group ID, + * and after selection it points to the buffer ID itself. + */ u16 buf_index; unsigned int flags; - u64 user_data; - u32 result; - /* fd initially, then cflags for completion */ - union { - u32 cflags; - int fd; - }; + struct io_cqe cqe; struct io_ring_ctx *ctx; struct task_struct *task; - struct percpu_ref *fixed_rsrc_refs; - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; + struct io_rsrc_node *rsrc_node; + + union { + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; + + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; + + /* + * stores buffer ID for ring provided buffers, valid IFF + * REQ_F_BUFFER_RING is set. + */ + struct io_buffer_list *buf_list; + }; union { /* used by request caches, completion batching and iopoll */ struct io_wq_work_node comp_list; /* cache ->apoll->events */ - int apoll_events; + __poll_t apoll_events; }; atomic_t refs; atomic_t poll_refs; struct io_task_work io_task_work; /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ - struct hlist_node hash_node; + union { + struct hlist_node hash_node; + struct { + u64 extra1; + u64 extra2; + }; + }; /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ void *async_data; - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ - struct io_buffer *kbuf; /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ struct io_kiocb *link; /* custom credentials, valid IFF REQ_F_CREDS is set */ @@ -956,6 +1067,24 @@ struct io_defer_entry { u32 seq; }; +struct io_cancel_data { + struct io_ring_ctx *ctx; + union { + u64 data; + struct file *file; + }; + u32 flags; + int seq; +}; + +/* + * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into + * the following sqe if SQE128 is used. + */ +#define uring_cmd_pdu_size(is_sqe128) \ + ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ + offsetof(struct io_uring_sqe, cmd)) + struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -977,12 +1106,20 @@ struct io_op_def { unsigned not_supported : 1; /* skip auditing */ unsigned audit_skip : 1; + /* supports ioprio */ + unsigned ioprio : 1; + /* supports iopoll */ + unsigned iopoll : 1; /* size of async data needed, if any */ unsigned short async_size; }; static const struct io_op_def io_op_defs[] = { - [IORING_OP_NOP] = {}, + [IORING_OP_NOP] = { + .audit_skip = 1, + .iopoll = 1, + .buffer_select = 1, + }, [IORING_OP_READV] = { .needs_file = 1, .unbound_nonreg_file = 1, @@ -991,6 +1128,8 @@ static const struct io_op_def io_op_defs[] = { .needs_async_setup = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITEV] = { @@ -1001,6 +1140,8 @@ static const struct io_op_def io_op_defs[] = { .needs_async_setup = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FSYNC] = { @@ -1013,6 +1154,8 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE_FIXED] = { @@ -1022,6 +1165,8 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_POLL_ADD] = { @@ -1064,6 +1209,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .poll_exclusive = 1, + .ioprio = 1, /* used for flags */ }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, @@ -1086,6 +1232,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_CLOSE] = {}, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_STATX] = { .audit_skip = 1, @@ -1097,6 +1244,8 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE] = { @@ -1106,6 +1255,8 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FADVISE] = { @@ -1140,9 +1291,11 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_TEE] = { .needs_file = 1, @@ -1160,11 +1313,30 @@ static 
const struct io_op_def io_op_defs[] = { [IORING_OP_LINKAT] = {}, [IORING_OP_MSG_RING] = { .needs_file = 1, + .iopoll = 1, + }, + [IORING_OP_FSETXATTR] = { + .needs_file = 1 + }, + [IORING_OP_SETXATTR] = {}, + [IORING_OP_FGETXATTR] = { + .needs_file = 1 + }, + [IORING_OP_GETXATTR] = {}, + [IORING_OP_SOCKET] = { + .audit_skip = 1, + }, + [IORING_OP_URING_CMD] = { + .needs_file = 1, + .plug = 1, + .needs_async_setup = 1, + .async_size = uring_cmd_pdu_size(1), }, }; /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_tctx_node(unsigned long index); @@ -1173,10 +1345,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); - -static void io_put_req(struct io_kiocb *req); -static void io_put_req_deferred(struct io_kiocb *req); +static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); static void io_dismantle_req(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -1185,10 +1354,10 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, static void io_clean_op(struct io_kiocb *req); static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd); +static struct file *io_file_get_normal(struct io_kiocb *req, int fd); static void io_drop_inflight_file(struct io_kiocb *req); static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); -static void __io_queue_sqe(struct io_kiocb *req); +static void io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); @@ -1201,11 +1370,115 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); +static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; +const char *io_uring_get_opcode(u8 opcode) +{ + switch ((enum io_uring_op)opcode) { + case IORING_OP_NOP: + return "NOP"; + case IORING_OP_READV: + return "READV"; + case IORING_OP_WRITEV: + return "WRITEV"; + case IORING_OP_FSYNC: + return "FSYNC"; + case IORING_OP_READ_FIXED: + return "READ_FIXED"; + case IORING_OP_WRITE_FIXED: + return "WRITE_FIXED"; + case IORING_OP_POLL_ADD: + return "POLL_ADD"; + case IORING_OP_POLL_REMOVE: + return "POLL_REMOVE"; + case IORING_OP_SYNC_FILE_RANGE: + return "SYNC_FILE_RANGE"; + case IORING_OP_SENDMSG: + return "SENDMSG"; + case IORING_OP_RECVMSG: + return "RECVMSG"; + case IORING_OP_TIMEOUT: + return "TIMEOUT"; + case IORING_OP_TIMEOUT_REMOVE: + return "TIMEOUT_REMOVE"; + case IORING_OP_ACCEPT: + return "ACCEPT"; + case IORING_OP_ASYNC_CANCEL: + return "ASYNC_CANCEL"; + case IORING_OP_LINK_TIMEOUT: + return "LINK_TIMEOUT"; + case IORING_OP_CONNECT: + return "CONNECT"; + case IORING_OP_FALLOCATE: + return "FALLOCATE"; + case IORING_OP_OPENAT: + return "OPENAT"; 
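	/*
	 * (Editor's aside -- an illustrative sketch, not part of the diff.)
	 * The .async_size of IORING_OP_URING_CMD above is uring_cmd_pdu_size(1).
	 * Assuming the uapi layout this series targets -- sizeof(struct
	 * io_uring_sqe) == 64 and the cmd[] payload starting at byte 48 -- the
	 * macro works out to:
	 *
	 *   uring_cmd_pdu_size(0) == 1 * 64 - 48 == 16 bytes
	 *   uring_cmd_pdu_size(1) == 2 * 64 - 48 == 80 bytes
	 *
	 * i.e. with IORING_SETUP_SQE128 the command payload runs from ->cmd to
	 * the end of the second, back-to-back SQE slot.
	 */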
+ case IORING_OP_CLOSE: + return "CLOSE"; + case IORING_OP_FILES_UPDATE: + return "FILES_UPDATE"; + case IORING_OP_STATX: + return "STATX"; + case IORING_OP_READ: + return "READ"; + case IORING_OP_WRITE: + return "WRITE"; + case IORING_OP_FADVISE: + return "FADVISE"; + case IORING_OP_MADVISE: + return "MADVISE"; + case IORING_OP_SEND: + return "SEND"; + case IORING_OP_RECV: + return "RECV"; + case IORING_OP_OPENAT2: + return "OPENAT2"; + case IORING_OP_EPOLL_CTL: + return "EPOLL_CTL"; + case IORING_OP_SPLICE: + return "SPLICE"; + case IORING_OP_PROVIDE_BUFFERS: + return "PROVIDE_BUFFERS"; + case IORING_OP_REMOVE_BUFFERS: + return "REMOVE_BUFFERS"; + case IORING_OP_TEE: + return "TEE"; + case IORING_OP_SHUTDOWN: + return "SHUTDOWN"; + case IORING_OP_RENAMEAT: + return "RENAMEAT"; + case IORING_OP_UNLINKAT: + return "UNLINKAT"; + case IORING_OP_MKDIRAT: + return "MKDIRAT"; + case IORING_OP_SYMLINKAT: + return "SYMLINKAT"; + case IORING_OP_LINKAT: + return "LINKAT"; + case IORING_OP_MSG_RING: + return "MSG_RING"; + case IORING_OP_FSETXATTR: + return "FSETXATTR"; + case IORING_OP_SETXATTR: + return "SETXATTR"; + case IORING_OP_FGETXATTR: + return "FGETXATTR"; + case IORING_OP_GETXATTR: + return "GETXATTR"; + case IORING_OP_SOCKET: + return "SOCKET"; + case IORING_OP_URING_CMD: + return "URING_CMD"; + case IORING_OP_LAST: + return "INVALID"; + } + return "INVALID"; +} + struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) @@ -1219,6 +1492,42 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +#if defined(CONFIG_UNIX) +static inline bool io_file_need_scm(struct file *filp) +{ +#if defined(IO_URING_SCM_ALL) + return true; +#else + return !!unix_get_socket(filp); +#endif +} +#else +static inline bool io_file_need_scm(struct file *filp) +{ + return false; +} +#endif + +static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) +{ + lockdep_assert_held(&ctx->uring_lock); + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); +} + +static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) +{ + /* + * "Normal" inline submissions always hold the uring_lock, since we + * grab it from the system call. Same is true for the SQPOLL offload. + * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case. 
+ */ + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_lock(&ctx->uring_lock); + lockdep_assert_held(&ctx->uring_lock); +} + static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { @@ -1280,31 +1589,36 @@ static inline void io_req_set_refcount(struct io_kiocb *req) #define IO_RSRC_REF_BATCH 100 +static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) +{ + percpu_ref_put_many(&node->refs, nr); +} + static inline void io_req_put_rsrc_locked(struct io_kiocb *req, struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - struct percpu_ref *ref = req->fixed_rsrc_refs; + struct io_rsrc_node *node = req->rsrc_node; - if (ref) { - if (ref == &ctx->rsrc_node->refs) + if (node) { + if (node == ctx->rsrc_node) ctx->rsrc_cached_refs++; else - percpu_ref_put(ref); + io_rsrc_put_node(node, 1); } } -static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) +static inline void io_req_put_rsrc(struct io_kiocb *req) { - if (req->fixed_rsrc_refs) - percpu_ref_put(req->fixed_rsrc_refs); + if (req->rsrc_node) + io_rsrc_put_node(req->rsrc_node, 1); } static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { if (ctx->rsrc_cached_refs) { - percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs); + io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); ctx->rsrc_cached_refs = 0; } } @@ -1320,8 +1634,8 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned int issue_flags) { - if (!req->fixed_rsrc_refs) { - req->fixed_rsrc_refs = &ctx->rsrc_node->refs; + if (!req->rsrc_node) { + req->rsrc_node = ctx->rsrc_node; if (!(issue_flags & IO_URING_F_UNLOCKED)) { lockdep_assert_held(&ctx->uring_lock); @@ -1329,28 +1643,30 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, if (unlikely(ctx->rsrc_cached_refs < 0)) io_rsrc_refs_refill(ctx); } else { - percpu_ref_get(req->fixed_rsrc_refs); + percpu_ref_get(&req->rsrc_node->refs); } } } static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) { - struct io_buffer *kbuf = req->kbuf; - unsigned int cflags; + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) + req->buf_list->head++; + req->flags &= ~REQ_F_BUFFER_RING; + } else { + list_add(&req->kbuf->list, list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + } - cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT); - req->flags &= ~REQ_F_BUFFER_SELECTED; - list_add(&kbuf->list, list); - req->kbuf = NULL; - return cflags; + return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); } static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) { lockdep_assert_held(&req->ctx->completion_lock); - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; return __io_put_kbuf(req, &req->ctx->io_buffers_comp); } @@ -1360,7 +1676,7 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, { unsigned int cflags; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; /* @@ -1375,7 +1691,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, * We migrate buffers from the comp_list to the issue cache list * when we need one. 
*/ - if (issue_flags & IO_URING_F_UNLOCKED) { + if (req->flags & REQ_F_BUFFER_RING) { + /* no buffers to recycle for this case */ + cflags = __io_put_kbuf(req, NULL); + } else if (issue_flags & IO_URING_F_UNLOCKED) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); @@ -1393,15 +1712,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { - struct list_head *hash_list; - struct io_buffer_list *bl; - - hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; - list_for_each_entry(bl, hash_list, list) - if (bl->bgid == bgid || bgid == -1U) - return bl; + if (ctx->io_bl && bgid < BGID_ARRAY) + return &ctx->io_bl[bgid]; - return NULL; + return xa_load(&ctx->io_bl_xa, bgid); } static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) @@ -1410,25 +1724,33 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) struct io_buffer_list *bl; struct io_buffer *buf; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return; /* don't recycle if we already did IO to this buffer */ if (req->flags & REQ_F_PARTIAL_IO) return; + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. + */ + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + return; + } - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); - - lockdep_assert_held(&ctx->uring_lock); + io_ring_submit_lock(ctx, issue_flags); buf = req->kbuf; bl = io_buffer_get_list(ctx, buf->bgid); list_add(&buf->list, &bl->buf_list); req->flags &= ~REQ_F_BUFFER_SELECTED; - req->kbuf = NULL; + req->buf_index = buf->bgid; - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); + io_ring_submit_unlock(ctx, issue_flags); } static bool io_match_task(struct io_kiocb *head, struct task_struct *task, @@ -1469,7 +1791,12 @@ static inline void req_set_fail(struct io_kiocb *req) static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); - req->result = res; + req->cqe.res = res; +} + +static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); } static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -1506,12 +1833,14 @@ static __cold void io_fallback_req_func(struct work_struct *work) static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - int i, hash_bits; + int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; + xa_init(&ctx->io_bl_xa); + /* * Use 5 bits less than the max cq entries, that should give us around * 32 entries per hash list if totally full and uniformly spread. 
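Aside (editor's sketch, not part of the diff): given how __io_put_kbuf() above encodes the selected buffer into the completion flags, an application using either classic or ring provided buffers can recover the buffer id from a CQE with nothing more than the uapi constants IORING_CQE_F_BUFFER and IORING_CQE_BUFFER_SHIFT (hypothetical helper, shown for illustration only):

	#include <linux/io_uring.h>

	/* Return the provided-buffer id carried in a completion, or -1 if the
	 * request did not consume a provided buffer. */
	static inline int cqe_buffer_id(const struct io_uring_cqe *cqe)
	{
		if (!(cqe->flags & IORING_CQE_F_BUFFER))
			return -1;
		return cqe->flags >> IORING_CQE_BUFFER_SHIFT;
	}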
@@ -1533,13 +1862,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) /* set invalid range, so io_import_fixed() fails meeting it */ ctx->dummy_ubuf->ubuf = -1UL; - ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS, - sizeof(struct list_head), GFP_KERNEL); - if (!ctx->io_buffers) - goto err; - for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) - INIT_LIST_HEAD(&ctx->io_buffers[i]); - if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -1575,7 +1897,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->dummy_ubuf); kfree(ctx->cancel_hash); - kfree(ctx->io_buffers); + kfree(ctx->io_bl); + xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; } @@ -1599,10 +1922,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) - static inline bool io_req_ffs_set(struct io_kiocb *req) { return req->flags & REQ_F_FIXED_FILE; @@ -1629,6 +1948,17 @@ static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return __io_prep_linked_timeout(req); } +static noinline void __io_arm_ltimeout(struct io_kiocb *req) +{ + io_queue_linked_timeout(__io_prep_linked_timeout(req)); +} + +static inline void io_arm_ltimeout(struct io_kiocb *req) +{ + if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) + __io_arm_ltimeout(req); +} + static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1641,6 +1971,7 @@ static void io_prep_async_work(struct io_kiocb *req) req->work.list.next = NULL; req->work.flags = 0; + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; @@ -1672,17 +2003,15 @@ static void io_prep_async_link(struct io_kiocb *req) static inline void io_req_add_compl_list(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - struct io_submit_state *state = &ctx->submit_state; + struct io_submit_state *state = &req->ctx->submit_state; if (!(req->flags & REQ_F_CQE_SKIP)) - ctx->submit_state.flush_cqes = true; + state->flush_cqes = true; wq_list_add_tail(&req->comp_list, &state->compl_reqs); } -static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) +static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -1702,8 +2031,9 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; - trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags, - &req->work, io_wq_is_hashed(&req->work)); + trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, + req->opcode, req->flags, &req->work, + io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1721,8 +2051,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_fill_cqe_req(req, status, 0); - io_put_req_deferred(req); + io_req_tw_post_queue(req, status, 0); } } @@ -1804,21 +2133,53 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) return ctx->cached_cq_tail - 
READ_ONCE(ctx->rings->cq.head); } -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +/* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ +static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - unsigned tail, mask = ctx->cq_entries - 1; - - /* - * writes to the cq entry need to come after reading head; the - * control dependency is enough as we're using WRITE_ONCE to - * fill the cq entry - */ - if (__io_cqring_events(ctx) == ctx->cq_entries) + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); + unsigned int shift = 0; + unsigned int free, queued, len; + + if (ctx->flags & IORING_SETUP_CQE32) + shift = 1; + + /* userspace may cheat modifying the tail, be safe and do min */ + queued = min(__io_cqring_events(ctx), ctx->cq_entries); + free = ctx->cq_entries - queued; + /* we need a contiguous range, limit based on the current array offset */ + len = min(free, ctx->cq_entries - off); + if (!len) return NULL; - tail = ctx->cached_cq_tail++; - return &rings->cqes[tail & mask]; + ctx->cached_cq_tail++; + ctx->cqe_cached = &rings->cqes[off]; + ctx->cqe_sentinel = ctx->cqe_cached + len; + ctx->cqe_cached++; + return &rings->cqes[off << shift]; +} + +static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +{ + if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { + struct io_uring_cqe *cqe = ctx->cqe_cached; + + if (ctx->flags & IORING_SETUP_CQE32) { + unsigned int off = ctx->cqe_cached - ctx->rings->cqes; + + cqe += off; + } + + ctx->cached_cq_tail++; + ctx->cqe_cached++; + return cqe; + } + + return __io_get_cqe(ctx); } static void io_eventfd_signal(struct io_ring_ctx *ctx) @@ -1889,10 +2250,14 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { bool all_flushed, posted; + size_t cqe_size = sizeof(struct io_uring_cqe); if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false; + if (ctx->flags & IORING_SETUP_CQE32) + cqe_size <<= 1; + posted = false; spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->cq_overflow_list)) { @@ -1904,7 +2269,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); if (cqe) - memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); + memcpy(cqe, &ocqe->cqe, cqe_size); else io_account_cq_overflow(ctx); @@ -1915,13 +2280,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) all_flushed = list_empty(&ctx->cq_overflow_list); if (all_flushed) { - clear_bit(0, &ctx->check_cq_overflow); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); + clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); + atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - if (posted) - io_commit_cqring(ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -1932,7 +2295,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) { bool ret = true; - if (test_bit(0, &ctx->check_cq_overflow)) { + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); @@ -1944,19 +2307,23 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) return 
ret; } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) +static void __io_put_task(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; - if (likely(task == current)) { - tctx->cached_refs += nr; - } else { - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); - } + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); +} + +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + if (likely(task == current)) + task->io_uring->cached_refs += nr; + else + __io_put_task(task, nr); } static void io_task_refs_refill(struct io_uring_task *tctx) @@ -1990,11 +2357,18 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) + s32 res, u32 cflags, u64 extra1, + u64 extra2) { struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + + if (is_cqe32) + ocq_size += sizeof(struct io_uring_cqe); - ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); + ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { /* * If we're in ring overflow flush mode, or in task cancel mode, @@ -2002,17 +2376,21 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, * on the floor. */ io_account_cq_overflow(ctx); + set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } if (list_empty(&ctx->cq_overflow_list)) { - set_bit(0, &ctx->check_cq_overflow); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); + set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); + atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } ocqe->cqe.user_data = user_data; ocqe->cqe.res = res; ocqe->cqe.flags = cflags; + if (is_cqe32) { + ocqe->cqe.big_cqe[0] = extra1; + ocqe->cqe.big_cqe[1] = extra2; + } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } @@ -2034,42 +2412,114 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, WRITE_ONCE(cqe->flags, cflags); return true; } - return io_cqring_event_overflow(ctx, user_data, res, cflags); + return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); +} + +static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_uring_cqe *cqe; + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, 0, 0); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. 
+ */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(*cqe)); + return true; + } + return io_cqring_event_overflow(ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, 0, 0); +} + +static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_uring_cqe *cqe; + u64 extra1 = req->extra1; + u64 extra2 = req->extra2; + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, extra1, extra2); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); + cqe->big_cqe[0] = extra1; + cqe->big_cqe[1] = extra2; + return true; + } + + return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res, + req->cqe.flags, extra1, extra2); } static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { - trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); - return __io_fill_cqe(req->ctx, req->user_data, res, cflags); + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0); + return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags); } -static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) +static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags, + u64 extra1, u64 extra2) { - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, res, cflags); + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_cqe *cqe; + + if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) + return; + if (req->flags & REQ_F_CQE_SKIP) + return; + + trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags, + extra1, extra2); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + WRITE_ONCE(cqe->user_data, req->cqe.user_data); + WRITE_ONCE(cqe->res, res); + WRITE_ONCE(cqe->flags, cflags); + WRITE_ONCE(cqe->big_cqe[0], extra1); + WRITE_ONCE(cqe->big_cqe[1], extra2); + return; + } + + io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2); } static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { ctx->cq_extra++; - trace_io_uring_complete(ctx, NULL, user_data, res, cflags); + trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); return __io_fill_cqe(ctx, user_data, res, cflags); } -static void __io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static void __io_req_complete_put(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. */ if (req_ref_put_and_test(req)) { - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { + struct io_ring_ctx *ctx = req->ctx; + + if (req->flags & IO_REQ_LINK_FLAGS) { if (req->flags & IO_DISARM_MASK) io_disarm_next(req); if (req->link) { @@ -2077,7 +2527,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, req->link = NULL; } } - io_req_put_rsrc(req, ctx); + io_req_put_rsrc(req); /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. 
Clean them here to avoid @@ -2091,8 +2541,23 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, } } -static void io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static void __io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) +{ + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(req, res, cflags); + __io_req_complete_put(req); +} + +static void __io_req_complete_post32(struct io_kiocb *req, s32 res, + u32 cflags, u64 extra1, u64 extra2) +{ + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe32_req(req, res, cflags, extra1, extra2); + __io_req_complete_put(req); +} + +static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; @@ -2103,11 +2568,23 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, io_cqring_ev_posted(ctx); } +static void io_req_complete_post32(struct io_kiocb *req, s32 res, + u32 cflags, u64 extra1, u64 extra2) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + __io_req_complete_post32(req, res, cflags, extra1, extra2); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + static inline void io_req_complete_state(struct io_kiocb *req, s32 res, u32 cflags) { - req->result = res; - req->cflags = cflags; + req->cqe.res = res; + req->cqe.flags = cflags; req->flags |= REQ_F_COMPLETE_INLINE; } @@ -2120,8 +2597,23 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, io_req_complete_post(req, res, cflags); } +static inline void __io_req_complete32(struct io_kiocb *req, + unsigned int issue_flags, s32 res, + u32 cflags, u64 extra1, u64 extra2) +{ + if (issue_flags & IO_URING_F_COMPLETE_DEFER) { + io_req_complete_state(req, res, cflags); + req->extra1 = extra1; + req->extra2 = extra2; + } else { + io_req_complete_post32(req, res, cflags, extra1, extra2); + } +} + static inline void io_req_complete(struct io_kiocb *req, s32 res) { + if (res < 0) + req_set_fail(req); __io_req_complete(req, 0, res, 0); } @@ -2131,17 +2623,6 @@ static void io_req_complete_failed(struct io_kiocb *req, s32 res) io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); } -static void io_req_complete_fail_submit(struct io_kiocb *req) -{ - /* - * We don't submit, fail them all, for that replace hardlinks with - * normal links. Extra REQ_F_LINK is tolerated. - */ - req->flags &= ~REQ_F_HARDLINK; - req->flags |= REQ_F_LINK; - io_req_complete_failed(req, req->result); -} - /* * Don't initialise the fields below on every allocation, but do that in * advance and keep them valid across allocations. @@ -2152,7 +2633,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) req->link = NULL; req->async_data = NULL; /* not necessary, but safer to zero */ - req->result = 0; + req->cqe.res = 0; } static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, @@ -2164,19 +2645,9 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, spin_unlock(&ctx->completion_lock); } -/* Returns true IFF there are requests in the cache */ -static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) +static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) { - struct io_submit_state *state = &ctx->submit_state; - - /* - * If we have more than a batch's worth of requests in our IRQ side - * locked cache, grab the lock and move them over to our submission - * side cache. 
- */ - if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) - io_flush_cached_locked_reqs(ctx, state); - return !!state->free_list.next; + return !ctx->submit_state.free_list.next; } /* @@ -2188,14 +2659,20 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - struct io_submit_state *state = &ctx->submit_state; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; void *reqs[IO_REQ_ALLOC_BATCH]; - struct io_kiocb *req; int ret, i; - if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) - return true; + /* + * If we have more than a batch's worth of requests in our IRQ side + * locked cache, grab the lock and move them over to our submission + * side cache. + */ + if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { + io_flush_cached_locked_reqs(ctx, &ctx->submit_state); + if (!io_req_cache_empty(ctx)) + return true; + } ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); @@ -2212,17 +2689,17 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) percpu_ref_get_many(&ctx->refs, ret); for (i = 0; i < ret; i++) { - req = reqs[i]; + struct io_kiocb *req = reqs[i]; io_preinit_req(req, ctx); - wq_stack_add_head(&req->comp_list, &state->free_list); + io_req_add_to_cache(req, ctx); } return true; } static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) { - if (unlikely(!ctx->submit_state.free_list.next)) + if (unlikely(io_req_cache_empty(ctx))) return __io_alloc_req_refill(ctx); return true; } @@ -2251,11 +2728,11 @@ static inline void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); } -static __cold void __io_free_req(struct io_kiocb *req) +static __cold void io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - io_req_put_rsrc(req, ctx); + io_req_put_rsrc(req); io_dismantle_req(req); io_put_task(req->task, 1); @@ -2273,7 +2750,7 @@ static inline void io_remove_next_linked(struct io_kiocb *req) nxt->link = NULL; } -static bool io_kill_linked_timeout(struct io_kiocb *req) +static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { @@ -2286,13 +2763,10 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&link->timeout.list); - /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ - io_fill_cqe_req(link, -ECANCELED, 0); - io_put_req_deferred(link); - return true; + return link; } } - return false; + return NULL; } static void io_fail_links(struct io_kiocb *req) @@ -2306,19 +2780,19 @@ static void io_fail_links(struct io_kiocb *req) long res = -ECANCELED; if (link->flags & REQ_F_FAIL) - res = link->result; + res = link->cqe.res; nxt = link->link; link->link = NULL; - trace_io_uring_fail_link(req->ctx, req, req->user_data, + trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, req->opcode, link); - if (!ignore_cqes) { + if (ignore_cqes) + link->flags |= REQ_F_CQE_SKIP; + else link->flags &= ~REQ_F_CQE_SKIP; - io_fill_cqe_req(link, res, 0); - } - io_put_req_deferred(link); + __io_req_complete_post(link, res, 0); link = nxt; } } @@ -2326,25 +2800,27 @@ static void io_fail_links(struct io_kiocb *req) static bool io_disarm_next(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { + struct io_kiocb *link = NULL; bool posted = false; if (req->flags & REQ_F_ARM_LTIMEOUT) { - struct io_kiocb *link = req->link; - + link = req->link; req->flags &= 
~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); - /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ - io_fill_cqe_req(link, -ECANCELED, 0); - io_put_req_deferred(link); + io_req_tw_post_queue(link, -ECANCELED, 0); posted = true; } } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); - posted = io_kill_linked_timeout(req); + link = io_disarm_linked_timeout(req); spin_unlock_irq(&ctx->timeout_lock); + if (link) { + posted = true; + io_req_tw_post_queue(link, -ECANCELED, 0); + } } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) { @@ -2361,8 +2837,7 @@ static void __io_req_find_next_prep(struct io_kiocb *req) spin_lock(&ctx->completion_lock); posted = io_disarm_next(req); - if (posted) - io_commit_cqring(ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -2372,8 +2847,6 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt; - if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) - return NULL; /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other @@ -2391,6 +2864,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (*locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); @@ -2434,7 +2909,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (likely(*uring_locked)) req->io_task_work.func(req, uring_locked); else - __io_req_complete_post(req, req->result, + __io_req_complete_post(req, req->cqe.res, io_put_kbuf_comp(req)); node = next; } while (node); @@ -2475,15 +2950,11 @@ static void tctx_task_work(struct callback_head *cb) while (1) { struct io_wq_work_node *node1, *node2; - if (!tctx->task_list.first && - !tctx->prior_task_list.first && uring_locked) - io_submit_flush_completions(ctx); - spin_lock_irq(&tctx->task_lock); - node1 = tctx->prior_task_list.first; + node1 = tctx->prio_task_list.first; node2 = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prior_task_list); + INIT_WQ_LIST(&tctx->prio_task_list); if (!node2 && !node1) tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); @@ -2492,10 +2963,13 @@ static void tctx_task_work(struct callback_head *cb) if (node1) handle_prev_tw_list(node1, &ctx, &uring_locked); - if (node2) handle_tw_list(node2, &ctx, &uring_locked); cond_resched(); + + if (data_race(!tctx->task_list.first) && + data_race(!tctx->prio_task_list.first) && uring_locked) + io_submit_flush_completions(ctx); } ctx_flush_and_put(ctx, &uring_locked); @@ -2505,24 +2979,19 @@ static void tctx_task_work(struct callback_head *cb) io_uring_drop_tctx_refs(current); } -static void io_req_task_work_add(struct io_kiocb *req, bool priority) +static void __io_req_task_work_add(struct io_kiocb *req, + struct io_uring_task *tctx, + struct io_wq_work_list *list) { - struct task_struct *tsk = req->task; - struct io_uring_task *tctx = tsk->io_uring; - enum task_work_notify_mode notify; + struct io_ring_ctx *ctx = req->ctx; struct io_wq_work_node *node; unsigned long flags; bool running; - WARN_ON_ONCE(!tctx); - io_drop_inflight_file(req); spin_lock_irqsave(&tctx->task_lock, flags); - if (priority) - wq_list_add_tail(&req->io_task_work.node, 
&tctx->prior_task_list); - else - wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); + wq_list_add_tail(&req->io_task_work.node, list); running = tctx->task_running; if (!running) tctx->task_running = true; @@ -2532,22 +3001,15 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) if (running) return; - /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. For - * all other cases, use TWA_SIGNAL unconditionally to ensure we're - * processing task_work. There's no reliable way to tell if TWA_RESUME - * will do the job. - */ - notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; - if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { - if (notify == TWA_NONE) - wake_up_process(tsk); + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + + if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; - } spin_lock_irqsave(&tctx->task_lock, flags); tctx->task_running = false; - node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list); + node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); while (node) { @@ -2559,47 +3021,73 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) } } -static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +static void io_req_task_work_add(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + struct io_uring_task *tctx = req->task->io_uring; + + __io_req_task_work_add(req, tctx, &tctx->task_list); +} + +static void io_req_task_prio_work_add(struct io_kiocb *req) +{ + struct io_uring_task *tctx = req->task->io_uring; + if (req->ctx->flags & IORING_SETUP_SQPOLL) + __io_req_task_work_add(req, tctx, &tctx->prio_task_list); + else + __io_req_task_work_add(req, tctx, &tctx->task_list); +} + +static void io_req_tw_post(struct io_kiocb *req, bool *locked) +{ + io_req_complete_post(req, req->cqe.res, req->cqe.flags); +} + +static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) +{ + req->cqe.res = res; + req->cqe.flags = cflags; + req->io_task_work.func = io_req_tw_post; + io_req_task_work_add(req); +} + +static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +{ /* not needed for normal modes, but SQPOLL depends on it */ - io_tw_lock(ctx, locked); - io_req_complete_failed(req, req->result); + io_tw_lock(req->ctx, locked); + io_req_complete_failed(req, req->cqe.res); } static void io_req_task_submit(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; - - io_tw_lock(ctx, locked); + io_tw_lock(req->ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ if (likely(!(req->task->flags & PF_EXITING))) - __io_queue_sqe(req); + io_queue_sqe(req); else io_req_complete_failed(req, -EFAULT); } static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { - req->result = ret; + req->cqe.res = ret; req->io_task_work.func = io_req_task_cancel; - io_req_task_work_add(req, false); + io_req_task_work_add(req); } static void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; - io_req_task_work_add(req, false); + io_req_task_work_add(req); } static void io_req_task_queue_reissue(struct io_kiocb *req) { - req->io_task_work.func = io_queue_async_work; - io_req_task_work_add(req, false); + req->io_task_work.func = io_queue_iowq; + io_req_task_work_add(req); } -static inline void io_queue_next(struct io_kiocb *req) +static void 
io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -2607,17 +3095,6 @@ static inline void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } -static void io_free_req(struct io_kiocb *req) -{ - io_queue_next(req); - __io_free_req(req); -} - -static void io_free_req_work(struct io_kiocb *req, bool *locked) -{ - io_free_req(req); -} - static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) @@ -2629,15 +3106,30 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (unlikely(req->flags & REQ_F_REFCOUNT)) { - node = req->comp_list.next; - if (!req_ref_put_and_test(req)) - continue; + if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { + if (req->flags & REQ_F_REFCOUNT) { + node = req->comp_list.next; + if (!req_ref_put_and_test(req)) + continue; + } + if ((req->flags & REQ_F_POLLED) && req->apoll) { + struct async_poll *apoll = req->apoll; + + if (apoll->double_poll) + kfree(apoll->double_poll); + list_add(&apoll->poll.wait.entry, + &ctx->apoll_cache); + req->flags &= ~REQ_F_POLLED; + } + if (req->flags & IO_REQ_LINK_FLAGS) + io_queue_next(req); + if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) + io_clean_op(req); } + if (!(req->flags & REQ_F_FIXED_FILE)) + io_put_file(req->file); io_req_put_rsrc_locked(req, ctx); - io_queue_next(req); - io_dismantle_req(req); if (req->task != task) { if (task) @@ -2647,7 +3139,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, } task_refs++; node = req->comp_list.next; - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + io_req_add_to_cache(req, ctx); } while (node); if (task) @@ -2666,16 +3158,11 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, req->result, req->cflags); - if ((req->flags & REQ_F_POLLED) && req->apoll) { - struct async_poll *apoll = req->apoll; - - if (apoll->double_poll) - kfree(apoll->double_poll); - list_add(&apoll->poll.wait.entry, - &ctx->apoll_cache); - req->flags &= ~REQ_F_POLLED; + if (!(req->flags & REQ_F_CQE_SKIP)) { + if (!(ctx->flags & IORING_SETUP_CQE32)) + __io_fill_cqe_req_filled(ctx, req); + else + __io_fill_cqe32_req_filled(ctx, req); } } @@ -2698,23 +3185,18 @@ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) struct io_kiocb *nxt = NULL; if (req_ref_put_and_test(req)) { - nxt = io_req_find_next(req); - __io_free_req(req); + if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) + nxt = io_req_find_next(req); + io_free_req(req); } return nxt; } static inline void io_put_req(struct io_kiocb *req) { - if (req_ref_put_and_test(req)) - io_free_req(req); -} - -static inline void io_put_req_deferred(struct io_kiocb *req) -{ if (req_ref_put_and_test(req)) { - req->io_task_work.func = io_free_req_work; - io_req_task_work_add(req, false); + io_queue_next(req); + io_free_req(req); } } @@ -2800,7 +3282,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) nr_events++; if (unlikely(req->flags & REQ_F_CQE_SKIP)) continue; - __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0)); + __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0)); } if (unlikely(!nr_events)) @@ -2846,22 +3328,26 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { unsigned int nr_events = 0; int ret = 0; + unsigned long check_cq; /* - * We disallow 
the app entering submit/complete with polling, but we - * still need to lock the ring to prevent racing with polled issue - * that got punted to a workqueue. - */ - mutex_lock(&ctx->uring_lock); - /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (test_bit(0, &ctx->check_cq_overflow)) + check_cq = READ_ONCE(ctx->check_cq); + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) __io_cqring_overflow_flush(ctx, false); if (io_cqring_events(ctx)) - goto out; + return 0; + + /* + * Similarly do not spin if we have not informed the user of any + * dropped CQE. + */ + if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) + return -EBADR; + do { /* * If a submit got punted to a workqueue, we can have the @@ -2891,8 +3377,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) nr_events += ret; ret = 0; } while (nr_events < min && !need_resched()); -out: - mutex_unlock(&ctx->uring_lock); + return ret; } @@ -2965,21 +3450,21 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) } else { fsnotify_access(req->file); } - if (unlikely(res != req->result)) { + if (unlikely(res != req->cqe.res)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; return true; } req_set_fail(req); - req->result = res; + req->cqe.res = res; } return false; } static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - int res = req->result; + int res = req->cqe.res; if (*locked) { io_req_complete_state(req, res, io_put_kbuf(req, 0)); @@ -2995,7 +3480,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->result, + __io_req_complete(req, issue_flags, req->cqe.res, io_put_kbuf(req, issue_flags)); } @@ -3005,9 +3490,9 @@ static void io_complete_rw(struct kiocb *kiocb, long res) if (__io_complete_rw_common(req, res)) return; - req->result = res; + req->cqe.res = res; req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); + io_req_task_prio_work_add(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) @@ -3016,12 +3501,12 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (unlikely(res != req->result)) { + if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; return; } - req->result = res; + req->cqe.res = res; } /* order with io_iopoll_complete() checking ->iopoll_completed */ @@ -3131,6 +3616,8 @@ static unsigned int io_file_get_flags(struct file *file) res |= FFS_ISREG; if (__io_file_supports_nowait(file, mode)) res |= FFS_NOWAIT; + if (io_file_need_scm(file)) + res |= FFS_SCM; return res; } @@ -3162,6 +3649,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); req->rw.flags = READ_ONCE(sqe->rw_flags); + /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -3310,77 +3798,96 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, return __io_import_fixed(req, rw, iter, imu); } -static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) -{ - if (needs_lock) - 
mutex_unlock(&ctx->uring_lock); -} - -static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) +static int io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) { - /* - * "Normal" inline submissions always hold the uring_lock, since we - * grab it from the system call. Same is true for the SQPOLL offload. - * The only exception is when we've detached the request and issue it - * from an async worker thread, grab the lock for that case. - */ - if (needs_lock) - mutex_lock(&ctx->uring_lock); -} - -static void io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) -{ - struct list_head *list; - - list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; - INIT_LIST_HEAD(&bl->buf_list); bl->bgid = bgid; - list_add(&bl->list, list); + if (bgid < BGID_ARRAY) + return 0; + + return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } -static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, - int bgid, unsigned int issue_flags) +static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl) { - struct io_buffer *kbuf = req->kbuf; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - - if (req->flags & REQ_F_BUFFER_SELECTED) - return kbuf; + if (!list_empty(&bl->buf_list)) { + struct io_buffer *kbuf; - io_ring_submit_lock(ctx, needs_lock); - - lockdep_assert_held(&ctx->uring_lock); - - bl = io_buffer_get_list(ctx, bgid); - if (bl && !list_empty(&bl->buf_list)) { kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); if (*len > kbuf->len) *len = kbuf->len; req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; + req->buf_index = kbuf->bid; + return u64_to_user_ptr(kbuf->addr); + } + return NULL; +} + +static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) +{ + struct io_uring_buf_ring *br = bl->buf_ring; + struct io_uring_buf *buf; + __u32 head = bl->head; + + if (unlikely(smp_load_acquire(&br->tail) == head)) { + io_ring_submit_unlock(req->ctx, issue_flags); + return NULL; + } + + head &= bl->mask; + if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + buf = &br->bufs[head]; } else { - kbuf = ERR_PTR(-ENOBUFS); + int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); + int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1; + buf = page_address(bl->buf_pages[index]); + buf += off; } + if (*len > buf->len) + *len = buf->len; + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + req->buf_index = buf->bid; - io_ring_submit_unlock(req->ctx, needs_lock); - return kbuf; + if (issue_flags & IO_URING_F_UNLOCKED) { + /* + * If we came in unlocked, we have no choice but to consume the + * buffer here. This does mean it'll be pinned until the IO + * completes. But coming in unlocked means we're in io-wq + * context, hence there should be no further retry. For the + * locked case, the caller must ensure to call the commit when + * the transfer completes (or if we get -EAGAIN and must poll + * or retry). 
+ */ + req->buf_list = NULL; + bl->head++; + } + return u64_to_user_ptr(buf->addr); } -static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) +static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags) { - struct io_buffer *kbuf; - u16 bgid; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + void __user *ret = NULL; + + io_ring_submit_lock(req->ctx, issue_flags); - bgid = req->buf_index; - kbuf = io_buffer_select(req, len, bgid, issue_flags); - if (IS_ERR(kbuf)) - return kbuf; - return u64_to_user_ptr(kbuf->addr); + bl = io_buffer_get_list(ctx, req->buf_index); + if (likely(bl)) { + if (bl->buf_nr_pages) + ret = io_ring_buffer_select(req, len, bl, issue_flags); + else + ret = io_provided_buffer_select(req, len, bl); + } + io_ring_submit_unlock(req->ctx, issue_flags); + return ret; } #ifdef CONFIG_COMPAT @@ -3390,7 +3897,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, struct compat_iovec __user *uiov; compat_ssize_t clen; void __user *buf; - ssize_t len; + size_t len; uiov = u64_to_user_ptr(req->rw.addr); if (!access_ok(uiov, sizeof(*uiov))) @@ -3401,11 +3908,12 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, return -EINVAL; len = clen; - buf = io_rw_buffer_select(req, &len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + req->rw.addr = (unsigned long) buf; iov[0].iov_base = buf; - iov[0].iov_len = (compat_size_t) len; + req->rw.len = iov[0].iov_len = (compat_size_t) len; return 0; } #endif @@ -3423,22 +3931,21 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, len = iov[0].iov_len; if (len < 0) return -EINVAL; - buf = io_rw_buffer_select(req, &len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + req->rw.addr = (unsigned long) buf; iov[0].iov_base = buf; - iov[0].iov_len = len; + req->rw.len = iov[0].iov_len = len; return 0; } static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { - if (req->flags & REQ_F_BUFFER_SELECTED) { - struct io_buffer *kbuf = req->kbuf; - - iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - iov[0].iov_len = kbuf->len; + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { + iov[0].iov_base = u64_to_user_ptr(req->rw.addr); + iov[0].iov_len = req->rw.len; return 0; } if (req->rw.len != 1) @@ -3452,6 +3959,13 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, issue_flags); } +static inline bool io_do_buffer_select(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return false; + return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); +} + static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, struct io_rw_state *s, unsigned int issue_flags) @@ -3470,18 +3984,15 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, return NULL; } - /* buffer index only valid with fixed read/write, or buffer select */ - if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) - return ERR_PTR(-EINVAL); - buf = u64_to_user_ptr(req->rw.addr); sqe_len = req->rw.len; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - if (req->flags & REQ_F_BUFFER_SELECT) { - buf = io_rw_buffer_select(req, &sqe_len, issue_flags); 
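	/*
	 * (Editor's aside -- an illustrative sketch, not part of the diff.)
	 * The producer/consumer contract io_ring_buffer_select() above relies
	 * on, assuming userspace publishes buffers by bumping ->tail with a
	 * release store:
	 *
	 *   producer (app):    br->bufs[tail & mask] = { addr, len, bid };
	 *                      smp_store_release(&br->tail, tail + 1);
	 *   consumer (kernel): if (smp_load_acquire(&br->tail) == bl->head)
	 *                          no buffer is available -> -ENOBUFS path;
	 *                      else pick bufs[head & mask] and bump bl->head
	 *                          once the buffer has been consumed.
	 *
	 * The selected buffer id then travels back to userspace in cqe->flags
	 * as IORING_CQE_F_BUFFER | (bid << IORING_CQE_BUFFER_SHIFT), built by
	 * __io_put_kbuf() earlier in this patch.
	 */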
- if (IS_ERR(buf)) - return ERR_CAST(buf); + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return ERR_PTR(-ENOBUFS); + req->rw.addr = (unsigned long) buf; req->rw.len = sqe_len; } @@ -3836,7 +4347,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kfree(iovec); return ret; } - req->result = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&s->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ @@ -3852,7 +4363,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); - ret = rw_verify_area(READ, req->file, ppos, req->result); + ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); if (unlikely(ret)) { kfree(iovec); return ret; @@ -3874,7 +4385,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = 0; } else if (ret == -EIOCBQUEUED) { goto out_free; - } else if (ret == req->result || ret <= 0 || !force_nonblock || + } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { /* read all, failed, already did sync or don't want to retry */ goto done; @@ -3964,7 +4475,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kfree(iovec); return ret; } - req->result = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&s->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ @@ -3984,7 +4495,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); - ret = rw_verify_area(WRITE, req->file, ppos, req->result); + ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); if (unlikely(ret)) goto out_free; @@ -4048,9 +4559,7 @@ static int io_renameat_prep(struct io_kiocb *req, struct io_rename *ren = &req->rename; const char __user *oldf, *newf; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4087,22 +4596,257 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) ren->newpath, ren->flags); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } +static inline void __io_xattr_finish(struct io_kiocb *req) +{ + struct io_xattr *ix = &req->xattr; + + if (ix->filename) + putname(ix->filename); + + kfree(ix->ctx.kname); + kvfree(ix->ctx.kvalue); +} + +static void io_xattr_finish(struct io_kiocb *req, int ret) +{ + req->flags &= ~REQ_F_NEED_CLEANUP; + + __io_xattr_finish(req); + io_req_complete(req, ret); +} + +static int __io_getxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *name; + int ret; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + ix->ctx.kvalue = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + if (ix->ctx.flags) + return -EINVAL; + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = strncpy_from_user(ix->ctx.kname->name, name, + sizeof(ix->ctx.kname->name)); + if (!ret || ret == sizeof(ix->ctx.kname->name)) + ret = -ERANGE; + if (ret < 0) 
{ + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_fgetxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return __io_getxattr_prep(req, sqe); +} + +static int io_getxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *path; + int ret; + + ret = __io_getxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = &req->xattr; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), + req->file->f_path.dentry, + &ix->ctx); + + io_xattr_finish(req, ret); + return 0; +} + +static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = &req->xattr; + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = do_getxattr(mnt_user_ns(path.mnt), + path.dentry, + &ix->ctx); + + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return 0; +} + +static int __io_setxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *name; + int ret; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.kvalue = NULL; + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = setxattr_copy(name, &ix->ctx); + if (ret) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_setxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *path; + int ret; + + ret = __io_setxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +static int io_fsetxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return __io_setxattr_prep(req, sqe); +} + +static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, + struct path *path) +{ + struct io_xattr *ix = &req->xattr; + int ret; + + ret = mnt_want_write(path->mnt); + if (!ret) { + ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); + mnt_drop_write(path->mnt); + } + + return ret; +} + +static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = __io_setxattr(req, issue_flags, &req->file->f_path); + io_xattr_finish(req, ret); + + return 0; +} + +static int io_setxattr(struct io_kiocb *req, unsigned int 
issue_flags) +{ + struct io_xattr *ix = &req->xattr; + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = __io_setxattr(req, issue_flags, &path); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return 0; +} + static int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_unlink *un = &req->unlink; const char __user *fname; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4136,8 +4880,6 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) ret = do_unlinkat(un->dfd, un->filename); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4148,10 +4890,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, struct io_mkdir *mkd = &req->mkdir; const char __user *fname; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4179,8 +4918,6 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4191,10 +4928,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, struct io_symlink *sl = &req->symlink; const char __user *oldpath, *newpath; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4228,8 +4962,6 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4240,9 +4972,7 @@ static int io_linkat_prep(struct io_kiocb *req, struct io_hardlink *lnk = &req->hardlink; const char __user *oldf, *newf; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4279,9 +5009,97 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) lnk->newpath, lnk->flags); req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_complete(req, ret); + return 0; +} + +static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +{ + req->uring_cmd.task_work_cb(&req->uring_cmd); +} + +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ + struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, 
uring_cmd); + + req->uring_cmd.task_work_cb = task_work_cb; + req->io_task_work.func = io_uring_cmd_work; + io_req_task_prio_work_add(req); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); + +/* + * Called by consumers of io_uring_cmd, if they originally returned + * -EIOCBQUEUED upon receiving the command. + */ +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) +{ + struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + if (ret < 0) req_set_fail(req); - io_req_complete(req, ret); + if (req->ctx->flags & IORING_SETUP_CQE32) + __io_req_complete32(req, 0, ret, 0, res2, 0); + else + io_req_complete(req, ret); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_done); + +static int io_uring_cmd_prep_async(struct io_kiocb *req) +{ + size_t cmd_size; + + cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); + + memcpy(req->async_data, req->uring_cmd.cmd, cmd_size); + return 0; +} + +static int io_uring_cmd_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_uring_cmd *ioucmd = &req->uring_cmd; + + if (sqe->rw_flags) + return -EINVAL; + ioucmd->cmd = sqe->cmd; + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); + return 0; +} + +static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = &req->uring_cmd; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (!req->file->f_op->uring_cmd) + return -EOPNOTSUPP; + + if (ctx->flags & IORING_SETUP_SQE128) + issue_flags |= IO_URING_F_SQE128; + if (ctx->flags & IORING_SETUP_CQE32) + issue_flags |= IO_URING_F_CQE32; + if (ctx->flags & IORING_SETUP_IOPOLL) + issue_flags |= IO_URING_F_IOPOLL; + + if (req_has_async_data(req)) + ioucmd->cmd = req->async_data; + + ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ret == -EAGAIN) { + if (!req_has_async_data(req)) { + if (io_alloc_async_data(req)) + return -ENOMEM; + io_uring_cmd_prep_async(req); + } + return -EAGAIN; + } + + if (ret != -EIOCBQUEUED) + io_uring_cmd_done(ioucmd, ret, 0); return 0; } @@ -4289,9 +5107,7 @@ static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; @@ -4316,8 +5132,6 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; ret = __sys_shutdown_sock(sock, req->shutdown.how); - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; #else @@ -4331,9 +5145,6 @@ static int __io_splice_prep(struct io_kiocb *req, struct io_splice *sp = &req->splice; unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - sp->len = READ_ONCE(sqe->len); sp->flags = READ_ONCE(sqe->splice_flags); if (unlikely(sp->flags & ~valid_flags)) @@ -4378,7 +5189,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) done: if (ret != sp->len) req_set_fail(req); - io_req_complete(req, ret); + __io_req_complete(req, 0, ret, 0); return 0; } @@ -4423,7 +5234,20 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) done: if (ret != sp->len) req_set_fail(req); - io_req_complete(req, ret); + __io_req_complete(req, 0, ret, 0); + return 0; +} + +static int io_nop_prep(struct io_kiocb *req, const struct 
io_uring_sqe *sqe) +{ + /* + * If the ring is setup with CQE32, relay back addr/addr + */ + if (req->ctx->flags & IORING_SETUP_CQE32) { + req->nop.extra1 = READ_ONCE(sqe->addr); + req->nop.extra2 = READ_ONCE(sqe->addr2); + } + return 0; } @@ -4432,20 +5256,31 @@ done: */ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; + unsigned int cflags; + void __user *buf; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; + if (req->flags & REQ_F_BUFFER_SELECT) { + size_t len = 1; - __io_req_complete(req, issue_flags, 0, 0); + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + } + + cflags = io_put_kbuf(req, issue_flags); + if (!(req->ctx->flags & IORING_SETUP_CQE32)) + __io_req_complete(req, issue_flags, 0, cflags); + else + __io_req_complete32(req, issue_flags, 0, cflags, + req->nop.extra1, req->nop.extra2); return 0; } static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags || - sqe->splice_fd_in || sqe->buf_index || sqe->personality)) + if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || + sqe->buf_index || sqe->personality)) return -EINVAL; req->msg.user_data = READ_ONCE(sqe->off); @@ -4481,17 +5316,15 @@ done: if (ret < 0) req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); + /* put file to avoid an attempt to IOPOLL the req */ + io_put_file(req->file); + req->file = NULL; return 0; } static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || - sqe->splice_fd_in)) + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->sync.flags = READ_ONCE(sqe->fsync_flags); @@ -4515,8 +5348,6 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fsync_range(req->file, req->sync.off, end > 0 ? 
end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4524,10 +5355,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || - sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -4545,9 +5373,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); - if (ret < 0) - req_set_fail(req); - else + if (ret >= 0) fsnotify_modify(req->file); io_req_complete(req, ret); return 0; @@ -4558,9 +5384,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe const char __user *fname; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4615,6 +5439,61 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_openat_prep(req, sqe); } +static int io_file_bitmap_get(struct io_ring_ctx *ctx) +{ + struct io_file_table *table = &ctx->file_table; + unsigned long nr = ctx->nr_user_files; + int ret; + + if (table->alloc_hint >= nr) + table->alloc_hint = 0; + + do { + ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); + if (ret != nr) { + table->alloc_hint = ret + 1; + return ret; + } + if (!table->alloc_hint) + break; + + nr = table->alloc_hint; + table->alloc_hint = 0; + } while (1); + + return -ENFILE; +} + +static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) +{ + bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; + struct io_ring_ctx *ctx = req->ctx; + int ret; + + if (alloc_slot) { + io_ring_submit_lock(ctx, issue_flags); + ret = io_file_bitmap_get(ctx); + if (unlikely(ret < 0)) { + io_ring_submit_unlock(ctx, issue_flags); + return ret; + } + + file_slot = ret; + } else { + file_slot--; + } + + ret = io_install_fixed_file(req, file, issue_flags, file_slot); + if (alloc_slot) { + io_ring_submit_unlock(ctx, issue_flags); + if (!ret) + return file_slot; + } + + return ret; +} + static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; @@ -4670,8 +5549,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if (!fixed) fd_install(ret, file); else - ret = io_install_fixed_file(req, file, issue_flags, - req->open.file_slot - 1); + ret = io_fixed_fd_install(req, issue_flags, file, + req->open.file_slot); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -4692,7 +5571,7 @@ static int io_remove_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || sqe->splice_fd_in) return -EINVAL; @@ -4715,6 +5594,20 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; + if (bl->buf_nr_pages) { + int j; + + i = bl->buf_ring->tail - bl->head; + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + 
kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + /* make sure it's seen as empty */ + INIT_LIST_HEAD(&bl->buf_list); + return i; + } + /* the head kbuf is the list itself */ while (!list_empty(&bl->buf_list)) { struct io_buffer *nxt; @@ -4736,22 +5629,23 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - - io_ring_submit_lock(ctx, needs_lock); - lockdep_assert_held(&ctx->uring_lock); + io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; bl = io_buffer_get_list(ctx, p->bgid); - if (bl) - ret = __io_remove_buffers(ctx, bl, p->nbufs); + if (bl) { + ret = -EINVAL; + /* can't use provide/remove buffers command on mapped buffers */ + if (!bl->buf_nr_pages) + ret = __io_remove_buffers(ctx, bl, p->nbufs); + } if (ret < 0) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); return 0; } @@ -4762,7 +5656,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -4859,26 +5753,56 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, return i ? 0 : -ENOMEM; } +static __cold int io_init_bl_list(struct io_ring_ctx *ctx) +{ + int i; + + ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), + GFP_KERNEL); + if (!ctx->io_bl) + return -ENOMEM; + + for (i = 0; i < BGID_ARRAY; i++) { + INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); + ctx->io_bl[i].bgid = i; + } + + return 0; +} + static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); - lockdep_assert_held(&ctx->uring_lock); + if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { + ret = io_init_bl_list(ctx); + if (ret) + goto err; + } bl = io_buffer_get_list(ctx, p->bgid); if (unlikely(!bl)) { - bl = kmalloc(sizeof(*bl), GFP_KERNEL); + bl = kzalloc(sizeof(*bl), GFP_KERNEL); if (!bl) { ret = -ENOMEM; goto err; } - io_buffer_add_list(ctx, bl, p->bgid); + INIT_LIST_HEAD(&bl->buf_list); + ret = io_buffer_add_list(ctx, bl, p->bgid); + if (ret) { + kfree(bl); + goto err; + } + } + /* can't add buffers via this command for a mapped buffer ring */ + if (bl->buf_nr_pages) { + ret = -EINVAL; + goto err; } ret = io_add_buffers(ctx, p, bl); @@ -4887,7 +5811,7 @@ err: req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); return 0; } @@ -4895,9 +5819,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; req->epoll.epfd = READ_ONCE(sqe->fd); @@ -4941,9 +5863,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int 
io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; req->madvise.addr = READ_ONCE(sqe->addr); @@ -4965,8 +5885,6 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; #else @@ -4976,9 +5894,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; req->fadvise.offset = READ_ONCE(sqe->off); @@ -5014,9 +5930,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { const char __user *path; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -5052,19 +5966,13 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, ctx->buffer); - - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || - sqe->rw_flags || sqe->buf_index) + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -5096,7 +6004,8 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) spin_unlock(&files->file_lock); goto err; } - file = fdt->fd[close->fd]; + file = rcu_dereference_protected(fdt->fd[close->fd], + lockdep_is_held(&files->file_lock)); if (!file || file->f_op == &io_uring_fops) { spin_unlock(&files->file_lock); file = NULL; @@ -5130,12 +6039,7 @@ err: static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || - sqe->splice_fd_in)) + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -5154,13 +6058,18 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } #if defined(CONFIG_NET) +static bool io_net_retry(struct socket *sock, int flags) +{ + if (!(flags & MSG_WAITALL)) + return false; + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; +} + static int io_setup_async_msg(struct io_kiocb *req, struct io_async_msghdr *kmsg) { @@ -5206,13 +6115,16 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if 
(unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(sqe->file_index)) return -EINVAL; if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->addr2); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -5221,12 +6133,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif + sr->done_io = 0; return 0; } static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; + struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; unsigned flags; int min_ret = 0; @@ -5245,7 +6159,11 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } - flags = req->sr_msg.msg_flags; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5258,12 +6176,21 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } req_set_fail(req); } /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5278,6 +6205,10 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) int min_ret = 0; int ret; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; @@ -5291,7 +6222,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_controllen = 0; msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5304,8 +6235,19 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } req_set_fail(req); } + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5397,14 +6339,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, return __io_recvmsg_copy_hdr(req, iomsg); } -static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - unsigned int issue_flags) -{ - struct io_sr_msg *sr = &req->sr_msg; - - return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); -} - static int io_recvmsg_prep_async(struct io_kiocb *req) { int ret; @@ -5419,14 +6353,16 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(sqe->file_index)) return -EINVAL; if 
(unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); - sr->bgid = READ_ONCE(sqe->buf_group); + sr->flags = READ_ONCE(sqe->addr2); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -5439,19 +6375,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static bool io_net_retry(struct socket *sock, int flags) -{ - if (!(flags & MSG_WAITALL)) - return false; - return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; -} - static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; - struct io_buffer *kbuf; + unsigned int cflags; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -5469,24 +6398,30 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } - if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, issue_flags); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - kmsg->fast_iov[0].iov_len = req->sr_msg.len; - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, - 1, req->sr_msg.len); + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + if (io_do_buffer_select(req)) { + void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + kmsg->fast_iov[0].iov_base = buf; + kmsg->fast_iov[0].iov_len = sr->len; + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, + sr->len); } - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, - kmsg->uaddr, flags); + kmsg->msg.msg_get_inq = 1; + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) return io_setup_async_msg(req, kmsg); @@ -5510,45 +6445,54 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); + cflags = io_put_kbuf(req, issue_flags); + if (kmsg->msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + __io_req_complete(req, issue_flags, ret, cflags); return 0; } static int io_recv(struct io_kiocb *req, unsigned int issue_flags) { - struct io_buffer *kbuf; struct io_sr_msg *sr = &req->sr_msg; struct msghdr msg; - void __user *buf = sr->buf; struct socket *sock; struct iovec iov; + unsigned int cflags; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; - if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, issue_flags); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - buf = u64_to_user_ptr(kbuf->addr); + if (io_do_buffer_select(req)) { + void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + 
sr->buf = buf; } - ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); + ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) goto out_free; msg.msg_name = NULL; + msg.msg_namelen = 0; msg.msg_control = NULL; + msg.msg_get_inq = 1; + msg.msg_flags = 0; msg.msg_controllen = 0; - msg.msg_namelen = 0; msg.msg_iocb = NULL; - msg.msg_flags = 0; - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5577,36 +6521,49 @@ out_free: ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); + cflags = io_put_kbuf(req, issue_flags); + if (msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + __io_req_complete(req, issue_flags, ret, cflags); return 0; } static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = &req->accept; + unsigned flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index) + if (sqe->len || sqe->buf_index) return -EINVAL; accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); + flags = READ_ONCE(sqe->ioprio); + if (flags & ~IORING_ACCEPT_MULTISHOT) + return -EINVAL; accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot && (accept->flags & SOCK_CLOEXEC)) - return -EINVAL; + if (accept->file_slot) { + if (accept->flags & SOCK_CLOEXEC) + return -EINVAL; + if (flags & IORING_ACCEPT_MULTISHOT && + accept->file_slot != IORING_FILE_INDEX_ALLOC) + return -EINVAL; + } if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + if (flags & IORING_ACCEPT_MULTISHOT) + req->flags |= REQ_F_APOLL_MULTISHOT; return 0; } static int io_accept(struct io_kiocb *req, unsigned int issue_flags) { + struct io_ring_ctx *ctx = req->ctx; struct io_accept *accept = &req->accept; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; @@ -5614,6 +6571,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct file *file; int ret, fd; +retry: if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); if (unlikely(fd < 0)) @@ -5625,7 +6583,89 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) if (!fixed) put_unused_fd(fd); ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) + if (ret == -EAGAIN && force_nonblock) { + /* + * if it's multishot and polled, we don't need to + * return EAGAIN to arm the poll infra since it + * has already been done + */ + if ((req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) + ret = 0; + return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_fixed_fd_install(req, issue_flags, file, + accept->file_slot); + } + + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __io_req_complete(req, issue_flags, ret, 0); + return 0; + } + if (ret >= 0) { + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, + IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) { + io_cqring_ev_posted(ctx); + goto retry; + } + ret = -ECANCELED; + } + + return ret; +} + +static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_socket *sock = &req->sock; + + if (sqe->addr || sqe->rw_flags || sqe->buf_index) + return -EINVAL; + + sock->domain = READ_ONCE(sqe->fd); + sock->type = READ_ONCE(sqe->off); + sock->protocol = READ_ONCE(sqe->len); + sock->file_slot = READ_ONCE(sqe->file_index); + sock->nofile = rlimit(RLIMIT_NOFILE); + + sock->flags = sock->type & ~SOCK_TYPE_MASK; + if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) + return -EINVAL; + if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + return 0; +} + +static int io_socket(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_socket *sock = &req->sock; + bool fixed = !!sock->file_slot; + struct file *file; + int ret, fd; + + if (!fixed) { + fd = __get_unused_fd_flags(sock->flags, sock->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = __sys_socket_file(sock->domain, sock->type, sock->protocol); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; @@ -5635,7 +6675,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) ret = fd; } else { ret = io_install_fixed_file(req, file, issue_flags, - accept->file_slot - 1); + sock->file_slot - 1); } __io_req_complete(req, issue_flags, ret, 0); return 0; @@ -5653,10 +6693,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || - sqe->splice_fd_in) + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -5729,6 +6766,7 @@ IO_NETOP_PREP_ASYNC(sendmsg); IO_NETOP_PREP_ASYNC(recvmsg); IO_NETOP_PREP_ASYNC(connect); IO_NETOP_PREP(accept); +IO_NETOP_PREP(socket); IO_NETOP_FN(send); IO_NETOP_FN(recv); #endif /* CONFIG_NET */ @@ -5779,7 +6817,7 @@ static void io_poll_req_insert(struct io_kiocb *req) struct io_ring_ctx 
*ctx = req->ctx; struct hlist_head *list; - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; hlist_add_head(&req->hash_node, list); } @@ -5838,22 +6876,23 @@ static void io_poll_remove_entries(struct io_kiocb *req) rcu_read_unlock(); } +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags); /* * All poll tw should go through this. Checks for poll events, manages * references, does rewait, etc. * * Returns a negative error on failure. >0 when no action require, which is * either spurious wakeup or multishot CQE is served. 0 when it's done with - * the request, then the mask is stored in req->result. + * the request, then the mask is stored in req->cqe.res. */ -static int io_poll_check_events(struct io_kiocb *req, bool locked) +static int io_poll_check_events(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - int v; + int v, ret; /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) - io_poll_mark_cancelled(req); + return -ECANCELED; do { v = atomic_read(&req->poll_refs); @@ -5864,32 +6903,46 @@ static int io_poll_check_events(struct io_kiocb *req, bool locked) if (v & IO_POLL_CANCEL_FLAG) return -ECANCELED; - if (!req->result) { + if (!req->cqe.res) { struct poll_table_struct pt = { ._key = req->apoll_events }; unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED; if (unlikely(!io_assign_file(req, flags))) return -EBADF; - req->result = vfs_poll(req->file, &pt) & req->apoll_events; + req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; } - /* multishot, just fill an CQE and proceed */ - if (req->result && !(req->apoll_events & EPOLLONESHOT)) { - __poll_t mask = mangle_poll(req->result & req->apoll_events); + if ((unlikely(!req->cqe.res))) + continue; + if (req->apoll_events & EPOLLONESHOT) + return 0; + + /* multishot, just fill a CQE and proceed */ + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __poll_t mask = mangle_poll(req->cqe.res & + req->apoll_events); bool filled; spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->user_data, mask, - IORING_CQE_F_MORE); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, + mask, IORING_CQE_F_MORE); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (unlikely(!filled)) - return -ECANCELED; - io_cqring_ev_posted(ctx); - } else if (req->result) { - return 0; + if (filled) { + io_cqring_ev_posted(ctx); + continue; + } + return -ECANCELED; } + io_tw_lock(req->ctx, locked); + if (unlikely(req->task->flags & PF_EXITING)) + return -EFAULT; + ret = io_issue_sqe(req, + IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + if (ret) + return ret; + /* * Release all references, retry if someone tried to restart * task_work while we were executing it. 
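
The multishot completion path in the hunks above is what userspace observes as repeated CQEs carrying IORING_CQE_F_MORE from a single submitted request. A minimal sketch of consuming a multishot accept (added earlier in this patch via IORING_ACCEPT_MULTISHOT) from userspace, assuming a liburing release that provides io_uring_prep_multishot_accept(); the helper name, queue depth, and socket setup below are illustrative assumptions, not taken from this diff:

#include <arpa/inet.h>
#include <liburing.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct sockaddr_in addr;
	int lfd;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	lfd = socket(AF_INET, SOCK_STREAM, 0);
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr.sin_port = 0;			/* any free port */
	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
	listen(lfd, 16);

	/* One SQE arms the accept; the kernel keeps posting CQEs for new
	 * connections while IORING_CQE_F_MORE is set in cqe->flags. */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_multishot_accept(sqe, lfd, NULL, NULL, 0);
	io_uring_submit(&ring);

	for (;;) {
		if (io_uring_wait_cqe(&ring, &cqe) < 0)
			break;
		if (cqe->res >= 0)
			close(cqe->res);	/* cqe->res is the accepted fd */
		else
			fprintf(stderr, "accept failed: %d\n", cqe->res);
		/* F_MORE cleared: the multishot request has terminated and
		 * would need to be re-armed with a fresh SQE. */
		if (!(cqe->flags & IORING_CQE_F_MORE)) {
			io_uring_cqe_seen(&ring, cqe);
			break;
		}
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	return 0;
}

On the kernel side, the hunks above keep the armed request alive and post an extra CQE via io_fill_cqe_aux() with IORING_CQE_F_MORE for each completion, retrying the issue from task_work; userspace therefore only needs to re-submit once the flag is no longer set.
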
@@ -5904,21 +6957,21 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req, *locked); + ret = io_poll_check_events(req, locked); if (ret > 0) return; if (!ret) { - req->result = mangle_poll(req->result & req->poll.events); + req->cqe.res = mangle_poll(req->cqe.res & req->poll.events); } else { - req->result = ret; + req->cqe.res = ret; req_set_fail(req); } io_poll_remove_entries(req); spin_lock(&ctx->completion_lock); hash_del(&req->hash_node); - __io_req_complete_post(req, req->result, 0); + __io_req_complete_post(req, req->cqe.res, 0); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -5929,7 +6982,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req, *locked); + ret = io_poll_check_events(req, locked); if (ret > 0) return; @@ -5944,9 +6997,9 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, ret); } -static void __io_poll_execute(struct io_kiocb *req, int mask, int events) +static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events) { - req->result = mask; + req->cqe.res = mask; /* * This is useful for poll that is armed on behalf of another * request, and where the wakeup path could be on a different @@ -5959,11 +7012,12 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, int events) else req->io_task_work.func = io_apoll_task_func; - trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask); - io_req_task_work_add(req, false); + trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); + io_req_task_work_add(req); } -static inline void io_poll_execute(struct io_kiocb *req, int res, int events) +static inline void io_poll_execute(struct io_kiocb *req, int res, + __poll_t events) { if (io_poll_get_ownership(req)) __io_poll_execute(req, res, events); @@ -5978,6 +7032,7 @@ static void io_poll_cancel_req(struct io_kiocb *req) #define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) #define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) +#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) @@ -6012,7 +7067,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, } /* for instances that support it check for an event match first */ - if (mask && !(mask & poll->events)) + if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) return 0; if (io_poll_get_ownership(req)) { @@ -6098,6 +7153,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, int v; INIT_HLIST_NODE(&req->hash_node); + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; @@ -6168,28 +7224,34 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI; + __poll_t mask = POLLPRI | POLLERR; int ret; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; - if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED)) + if (!file_can_poll(req->file)) return IO_APOLL_ABORTED; + if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) + return IO_APOLL_ABORTED; + if (!(req->flags & 
REQ_F_APOLL_MULTISHOT)) + mask |= EPOLLONESHOT; if (def->pollin) { - mask |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ if ((req->opcode == IORING_OP_RECVMSG) && (req->sr_msg.msg_flags & MSG_ERRQUEUE)) - mask &= ~POLLIN; + mask &= ~EPOLLIN; } else { - mask |= POLLOUT | POLLWRNORM; + mask |= EPOLLOUT | EPOLLWRNORM; } if (def->poll_exclusive) mask |= EPOLLEXCLUSIVE; - if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { + if (req->flags & REQ_F_POLLED) { + apoll = req->apoll; + } else if (!(issue_flags & IO_URING_F_UNLOCKED) && + !list_empty(&ctx->apoll_cache)) { apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, poll.wait.entry); list_del_init(&apoll->poll.wait.entry); @@ -6209,7 +7271,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) if (ret || ipt.error) return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; - trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode, + trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, mask, apoll->poll.events); return IO_APOLL_OK; } @@ -6242,24 +7304,53 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, return found; } -static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, - bool poll_only) +static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, + struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { struct hlist_head *list; struct io_kiocb *req; - list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; + list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; hlist_for_each_entry(req, list, hash_node) { - if (sqe_addr != req->user_data) + if (cd->data != req->cqe.user_data) continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) continue; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } return req; } return NULL; } +static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, + struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + int i; + + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry(req, list, hash_node) { + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + req->file != cd->file) + continue; + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + return req; + } + } + return NULL; +} + static bool io_poll_disarm(struct io_kiocb *req) __must_hold(&ctx->completion_lock) { @@ -6270,12 +7361,15 @@ static bool io_poll_disarm(struct io_kiocb *req) return true; } -static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, - bool poll_only) +static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { - struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only); + struct io_kiocb *req; + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) + req = io_poll_file_find(ctx, cd); + else + req = io_poll_find(ctx, false, cd); if (!req) return -ENOENT; io_poll_cancel_req(req); @@ -6302,9 +7396,7 @@ static int io_poll_update_prep(struct io_kiocb *req, struct io_poll_update *upd = &req->poll_update; u32 flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) 
return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | @@ -6334,9 +7426,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe struct io_poll_iocb *poll = &req->poll; u32 flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) + if (sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~IORING_POLL_ADD_MULTI) @@ -6366,13 +7456,14 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) { + struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; int ret2, ret = 0; bool locked; spin_lock(&ctx->completion_lock); - preq = io_poll_find(ctx, req->poll_update.old_user_data, true); + preq = io_poll_find(ctx, true, &cd); if (!preq || !io_poll_disarm(preq)) { spin_unlock(&ctx->completion_lock); ret = preq ? -EALREADY : -ENOENT; @@ -6388,7 +7479,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) preq->poll.events |= IO_POLL_UNMASK; } if (req->poll_update.update_user_data) - preq->user_data = req->poll_update.new_user_data; + preq->cqe.user_data = req->poll_update.new_user_data; ret2 = io_poll_add(preq, issue_flags); /* successfully updated, don't complete poll request */ @@ -6397,7 +7488,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) } req_set_fail(preq); - preq->result = -ECANCELED; + preq->cqe.res = -ECANCELED; locked = !(issue_flags & IO_URING_F_UNLOCKED); io_req_task_complete(preq, &locked); out: @@ -6425,14 +7516,14 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); - req->result = -ETIME; + req->cqe.res = -ETIME; req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req, false); + io_req_task_work_add(req); return HRTIMER_NORESTART; } static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, - __u64 user_data) + struct io_cancel_data *cd) __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; @@ -6440,9 +7531,16 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, bool found = false; list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - found = user_data == req->user_data; - if (found) - break; + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + cd->data != req->cqe.user_data) + continue; + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } + found = true; + break; } if (!found) return ERR_PTR(-ENOENT); @@ -6454,11 +7552,14 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, return req; } -static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) +static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) - __must_hold(&ctx->timeout_lock) { - struct io_kiocb *req = io_timeout_extract(ctx, user_data); + struct io_kiocb *req; + + spin_lock_irq(&ctx->timeout_lock); + req = io_timeout_extract(ctx, cd); + spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); @@ -6491,7 +7592,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, bool found = false; list_for_each_entry(req, 
&ctx->ltimeout_list, timeout.list) { - found = user_data == req->user_data; + found = user_data == req->cqe.user_data; if (found) break; } @@ -6511,7 +7612,8 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { - struct io_kiocb *req = io_timeout_extract(ctx, user_data); + struct io_cancel_data cd = { .data = user_data, }; + struct io_kiocb *req = io_timeout_extract(ctx, &cd); struct io_timeout_data *data; if (IS_ERR(req)) @@ -6531,11 +7633,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req, { struct io_timeout_rem *tr = &req->timeout_rem; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) + if (sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; tr->ltimeout = false; @@ -6576,10 +7676,10 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) int ret; if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { + struct io_cancel_data cd = { .data = tr->addr, }; + spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); - ret = io_timeout_cancel(ctx, tr->addr); - spin_unlock_irq(&ctx->timeout_lock); + ret = io_timeout_cancel(ctx, &cd); spin_unlock(&ctx->completion_lock); } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); @@ -6605,10 +7705,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, unsigned flags; u32 off = READ_ONCE(sqe->off); - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || - sqe->splice_fd_in) + if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) return -EINVAL; @@ -6707,30 +7804,42 @@ add: return 0; } -struct io_cancel_data { - struct io_ring_ctx *ctx; - u64 user_data; -}; - static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_cancel_data *cd = data; - return req->ctx == cd->ctx && req->user_data == cd->user_data; + if (req->ctx != cd->ctx) + return false; + if (cd->flags & IORING_ASYNC_CANCEL_ANY) { + ; + } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { + if (req->file != cd->file) + return false; + } else { + if (req->cqe.user_data != cd->data) + return false; + } + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->seq == req->work.cancel_seq) + return false; + req->work.cancel_seq = cd->seq; + } + return true; } -static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, - struct io_ring_ctx *ctx) +static int io_async_cancel_one(struct io_uring_task *tctx, + struct io_cancel_data *cd) { - struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; enum io_wq_cancel cancel_ret; int ret = 0; + bool all; if (!tctx || !tctx->io_wq) return -ENOENT; - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); + all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -6746,14 +7855,14 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, return ret; } -static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) +static int io_try_cancel(struct io_kiocb 
*req, struct io_cancel_data *cd) { struct io_ring_ctx *ctx = req->ctx; int ret; WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); - ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); + ret = io_async_cancel_one(req->task->io_uring, cd); /* * Fall-through even for -EALREADY, as we may have poll armed * that need unarming. @@ -6762,56 +7871,98 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) return 0; spin_lock(&ctx->completion_lock); - ret = io_poll_cancel(ctx, sqe_addr, false); + ret = io_poll_cancel(ctx, cd); if (ret != -ENOENT) goto out; - - spin_lock_irq(&ctx->timeout_lock); - ret = io_timeout_cancel(ctx, sqe_addr); - spin_unlock_irq(&ctx->timeout_lock); + if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) + ret = io_timeout_cancel(ctx, cd); out: spin_unlock(&ctx->completion_lock); return ret; } +#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ + IORING_ASYNC_CANCEL_ANY) + static int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || - sqe->splice_fd_in) + if (sqe->off || sqe->len || sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); + req->cancel.flags = READ_ONCE(sqe->cancel_flags); + if (req->cancel.flags & ~CANCEL_FLAGS) + return -EINVAL; + if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { + if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) + return -EINVAL; + req->cancel.fd = READ_ONCE(sqe->fd); + } + return 0; } -static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, + unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; - u64 sqe_addr = req->cancel.addr; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; + bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + struct io_ring_ctx *ctx = cd->ctx; struct io_tctx_node *node; - int ret; + int ret, nr = 0; - ret = io_try_cancel_userdata(req, sqe_addr); - if (ret != -ENOENT) - goto done; + do { + ret = io_try_cancel(req, cd); + if (ret == -ENOENT) + break; + if (!all) + return ret; + nr++; + } while (1); /* slow path, try all io-wq's */ - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; - ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); - if (ret != -ENOENT) - break; + ret = io_async_cancel_one(tctx, cd); + if (ret != -ENOENT) { + if (!all) + break; + nr++; + } } - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); + return all ? 
nr : ret; +} + +static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = req->cancel.addr, + .flags = req->cancel.flags, + .seq = atomic_inc_return(&req->ctx->cancel_seq), + }; + int ret; + + if (cd.flags & IORING_ASYNC_CANCEL_FD) { + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, req->cancel.fd, + issue_flags); + else + req->file = io_file_get_normal(req, req->cancel.fd); + if (!req->file) { + ret = -EBADF; + goto done; + } + cd.file = req->file; + } + + ret = __io_async_cancel(&cd, req, issue_flags); done: if (ret < 0) req_set_fail(req); @@ -6824,7 +7975,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, { if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->rsrc_update.offset = READ_ONCE(sqe->off); @@ -6838,7 +7989,6 @@ static int io_rsrc_update_prep(struct io_kiocb *req, static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_uring_rsrc_update2 up; int ret; @@ -6849,10 +7999,10 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.resv = 0; up.resv2 = 0; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, req->rsrc_update.nr_args); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) req_set_fail(req); @@ -6864,7 +8014,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { switch (req->opcode) { case IORING_OP_NOP: - return 0; + return io_nop_prep(req, sqe); case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: @@ -6938,6 +8088,18 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_linkat_prep(req, sqe); case IORING_OP_MSG_RING: return io_msg_ring_prep(req, sqe); + case IORING_OP_FSETXATTR: + return io_fsetxattr_prep(req, sqe); + case IORING_OP_SETXATTR: + return io_setxattr_prep(req, sqe); + case IORING_OP_FGETXATTR: + return io_fgetxattr_prep(req, sqe); + case IORING_OP_GETXATTR: + return io_getxattr_prep(req, sqe); + case IORING_OP_SOCKET: + return io_socket_prep(req, sqe); + case IORING_OP_URING_CMD: + return io_uring_cmd_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6951,7 +8113,7 @@ static int io_req_prep_async(struct io_kiocb *req) /* assign early for deferred execution for non-fixed file */ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) - req->file = io_file_get_normal(req, req->fd); + req->file = io_file_get_normal(req, req->cqe.fd); if (!def->needs_async_setup) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) @@ -6970,6 +8132,8 @@ static int io_req_prep_async(struct io_kiocb *req) return io_recvmsg_prep_async(req); case IORING_OP_CONNECT: return io_connect_prep_async(req); + case IORING_OP_URING_CMD: + return io_uring_cmd_prep_async(req); } printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", req->opcode); @@ -6979,9 +8143,10 @@ static int io_req_prep_async(struct io_kiocb *req) static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; + struct io_kiocb *cur; /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(req, req) + 
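The cancel rework above teaches io_async_cancel_prep() to read sqe->cancel_flags and an sqe->fd to match on, instead of only a user_data cookie. A minimal userspace sketch of how the new flags might be driven, assuming a liburing and uapi header recent enough to define the IORING_ASYNC_CANCEL_* flags; the helper name cancel_all_on_fd is made up for illustration and error handling is trimmed:

#include <liburing.h>
#include <errno.h>
#include <string.h>

/*
 * Ask the kernel to cancel every pending request that targets 'fd',
 * filling exactly the sqe fields io_async_cancel_prep() reads above
 * (fd, cancel_flags, user_data; off/len/splice_fd_in must stay zero).
 */
static int cancel_all_on_fd(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	sqe->fd = fd;				/* matched against req->file */
	sqe->cancel_flags = IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_ALL;
	sqe->user_data = 0xcafe;

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* with CANCEL_ALL: count of cancelled requests */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}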
io_for_each_link(cur, req) seq--; return seq; } @@ -7024,7 +8189,7 @@ fail: goto queue; } - trace_io_uring_defer(ctx, req, req->user_data, req->opcode); + trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode); de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); @@ -7086,6 +8251,12 @@ static void io_clean_op(struct io_kiocb *req) if (req->statx.filename) putname(req->statx.filename); break; + case IORING_OP_SETXATTR: + case IORING_OP_FSETXATTR: + case IORING_OP_GETXATTR: + case IORING_OP_FGETXATTR: + __io_xattr_finish(req); + break; } } if ((req->flags & REQ_F_POLLED) && req->apoll) { @@ -7108,15 +8279,11 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) return true; if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, req->fd, issue_flags); + req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); else - req->file = io_file_get_normal(req, req->fd); - if (req->file) - return true; + req->file = io_file_get_normal(req, req->cqe.fd); - req_set_fail(req); - req->result = -EBADF; - return false; + return !!req->file; } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) @@ -7246,6 +8413,24 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_MSG_RING: ret = io_msg_ring(req, issue_flags); break; + case IORING_OP_FSETXATTR: + ret = io_fsetxattr(req, issue_flags); + break; + case IORING_OP_SETXATTR: + ret = io_setxattr(req, issue_flags); + break; + case IORING_OP_FGETXATTR: + ret = io_fgetxattr(req, issue_flags); + break; + case IORING_OP_GETXATTR: + ret = io_getxattr(req, issue_flags); + break; + case IORING_OP_SOCKET: + ret = io_socket(req, issue_flags); + break; + case IORING_OP_URING_CMD: + ret = io_uring_cmd(req, issue_flags); + break; default: ret = -EINVAL; break; @@ -7279,7 +8464,6 @@ static void io_wq_submit_work(struct io_wq_work *work) const struct io_op_def *def = &io_op_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED; bool needs_poll = false; - struct io_kiocb *timeout; int ret = 0, err = -ECANCELED; /* one will be dropped by ->io_free_work() after returning to io-wq */ @@ -7288,10 +8472,7 @@ static void io_wq_submit_work(struct io_wq_work *work) else req_ref_get(req); - timeout = io_prep_linked_timeout(req); - if (timeout) - io_queue_linked_timeout(timeout); - + io_arm_ltimeout(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { @@ -7324,6 +8505,8 @@ fail: * wait for request slots on the block side. 
*/ if (!needs_poll) { + if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) + break; cond_resched(); continue; } @@ -7369,8 +8552,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, struct file *file = NULL; unsigned long file_ptr; - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); + io_ring_submit_lock(ctx, issue_flags); if (unlikely((unsigned int)fd >= ctx->nr_user_files)) goto out; @@ -7381,9 +8563,9 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, /* mask in overlapping REQ_F and FFS bits */ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); io_req_set_rsrc_node(req, ctx, 0); + WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap)); out: - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); + io_ring_submit_unlock(ctx, issue_flags); return file; } @@ -7404,7 +8586,7 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); - trace_io_uring_file_get(req->ctx, req, req->user_data, fd); + trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); /* we don't allow fixed io_uring files */ if (file && file->f_op == &io_uring_fops) @@ -7418,8 +8600,14 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) int ret = -ENOENT; if (prev) { - if (!(req->task->flags & PF_EXITING)) - ret = io_try_cancel_userdata(req, prev->user_data); + if (!(req->task->flags & PF_EXITING)) { + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = prev->cqe.user_data, + }; + + ret = io_try_cancel(req, &cd); + } io_req_complete_post(req, ret ?: -ETIME, 0); io_put_req(prev); } else { @@ -7453,7 +8641,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; - io_req_task_work_add(req, false); + io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -7479,10 +8667,17 @@ static void io_queue_linked_timeout(struct io_kiocb *req) io_put_req(req); } -static void io_queue_sqe_arm_apoll(struct io_kiocb *req) +static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; + + if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { + io_req_complete_failed(req, ret); + return; + } + + linked_timeout = io_prep_linked_timeout(req); switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: @@ -7493,7 +8688,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. 
*/ - io_queue_async_work(req, NULL); + io_queue_iowq(req, NULL); break; case IO_APOLL_OK: break; @@ -7503,10 +8698,9 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) io_queue_linked_timeout(linked_timeout); } -static inline void __io_queue_sqe(struct io_kiocb *req) +static inline void io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout; int ret; ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); @@ -7519,22 +8713,23 @@ static inline void __io_queue_sqe(struct io_kiocb *req) * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (likely(!ret)) { - linked_timeout = io_prep_linked_timeout(req); - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); - } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - io_queue_sqe_arm_apoll(req); - } else { - io_req_complete_failed(req, ret); - } + if (likely(!ret)) + io_arm_ltimeout(req); + else + io_queue_async(req, ret); } static void io_queue_sqe_fallback(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - if (req->flags & REQ_F_FAIL) { - io_req_complete_fail_submit(req); + if (unlikely(req->flags & REQ_F_FAIL)) { + /* + * We don't submit, fail them all, for that replace hardlinks + * with normal links. Extra REQ_F_LINK is tolerated. + */ + req->flags &= ~REQ_F_HARDLINK; + req->flags |= REQ_F_LINK; + io_req_complete_failed(req, req->cqe.res); } else if (unlikely(req->ctx->drain_active)) { io_drain_req(req); } else { @@ -7543,19 +8738,10 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) if (unlikely(ret)) io_req_complete_failed(req, ret); else - io_queue_async_work(req, NULL); + io_queue_iowq(req, NULL); } } -static inline void io_queue_sqe(struct io_kiocb *req) - __must_hold(&req->ctx->uring_lock) -{ - if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) - __io_queue_sqe(req); - else - io_queue_sqe_fallback(req); -} - /* * Check SQE restrictions (opcode and flags). 
* @@ -7610,9 +8796,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); - req->user_data = READ_ONCE(sqe->user_data); + req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; - req->fixed_rsrc_refs = NULL; + req->rsrc_node = NULL; req->task = current; if (unlikely(opcode >= IORING_OP_LAST)) { @@ -7623,9 +8809,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) return -EINVAL; - if ((sqe_flags & IOSQE_BUFFER_SELECT) && - !io_op_defs[opcode].buffer_select) - return -EOPNOTSUPP; + if (sqe_flags & IOSQE_BUFFER_SELECT) { + if (!io_op_defs[opcode].buffer_select) + return -EOPNOTSUPP; + req->buf_index = READ_ONCE(sqe->buf_group); + } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) ctx->drain_disabled = true; if (sqe_flags & IOSQE_IO_DRAIN) { @@ -7648,10 +8836,15 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } } + if (!io_op_defs[opcode].ioprio && sqe->ioprio) + return -EINVAL; + if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (io_op_defs[opcode].needs_file) { struct io_submit_state *state = &ctx->submit_state; - req->fd = READ_ONCE(sqe->fd); + req->cqe.fd = READ_ONCE(sqe->fd); /* * Plug now if we have more than 2 IO left after this, and the @@ -7683,7 +8876,44 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, return io_req_prep(req, sqe); } -static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, +static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, + struct io_kiocb *req, int ret) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_submit_link *link = &ctx->submit_state.link; + struct io_kiocb *head = link->head; + + trace_io_uring_req_failed(sqe, ctx, req, ret); + + /* + * Avoid breaking links in the middle as it renders links with SQPOLL + * unusable. Instead of failing eagerly, continue assembling the link if + * applicable and mark the head with REQ_F_FAIL. The link flushing code + * should find the flag and handle the rest. + */ + req_fail_link_node(req, ret); + if (head && !(head->flags & REQ_F_FAIL)) + req_fail_link_node(head, -ECANCELED); + + if (!(req->flags & IO_REQ_LINK_FLAGS)) { + if (head) { + link->last->link = req; + link->head = NULL; + req = head; + } + io_queue_sqe_fallback(req); + return ret; + } + + if (head) + link->last->link = req; + else + link->head = req; + link->last = req; + return 0; +} + +static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { @@ -7691,35 +8921,11 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, int ret; ret = io_init_req(ctx, req, sqe); - if (unlikely(ret)) { - trace_io_uring_req_failed(sqe, ctx, req, ret); - - /* fail even hard links since we don't submit */ - if (link->head) { - /* - * we can judge a link req is failed or cancelled by if - * REQ_F_FAIL is set, but the head is an exception since - * it may be set REQ_F_FAIL because of other req's failure - * so let's leverage req->result to distinguish if a head - * is set REQ_F_FAIL because of its failure or other req's - * failure so that we can set the correct ret code for it. - * init result here to avoid affecting the normal path. 
- */ - if (!(link->head->flags & REQ_F_FAIL)) - req_fail_link_node(link->head, -ECANCELED); - } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - /* - * the current req is a normal req, we should return - * error and thus break the submittion loop. - */ - io_req_complete_failed(req, ret); - return ret; - } - req_fail_link_node(req, ret); - } + if (unlikely(ret)) + return io_submit_fail_init(sqe, req, ret); /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode, + trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode, req->flags, true, ctx->flags & IORING_SETUP_SQPOLL); @@ -7730,29 +8936,32 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, * submitted sync once the chain is complete. If none of those * conditions are true (normal request), then just queue it. */ - if (link->head) { - struct io_kiocb *head = link->head; - - if (!(req->flags & REQ_F_FAIL)) { - ret = io_req_prep_async(req); - if (unlikely(ret)) { - req_fail_link_node(req, ret); - if (!(head->flags & REQ_F_FAIL)) - req_fail_link_node(head, -ECANCELED); - } - } - trace_io_uring_link(ctx, req, head); + if (unlikely(link->head)) { + ret = io_req_prep_async(req); + if (unlikely(ret)) + return io_submit_fail_init(sqe, req, ret); + + trace_io_uring_link(ctx, req, link->head); link->last->link = req; link->last = req; - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + if (req->flags & IO_REQ_LINK_FLAGS) return 0; - /* last request of a link, enqueue the link */ + /* last request of the link, flush it */ + req = link->head; link->head = NULL; - req = head; - } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - link->head = req; - link->last = req; + if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)) + goto fallback; + + } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS | + REQ_F_FORCE_ASYNC | REQ_F_FAIL))) { + if (req->flags & IO_REQ_LINK_FLAGS) { + link->head = req; + link->last = req; + } else { +fallback: + io_queue_sqe_fallback(req); + } return 0; } @@ -7767,8 +8976,8 @@ static void io_submit_state_end(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - if (state->link.head) - io_queue_sqe(state->link.head); + if (unlikely(state->link.head)) + io_queue_sqe_fallback(state->link.head); /* flush only after queuing links as they can generate completions */ io_submit_flush_completions(ctx); if (state->plug_started) @@ -7822,8 +9031,12 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) * though the application is the one updating it. 
*/ head = READ_ONCE(ctx->sq_array[sq_idx]); - if (likely(head < ctx->sq_entries)) + if (likely(head < ctx->sq_entries)) { + /* double index for 128-byte SQEs, twice as long */ + if (ctx->flags & IORING_SETUP_SQE128) + head <<= 1; return &ctx->sq_sqes[head]; + } /* drop invalid entries */ ctx->cq_extra--; @@ -7836,54 +9049,52 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { unsigned int entries = io_sqring_entries(ctx); - int submitted = 0; + unsigned int left; + int ret; if (unlikely(!entries)) return 0; /* make sure SQ entry isn't read before tail */ - nr = min3(nr, ctx->sq_entries, entries); - io_get_task_refs(nr); + ret = left = min3(nr, ctx->sq_entries, entries); + io_get_task_refs(left); + io_submit_state_start(&ctx->submit_state, left); - io_submit_state_start(&ctx->submit_state, nr); do { const struct io_uring_sqe *sqe; struct io_kiocb *req; - if (unlikely(!io_alloc_req_refill(ctx))) { - if (!submitted) - submitted = -EAGAIN; + if (unlikely(!io_alloc_req_refill(ctx))) break; - } req = io_alloc_req(ctx); sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + io_req_add_to_cache(req, ctx); break; } - /* will complete beyond this point, count as submitted */ - submitted++; - if (io_submit_sqe(ctx, req, sqe)) { - /* - * Continue submitting even for sqe failure if the - * ring was setup with IORING_SETUP_SUBMIT_ALL - */ - if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) - break; - } - } while (submitted < nr); - if (unlikely(submitted != nr)) { - int ref_used = (submitted == -EAGAIN) ? 0 : submitted; - int unused = nr - ref_used; + /* + * Continue submitting even for sqe failure if the + * ring was setup with IORING_SETUP_SUBMIT_ALL + */ + if (unlikely(io_submit_sqe(ctx, req, sqe)) && + !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { + left--; + break; + } + } while (--left); - current->io_uring->cached_refs += unused; + if (unlikely(left)) { + ret -= left; + /* try again if it submitted nothing and can't allocate a req */ + if (!ret && io_req_cache_empty(ctx)) + ret = -EAGAIN; + current->io_uring->cached_refs += left; } io_submit_state_end(ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); - - return submitted; + return ret; } static inline bool io_sqd_events_pending(struct io_sq_data *sqd) @@ -7891,23 +9102,6 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd) return READ_ONCE(sqd->state); } -static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) -{ - /* Tell userspace we may need a wakeup call */ - spin_lock(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); - spin_unlock(&ctx->completion_lock); -} - -static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) -{ - spin_lock(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); - spin_unlock(&ctx->completion_lock); -} - static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) { unsigned int to_submit; @@ -8023,8 +9217,8 @@ static int io_sq_thread(void *data) bool needs_sched = true; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - io_ring_set_wakeup_flag(ctx); - + atomic_or(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); if ((ctx->flags & IORING_SETUP_IOPOLL) && !wq_list_empty(&ctx->iopoll_list)) { needs_sched = false; @@ -8035,7 +9229,7 @@ static int io_sq_thread(void *data) * Ensure the store of the wakeup flag is 
not * reordered with the load of the SQ tail */ - smp_mb(); + smp_mb__after_atomic(); if (io_sqring_entries(ctx)) { needs_sched = false; @@ -8049,7 +9243,8 @@ static int io_sq_thread(void *data) mutex_lock(&sqd->lock); } list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_ring_clear_wakeup_flag(ctx); + atomic_andnot(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); } finish_wait(&sqd->wait, &wait); @@ -8059,7 +9254,7 @@ static int io_sq_thread(void *data) io_uring_cancel_generic(true, sqd); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_ring_set_wakeup_flag(ctx); + atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); io_run_task_work(); mutex_unlock(&sqd->lock); @@ -8099,7 +9294,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. */ - if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) + if (io_should_wake(iowq) || + test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } @@ -8121,15 +9317,18 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, ktime_t timeout) { int ret; + unsigned long check_cq; /* make sure we run task_work before checking for signals */ ret = io_run_task_work_sig(); if (ret || io_should_wake(iowq)) return ret; + check_cq = READ_ONCE(ctx->check_cq); /* let the caller flush overflows, retry */ - if (test_bit(0, &ctx->check_cq_overflow)) + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) return 1; - + if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) + return -EBADR; if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) return -ETIME; return 1; @@ -8194,10 +9393,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); ret = io_cqring_wait_schedule(ctx, &iowq, timeout); - finish_wait(&ctx->cq_wait, &iowq.wq); cond_resched(); } while (ret > 0); + finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; @@ -8435,17 +9634,57 @@ static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL_ACCOUNT); - return !!table->files; + if (unlikely(!table->files)) + return false; + + table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); + if (unlikely(!table->bitmap)) { + kvfree(table->files); + return false; + } + + return true; } static void io_free_file_tables(struct io_file_table *table) { kvfree(table->files); + bitmap_free(table->bitmap); table->files = NULL; + table->bitmap = NULL; +} + +static inline void io_file_bitmap_set(struct io_file_table *table, int bit) +{ + WARN_ON_ONCE(test_bit(bit, table->bitmap)); + __set_bit(bit, table->bitmap); + if (bit == table->alloc_hint) + table->alloc_hint++; +} + +static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) +{ + __clear_bit(bit, table->bitmap); + table->alloc_hint = bit; } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { +#if !defined(IO_URING_SCM_ALL) + int i; + + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file = io_file_from_index(ctx, i); + + if (!file) + continue; + if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) + continue; + io_file_bitmap_clear(&ctx->file_table, i); + fput(file); + } +#endif + #if defined(CONFIG_UNIX) if (ctx->ring_sock) { struct sock *sock = ctx->ring_sock->sk; @@ -8454,16 +9693,6 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) kfree_skb(skb); } -#else - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - - file = io_file_from_index(ctx, i); - if (file) - fput(file); - } #endif io_free_file_tables(&ctx->file_table); io_rsrc_data_free(ctx->file_data); @@ -8608,107 +9837,66 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, return sqd; } -#if defined(CONFIG_UNIX) /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have - * loops in the file referencing. + * loops in the file referencing. We account only files that can hold other + * files because otherwise they can't form a loop and so are not interesting + * for GC. */ -static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) +static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) { +#if defined(CONFIG_UNIX) struct sock *sk = ctx->ring_sock->sk; + struct sk_buff_head *head = &sk->sk_receive_queue; struct scm_fp_list *fpl; struct sk_buff *skb; - int i, nr_files; - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); - if (!fpl) - return -ENOMEM; - - skb = alloc_skb(0, GFP_KERNEL); - if (!skb) { - kfree(fpl); - return -ENOMEM; - } + if (likely(!io_file_need_scm(file))) + return 0; - skb->sk = sk; + /* + * See if we can merge this file into an existing skb SCM_RIGHTS + * file set. If there's no room, fall back to allocating a new skb + * and filling it in. 
+ */ + spin_lock_irq(&head->lock); + skb = skb_peek(head); + if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) + __skb_unlink(skb, head); + else + skb = NULL; + spin_unlock_irq(&head->lock); - nr_files = 0; - fpl->user = get_uid(current_user()); - for (i = 0; i < nr; i++) { - struct file *file = io_file_from_index(ctx, i + offset); + if (!skb) { + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); + if (!fpl) + return -ENOMEM; - if (!file) - continue; - fpl->fp[nr_files] = get_file(file); - unix_inflight(fpl->user, fpl->fp[nr_files]); - nr_files++; - } + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) { + kfree(fpl); + return -ENOMEM; + } - if (nr_files) { + fpl->user = get_uid(current_user()); fpl->max = SCM_MAX_FD; - fpl->count = nr_files; + fpl->count = 0; + UNIXCB(skb).fp = fpl; + skb->sk = sk; skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); - skb_queue_head(&sk->sk_receive_queue, skb); - - for (i = 0; i < nr; i++) { - struct file *file = io_file_from_index(ctx, i + offset); - - if (file) - fput(file); - } - } else { - kfree_skb(skb); - free_uid(fpl->user); - kfree(fpl); - } - - return 0; -} - -/* - * If UNIX sockets are enabled, fd passing can cause a reference cycle which - * causes regular reference counting to break down. We rely on the UNIX - * garbage collection to take care of this problem for us. - */ -static int io_sqe_files_scm(struct io_ring_ctx *ctx) -{ - unsigned left, total; - int ret = 0; - - total = 0; - left = ctx->nr_user_files; - while (left) { - unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); - - ret = __io_sqe_files_scm(ctx, this_files, total); - if (ret) - break; - left -= this_files; - total += this_files; - } - - if (!ret) - return 0; - - while (total < ctx->nr_user_files) { - struct file *file = io_file_from_index(ctx, total); - - if (file) - fput(file); - total++; } - return ret; -} -#else -static int io_sqe_files_scm(struct io_ring_ctx *ctx) -{ + fpl = UNIXCB(skb).fp; + fpl->fp[fpl->count++] = get_file(file); + unix_inflight(fpl->user, file); + skb_queue_head(head, skb); + fput(file); +#endif return 0; } -#endif static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) { @@ -8719,6 +9907,11 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) struct sk_buff *skb; int i; + if (!io_file_need_scm(file)) { + fput(file); + return; + } + __skb_queue_head_init(&list); /* @@ -8783,15 +9976,17 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) list_del(&prsrc->list); if (prsrc->tag) { - bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_lock(&ctx->uring_lock); - io_ring_submit_lock(ctx, lock_ring); spin_lock(&ctx->completion_lock); io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_ring_submit_unlock(ctx, lock_ring); + + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_unlock(&ctx->uring_lock); } rsrc_data->do_put(ctx, prsrc); @@ -8845,27 +10040,31 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (ret) return ret; - ret = -ENOMEM; - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) - goto out_free; + if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { + io_rsrc_data_free(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - if (copy_from_user(&fd, &fds[i], sizeof(fd))) { + struct io_fixed_file *file_slot; + + if (fds && copy_from_user(&fd, 
&fds[i], sizeof(fd))) { ret = -EFAULT; - goto out_fput; + goto fail; } /* allow sparse sets */ - if (fd == -1) { + if (!fds || fd == -1) { ret = -EINVAL; if (unlikely(*io_get_tag_slot(ctx->file_data, i))) - goto out_fput; + goto fail; continue; } file = fget(fd); ret = -EBADF; if (unlikely(!file)) - goto out_fput; + goto fail; /* * Don't allow io_uring instances to be registered. If UNIX @@ -8876,74 +10075,23 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, */ if (file->f_op == &io_uring_fops) { fput(file); - goto out_fput; + goto fail; } - io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); - } - - ret = io_sqe_files_scm(ctx); - if (ret) { - __io_sqe_files_unregister(ctx); - return ret; - } - - io_rsrc_node_switch(ctx, NULL); - return ret; -out_fput: - for (i = 0; i < ctx->nr_user_files; i++) { - file = io_file_from_index(ctx, i); - if (file) + ret = io_scm_file_account(ctx, file); + if (ret) { fput(file); - } - io_free_file_tables(&ctx->file_table); - ctx->nr_user_files = 0; -out_free: - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - return ret; -} - -static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, - int index) -{ -#if defined(CONFIG_UNIX) - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head *head = &sock->sk_receive_queue; - struct sk_buff *skb; - - /* - * See if we can merge this file into an existing skb SCM_RIGHTS - * file set. If there's no room, fall back to allocating a new skb - * and filling it in. - */ - spin_lock_irq(&head->lock); - skb = skb_peek(head); - if (skb) { - struct scm_fp_list *fpl = UNIXCB(skb).fp; - - if (fpl->count < SCM_MAX_FD) { - __skb_unlink(skb, head); - spin_unlock_irq(&head->lock); - fpl->fp[fpl->count] = get_file(file); - unix_inflight(fpl->user, fpl->fp[fpl->count]); - fpl->count++; - spin_lock_irq(&head->lock); - __skb_queue_head(head, skb); - } else { - skb = NULL; + goto fail; } - } - spin_unlock_irq(&head->lock); - - if (skb) { - fput(file); - return 0; + file_slot = io_fixed_file_slot(&ctx->file_table, i); + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); } - return __io_sqe_files_scm(ctx, 1, index); -#else + io_rsrc_node_switch(ctx, NULL); return 0; -#endif +fail: + __io_sqe_files_unregister(ctx); + return ret; } static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, @@ -8967,12 +10115,11 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index) { struct io_ring_ctx *ctx = req->ctx; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; bool needs_switch = false; struct io_fixed_file *file_slot; int ret = -EBADF; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); if (file->f_op == &io_uring_fops) goto err; ret = -ENXIO; @@ -8998,22 +10145,20 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, if (ret) goto err; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, slot_index); needs_switch = true; } - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - ret = io_sqe_file_register(ctx, file, slot_index); - if (ret) { - file_slot->file_ptr = 0; - goto err; + ret = io_scm_file_account(ctx, file); + if (!ret) { + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); } - - ret = 0; err: if (needs_switch) io_rsrc_node_switch(ctx, ctx->file_data); - 
io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); if (ret) fput(file); return ret; @@ -9023,12 +10168,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { unsigned int offset = req->close.file_slot - 1; struct io_ring_ctx *ctx = req->ctx; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_fixed_file *file_slot; struct file *file; int ret; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); ret = -ENXIO; if (unlikely(!ctx->file_data)) goto out; @@ -9051,10 +10195,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) goto out; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, offset); io_rsrc_node_switch(ctx, ctx->file_data); ret = 0; out: - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); return ret; } @@ -9100,6 +10245,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (err) break; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, i); needs_switch = true; } if (fd != -1) { @@ -9121,14 +10267,14 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); - err = io_sqe_file_register(ctx, file, i); + err = io_scm_file_account(ctx, file); if (err) { - file_slot->file_ptr = 0; fput(file); break; } + *io_get_tag_slot(data, i) = tag; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); } } @@ -9208,7 +10354,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prior_task_list); + INIT_WQ_LIST(&tctx->prio_task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } @@ -9386,8 +10532,8 @@ static void *io_mem_alloc(size_t size) return (void *) __get_free_pages(gfp, get_order(size)); } -static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, - size_t *sq_offset) +static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset) { struct io_rings *rings; size_t off, sq_array_size; @@ -9395,6 +10541,10 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, off = struct_size(rings, cqes, cq_entries); if (off == SIZE_MAX) return SIZE_MAX; + if (ctx->flags & IORING_SETUP_CQE32) { + if (check_shl_overflow(off, 1, &off)) + return SIZE_MAX; + } #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); @@ -9556,30 +10706,18 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) +static struct page **io_pin_pages(unsigned long ubuf, unsigned long len, + int *npages) { - struct io_mapped_ubuf *imu = NULL; + unsigned long start, end, nr_pages; struct vm_area_struct **vmas = NULL; struct page **pages = NULL; - unsigned long off, start, end, ubuf; - size_t size; - int ret, pret, nr_pages, i; + int i, pret, ret = -ENOMEM; - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; - return 0; - } - - ubuf = (unsigned long) iov->iov_base; - end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - *pimu = NULL; - ret = -ENOMEM; - pages = kvmalloc_array(nr_pages, 
sizeof(struct page *), GFP_KERNEL); if (!pages) goto done; @@ -9589,10 +10727,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, if (!vmas) goto done; - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); - if (!imu) - goto done; - ret = 0; mmap_read_lock(current->mm); pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, @@ -9610,6 +10744,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, break; } } + *npages = nr_pages; } else { ret = pret < 0 ? pret : -EFAULT; } @@ -9623,14 +10758,53 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, unpin_user_pages(pages, pret); goto done; } + ret = 0; +done: + kvfree(vmas); + if (ret < 0) { + kvfree(pages); + pages = ERR_PTR(ret); + } + return pages; +} + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf **pimu, + struct page **last_hpage) +{ + struct io_mapped_ubuf *imu = NULL; + struct page **pages = NULL; + unsigned long off; + size_t size; + int ret, nr_pages, i; + + if (!iov->iov_base) { + *pimu = ctx->dummy_ubuf; + return 0; + } - ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); + *pimu = NULL; + ret = -ENOMEM; + + pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, + &nr_pages); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + pages = NULL; + goto done; + } + + imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + if (!imu) + goto done; + + ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); if (ret) { - unpin_user_pages(pages, pret); + unpin_user_pages(pages, nr_pages); goto done; } - off = ubuf & ~PAGE_MASK; + off = (unsigned long) iov->iov_base & ~PAGE_MASK; size = iov->iov_len; for (i = 0; i < nr_pages; i++) { size_t vec_len; @@ -9643,8 +10817,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size -= vec_len; } /* store original address for later verification */ - imu->ubuf = ubuf; - imu->ubuf_end = ubuf + iov->iov_len; + imu->ubuf = (unsigned long) iov->iov_base; + imu->ubuf_end = imu->ubuf + iov->iov_len; imu->nr_bvecs = nr_pages; *pimu = imu; ret = 0; @@ -9652,7 +10826,6 @@ done: if (ret) kvfree(imu); kvfree(pages); - kvfree(vmas); return ret; } @@ -9711,12 +10884,17 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, } for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) - break; - ret = io_buffer_validate(&iov); - if (ret) - break; + if (arg) { + ret = io_copy_iov(ctx, &iov, arg, i); + if (ret) + break; + ret = io_buffer_validate(&iov); + if (ret) + break; + } else { + memset(&iov, 0, sizeof(iov)); + } + if (!iov.iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; @@ -9855,19 +11033,19 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_destroy_buffers(struct io_ring_ctx *ctx) { + struct io_buffer_list *bl; + unsigned long index; int i; - for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) { - struct list_head *list = &ctx->io_buffers[i]; - - while (!list_empty(list)) { - struct io_buffer_list *bl; + for (i = 0; i < BGID_ARRAY; i++) { + if (!ctx->io_bl) + break; + __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); + } - bl = list_first_entry(list, struct io_buffer_list, list); - __io_remove_buffers(ctx, bl, -1U); - list_del(&bl->list); - kfree(bl); - } + xa_for_each(&ctx->io_bl_xa, index, bl) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + __io_remove_buffers(ctx, bl, -1U); } 
while (!list_empty(&ctx->io_buffers_pages)) { @@ -9887,7 +11065,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); io_flush_cached_locked_reqs(ctx, state); - while (state->free_list.next) { + while (!io_req_cache_empty(ctx)) { struct io_wq_work_node *node; struct io_kiocb *req; @@ -9976,7 +11154,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_hash); kfree(ctx->dummy_ubuf); - kfree(ctx->io_buffers); + kfree(ctx->io_bl); + xa_destroy(&ctx->io_bl_xa); kfree(ctx); } @@ -10007,7 +11186,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * Users may get EPOLLIN meanwhile seeing nothing in cqring, this * pushs them to do the flush. */ - if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) + if (io_cqring_events(ctx) || + test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -10139,8 +11319,7 @@ static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, } } spin_unlock_irq(&ctx->timeout_lock); - if (canceled != 0) - io_commit_cqring(ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (canceled != 0) io_cqring_ev_posted(ctx); @@ -10160,11 +11339,13 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); - io_kill_timeouts(ctx, NULL, true); - io_poll_remove_all(ctx, NULL, true); - - /* if we failed setting up the ctx, we might not have any rings */ - io_iopoll_try_reap_events(ctx); + /* failed during ring init, it couldn't have issued any requests */ + if (ctx->rings) { + io_kill_timeouts(ctx, NULL, true); + io_poll_remove_all(ctx, NULL, true); + /* if we failed setting up the ctx, we might not have any rings */ + io_iopoll_try_reap_events(ctx); + } INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* @@ -10256,6 +11437,10 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? 
task->io_uring : NULL; + /* failed during ring init, it couldn't have issued any requests */ + if (!ctx->rings) + return; + while (1) { enum io_wq_cancel cret; bool ret = false; @@ -10701,6 +11886,19 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) return 0; } +static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) +{ + if (flags & IORING_ENTER_EXT_ARG) { + struct io_uring_getevents_arg arg; + + if (argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + } + return 0; +} + static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, struct __kernel_timespec __user **ts, const sigset_t __user **sig) @@ -10738,7 +11936,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, size_t, argsz) { struct io_ring_ctx *ctx; - int submitted = 0; struct fd f; long ret; @@ -10801,39 +11998,64 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ret) goto out; } - submitted = to_submit; + ret = to_submit; } else if (to_submit) { ret = io_uring_add_tctx_node(ctx); if (unlikely(ret)) goto out; - mutex_lock(&ctx->uring_lock); - submitted = io_submit_sqes(ctx, to_submit); - mutex_unlock(&ctx->uring_lock); - if (submitted != to_submit) + mutex_lock(&ctx->uring_lock); + ret = io_submit_sqes(ctx, to_submit); + if (ret != to_submit) { + mutex_unlock(&ctx->uring_lock); goto out; + } + if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll) + goto iopoll_locked; + mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { - const sigset_t __user *sig; - struct __kernel_timespec __user *ts; - - ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); - if (unlikely(ret)) - goto out; + int ret2; + if (ctx->syscall_iopoll) { + /* + * We disallow the app entering submit/complete with + * polling, but we still need to lock the ring to + * prevent racing with polled issue that got punted to + * a workqueue. + */ + mutex_lock(&ctx->uring_lock); +iopoll_locked: + ret2 = io_validate_ext_arg(flags, argp, argsz); + if (likely(!ret2)) { + min_complete = min(min_complete, + ctx->cq_entries); + ret2 = io_iopoll_check(ctx, min_complete); + } + mutex_unlock(&ctx->uring_lock); + } else { + const sigset_t __user *sig; + struct __kernel_timespec __user *ts; + + ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + if (likely(!ret2)) { + min_complete = min(min_complete, + ctx->cq_entries); + ret2 = io_cqring_wait(ctx, min_complete, sig, + argsz, ts); + } + } - min_complete = min(min_complete, ctx->cq_entries); + if (!ret) { + ret = ret2; - /* - * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user - * space applications don't need to do io completion events - * polling again, they can rely on io_sq_thread to do polling - * work, which can reduce cpu usage and uring_lock contention. - */ - if (ctx->flags & IORING_SETUP_IOPOLL && - !(ctx->flags & IORING_SETUP_SQPOLL)) { - ret = io_iopoll_check(ctx, min_complete); - } else { - ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); + /* + * EBADR indicates that one or more CQE were dropped. + * Once the user has been informed we can clear the bit + * as they are obviously ok with those drops. + */ + if (unlikely(ret2 == -EBADR)) + clear_bit(IO_CHECK_CQ_DROPPED_BIT, + &ctx->check_cq); } } @@ -10842,7 +12064,7 @@ out: out_fput: if (!(flags & IORING_ENTER_REGISTERED_RING)) fdput(f); - return submitted ? 
submitted : ret; + return ret; } #ifdef CONFIG_PROC_FS @@ -10889,10 +12111,15 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, unsigned int sq_tail = READ_ONCE(r->sq.tail); unsigned int cq_head = READ_ONCE(r->cq.head); unsigned int cq_tail = READ_ONCE(r->cq.tail); + unsigned int cq_shift = 0; unsigned int sq_entries, cq_entries; bool has_lock; + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); unsigned int i; + if (is_cqe32) + cq_shift = 1; + /* * we may get imprecise sqe and cqe info if uring is actively running * since we get cached_sq_head and cached_cq_tail without uring_lock @@ -10925,11 +12152,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, cq_entries = min(cq_tail - cq_head, ctx->cq_entries); for (i = 0; i < cq_entries; i++) { unsigned int entry = i + cq_head; - struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask]; + struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", + if (!is_cqe32) { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", entry & cq_mask, cqe->user_data, cqe->res, cqe->flags); + } else { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " + "extra1:%llu, extra2:%llu\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); + } } /* @@ -11032,7 +12266,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); + size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; @@ -11047,7 +12281,10 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries; - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); + if (p->flags & IORING_SETUP_SQE128) + size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); + else + size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { io_mem_free(ctx->rings); ctx->rings = NULL; @@ -11159,11 +12396,41 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx = io_ring_ctx_alloc(p); if (!ctx) return -ENOMEM; + + /* + * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user + * space applications don't need to do io completion events + * polling again, they can rely on io_sq_thread to do polling + * work, which can reduce cpu usage and uring_lock contention. + */ + if (ctx->flags & IORING_SETUP_IOPOLL && + !(ctx->flags & IORING_SETUP_SQPOLL)) + ctx->syscall_iopoll = 1; + ctx->compat = in_compat_syscall(); if (!capable(CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); /* + * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if + * COOP_TASKRUN is set, then IPIs are never needed by the app. + */ + ret = -EINVAL; + if (ctx->flags & IORING_SETUP_SQPOLL) { + /* IPI related flags don't make sense with SQPOLL */ + if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_TASKRUN_FLAG)) + goto err; + ctx->notify_method = TWA_SIGNAL_NO_IPI; + } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { + ctx->notify_method = TWA_SIGNAL_NO_IPI; + } else { + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + goto err; + ctx->notify_method = TWA_SIGNAL; + } + + /* * This is just grabbed for accounting purposes. 
When a process exits, * the mm is exited and dropped before the files, hence we need to hang * on to this mm purely for the purposes of being able to unaccount @@ -11260,10 +12527,12 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL)) + IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | + IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_SQE128 | IORING_SETUP_CQE32)) return -EINVAL; - return io_uring_create(entries, &p, params); + return io_uring_create(entries, &p, params); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, @@ -11476,14 +12745,20 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; - if (!rr.nr || rr.resv || rr.resv2) + if (!rr.nr || rr.resv2) + return -EINVAL; + if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) return -EINVAL; switch (type) { case IORING_RSRC_FILE: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); case IORING_RSRC_BUFFER: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); } @@ -11618,6 +12893,85 @@ err: return ret; } +static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_ring *br; + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + struct page **pages; + int nr_pages; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + if (!is_power_of_2(reg.ring_entries)) + return -EINVAL; + + if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { + int ret = io_init_bl_list(ctx); + if (ret) + return ret; + } + + bl = io_buffer_get_list(ctx, reg.bgid); + if (bl) { + /* if mapped buffer ring OR classic exists, don't allow */ + if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) + return -EEXIST; + } else { + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; + } + + pages = io_pin_pages(reg.ring_addr, + struct_size(br, bufs, reg.ring_entries), + &nr_pages); + if (IS_ERR(pages)) { + kfree(bl); + return PTR_ERR(pages); + } + + br = page_address(pages[0]); + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->nr_entries = reg.ring_entries; + bl->buf_ring = br; + bl->mask = reg.ring_entries - 1; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +} + +static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + + bl = io_buffer_get_list(ctx, reg.bgid); + if (!bl) + return -ENOENT; + if (!bl->buf_nr_pages) + return -EINVAL; + + __io_remove_buffers(ctx, bl, -1U); + if (bl->bgid >= BGID_ARRAY) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + kfree(bl); + } + return 0; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -11643,6 +12997,9 @@ static 
int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, switch (opcode) { case IORING_REGISTER_BUFFERS: + ret = -EFAULT; + if (!arg) + break; ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_BUFFERS: @@ -11652,6 +13009,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_buffers_unregister(ctx); break; case IORING_REGISTER_FILES: + ret = -EFAULT; + if (!arg) + break; ret = io_sqe_files_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_FILES: @@ -11746,6 +13106,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_UNREGISTER_RING_FDS: ret = io_ringfd_unregister(ctx, arg, nr_args); break; + case IORING_REGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_pbuf_ring(ctx, arg); + break; + case IORING_UNREGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_unregister_pbuf_ring(ctx, arg); + break; default: ret = -EINVAL; break; @@ -11822,6 +13194,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); + BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); @@ -11830,6 +13203,10 @@ static int __init io_uring_init(void) /* ->buf_index is u16 */ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); + BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); + BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != + offsetof(struct io_uring_buf_ring, tail)); /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); @@ -11839,6 +13216,10 @@ static int __init io_uring_init(void) BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); + BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); + + BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b08f5dc31780..370c3241618a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -51,12 +51,22 @@ struct iomap_dio { }; }; +static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter, + struct iomap_dio *dio, unsigned short nr_vecs, unsigned int opf) +{ + if (dio->dops && dio->dops->bio_set) + return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf, + GFP_KERNEL, dio->dops->bio_set); + return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL); +} + static void iomap_dio_submit_bio(const struct iomap_iter *iter, struct iomap_dio *dio, struct bio *bio, loff_t pos) { atomic_inc(&dio->ref); - if (dio->iocb->ki_flags & IOCB_HIPRI) { + /* Sync dio can't be polled reliably */ + if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) { bio_set_polled(bio, dio->iocb); dio->submit.poll_bio = bio; } @@ -144,7 +154,7 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) cmpxchg(&dio->error, 0, ret); } -static void iomap_dio_bio_end_io(struct bio *bio) +void iomap_dio_bio_end_io(struct bio *bio) { struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); @@ -176,16 +186,16 @@ static void iomap_dio_bio_end_io(struct bio *bio) bio_put(bio); } } +EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); static void 
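io_register_pbuf_ring() above expects a page-aligned, power-of-two provided-buffer ring and is reached through IORING_REGISTER_PBUF_RING with nr_args == 1. A hedged userspace sketch of the registration step, using the raw register syscall and the 5.19 uapi structs since wrapper availability varies; register_buf_ring is a hypothetical helper and 'ring_fd' is an existing io_uring fd:

#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static struct io_uring_buf_ring *register_buf_ring(int ring_fd,
						   unsigned int entries,
						   int bgid)
{
	struct io_uring_buf_reg reg;
	size_t len = entries * sizeof(struct io_uring_buf);
	void *mem;

	/* mmap gives the page-aligned ring_addr the kernel insists on */
	mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (mem == MAP_FAILED)
		return NULL;

	memset(&reg, 0, sizeof(reg));	/* pad and resv[] must be zero */
	reg.ring_addr = (unsigned long) mem;
	reg.ring_entries = entries;	/* must be a power of two */
	reg.bgid = bgid;

	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PBUF_RING,
		    &reg, 1) < 0) {
		munmap(mem, len);
		return NULL;
	}
	return mem;
}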
iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { struct inode *inode = file_inode(dio->iocb->ki_filp); struct page *page = ZERO_PAGE(0); - int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; - bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL); + bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); @@ -265,8 +275,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, * cache flushes on IO completion. */ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && - (dio->flags & IOMAP_DIO_WRITE_FUA) && - blk_queue_fua(bdev_get_queue(iomap->bdev))) + (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev)) use_fua = true; } @@ -311,7 +320,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL); + bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf); fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); @@ -474,7 +483,7 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter, struct iomap_dio * __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags, size_t done_before) + unsigned int dio_flags, void *private, size_t done_before) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); @@ -483,6 +492,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, .pos = iocb->ki_pos, .len = iov_iter_count(iter), .flags = IOMAP_DIRECT, + .private = private, }; loff_t end = iomi.pos + iomi.len - 1, ret = 0; bool wait_for_completion = @@ -654,9 +664,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(dio->submit.waiter)) break; - if (!dio->submit.poll_bio || - !bio_poll(dio->submit.poll_bio, NULL, 0)) - blk_io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); } @@ -674,11 +682,12 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw); ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags, size_t done_before) + unsigned int dio_flags, void *private, size_t done_before) { struct iomap_dio *dio; - dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before); + dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private, + done_before); if (IS_ERR_OR_NULL(dio)) return PTR_ERR_OR_ZERO(dio); return iomap_dio_complete(dio); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fcacafa4510d..c0cbeeaec2d1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1762,7 +1762,6 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) unsigned long block, log_offset; /* logical */ unsigned long long phys_block, block_start, block_stop; /* physical */ loff_t byte_start, byte_stop, byte_count; - struct request_queue *q = bdev_get_queue(journal->j_dev); /* flags must be set to either discard or zeroout */ if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags || @@ -1770,10 +1769,8 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) return -EINVAL; - if (!q) - return -ENXIO; - - if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) + if ((flags & 
JBD2_JOURNAL_FLUSH_DISCARD) && + !bdev_max_discard_sectors(journal->j_dev)) return -EOPNOTSUPP; /* @@ -1828,7 +1825,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) err = blkdev_issue_discard(journal->j_dev, byte_start >> SECTOR_SHIFT, byte_count >> SECTOR_SHIFT, - GFP_NOFS, 0); + GFP_NOFS); } else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) { err = blkdev_issue_zeroout(journal->j_dev, byte_start >> SECTOR_SHIFT, diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 03a845ab4f00..1e7b177ece60 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -110,14 +110,13 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FITRIM: { struct super_block *sb = inode->i_sb; - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; s64 ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) { + if (!bdev_max_discard_sectors(sb->s_bdev)) { jfs_warn("FITRIM not supported on device"); return -EOPNOTSUPP; } @@ -127,7 +126,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -EFAULT; range.minlen = max_t(unsigned int, range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(sb->s_bdev)); ret = jfs_ioc_trim(inode, &range); if (ret < 0) diff --git a/fs/jfs/super.c b/fs/jfs/super.c index f1a13a74cddf..85d4f44f2ac4 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -372,19 +372,16 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, } case Opt_discard: - { - struct request_queue *q = bdev_get_queue(sb->s_bdev); /* if set to 1, even copying files will cause * trimming :O * -> user has more control over the online trimming */ sbi->minblks_trim = 64; - if (blk_queue_discard(q)) + if (bdev_max_discard_sectors(sb->s_bdev)) *flag |= JFS_DISCARD; else pr_err("JFS: discard option not supported on device\n"); break; - } case Opt_nodiscard: *flag &= ~JFS_DISCARD; @@ -392,10 +389,9 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, case Opt_discard_minblk: { - struct request_queue *q = bdev_get_queue(sb->s_bdev); char *minblks_trim = args[0].from; int rc; - if (blk_queue_discard(q)) { + if (bdev_max_discard_sectors(sb->s_bdev)) { *flag |= JFS_DISCARD; rc = kstrtouint(minblks_trim, 0, &sbi->minblks_trim); diff --git a/fs/namespace.c b/fs/namespace.c index afe2b64b14f1..41461f55c039 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4026,8 +4026,9 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) static inline bool mnt_allow_writers(const struct mount_kattr *kattr, const struct mount *mnt) { - return !(kattr->attr_set & MNT_READONLY) || - (mnt->mnt.mnt_flags & MNT_READONLY); + return (!(kattr->attr_set & MNT_READONLY) || + (mnt->mnt.mnt_flags & MNT_READONLY)) && + !kattr->mnt_userns; } static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index e2d59bb5e6bb..9a16897e8dc6 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -517,7 +517,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, if (result.negated) ctx->flags &= ~NFS_MOUNT_SOFTREVAL; else - ctx->flags &= NFS_MOUNT_SOFTREVAL; + ctx->flags |= NFS_MOUNT_SOFTREVAL; break; case Opt_posix: if (result.negated) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index fec194a666f4..87e1004b606d 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1052,20 +1052,20 @@ out: static int nilfs_ioctl_trim_fs(struct inode *inode, void __user 
*argp) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; - struct request_queue *q = bdev_get_queue(nilfs->ns_bdev); struct fstrim_range range; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(nilfs->ns_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; - range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity); + range.minlen = max_t(u64, range.minlen, + bdev_discard_granularity(nilfs->ns_bdev)); down_read(&nilfs->ns_segctor_sem); ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range); diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index e385cca2004a..77ff8e95421f 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -1100,7 +1100,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, 0); + GFP_NOFS); if (ret < 0) { put_bh(su_bh); goto out_sem; @@ -1134,7 +1134,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) ndiscarded += nblocks; } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index dd48a8f74d57..3b4a079c9617 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -672,7 +672,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, 0); + GFP_NOFS); if (ret < 0) return ret; nblocks = 0; @@ -682,7 +682,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, 0); + GFP_NOFS); return ret; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9b32b76a9c30..a792e21c5309 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1657,6 +1657,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; + /* + * FAN_RENAME is not allowed on non-dir (for now). + * We shouldn't have allowed setting any dirent events in mask of + * non-dir, but because we always allowed it, error only if group + * was initialized with the new flag FAN_REPORT_TARGET_FID. 
+ */ + ret = -ENOTDIR; + if (inode && !S_ISDIR(inode->i_mode) && + ((mask & FAN_RENAME) || + ((mask & FANOTIFY_DIRENT_EVENTS) && + FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID)))) + goto path_put_and_out; + /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 787b53b984ee..15806eeae217 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -22,20 +22,20 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) { struct fstrim_range __user *user_range; struct fstrim_range range; - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(sbi->sb->s_bdev)) return -EOPNOTSUPP; user_range = (struct fstrim_range __user *)arg; if (copy_from_user(&range, user_range, sizeof(range))) return -EFAULT; - range.minlen = max_t(u32, range.minlen, q->limits.discard_granularity); + range.minlen = max_t(u32, range.minlen, + bdev_discard_granularity(sbi->sb->s_bdev)); err = ntfs_trim_fs(sbi, &range); if (err < 0) diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 278dcf502410..5781b9e8e3d8 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -882,7 +882,6 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) int err; struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; - struct request_queue *rq; struct inode *inode; struct ntfs_inode *ni; size_t i, tt; @@ -912,15 +911,14 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) goto out; } - rq = bdev_get_queue(bdev); - if (blk_queue_discard(rq) && rq->limits.discard_granularity) { - sbi->discard_granularity = rq->limits.discard_granularity; + if (bdev_max_discard_sectors(bdev) && bdev_discard_granularity(bdev)) { + sbi->discard_granularity = bdev_discard_granularity(bdev); sbi->discard_granularity_mask_inv = ~(u64)(sbi->discard_granularity - 1); } /* Parse boot. */ - err = ntfs_init_from_boot(sb, rq ? 
queue_logical_block_size(rq) : 512, + err = ntfs_init_from_boot(sb, bdev_logical_block_size(bdev), bdev_nr_bytes(bdev)); if (err) goto out; @@ -1335,7 +1333,7 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) return 0; err = blkdev_issue_discard(sb->s_bdev, start >> 9, (end - start) >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (err == -EOPNOTSUPP) sbi->flags |= NTFS_FLAGS_NODISCARD; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index f59461d85da4..afd54ec66103 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -903,20 +903,19 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FITRIM: { struct super_block *sb = inode->i_sb; - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; - range.minlen = max_t(u64, q->limits.discard_granularity, + range.minlen = max_t(u64, bdev_discard_granularity(sb->s_bdev), range.minlen); ret = ocfs2_trim_fs(sb, &range); if (ret < 0) diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index 419760fd77bd..f38bda5b83ec 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -5,14 +5,10 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> -__weak void arch_freq_prepare_all(void) -{ -} - extern const struct seq_operations cpuinfo_op; + static int cpuinfo_open(struct inode *inode, struct file *file) { - arch_freq_prepare_all(); return seq_open(file, &cpuinfo_op); } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 172c86270b31..913bef0d2a36 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -72,7 +72,7 @@ out: return 0; } -static int seq_fdinfo_open(struct inode *inode, struct file *file) +static int proc_fdinfo_access_allowed(struct inode *inode) { bool allowed = false; struct task_struct *task = get_proc_task(inode); @@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file) if (!allowed) return -EACCES; + return 0; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + return single_open(file, seq_show, inode); } @@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx) proc_fdinfo_instantiate); } +static int proc_open_fdinfo(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + + return 0; +} + const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { + .open = proc_open_fdinfo, .read = generic_read_dir, .iterate_shared = proc_readfdinfo, .llseek = generic_file_llseek, diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 622c844f6d11..8879d052f96c 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -86,17 +86,10 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - if (page_count <= BIO_MAX_VECS) { - bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO); - } else { - bio = bio_kmalloc(GFP_NOIO, page_count); - bio_set_dev(bio, sb->s_bdev); - bio->bi_opf = REQ_OP_READ; - } - + bio = bio_kmalloc(page_count, GFP_NOIO); if (!bio) return -ENOMEM; - + bio_init(bio, sb->s_bdev, bio->bi_inline_vecs, page_count, REQ_OP_READ); bio->bi_iter.bi_sector = block * 
(msblk->devblksize >> SECTOR_SHIFT); for (i = 0; i < page_count; ++i) { @@ -126,7 +119,8 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, out_free_bio: bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); return error; } @@ -190,7 +184,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, length |= data[0] << 8; } bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); compressed = SQUASHFS_COMPRESSED(length); length = SQUASHFS_COMPRESSED_SIZE(length); @@ -224,7 +219,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, out_free_bio: bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); out: if (res < 0) { ERROR("Failed to read block 0x%llx: %d\n", index, res); diff --git a/fs/super.c b/fs/super.c index f1d4a193602d..60f57c7bc0a6 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1204,7 +1204,7 @@ static int set_bdev_super(struct super_block *s, void *data) s->s_dev = s->s_bdev->bd_dev; s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); - if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) + if (bdev_stable_writes(s->s_bdev)) s->s_iflags |= SB_I_STABLE_WRITES; return 0; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 008fa46ef61e..7d6d2f152e03 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -132,7 +132,7 @@ #define WORST_COMPR_FACTOR 2 #ifdef CONFIG_FS_ENCRYPTION -#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE +#define UBIFS_CIPHER_BLOCK_SIZE FSCRYPT_CONTENTS_ALIGNMENT #else #define UBIFS_CIPHER_BLOCK_SIZE 0 #endif diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 0ed4861b038f..b3d5f97f16cd 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -75,11 +75,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, if (fileident) { if (adinicb || (offset + lfi < 0)) { - memcpy(udf_get_fi_ident(sfi), fileident, lfi); + memcpy(sfi->impUse + liu, fileident, lfi); } else if (offset >= 0) { memcpy(fibh->ebh->b_data + offset, fileident, lfi); } else { - memcpy(udf_get_fi_ident(sfi), fileident, -offset); + memcpy(sfi->impUse + liu, fileident, -offset); memcpy(fibh->ebh->b_data, fileident - offset, lfi + offset); } @@ -88,11 +88,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, offset += lfi; if (adinicb || (offset + padlen < 0)) { - memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen); + memset(sfi->impUse + liu + lfi, 0x00, padlen); } else if (offset >= 0) { memset(fibh->ebh->b_data + offset, 0x00, padlen); } else { - memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset); + memset(sfi->impUse + liu + lfi, 0x00, -offset); memset(fibh->ebh->b_data, 0x00, padlen + offset); } diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index 24d1b54de807..54598cd80145 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -3,6 +3,7 @@ config FS_VERITY bool "FS Verity (read-only file-based authenticity protection)" select CRYPTO + select CRYPTO_HASH_INFO # SHA-256 is implied as it's intended to be the default hash algorithm. # To avoid bloat, other wanted algorithms must be selected explicitly. 
# Note that CRYPTO_SHA256 denotes the generic C implementation, but diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 60a4372aa4d7..d52872c808ff 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -202,7 +202,7 @@ static int enable_verity(struct file *filp, const struct fsverity_operations *vops = inode->i_sb->s_vop; struct merkle_tree_params params = { }; struct fsverity_descriptor *desc; - size_t desc_size = sizeof(*desc) + arg->sig_size; + size_t desc_size = struct_size(desc, signature, arg->sig_size); struct fsverity_info *vi; int err; @@ -281,7 +281,7 @@ static int enable_verity(struct file *filp, * from disk. This is simpler, and it serves as an extra check that the * metadata we're writing is valid before actually enabling verity. */ - vi = fsverity_create_info(inode, desc, desc_size); + vi = fsverity_create_info(inode, desc); if (IS_ERR(vi)) { err = PTR_ERR(vi); goto rollback; diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index a7920434bae5..629785c95007 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -14,7 +14,6 @@ #define pr_fmt(fmt) "fs-verity: " fmt -#include <crypto/sha2.h> #include <linux/fsverity.h> #include <linux/mempool.h> @@ -26,12 +25,6 @@ struct ahash_request; */ #define FS_VERITY_MAX_LEVELS 8 -/* - * Largest digest size among all hash algorithms supported by fs-verity. - * Currently assumed to be <= size of fsverity_descriptor::root_hash. - */ -#define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE - /* A hash algorithm supported by fs-verity */ struct fsverity_hash_alg { struct crypto_ahash *tfm; /* hash tfm, allocated on demand */ @@ -122,16 +115,14 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, const u8 *salt, size_t salt_size); struct fsverity_info *fsverity_create_info(const struct inode *inode, - struct fsverity_descriptor *desc, - size_t desc_size); + struct fsverity_descriptor *desc); void fsverity_set_info(struct inode *inode, struct fsverity_info *vi); void fsverity_free_info(struct fsverity_info *vi); int fsverity_get_descriptor(struct inode *inode, - struct fsverity_descriptor **desc_ret, - size_t *desc_size_ret); + struct fsverity_descriptor **desc_ret); int __init fsverity_init_info_cache(void); void __init fsverity_exit_info_cache(void); diff --git a/fs/verity/measure.c b/fs/verity/measure.c index f0d7b30c62db..e99c00350c28 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -57,3 +57,46 @@ int fsverity_ioctl_measure(struct file *filp, void __user *_uarg) return 0; } EXPORT_SYMBOL_GPL(fsverity_ioctl_measure); + +/** + * fsverity_get_digest() - get a verity file's digest + * @inode: inode to get digest of + * @digest: (out) pointer to the digest + * @alg: (out) pointer to the hash algorithm enumeration + * + * Return the file hash algorithm and digest of an fsverity protected file. + * Assumption: before calling fsverity_get_digest(), the file must have been + * opened. 
+ * + * Return: 0 on success, -errno on failure + */ +int fsverity_get_digest(struct inode *inode, + u8 digest[FS_VERITY_MAX_DIGEST_SIZE], + enum hash_algo *alg) +{ + const struct fsverity_info *vi; + const struct fsverity_hash_alg *hash_alg; + int i; + + vi = fsverity_get_info(inode); + if (!vi) + return -ENODATA; /* not a verity file */ + + hash_alg = vi->tree_params.hash_alg; + memset(digest, 0, FS_VERITY_MAX_DIGEST_SIZE); + + /* convert the verity hash algorithm name to a hash_algo_name enum */ + i = match_string(hash_algo_name, HASH_ALGO__LAST, hash_alg->name); + if (i < 0) + return -EINVAL; + *alg = i; + + if (WARN_ON_ONCE(hash_alg->digest_size != hash_digest_size[*alg])) + return -EINVAL; + memcpy(digest, vi->file_digest, hash_alg->digest_size); + + pr_debug("file digest %s:%*phN\n", hash_algo_name[*alg], + hash_digest_size[*alg], digest); + + return 0; +} diff --git a/fs/verity/open.c b/fs/verity/open.c index 92df87f5fa38..81ff94442f7b 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -147,8 +147,7 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg, * fsverity_descriptor must have already undergone basic validation. */ struct fsverity_info *fsverity_create_info(const struct inode *inode, - struct fsverity_descriptor *desc, - size_t desc_size) + struct fsverity_descriptor *desc) { struct fsverity_info *vi; int err; @@ -264,8 +263,7 @@ static bool validate_fsverity_descriptor(struct inode *inode, * the filesystem, and do basic validation of it. */ int fsverity_get_descriptor(struct inode *inode, - struct fsverity_descriptor **desc_ret, - size_t *desc_size_ret) + struct fsverity_descriptor **desc_ret) { int res; struct fsverity_descriptor *desc; @@ -297,7 +295,6 @@ int fsverity_get_descriptor(struct inode *inode, } *desc_ret = desc; - *desc_size_ret = res; return 0; } @@ -306,17 +303,16 @@ static int ensure_verity_info(struct inode *inode) { struct fsverity_info *vi = fsverity_get_info(inode); struct fsverity_descriptor *desc; - size_t desc_size; int err; if (vi) return 0; - err = fsverity_get_descriptor(inode, &desc, &desc_size); + err = fsverity_get_descriptor(inode, &desc); if (err) return err; - vi = fsverity_create_info(inode, desc, desc_size); + vi = fsverity_create_info(inode, desc); if (IS_ERR(vi)) { err = PTR_ERR(vi); goto out_free_desc; diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 7e2d0c7bdf0d..6ee849dc7bc1 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -101,7 +101,7 @@ static int fsverity_read_descriptor(struct inode *inode, size_t desc_size; int res; - res = fsverity_get_descriptor(inode, &desc, &desc_size); + res = fsverity_get_descriptor(inode, &desc); if (res) return res; @@ -119,10 +119,9 @@ static int fsverity_read_signature(struct inode *inode, void __user *buf, u64 offset, int length) { struct fsverity_descriptor *desc; - size_t desc_size; int res; - res = fsverity_get_descriptor(inode, &desc, &desc_size); + res = fsverity_get_descriptor(inode, &desc); if (res) return res; diff --git a/fs/xattr.c b/fs/xattr.c index 998045165916..e8dd03e4561e 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -25,6 +25,8 @@ #include <linux/uaccess.h> +#include "internal.h" + static const char * strcmp_prefix(const char *a, const char *a_prefix) { @@ -539,44 +541,76 @@ EXPORT_SYMBOL_GPL(vfs_removexattr); /* * Extended attribute SET operations */ -static long -setxattr(struct user_namespace *mnt_userns, struct dentry *d, - const char __user *name, const void __user *value, size_t size, - int flags) + +int 
setxattr_copy(const char __user *name, struct xattr_ctx *ctx) { int error; - void *kvalue = NULL; - char kname[XATTR_NAME_MAX + 1]; - if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) + if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; - error = strncpy_from_user(kname, name, sizeof(kname)); - if (error == 0 || error == sizeof(kname)) - error = -ERANGE; + error = strncpy_from_user(ctx->kname->name, name, + sizeof(ctx->kname->name)); + if (error == 0 || error == sizeof(ctx->kname->name)) + return -ERANGE; if (error < 0) return error; - if (size) { - if (size > XATTR_SIZE_MAX) + error = 0; + if (ctx->size) { + if (ctx->size > XATTR_SIZE_MAX) return -E2BIG; - kvalue = kvmalloc(size, GFP_KERNEL); - if (!kvalue) - return -ENOMEM; - if (copy_from_user(kvalue, value, size)) { - error = -EFAULT; - goto out; + + ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size); + if (IS_ERR(ctx->kvalue)) { + error = PTR_ERR(ctx->kvalue); + ctx->kvalue = NULL; } - if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), - kvalue, size); } - error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags); -out: - kvfree(kvalue); + return error; +} + +static void setxattr_convert(struct user_namespace *mnt_userns, + struct dentry *d, struct xattr_ctx *ctx) +{ + if (ctx->size && + ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) + posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), + ctx->kvalue, ctx->size); +} + +int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct xattr_ctx *ctx) +{ + setxattr_convert(mnt_userns, dentry, ctx); + return vfs_setxattr(mnt_userns, dentry, ctx->kname->name, + ctx->kvalue, ctx->size, ctx->flags); +} + +static long +setxattr(struct user_namespace *mnt_userns, struct dentry *d, + const char __user *name, const void __user *value, size_t size, + int flags) +{ + struct xattr_name kname; + struct xattr_ctx ctx = { + .cvalue = value, + .kvalue = NULL, + .size = size, + .kname = &kname, + .flags = flags, + }; + int error; + error = setxattr_copy(name, &ctx); + if (error) + return error; + + error = do_setxattr(mnt_userns, d, &ctx); + + kvfree(ctx.kvalue); return error; } @@ -642,44 +676,61 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, /* * Extended attribute GET operations */ -static ssize_t -getxattr(struct user_namespace *mnt_userns, struct dentry *d, - const char __user *name, void __user *value, size_t size) +ssize_t +do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, + struct xattr_ctx *ctx) { ssize_t error; - void *kvalue = NULL; - char kname[XATTR_NAME_MAX + 1]; - - error = strncpy_from_user(kname, name, sizeof(kname)); - if (error == 0 || error == sizeof(kname)) - error = -ERANGE; - if (error < 0) - return error; + char *kname = ctx->kname->name; - if (size) { - if (size > XATTR_SIZE_MAX) - size = XATTR_SIZE_MAX; - kvalue = kvzalloc(size, GFP_KERNEL); - if (!kvalue) + if (ctx->size) { + if (ctx->size > XATTR_SIZE_MAX) + ctx->size = XATTR_SIZE_MAX; + ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL); + if (!ctx->kvalue) return -ENOMEM; } - error = vfs_getxattr(mnt_userns, d, kname, kvalue, size); + error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) posix_acl_fix_xattr_to_user(mnt_userns, 
d_inode(d), - kvalue, error); - if (size && copy_to_user(value, kvalue, error)) + ctx->kvalue, error); + if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; - } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { + } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } - kvfree(kvalue); + return error; +} + +static ssize_t +getxattr(struct user_namespace *mnt_userns, struct dentry *d, + const char __user *name, void __user *value, size_t size) +{ + ssize_t error; + struct xattr_name kname; + struct xattr_ctx ctx = { + .value = value, + .kvalue = NULL, + .size = size, + .kname = &kname, + .flags = 0, + }; + + error = strncpy_from_user(kname.name, name, sizeof(kname.name)); + if (error == 0 || error == sizeof(kname.name)) + error = -ERANGE; + if (error < 0) + return error; + + error = do_getxattr(mnt_userns, d, &ctx); + kvfree(ctx.kvalue); return error; } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 0191de8ce9ce..c6fe3f6ebb6b 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -114,7 +114,7 @@ xfs_trim_extents( } trace_xfs_discard_extent(mp, agno, fbno, flen); - error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); + error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS); if (error) goto out_del_cursor; *blocks_trimmed += flen; @@ -152,8 +152,8 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); - unsigned int granularity = q->limits.discard_granularity; + unsigned int granularity = + bdev_discard_granularity(mp->m_ddev_targp->bt_bdev); struct fstrim_range range; xfs_daddr_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; @@ -162,7 +162,7 @@ xfs_ioc_trim( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) return -EOPNOTSUPP; /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5bddb1e9e0b3..85c412107a10 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -225,7 +225,7 @@ xfs_file_dio_read( ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); if (ret) return ret; - ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0); + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -534,7 +534,7 @@ xfs_file_dio_write_aligned( } trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, 0, 0); + &xfs_dio_write_ops, 0, NULL, 0); out_unlock: if (iolock) xfs_iunlock(ip, iolock); @@ -612,7 +612,7 @@ retry_exclusive: trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, flags, 0); + &xfs_dio_write_ops, flags, NULL, 0); /* * Retry unaligned I/O with exclusive blocking semantics if the DIO diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ba57323bfdce..c9f55e4f0957 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -605,7 +605,7 @@ xlog_discard_busy_extents( error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, 0, &bio); + GFP_NOFS, &bio); if (error && error != -EOPNOTSUPP) { xfs_info(mp, "discard failed for extent [0x%llx,%u], error %d", diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 
54be9d64093e..a276b8111f63 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1608,14 +1608,10 @@ xfs_fs_fill_super( goto out_filestream_unmount; } - if (xfs_has_discard(mp)) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - - if (!blk_queue_discard(q)) { - xfs_warn(mp, "mounting with \"discard\" option, but " - "the device does not support discard"); - mp->m_features &= ~XFS_FEAT_DISCARD; - } + if (xfs_has_discard(mp) && !bdev_max_discard_sectors(sb->s_bdev)) { + xfs_warn(mp, + "mounting with \"discard\" option, but the device does not support discard"); + mp->m_features &= ~XFS_FEAT_DISCARD; } if (xfs_has_reflink(mp)) { diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile index 33c1a4f1132e..9fe54f5319f2 100644 --- a/fs/zonefs/Makefile +++ b/fs/zonefs/Makefile @@ -3,4 +3,4 @@ ccflags-y += -I$(src) obj-$(CONFIG_ZONEFS_FS) += zonefs.o -zonefs-y := super.o +zonefs-y := super.o sysfs.o diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index e20e7c841489..8f306485c953 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -27,6 +27,39 @@ #define CREATE_TRACE_POINTS #include "trace.h" +/* + * Manage the active zone count. Called with zi->i_truncate_mutex held. + */ +static void zonefs_account_active(struct inode *inode) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + lockdep_assert_held(&zi->i_truncate_mutex); + + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return; + + /* + * If the zone is active, that is, if it is explicitly open or + * partially written, check if it was already accounted as active. + */ + if ((zi->i_flags & ZONEFS_ZONE_OPEN) || + (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) { + if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) { + zi->i_flags |= ZONEFS_ZONE_ACTIVE; + atomic_inc(&sbi->s_active_seq_files); + } + return; + } + + /* The zone is not active. If it was, update the active count */ + if (zi->i_flags & ZONEFS_ZONE_ACTIVE) { + zi->i_flags &= ~ZONEFS_ZONE_ACTIVE; + atomic_dec(&sbi->s_active_seq_files); + } +} + static inline int zonefs_zone_mgmt(struct inode *inode, enum req_opf op) { @@ -68,8 +101,13 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) * A full zone is no longer open/active and does not need * explicit closing. 
*/ - if (isize >= zi->i_max_size) - zi->i_flags &= ~ZONEFS_ZONE_OPEN; + if (isize >= zi->i_max_size) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + + if (zi->i_flags & ZONEFS_ZONE_ACTIVE) + atomic_dec(&sbi->s_active_seq_files); + zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); + } } static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, @@ -397,6 +435,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, zonefs_update_stats(inode, data_size); zonefs_i_size_write(inode, data_size); zi->i_wpoffset = data_size; + zonefs_account_active(inode); return 0; } @@ -508,6 +547,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) zonefs_update_stats(inode, isize); truncate_setsize(inode, isize); zi->i_wpoffset = isize; + zonefs_account_active(inode); unlock: mutex_unlock(&zi->i_truncate_mutex); @@ -689,13 +729,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct zonefs_inode_info *zi = ZONEFS_I(inode); struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int max; + unsigned int max = bdev_max_zone_append_sectors(bdev); struct bio *bio; ssize_t size; int nr_pages; ssize_t ret; - max = queue_max_zone_append_sectors(bdev_get_queue(bdev)); max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); iov_iter_truncate(from, max); @@ -861,13 +900,20 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ret = zonefs_file_dio_append(iocb, from); else ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, - &zonefs_write_dio_ops, 0, 0); + &zonefs_write_dio_ops, 0, NULL, 0); if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) count = ret; + + /* + * Update the zone write pointer offset assuming the write + * operation succeeded. If it did not, the error recovery path + * will correct it. Also do active seq file accounting. + */ mutex_lock(&zi->i_truncate_mutex); zi->i_wpoffset += count; + zonefs_account_active(inode); mutex_unlock(&zi->i_truncate_mutex); } @@ -996,7 +1042,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } file_accessed(iocb->ki_filp); ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, - &zonefs_read_dio_ops, 0, 0); + &zonefs_read_dio_ops, 0, NULL, 0); } else { ret = generic_file_read_iter(iocb, to); if (ret == -EIO) @@ -1009,13 +1055,13 @@ inode_unlock: return ret; } -static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file) +/* + * Write open accounting is done only for sequential files. 
+ */ +static inline bool zonefs_seq_file_need_wro(struct inode *inode, + struct file *file) { struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - - if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN)) - return false; if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) return false; @@ -1026,28 +1072,34 @@ static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *fi return true; } -static int zonefs_open_zone(struct inode *inode) +static int zonefs_seq_file_write_open(struct inode *inode) { struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); int ret = 0; mutex_lock(&zi->i_truncate_mutex); if (!zi->i_wr_refcnt) { - if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) { - atomic_dec(&sbi->s_open_zones); - ret = -EBUSY; - goto unlock; - } + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); - if (i_size_read(inode) < zi->i_max_size) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); - if (ret) { - atomic_dec(&sbi->s_open_zones); + if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { + + if (wro > sbi->s_max_wro_seq_files) { + atomic_dec(&sbi->s_wro_seq_files); + ret = -EBUSY; goto unlock; } - zi->i_flags |= ZONEFS_ZONE_OPEN; + + if (i_size_read(inode) < zi->i_max_size) { + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + if (ret) { + atomic_dec(&sbi->s_wro_seq_files); + goto unlock; + } + zi->i_flags |= ZONEFS_ZONE_OPEN; + zonefs_account_active(inode); + } } } @@ -1067,30 +1119,31 @@ static int zonefs_file_open(struct inode *inode, struct file *file) if (ret) return ret; - if (zonefs_file_use_exp_open(inode, file)) - return zonefs_open_zone(inode); + if (zonefs_seq_file_need_wro(inode, file)) + return zonefs_seq_file_write_open(inode); return 0; } -static void zonefs_close_zone(struct inode *inode) +static void zonefs_seq_file_write_close(struct inode *inode) { struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); int ret = 0; mutex_lock(&zi->i_truncate_mutex); - zi->i_wr_refcnt--; - if (!zi->i_wr_refcnt) { - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - struct super_block *sb = inode->i_sb; - /* - * If the file zone is full, it is not open anymore and we only - * need to decrement the open count. - */ - if (!(zi->i_flags & ZONEFS_ZONE_OPEN)) - goto dec; + zi->i_wr_refcnt--; + if (zi->i_wr_refcnt) + goto unlock; + /* + * The file zone may not be open anymore (e.g. the file was truncated to + * its maximum size or it was fully written). For this case, we only + * need to decrement the write open count. 
+ */ + if (zi->i_flags & ZONEFS_ZONE_OPEN) { ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); if (ret) { __zonefs_io_error(inode, false); @@ -1102,14 +1155,23 @@ static void zonefs_close_zone(struct inode *inode) */ if (zi->i_flags & ZONEFS_ZONE_OPEN && !(sb->s_flags & SB_RDONLY)) { - zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n"); + zonefs_warn(sb, + "closing zone at %llu failed %d\n", + zi->i_zsector, ret); + zonefs_warn(sb, + "remounting filesystem read-only\n"); sb->s_flags |= SB_RDONLY; } + goto unlock; } + zi->i_flags &= ~ZONEFS_ZONE_OPEN; -dec: - atomic_dec(&sbi->s_open_zones); + zonefs_account_active(inode); } + + atomic_dec(&sbi->s_wro_seq_files); + +unlock: mutex_unlock(&zi->i_truncate_mutex); } @@ -1121,8 +1183,8 @@ static int zonefs_file_release(struct inode *inode, struct file *file) * the zone has gone offline or read-only). Make sure we don't fail the * close(2) for user-space. */ - if (zonefs_file_use_exp_open(inode, file)) - zonefs_close_zone(inode); + if (zonefs_seq_file_need_wro(inode, file)) + zonefs_seq_file_write_close(inode); return 0; } @@ -1337,6 +1399,8 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; + mutex_lock(&zi->i_truncate_mutex); + /* * For sequential zones, make sure that any open zone is closed first * to ensure that the initial number of open zones is 0, in sync with @@ -1346,11 +1410,16 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, if (type == ZONEFS_ZTYPE_SEQ && (zone->cond == BLK_ZONE_COND_IMP_OPEN || zone->cond == BLK_ZONE_COND_EXP_OPEN)) { - mutex_lock(&zi->i_truncate_mutex); ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); - mutex_unlock(&zi->i_truncate_mutex); + if (ret) + goto unlock; } + zonefs_account_active(inode); + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + return ret; } @@ -1688,14 +1757,18 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_gid = GLOBAL_ROOT_GID; sbi->s_perm = 0640; sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; - sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev); - atomic_set(&sbi->s_open_zones, 0); - if (!sbi->s_max_open_zones && + + atomic_set(&sbi->s_wro_seq_files, 0); + sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev); + if (!sbi->s_max_wro_seq_files && sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { zonefs_info(sb, "No open zones limit. 
Ignoring explicit_open mount option\n"); sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; } + atomic_set(&sbi->s_active_seq_files, 0); + sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev); + ret = zonefs_read_super(sb); if (ret) return ret; @@ -1710,6 +1783,10 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) if (ret) goto cleanup; + ret = zonefs_sysfs_register(sb); + if (ret) + goto cleanup; + zonefs_info(sb, "Mounting %u zones", blkdev_nr_zones(sb->s_bdev->bd_disk)); @@ -1755,6 +1832,8 @@ static void zonefs_kill_super(struct super_block *sb) if (sb->s_root) d_genocide(sb->s_root); + + zonefs_sysfs_unregister(sb); kill_block_super(sb); kfree(sbi); } @@ -1802,16 +1881,26 @@ static int __init zonefs_init(void) return ret; ret = register_filesystem(&zonefs_type); - if (ret) { - zonefs_destroy_inodecache(); - return ret; - } + if (ret) + goto destroy_inodecache; + + ret = zonefs_sysfs_init(); + if (ret) + goto unregister_fs; return 0; + +unregister_fs: + unregister_filesystem(&zonefs_type); +destroy_inodecache: + zonefs_destroy_inodecache(); + + return ret; } static void __exit zonefs_exit(void) { + zonefs_sysfs_exit(); zonefs_destroy_inodecache(); unregister_filesystem(&zonefs_type); } diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c new file mode 100644 index 000000000000..9cb6755ce39a --- /dev/null +++ b/fs/zonefs/sysfs.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Simple file system for zoned block devices exposing zones as files. + * + * Copyright (C) 2022 Western Digital Corporation or its affiliates. + */ +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/blkdev.h> + +#include "zonefs.h" + +struct zonefs_sysfs_attr { + struct attribute attr; + ssize_t (*show)(struct zonefs_sb_info *sbi, char *buf); +}; + +static inline struct zonefs_sysfs_attr *to_attr(struct attribute *attr) +{ + return container_of(attr, struct zonefs_sysfs_attr, attr); +} + +#define ZONEFS_SYSFS_ATTR_RO(name) \ +static struct zonefs_sysfs_attr zonefs_sysfs_attr_##name = __ATTR_RO(name) + +#define ATTR_LIST(name) &zonefs_sysfs_attr_##name.attr + +static ssize_t zonefs_sysfs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct zonefs_sb_info *sbi = + container_of(kobj, struct zonefs_sb_info, s_kobj); + struct zonefs_sysfs_attr *zonefs_attr = + container_of(attr, struct zonefs_sysfs_attr, attr); + + if (!zonefs_attr->show) + return 0; + + return zonefs_attr->show(sbi, buf); +} + +static ssize_t max_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%u\n", sbi->s_max_wro_seq_files); +} +ZONEFS_SYSFS_ATTR_RO(max_wro_seq_files); + +static ssize_t nr_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_wro_seq_files)); +} +ZONEFS_SYSFS_ATTR_RO(nr_wro_seq_files); + +static ssize_t max_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%u\n", sbi->s_max_active_seq_files); +} +ZONEFS_SYSFS_ATTR_RO(max_active_seq_files); + +static ssize_t nr_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_active_seq_files)); +} +ZONEFS_SYSFS_ATTR_RO(nr_active_seq_files); + +static struct attribute *zonefs_sysfs_attrs[] = { + ATTR_LIST(max_wro_seq_files), + ATTR_LIST(nr_wro_seq_files), + ATTR_LIST(max_active_seq_files), + ATTR_LIST(nr_active_seq_files), + NULL, +}; +ATTRIBUTE_GROUPS(zonefs_sysfs); + +static void 
zonefs_sysfs_sb_release(struct kobject *kobj) +{ + struct zonefs_sb_info *sbi = + container_of(kobj, struct zonefs_sb_info, s_kobj); + + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops zonefs_sysfs_attr_ops = { + .show = zonefs_sysfs_attr_show, +}; + +static struct kobj_type zonefs_sb_ktype = { + .default_groups = zonefs_sysfs_groups, + .sysfs_ops = &zonefs_sysfs_attr_ops, + .release = zonefs_sysfs_sb_release, +}; + +static struct kobject *zonefs_sysfs_root; + +int zonefs_sysfs_register(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + int ret; + + init_completion(&sbi->s_kobj_unregister); + ret = kobject_init_and_add(&sbi->s_kobj, &zonefs_sb_ktype, + zonefs_sysfs_root, "%s", sb->s_id); + if (ret) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return ret; + } + + sbi->s_sysfs_registered = true; + + return 0; +} + +void zonefs_sysfs_unregister(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + + if (!sbi || !sbi->s_sysfs_registered) + return; + + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); +} + +int __init zonefs_sysfs_init(void) +{ + zonefs_sysfs_root = kobject_create_and_add("zonefs", fs_kobj); + if (!zonefs_sysfs_root) + return -ENOMEM; + + return 0; +} + +void zonefs_sysfs_exit(void) +{ + kobject_put(zonefs_sysfs_root); + zonefs_sysfs_root = NULL; +} diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 7b147907c328..4b3de66c3233 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -12,6 +12,7 @@ #include <linux/uuid.h> #include <linux/mutex.h> #include <linux/rwsem.h> +#include <linux/kobject.h> /* * Maximum length of file names: this only needs to be large enough to fit @@ -39,6 +40,7 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) } #define ZONEFS_ZONE_OPEN (1 << 0) +#define ZONEFS_ZONE_ACTIVE (1 << 1) /* * In-memory inode data. @@ -182,8 +184,15 @@ struct zonefs_sb_info { loff_t s_blocks; loff_t s_used_blocks; - unsigned int s_max_open_zones; - atomic_t s_open_zones; + unsigned int s_max_wro_seq_files; + atomic_t s_wro_seq_files; + + unsigned int s_max_active_seq_files; + atomic_t s_active_seq_files; + + bool s_sysfs_registered; + struct kobject s_kobj; + struct completion s_kobj_unregister; }; static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) @@ -198,4 +207,9 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) #define zonefs_warn(sb, format, args...) \ pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args) +int zonefs_sysfs_register(struct super_block *sb); +void zonefs_sysfs_unregister(struct super_block *sb); +int zonefs_sysfs_init(void); +void zonefs_sysfs_exit(void); + #endif |
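
As an illustration of the IORING_REGISTER_PBUF_RING opcode introduced in the fs/io_uring.c hunks above, the sketch below shows how userspace might register a provided buffer ring. It is not part of the patch and is not the kernel's or liburing's own code: it assumes the 5.19-era UAPI definitions of struct io_uring_buf, struct io_uring_buf_reg and IORING_REGISTER_PBUF_RING from <linux/io_uring.h>, the register_pbuf_ring() helper name is purely illustrative, and error handling is abbreviated.

/*
 * Userspace sketch: register a provided buffer ring for buffer group
 * `bgid` on an existing io_uring fd, matching the checks performed by
 * io_register_pbuf_ring() above (page-aligned ring_addr, power-of-2
 * ring_entries, zero pad/resv, nr_args == 1).
 */
#include <linux/io_uring.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

static int register_pbuf_ring(int ring_fd, unsigned int entries,
			      unsigned short bgid,
			      struct io_uring_buf_ring **ring_out)
{
	struct io_uring_buf_reg reg;
	size_t len = entries * sizeof(struct io_uring_buf);
	void *ring;

	/* mmap() gives the page alignment that the kernel requires. */
	ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (ring == MAP_FAILED)
		return -1;

	memset(&reg, 0, sizeof(reg));	/* pad and resv[] must be zero */
	reg.ring_addr = (__u64)(uintptr_t)ring;
	reg.ring_entries = entries;	/* must be a power of 2 */
	reg.bgid = bgid;

	/* nr_args must be 1 for IORING_REGISTER_PBUF_RING */
	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_PBUF_RING, &reg, 1) < 0) {
		munmap(ring, len);
		return -1;
	}

	*ring_out = ring;
	return 0;
}

Buffers would then be supplied by filling consecutive struct io_uring_buf slots (addr, len, bid) and publishing them through the ring's tail, which the BUILD_BUG_ON()s added above guarantee overlays the resv field of the first slot; unregistration mirrors registration via IORING_UNREGISTER_PBUF_RING with the same struct io_uring_buf_reg.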