Diffstat (limited to 'fs')
639 files changed, 32728 insertions, 18307 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 42837617a55b..a3159831ba98 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -18,6 +18,10 @@ config VALIDATE_FS_PARSER config FS_IOMAP bool +# Stackable filesystems +config FS_STACK + bool + config BUFFER_HEAD bool @@ -254,7 +258,7 @@ config TMPFS_QUOTA config ARCH_SUPPORTS_HUGETLBFS def_bool n -config HUGETLBFS +menuconfig HUGETLBFS bool "HugeTLB file system support" depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN depends on (SYSFS || SYSCTL) @@ -266,6 +270,17 @@ config HUGETLBFS If unsure, say N. +if HUGETLBFS +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON + bool "HugeTLB Vmemmap Optimization (HVO) defaults to on" + default n + depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP + help + The HugeTLB Vmemmap Optimization (HVO) defaults to off. Say Y here to + enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off + (boot command line) or hugetlb_optimize_vmemmap (sysctl). +endif # HUGETLBFS + config HUGETLB_PAGE def_bool HUGETLBFS select XARRAY_MULTI @@ -275,15 +290,6 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP depends on SPARSEMEM_VMEMMAP -config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON - bool "HugeTLB Vmemmap Optimization (HVO) defaults to on" - default n - depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP - help - The HugeTLB VmemmapvOptimization (HVO) defaults to off. Say Y here to - enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off - (boot command line) or hugetlb_optimize_vmemmap (sysctl). - config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/fs/Makefile b/fs/Makefile index 75522f88e763..a6962c588962 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o +obj-$(CONFIG_FS_STACK) += backing-file.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 3081edb09e46..a183e213a4a5 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -5,6 +5,7 @@ * Copyright (C) 1997-1999 Russell King */ #include <linux/buffer_head.h> +#include <linux/mpage.h> #include <linux/writeback.h> #include "adfs.h" @@ -33,9 +34,10 @@ abort_toobig: return 0; } -static int adfs_writepage(struct page *page, struct writeback_control *wbc) +static int adfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page, adfs_get_block, wbc); + return mpage_writepages(mapping, wbc, adfs_get_block); } static int adfs_read_folio(struct file *file, struct folio *folio) @@ -76,10 +78,11 @@ static const struct address_space_operations adfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = adfs_read_folio, - .writepage = adfs_writepage, + .writepages = adfs_writepages, .write_begin = adfs_write_begin, .write_end = generic_write_end, - .bmap = _adfs_bmap + .migrate_folio = buffer_migrate_folio, + .bmap = _adfs_bmap, }; /* diff --git a/fs/affs/namei.c b/fs/affs/namei.c index d6b9758ee23d..8c154490a2d6 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -532,9 +532,6 @@ static struct dentry *affs_get_parent(struct dentry *child) parent = affs_iget(child->d_sb, be32_to_cpu(AFFS_TAIL(child->d_sb, bh)->parent)); brelse(bh); - if (IS_ERR(parent)) - return ERR_CAST(parent); - return d_obtain_alias(parent); } diff --git a/fs/afs/Makefile b/fs/afs/Makefile 
index e8956b65d7ff..dcdc0f1bb76f 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -5,6 +5,7 @@ kafs-y := \ addr_list.o \ + addr_prefs.o \ callback.o \ cell.o \ cmservice.o \ @@ -27,6 +28,7 @@ kafs-y := \ server.o \ server_list.o \ super.o \ + validation.o \ vlclient.o \ vl_alias.o \ vl_list.o \ diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index de1ae0bead3b..6d42f85c6be5 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -13,26 +13,55 @@ #include "internal.h" #include "afs_fs.h" +static void afs_free_addrlist(struct rcu_head *rcu) +{ + struct afs_addr_list *alist = container_of(rcu, struct afs_addr_list, rcu); + unsigned int i; + + for (i = 0; i < alist->nr_addrs; i++) + rxrpc_kernel_put_peer(alist->addrs[i].peer); + trace_afs_alist(alist->debug_id, refcount_read(&alist->usage), afs_alist_trace_free); + kfree(alist); +} + /* * Release an address list. */ -void afs_put_addrlist(struct afs_addr_list *alist) +void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason) +{ + unsigned int debug_id; + bool dead; + int r; + + if (!alist) + return; + debug_id = alist->debug_id; + dead = __refcount_dec_and_test(&alist->usage, &r); + trace_afs_alist(debug_id, r - 1, reason); + if (dead) + call_rcu(&alist->rcu, afs_free_addrlist); +} + +struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason) { - if (alist && refcount_dec_and_test(&alist->usage)) - kfree_rcu(alist, rcu); + int r; + + if (alist) { + __refcount_inc(&alist->usage, &r); + trace_afs_alist(alist->debug_id, r + 1, reason); + } + return alist; } /* * Allocate an address list. */ -struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, - unsigned short service, - unsigned short port) +struct afs_addr_list *afs_alloc_addrlist(unsigned int nr) { struct afs_addr_list *alist; - unsigned int i; + static atomic_t debug_id; - _enter("%u,%u,%u", nr, service, port); + _enter("%u", nr); if (nr > AFS_MAX_ADDRESSES) nr = AFS_MAX_ADDRESSES; @@ -43,17 +72,8 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, refcount_set(&alist->usage, 1); alist->max_addrs = nr; - - for (i = 0; i < nr; i++) { - struct sockaddr_rxrpc *srx = &alist->addrs[i]; - srx->srx_family = AF_RXRPC; - srx->srx_service = service; - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin6); - srx->transport.sin6.sin6_family = AF_INET6; - srx->transport.sin6.sin6_port = htons(port); - } - + alist->debug_id = atomic_inc_return(&debug_id); + trace_afs_alist(alist->debug_id, 1, afs_alist_trace_alloc); return alist; } @@ -126,7 +146,7 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net, if (!vllist->servers[0].server) goto error_vl; - alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT); + alist = afs_alloc_addrlist(nr); if (!alist) goto error; @@ -197,9 +217,11 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net, } if (family == AF_INET) - afs_merge_fs_addr4(alist, x[0], xport); + ret = afs_merge_fs_addr4(net, alist, x[0], xport); else - afs_merge_fs_addr6(alist, x, xport); + ret = afs_merge_fs_addr6(net, alist, x, xport); + if (ret < 0) + goto error; } while (p < end); @@ -216,26 +238,13 @@ bad_address: problem, p - text, (int)len, (int)len, text); ret = -EINVAL; error: - afs_put_addrlist(alist); + afs_put_addrlist(alist, afs_alist_trace_put_parse_error); error_vl: afs_put_vlserverlist(net, vllist); return ERR_PTR(ret); } /* - * Compare old and new address lists to see if there's been any change. 
- * - How to do this in better than O(Nlog(N)) time? - * - We don't really want to sort the address list, but would rather take the - * list as we got it so as not to undo record rotation by the DNS server. - */ -#if 0 -static int afs_cmp_addr_list(const struct afs_addr_list *a1, - const struct afs_addr_list *a2) -{ -} -#endif - -/* * Perform a DNS query for VL servers and build a up an address list. */ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry) @@ -271,25 +280,33 @@ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry /* * Merge an IPv4 entry into a fileserver address list. */ -void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) +int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *alist, + __be32 xdr, u16 port) { - struct sockaddr_rxrpc *srx; - u32 addr = ntohl(xdr); + struct sockaddr_rxrpc srx; + struct rxrpc_peer *peer; int i; if (alist->nr_addrs >= alist->max_addrs) - return; + return 0; - for (i = 0; i < alist->nr_ipv4; i++) { - struct sockaddr_in *a = &alist->addrs[i].transport.sin; - u32 a_addr = ntohl(a->sin_addr.s_addr); - u16 a_port = ntohs(a->sin_port); + srx.srx_family = AF_RXRPC; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin); + srx.transport.sin.sin_family = AF_INET; + srx.transport.sin.sin_port = htons(port); + srx.transport.sin.sin_addr.s_addr = xdr; - if (addr == a_addr && port == a_port) - return; - if (addr == a_addr && port < a_port) - break; - if (addr < a_addr) + peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL); + if (!peer) + return -ENOMEM; + + for (i = 0; i < alist->nr_ipv4; i++) { + if (peer == alist->addrs[i].peer) { + rxrpc_kernel_put_peer(peer); + return 0; + } + if (peer <= alist->addrs[i].peer) break; } @@ -298,38 +315,42 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) alist->addrs + i, sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); - srx = &alist->addrs[i]; - srx->srx_family = AF_RXRPC; - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin); - srx->transport.sin.sin_family = AF_INET; - srx->transport.sin.sin_port = htons(port); - srx->transport.sin.sin_addr.s_addr = xdr; + alist->addrs[i].peer = peer; alist->nr_ipv4++; alist->nr_addrs++; + return 0; } /* * Merge an IPv6 entry into a fileserver address list. 
*/ -void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) +int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist, + __be32 *xdr, u16 port) { - struct sockaddr_rxrpc *srx; - int i, diff; + struct sockaddr_rxrpc srx; + struct rxrpc_peer *peer; + int i; if (alist->nr_addrs >= alist->max_addrs) - return; + return 0; - for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { - struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6; - u16 a_port = ntohs(a->sin6_port); + srx.srx_family = AF_RXRPC; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin6); + srx.transport.sin6.sin6_family = AF_INET6; + srx.transport.sin6.sin6_port = htons(port); + memcpy(&srx.transport.sin6.sin6_addr, xdr, 16); - diff = memcmp(xdr, &a->sin6_addr, 16); - if (diff == 0 && port == a_port) - return; - if (diff == 0 && port < a_port) - break; - if (diff < 0) + peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL); + if (!peer) + return -ENOMEM; + + for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { + if (peer == alist->addrs[i].peer) { + rxrpc_kernel_put_peer(peer); + return 0; + } + if (peer <= alist->addrs[i].peer) break; } @@ -337,68 +358,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) memmove(alist->addrs + i + 1, alist->addrs + i, sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); - - srx = &alist->addrs[i]; - srx->srx_family = AF_RXRPC; - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin6); - srx->transport.sin6.sin6_family = AF_INET6; - srx->transport.sin6.sin6_port = htons(port); - memcpy(&srx->transport.sin6.sin6_addr, xdr, 16); + alist->addrs[i].peer = peer; alist->nr_addrs++; -} - -/* - * Get an address to try. - */ -bool afs_iterate_addresses(struct afs_addr_cursor *ac) -{ - unsigned long set, failed; - int index; - - if (!ac->alist) - return false; - - set = ac->alist->responded; - failed = ac->alist->failed; - _enter("%lx-%lx-%lx,%d", set, failed, ac->tried, ac->index); - - ac->nr_iterations++; - - set &= ~(failed | ac->tried); - - if (!set) - return false; - - index = READ_ONCE(ac->alist->preferred); - if (test_bit(index, &set)) - goto selected; - - index = __ffs(set); - -selected: - ac->index = index; - set_bit(index, &ac->tried); - ac->responded = false; - return true; -} - -/* - * Release an address list cursor. - */ -int afs_end_cursor(struct afs_addr_cursor *ac) -{ - struct afs_addr_list *alist; - - alist = ac->alist; - if (alist) { - if (ac->responded && - ac->index != alist->preferred && - test_bit(ac->alist->preferred, &ac->tried)) - WRITE_ONCE(alist->preferred, ac->index); - afs_put_addrlist(alist); - ac->alist = NULL; - } - - return ac->error; + return 0; } diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c new file mode 100644 index 000000000000..a189ff8a5034 --- /dev/null +++ b/fs/afs/addr_prefs.c @@ -0,0 +1,531 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Address preferences management + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": addr_prefs: " fmt +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/inet.h> +#include <linux/seq_file.h> +#include <keys/rxrpc-type.h> +#include "internal.h" + +static inline struct afs_net *afs_seq2net_single(struct seq_file *m) +{ + return afs_net(seq_file_single_net(m)); +} + +/* + * Split a NUL-terminated string up to the first newline around spaces. 
The + * source string will be modified to have NUL-terminations inserted. + */ +static int afs_split_string(char **pbuf, char *strv[], unsigned int maxstrv) +{ + unsigned int count = 0; + char *p = *pbuf; + + maxstrv--; /* Allow for terminal NULL */ + for (;;) { + /* Skip over spaces */ + while (isspace(*p)) { + if (*p == '\n') { + p++; + break; + } + p++; + } + if (!*p) + break; + + /* Mark start of word */ + if (count >= maxstrv) { + pr_warn("Too many elements in string\n"); + return -EINVAL; + } + strv[count++] = p; + + /* Skip over word */ + while (!isspace(*p)) + p++; + if (!*p) + break; + + /* Mark end of word */ + if (*p == '\n') { + *p++ = 0; + break; + } + *p++ = 0; + } + + *pbuf = p; + strv[count] = NULL; + return count; +} + +/* + * Parse an address with an optional subnet mask. + */ +static int afs_parse_address(char *p, struct afs_addr_preference *pref) +{ + const char *stop; + unsigned long mask, tmp; + char *end = p + strlen(p); + bool bracket = false; + + if (*p == '[') { + p++; + bracket = true; + } + +#if 0 + if (*p == '[') { + p++; + q = memchr(p, ']', end - p); + if (!q) { + pr_warn("Can't find closing ']'\n"); + return -EINVAL; + } + } else { + for (q = p; q < end; q++) + if (*q == '/') + break; + } +#endif + + if (in4_pton(p, end - p, (u8 *)&pref->ipv4_addr, -1, &stop)) { + pref->family = AF_INET; + mask = 32; + } else if (in6_pton(p, end - p, (u8 *)&pref->ipv6_addr, -1, &stop)) { + pref->family = AF_INET6; + mask = 128; + } else { + pr_warn("Can't determine address family\n"); + return -EINVAL; + } + + p = (char *)stop; + if (bracket) { + if (*p != ']') { + pr_warn("Can't find closing ']'\n"); + return -EINVAL; + } + p++; + } + + if (*p == '/') { + p++; + tmp = simple_strtoul(p, &p, 10); + if (tmp > mask) { + pr_warn("Subnet mask too large\n"); + return -EINVAL; + } + if (tmp == 0) { + pr_warn("Subnet mask too small\n"); + return -EINVAL; + } + mask = tmp; + } + + if (*p) { + pr_warn("Invalid address\n"); + return -EINVAL; + } + + pref->subnet_mask = mask; + return 0; +} + +enum cmp_ret { + CONTINUE_SEARCH, + INSERT_HERE, + EXACT_MATCH, + SUBNET_MATCH, +}; + +/* + * See if a candidate address matches a listed address. + */ +static enum cmp_ret afs_cmp_address_pref(const struct afs_addr_preference *a, + const struct afs_addr_preference *b) +{ + int subnet = min(a->subnet_mask, b->subnet_mask); + const __be32 *pa, *pb; + u32 mask, na, nb; + int diff; + + if (a->family != b->family) + return INSERT_HERE; + + switch (a->family) { + case AF_INET6: + pa = a->ipv6_addr.s6_addr32; + pb = b->ipv6_addr.s6_addr32; + break; + case AF_INET: + pa = &a->ipv4_addr.s_addr; + pb = &b->ipv4_addr.s_addr; + break; + } + + while (subnet > 32) { + diff = ntohl(*pa++) - ntohl(*pb++); + if (diff < 0) + return INSERT_HERE; /* a<b */ + if (diff > 0) + return CONTINUE_SEARCH; /* a>b */ + subnet -= 32; + } + + if (subnet == 0) + return EXACT_MATCH; + + mask = 0xffffffffU << (32 - subnet); + na = ntohl(*pa); + nb = ntohl(*pb); + diff = (na & mask) - (nb & mask); + //kdebug("diff %08x %08x %08x %d", na, nb, mask, diff); + if (diff < 0) + return INSERT_HERE; /* a<b */ + if (diff > 0) + return CONTINUE_SEARCH; /* a>b */ + if (a->subnet_mask == b->subnet_mask) + return EXACT_MATCH; + if (a->subnet_mask > b->subnet_mask) + return SUBNET_MATCH; /* a binds tighter than b */ + return CONTINUE_SEARCH; /* b binds tighter than a */ +} + +/* + * Insert an address preference. 
+ */ +static int afs_insert_address_pref(struct afs_addr_preference_list **_preflist, + struct afs_addr_preference *pref, + int index) +{ + struct afs_addr_preference_list *preflist = *_preflist, *old = preflist; + size_t size, max_prefs; + + _enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index); + + if (preflist->nr == 255) + return -ENOSPC; + if (preflist->nr >= preflist->max_prefs) { + max_prefs = preflist->max_prefs + 1; + size = struct_size(preflist, prefs, max_prefs); + size = roundup_pow_of_two(size); + max_prefs = min_t(size_t, (size - sizeof(*preflist)) / sizeof(*pref), 255); + preflist = kmalloc(size, GFP_KERNEL); + if (!preflist) + return -ENOMEM; + *preflist = **_preflist; + preflist->max_prefs = max_prefs; + *_preflist = preflist; + + if (index < preflist->nr) + memcpy(preflist->prefs + index + 1, old->prefs + index, + sizeof(*pref) * (preflist->nr - index)); + if (index > 0) + memcpy(preflist->prefs, old->prefs, sizeof(*pref) * index); + } else { + if (index < preflist->nr) + memmove(preflist->prefs + index + 1, preflist->prefs + index, + sizeof(*pref) * (preflist->nr - index)); + } + + preflist->prefs[index] = *pref; + preflist->nr++; + if (pref->family == AF_INET) + preflist->ipv6_off++; + return 0; +} + +/* + * Add an address preference. + * echo "add <proto> <IP>[/<mask>] <prior>" >/proc/fs/afs/addr_prefs + */ +static int afs_add_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist, + int argc, char **argv) +{ + struct afs_addr_preference_list *preflist = *_preflist; + struct afs_addr_preference pref; + enum cmp_ret cmp; + int ret, i, stop; + + if (argc != 3) { + pr_warn("Wrong number of params\n"); + return -EINVAL; + } + + if (strcmp(argv[0], "udp") != 0) { + pr_warn("Unsupported protocol\n"); + return -EINVAL; + } + + ret = afs_parse_address(argv[1], &pref); + if (ret < 0) + return ret; + + ret = kstrtou16(argv[2], 10, &pref.prio); + if (ret < 0) { + pr_warn("Invalid priority\n"); + return ret; + } + + if (pref.family == AF_INET) { + i = 0; + stop = preflist->ipv6_off; + } else { + i = preflist->ipv6_off; + stop = preflist->nr; + } + + for (; i < stop; i++) { + cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + case SUBNET_MATCH: + return afs_insert_address_pref(_preflist, &pref, i); + case EXACT_MATCH: + preflist->prefs[i].prio = pref.prio; + return 0; + } + } + + return afs_insert_address_pref(_preflist, &pref, i); +} + +/* + * Delete an address preference. + */ +static int afs_delete_address_pref(struct afs_addr_preference_list **_preflist, + int index) +{ + struct afs_addr_preference_list *preflist = *_preflist; + + _enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index); + + if (preflist->nr == 0) + return -ENOENT; + + if (index < preflist->nr - 1) + memmove(preflist->prefs + index, preflist->prefs + index + 1, + sizeof(preflist->prefs[0]) * (preflist->nr - index - 1)); + + if (index < preflist->ipv6_off) + preflist->ipv6_off--; + preflist->nr--; + return 0; +} + +/* + * Delete an address preference. 
+ * echo "del <proto> <IP>[/<mask>]" >/proc/fs/afs/addr_prefs + */ +static int afs_del_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist, + int argc, char **argv) +{ + struct afs_addr_preference_list *preflist = *_preflist; + struct afs_addr_preference pref; + enum cmp_ret cmp; + int ret, i, stop; + + if (argc != 2) { + pr_warn("Wrong number of params\n"); + return -EINVAL; + } + + if (strcmp(argv[0], "udp") != 0) { + pr_warn("Unsupported protocol\n"); + return -EINVAL; + } + + ret = afs_parse_address(argv[1], &pref); + if (ret < 0) + return ret; + + if (pref.family == AF_INET) { + i = 0; + stop = preflist->ipv6_off; + } else { + i = preflist->ipv6_off; + stop = preflist->nr; + } + + for (; i < stop; i++) { + cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + case SUBNET_MATCH: + return 0; + case EXACT_MATCH: + return afs_delete_address_pref(_preflist, i); + } + } + + return -ENOANO; +} + +/* + * Handle writes to /proc/fs/afs/addr_prefs + */ +int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size) +{ + struct afs_addr_preference_list *preflist, *old; + struct seq_file *m = file->private_data; + struct afs_net *net = afs_seq2net_single(m); + size_t psize; + char *argv[5]; + int ret, argc, max_prefs; + + inode_lock(file_inode(file)); + + /* Allocate a candidate new list and initialise it from the old. */ + old = rcu_dereference_protected(net->address_prefs, + lockdep_is_held(&file_inode(file)->i_rwsem)); + + if (old) + max_prefs = old->nr + 1; + else + max_prefs = 1; + + psize = struct_size(old, prefs, max_prefs); + psize = roundup_pow_of_two(psize); + max_prefs = min_t(size_t, (psize - sizeof(*old)) / sizeof(old->prefs[0]), 255); + + ret = -ENOMEM; + preflist = kmalloc(struct_size(preflist, prefs, max_prefs), GFP_KERNEL); + if (!preflist) + goto done; + + if (old) + memcpy(preflist, old, struct_size(preflist, prefs, old->nr)); + else + memset(preflist, 0, sizeof(*preflist)); + preflist->max_prefs = max_prefs; + + do { + argc = afs_split_string(&buf, argv, ARRAY_SIZE(argv)); + if (argc < 0) + return argc; + if (argc < 2) + goto inval; + + if (strcmp(argv[0], "add") == 0) + ret = afs_add_address_pref(net, &preflist, argc - 1, argv + 1); + else if (strcmp(argv[0], "del") == 0) + ret = afs_del_address_pref(net, &preflist, argc - 1, argv + 1); + else + goto inval; + if (ret < 0) + goto done; + } while (*buf); + + preflist->version++; + rcu_assign_pointer(net->address_prefs, preflist); + /* Store prefs before version */ + smp_store_release(&net->address_pref_version, preflist->version); + kfree_rcu(old, rcu); + preflist = NULL; + ret = 0; + +done: + kfree(preflist); + inode_unlock(file_inode(file)); + _leave(" = %d", ret); + return ret; + +inval: + pr_warn("Invalid Command\n"); + ret = -EINVAL; + goto done; +} + +/* + * Mark the priorities on an address list if the address preferences table has + * changed. The caller must hold the RCU read lock. 
+ */ +void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist) +{ + const struct afs_addr_preference_list *preflist = + rcu_dereference(net->address_prefs); + const struct sockaddr_in6 *sin6; + const struct sockaddr_in *sin; + const struct sockaddr *sa; + struct afs_addr_preference test; + enum cmp_ret cmp; + int i, j; + + if (!preflist || !preflist->nr || !alist->nr_addrs || + smp_load_acquire(&alist->addr_pref_version) == preflist->version) + return; + + test.family = AF_INET; + test.subnet_mask = 32; + test.prio = 0; + for (i = 0; i < alist->nr_ipv4; i++) { + sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer); + sin = (const struct sockaddr_in *)sa; + test.ipv4_addr = sin->sin_addr; + for (j = 0; j < preflist->ipv6_off; j++) { + cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + break; + case EXACT_MATCH: + case SUBNET_MATCH: + WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio); + break; + } + } + } + + test.family = AF_INET6; + test.subnet_mask = 128; + test.prio = 0; + for (; i < alist->nr_addrs; i++) { + sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer); + sin6 = (const struct sockaddr_in6 *)sa; + test.ipv6_addr = sin6->sin6_addr; + for (j = preflist->ipv6_off; j < preflist->nr; j++) { + cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + break; + case EXACT_MATCH: + case SUBNET_MATCH: + WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio); + break; + } + } + } + + smp_store_release(&alist->addr_pref_version, preflist->version); +} + +/* + * Mark the priorities on an address list if the address preferences table has + * changed. Avoid taking the RCU read lock if we can. 
+ */ +void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist) +{ + if (!net->address_prefs || + /* Load version before prefs */ + smp_load_acquire(&net->address_pref_version) == alist->addr_pref_version) + return; + + rcu_read_lock(); + afs_get_address_preferences_rcu(net, alist); + rcu_read_unlock(); +} diff --git a/fs/afs/afs.h b/fs/afs/afs.h index 81815724db6c..b488072aee87 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -165,7 +165,8 @@ struct afs_status_cb { * AFS volume synchronisation information */ struct afs_volsync { - time64_t creation; /* volume creation time */ + time64_t creation; /* Volume creation time (or TIME64_MIN) */ + time64_t update; /* Volume update time (or TIME64_MIN) */ }; /* diff --git a/fs/afs/callback.c b/fs/afs/callback.c index a484fa642808..99b2c8172021 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -33,22 +33,20 @@ void afs_invalidate_mmap_work(struct work_struct *work) unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); } -void afs_server_init_callback_work(struct work_struct *work) +static void afs_volume_init_callback(struct afs_volume *volume) { - struct afs_server *server = container_of(work, struct afs_server, initcb_work); struct afs_vnode *vnode; - struct afs_cell *cell = server->cell; - down_read(&cell->fs_open_mmaps_lock); + down_read(&volume->open_mmaps_lock); - list_for_each_entry(vnode, &cell->fs_open_mmaps, cb_mmap_link) { - if (vnode->cb_server == server) { - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) { + if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) { + atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); queue_work(system_unbound_wq, &vnode->cb_work); } } - up_read(&cell->fs_open_mmaps_lock); + up_read(&volume->open_mmaps_lock); } /* @@ -57,15 +55,20 @@ void afs_server_init_callback_work(struct work_struct *work) */ void afs_init_callback_state(struct afs_server *server) { - rcu_read_lock(); - do { - server->cb_s_break++; - atomic_inc(&server->cell->fs_s_break); - if (!list_empty(&server->cell->fs_open_mmaps)) - queue_work(system_unbound_wq, &server->initcb_work); + struct afs_server_entry *se; - } while ((server = rcu_dereference(server->uuid_next))); - rcu_read_unlock(); + down_read(&server->cell->vs_lock); + + list_for_each_entry(se, &server->volumes, slink) { + se->cb_expires_at = AFS_NO_CB_PROMISE; + se->volume->cb_expires_at = AFS_NO_CB_PROMISE; + trace_afs_cb_v_break(se->volume->vid, atomic_read(&se->volume->cb_v_break), + afs_cb_break_for_s_reinit); + if (!list_empty(&se->volume->open_mmaps)) + afs_volume_init_callback(se->volume); + } + + up_read(&server->cell->vs_lock); } /* @@ -76,9 +79,9 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas _enter(""); clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) { vnode->cb_break++; - vnode->cb_v_break = vnode->volume->cb_v_break; + vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); afs_clear_permits(vnode); if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB) @@ -110,13 +113,14 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell, { struct afs_volume *volume = NULL; struct rb_node *p; - int seq = 0; + int seq = 1; - do { + for (;;) { /* Unfortunately, rbtree walking doesn't give reliable results * under just the RCU read lock, so we have to check 
for * changes. */ + seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&cell->volume_lock, &seq); p = rcu_dereference_raw(cell->volumes.rb_node); @@ -132,35 +136,63 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell, volume = NULL; } - } while (need_seqretry(&cell->volume_lock, seq)); + if (volume && afs_try_get_volume(volume, afs_volume_trace_get_callback)) + break; + if (!need_seqretry(&cell->volume_lock, seq)) + break; + seq |= 1; /* Want a lock next time */ + } done_seqretry(&cell->volume_lock, seq); return volume; } /* + * Allow the fileserver to break callbacks at the volume-level. This is + * typically done when, for example, a R/W volume is snapshotted to a R/O + * volume (the only way to change an R/O volume). It may also, however, happen + * when a volserver takes control of a volume (offlining it, moving it, etc.). + * + * Every file in that volume will need to be reevaluated. + */ +static void afs_break_volume_callback(struct afs_server *server, + struct afs_volume *volume) + __releases(RCU) +{ + struct afs_server_list *slist = rcu_dereference(volume->servers); + unsigned int i, cb_v_break; + + write_lock(&volume->cb_v_break_lock); + + for (i = 0; i < slist->nr_servers; i++) + if (slist->servers[i].server == server) + slist->servers[i].cb_expires_at = AFS_NO_CB_PROMISE; + volume->cb_expires_at = AFS_NO_CB_PROMISE; + + cb_v_break = atomic_inc_return_release(&volume->cb_v_break); + trace_afs_cb_v_break(volume->vid, cb_v_break, afs_cb_break_for_volume_callback); + + write_unlock(&volume->cb_v_break_lock); + rcu_read_unlock(); + + if (!list_empty(&volume->open_mmaps)) + afs_volume_init_callback(volume); +} + +/* * allow the fileserver to explicitly break one callback * - happens when * - the backing file is changed * - a lock is released */ -static void afs_break_one_callback(struct afs_volume *volume, +static void afs_break_one_callback(struct afs_server *server, + struct afs_volume *volume, struct afs_fid *fid) { struct super_block *sb; struct afs_vnode *vnode; struct inode *inode; - if (fid->vnode == 0 && fid->unique == 0) { - /* The callback break applies to an entire volume. */ - write_lock(&volume->cb_v_break_lock); - volume->cb_v_break++; - trace_afs_cb_break(fid, volume->cb_v_break, - afs_cb_break_for_volume_callback, false); - write_unlock(&volume->cb_v_break_lock); - return; - } - /* See if we can find a matching inode - even an I_NEW inode needs to * be marked as it can have its callback broken before we finish * setting up the local inode. @@ -187,25 +219,35 @@ static void afs_break_some_callbacks(struct afs_server *server, afs_volid_t vid = cbb->fid.vid; size_t i; + rcu_read_lock(); volume = afs_lookup_volume_rcu(server->cell, vid); + if (cbb->fid.vnode == 0 && cbb->fid.unique == 0) { + afs_break_volume_callback(server, volume); + *_count -= 1; + if (*_count) + memmove(cbb, cbb + 1, sizeof(*cbb) * *_count); + } else { + /* TODO: Find all matching volumes if we couldn't match the server and + * break them anyway. + */ - /* TODO: Find all matching volumes if we couldn't match the server and - * break them anyway. 
- */ - - for (i = *_count; i > 0; cbb++, i--) { - if (cbb->fid.vid == vid) { - _debug("- Fid { vl=%08llx n=%llu u=%u }", - cbb->fid.vid, - cbb->fid.vnode, - cbb->fid.unique); - --*_count; - if (volume) - afs_break_one_callback(volume, &cbb->fid); - } else { - *residue++ = *cbb; + for (i = *_count; i > 0; cbb++, i--) { + if (cbb->fid.vid == vid) { + _debug("- Fid { vl=%08llx n=%llu u=%u }", + cbb->fid.vid, + cbb->fid.vnode, + cbb->fid.unique); + --*_count; + if (volume) + afs_break_one_callback(server, volume, &cbb->fid); + } else { + *residue++ = *cbb; + } } + rcu_read_unlock(); } + + afs_put_volume(volume, afs_volume_trace_put_callback); } /* @@ -218,11 +260,6 @@ void afs_break_callbacks(struct afs_server *server, size_t count, ASSERT(server != NULL); - rcu_read_lock(); - while (count > 0) afs_break_some_callbacks(server, callbacks, &count); - - rcu_read_unlock(); - return; } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 926cb1188eba..caa09875f520 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -161,13 +161,12 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, refcount_set(&cell->ref, 1); atomic_set(&cell->active, 0); INIT_WORK(&cell->manager, afs_manage_cell_work); + init_rwsem(&cell->vs_lock); cell->volumes = RB_ROOT; INIT_HLIST_HEAD(&cell->proc_volumes); seqlock_init(&cell->volume_lock); cell->fs_servers = RB_ROOT; seqlock_init(&cell->fs_lock); - INIT_LIST_HEAD(&cell->fs_open_mmaps); - init_rwsem(&cell->fs_open_mmaps_lock); rwlock_init(&cell->vl_servers_lock); cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS); @@ -817,7 +816,7 @@ done: final_destruction: /* The root volume is pinning the cell */ - afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root); + afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root); cell->root_volume = NULL; afs_put_cell(cell, afs_cell_trace_put_destroy); } diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index d4ddb20d6732..99a3f20bc786 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -146,10 +146,11 @@ static int afs_find_cm_server_by_peer(struct afs_call *call) { struct sockaddr_rxrpc srx; struct afs_server *server; + struct rxrpc_peer *peer; - rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); + peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall); - server = afs_find_server(call->net, &srx); + server = afs_find_server(call->net, peer); if (!server) { trace_afs_cm_no_server(call, &srx); return 0; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 5219182e52e1..c14533ef108f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -693,8 +693,9 @@ static void afs_do_lookup_success(struct afs_operation *op) vp = &op->file[0]; abort_code = vp->scb.status.abort_code; if (abort_code != 0) { - op->ac.abort_code = abort_code; - op->error = afs_abort_to_error(abort_code); + op->call_abort_code = abort_code; + afs_op_set_error(op, afs_abort_to_error(abort_code)); + op->cumul_error.abort_code = abort_code; } break; @@ -806,8 +807,8 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, cookie->fids[i].vid = dvnode->fid.vid; cookie->ctx.actor = afs_lookup_filldir; cookie->name = dentry->d_name; - cookie->nr_fids = 2; /* slot 0 is saved for the fid we actually want - * and slot 1 for the directory */ + cookie->nr_fids = 2; /* slot 1 is saved for the fid we actually want + * and slot 0 for the directory */ if (!afs_server_supports_ibulk(dvnode)) cookie->one_only = true; @@ -846,13 +847,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, 
_debug("nr_files %u", op->nr_files); /* Need space for examining all the selected files */ - op->error = -ENOMEM; if (op->nr_files > 2) { op->more_files = kvcalloc(op->nr_files - 2, sizeof(struct afs_vnode_param), GFP_KERNEL); - if (!op->more_files) + if (!op->more_files) { + afs_op_nomem(op); goto out_op; + } for (i = 2; i < op->nr_files; i++) { vp = &op->more_files[i - 2]; @@ -878,14 +880,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, * lookups contained therein are stored in the reply without aborting * the whole operation. */ - op->error = -ENOTSUPP; + afs_op_set_error(op, -ENOTSUPP); if (!cookie->one_only) { op->ops = &afs_inline_bulk_status_operation; afs_begin_vnode_operation(op); afs_wait_for_operation(op); } - if (op->error == -ENOTSUPP) { + if (afs_op_error(op) == -ENOTSUPP) { /* We could try FS.BulkStatus next, but this aborts the entire * op if any of the lookups fails - so, for the moment, revert * to FS.FetchStatus for op->file[1]. @@ -895,10 +897,10 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, afs_begin_vnode_operation(op); afs_wait_for_operation(op); } - inode = ERR_PTR(op->error); + inode = ERR_PTR(afs_op_error(op)); out_op: - if (op->error == 0) { + if (!afs_op_error(op)) { inode = &op->file[1].vnode->netfs.inode; op->file[1].vnode = NULL; } @@ -1116,7 +1118,12 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) dir = AFS_FS_I(d_inode(parent)); /* validate the parent directory */ - afs_validate(dir, key); + ret = afs_validate(dir, key); + if (ret == -ERESTARTSYS) { + dput(parent); + key_put(key); + return ret; + } if (test_bit(AFS_VNODE_DELETED, &dir->flags)) { _debug("%pd: parent dir deleted", dentry); @@ -1255,9 +1262,10 @@ void afs_check_for_remote_deletion(struct afs_operation *op) { struct afs_vnode *vnode = op->file[0].vnode; - switch (op->ac.abort_code) { + switch (afs_op_abort_code(op)) { case VNOVNODE: set_bit(AFS_VNODE_DELETED, &vnode->flags); + clear_nlink(&vnode->netfs.inode); afs_break_callback(vnode, afs_cb_break_for_deleted); } } @@ -1273,20 +1281,20 @@ static void afs_vnode_new_inode(struct afs_operation *op) _enter(""); - ASSERTCMP(op->error, ==, 0); + ASSERTCMP(afs_op_error(op), ==, 0); inode = afs_iget(op, vp); if (IS_ERR(inode)) { /* ENOMEM or EINTR at a really inconvenient time - just abandon * the new directory on the server. 
*/ - op->error = PTR_ERR(inode); + afs_op_accumulate_error(op, PTR_ERR(inode), 0); return; } vnode = AFS_FS_I(inode); set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - if (!op->error) + if (!afs_op_error(op)) afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb); d_instantiate(op->dentry, inode); } @@ -1320,7 +1328,7 @@ static void afs_create_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); - if (op->error) + if (afs_op_error(op)) d_drop(op->dentry); } @@ -1373,7 +1381,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry) clear_nlink(&vnode->netfs.inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); } } @@ -1480,7 +1488,7 @@ static void afs_dir_remove_link(struct afs_operation *op) struct dentry *dentry = op->dentry; int ret; - if (op->error != 0 || + if (afs_op_error(op) || (op->file[1].scb.have_status && op->file[1].scb.have_error)) return; if (d_really_is_positive(dentry)) @@ -1504,10 +1512,10 @@ static void afs_dir_remove_link(struct afs_operation *op) ret = afs_validate(vnode, op->key); if (ret != -ESTALE) - op->error = ret; + afs_op_set_error(op, ret); } - _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error); + _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, afs_op_error(op)); } static void afs_unlink_success(struct afs_operation *op) @@ -1538,7 +1546,7 @@ static void afs_unlink_edit_dir(struct afs_operation *op) static void afs_unlink_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); - if (op->unlink.need_rehash && op->error < 0 && op->error != -ENOENT) + if (op->unlink.need_rehash && afs_op_error(op) < 0 && afs_op_error(op) != -ENOENT) d_rehash(op->dentry); } @@ -1579,7 +1587,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) /* Try to make sure we have a callback promise on the victim. */ ret = afs_validate(vnode, op->key); if (ret < 0) { - op->error = ret; + afs_op_set_error(op, ret); goto error; } @@ -1588,7 +1596,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(d_inode(dentry), 0); - op->error = afs_sillyrename(dvnode, vnode, dentry, op->key); + afs_op_set_error(op, afs_sillyrename(dvnode, vnode, dentry, op->key)); goto error; } if (!d_unhashed(dentry)) { @@ -1609,7 +1617,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) /* If there was a conflict with a third party, check the status of the * unlinked vnode. 
*/ - if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + if (afs_op_error(op) == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { op->file[1].update_ctime = false; op->fetch_status.which = 1; op->ops = &afs_fetch_status_operation; @@ -1691,7 +1699,7 @@ static void afs_link_success(struct afs_operation *op) static void afs_link_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); - if (op->error) + if (afs_op_error(op)) d_drop(op->dentry); } @@ -1889,7 +1897,7 @@ static void afs_rename_put(struct afs_operation *op) if (op->rename.rehash) d_rehash(op->rename.rehash); dput(op->rename.tmp); - if (op->error) + if (afs_op_error(op)) d_rehash(op->dentry); } @@ -1934,7 +1942,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, return PTR_ERR(op); ret = afs_validate(vnode, op->key); - op->error = ret; + afs_op_set_error(op, ret); if (ret < 0) goto error; @@ -1971,7 +1979,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, op->rename.tmp = d_alloc(new_dentry->d_parent, &new_dentry->d_name); if (!op->rename.tmp) { - op->error = -ENOMEM; + afs_op_nomem(op); goto error; } @@ -1979,7 +1987,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, AFS_FS_I(d_inode(new_dentry)), new_dentry, op->key); if (ret) { - op->error = ret; + afs_op_set_error(op, ret); goto error; } diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index bb5807e87fa4..a1e581946b93 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -218,7 +218,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode /* If there was a conflict with a third party, check the status of the * unlinked vnode. */ - if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + if (op->cumul_error.error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { op->file[1].update_ctime = false; op->fetch_status.which = 1; op->ops = &afs_fetch_status_operation; diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 1f656005018e..2cd40ba601f1 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -373,7 +373,7 @@ error: void afs_dynroot_depopulate(struct super_block *sb) { struct afs_net *net = afs_sb2net(sb); - struct dentry *root = sb->s_root, *subdir, *tmp; + struct dentry *root = sb->s_root, *subdir; /* Prevent more subdirs from being created */ mutex_lock(&net->proc_cells_lock); @@ -382,10 +382,11 @@ void afs_dynroot_depopulate(struct super_block *sb) mutex_unlock(&net->proc_cells_lock); if (root) { + struct hlist_node *n; inode_lock(root->d_inode); /* Remove all the pins for dirs created for manually added cells */ - list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) { + hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) { if (subdir->d_fsdata) { subdir->d_fsdata = NULL; dput(subdir); diff --git a/fs/afs/file.c b/fs/afs/file.c index d37dd201752b..30914e0d9cb2 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -243,12 +243,9 @@ static void afs_fetch_data_notify(struct afs_operation *op) { struct afs_read *req = op->fetch.req; struct netfs_io_subrequest *subreq = req->subreq; - int error = op->error; + int error = afs_op_error(op); - if (error == -ECONNABORTED) - error = afs_abort_to_error(op->ac.abort_code); req->error = error; - if (subreq) { __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); netfs_subreq_terminated(subreq, error ?: req->actual_len, false); @@ -271,7 +268,7 @@ static void afs_fetch_data_success(struct afs_operation *op) static void afs_fetch_data_put(struct afs_operation *op) { - 
op->fetch.req->error = op->error; + op->fetch.req->error = afs_op_error(op); afs_put_read(op->fetch.req); } @@ -517,13 +514,12 @@ static bool afs_release_folio(struct folio *folio, gfp_t gfp) static void afs_add_open_mmap(struct afs_vnode *vnode) { if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) { - down_write(&vnode->volume->cell->fs_open_mmaps_lock); + down_write(&vnode->volume->open_mmaps_lock); if (list_empty(&vnode->cb_mmap_link)) - list_add_tail(&vnode->cb_mmap_link, - &vnode->volume->cell->fs_open_mmaps); + list_add_tail(&vnode->cb_mmap_link, &vnode->volume->open_mmaps); - up_write(&vnode->volume->cell->fs_open_mmaps_lock); + up_write(&vnode->volume->open_mmaps_lock); } } @@ -532,12 +528,12 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode) if (!atomic_dec_and_test(&vnode->cb_nr_mmap)) return; - down_write(&vnode->volume->cell->fs_open_mmaps_lock); + down_write(&vnode->volume->open_mmaps_lock); if (atomic_read(&vnode->cb_nr_mmap) == 0) list_del_init(&vnode->cb_mmap_link); - up_write(&vnode->volume->cell->fs_open_mmaps_lock); + up_write(&vnode->volume->open_mmaps_lock); flush_work(&vnode->cb_work); } @@ -573,7 +569,7 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg { struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file)); - if (afs_pagecache_valid(vnode)) + if (afs_check_validity(vnode)) return filemap_map_pages(vmf, start_pgoff, end_pgoff); return 0; } diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 7a3803ce3a22..3546b087e791 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -35,13 +35,15 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo key_get(key); } - op->key = key; - op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op); - op->net = volume->cell->net; - op->cb_v_break = volume->cb_v_break; - op->debug_id = atomic_inc_return(&afs_operation_debug_counter); - op->error = -EDESTADDRREQ; - op->ac.error = SHRT_MAX; + op->key = key; + op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op); + op->net = volume->cell->net; + op->cb_v_break = atomic_read(&volume->cb_v_break); + op->pre_volsync.creation = volume->creation_time; + op->pre_volsync.update = volume->update_time; + op->debug_id = atomic_inc_return(&afs_operation_debug_counter); + op->nr_iterations = -1; + afs_op_set_error(op, -EDESTADDRREQ); _leave(" = [op=%08x]", op->debug_id); return op; @@ -71,7 +73,7 @@ static bool afs_get_io_locks(struct afs_operation *op) swap(vnode, vnode2); if (mutex_lock_interruptible(&vnode->io_lock) < 0) { - op->error = -ERESTARTSYS; + afs_op_set_error(op, -ERESTARTSYS); op->flags |= AFS_OPERATION_STOP; _leave(" = f [I 0]"); return false; @@ -80,7 +82,7 @@ static bool afs_get_io_locks(struct afs_operation *op) if (vnode2) { if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) { - op->error = -ERESTARTSYS; + afs_op_set_error(op, -ERESTARTSYS); op->flags |= AFS_OPERATION_STOP; mutex_unlock(&vnode->io_lock); op->flags &= ~AFS_OPERATION_LOCK_0; @@ -147,7 +149,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op) afs_prepare_vnode(op, &op->file[0], 0); afs_prepare_vnode(op, &op->file[1], 1); - op->cb_v_break = op->volume->cb_v_break; + op->cb_v_break = atomic_read(&op->volume->cb_v_break); _leave(" = true"); return true; } @@ -159,16 +161,16 @@ static void afs_end_vnode_operation(struct afs_operation *op) { _enter(""); - if (op->error == -EDESTADDRREQ || - op->error == -EADDRNOTAVAIL || - op->error == -ENETUNREACH || - op->error == -EHOSTUNREACH) + 
switch (afs_op_error(op)) { + case -EDESTADDRREQ: + case -EADDRNOTAVAIL: + case -ENETUNREACH: + case -EHOSTUNREACH: afs_dump_edestaddrreq(op); + break; + } afs_drop_io_locks(op); - - if (op->error == -ECONNABORTED) - op->error = afs_abort_to_error(op->ac.abort_code); } /* @@ -179,37 +181,43 @@ void afs_wait_for_operation(struct afs_operation *op) _enter(""); while (afs_select_fileserver(op)) { - op->cb_s_break = op->server->cb_s_break; + op->call_responded = false; + op->call_error = 0; + op->call_abort_code = 0; if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) && op->ops->issue_yfs_rpc) op->ops->issue_yfs_rpc(op); else if (op->ops->issue_afs_rpc) op->ops->issue_afs_rpc(op); else - op->ac.error = -ENOTSUPP; - - if (op->call) - op->error = afs_wait_for_call_to_complete(op->call, &op->ac); + op->call_error = -ENOTSUPP; + + if (op->call) { + afs_wait_for_call_to_complete(op->call); + op->call_abort_code = op->call->abort_code; + op->call_error = op->call->error; + op->call_responded = op->call->responded; + afs_put_call(op->call); + } } - switch (op->error) { - case 0: + if (op->call_responded) + set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags); + + if (!afs_op_error(op)) { _debug("success"); op->ops->success(op); - break; - case -ECONNABORTED: + } else if (op->cumul_error.aborted) { if (op->ops->aborted) op->ops->aborted(op); - fallthrough; - default: + } else { if (op->ops->failed) op->ops->failed(op); - break; } afs_end_vnode_operation(op); - if (op->error == 0 && op->ops->edit_dir) { + if (!afs_op_error(op) && op->ops->edit_dir) { _debug("edit_dir"); op->ops->edit_dir(op); } @@ -221,7 +229,8 @@ void afs_wait_for_operation(struct afs_operation *op) */ int afs_put_operation(struct afs_operation *op) { - int i, ret = op->error; + struct afs_addr_list *alist; + int i, ret = afs_op_error(op); _enter("op=%08x,%d", op->debug_id, ret); @@ -243,9 +252,19 @@ int afs_put_operation(struct afs_operation *op) kfree(op->more_files); } - afs_end_cursor(&op->ac); + if (op->estate) { + alist = op->estate->addresses; + if (alist) { + if (op->call_responded && + op->addr_index != alist->preferred && + test_bit(alist->preferred, &op->addr_tried)) + WRITE_ONCE(alist->preferred, op->addr_index); + } + } + + afs_clear_server_states(op); afs_put_serverlist(op->net, op->server_list); - afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op); + afs_put_volume(op->volume, afs_volume_trace_put_put_op); key_put(op->key); kfree(op); return ret; diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index daaf3810cc92..580de4adaaf6 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -15,6 +15,42 @@ static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ; static unsigned int afs_fs_probe_slow_poll_interval = 5 * 60 * HZ; +struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate, + enum afs_estate_trace where) +{ + if (estate) { + int r; + + __refcount_inc(&estate->ref, &r); + trace_afs_estate(estate->server_id, estate->probe_seq, r, where); + } + return estate; +} + +static void afs_endpoint_state_rcu(struct rcu_head *rcu) +{ + struct afs_endpoint_state *estate = container_of(rcu, struct afs_endpoint_state, rcu); + + trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), + afs_estate_trace_free); + afs_put_addrlist(estate->addresses, afs_alist_trace_put_estate); + kfree(estate); +} + +void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where) +{ + if (estate) { + unsigned int server_id = estate->server_id, 
probe_seq = estate->probe_seq; + bool dead; + int r; + + dead = __refcount_dec_and_test(&estate->ref, &r); + trace_afs_estate(server_id, probe_seq, r, where); + if (dead) + call_rcu(&estate->rcu, afs_endpoint_state_rcu); + } +} + /* * Start the probe polling timer. We have to supply it with an inc on the * outstanding server count. @@ -38,9 +74,10 @@ static void afs_schedule_fs_probe(struct afs_net *net, /* * Handle the completion of a set of probes. */ -static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server) +static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate) { - bool responded = server->probe.responded; + bool responded = test_bit(AFS_ESTATE_RESPONDED, &estate->flags); write_seqlock(&net->fs_lock); if (responded) { @@ -50,6 +87,7 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server clear_bit(AFS_SERVER_FL_RESPONDING, &server->flags); list_add_tail(&server->probe_link, &net->fs_probe_fast); } + write_sequnlock(&net->fs_lock); afs_schedule_fs_probe(net, server, !responded); @@ -58,12 +96,13 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server /* * Handle the completion of a probe. */ -static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server) +static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate) { _enter(""); - if (atomic_dec_and_test(&server->probe_outstanding)) - afs_finished_fs_probe(net, server); + if (atomic_dec_and_test(&estate->nr_probing)) + afs_finished_fs_probe(net, server, estate); wake_up_all(&server->probe_wq); } @@ -74,24 +113,22 @@ static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server */ static void afs_fs_probe_not_done(struct afs_net *net, struct afs_server *server, - struct afs_addr_cursor *ac) + struct afs_endpoint_state *estate, + int index) { - struct afs_addr_list *alist = ac->alist; - unsigned int index = ac->index; - _enter(""); trace_afs_io_error(0, -ENOMEM, afs_io_error_fs_probe_fail); spin_lock(&server->probe_lock); - server->probe.local_failure = true; - if (server->probe.error == 0) - server->probe.error = -ENOMEM; + set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags); + if (estate->error == 0) + estate->error = -ENOMEM; - set_bit(index, &alist->failed); + set_bit(index, &estate->failed_set); spin_unlock(&server->probe_lock); - return afs_done_one_fs_probe(net, server); + return afs_done_one_fs_probe(net, server, estate); } /* @@ -100,30 +137,34 @@ static void afs_fs_probe_not_done(struct afs_net *net, */ void afs_fileserver_probe_result(struct afs_call *call) { - struct afs_addr_list *alist = call->alist; + struct afs_endpoint_state *estate = call->probe; + struct afs_addr_list *alist = estate->addresses; + struct afs_address *addr = &alist->addrs[call->probe_index]; struct afs_server *server = call->server; - unsigned int index = call->addr_ix; - unsigned int rtt_us = 0, cap0; + unsigned int index = call->probe_index; + unsigned int rtt_us = -1, cap0; int ret = call->error; _enter("%pU,%u", &server->uuid, index); + WRITE_ONCE(addr->last_error, ret); + spin_lock(&server->probe_lock); switch (ret) { case 0: - server->probe.error = 0; + estate->error = 0; goto responded; case -ECONNABORTED: - if (!server->probe.responded) { - server->probe.abort_code = call->abort_code; - server->probe.error = ret; + if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags)) { + estate->abort_code = call->abort_code; + 
estate->error = ret; } goto responded; case -ENOMEM: case -ENONET: - clear_bit(index, &alist->responded); - server->probe.local_failure = true; + clear_bit(index, &estate->responsive_set); + set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags); trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail); goto out; case -ECONNRESET: /* Responded, but call expired. */ @@ -136,29 +177,29 @@ void afs_fileserver_probe_result(struct afs_call *call) case -ETIMEDOUT: case -ETIME: default: - clear_bit(index, &alist->responded); - set_bit(index, &alist->failed); - if (!server->probe.responded && - (server->probe.error == 0 || - server->probe.error == -ETIMEDOUT || - server->probe.error == -ETIME)) - server->probe.error = ret; + clear_bit(index, &estate->responsive_set); + set_bit(index, &estate->failed_set); + if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags) && + (estate->error == 0 || + estate->error == -ETIMEDOUT || + estate->error == -ETIME)) + estate->error = ret; trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail); goto out; } responded: - clear_bit(index, &alist->failed); + clear_bit(index, &estate->failed_set); if (call->service_id == YFS_FS_SERVICE) { - server->probe.is_yfs = true; + set_bit(AFS_ESTATE_IS_YFS, &estate->flags); set_bit(AFS_SERVER_FL_IS_YFS, &server->flags); - alist->addrs[index].srx_service = call->service_id; + server->service_id = call->service_id; } else { - server->probe.not_yfs = true; - if (!server->probe.is_yfs) { + set_bit(AFS_ESTATE_NOT_YFS, &estate->flags); + if (!test_bit(AFS_ESTATE_IS_YFS, &estate->flags)) { clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags); - alist->addrs[index].srx_service = call->service_id; + server->service_id = call->service_id; } cap0 = ntohl(call->tmp); if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES) @@ -167,116 +208,136 @@ responded: clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags); } - rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us); - if (rtt_us < server->probe.rtt) { - server->probe.rtt = rtt_us; + rtt_us = rxrpc_kernel_get_srtt(addr->peer); + if (rtt_us < estate->rtt) { + estate->rtt = rtt_us; server->rtt = rtt_us; alist->preferred = index; } smp_wmb(); /* Set rtt before responded. */ - server->probe.responded = true; - set_bit(index, &alist->responded); + set_bit(AFS_ESTATE_RESPONDED, &estate->flags); + set_bit(index, &estate->responsive_set); set_bit(AFS_SERVER_FL_RESPONDING, &server->flags); out: spin_unlock(&server->probe_lock); - _debug("probe %pU [%u] %pISpc rtt=%u ret=%d", - &server->uuid, index, &alist->addrs[index].transport, + trace_afs_fs_probe(server, false, estate, index, call->error, call->abort_code, rtt_us); + _debug("probe[%x] %pU [%u] %pISpc rtt=%d ret=%d", + estate->probe_seq, &server->uuid, index, + rxrpc_kernel_remote_addr(alist->addrs[index].peer), rtt_us, ret); - return afs_done_one_fs_probe(call->net, server); + return afs_done_one_fs_probe(call->net, server, estate); } /* - * Probe one or all of a fileserver's addresses to find out the best route and - * to query its capabilities. + * Probe all of a fileserver's addresses to find out the best route and to + * query its capabilities. 
*/ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, - struct key *key, bool all) + struct afs_addr_list *new_alist, struct key *key) { - struct afs_addr_cursor ac = { - .index = 0, - }; + struct afs_endpoint_state *estate, *old; + struct afs_addr_list *alist; + unsigned long unprobed; _enter("%pU", &server->uuid); - read_lock(&server->fs_lock); - ac.alist = rcu_dereference_protected(server->addresses, - lockdep_is_held(&server->fs_lock)); - afs_get_addrlist(ac.alist); - read_unlock(&server->fs_lock); + estate = kzalloc(sizeof(*estate), GFP_KERNEL); + if (!estate) + return; + + refcount_set(&estate->ref, 1); + estate->server_id = server->debug_id; + estate->rtt = UINT_MAX; + + write_lock(&server->fs_lock); + + old = rcu_dereference_protected(server->endpoint_state, + lockdep_is_held(&server->fs_lock)); + estate->responsive_set = old->responsive_set; + estate->addresses = afs_get_addrlist(new_alist ?: old->addresses, + afs_alist_trace_get_estate); + alist = estate->addresses; + estate->probe_seq = ++server->probe_counter; + atomic_set(&estate->nr_probing, alist->nr_addrs); + + rcu_assign_pointer(server->endpoint_state, estate); + set_bit(AFS_ESTATE_SUPERSEDED, &old->flags); + write_unlock(&server->fs_lock); + + trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), + afs_estate_trace_alloc_probe); + + afs_get_address_preferences(net, alist); server->probed_at = jiffies; - atomic_set(&server->probe_outstanding, all ? ac.alist->nr_addrs : 1); - memset(&server->probe, 0, sizeof(server->probe)); - server->probe.rtt = UINT_MAX; - - ac.index = ac.alist->preferred; - if (ac.index < 0 || ac.index >= ac.alist->nr_addrs) - all = true; - - if (all) { - for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) - if (!afs_fs_get_capabilities(net, server, &ac, key)) - afs_fs_probe_not_done(net, server, &ac); - } else { - if (!afs_fs_get_capabilities(net, server, &ac, key)) - afs_fs_probe_not_done(net, server, &ac); + unprobed = (1UL << alist->nr_addrs) - 1; + while (unprobed) { + unsigned int index = 0, i; + int best_prio = -1; + + for (i = 0; i < alist->nr_addrs; i++) { + if (test_bit(i, &unprobed) && + alist->addrs[i].prio > best_prio) { + index = i; + best_prio = alist->addrs[i].prio; + } + } + __clear_bit(index, &unprobed); + + trace_afs_fs_probe(server, true, estate, index, 0, 0, 0); + if (!afs_fs_get_capabilities(net, server, estate, index, key)) + afs_fs_probe_not_done(net, server, estate, index); } - afs_put_addrlist(ac.alist); + afs_put_endpoint_state(old, afs_estate_trace_put_probe); } /* - * Wait for the first as-yet untried fileserver to respond. + * Wait for the first as-yet untried fileserver to respond, for the probe state + * to be superseded or for all probes to finish. */ -int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) +int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr) { - struct wait_queue_entry *waits; - struct afs_server *server; - unsigned int rtt = UINT_MAX, rtt_s; - bool have_responders = false; - int pref = -1, i; + struct afs_endpoint_state *estate; + struct afs_server_list *slist = op->server_list; + bool still_probing = true; + int ret = 0, i; - _enter("%u,%lx", slist->nr_servers, untried); + _enter("%u", slist->nr_servers); - /* Only wait for servers that have a probe outstanding. 
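The new dispatch loop in afs_fs_probe_fileserver() above probes every address in the list, but orders the probes by per-address priority (filled in just beforehand by afs_get_address_preferences()): each pass scans the bitmask of not-yet-probed slots, picks the highest-prio entry, clears its bit and fires the probe. Below is a small standalone model of that selection step; the function and type names are illustrative, not taken from the kernel source.

#include <stdio.h>

/* Model of picking the highest-priority entry from a bitmask of
 * not-yet-probed address slots, as the new probe dispatch loop does. */
static int pick_next_addr(const unsigned short *prio, unsigned int nr,
                          unsigned long *unprobed)
{
        int best = -1, best_prio = -1;
        unsigned int i;

        for (i = 0; i < nr; i++) {
                if ((*unprobed & (1UL << i)) && prio[i] > best_prio) {
                        best = i;
                        best_prio = prio[i];
                }
        }
        if (best >= 0)
                *unprobed &= ~(1UL << best);    /* __clear_bit() in the kernel */
        return best;
}

int main(void)
{
        const unsigned short prio[] = { 10, 50, 30 };
        unsigned long unprobed = (1UL << 3) - 1;
        int idx;

        while ((idx = pick_next_addr(prio, 3, &unprobed)) >= 0)
                printf("probe addr[%d] prio=%u\n", idx, (unsigned int)prio[idx]);
        return 0;
}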
*/ for (i = 0; i < slist->nr_servers; i++) { - if (test_bit(i, &untried)) { - server = slist->servers[i].server; - if (!atomic_read(&server->probe_outstanding)) - __clear_bit(i, &untried); - if (server->probe.responded) - have_responders = true; - } + estate = states[i].endpoint_state; + if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) + return 2; + if (atomic_read(&estate->nr_probing)) + still_probing = true; + if (estate->responsive_set & states[i].untried_addrs) + return 1; } - if (have_responders || !untried) + if (!still_probing) return 0; - waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL); - if (!waits) - return -ENOMEM; - - for (i = 0; i < slist->nr_servers; i++) { - if (test_bit(i, &untried)) { - server = slist->servers[i].server; - init_waitqueue_entry(&waits[i], current); - add_wait_queue(&server->probe_wq, &waits[i]); - } - } + for (i = 0; i < slist->nr_servers; i++) + add_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter); for (;;) { - bool still_probing = false; + still_probing = false; - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); for (i = 0; i < slist->nr_servers; i++) { - if (test_bit(i, &untried)) { - server = slist->servers[i].server; - if (server->probe.responded) - goto stop; - if (atomic_read(&server->probe_outstanding)) - still_probing = true; + estate = states[i].endpoint_state; + if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) { + ret = 2; + goto stop; + } + if (atomic_read(&estate->nr_probing)) + still_probing = true; + if (estate->responsive_set & states[i].untried_addrs) { + ret = 1; + goto stop; } } @@ -288,28 +349,12 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) stop: set_current_state(TASK_RUNNING); - for (i = 0; i < slist->nr_servers; i++) { - if (test_bit(i, &untried)) { - server = slist->servers[i].server; - rtt_s = READ_ONCE(server->rtt); - if (test_bit(AFS_SERVER_FL_RESPONDING, &server->flags) && - rtt_s < rtt) { - pref = i; - rtt = rtt_s; - } - - remove_wait_queue(&server->probe_wq, &waits[i]); - } - } - - kfree(waits); - - if (pref == -1 && signal_pending(current)) - return -ERESTARTSYS; + for (i = 0; i < slist->nr_servers; i++) + remove_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter); - if (pref >= 0) - slist->preferred = pref; - return 0; + if (!ret && signal_pending(current)) + ret = -ERESTARTSYS; + return ret; } /* @@ -327,7 +372,7 @@ void afs_fs_probe_timer(struct timer_list *timer) /* * Dispatch a probe to a server. 
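The reworked afs_wait_for_fs_probes() above reports how the wait ended rather than a bare success code: 0 means every outstanding probe completed, 1 means an as-yet-untried address became responsive, 2 means the endpoint state was superseded by a newer probe round, and -ERESTARTSYS means an interruptible wait was broken by a signal. A hedged sketch of how a caller might branch on that convention follows; describe_probe_wait() is purely illustrative.

#include <stdio.h>

#ifndef ERESTARTSYS
#define ERESTARTSYS 512         /* kernel-internal restart code */
#endif

/* Illustrative only: interpreting the return value of afs_wait_for_fs_probes(). */
static const char *describe_probe_wait(int ret)
{
        switch (ret) {
        case 2:                 return "probe state superseded: pick up the new endpoint state";
        case 1:                 return "an untried address became responsive: retry selection";
        case 0:                 return "all probes finished: work with what responded";
        case -ERESTARTSYS:      return "interrupted by a signal";
        default:                return "unexpected";
        }
}

int main(void)
{
        printf("%s\n", describe_probe_wait(1));
        return 0;
}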
*/ -static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server, bool all) +static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server) __releases(&net->fs_lock) { struct key *key = NULL; @@ -340,7 +385,7 @@ static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server afs_get_server(server, afs_server_trace_get_probe); write_sequnlock(&net->fs_lock); - afs_fs_probe_fileserver(net, server, key, all); + afs_fs_probe_fileserver(net, server, NULL, key); afs_put_server(net, server, afs_server_trace_put_probe); } @@ -352,7 +397,7 @@ void afs_probe_fileserver(struct afs_net *net, struct afs_server *server) { write_seqlock(&net->fs_lock); if (!list_empty(&server->probe_link)) - return afs_dispatch_fs_probe(net, server, true); + return afs_dispatch_fs_probe(net, server); write_sequnlock(&net->fs_lock); } @@ -412,7 +457,7 @@ again: _debug("probe %pU", &server->uuid); if (server && (first_pass || !need_resched())) { - afs_dispatch_fs_probe(net, server, server == fast); + afs_dispatch_fs_probe(net, server); first_pass = false; goto again; } @@ -436,12 +481,13 @@ again: /* * Wait for a probe on a particular fileserver to complete for 2s. */ -int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr) +int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, + unsigned long exclude, bool is_intr) { struct wait_queue_entry wait; unsigned long timo = 2 * HZ; - if (atomic_read(&server->probe_outstanding) == 0) + if (atomic_read(&estate->nr_probing) == 0) goto dont_wait; init_wait_entry(&wait, 0); @@ -449,8 +495,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr) prepare_to_wait_event(&server->probe_wq, &wait, is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (timo == 0 || - server->probe.responded || - atomic_read(&server->probe_outstanding) == 0 || + test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags) || + (estate->responsive_set & ~exclude) || + atomic_read(&estate->nr_probing) == 0 || (is_intr && signal_pending(current))) break; timo = schedule_timeout(timo); @@ -459,7 +506,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr) finish_wait(&server->probe_wq, &wait); dont_wait: - if (server->probe.responded) + if (estate->responsive_set & ~exclude) + return 1; + if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) return 0; if (is_intr && signal_pending(current)) return -ERESTARTSYS; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 7d37f63ef0f0..79cd30775b7a 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -290,6 +290,7 @@ void afs_fs_fetch_status(struct afs_operation *op) bp[2] = htonl(vp->fid.vnode); bp[3] = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -442,6 +443,7 @@ static void afs_fs_fetch_data64(struct afs_operation *op) bp[6] = 0; bp[7] = htonl(lower_32_bits(req->len)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -476,6 +478,7 @@ void afs_fs_fetch_data(struct afs_operation *op) bp[4] = htonl(lower_32_bits(req->pos)); bp[5] = htonl(lower_32_bits(req->len)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -559,6 +562,7 @@ void afs_fs_create_file(struct afs_operation *op) *bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */ *bp++ = 0; /* segment size */ + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, 
name); afs_make_op_call(op, call, GFP_NOFS); } @@ -612,6 +616,7 @@ void afs_fs_make_dir(struct afs_operation *op) *bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */ *bp++ = 0; /* segment size */ + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -685,6 +690,7 @@ void afs_fs_remove_file(struct afs_operation *op) bp = (void *) bp + padsz; } + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -732,6 +738,7 @@ void afs_fs_remove_dir(struct afs_operation *op) bp = (void *) bp + padsz; } + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -812,6 +819,7 @@ void afs_fs_link(struct afs_operation *op) *bp++ = htonl(vp->fid.vnode); *bp++ = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call1(call, &vp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -907,6 +915,7 @@ void afs_fs_symlink(struct afs_operation *op) *bp++ = htonl(S_IRWXUGO); /* unix mode */ *bp++ = 0; /* segment size */ + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -1003,6 +1012,7 @@ void afs_fs_rename(struct afs_operation *op) bp = (void *) bp + n_padsz; } + call->fid = orig_dvp->fid; trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); afs_make_op_call(op, call, GFP_NOFS); } @@ -1090,6 +1100,7 @@ static void afs_fs_store_data64(struct afs_operation *op) *bp++ = htonl(upper_32_bits(op->store.i_size)); *bp++ = htonl(lower_32_bits(op->store.i_size)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1140,6 +1151,7 @@ void afs_fs_store_data(struct afs_operation *op) *bp++ = htonl(lower_32_bits(op->store.size)); *bp++ = htonl(lower_32_bits(op->store.i_size)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1206,6 +1218,7 @@ static void afs_fs_setattr_size64(struct afs_operation *op) *bp++ = htonl(upper_32_bits(attr->ia_size)); /* new file length */ *bp++ = htonl(lower_32_bits(attr->ia_size)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1247,6 +1260,7 @@ static void afs_fs_setattr_size(struct afs_operation *op) *bp++ = 0; /* size of write */ *bp++ = htonl(attr->ia_size); /* new file length */ + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1283,6 +1297,7 @@ void afs_fs_setattr(struct afs_operation *op) xdr_encode_AFS_StoreStatus(&bp, op->setattr.attr); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1446,6 +1461,7 @@ void afs_fs_get_volume_status(struct afs_operation *op) bp[0] = htonl(FSGETVOLUMESTATUS); bp[1] = htonl(vp->fid.vid); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1528,6 +1544,7 @@ void afs_fs_set_lock(struct afs_operation *op) *bp++ = htonl(vp->fid.unique); *bp++ = htonl(op->lock.type); + call->fid = vp->fid; trace_afs_make_fs_calli(call, &vp->fid, op->lock.type); afs_make_op_call(op, call, GFP_NOFS); } @@ -1554,6 +1571,7 @@ void afs_fs_extend_lock(struct afs_operation *op) *bp++ = htonl(vp->fid.vnode); *bp++ = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1580,6 +1598,7 @@ void 
afs_fs_release_lock(struct afs_operation *op) *bp++ = htonl(vp->fid.vnode); *bp++ = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1605,13 +1624,12 @@ static const struct afs_call_type afs_RXFSGiveUpAllCallBacks = { /* * Flush all the callbacks we have on a server. */ -int afs_fs_give_up_all_callbacks(struct afs_net *net, - struct afs_server *server, - struct afs_addr_cursor *ac, - struct key *key) +int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, + struct afs_address *addr, struct key *key) { struct afs_call *call; __be32 *bp; + int ret; _enter(""); @@ -1619,15 +1637,22 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, if (!call) return -ENOMEM; - call->key = key; + call->key = key; + call->peer = rxrpc_kernel_get_peer(addr->peer); + call->service_id = server->service_id; /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSGIVEUPALLCALLBACKS); call->server = afs_use_server(server, afs_server_trace_give_up_cb); - afs_make_call(ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, ac); + afs_make_call(call, GFP_NOFS); + afs_wait_for_call_to_complete(call); + ret = call->error; + if (call->responded) + set_bit(AFS_SERVER_FL_RESPONDING, &server->flags); + afs_put_call(call); + return ret; } /* @@ -1689,6 +1714,12 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) return 0; } +static void afs_fs_get_capabilities_destructor(struct afs_call *call) +{ + afs_put_endpoint_state(call->probe, afs_estate_trace_put_getcaps); + afs_flat_call_destructor(call); +} + /* * FS.GetCapabilities operation type */ @@ -1697,7 +1728,7 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { .op = afs_FS_GetCapabilities, .deliver = afs_deliver_fs_get_capabilities, .done = afs_fileserver_probe_result, - .destructor = afs_flat_call_destructor, + .destructor = afs_fs_get_capabilities_destructor, }; /* @@ -1707,7 +1738,8 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { * ->done() - otherwise we return false to indicate we didn't even try. 
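Each dispatched FS.GetCapabilities probe holds a reference on its afs_endpoint_state, which the new call destructor releases, and each completed probe decrements estate->nr_probing (see afs_done_one_fs_probe() earlier in the patch); the round is only treated as finished by whichever probe completes last. A minimal standalone model of that countdown, with illustrative names:

#include <stdbool.h>
#include <stdio.h>

/* Model of the per-round probe countdown: the round starts with one
 * count per address and is finished by the last probe to complete. */
struct probe_round {
        int nr_probing;         /* atomic_t in the kernel */
        bool finished;
};

static void probe_completed(struct probe_round *round)
{
        if (--round->nr_probing == 0) {
                round->finished = true;   /* afs_finished_fs_probe() runs here */
                printf("probe round finished\n");
        }
}

int main(void)
{
        struct probe_round round = { .nr_probing = 3 };

        probe_completed(&round);        /* address 0 replies */
        probe_completed(&round);        /* address 1 times out */
        probe_completed(&round);        /* address 2 replies - round done */
        return 0;
}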
*/ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, - struct afs_addr_cursor *ac, struct key *key) + struct afs_endpoint_state *estate, unsigned int addr_index, + struct key *key) { struct afs_call *call; __be32 *bp; @@ -1718,10 +1750,14 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, if (!call) return false; - call->key = key; - call->server = afs_use_server(server, afs_server_trace_get_caps); - call->upgrade = true; - call->async = true; + call->key = key; + call->server = afs_use_server(server, afs_server_trace_get_caps); + call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer); + call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps); + call->probe_index = addr_index; + call->service_id = server->service_id; + call->upgrade = true; + call->async = true; call->max_lifespan = AFS_PROBE_MAX_LIFESPAN; /* marshall the parameters */ @@ -1729,7 +1765,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, *bp++ = htonl(FSGETCAPABILITIES); trace_afs_make_fs_call(call, NULL); - afs_make_call(ac, call, GFP_NOFS); + afs_make_call(call, GFP_NOFS); afs_put_call(call); return true; } @@ -1853,7 +1889,10 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_AFSVolSync(&bp, &op->volsync); + /* Unfortunately, prior to OpenAFS-1.6, volsync here is filled + * with rubbish. + */ + xdr_decode_AFSVolSync(&bp, NULL); call->unmarshall++; fallthrough; @@ -1899,7 +1938,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op) int i; if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->server->flags)) { - op->error = -ENOTSUPP; + afs_op_set_error(op, -ENOTSUPP); return; } @@ -1928,6 +1967,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op) *bp++ = htonl(op->more_files[i].fid.unique); } + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -2033,6 +2073,7 @@ void afs_fs_fetch_acl(struct afs_operation *op) bp[2] = htonl(vp->fid.vnode); bp[3] = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_KERNEL); } @@ -2078,6 +2119,7 @@ void afs_fs_store_acl(struct afs_operation *op) if (acl->size != size) memset((void *)&bp[5] + acl->size, 0, size - acl->size); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_KERNEL); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 78efc9719349..4f04f6f33f46 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -85,8 +85,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, write_seqlock(&vnode->cb_lock); - vnode->cb_v_break = op->cb_v_break; - vnode->cb_s_break = op->cb_s_break; + vnode->cb_v_check = op->cb_v_break; vnode->status = *status; t = status->mtime_client; @@ -146,11 +145,10 @@ static int afs_inode_init_from_status(struct afs_operation *op, if (!vp->scb.have_cb) { /* it's a symlink we just created (the fileserver * didn't give us a callback) */ - vnode->cb_expires_at = ktime_get_real_seconds(); + atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); } else { - vnode->cb_expires_at = vp->scb.callback.expires_at; vnode->cb_server = op->server; - set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + atomic64_set(&vnode->cb_expires_at, vp->scb.callback.expires_at); } write_sequnlock(&vnode->cb_lock); @@ -214,7 +212,8 @@ static void afs_apply_status(struct afs_operation *op, vnode->status = *status; if (vp->dv_before + 
vp->dv_delta != status->data_version) { - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) + if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) && + atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE) pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n", vnode->fid.vid, vnode->fid.vnode, (unsigned long long)vp->dv_before + vp->dv_delta, @@ -268,9 +267,9 @@ static void afs_apply_callback(struct afs_operation *op, struct afs_vnode *vnode = vp->vnode; if (!afs_cb_is_broken(vp->cb_break_before, vnode)) { - vnode->cb_expires_at = cb->expires_at; - vnode->cb_server = op->server; - set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + if (op->volume->type == AFSVL_RWVOL) + vnode->cb_server = op->server; + atomic64_set(&vnode->cb_expires_at, cb->expires_at); } } @@ -331,7 +330,7 @@ static void afs_fetch_status_success(struct afs_operation *op) if (vnode->netfs.inode.i_state & I_NEW) { ret = afs_inode_init_from_status(op, vp, vnode); - op->error = ret; + afs_op_set_error(op, ret); if (ret == 0) afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb); } else { @@ -542,7 +541,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) BUG_ON(!(inode->i_state & I_NEW)); vnode = AFS_FS_I(inode); - vnode->cb_v_break = as->volume->cb_v_break, + vnode->cb_v_check = atomic_read(&as->volume->cb_v_break), afs_set_netfs_context(vnode); op = afs_alloc_operation(key, as->volume); @@ -573,180 +572,6 @@ error: } /* - * mark the data attached to an inode as obsolete due to a write on the server - * - might also want to ditch all the outstanding writes and dirty pages - */ -static void afs_zap_data(struct afs_vnode *vnode) -{ - _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); - - afs_invalidate_cache(vnode, 0); - - /* nuke all the non-dirty pages that aren't locked, mapped or being - * written back in a regular file and completely discard the pages in a - * directory or symlink */ - if (S_ISREG(vnode->netfs.inode.i_mode)) - invalidate_remote_inode(&vnode->netfs.inode); - else - invalidate_inode_pages2(vnode->netfs.inode.i_mapping); -} - -/* - * Check to see if we have a server currently serving this volume and that it - * hasn't been reinitialised or dropped from the list. - */ -static bool afs_check_server_good(struct afs_vnode *vnode) -{ - struct afs_server_list *slist; - struct afs_server *server; - bool good; - int i; - - if (vnode->cb_fs_s_break == atomic_read(&vnode->volume->cell->fs_s_break)) - return true; - - rcu_read_lock(); - - slist = rcu_dereference(vnode->volume->servers); - for (i = 0; i < slist->nr_servers; i++) { - server = slist->servers[i].server; - if (server == vnode->cb_server) { - good = (vnode->cb_s_break == server->cb_s_break); - rcu_read_unlock(); - return good; - } - } - - rcu_read_unlock(); - return false; -} - -/* - * Check the validity of a vnode/inode. 
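With this change the per-vnode callback promise is tracked purely as a 64-bit expiry time, where TIME64_MIN (AFS_NO_CB_PROMISE, defined in the internal.h hunk later in the patch) replaces the old AFS_VNODE_CB_PROMISED flag. A standalone model of the sentinel check, using a plain int64_t where the kernel field is an atomic64_t:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NO_CB_PROMISE INT64_MIN         /* models AFS_NO_CB_PROMISE (TIME64_MIN) */

struct vnode_model {
        int64_t cb_expires_at;          /* atomic64_t in the kernel */
};

static bool has_cb_promise(const struct vnode_model *v)
{
        return v->cb_expires_at != NO_CB_PROMISE;
}

int main(void)
{
        struct vnode_model v = { .cb_expires_at = NO_CB_PROMISE };

        printf("promised=%d\n", has_cb_promise(&v));    /* 0: must refetch status */
        v.cb_expires_at = 1700000000;                   /* server issued a callback */
        printf("promised=%d\n", has_cb_promise(&v));    /* 1 */
        return 0;
}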
- */ -bool afs_check_validity(struct afs_vnode *vnode) -{ - enum afs_cb_break_reason need_clear = afs_cb_break_no_break; - time64_t now = ktime_get_real_seconds(); - unsigned int cb_break; - int seq = 0; - - do { - read_seqbegin_or_lock(&vnode->cb_lock, &seq); - cb_break = vnode->cb_break; - - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { - if (vnode->cb_v_break != vnode->volume->cb_v_break) - need_clear = afs_cb_break_for_v_break; - else if (!afs_check_server_good(vnode)) - need_clear = afs_cb_break_for_s_reinit; - else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) - need_clear = afs_cb_break_for_zap; - else if (vnode->cb_expires_at - 10 <= now) - need_clear = afs_cb_break_for_lapsed; - } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - ; - } else { - need_clear = afs_cb_break_no_promise; - } - - } while (need_seqretry(&vnode->cb_lock, seq)); - - done_seqretry(&vnode->cb_lock, seq); - - if (need_clear == afs_cb_break_no_break) - return true; - - write_seqlock(&vnode->cb_lock); - if (need_clear == afs_cb_break_no_promise) - vnode->cb_v_break = vnode->volume->cb_v_break; - else if (cb_break == vnode->cb_break) - __afs_break_callback(vnode, need_clear); - else - trace_afs_cb_miss(&vnode->fid, need_clear); - write_sequnlock(&vnode->cb_lock); - return false; -} - -/* - * Returns true if the pagecache is still valid. Does not sleep. - */ -bool afs_pagecache_valid(struct afs_vnode *vnode) -{ - if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { - if (vnode->netfs.inode.i_nlink) - clear_nlink(&vnode->netfs.inode); - return true; - } - - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) && - afs_check_validity(vnode)) - return true; - - return false; -} - -/* - * validate a vnode/inode - * - there are several things we need to check - * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, - * symlink) - * - parent dir metadata changed (security changes) - * - dentry data changed (write, truncate) - * - dentry metadata changed (security changes) - */ -int afs_validate(struct afs_vnode *vnode, struct key *key) -{ - int ret; - - _enter("{v={%llx:%llu} fl=%lx},%x", - vnode->fid.vid, vnode->fid.vnode, vnode->flags, - key_serial(key)); - - if (afs_pagecache_valid(vnode)) - goto valid; - - down_write(&vnode->validate_lock); - - /* if the promise has expired, we need to check the server again to get - * a new promise - note that if the (parent) directory's metadata was - * changed then the security may be different and we may no longer have - * access */ - if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { - _debug("not promised"); - ret = afs_fetch_status(vnode, key, false, NULL); - if (ret < 0) { - if (ret == -ENOENT) { - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } - goto error_unlock; - } - _debug("new promise [fl=%lx]", vnode->flags); - } - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - _debug("file already deleted"); - ret = -ESTALE; - goto error_unlock; - } - - /* if the vnode's data version number changed then its contents are - * different */ - if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) - afs_zap_data(vnode); - up_write(&vnode->validate_lock); -valid: - _leave(" = 0"); - return 0; - -error_unlock: - up_write(&vnode->validate_lock); - _leave(" = %d", ret); - return ret; -} - -/* * read the attributes of an inode */ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, @@ -755,13 +580,13 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, struct inode *inode = d_inode(path->dentry); 
struct afs_vnode *vnode = AFS_FS_I(inode); struct key *key; - int ret, seq = 0; + int ret, seq; _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); if (vnode->volume && !(query_flags & AT_STATX_DONT_SYNC) && - !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + atomic64_read(&vnode->cb_expires_at) == AFS_NO_CB_PROMISE) { key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) return PTR_ERR(key); @@ -772,7 +597,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, } do { - read_seqbegin_or_lock(&vnode->cb_lock, &seq); + seq = read_seqbegin(&vnode->cb_lock); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) @@ -784,9 +609,8 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, */ if (S_ISDIR(inode->i_mode)) stat->size = vnode->netfs.remote_i_size; - } while (need_seqretry(&vnode->cb_lock, seq)); + } while (read_seqretry(&vnode->cb_lock, seq)); - done_seqretry(&vnode->cb_lock, seq); return 0; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 7385d62c8cf5..55aa0679d8ce 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -33,6 +33,7 @@ struct pagevec; struct afs_call; struct afs_vnode; +struct afs_server_probe; /* * Partial file-locking emulation mode. (The problem being that AFS3 only @@ -73,21 +74,51 @@ enum afs_call_state { }; /* + * Address preferences. + */ +struct afs_addr_preference { + union { + struct in_addr ipv4_addr; /* AF_INET address to compare against */ + struct in6_addr ipv6_addr; /* AF_INET6 address to compare against */ + }; + sa_family_t family; /* Which address to use */ + u16 prio; /* Priority */ + u8 subnet_mask; /* How many bits to compare */ +}; + +struct afs_addr_preference_list { + struct rcu_head rcu; + u16 version; /* Incremented when prefs list changes */ + u8 ipv6_off; /* Offset of IPv6 addresses */ + u8 nr; /* Number of addresses in total */ + u8 max_prefs; /* Number of prefs allocated */ + struct afs_addr_preference prefs[] __counted_by(max_prefs); +}; + +struct afs_address { + struct rxrpc_peer *peer; + short last_error; /* Last error from this address */ + u16 prio; /* Address priority */ +}; + +/* * List of server addresses. 
*/ struct afs_addr_list { struct rcu_head rcu; refcount_t usage; u32 version; /* Version */ + unsigned int debug_id; + unsigned int addr_pref_version; /* Version of address preference list */ unsigned char max_addrs; unsigned char nr_addrs; unsigned char preferred; /* Preferred address */ unsigned char nr_ipv4; /* Number of IPv4 addresses */ enum dns_record_source source:8; enum dns_lookup_status status:8; - unsigned long failed; /* Mask of addrs that failed locally/ICMP */ + unsigned long probe_failed; /* Mask of addrs that failed locally/ICMP */ unsigned long responded; /* Mask of addrs that responded */ - struct sockaddr_rxrpc addrs[] __counted_by(max_addrs); + struct afs_address addrs[] __counted_by(max_addrs); #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) }; @@ -96,11 +127,11 @@ struct afs_addr_list { */ struct afs_call { const struct afs_call_type *type; /* type of call */ - struct afs_addr_list *alist; /* Address is alist[addr_ix] */ wait_queue_head_t waitq; /* processes awaiting completion */ struct work_struct async_work; /* async I/O processor */ struct work_struct work; /* actual work processor */ struct rxrpc_call *rxcall; /* RxRPC call handle */ + struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* security for this call */ struct afs_net *net; /* The network namespace */ struct afs_server *server; /* The fileserver record if fs op (pins ref) */ @@ -116,11 +147,14 @@ struct afs_call { }; void *buffer; /* reply receive buffer */ union { - long ret0; /* Value to reply with instead of 0 */ + struct afs_endpoint_state *probe; + struct afs_addr_list *vl_probe; struct afs_addr_list *ret_alist; struct afs_vldb_entry *ret_vldb; char *ret_str; }; + struct afs_fid fid; /* Primary vnode ID (or all zeroes) */ + unsigned char probe_index; /* Address in ->probe_alist */ struct afs_operation *op; unsigned int server_index; refcount_t ref; @@ -133,13 +167,13 @@ struct afs_call { unsigned reply_max; /* maximum size of reply */ unsigned count2; /* count used in unmarshalling */ unsigned char unmarshall; /* unmarshalling phase */ - unsigned char addr_ix; /* Address in ->alist */ bool drop_ref; /* T if need to drop ref for incoming call */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool upgrade; /* T to request service upgrade */ bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ + bool responded; /* Got a response from the call (may be abort) */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ u32 operation_ID; /* operation ID for an incoming call */ @@ -306,6 +340,8 @@ struct afs_net { struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ struct afs_sysnames *sysnames; rwlock_t sysnames_lock; + struct afs_addr_preference_list __rcu *address_prefs; + u16 address_pref_version; /* Statistics counters */ atomic_t n_lookup; /* Number of lookups done */ @@ -379,6 +415,7 @@ struct afs_cell { unsigned int debug_id; /* The volumes belonging to this cell */ + struct rw_semaphore vs_lock; /* Lock for server->volumes */ struct rb_root volumes; /* Tree of volumes on this server */ struct hlist_head proc_volumes; /* procfs volume list */ seqlock_t volume_lock; /* For volumes */ @@ -386,9 +423,6 @@ struct afs_cell { /* Active fileserver interaction state. 
*/ struct rb_root fs_servers; /* afs_server (by server UUID) */ seqlock_t fs_lock; /* For fs_servers */ - struct rw_semaphore fs_open_mmaps_lock; - struct list_head fs_open_mmaps; /* List of vnodes that are mmapped */ - atomic_t fs_s_break; /* Counter of CB.InitCallBackState messages */ /* VL server list. */ rwlock_t vl_servers_lock; /* Lock on vl_servers */ @@ -412,13 +446,14 @@ struct afs_vlserver { rwlock_t lock; /* Lock on addresses */ refcount_t ref; unsigned int rtt; /* Server's current RTT in uS */ + unsigned int debug_id; /* Probe state */ wait_queue_head_t probe_wq; atomic_t probe_outstanding; spinlock_t probe_lock; struct { - unsigned int rtt; /* RTT in uS */ + unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ u32 abort_code; short error; unsigned short flags; @@ -428,6 +463,7 @@ struct afs_vlserver { #define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */ } probe; + u16 service_id; /* Service ID we're using */ u16 port; u16 name_len; /* Length of name */ char name[]; /* Server name, case-flattened */ @@ -477,6 +513,7 @@ struct afs_vldb_entry { #define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ #define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ #define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */ + u8 vlsf_flags[AFS_NMAXNSERVERS]; short error; u8 nr_servers; /* Number of server records */ u8 name_len; @@ -484,6 +521,32 @@ struct afs_vldb_entry { }; /* + * Fileserver endpoint state. The records the addresses of a fileserver's + * endpoints and the state and result of a round of probing on them. This + * allows the rotation algorithm to access those results without them being + * erased by a subsequent round of probing. + */ +struct afs_endpoint_state { + struct rcu_head rcu; + struct afs_addr_list *addresses; /* The addresses being probed */ + unsigned long responsive_set; /* Bitset of responsive endpoints */ + unsigned long failed_set; /* Bitset of endpoints we failed to probe */ + refcount_t ref; + unsigned int server_id; /* Debug ID of server */ + unsigned int probe_seq; /* Probe sequence (from server::probe_counter) */ + atomic_t nr_probing; /* Number of outstanding probes */ + unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ + s32 abort_code; + short error; + unsigned long flags; +#define AFS_ESTATE_RESPONDED 0 /* Set if the server responded */ +#define AFS_ESTATE_SUPERSEDED 1 /* Set if this record has been superseded */ +#define AFS_ESTATE_IS_YFS 2 /* Set if probe upgraded to YFS */ +#define AFS_ESTATE_NOT_YFS 3 /* Set if probe didn't upgrade to YFS */ +#define AFS_ESTATE_LOCAL_FAILURE 4 /* Set if there was a local failure (eg. ENOMEM) */ +}; + +/* * Record of fileserver with which we're actively communicating. 
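struct afs_server now carries its latest probe results as an RCU-managed afs_endpoint_state pointer, so a reader that wants to keep the state beyond the RCU read-side section takes a reference on it, as the rotation code later in this patch does. A non-standalone fragment of that pattern, using only identifiers introduced by the patch:

        struct afs_endpoint_state *estate;

        rcu_read_lock();
        estate = rcu_dereference(server->endpoint_state);
        estate = afs_get_endpoint_state(estate, afs_estate_trace_get_server_state);
        rcu_read_unlock();

        /* ... inspect estate->addresses, estate->responsive_set, ... */

        afs_put_endpoint_state(estate, afs_estate_trace_put_server_state);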
*/ struct afs_server { @@ -493,7 +556,6 @@ struct afs_server { struct afs_uuid _uuid; }; - struct afs_addr_list __rcu *addresses; struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node uuid_rb; /* Link in net->fs_servers */ struct afs_server __rcu *uuid_next; /* Next server with same UUID */ @@ -502,7 +564,7 @@ struct afs_server { struct hlist_node addr4_link; /* Link in net->fs_addresses4 */ struct hlist_node addr6_link; /* Link in net->fs_addresses6 */ struct hlist_node proc_link; /* Link in net->fs_proc */ - struct work_struct initcb_work; /* Work for CB.InitCallBackState* */ + struct list_head volumes; /* RCU list of afs_server_entry objects */ struct afs_server *gc_next; /* Next server in manager's list */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; @@ -520,44 +582,47 @@ struct afs_server { refcount_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ + u16 service_id; /* Service ID we're using. */ unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Debugging ID for traces */ /* file service access */ rwlock_t fs_lock; /* access lock */ - /* callback promise management */ - unsigned cb_s_break; /* Break-everything counter. */ - /* Probe state */ + struct afs_endpoint_state __rcu *endpoint_state; /* Latest endpoint/probe state */ unsigned long probed_at; /* Time last probe was dispatched (jiffies) */ wait_queue_head_t probe_wq; - atomic_t probe_outstanding; + unsigned int probe_counter; /* Number of probes issued */ spinlock_t probe_lock; - struct { - unsigned int rtt; /* RTT in uS */ - u32 abort_code; - short error; - bool responded:1; - bool is_yfs:1; - bool not_yfs:1; - bool local_failure:1; - } probe; }; +enum afs_ro_replicating { + AFS_RO_NOT_REPLICATING, /* Not doing replication */ + AFS_RO_REPLICATING_USE_OLD, /* Replicating; use old version */ + AFS_RO_REPLICATING_USE_NEW, /* Replicating; switch to new version */ +} __mode(byte); + /* * Replaceable volume server list. */ struct afs_server_entry { struct afs_server *server; + struct afs_volume *volume; + struct list_head slink; /* Link in server->volumes */ + time64_t cb_expires_at; /* Time at which volume-level callback expires */ + unsigned long flags; +#define AFS_SE_EXCLUDED 0 /* Set if server is to be excluded in rotation */ +#define AFS_SE_VOLUME_OFFLINE 1 /* Set if volume offline notice given */ +#define AFS_SE_VOLUME_BUSY 2 /* Set if volume busy notice given */ }; struct afs_server_list { struct rcu_head rcu; - afs_volid_t vids[AFS_MAXTYPES]; /* Volume IDs */ refcount_t usage; + bool attached; /* T if attached to servers */ + enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */ unsigned char nr_servers; - unsigned char preferred; /* Preferred server */ unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ unsigned int seq; /* Set to ->servers_seq when installed */ rwlock_t lock; @@ -568,25 +633,23 @@ struct afs_server_list { * Live AFS volume management. 
*/ struct afs_volume { - union { - struct rcu_head rcu; - afs_volid_t vid; /* volume ID */ - }; + struct rcu_head rcu; + afs_volid_t vid; /* The volume ID of this volume */ + afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */ refcount_t ref; time64_t update_at; /* Time at which to next update */ struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node cell_node; /* Link in cell->volumes */ struct hlist_node proc_link; /* Link in cell->proc_volumes */ struct super_block __rcu *sb; /* Superblock on which inodes reside */ + struct work_struct destructor; /* Deferred destructor */ unsigned long flags; #define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */ #define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */ #define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */ #define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */ -#define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */ -#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */ -#define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */ -#define AFS_VOLUME_RM_TREE 7 /* - Set if volume removed from cell->volumes */ +#define AFS_VOLUME_MAYBE_NO_IBULK 4 /* - T if some servers don't have InlineBulkStatus */ +#define AFS_VOLUME_RM_TREE 5 /* - Set if volume removed from cell->volumes */ #ifdef CONFIG_AFS_FSCACHE struct fscache_volume *cache; /* Caching cookie */ #endif @@ -594,8 +657,21 @@ struct afs_volume { rwlock_t servers_lock; /* Lock for ->servers */ unsigned int servers_seq; /* Incremented each time ->servers changes */ - unsigned cb_v_break; /* Break-everything counter. */ + /* RO release tracking */ + struct mutex volsync_lock; /* Time/state evaluation lock */ + time64_t creation_time; /* Volume creation time (or TIME64_MIN) */ + time64_t update_time; /* Volume update time (or TIME64_MIN) */ + + /* Callback management */ + struct mutex cb_check_lock; /* Lock to control race to check after v_break */ + time64_t cb_expires_at; /* Earliest volume callback expiry time */ + atomic_t cb_ro_snapshot; /* RO volume update-from-snapshot counter */ + atomic_t cb_v_break; /* Volume-break event counter. */ + atomic_t cb_v_check; /* Volume-break has-been-checked counter. */ + atomic_t cb_scrub; /* Scrub-all-data event counter. 
*/ rwlock_t cb_v_break_lock; + struct rw_semaphore open_mmaps_lock; + struct list_head open_mmaps; /* List of vnodes that are mmapped */ afs_voltype_t type; /* type of volume */ char type_force; /* force volume type (suppress R/O -> R/W) */ @@ -634,7 +710,6 @@ struct afs_vnode { spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; -#define AFS_VNODE_CB_PROMISED 0 /* Set if vnode has a callback promise */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ #define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ @@ -660,13 +735,14 @@ struct afs_vnode { struct list_head cb_mmap_link; /* Link in cell->fs_open_mmaps */ void *cb_server; /* Server with callback/filelock */ atomic_t cb_nr_mmap; /* Number of mmaps */ - unsigned int cb_fs_s_break; /* Mass server break counter (cell->fs_s_break) */ - unsigned int cb_s_break; /* Mass break counter on ->server */ - unsigned int cb_v_break; /* Mass break counter on ->volume */ + unsigned int cb_ro_snapshot; /* RO volume release counter on ->volume */ + unsigned int cb_scrub; /* Scrub counter on ->volume */ unsigned int cb_break; /* Break counter on vnode */ + unsigned int cb_v_check; /* Break check counter on ->volume */ seqlock_t cb_lock; /* Lock for ->cb_server, ->status, ->cb_*break */ - time64_t cb_expires_at; /* time at which callback expires */ + atomic64_t cb_expires_at; /* time at which callback expires */ +#define AFS_NO_CB_PROMISE TIME64_MIN }; static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) @@ -714,40 +790,49 @@ struct afs_permits { * Error prioritisation and accumulation. */ struct afs_error { - short error; /* Accumulated error */ + s32 abort_code; /* Cumulative abort code */ + short error; /* Cumulative error */ bool responded; /* T if server responded */ -}; - -/* - * Cursor for iterating over a server's address list. - */ -struct afs_addr_cursor { - struct afs_addr_list *alist; /* Current address list (pins ref) */ - unsigned long tried; /* Tried addresses */ - signed char index; /* Current address */ - bool responded; /* T if the current address responded */ - unsigned short nr_iterations; /* Number of address iterations */ - short error; - u32 abort_code; + bool aborted; /* T if ->error is from an abort */ }; /* * Cursor for iterating over a set of volume location servers. 
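struct afs_error above now records the abort code and whether the prioritised error came from a remote abort, alongside whether any server responded. The standalone model below covers only the two transitions visible in the fs/afs/misc.c hunk later in the patch (success resets the record; -ECONNABORTED is translated and marked as aborted and responded); abort_to_errno() is a stand-in for afs_abort_to_error(), and the names are illustrative.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct cumul_error {
        int     abort_code;     /* mirrored from struct afs_error; set elsewhere in the kernel */
        int     error;          /* prioritised errno */
        bool    responded;      /* a server answered, even if only to abort */
        bool    aborted;        /* 'error' was derived from an abort code */
};

/* Stand-in for afs_abort_to_error(): map an AFS abort code to an errno. */
static int abort_to_errno(int abort_code)
{
        (void)abort_code;       /* real mapping lives in afs_abort_to_error() */
        return -EREMOTEIO;
}

static void accumulate(struct cumul_error *e, int error, int abort_code)
{
        if (error == 0) {                       /* success wipes the slate */
                e->aborted = false;
                e->error = 0;
        } else if (error == -ECONNABORTED) {    /* remote abort: translate it */
                e->error = abort_to_errno(abort_code);
                e->aborted = true;
                e->responded = true;
        }
        /* the real afs_prioritise_error() ranks many more error classes */
}

int main(void)
{
        struct cumul_error e = { 0 };

        accumulate(&e, -ECONNABORTED, 101 /* arbitrary abort code for the example */);
        printf("err=%d aborted=%d responded=%d\n", e.error, e.aborted, e.responded);
        return 0;
}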
*/ struct afs_vl_cursor { - struct afs_addr_cursor ac; struct afs_cell *cell; /* The cell we're querying */ struct afs_vlserver_list *server_list; /* Current server list (pins ref) */ struct afs_vlserver *server; /* Server on which this resides */ + struct afs_addr_list *alist; /* Current address list (pins ref) */ struct key *key; /* Key for the server */ - unsigned long untried; /* Bitmask of untried servers */ - short index; /* Current server */ - short error; + unsigned long untried_servers; /* Bitmask of untried servers */ + unsigned long addr_tried; /* Tried addresses */ + struct afs_error cumul_error; /* Cumulative error */ + unsigned int debug_id; + s32 call_abort_code; + short call_error; /* Error from single call */ + short server_index; /* Current server */ + signed char addr_index; /* Current address */ unsigned short flags; #define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */ #define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */ #define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */ - unsigned short nr_iterations; /* Number of server iterations */ + short nr_iterations; /* Number of server iterations */ + bool call_responded; /* T if the current address responded */ +}; + +/* + * Fileserver state tracking for an operation. An array of these is kept, + * indexed by server index. + */ +struct afs_server_state { + /* Tracking of fileserver probe state. Other operations may interfere + * by probing a fileserver when accessing other volumes. + */ + unsigned int probe_seq; + unsigned long untried_addrs; /* Addresses we haven't tried yet */ + struct wait_queue_entry probe_waiter; + struct afs_endpoint_state *endpoint_state; /* Endpoint state being monitored */ }; /* @@ -768,7 +853,7 @@ struct afs_vnode_param { struct afs_fid fid; /* Fid to access */ struct afs_status_cb scb; /* Returned status and callback promise */ afs_dataversion_t dv_before; /* Data version before the call */ - unsigned int cb_break_before; /* cb_break + cb_s_break before the call */ + unsigned int cb_break_before; /* cb_break before the call */ u8 dv_delta; /* Expected change in data version */ bool put_vnode:1; /* T if we have a ref on the vnode */ bool need_io_lock:1; /* T if we need the I/O lock on this */ @@ -793,17 +878,17 @@ struct afs_operation { struct afs_volume *volume; /* Volume being accessed */ struct afs_vnode_param file[2]; struct afs_vnode_param *more_files; - struct afs_volsync volsync; + struct afs_volsync pre_volsync; /* Volsync before op */ + struct afs_volsync volsync; /* Volsync returned by op */ struct dentry *dentry; /* Dentry to be altered */ struct dentry *dentry_2; /* Second dentry to be altered */ struct timespec64 mtime; /* Modification time to record */ struct timespec64 ctime; /* Change time to set */ + struct afs_error cumul_error; /* Cumulative error */ short nr_files; /* Number of entries in file[], more_files */ - short error; unsigned int debug_id; unsigned int cb_v_break; /* Volume break counter before op */ - unsigned int cb_s_break; /* Server break counter before op */ union { struct { @@ -848,13 +933,19 @@ struct afs_operation { }; /* Fileserver iteration state */ - struct afs_addr_cursor ac; struct afs_server_list *server_list; /* Current server list (pins ref) */ struct afs_server *server; /* Server we're using (ref pinned by server_list) */ + struct afs_endpoint_state *estate; /* Current endpoint state (doesn't pin ref) */ + struct afs_server_state *server_states; /* States of the servers involved */ struct afs_call *call; - unsigned long 
untried; /* Bitmask of untried servers */ - short index; /* Current server */ - unsigned short nr_iterations; /* Number of server iterations */ + unsigned long untried_servers; /* Bitmask of untried servers */ + unsigned long addr_tried; /* Tried addresses */ + s32 call_abort_code; /* Abort code from single call */ + short call_error; /* Error from single call */ + short server_index; /* Current server */ + short nr_iterations; /* Number of server iterations */ + signed char addr_index; /* Current address */ + bool call_responded; /* T if the current address responded */ unsigned int flags; #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ @@ -956,31 +1047,32 @@ static inline bool afs_is_folio_dirty_mmapped(unsigned long priv) /* * addr_list.c */ -static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist) -{ - if (alist) - refcount_inc(&alist->usage); - return alist; -} -extern struct afs_addr_list *afs_alloc_addrlist(unsigned int, - unsigned short, - unsigned short); -extern void afs_put_addrlist(struct afs_addr_list *); +struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); +extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr); +extern void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *, const char *, size_t, char, unsigned short, unsigned short); +bool afs_addr_list_same(const struct afs_addr_list *a, + const struct afs_addr_list *b); extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *); -extern bool afs_iterate_addresses(struct afs_addr_cursor *); -extern int afs_end_cursor(struct afs_addr_cursor *); -extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16); -extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16); +extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr, + __be32 xdr, u16 port); +extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr, + __be32 *xdr, u16 port); + +/* + * addr_prefs.c + */ +int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size); +void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist); +void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist); /* * callback.c */ extern void afs_invalidate_mmap_work(struct work_struct *); -extern void afs_server_init_callback_work(struct work_struct *work); extern void afs_init_callback_state(struct afs_server *); extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); @@ -988,13 +1080,15 @@ extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) { - return vnode->cb_break + vnode->cb_v_break; + return vnode->cb_break + vnode->cb_ro_snapshot + vnode->cb_scrub; } static inline bool afs_cb_is_broken(unsigned int cb_break, const struct afs_vnode *vnode) { - return cb_break != (vnode->cb_break + vnode->volume->cb_v_break); + return cb_break != (vnode->cb_break + + atomic_read(&vnode->volume->cb_ro_snapshot) + + atomic_read(&vnode->volume->cb_scrub)); } /* @@ -1110,15 +1204,16 @@ extern void afs_fs_get_volume_status(struct afs_operation *); extern void afs_fs_set_lock(struct afs_operation *); extern void afs_fs_extend_lock(struct afs_operation *); extern void 
afs_fs_release_lock(struct afs_operation *); -extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, - struct afs_addr_cursor *, struct key *); -extern bool afs_fs_get_capabilities(struct afs_net *, struct afs_server *, - struct afs_addr_cursor *, struct key *); +int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, + struct afs_address *addr, struct key *key); +bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate, unsigned int addr_index, + struct key *key); extern void afs_fs_inline_bulk_status(struct afs_operation *); struct afs_acl { u32 size; - u8 data[]; + u8 data[] __counted_by(size); }; extern void afs_fs_fetch_acl(struct afs_operation *); @@ -1133,11 +1228,6 @@ extern bool afs_begin_vnode_operation(struct afs_operation *); extern void afs_wait_for_operation(struct afs_operation *); extern int afs_do_sync_operation(struct afs_operation *); -static inline void afs_op_nomem(struct afs_operation *op) -{ - op->error = -ENOMEM; -} - static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n, struct afs_vnode *vnode) { @@ -1154,12 +1244,17 @@ static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n, /* * fs_probe.c */ +struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate, + enum afs_estate_trace where); +void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where); extern void afs_fileserver_probe_result(struct afs_call *); -extern void afs_fs_probe_fileserver(struct afs_net *, struct afs_server *, struct key *, bool); -extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long); +void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct afs_addr_list *new_addrs, struct key *key); +int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr); extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_fs_probe_dispatcher(struct work_struct *); -extern int afs_wait_for_one_fs_probe(struct afs_server *, bool); +int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, + unsigned long exclude, bool is_intr); extern void afs_fs_probe_cleanup(struct afs_net *); /* @@ -1173,9 +1268,6 @@ extern int afs_ilookup5_test_by_fid(struct inode *, void *); extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); -extern bool afs_check_validity(struct afs_vnode *); -extern int afs_validate(struct afs_vnode *, struct key *); -bool afs_pagecache_valid(struct afs_vnode *); extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, struct kstat *, u32, unsigned int); extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); @@ -1231,6 +1323,31 @@ static inline void __afs_stat(atomic_t *s) extern int afs_abort_to_error(u32); extern void afs_prioritise_error(struct afs_error *, int, u32); +static inline void afs_op_nomem(struct afs_operation *op) +{ + op->cumul_error.error = -ENOMEM; +} + +static inline int afs_op_error(const struct afs_operation *op) +{ + return op->cumul_error.error; +} + +static inline s32 afs_op_abort_code(const struct afs_operation *op) +{ + return op->cumul_error.abort_code; +} + +static inline int afs_op_set_error(struct afs_operation *op, int 
error) +{ + return op->cumul_error.error = error; +} + +static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code) +{ + afs_prioritise_error(&op->cumul_error, error, abort_code); +} + /* * mntpt.c */ @@ -1261,6 +1378,7 @@ static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {} /* * rotate.c */ +void afs_clear_server_states(struct afs_operation *op); extern bool afs_select_fileserver(struct afs_operation *); extern void afs_dump_edestaddrreq(const struct afs_operation *); @@ -1273,8 +1391,8 @@ extern int __net_init afs_open_socket(struct afs_net *); extern void __net_exit afs_close_socket(struct afs_net *); extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); -extern void afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t); -extern long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *); +void afs_make_call(struct afs_call *call, gfp_t gfp); +void afs_wait_for_call_to_complete(struct afs_call *call); extern struct afs_call *afs_alloc_flat_call(struct afs_net *, const struct afs_call_type *, size_t, size_t); @@ -1287,12 +1405,16 @@ extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call, gfp_t gfp) { - op->call = call; - op->type = call->type; - call->op = op; - call->key = op->key; - call->intr = !(op->flags & AFS_OPERATION_UNINTR); - afs_make_call(&op->ac, call, gfp); + struct afs_addr_list *alist = op->estate->addresses; + + op->call = call; + op->type = call->type; + call->op = op; + call->key = op->key; + call->intr = !(op->flags & AFS_OPERATION_UNINTR); + call->peer = rxrpc_kernel_get_peer(alist->addrs[op->addr_index].peer); + call->service_id = op->server->service_id; + afs_make_call(call, gfp); } static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) @@ -1401,8 +1523,7 @@ extern void __exit afs_clean_up_permit_cache(void); */ extern spinlock_t afs_server_peer_lock; -extern struct afs_server *afs_find_server(struct afs_net *, - const struct sockaddr_rxrpc *); +extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *); extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); @@ -1414,7 +1535,7 @@ extern void afs_manage_servers(struct work_struct *); extern void afs_servers_timer(struct timer_list *); extern void afs_fs_probe_timer(struct timer_list *); extern void __net_exit afs_purge_servers(struct afs_net *); -extern bool afs_check_server_record(struct afs_operation *, struct afs_server *); +bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key); static inline void afs_inc_servers_outstanding(struct afs_net *net) { @@ -1442,10 +1563,14 @@ static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list } extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *); -extern struct afs_server_list *afs_alloc_server_list(struct afs_cell *, struct key *, - struct afs_vldb_entry *, - u8); +struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, + struct key *key, + struct afs_vldb_entry *vldb); extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *); +void 
afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist); +void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist, + struct afs_server_list *old); +void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist); /* * super.c @@ -1454,13 +1579,24 @@ extern int __init afs_fs_init(void); extern void afs_fs_exit(void); /* + * validation.c + */ +bool afs_check_validity(const struct afs_vnode *vnode); +int afs_update_volume_state(struct afs_operation *op); +int afs_validate(struct afs_vnode *vnode, struct key *key); + +/* * vlclient.c */ extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *, const char *, int); extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *); -extern struct afs_call *afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, - struct key *, struct afs_vlserver *, unsigned int); +struct afs_call *afs_vl_get_capabilities(struct afs_net *net, + struct afs_addr_list *alist, + unsigned int addr_index, + struct key *key, + struct afs_vlserver *server, + unsigned int server_index); extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *); extern char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *); @@ -1516,7 +1652,7 @@ extern int afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); -extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace); +void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* @@ -1603,7 +1739,7 @@ static inline void afs_update_dentry_version(struct afs_operation *op, struct afs_vnode_param *dir_vp, struct dentry *dentry) { - if (!op->error) + if (!op->cumul_error.error) dentry->d_fsdata = (void *)(unsigned long)dir_vp->scb.status.data_version; } diff --git a/fs/afs/main.c b/fs/afs/main.c index 6425c81d07de..1b3bd21c168a 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -156,6 +156,7 @@ static void __net_exit afs_net_exit(struct net *net_ns) afs_close_socket(net); afs_proc_cleanup(net); afs_put_sysnames(net->sysnames); + kfree_rcu(rcu_access_pointer(net->address_prefs), rcu); } static struct pernet_operations afs_net_ops = { diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 805328ca5428..b8180bf2281f 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -116,6 +116,8 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) { switch (error) { case 0: + e->aborted = false; + e->error = 0; return; default: if (e->error == -ETIMEDOUT || @@ -161,12 +163,16 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) if (e->responded) return; e->error = error; + e->aborted = false; return; case -ECONNABORTED: - error = afs_abort_to_error(abort_code); - fallthrough; + e->error = afs_abort_to_error(abort_code); + e->aborted = true; + e->responded = true; + return; case -ENETRESET: /* Responded, but we seem to have changed address */ + e->aborted = false; e->responded = true; e->error = error; return; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 2a0c83d71565..3bd02571f30d 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -147,6 +147,55 @@ inval: } /* + * Display the list of addr_prefs 
known to the namespace. + */ +static int afs_proc_addr_prefs_show(struct seq_file *m, void *v) +{ + struct afs_addr_preference_list *preflist; + struct afs_addr_preference *pref; + struct afs_net *net = afs_seq2net_single(m); + union { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } addr; + unsigned int i; + char buf[44]; /* Maximum ipv6 + max subnet is 43 */ + + rcu_read_lock(); + preflist = rcu_dereference(net->address_prefs); + + if (!preflist) { + seq_puts(m, "NO PREFS\n"); + return 0; + } + + seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n", + preflist->version, preflist->ipv6_off, preflist->nr, preflist->max_prefs); + + memset(&addr, 0, sizeof(addr)); + + for (i = 0; i < preflist->nr; i++) { + pref = &preflist->prefs[i]; + + addr.sin.sin_family = pref->family; + if (pref->family == AF_INET) { + memcpy(&addr.sin.sin_addr, &pref->ipv4_addr, + sizeof(addr.sin.sin_addr)); + snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin, pref->subnet_mask); + seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio); + } else { + memcpy(&addr.sin6.sin6_addr, &pref->ipv6_addr, + sizeof(addr.sin6.sin6_addr)); + snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin6, pref->subnet_mask); + seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio); + } + } + + rcu_read_lock(); + return 0; +} + +/* * Display the name of the current workstation cell. */ static int afs_proc_rootcell_show(struct seq_file *m, void *v) @@ -307,7 +356,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) for (i = 0; i < alist->nr_addrs; i++) seq_printf(m, " %c %pISpc\n", alist->preferred == i ? '>' : '-', - &alist->addrs[i].transport); + rxrpc_kernel_remote_addr(alist->addrs[i].peer)); } seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt); seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n", @@ -375,32 +424,45 @@ static const struct seq_operations afs_proc_cell_vlservers_ops = { */ static int afs_proc_servers_show(struct seq_file *m, void *v) { - struct afs_server *server; + struct afs_endpoint_state *estate; struct afs_addr_list *alist; + struct afs_server *server; + unsigned long failed; int i; if (v == SEQ_START_TOKEN) { - seq_puts(m, "UUID REF ACT\n"); + seq_puts(m, "UUID REF ACT CELL\n"); return 0; } server = list_entry(v, struct afs_server, proc_link); - alist = rcu_dereference(server->addresses); - seq_printf(m, "%pU %3d %3d\n", + estate = rcu_dereference(server->endpoint_state); + alist = estate->addresses; + seq_printf(m, "%pU %3d %3d %s\n", &server->uuid, refcount_read(&server->ref), - atomic_read(&server->active)); - seq_printf(m, " - info: fl=%lx rtt=%u brk=%x\n", - server->flags, server->rtt, server->cb_s_break); - seq_printf(m, " - probe: last=%d out=%d\n", - (int)(jiffies - server->probed_at) / HZ, - atomic_read(&server->probe_outstanding)); - seq_printf(m, " - ALIST v=%u rsp=%lx f=%lx\n", - alist->version, alist->responded, alist->failed); - for (i = 0; i < alist->nr_addrs; i++) - seq_printf(m, " [%x] %pISpc%s\n", - i, &alist->addrs[i].transport, - alist->preferred == i ? 
"*" : ""); + atomic_read(&server->active), + server->cell->name); + seq_printf(m, " - info: fl=%lx rtt=%u\n", + server->flags, server->rtt); + seq_printf(m, " - probe: last=%d\n", + (int)(jiffies - server->probed_at) / HZ); + failed = estate->failed_set; + seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n", + estate->probe_seq, atomic_read(&estate->nr_probing), + estate->responsive_set, estate->failed_set); + seq_printf(m, " - ALIST v=%u ap=%u\n", + alist->version, alist->addr_pref_version); + for (i = 0; i < alist->nr_addrs; i++) { + const struct afs_address *addr = &alist->addrs[i]; + + seq_printf(m, " [%x] %pISpc%s rtt=%d err=%d p=%u\n", + i, rxrpc_kernel_remote_addr(addr->peer), + alist->preferred == i ? "*" : + test_bit(i, &failed) ? "!" : "", + rxrpc_kernel_get_srtt(addr->peer), + addr->last_error, addr->prio); + } return 0; } @@ -681,7 +743,11 @@ int afs_proc_init(struct afs_net *net) &afs_proc_sysname_ops, afs_proc_sysname_write, sizeof(struct seq_net_private), - NULL)) + NULL) || + !proc_create_net_single_write("addr_prefs", 0644, p, + afs_proc_addr_prefs_show, + afs_proc_addr_prefs_write, + NULL)) goto error_tree; net->proc_afs = p; diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index a840c3588ebb..700a27bc8c25 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -13,6 +13,19 @@ #include <linux/sched/signal.h> #include "internal.h" #include "afs_fs.h" +#include "protocol_uae.h" + +void afs_clear_server_states(struct afs_operation *op) +{ + unsigned int i; + + if (op->server_states) { + for (i = 0; i < op->server_list->nr_servers; i++) + afs_put_endpoint_state(op->server_states[i].endpoint_state, + afs_estate_trace_put_server_state); + kfree(op->server_states); + } +} /* * Begin iteration through a server list, starting with the vnode's last used @@ -25,14 +38,41 @@ static bool afs_start_fs_iteration(struct afs_operation *op, void *cb_server; int i; + trace_afs_rotate(op, afs_rotate_trace_start, 0); + read_lock(&op->volume->servers_lock); op->server_list = afs_get_serverlist( rcu_dereference_protected(op->volume->servers, lockdep_is_held(&op->volume->servers_lock))); read_unlock(&op->volume->servers_lock); - op->untried = (1UL << op->server_list->nr_servers) - 1; - op->index = READ_ONCE(op->server_list->preferred); + op->server_states = kcalloc(op->server_list->nr_servers, sizeof(op->server_states[0]), + GFP_KERNEL); + if (!op->server_states) { + afs_op_nomem(op); + trace_afs_rotate(op, afs_rotate_trace_nomem, 0); + return false; + } + + rcu_read_lock(); + for (i = 0; i < op->server_list->nr_servers; i++) { + struct afs_endpoint_state *estate; + struct afs_server_state *s = &op->server_states[i]; + + server = op->server_list->servers[i].server; + estate = rcu_dereference(server->endpoint_state); + s->endpoint_state = afs_get_endpoint_state(estate, + afs_estate_trace_get_server_state); + s->probe_seq = estate->probe_seq; + s->untried_addrs = (1UL << estate->addresses->nr_addrs) - 1; + init_waitqueue_entry(&s->probe_waiter, current); + afs_get_address_preferences(op->net, estate->addresses); + } + rcu_read_unlock(); + + + op->untried_servers = (1UL << op->server_list->nr_servers) - 1; + op->server_index = -1; cb_server = vnode->cb_server; if (cb_server) { @@ -40,7 +80,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op, for (i = 0; i < op->server_list->nr_servers; i++) { server = op->server_list->servers[i].server; if (server == cb_server) { - op->index = i; + op->server_index = i; goto found_interest; } } @@ -50,7 +90,8 @@ static bool afs_start_fs_iteration(struct 
afs_operation *op, * and have to return an error. */ if (op->flags & AFS_OPERATION_CUR_ONLY) { - op->error = -ESTALE; + afs_op_set_error(op, -ESTALE); + trace_afs_rotate(op, afs_rotate_trace_stale_lock, 0); return false; } @@ -58,7 +99,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op, write_seqlock(&vnode->cb_lock); ASSERTCMP(cb_server, ==, vnode->cb_server); vnode->cb_server = NULL; - if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) + if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) vnode->cb_break++; write_sequnlock(&vnode->cb_lock); } @@ -70,7 +111,7 @@ found_interest: /* * Post volume busy note. */ -static void afs_busy(struct afs_volume *volume, u32 abort_code) +static void afs_busy(struct afs_operation *op, u32 abort_code) { const char *m; @@ -81,7 +122,8 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code) default: m = "busy"; break; } - pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m); + pr_notice("kAFS: Volume %llu '%s' on server %pU is %s\n", + op->volume->vid, op->volume->name, &op->server->uuid, m); } /* @@ -89,10 +131,11 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code) */ static bool afs_sleep_and_retry(struct afs_operation *op) { + trace_afs_rotate(op, afs_rotate_trace_busy_sleep, 0); if (!(op->flags & AFS_OPERATION_UNINTR)) { msleep_interruptible(1000); if (signal_pending(current)) { - op->error = -ERESTARTSYS; + afs_op_set_error(op, -ERESTARTSYS); return false; } } else { @@ -111,62 +154,105 @@ bool afs_select_fileserver(struct afs_operation *op) struct afs_addr_list *alist; struct afs_server *server; struct afs_vnode *vnode = op->file[0].vnode; - struct afs_error e; - u32 rtt; - int error = op->ac.error, i; + unsigned long set, failed; + s32 abort_code = op->call_abort_code; + int best_prio = 0; + int error = op->call_error, addr_index, i, j; + + op->nr_iterations++; - _enter("%lx[%d],%lx[%d],%d,%d", - op->untried, op->index, - op->ac.tried, op->ac.index, - error, op->ac.abort_code); + _enter("OP=%x+%x,%llx,%u{%lx},%u{%lx},%d,%d", + op->debug_id, op->nr_iterations, op->volume->vid, + op->server_index, op->untried_servers, + op->addr_index, op->addr_tried, + error, abort_code); if (op->flags & AFS_OPERATION_STOP) { + trace_afs_rotate(op, afs_rotate_trace_stopped, 0); _leave(" = f [stopped]"); return false; } - op->nr_iterations++; - - /* Evaluate the result of the previous operation, if there was one. */ - switch (error) { - case SHRT_MAX: + if (op->nr_iterations == 0) goto start; + WRITE_ONCE(op->estate->addresses->addrs[op->addr_index].last_error, error); + trace_afs_rotate(op, afs_rotate_trace_iter, op->call_error); + + /* Evaluate the result of the previous operation, if there was one. */ + switch (op->call_error) { case 0: + clear_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags); + clear_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags); + op->cumul_error.responded = true; + + /* We succeeded, but we may need to redo the op from another + * server if we're looking at a set of RO volumes where some of + * the servers have not yet been brought up to date lest we + * regress the data. We only switch to the new version once + * >=50% of the servers are updated. 
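
As an aside to the cut-over rule described above, a minimal plain-C sketch of the decision; the function and parameter names are invented for illustration and only the >=50% test is modelled (the patch derives the counts from the VLDB flags when building the server list):

#include <stdbool.h>

/* Decide whether reads should move to the NEWREPSITE-marked RO sites.
 * nr_usable counts sites not marked DONTUSE; nr_newrep counts those of
 * them that already carry the new replica.
 */
static bool use_new_replication_sites(unsigned int nr_usable, unsigned int nr_newrep)
{
        if (nr_newrep == 0)                     /* no "vos release" in flight */
                return false;
        return nr_newrep >= nr_usable / 2;      /* cut over once >=50% updated */
}
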
+ */ + error = afs_update_volume_state(op); + if (error != 0) { + if (error == 1) { + afs_sleep_and_retry(op); + goto restart_from_beginning; + } + afs_op_set_error(op, error); + goto failed; + } + fallthrough; default: /* Success or local failure. Stop. */ - op->error = error; + afs_op_set_error(op, error); op->flags |= AFS_OPERATION_STOP; + trace_afs_rotate(op, afs_rotate_trace_stop, error); _leave(" = f [okay/local %d]", error); return false; case -ECONNABORTED: /* The far side rejected the operation on some grounds. This * might involve the server being busy or the volume having been moved. + * + * Note that various V* errors should not be sent to a cache manager + * by a fileserver as they should be translated to more modern UAE* + * errors instead. IBM AFS and OpenAFS fileservers, however, do leak + * these abort codes. */ - switch (op->ac.abort_code) { + trace_afs_rotate(op, afs_rotate_trace_aborted, abort_code); + op->cumul_error.responded = true; + switch (abort_code) { case VNOVOL: /* This fileserver doesn't know about the volume. * - May indicate that the VL is wrong - retry once and compare * the results. * - May indicate that the fileserver couldn't attach to the vol. + * - The volume might have been temporarily removed so that it can + * be replaced by a volume restore. "vos" might have ended one + * transaction and has yet to create the next. + * - The volume might not be blessed or might not be in-service + * (administrative action). */ if (op->flags & AFS_OPERATION_VNOVOL) { - op->error = -EREMOTEIO; + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); goto next_server; } write_lock(&op->volume->servers_lock); - op->server_list->vnovol_mask |= 1 << op->index; + op->server_list->vnovol_mask |= 1 << op->server_index; write_unlock(&op->volume->servers_lock); set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags); error = afs_check_volume_status(op->volume, op); - if (error < 0) - goto failed_set_error; + if (error < 0) { + afs_op_set_error(op, error); + goto failed; + } if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) { - op->error = -ENOMEDIUM; + afs_op_set_error(op, -ENOMEDIUM); goto failed; } @@ -174,7 +260,7 @@ bool afs_select_fileserver(struct afs_operation *op) * it's the fileserver having trouble. */ if (rcu_access_pointer(op->volume->servers) == op->server_list) { - op->error = -EREMOTEIO; + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); goto next_server; } @@ -183,50 +269,99 @@ bool afs_select_fileserver(struct afs_operation *op) _leave(" = t [vnovol]"); return true; - case VSALVAGE: /* TODO: Should this return an error or iterate? */ case VVOLEXISTS: - case VNOSERVICE: case VONLINE: - case VDISKFULL: - case VOVERQUOTA: - op->error = afs_abort_to_error(op->ac.abort_code); + /* These should not be returned from the fileserver. */ + pr_warn("Fileserver returned unexpected abort %d\n", + abort_code); + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); goto next_server; + case VNOSERVICE: + /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver + * if the volume was neither in-service nor administratively + * blessed. All usage was replaced by VNOVOL because AFS 3.1 and + * earlier cache managers did not handle VNOSERVICE and assumed + * it was the client OSes errno 105. + * + * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the + * fileserver idle dead time error which was sent in place of + * RX_CALL_TIMEOUT (-3). The error was intended to be sent if the + * fileserver took too long to send a reply to the client. 
+ * RX_CALL_TIMEOUT would have caused the cache manager to mark the + * server down whereas VNOSERVICE since AFS 3.2 would cause cache + * manager to temporarily (up to 15 minutes) mark the volume + * instance as unusable. + * + * The idle dead logic resulted in cache inconsistency since a + * state changing call that the cache manager assumed was dead + * could still be processed to completion by the fileserver. This + * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer + * returned. However, many 1.4.8 through 1.6.24 fileservers are + * still in existence. + * + * AuriStorFS fileservers have never returned VNOSERVICE. + * + * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT. + */ + case RX_CALL_TIMEOUT: + afs_op_accumulate_error(op, -ETIMEDOUT, abort_code); + goto next_server; + + case VSALVAGING: /* This error should not be leaked to cache managers + * but is from OpenAFS demand attach fileservers. + * It should be treated as an alias for VOFFLINE. + */ + case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */ case VOFFLINE: - if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) { - afs_busy(op->volume, op->ac.abort_code); - clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); + /* The volume is in use by the volserver or another volume utility + * for an operation that might alter the contents. The volume is + * expected to come back but it might take a long time (could be + * days). + */ + if (!test_and_set_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags)) { + afs_busy(op, abort_code); + clear_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags); } if (op->flags & AFS_OPERATION_NO_VSLEEP) { - op->error = -EADV; - goto failed; - } - if (op->flags & AFS_OPERATION_CUR_ONLY) { - op->error = -ESTALE; + afs_op_set_error(op, -EADV); goto failed; } goto busy; - case VSALVAGING: - case VRESTARTING: + case VRESTARTING: /* The fileserver is either shutting down or starting up. */ case VBUSY: - /* Retry after going round all the servers unless we - * have a file lock we need to maintain. + /* The volume is in use by the volserver or another volume + * utility for an operation that is not expected to alter the + * contents of the volume. VBUSY does not need to be returned + * for a ROVOL or BACKVOL bound to an ITBusy volserver + * transaction. The fileserver is permitted to continue serving + * content from ROVOLs and BACKVOLs during an ITBusy transaction + * because the content will not change. However, many fileserver + * releases do return VBUSY for ROVOL and BACKVOL instances under + * many circumstances. + * + * Retry after going round all the servers unless we have a file + * lock we need to maintain. 
*/ if (op->flags & AFS_OPERATION_NO_VSLEEP) { - op->error = -EBUSY; + afs_op_set_error(op, -EBUSY); goto failed; } - if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) { - afs_busy(op->volume, op->ac.abort_code); - clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags); + if (!test_and_set_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags)) { + afs_busy(op, abort_code); + clear_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags); } busy: if (op->flags & AFS_OPERATION_CUR_ONLY) { if (!afs_sleep_and_retry(op)) goto failed; - /* Retry with same server & address */ + /* Retry with same server & address */ _leave(" = t [vbusy]"); return true; } @@ -243,7 +378,7 @@ bool afs_select_fileserver(struct afs_operation *op) * honour, just in case someone sets up a loop. */ if (op->flags & AFS_OPERATION_VMOVED) { - op->error = -EREMOTEIO; + afs_op_set_error(op, -EREMOTEIO); goto failed; } op->flags |= AFS_OPERATION_VMOVED; @@ -251,8 +386,10 @@ bool afs_select_fileserver(struct afs_operation *op) set_bit(AFS_VOLUME_WAIT, &op->volume->flags); set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags); error = afs_check_volume_status(op->volume, op); - if (error < 0) - goto failed_set_error; + if (error < 0) { + afs_op_set_error(op, error); + goto failed; + } /* If the server list didn't change, then the VLDB is * out of sync with the fileservers. This is hopefully @@ -264,22 +401,50 @@ bool afs_select_fileserver(struct afs_operation *op) * TODO: Retry a few times with sleeps. */ if (rcu_access_pointer(op->volume->servers) == op->server_list) { - op->error = -ENOMEDIUM; + afs_op_accumulate_error(op, -ENOMEDIUM, abort_code); goto failed; } goto restart_from_beginning; + case UAEIO: + case VIO: + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); + if (op->volume->type != AFSVL_RWVOL) + goto next_server; + goto failed; + + case VDISKFULL: + case UAENOSPC: + /* The partition is full. Only applies to RWVOLs. + * Translate locally and return ENOSPC. + * No replicas to failover to. + */ + afs_op_set_error(op, -ENOSPC); + goto failed_but_online; + + case VOVERQUOTA: + case UAEDQUOT: + /* Volume is full. Only applies to RWVOLs. + * Translate locally and return EDQUOT. + * No replicas to failover to. 
+ */ + afs_op_set_error(op, -EDQUOT); + goto failed_but_online; + default: - clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags); - clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); - op->error = afs_abort_to_error(op->ac.abort_code); + afs_op_accumulate_error(op, error, abort_code); + failed_but_online: + clear_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags); + clear_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags); goto failed; } case -ETIMEDOUT: case -ETIME: - if (op->error != -EDESTADDRREQ) + if (afs_op_error(op) != -EDESTADDRREQ) goto iterate_address; fallthrough; case -ERFKILL: @@ -289,7 +454,7 @@ bool afs_select_fileserver(struct afs_operation *op) case -EHOSTDOWN: case -ECONNREFUSED: _debug("no conn"); - op->error = error; + afs_op_accumulate_error(op, error, 0); goto iterate_address; case -ENETRESET: @@ -298,24 +463,31 @@ bool afs_select_fileserver(struct afs_operation *op) fallthrough; case -ECONNRESET: _debug("call reset"); - op->error = error; + afs_op_set_error(op, error); goto failed; } restart_from_beginning: + trace_afs_rotate(op, afs_rotate_trace_restart, 0); _debug("restart"); - afs_end_cursor(&op->ac); + op->estate = NULL; op->server = NULL; + afs_clear_server_states(op); + op->server_states = NULL; afs_put_serverlist(op->net, op->server_list); op->server_list = NULL; start: _debug("start"); + ASSERTCMP(op->estate, ==, NULL); /* See if we need to do an update of the volume record. Note that the * volume may have moved or even have been deleted. */ error = afs_check_volume_status(op->volume, op); - if (error < 0) - goto failed_set_error; + trace_afs_rotate(op, afs_rotate_trace_check_vol_status, error); + if (error < 0) { + afs_op_set_error(op, error); + goto failed; + } if (!afs_start_fs_iteration(op, vnode)) goto failed; @@ -323,52 +495,83 @@ start: _debug("__ VOL %llx __", op->volume->vid); pick_server: - _debug("pick [%lx]", op->untried); + _debug("pick [%lx]", op->untried_servers); + ASSERTCMP(op->estate, ==, NULL); - error = afs_wait_for_fs_probes(op->server_list, op->untried); - if (error < 0) - goto failed_set_error; + error = afs_wait_for_fs_probes(op, op->server_states, + !(op->flags & AFS_OPERATION_UNINTR)); + switch (error) { + case 0: /* No untried responsive servers and no outstanding probes */ + trace_afs_rotate(op, afs_rotate_trace_probe_none, 0); + goto no_more_servers; + case 1: /* Got a response */ + trace_afs_rotate(op, afs_rotate_trace_probe_response, 0); + break; + case 2: /* Probe data superseded */ + trace_afs_rotate(op, afs_rotate_trace_probe_superseded, 0); + goto restart_from_beginning; + default: + trace_afs_rotate(op, afs_rotate_trace_probe_error, error); + afs_op_set_error(op, error); + goto failed; + } - /* Pick the untried server with the lowest RTT. If we have outstanding - * callbacks, we stick with the server we're already using if we can. + /* Pick the untried server with the highest priority untried endpoint. + * If we have outstanding callbacks, we stick with the server we're + * already using if we can. 
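
A minimal sketch of the selection step that follows, in plain C with toy types standing in for the server and address records; only the untried-bitmask walk and the priority comparison are modelled (the exclusion and responsiveness checks are omitted):

#include <limits.h>

struct toy_addr   { int prio; };
struct toy_server { int nr_addrs; struct toy_addr addrs[8]; };

/* Pick the still-untried server whose best address has the highest
 * priority; untried is a bitmask of candidate indices.  Returns -1 when
 * no candidates remain.
 */
static int pick_server(const struct toy_server *servers, int nr_servers,
                       unsigned long untried)
{
        int best = -1, best_prio = INT_MIN;

        for (int i = 0; i < nr_servers; i++) {
                if (!(untried & (1UL << i)))
                        continue;
                for (int j = 0; j < servers[i].nr_addrs; j++) {
                        if (servers[i].addrs[j].prio > best_prio) {
                                best = i;
                                best_prio = servers[i].addrs[j].prio;
                        }
                }
        }
        return best;
}
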
*/ if (op->server) { - _debug("server %u", op->index); - if (test_bit(op->index, &op->untried)) + _debug("server %u", op->server_index); + if (test_bit(op->server_index, &op->untried_servers)) goto selected_server; op->server = NULL; _debug("no server"); } - op->index = -1; - rtt = U32_MAX; + rcu_read_lock(); + op->server_index = -1; + best_prio = -1; for (i = 0; i < op->server_list->nr_servers; i++) { - struct afs_server *s = op->server_list->servers[i].server; + struct afs_endpoint_state *es; + struct afs_server_entry *se = &op->server_list->servers[i]; + struct afs_addr_list *sal; + struct afs_server *s = se->server; - if (!test_bit(i, &op->untried) || + if (!test_bit(i, &op->untried_servers) || + test_bit(AFS_SE_EXCLUDED, &se->flags) || !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags)) continue; - if (s->probe.rtt < rtt) { - op->index = i; - rtt = s->probe.rtt; + es = op->server_states->endpoint_state; + sal = es->addresses; + + afs_get_address_preferences_rcu(op->net, sal); + for (j = 0; j < sal->nr_addrs; j++) { + if (!sal->addrs[j].peer) + continue; + if (sal->addrs[j].prio > best_prio) { + op->server_index = i; + best_prio = sal->addrs[j].prio; + } } } + rcu_read_unlock(); - if (op->index == -1) + if (op->server_index == -1) goto no_more_servers; selected_server: - _debug("use %d", op->index); - __clear_bit(op->index, &op->untried); + trace_afs_rotate(op, afs_rotate_trace_selected_server, best_prio); + _debug("use %d prio %u", op->server_index, best_prio); + __clear_bit(op->server_index, &op->untried_servers); /* We're starting on a different fileserver from the list. We need to * check it, create a callback intercept, find its address list and * probe its capabilities before we use it. */ - ASSERTCMP(op->ac.alist, ==, NULL); - server = op->server_list->servers[op->index].server; + ASSERTCMP(op->estate, ==, NULL); + server = op->server_list->servers[op->server_index].server; - if (!afs_check_server_record(op, server)) + if (!afs_check_server_record(op, server, op->key)) goto failed; _debug("USING SERVER: %pU", &server->uuid); @@ -377,58 +580,73 @@ selected_server: op->server = server; if (vnode->cb_server != server) { vnode->cb_server = server; - vnode->cb_s_break = server->cb_s_break; - vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break); - vnode->cb_v_break = vnode->volume->cb_v_break; - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); + atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); } - read_lock(&server->fs_lock); - alist = rcu_dereference_protected(server->addresses, - lockdep_is_held(&server->fs_lock)); - afs_get_addrlist(alist); - read_unlock(&server->fs_lock); - retry_server: - memset(&op->ac, 0, sizeof(op->ac)); - - if (!op->ac.alist) - op->ac.alist = alist; - else - afs_put_addrlist(alist); - - op->ac.index = -1; + op->addr_tried = 0; + op->addr_index = -1; iterate_address: - ASSERT(op->ac.alist); /* Iterate over the current server's address list to try and find an * address on which it will respond to us. 
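
A minimal sketch of the bitmask arithmetic used by the iterate_address step, with invented parameter names: keep the preferred index while it is still a live candidate (responsive, not failed, not already tried), otherwise take the lowest set bit. It returns -1 when nothing is left to try, whereas the kernel code waits for more probe results instead:

/* Candidate set = responsive addresses minus failed and already-tried ones. */
static int pick_address(unsigned long responsive, unsigned long failed,
                        unsigned long tried, unsigned int preferred)
{
        unsigned long set = responsive & ~(failed | tried);

        if (!set)
                return -1;
        if (set & (1UL << preferred))
                return (int)preferred;
        return __builtin_ctzl(set);     /* lowest set bit, like __ffs() */
}
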
*/ - if (!afs_iterate_addresses(&op->ac)) - goto out_of_addresses; + op->estate = op->server_states[op->server_index].endpoint_state; + set = READ_ONCE(op->estate->responsive_set); + failed = READ_ONCE(op->estate->failed_set); + _debug("iterate ES=%x rs=%lx fs=%lx", op->estate->probe_seq, set, failed); + set &= ~(failed | op->addr_tried); + trace_afs_rotate(op, afs_rotate_trace_iterate_addr, set); + if (!set) + goto wait_for_more_probe_results; + + alist = op->estate->addresses; + for (i = 0; i < alist->nr_addrs; i++) { + if (alist->addrs[i].prio > best_prio) { + addr_index = i; + best_prio = alist->addrs[i].prio; + } + } - _debug("address [%u] %u/%u %pISp", - op->index, op->ac.index, op->ac.alist->nr_addrs, - &op->ac.alist->addrs[op->ac.index].transport); + addr_index = READ_ONCE(alist->preferred); + if (!test_bit(addr_index, &set)) + addr_index = __ffs(set); + op->addr_index = addr_index; + set_bit(addr_index, &op->addr_tried); + + op->volsync.creation = TIME64_MIN; + op->volsync.update = TIME64_MIN; + op->call_responded = false; + _debug("address [%u] %u/%u %pISp", + op->server_index, addr_index, alist->nr_addrs, + rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer)); _leave(" = t"); return true; -out_of_addresses: +wait_for_more_probe_results: + error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried, + !(op->flags & AFS_OPERATION_UNINTR)); + if (!error) + goto iterate_address; + /* We've now had a failure to respond on all of a server's addresses - * immediately probe them again and consider retrying the server. */ + trace_afs_rotate(op, afs_rotate_trace_probe_fileserver, 0); afs_probe_fileserver(op->net, op->server); if (op->flags & AFS_OPERATION_RETRY_SERVER) { - alist = op->ac.alist; - error = afs_wait_for_one_fs_probe( - op->server, !(op->flags & AFS_OPERATION_UNINTR)); + error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried, + !(op->flags & AFS_OPERATION_UNINTR)); switch (error) { case 0: op->flags &= ~AFS_OPERATION_RETRY_SERVER; + trace_afs_rotate(op, afs_rotate_trace_retry_server, 0); goto retry_server; case -ERESTARTSYS: - goto failed_set_error; + afs_op_set_error(op, error); + goto failed; case -ETIME: case -EDESTADDRREQ: goto next_server; @@ -436,34 +654,51 @@ out_of_addresses: } next_server: + trace_afs_rotate(op, afs_rotate_trace_next_server, 0); _debug("next"); - afs_end_cursor(&op->ac); + ASSERT(op->estate); + alist = op->estate->addresses; + if (op->call_responded && + op->addr_index != READ_ONCE(alist->preferred) && + test_bit(alist->preferred, &op->addr_tried)) + WRITE_ONCE(alist->preferred, op->addr_index); + op->estate = NULL; goto pick_server; no_more_servers: /* That's all the servers poked to no good effect. Try again if some * of them were busy. 
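
A simplified model of how per-server probe failures are folded into one cumulative error, as done by the loop that follows; the real prioritisation in afs_prioritise_error() has more cases. The struct and function names are invented, and the caller is assumed to start from -EDESTADDRREQ with responded = false:

#include <errno.h>
#include <stdbool.h>

struct cumulative_error { int error; bool responded; };

/* Keep the most informative error: anything from a server that actually
 * responded (e.g. a protocol abort) wins over plain connection or timeout
 * failures.
 */
static void accumulate_error(struct cumulative_error *e, int error, bool responded)
{
        if (error == 0)
                return;
        if (responded) {
                if (!e->responded)
                        e->error = error;       /* first real answer wins */
                e->responded = true;
        } else if (!e->responded && e->error == -EDESTADDRREQ) {
                e->error = error;               /* better than "no usable address" */
        }
}
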
*/ - if (op->flags & AFS_OPERATION_VBUSY) + trace_afs_rotate(op, afs_rotate_trace_no_more_servers, 0); + if (op->flags & AFS_OPERATION_VBUSY) { + afs_sleep_and_retry(op); + op->flags &= ~AFS_OPERATION_VBUSY; goto restart_from_beginning; + } - e.error = -EDESTADDRREQ; - e.responded = false; + rcu_read_lock(); for (i = 0; i < op->server_list->nr_servers; i++) { - struct afs_server *s = op->server_list->servers[i].server; + struct afs_endpoint_state *estate; - afs_prioritise_error(&e, READ_ONCE(s->probe.error), - s->probe.abort_code); + estate = op->server_states->endpoint_state; + error = READ_ONCE(estate->error); + if (error < 0) + afs_op_accumulate_error(op, error, estate->abort_code); } + rcu_read_unlock(); - error = e.error; - -failed_set_error: - op->error = error; failed: + trace_afs_rotate(op, afs_rotate_trace_failed, 0); op->flags |= AFS_OPERATION_STOP; - afs_end_cursor(&op->ac); - _leave(" = f [failed %d]", op->error); + if (op->estate) { + alist = op->estate->addresses; + if (op->call_responded && + op->addr_index != READ_ONCE(alist->preferred) && + test_bit(alist->preferred, &op->addr_tried)) + WRITE_ONCE(alist->preferred, op->addr_index); + op->estate = NULL; + } + _leave(" = f [failed %d]", afs_op_error(op)); return false; } @@ -482,37 +717,40 @@ void afs_dump_edestaddrreq(const struct afs_operation *op) rcu_read_lock(); pr_notice("EDESTADDR occurred\n"); - pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n", + pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n", op->file[0].cb_break_before, - op->file[1].cb_break_before, op->flags, op->error); - pr_notice("FC: ut=%lx ix=%d ni=%u\n", - op->untried, op->index, op->nr_iterations); + op->file[1].cb_break_before, op->flags, op->cumul_error.error); + pr_notice("OP: ut=%lx ix=%d ni=%u\n", + op->untried_servers, op->server_index, op->nr_iterations); + pr_notice("OP: call er=%d ac=%d r=%u\n", + op->call_error, op->call_abort_code, op->call_responded); if (op->server_list) { const struct afs_server_list *sl = op->server_list; - pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n", - sl->nr_servers, sl->preferred, sl->vnovol_mask); + + pr_notice("FC: SL nr=%u vnov=%hx\n", + sl->nr_servers, sl->vnovol_mask); for (i = 0; i < sl->nr_servers; i++) { const struct afs_server *s = sl->servers[i].server; + const struct afs_endpoint_state *e = + rcu_dereference(s->endpoint_state); + const struct afs_addr_list *a = e->addresses; + pr_notice("FC: server fl=%lx av=%u %pU\n", s->flags, s->addr_version, &s->uuid); - if (s->addresses) { - const struct afs_addr_list *a = - rcu_dereference(s->addresses); + pr_notice("FC: - pq=%x R=%lx F=%lx\n", + e->probe_seq, e->responsive_set, e->failed_set); + if (a) { pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n", a->version, a->nr_ipv4, a->nr_addrs, a->max_addrs, a->preferred); - pr_notice("FC: - R=%lx F=%lx\n", - a->responded, a->failed); - if (a == op->ac.alist) + if (a == e->addresses) pr_notice("FC: - current\n"); } } } - pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", - op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error, - op->ac.responded, op->ac.nr_iterations); + pr_notice("AC: t=%lx ax=%d\n", op->addr_tried, op->addr_index); rcu_read_unlock(); } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index d642d06a453b..c453428f3c8b 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -178,6 +178,8 @@ void afs_put_call(struct afs_call *call) ASSERT(!work_pending(&call->async_work)); ASSERT(call->type->name != NULL); + rxrpc_kernel_put_peer(call->peer); + if (call->rxcall) { rxrpc_kernel_shutdown_call(net->socket, call->rxcall); 
rxrpc_kernel_put_call(net->socket, call->rxcall); @@ -187,7 +189,6 @@ void afs_put_call(struct afs_call *call) call->type->destructor(call); afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); - afs_put_addrlist(call->alist); kfree(call->request); trace_afs_call(call->debug_id, afs_call_trace_free, 0, o, @@ -294,9 +295,8 @@ static void afs_notify_end_request_tx(struct sock *sock, * Initiate a call and synchronously queue up the parameters for dispatch. Any * error is stored into the call struct, which the caller must check for. */ -void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) +void afs_make_call(struct afs_call *call, gfp_t gfp) { - struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index]; struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; @@ -304,7 +304,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) s64 tx_total_len; int ret; - _enter(",{%pISp},", &srx->transport); + _enter(",{%pISp+%u},", rxrpc_kernel_remote_addr(call->peer), call->service_id); ASSERT(call->type != NULL); ASSERT(call->type->name != NULL); @@ -313,8 +313,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) call, call->type->name, key_serial(call->key), atomic_read(&call->net->nr_outstanding_calls)); - call->addr_ix = ac->index; - call->alist = afs_get_addrlist(ac->alist); + trace_afs_make_call(call); /* Work out the length we're going to transmit. This is awkward for * calls such as FS.StoreData where there's an extra injection of data @@ -333,7 +332,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) } /* create a call */ - rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key, + rxcall = rxrpc_kernel_begin_call(call->net->socket, call->peer, call->key, (unsigned long)call, tx_total_len, call->max_lifespan, @@ -341,6 +340,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) (call->async ? afs_wake_up_async_call : afs_wake_up_call_waiter), + call->service_id, call->upgrade, (call->intr ? RXRPC_PREINTERRUPTIBLE : RXRPC_UNINTERRUPTIBLE), @@ -390,7 +390,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) /* Note that at this point, we may have received the reply or an abort * - and an asynchronous call may already have completed. * - * afs_wait_for_call_to_complete(call, ac) + * afs_wait_for_call_to_complete(call) * must be called to synchronously clean up. 
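
A userspace sketch of the synchronous completion wait described above, using a pthread condition variable in place of the kernel waitqueue; the toy_call type, its fields and deliver() are invented, and deliver() is assumed to take the lock itself when it marks the call complete:

#include <pthread.h>
#include <stdbool.h>

struct toy_call {
        pthread_mutex_t lock;
        pthread_cond_t  poke;
        bool            need_attention;
        bool            complete;
};

/* Process queued work each time we are poked; stop once the call is
 * marked complete.  The kernel version also notices the transport
 * tearing the call down underneath it.
 */
static void wait_for_call(struct toy_call *call, void (*deliver)(struct toy_call *))
{
        pthread_mutex_lock(&call->lock);
        while (!call->complete) {
                if (call->need_attention) {
                        call->need_attention = false;
                        pthread_mutex_unlock(&call->lock);
                        deliver(call);          /* may mark the call complete */
                        pthread_mutex_lock(&call->lock);
                        continue;
                }
                pthread_cond_wait(&call->poke, &call->lock);
        }
        pthread_mutex_unlock(&call->lock);
}
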
*/ return; @@ -406,8 +406,7 @@ error_do_abort: rxrpc_kernel_recv_data(call->net->socket, rxcall, &msg.msg_iter, &len, false, &call->abort_code, &call->service_id); - ac->abort_code = call->abort_code; - ac->responded = true; + call->responded = true; } call->error = ret; trace_afs_call_done(call); @@ -427,7 +426,7 @@ error_kill_call: afs_set_call_complete(call, ret, 0); } - ac->error = ret; + call->error = ret; call->state = AFS_CALL_COMPLETE; _leave(" = %d", ret); } @@ -461,7 +460,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort) max = m + 1; pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n", msg, call->type->name, - &call->alist->addrs[call->addr_ix].transport); + rxrpc_kernel_remote_addr(call->peer)); } } @@ -508,6 +507,7 @@ static void afs_deliver_to_call(struct afs_call *call) ret = -EBADMSG; switch (ret) { case 0: + call->responded = true; afs_queue_call_work(call); if (state == AFS_CALL_CL_PROC_REPLY) { if (call->op) @@ -522,9 +522,11 @@ static void afs_deliver_to_call(struct afs_call *call) goto out; case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); + call->responded = true; afs_log_error(call, call->abort_code); goto done; case -ENOTSUPP: + call->responded = true; abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, @@ -571,50 +573,46 @@ call_complete: } /* - * Wait synchronously for a call to complete and clean up the call struct. + * Wait synchronously for a call to complete. */ -long afs_wait_for_call_to_complete(struct afs_call *call, - struct afs_addr_cursor *ac) +void afs_wait_for_call_to_complete(struct afs_call *call) { - long ret; bool rxrpc_complete = false; - DECLARE_WAITQUEUE(myself, current); - _enter(""); - ret = call->error; - if (ret < 0) - goto out; + if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { + DECLARE_WAITQUEUE(myself, current); + + add_wait_queue(&call->waitq, &myself); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + + /* deliver any messages that are in the queue */ + if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && + call->need_attention) { + call->need_attention = false; + __set_current_state(TASK_RUNNING); + afs_deliver_to_call(call); + continue; + } - add_wait_queue(&call->waitq, &myself); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - - /* deliver any messages that are in the queue */ - if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && - call->need_attention) { - call->need_attention = false; - __set_current_state(TASK_RUNNING); - afs_deliver_to_call(call); - continue; - } + if (afs_check_call_state(call, AFS_CALL_COMPLETE)) + break; - if (afs_check_call_state(call, AFS_CALL_COMPLETE)) - break; + if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { + /* rxrpc terminated the call. */ + rxrpc_complete = true; + break; + } - if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { - /* rxrpc terminated the call. 
*/ - rxrpc_complete = true; - break; + schedule(); } - schedule(); + remove_wait_queue(&call->waitq, &myself); + __set_current_state(TASK_RUNNING); } - remove_wait_queue(&call->waitq, &myself); - __set_current_state(TASK_RUNNING); - if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { if (rxrpc_complete) { afs_set_call_complete(call, call->error, call->abort_code); @@ -627,29 +625,6 @@ long afs_wait_for_call_to_complete(struct afs_call *call, afs_set_call_complete(call, -EINTR, 0); } } - - spin_lock_bh(&call->state_lock); - ac->abort_code = call->abort_code; - ac->error = call->error; - spin_unlock_bh(&call->state_lock); - - ret = ac->error; - switch (ret) { - case 0: - ret = call->ret0; - call->ret0 = 0; - - fallthrough; - case -ECONNABORTED: - ac->responded = true; - break; - } - -out: - _debug("call complete"); - afs_put_call(call); - _leave(" = %p", (void *)ret); - return ret; } /* diff --git a/fs/afs/server.c b/fs/afs/server.c index b5237206eac3..e169121f603e 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -21,13 +21,13 @@ static void __afs_put_server(struct afs_net *, struct afs_server *); /* * Find a server by one of its addresses. */ -struct afs_server *afs_find_server(struct afs_net *net, - const struct sockaddr_rxrpc *srx) +struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer) { + const struct afs_endpoint_state *estate; const struct afs_addr_list *alist; struct afs_server *server = NULL; unsigned int i; - int seq = 0, diff; + int seq = 1; rcu_read_lock(); @@ -35,39 +35,15 @@ struct afs_server *afs_find_server(struct afs_net *net, if (server) afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq); server = NULL; + seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&net->fs_addr_lock, &seq); - if (srx->transport.family == AF_INET6) { - const struct sockaddr_in6 *a = &srx->transport.sin6, *b; - hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { - alist = rcu_dereference(server->addresses); - for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { - b = &alist->addrs[i].transport.sin6; - diff = ((u16 __force)a->sin6_port - - (u16 __force)b->sin6_port); - if (diff == 0) - diff = memcmp(&a->sin6_addr, - &b->sin6_addr, - sizeof(struct in6_addr)); - if (diff == 0) - goto found; - } - } - } else { - const struct sockaddr_in *a = &srx->transport.sin, *b; - hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) { - alist = rcu_dereference(server->addresses); - for (i = 0; i < alist->nr_ipv4; i++) { - b = &alist->addrs[i].transport.sin; - diff = ((u16 __force)a->sin_port - - (u16 __force)b->sin_port); - if (diff == 0) - diff = ((u32 __force)a->sin_addr.s_addr - - (u32 __force)b->sin_addr.s_addr); - if (diff == 0) - goto found; - } - } + hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { + estate = rcu_dereference(server->endpoint_state); + alist = estate->addresses; + for (i = 0; i < alist->nr_addrs; i++) + if (alist->addrs[i].peer == peer) + goto found; } server = NULL; @@ -90,7 +66,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu { struct afs_server *server = NULL; struct rb_node *p; - int diff, seq = 0; + int diff, seq = 1; _enter("%pU", uuid); @@ -102,7 +78,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu if (server) afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq); server = NULL; - + seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&net->fs_lock, &seq); p 
= net->fs_servers.rb_node; @@ -137,6 +113,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu static struct afs_server *afs_install_server(struct afs_cell *cell, struct afs_server *candidate) { + const struct afs_endpoint_state *estate; const struct afs_addr_list *alist; struct afs_server *server, *next; struct afs_net *net = cell->net; @@ -188,8 +165,9 @@ static struct afs_server *afs_install_server(struct afs_cell *cell, added_dup: write_seqlock(&net->fs_addr_lock); - alist = rcu_dereference_protected(server->addresses, - lockdep_is_held(&net->fs_addr_lock.lock)); + estate = rcu_dereference_protected(server->endpoint_state, + lockdep_is_held(&net->fs_addr_lock.lock)); + alist = estate->addresses; /* Secondly, if the server has any IPv4 and/or IPv6 addresses, install * it in the IPv4 and/or IPv6 reverse-map lists. @@ -219,6 +197,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *uuid, struct afs_addr_list *alist) { + struct afs_endpoint_state *estate; struct afs_server *server; struct afs_net *net = cell->net; @@ -228,25 +207,41 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, if (!server) goto enomem; + estate = kzalloc(sizeof(struct afs_endpoint_state), GFP_KERNEL); + if (!estate) + goto enomem_server; + refcount_set(&server->ref, 1); atomic_set(&server->active, 1); server->debug_id = atomic_inc_return(&afs_server_debug_id); - RCU_INIT_POINTER(server->addresses, alist); server->addr_version = alist->version; server->uuid = *uuid; rwlock_init(&server->fs_lock); - INIT_WORK(&server->initcb_work, afs_server_init_callback_work); + INIT_LIST_HEAD(&server->volumes); init_waitqueue_head(&server->probe_wq); INIT_LIST_HEAD(&server->probe_link); spin_lock_init(&server->probe_lock); server->cell = cell; server->rtt = UINT_MAX; + server->service_id = FS_SERVICE; + + server->probe_counter = 1; + server->probed_at = jiffies - LONG_MAX / 2; + refcount_set(&estate->ref, 1); + estate->addresses = alist; + estate->server_id = server->debug_id; + estate->probe_seq = 1; + rcu_assign_pointer(server->endpoint_state, estate); afs_inc_servers_outstanding(net); trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc); + trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), + afs_estate_trace_alloc_server); _leave(" = %p", server); return server; +enomem_server: + kfree(server); enomem: _leave(" = NULL [nomem]"); return NULL; @@ -301,20 +296,20 @@ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, candidate = afs_alloc_server(cell, uuid, alist); if (!candidate) { - afs_put_addrlist(alist); + afs_put_addrlist(alist, afs_alist_trace_put_server_oom); return ERR_PTR(-ENOMEM); } server = afs_install_server(cell, candidate); if (server != candidate) { - afs_put_addrlist(alist); + afs_put_addrlist(alist, afs_alist_trace_put_server_dup); kfree(candidate); } else { /* Immediately dispatch an asynchronous probe to each interface * on the fileserver. This will make sure the repeat-probing * service is started. 
*/ - afs_fs_probe_fileserver(cell->net, server, key, true); + afs_fs_probe_fileserver(cell->net, server, alist, key); } return server; @@ -447,7 +442,8 @@ static void afs_server_rcu(struct rcu_head *rcu) trace_afs_server(server->debug_id, refcount_read(&server->ref), atomic_read(&server->active), afs_server_trace_free); - afs_put_addrlist(rcu_access_pointer(server->addresses)); + afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state), + afs_estate_trace_put_server); kfree(server); } @@ -459,14 +455,10 @@ static void __afs_put_server(struct afs_net *net, struct afs_server *server) static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server) { - struct afs_addr_list *alist = rcu_access_pointer(server->addresses); - struct afs_addr_cursor ac = { - .alist = alist, - .index = alist->preferred, - .error = 0, - }; - - afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + struct afs_endpoint_state *estate = rcu_access_pointer(server->endpoint_state); + struct afs_addr_list *alist = estate->addresses; + + afs_fs_give_up_all_callbacks(net, server, &alist->addrs[alist->preferred], NULL); } /* @@ -477,7 +469,6 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) afs_give_up_callbacks(net, server); - flush_work(&server->initcb_work); afs_put_server(net, server, afs_server_trace_destroy); } @@ -636,9 +627,12 @@ void afs_purge_servers(struct afs_net *net) * Get an update for a server's address list. */ static noinline bool afs_update_server_record(struct afs_operation *op, - struct afs_server *server) + struct afs_server *server, + struct key *key) { - struct afs_addr_list *alist, *discard; + struct afs_endpoint_state *estate; + struct afs_addr_list *alist; + bool has_addrs; _enter(""); @@ -648,29 +642,27 @@ static noinline bool afs_update_server_record(struct afs_operation *op, alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid); if (IS_ERR(alist)) { + rcu_read_lock(); + estate = rcu_dereference(server->endpoint_state); + has_addrs = estate->addresses; + rcu_read_unlock(); + if ((PTR_ERR(alist) == -ERESTARTSYS || PTR_ERR(alist) == -EINTR) && (op->flags & AFS_OPERATION_UNINTR) && - server->addresses) { + has_addrs) { _leave(" = t [intr]"); return true; } - op->error = PTR_ERR(alist); - _leave(" = f [%d]", op->error); + afs_op_set_error(op, PTR_ERR(alist)); + _leave(" = f [%d]", afs_op_error(op)); return false; } - discard = alist; - if (server->addr_version != alist->version) { - write_lock(&server->fs_lock); - discard = rcu_dereference_protected(server->addresses, - lockdep_is_held(&server->fs_lock)); - rcu_assign_pointer(server->addresses, alist); - server->addr_version = alist->version; - write_unlock(&server->fs_lock); - } + if (server->addr_version != alist->version) + afs_fs_probe_fileserver(op->net, server, alist, key); - afs_put_addrlist(discard); + afs_put_addrlist(alist, afs_alist_trace_put_server_update); _leave(" = t"); return true; } @@ -678,7 +670,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op, /* * See if a server's address list needs updating. 
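
A minimal C11 sketch of the single-updater pattern used when refreshing a server record: one task wins a flag and performs the update while the others wait for it to finish. The names are invented and the waiters merely yield, whereas the kernel sleeps on the flag bit and is woken when it clears:

#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

static atomic_bool record_updating;

static void check_record(void (*do_update)(void))
{
        bool expected = false;

        if (atomic_compare_exchange_strong(&record_updating, &expected, true)) {
                do_update();                    /* we won the flag */
                atomic_store(&record_updating, false);
        } else {
                while (atomic_load(&record_updating))
                        sched_yield();          /* wait for the winner */
        }
}
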
*/ -bool afs_check_server_record(struct afs_operation *op, struct afs_server *server) +bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, + struct key *key) { bool success; int ret, retries = 0; @@ -698,7 +691,7 @@ retry: update: if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) { clear_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags); - success = afs_update_server_record(op, server); + success = afs_update_server_record(op, server, key); clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags); wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING); _leave(" = %d", success); @@ -710,7 +703,7 @@ wait: (op->flags & AFS_OPERATION_UNINTR) ? TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE); if (ret == -ERESTARTSYS) { - op->error = ret; + afs_op_set_error(op, ret); _leave(" = f [intr]"); return false; } diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index b59896b1de0a..7e7e567a7f8a 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -24,35 +24,62 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) /* * Build a server list from a VLDB record. */ -struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, +struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, struct key *key, - struct afs_vldb_entry *vldb, - u8 type_mask) + struct afs_vldb_entry *vldb) { struct afs_server_list *slist; struct afs_server *server; - int ret = -ENOMEM, nr_servers = 0, i, j; - - for (i = 0; i < vldb->nr_servers; i++) - if (vldb->fs_mask[i] & type_mask) - nr_servers++; + unsigned int type_mask = 1 << volume->type; + bool use_newrepsites = false; + int ret = -ENOMEM, nr_servers = 0, newrep = 0, i, j, usable = 0; + + /* Work out if we're going to restrict to NEWREPSITE-marked servers or + * not. If at least one site is marked as NEWREPSITE, then it's likely + * that "vos release" is busy updating RO sites. We cut over from one + * to the other when >=50% of the sites have been updated. Sites that + * are in the process of being updated are marked DONTUSE. + */ + for (i = 0; i < vldb->nr_servers; i++) { + if (!(vldb->fs_mask[i] & type_mask)) + continue; + nr_servers++; + if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE) + continue; + usable++; + if (vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE) + newrep++; + } slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL); if (!slist) goto error; + if (newrep) { + if (newrep < usable / 2) { + slist->ro_replicating = AFS_RO_REPLICATING_USE_OLD; + } else { + slist->ro_replicating = AFS_RO_REPLICATING_USE_NEW; + use_newrepsites = true; + } + } + refcount_set(&slist->usage, 1); rwlock_init(&slist->lock); - for (i = 0; i < AFS_MAXTYPES; i++) - slist->vids[i] = vldb->vid[i]; - /* Make sure a records exists for each server in the list. 
*/ for (i = 0; i < vldb->nr_servers; i++) { + unsigned long se_flags = 0; + bool newrepsite = vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE; + if (!(vldb->fs_mask[i] & type_mask)) continue; + if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE) + __set_bit(AFS_SE_EXCLUDED, &se_flags); + if (newrep && (newrepsite ^ use_newrepsites)) + __set_bit(AFS_SE_EXCLUDED, &se_flags); - server = afs_lookup_server(cell, key, &vldb->fs_server[i], + server = afs_lookup_server(volume->cell, key, &vldb->fs_server[i], vldb->addr_version[i]); if (IS_ERR(server)) { ret = PTR_ERR(server); @@ -70,7 +97,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, break; if (j < slist->nr_servers) { if (slist->servers[j].server == server) { - afs_put_server(cell->net, server, + afs_put_server(volume->cell->net, server, afs_server_trace_put_slist_isort); continue; } @@ -81,6 +108,9 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, } slist->servers[j].server = server; + slist->servers[j].volume = volume; + slist->servers[j].flags = se_flags; + slist->servers[j].cb_expires_at = AFS_NO_CB_PROMISE; slist->nr_servers++; } @@ -92,7 +122,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, return slist; error_2: - afs_put_serverlist(cell->net, slist); + afs_put_serverlist(volume->cell->net, slist); error: return ERR_PTR(ret); } @@ -103,27 +133,117 @@ error: bool afs_annotate_server_list(struct afs_server_list *new, struct afs_server_list *old) { - struct afs_server *cur; - int i, j; + unsigned long mask = 1UL << AFS_SE_EXCLUDED; + int i; - if (old->nr_servers != new->nr_servers) + if (old->nr_servers != new->nr_servers || + old->ro_replicating != new->ro_replicating) goto changed; - for (i = 0; i < old->nr_servers; i++) + for (i = 0; i < old->nr_servers; i++) { if (old->servers[i].server != new->servers[i].server) goto changed; - + if ((old->servers[i].flags & mask) != (new->servers[i].flags & mask)) + goto changed; + } return false; - changed: - /* Maintain the same preferred server as before if possible. */ - cur = old->servers[old->preferred].server; - for (j = 0; j < new->nr_servers; j++) { - if (new->servers[j].server == cur) { - new->preferred = j; - break; + return true; +} + +/* + * Attach a volume to the servers it is going to use. + */ +void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist) +{ + struct afs_server_entry *se, *pe; + struct afs_server *server; + struct list_head *p; + unsigned int i; + + down_write(&volume->cell->vs_lock); + + for (i = 0; i < slist->nr_servers; i++) { + se = &slist->servers[i]; + server = se->server; + + list_for_each(p, &server->volumes) { + pe = list_entry(p, struct afs_server_entry, slink); + if (volume->vid <= pe->volume->vid) + break; } + list_add_tail(&se->slink, p); } - return true; + slist->attached = true; + up_write(&volume->cell->vs_lock); +} + +/* + * Reattach a volume to the servers it is going to use when server list is + * replaced. We try to switch the attachment points to avoid rewalking the + * lists. + */ +void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *new, + struct afs_server_list *old) +{ + unsigned int n = 0, o = 0; + + down_write(&volume->cell->vs_lock); + + while (n < new->nr_servers || o < old->nr_servers) { + struct afs_server_entry *pn = n < new->nr_servers ? &new->servers[n] : NULL; + struct afs_server_entry *po = o < old->nr_servers ? 
&old->servers[o] : NULL; + struct afs_server_entry *s; + struct list_head *p; + int diff; + + if (pn && po && pn->server == po->server) { + pn->cb_expires_at = po->cb_expires_at; + list_replace(&po->slink, &pn->slink); + n++; + o++; + continue; + } + + if (pn && po) + diff = memcmp(&pn->server->uuid, &po->server->uuid, + sizeof(pn->server->uuid)); + else + diff = pn ? -1 : 1; + + if (diff < 0) { + list_for_each(p, &pn->server->volumes) { + s = list_entry(p, struct afs_server_entry, slink); + if (volume->vid <= s->volume->vid) + break; + } + list_add_tail(&pn->slink, p); + n++; + } else { + list_del(&po->slink); + o++; + } + } + + up_write(&volume->cell->vs_lock); +} + +/* + * Detach a volume from the servers it has been using. + */ +void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist) +{ + unsigned int i; + + if (!slist->attached) + return; + + down_write(&volume->cell->vs_lock); + + for (i = 0; i < slist->nr_servers; i++) + list_del(&slist->servers[i].slink); + + slist->attached = false; + up_write(&volume->cell->vs_lock); } diff --git a/fs/afs/super.c b/fs/afs/super.c index a01a0fb2cdbb..ae2d66a52add 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -381,8 +381,7 @@ static int afs_validate_fc(struct fs_context *fc) ctx->key = key; if (ctx->volume) { - afs_put_volume(ctx->net, ctx->volume, - afs_volume_trace_put_validate_fc); + afs_put_volume(ctx->volume, afs_volume_trace_put_validate_fc); ctx->volume = NULL; } @@ -529,7 +528,7 @@ static void afs_destroy_sbi(struct afs_super_info *as) { if (as) { struct afs_net *net = afs_net(as->net_ns); - afs_put_volume(net, as->volume, afs_volume_trace_put_destroy_sbi); + afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi); afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi); put_net(as->net_ns); kfree(as); @@ -615,7 +614,7 @@ static void afs_free_fc(struct fs_context *fc) struct afs_fs_context *ctx = fc->fs_private; afs_destroy_sbi(fc->s_fs_info); - afs_put_volume(ctx->net, ctx->volume, afs_volume_trace_put_free_fc); + afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc); afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); key_put(ctx->key); kfree(ctx); diff --git a/fs/afs/validation.c b/fs/afs/validation.c new file mode 100644 index 000000000000..46b37f2cce7d --- /dev/null +++ b/fs/afs/validation.c @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* vnode and volume validity verification. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include "internal.h" + +/* + * Data validation is managed through a number of mechanisms from the server: + * + * (1) On first contact with a server (such as if it has just been rebooted), + * the server sends us a CB.InitCallBackState* request. + * + * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC + * calls, the server maintains a time-limited per-vnode promise that it + * will send us a CB.CallBack request if a third party alters the vnodes + * accessed. + * + * Note that a vnode-level callbacks may also be sent for other reasons, + * such as filelock release. + * + * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC + * calls, each server maintains a time-limited per-volume promise that it + * will send us a CB.CallBack request if the RO volume is updated to a + * snapshot of the RW volume ("vos release"). 
This is an atomic event + * that cuts over all instances of the RO volume across multiple servers + * simultaneously. + * + * Note that a volume-level callbacks may also be sent for other reasons, + * such as the volumeserver taking over control of the volume from the + * fileserver. + * + * Note also that each server maintains an independent time limit on an + * independent callback. + * + * (4) Certain RPC calls include a volume information record "VolSync" in + * their reply. This contains a creation date for the volume that should + * remain unchanged for a RW volume (but will be changed if the volume is + * restored from backup) or will be bumped to the time of snapshotting + * when a RO volume is released. + * + * In order to track this events, the following are provided: + * + * ->cb_v_break. A counter of events that might mean that the contents of + * a volume have been altered since we last checked a vnode. + * + * ->cb_v_check. A counter of the number of events that we've sent a + * query to the server for. Everything's up to date if this equals + * cb_v_break. + * + * ->cb_scrub. A counter of the number of regression events for which we + * have to completely wipe the cache. + * + * ->cb_ro_snapshot. A counter of the number of times that we've + * recognised that a RO volume has been updated. + * + * ->cb_break. A counter of events that might mean that the contents of a + * vnode have been altered. + * + * ->cb_expires_at. The time at which the callback promise expires or + * AFS_NO_CB_PROMISE if we have no promise. + * + * The way we manage things is: + * + * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on + * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the + * volume and volume's server record. + * + * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level + * callback break on all the volumes that have been using that volume + * (ie. increment ->cb_v_break and reset ->cb_expires_at). + * + * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the + * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also + * dispatch a work item to unmap all PTEs to the vnode's pagecache to + * force reentry to the filesystem for revalidation. + * + * (4) When entering the filesystem, we call afs_validate() to check the + * validity of a vnode. This first checks to see if ->cb_v_check and + * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock + * exclusively and perform an FS.FetchStatus on the vnode. + * + * After checking the volume, we check the vnode. If there's a mismatch + * between the volume counters and the vnode's mirrors of those counters, + * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode. + * + * (5) When the reply from FS.FetchStatus arrives, the VolSync record is + * parsed: + * + * (A) If the Creation timestamp has changed on a RW volume or regressed + * on a RO volume, we try to increment ->cb_scrub; if it advances on a + * RO volume, we assume "vos release" happened and try to increment + * ->cb_ro_snapshot. + * + * (B) If the Update timestamp has regressed, we try to increment + * ->cb_scrub. + * + * Note that in both of these cases, we only do the increment if we can + * cmpxchg the value of the timestamp from the value we noted before the + * op. This tries to prevent parallel ops from fighting one another. + * + * volume->cb_v_check is then set to ->cb_v_break. 
+ * + * (6) The AFSCallBack record included in the FS.FetchStatus reply is also + * parsed and used to set the promise in ->cb_expires_at for the vnode, + * the volume and the volume's server record. + * + * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for + * the vnode. + */ + +/* + * Check the validity of a vnode/inode and its parent volume. + */ +bool afs_check_validity(const struct afs_vnode *vnode) +{ + const struct afs_volume *volume = vnode->volume; + time64_t deadline = ktime_get_real_seconds() + 10; + + if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || + atomic64_read(&vnode->cb_expires_at) <= deadline || + volume->cb_expires_at <= deadline || + vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) || + vnode->cb_scrub != atomic_read(&volume->cb_scrub) || + test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { + _debug("inval"); + return false; + } + + return true; +} + +/* + * See if the server we've just talked to is currently excluded. + */ +static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) +{ + const struct afs_server_entry *se; + const struct afs_server_list *slist; + bool is_excluded = true; + int i; + + rcu_read_lock(); + + slist = rcu_dereference(volume->servers); + for (i = 0; i < slist->nr_servers; i++) { + se = &slist->servers[i]; + if (op->server == se->server) { + is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags); + break; + } + } + + rcu_read_unlock(); + return is_excluded; +} + +/* + * Update the volume's server list when the creation time changes and see if + * the server we've just talked to is currently excluded. + */ +static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) +{ + int ret; + + if (__afs_is_server_excluded(op, volume)) + return 1; + + set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); + ret = afs_check_volume_status(op->volume, op); + if (ret < 0) + return ret; + + return __afs_is_server_excluded(op, volume); +} + +/* + * Handle a change to the volume creation time in the VolSync record. + */ +static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume) +{ + unsigned int snap; + time64_t cur = volume->creation_time; + time64_t old = op->pre_volsync.creation; + time64_t new = op->volsync.creation; + int ret; + + _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); + + if (cur == TIME64_MIN) { + volume->creation_time = new; + return 0; + } + + if (new == cur) + return 0; + + /* Try to advance the creation timestamp from what we had before the + * operation to what we got back from the server. This should + * hopefully ensure that in a race between multiple operations only one + * of them will do this. + */ + if (cur != old) + return 0; + + /* If the creation time changes in an unexpected way, we need to scrub + * our caches. For a RW vol, this will only change if the volume is + * restored from a backup; for a RO/Backup vol, this will advance when + * the volume is updated to a new snapshot (eg. "vos release"). + */ + if (volume->type == AFSVL_RWVOL) + goto regressed; + if (volume->type == AFSVL_BACKVOL) { + if (new < old) + goto regressed; + goto advance; + } + + /* We have an RO volume, we need to query the VL server and look at the + * server flags to see if RW->RO replication is in progress. 
+ */ + ret = afs_is_server_excluded(op, volume); + if (ret < 0) + return ret; + if (ret > 0) { + snap = atomic_read(&volume->cb_ro_snapshot); + trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded); + return ret; + } + +advance: + snap = atomic_inc_return(&volume->cb_ro_snapshot); + trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release); + volume->creation_time = new; + return 0; + +regressed: + atomic_inc(&volume->cb_scrub); + trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress); + volume->creation_time = new; + return 0; +} + +/* + * Handle a change to the volume update time in the VolSync record. + */ +static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume) +{ + enum afs_cb_break_reason reason = afs_cb_break_no_break; + time64_t cur = volume->update_time; + time64_t old = op->pre_volsync.update; + time64_t new = op->volsync.update; + + _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); + + if (cur == TIME64_MIN) { + volume->update_time = new; + return; + } + + if (new == cur) + return; + + /* If the volume update time changes in an unexpected way, we need to + * scrub our caches. For a RW vol, this will advance on every + * modification op; for a RO/Backup vol, this will advance when the + * volume is updated to a new snapshot (eg. "vos release"). + */ + if (new < old) + reason = afs_cb_break_for_update_regress; + + /* Try to advance the update timestamp from what we had before the + * operation to what we got back from the server. This should + * hopefully ensure that in a race between multiple operations only one + * of them will do this. + */ + if (cur == old) { + if (reason == afs_cb_break_for_update_regress) { + atomic_inc(&volume->cb_scrub); + trace_afs_cb_v_break(volume->vid, 0, reason); + } + volume->update_time = new; + } +} + +static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume) +{ + int ret = 0; + + if (likely(op->volsync.creation == volume->creation_time && + op->volsync.update == volume->update_time)) + return 0; + + mutex_lock(&volume->volsync_lock); + if (op->volsync.creation != volume->creation_time) { + ret = afs_update_volume_creation_time(op, volume); + if (ret < 0) + goto out; + } + if (op->volsync.update != volume->update_time) + afs_update_volume_update_time(op, volume); +out: + mutex_unlock(&volume->volsync_lock); + return ret; +} + +/* + * Update the state of a volume, including recording the expiration time of the + * callback promise. Returns 1 to redo the operation from the start. 
+ */ +int afs_update_volume_state(struct afs_operation *op) +{ + struct afs_server_list *slist = op->server_list; + struct afs_server_entry *se = &slist->servers[op->server_index]; + struct afs_callback *cb = &op->file[0].scb.callback; + struct afs_volume *volume = op->volume; + unsigned int cb_v_break = atomic_read(&volume->cb_v_break); + unsigned int cb_v_check = atomic_read(&volume->cb_v_check); + int ret; + + _enter("%llx", op->volume->vid); + + if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) { + ret = afs_update_volume_times(op, volume); + if (ret != 0) { + _leave(" = %d", ret); + return ret; + } + } + + if (op->cb_v_break == cb_v_break && + (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) { + time64_t expires_at = cb->expires_at; + + if (!op->file[0].scb.have_cb) + expires_at = op->file[1].scb.callback.expires_at; + + se->cb_expires_at = expires_at; + volume->cb_expires_at = expires_at; + } + if (cb_v_check < op->cb_v_break) + atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break); + return 0; +} + +/* + * mark the data attached to an inode as obsolete due to a write on the server + * - might also want to ditch all the outstanding writes and dirty pages + */ +static void afs_zap_data(struct afs_vnode *vnode) +{ + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); + + afs_invalidate_cache(vnode, 0); + + /* nuke all the non-dirty pages that aren't locked, mapped or being + * written back in a regular file and completely discard the pages in a + * directory or symlink */ + if (S_ISREG(vnode->netfs.inode.i_mode)) + invalidate_remote_inode(&vnode->netfs.inode); + else + invalidate_inode_pages2(vnode->netfs.inode.i_mapping); +} + +/* + * validate a vnode/inode + * - there are several things we need to check + * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, + * symlink) + * - parent dir metadata changed (security changes) + * - dentry data changed (write, truncate) + * - dentry metadata changed (security changes) + */ +int afs_validate(struct afs_vnode *vnode, struct key *key) +{ + struct afs_volume *volume = vnode->volume; + unsigned int cb_ro_snapshot, cb_scrub; + time64_t deadline = ktime_get_real_seconds() + 10; + bool zap = false, locked_vol = false; + int ret; + + _enter("{v={%llx:%llu} fl=%lx},%x", + vnode->fid.vid, vnode->fid.vnode, vnode->flags, + key_serial(key)); + + if (afs_check_validity(vnode)) + return 0; + + ret = down_write_killable(&vnode->validate_lock); + if (ret < 0) + goto error; + + /* Validate a volume after the v_break has changed or the volume + * callback expired. We only want to do this once per volume per + * v_break change. The actual work will be done when parsing the + * status fetch reply. 
+ */ + if (volume->cb_expires_at <= deadline || + atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) { + ret = mutex_lock_interruptible(&volume->cb_check_lock); + if (ret < 0) + goto error_unlock; + locked_vol = true; + } + + cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot); + cb_scrub = atomic_read(&volume->cb_scrub); + if (vnode->cb_ro_snapshot != cb_ro_snapshot || + vnode->cb_scrub != cb_scrub) + unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); + + if (vnode->cb_ro_snapshot != cb_ro_snapshot || + vnode->cb_scrub != cb_scrub || + volume->cb_expires_at <= deadline || + atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || + atomic64_read(&vnode->cb_expires_at) <= deadline + ) { + ret = afs_fetch_status(vnode, key, false, NULL); + if (ret < 0) { + if (ret == -ENOENT) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } + goto error_unlock; + } + + _debug("new promise [fl=%lx]", vnode->flags); + } + + /* We can drop the volume lock now as. */ + if (locked_vol) { + mutex_unlock(&volume->cb_check_lock); + locked_vol = false; + } + + cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot); + cb_scrub = atomic_read(&volume->cb_scrub); + _debug("vnode inval %x==%x %x==%x", + vnode->cb_ro_snapshot, cb_ro_snapshot, + vnode->cb_scrub, cb_scrub); + if (vnode->cb_scrub != cb_scrub) + zap = true; + vnode->cb_ro_snapshot = cb_ro_snapshot; + vnode->cb_scrub = cb_scrub; + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + _debug("file already deleted"); + ret = -ESTALE; + goto error_unlock; + } + + /* if the vnode's data version number changed then its contents are + * different */ + zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); + if (zap) + afs_zap_data(vnode); + up_write(&vnode->validate_lock); + _leave(" = 0"); + return 0; + +error_unlock: + if (locked_vol) + mutex_unlock(&volume->cb_check_lock); + up_write(&vnode->validate_lock); +error: + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index f04a80e4f5c3..9f36e14f1c2d 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -33,55 +33,6 @@ static struct afs_volume *afs_sample_volume(struct afs_cell *cell, struct key *k } /* - * Compare two addresses. - */ -static int afs_compare_addrs(const struct sockaddr_rxrpc *srx_a, - const struct sockaddr_rxrpc *srx_b) -{ - short port_a, port_b; - int addr_a, addr_b, diff; - - diff = (short)srx_a->transport_type - (short)srx_b->transport_type; - if (diff) - goto out; - - switch (srx_a->transport_type) { - case AF_INET: { - const struct sockaddr_in *a = &srx_a->transport.sin; - const struct sockaddr_in *b = &srx_b->transport.sin; - addr_a = ntohl(a->sin_addr.s_addr); - addr_b = ntohl(b->sin_addr.s_addr); - diff = addr_a - addr_b; - if (diff == 0) { - port_a = ntohs(a->sin_port); - port_b = ntohs(b->sin_port); - diff = port_a - port_b; - } - break; - } - - case AF_INET6: { - const struct sockaddr_in6 *a = &srx_a->transport.sin6; - const struct sockaddr_in6 *b = &srx_b->transport.sin6; - diff = memcmp(&a->sin6_addr, &b->sin6_addr, 16); - if (diff == 0) { - port_a = ntohs(a->sin6_port); - port_b = ntohs(b->sin6_port); - diff = port_a - port_b; - } - break; - } - - default: - WARN_ON(1); - diff = 1; - } - -out: - return diff; -} - -/* * Compare the address lists of a pair of fileservers. 
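In the afs_compare_fs_alists() hunk just below, the removed sockaddr-by-sockaddr comparison gives way to comparing the rxrpc peer pointers now stored in each address entry. A minimal sketch of the idea, assuming (as the rest of this series does) that identical transport addresses resolve to the same shared peer object; the helper name is illustrative and not part of the patch:

	static long afs_addr_entry_cmp(const struct afs_address *a,
				       const struct afs_address *b)
	{
		/* One peer object per remote transport address, so pointer
		 * equality stands in for address equality and pointer order
		 * gives a stable (if arbitrary) sort key. */
		return (long)((unsigned long)a->peer - (unsigned long)b->peer);
	}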
*/ static int afs_compare_fs_alists(const struct afs_server *server_a, @@ -90,13 +41,13 @@ static int afs_compare_fs_alists(const struct afs_server *server_a, const struct afs_addr_list *la, *lb; int a = 0, b = 0, addr_matches = 0; - la = rcu_dereference(server_a->addresses); - lb = rcu_dereference(server_b->addresses); + la = rcu_dereference(server_a->endpoint_state)->addresses; + lb = rcu_dereference(server_b->endpoint_state)->addresses; while (a < la->nr_addrs && b < lb->nr_addrs) { - const struct sockaddr_rxrpc *srx_a = &la->addrs[a]; - const struct sockaddr_rxrpc *srx_b = &lb->addrs[b]; - int diff = afs_compare_addrs(srx_a, srx_b); + unsigned long pa = (unsigned long)la->addrs[a].peer; + unsigned long pb = (unsigned long)lb->addrs[b].peer; + long diff = pa - pb; if (diff < 0) { a++; @@ -126,7 +77,7 @@ static int afs_compare_volume_slists(const struct afs_volume *vol_a, lb = rcu_dereference(vol_b->servers); for (i = 0; i < AFS_MAXTYPES; i++) - if (la->vids[i] != lb->vids[i]) + if (vol_a->vids[i] != vol_b->vids[i]) return 0; while (a < la->nr_servers && b < lb->nr_servers) { @@ -205,7 +156,7 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key, /* And see if it's in the new cell. */ volume = afs_sample_volume(cell, key, pvol->name, pvol->name_len); if (IS_ERR(volume)) { - afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias); + afs_put_volume(pvol, afs_volume_trace_put_query_alias); if (PTR_ERR(volume) != -ENOMEDIUM) return PTR_ERR(volume); /* That volume is not in the new cell, so not an alias */ @@ -223,8 +174,8 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key, rcu_read_unlock(); } - afs_put_volume(cell->net, volume, afs_volume_trace_put_query_alias); - afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias); + afs_put_volume(volume, afs_volume_trace_put_query_alias); + afs_put_volume(pvol, afs_volume_trace_put_query_alias); return ret; } @@ -285,7 +236,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key) while (afs_select_vlserver(&vc)) { if (!test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) { - vc.ac.error = -EOPNOTSUPP; + vc.call_error = -EOPNOTSUPP; skipped = true; continue; } diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c index acc48216136a..9b1c20daac53 100644 --- a/fs/afs/vl_list.c +++ b/fs/afs/vl_list.c @@ -13,6 +13,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len, unsigned short port) { struct afs_vlserver *vlserver; + static atomic_t debug_ids; vlserver = kzalloc(struct_size(vlserver, name, name_len + 1), GFP_KERNEL); @@ -21,8 +22,10 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len, rwlock_init(&vlserver->lock); init_waitqueue_head(&vlserver->probe_wq); spin_lock_init(&vlserver->probe_lock); + vlserver->debug_id = atomic_inc_return(&debug_ids); vlserver->rtt = UINT_MAX; vlserver->name_len = name_len; + vlserver->service_id = VL_SERVICE; vlserver->port = port; memcpy(vlserver->name, name, name_len); } @@ -33,7 +36,8 @@ static void afs_vlserver_rcu(struct rcu_head *rcu) { struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu); - afs_put_addrlist(rcu_access_pointer(vlserver->addresses)); + afs_put_addrlist(rcu_access_pointer(vlserver->addresses), + afs_alist_trace_put_vlserver); kfree_rcu(vlserver, rcu); } @@ -83,14 +87,15 @@ static u16 afs_extract_le16(const u8 **_b) /* * Build a VL server address list from a DNS queried server list. 
*/ -static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, +static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net, + const u8 **_b, const u8 *end, u8 nr_addrs, u16 port) { struct afs_addr_list *alist; const u8 *b = *_b; int ret = -EINVAL; - alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port); + alist = afs_alloc_addrlist(nr_addrs); if (!alist) return ERR_PTR(-ENOMEM); if (nr_addrs == 0) @@ -109,7 +114,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, goto error; } memcpy(x, b, 4); - afs_merge_fs_addr4(alist, x[0], port); + ret = afs_merge_fs_addr4(net, alist, x[0], port); + if (ret < 0) + goto error; b += 4; break; @@ -119,7 +126,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, goto error; } memcpy(x, b, 16); - afs_merge_fs_addr6(alist, x, port); + ret = afs_merge_fs_addr6(net, alist, x, port); + if (ret < 0) + goto error; b += 16; break; @@ -140,7 +149,7 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, error: *_b = b; - afs_put_addrlist(alist); + afs_put_addrlist(alist, afs_alist_trace_put_parse_error); return ERR_PTR(ret); } @@ -247,7 +256,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, /* Extract the addresses - note that we can't skip this as we * have to advance the payload pointer. */ - addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port); + addrs = afs_extract_vl_addrs(cell->net, &b, end, bs.nr_addrs, bs.port); if (IS_ERR(addrs)) { ret = PTR_ERR(addrs); goto error_2; @@ -255,7 +264,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, if (vllist->nr_servers >= nr_servers) { _debug("skip %u >= %u", vllist->nr_servers, nr_servers); - afs_put_addrlist(addrs); + afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty); afs_put_vlserver(cell->net, server); continue; } @@ -264,7 +273,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, addrs->status = bs.status; if (addrs->nr_addrs == 0) { - afs_put_addrlist(addrs); + afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty); if (!rcu_access_pointer(server->addresses)) { afs_put_vlserver(cell->net, server); continue; @@ -276,7 +285,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, old = rcu_replace_pointer(server->addresses, old, lockdep_is_held(&server->lock)); write_unlock(&server->lock); - afs_put_addrlist(old); + afs_put_addrlist(old, afs_alist_trace_put_vlserver_old); } diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index 58452b86e672..3d2e0c925460 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -46,11 +46,12 @@ static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up) */ void afs_vlserver_probe_result(struct afs_call *call) { - struct afs_addr_list *alist = call->alist; + struct afs_addr_list *alist = call->vl_probe; struct afs_vlserver *server = call->vlserver; + struct afs_address *addr = &alist->addrs[call->probe_index]; unsigned int server_index = call->server_index; unsigned int rtt_us = 0; - unsigned int index = call->addr_ix; + unsigned int index = call->probe_index; bool have_result = false; int ret = call->error; @@ -89,7 +90,7 @@ void afs_vlserver_probe_result(struct afs_call *call) case -ETIME: default: clear_bit(index, &alist->responded); - set_bit(index, &alist->failed); + set_bit(index, &alist->probe_failed); if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) && (server->probe.error == 0 || server->probe.error == 
-ETIMEDOUT || @@ -101,21 +102,21 @@ void afs_vlserver_probe_result(struct afs_call *call) responded: set_bit(index, &alist->responded); - clear_bit(index, &alist->failed); + clear_bit(index, &alist->probe_failed); if (call->service_id == YFS_VL_SERVICE) { server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS; set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); - alist->addrs[index].srx_service = call->service_id; + server->service_id = call->service_id; } else { server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS; if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) { clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); - alist->addrs[index].srx_service = call->service_id; + server->service_id = call->service_id; } } - rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us); + rtt_us = rxrpc_kernel_get_srtt(addr->peer); if (rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; server->rtt = rtt_us; @@ -130,8 +131,10 @@ responded: out: spin_unlock(&server->probe_lock); - _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", - server_index, index, &alist->addrs[index].transport, rtt_us, ret); + trace_afs_vl_probe(server, false, alist, index, call->error, call->abort_code, rtt_us); + _debug("probe [%u][%u] %pISpc rtt=%d ret=%d", + server_index, index, rxrpc_kernel_remote_addr(addr->peer), + rtt_us, ret); afs_done_one_vl_probe(server, have_result); } @@ -146,35 +149,52 @@ static bool afs_do_probe_vlserver(struct afs_net *net, unsigned int server_index, struct afs_error *_e) { - struct afs_addr_cursor ac = { - .index = 0, - }; + struct afs_addr_list *alist; struct afs_call *call; + unsigned long unprobed; + unsigned int index, i; bool in_progress = false; + int best_prio; _enter("%s", server->name); read_lock(&server->lock); - ac.alist = rcu_dereference_protected(server->addresses, - lockdep_is_held(&server->lock)); + alist = rcu_dereference_protected(server->addresses, + lockdep_is_held(&server->lock)); + afs_get_addrlist(alist, afs_alist_trace_get_vlprobe); read_unlock(&server->lock); - atomic_set(&server->probe_outstanding, ac.alist->nr_addrs); + atomic_set(&server->probe_outstanding, alist->nr_addrs); memset(&server->probe, 0, sizeof(server->probe)); server->probe.rtt = UINT_MAX; - for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) { - call = afs_vl_get_capabilities(net, &ac, key, server, + unprobed = (1UL << alist->nr_addrs) - 1; + while (unprobed) { + best_prio = -1; + index = 0; + for (i = 0; i < alist->nr_addrs; i++) { + if (test_bit(i, &unprobed) && + alist->addrs[i].prio > best_prio) { + index = i; + best_prio = alist->addrs[i].prio; + } + } + __clear_bit(index, &unprobed); + + trace_afs_vl_probe(server, true, alist, index, 0, 0, 0); + call = afs_vl_get_capabilities(net, alist, index, key, server, server_index); if (!IS_ERR(call)) { + afs_prioritise_error(_e, call->error, call->abort_code); afs_put_call(call); in_progress = true; } else { - afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code); + afs_prioritise_error(_e, PTR_ERR(call), 0); afs_done_one_vl_probe(server, false); } } + afs_put_addrlist(alist, afs_alist_trace_put_vlprobe); return in_progress; } @@ -185,12 +205,10 @@ int afs_send_vl_probes(struct afs_net *net, struct key *key, struct afs_vlserver_list *vllist) { struct afs_vlserver *server; - struct afs_error e; + struct afs_error e = {}; bool in_progress = false; int i; - e.error = 0; - e.responded = false; for (i = 0; i < vllist->nr_servers; i++) { server = vllist->servers[i].server; if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags)) diff --git 
a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index eb415ce56360..d8f79f6ada3d 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -17,18 +17,21 @@ bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell, struct key *key) { + static atomic_t debug_ids; + memset(vc, 0, sizeof(*vc)); vc->cell = cell; vc->key = key; - vc->error = -EDESTADDRREQ; - vc->ac.error = SHRT_MAX; + vc->cumul_error.error = -EDESTADDRREQ; + vc->nr_iterations = -1; if (signal_pending(current)) { - vc->error = -EINTR; + vc->cumul_error.error = -EINTR; vc->flags |= AFS_VL_CURSOR_STOP; return false; } + vc->debug_id = atomic_inc_return(&debug_ids); return true; } @@ -52,7 +55,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) &cell->dns_lookup_count, smp_load_acquire(&cell->dns_lookup_count) != dns_lookup_count) < 0) { - vc->error = -ERESTARTSYS; + vc->cumul_error.error = -ERESTARTSYS; return false; } } @@ -60,12 +63,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) /* Status load is ordered after lookup counter load */ if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) { pr_warn("No record of cell %s\n", cell->name); - vc->error = -ENOENT; + vc->cumul_error.error = -ENOENT; return false; } if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { - vc->error = -EDESTADDRREQ; + vc->cumul_error.error = -EDESTADDRREQ; return false; } } @@ -78,8 +81,8 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) if (!vc->server_list->nr_servers) return false; - vc->untried = (1UL << vc->server_list->nr_servers) - 1; - vc->index = -1; + vc->untried_servers = (1UL << vc->server_list->nr_servers) - 1; + vc->server_index = -1; return true; } @@ -89,54 +92,57 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) */ bool afs_select_vlserver(struct afs_vl_cursor *vc) { - struct afs_addr_list *alist; + struct afs_addr_list *alist = vc->alist; struct afs_vlserver *vlserver; - struct afs_error e; - u32 rtt; - int error = vc->ac.error, i; + unsigned long set, failed; + unsigned int rtt; + s32 abort_code = vc->call_abort_code; + int error = vc->call_error, i; + + vc->nr_iterations++; - _enter("%lx[%d],%lx[%d],%d,%d", - vc->untried, vc->index, - vc->ac.tried, vc->ac.index, - error, vc->ac.abort_code); + _enter("VC=%x+%x,%d{%lx},%d{%lx},%d,%d", + vc->debug_id, vc->nr_iterations, vc->server_index, vc->untried_servers, + vc->addr_index, vc->addr_tried, + error, abort_code); if (vc->flags & AFS_VL_CURSOR_STOP) { _leave(" = f [stopped]"); return false; } - vc->nr_iterations++; + if (vc->nr_iterations == 0) + goto start; + + WRITE_ONCE(alist->addrs[vc->addr_index].last_error, error); /* Evaluate the result of the previous operation, if there was one. */ switch (error) { - case SHRT_MAX: - goto start; - default: case 0: /* Success or local failure. Stop. */ - vc->error = error; + vc->cumul_error.error = error; vc->flags |= AFS_VL_CURSOR_STOP; - _leave(" = f [okay/local %d]", vc->ac.error); + _leave(" = f [okay/local %d]", vc->cumul_error.error); return false; case -ECONNABORTED: /* The far side rejected the operation on some grounds. This * might involve the server being busy or the volume having been moved. */ - switch (vc->ac.abort_code) { + switch (abort_code) { case AFSVL_IO: case AFSVL_BADVOLOPER: case AFSVL_NOMEM: /* The server went weird. 
*/ - vc->error = -EREMOTEIO; + afs_prioritise_error(&vc->cumul_error, -EREMOTEIO, abort_code); //write_lock(&vc->cell->vl_servers_lock); - //vc->server_list->weird_mask |= 1 << vc->index; + //vc->server_list->weird_mask |= 1 << vc->server_index; //write_unlock(&vc->cell->vl_servers_lock); goto next_server; default: - vc->error = afs_abort_to_error(vc->ac.abort_code); + afs_prioritise_error(&vc->cumul_error, error, abort_code); goto failed; } @@ -149,12 +155,12 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) case -ETIMEDOUT: case -ETIME: _debug("no conn %d", error); - vc->error = error; + afs_prioritise_error(&vc->cumul_error, error, 0); goto iterate_address; case -ECONNRESET: _debug("call reset"); - vc->error = error; + afs_prioritise_error(&vc->cumul_error, error, 0); vc->flags |= AFS_VL_CURSOR_RETRY; goto next_server; @@ -165,7 +171,13 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) restart_from_beginning: _debug("restart"); - afs_end_cursor(&vc->ac); + if (vc->call_responded && + vc->addr_index != vc->alist->preferred && + test_bit(alist->preferred, &vc->addr_tried)) + WRITE_ONCE(alist->preferred, vc->addr_index); + afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_restart); + alist = vc->alist = NULL; + afs_put_vlserverlist(vc->cell->net, vc->server_list); vc->server_list = NULL; if (vc->flags & AFS_VL_CURSOR_RETRIED) @@ -173,53 +185,58 @@ restart_from_beginning: vc->flags |= AFS_VL_CURSOR_RETRIED; start: _debug("start"); + ASSERTCMP(alist, ==, NULL); if (!afs_start_vl_iteration(vc)) goto failed; error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list); - if (error < 0) - goto failed_set_error; + if (error < 0) { + afs_prioritise_error(&vc->cumul_error, error, 0); + goto failed; + } pick_server: - _debug("pick [%lx]", vc->untried); + _debug("pick [%lx]", vc->untried_servers); + ASSERTCMP(alist, ==, NULL); - error = afs_wait_for_vl_probes(vc->server_list, vc->untried); - if (error < 0) - goto failed_set_error; + error = afs_wait_for_vl_probes(vc->server_list, vc->untried_servers); + if (error < 0) { + afs_prioritise_error(&vc->cumul_error, error, 0); + goto failed; + } /* Pick the untried server with the lowest RTT. */ - vc->index = vc->server_list->preferred; - if (test_bit(vc->index, &vc->untried)) + vc->server_index = vc->server_list->preferred; + if (test_bit(vc->server_index, &vc->untried_servers)) goto selected_server; - vc->index = -1; - rtt = U32_MAX; + vc->server_index = -1; + rtt = UINT_MAX; for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; - if (!test_bit(i, &vc->untried) || + if (!test_bit(i, &vc->untried_servers) || !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) continue; - if (s->probe.rtt < rtt) { - vc->index = i; + if (s->probe.rtt <= rtt) { + vc->server_index = i; rtt = s->probe.rtt; } } - if (vc->index == -1) + if (vc->server_index == -1) goto no_more_servers; selected_server: - _debug("use %d", vc->index); - __clear_bit(vc->index, &vc->untried); + _debug("use %d", vc->server_index); + __clear_bit(vc->server_index, &vc->untried_servers); /* We're starting on a different vlserver from the list. We need to * check it, find its address list and probe its capabilities before we * use it. 
*/ - ASSERTCMP(vc->ac.alist, ==, NULL); - vlserver = vc->server_list->servers[vc->index].server; + vlserver = vc->server_list->servers[vc->server_index].server; vc->server = vlserver; _debug("USING VLSERVER: %s", vlserver->name); @@ -227,34 +244,48 @@ selected_server: read_lock(&vlserver->lock); alist = rcu_dereference_protected(vlserver->addresses, lockdep_is_held(&vlserver->lock)); - afs_get_addrlist(alist); + vc->alist = afs_get_addrlist(alist, afs_alist_trace_get_vlrotate_set); read_unlock(&vlserver->lock); - memset(&vc->ac, 0, sizeof(vc->ac)); - - if (!vc->ac.alist) - vc->ac.alist = alist; - else - afs_put_addrlist(alist); - - vc->ac.index = -1; + vc->addr_tried = 0; + vc->addr_index = -1; iterate_address: - ASSERT(vc->ac.alist); /* Iterate over the current server's address list to try and find an * address on which it will respond to us. */ - if (!afs_iterate_addresses(&vc->ac)) + set = READ_ONCE(alist->responded); + failed = READ_ONCE(alist->probe_failed); + vc->addr_index = READ_ONCE(alist->preferred); + + _debug("%lx-%lx-%lx,%d", set, failed, vc->addr_tried, vc->addr_index); + + set &= ~(failed | vc->addr_tried); + + if (!set) goto next_server; - _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs); + if (!test_bit(vc->addr_index, &set)) + vc->addr_index = __ffs(set); + + set_bit(vc->addr_index, &vc->addr_tried); + vc->alist = alist; - _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport); + _debug("VL address %d/%d", vc->addr_index, alist->nr_addrs); + + vc->call_responded = false; + _leave(" = t %pISpc", rxrpc_kernel_remote_addr(alist->addrs[vc->addr_index].peer)); return true; next_server: _debug("next"); - afs_end_cursor(&vc->ac); + ASSERT(alist); + if (vc->call_responded && + vc->addr_index != alist->preferred && + test_bit(alist->preferred, &vc->addr_tried)) + WRITE_ONCE(alist->preferred, vc->addr_index); + afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_next); + alist = vc->alist = NULL; goto pick_server; no_more_servers: @@ -264,25 +295,26 @@ no_more_servers: if (vc->flags & AFS_VL_CURSOR_RETRY) goto restart_from_beginning; - e.error = -EDESTADDRREQ; - e.responded = false; for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) - e.responded = true; - afs_prioritise_error(&e, READ_ONCE(s->probe.error), + vc->cumul_error.responded = true; + afs_prioritise_error(&vc->cumul_error, READ_ONCE(s->probe.error), s->probe.abort_code); } - error = e.error; - -failed_set_error: - vc->error = error; failed: + if (alist) { + if (vc->call_responded && + vc->addr_index != alist->preferred && + test_bit(alist->preferred, &vc->addr_tried)) + WRITE_ONCE(alist->preferred, vc->addr_index); + afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_fail); + alist = vc->alist = NULL; + } vc->flags |= AFS_VL_CURSOR_STOP; - afs_end_cursor(&vc->ac); - _leave(" = f [failed %d]", vc->error); + _leave(" = f [failed %d]", vc->cumul_error.error); return false; } @@ -305,7 +337,10 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) pr_notice("DNS: src=%u st=%u lc=%x\n", cell->dns_source, cell->dns_status, cell->dns_lookup_count); pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n", - vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error); + vc->untried_servers, vc->server_index, vc->nr_iterations, + vc->flags, vc->cumul_error.error); + pr_notice("VC: call er=%d ac=%d r=%u\n", + vc->call_error, vc->call_abort_code, vc->call_responded); if 
(vc->server_list) { const struct afs_vlserver_list *sl = vc->server_list; @@ -322,16 +357,14 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) a->nr_ipv4, a->nr_addrs, a->max_addrs, a->preferred); pr_notice("VC: - R=%lx F=%lx\n", - a->responded, a->failed); - if (a == vc->ac.alist) + a->responded, a->probe_failed); + if (a == vc->alist) pr_notice("VC: - current\n"); } } } - pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", - vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error, - vc->ac.responded, vc->ac.nr_iterations); + pr_notice("AC: t=%lx ax=%u\n", vc->addr_tried, vc->addr_index); rcu_read_unlock(); } @@ -342,17 +375,25 @@ int afs_end_vlserver_operation(struct afs_vl_cursor *vc) { struct afs_net *net = vc->cell->net; - if (vc->error == -EDESTADDRREQ || - vc->error == -EADDRNOTAVAIL || - vc->error == -ENETUNREACH || - vc->error == -EHOSTUNREACH) + _enter("VC=%x+%x", vc->debug_id, vc->nr_iterations); + + switch (vc->cumul_error.error) { + case -EDESTADDRREQ: + case -EADDRNOTAVAIL: + case -ENETUNREACH: + case -EHOSTUNREACH: afs_vl_dump_edestaddrreq(vc); + break; + } - afs_end_cursor(&vc->ac); + if (vc->alist) { + if (vc->call_responded && + vc->addr_index != vc->alist->preferred && + test_bit(vc->alist->preferred, &vc->addr_tried)) + WRITE_ONCE(vc->alist->preferred, vc->addr_index); + afs_put_addrlist(vc->alist, afs_alist_trace_put_vlrotate_end); + vc->alist = NULL; + } afs_put_vlserverlist(net, vc->server_list); - - if (vc->error == -ECONNABORTED) - vc->error = afs_abort_to_error(vc->ac.abort_code); - - return vc->error; + return vc->cumul_error.error; } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 00fca3c66ba6..cac75f89b64a 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -18,8 +18,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) { struct afs_uvldbentry__xdr *uvldb; struct afs_vldb_entry *entry; - bool new_only = false; - u32 tmp, nr_servers, vlflags; + u32 nr_servers, vlflags; int i, ret; _enter(""); @@ -41,27 +40,14 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) entry->name[i] = 0; entry->name_len = strlen(entry->name); - /* If there is a new replication site that we can use, ignore all the - * sites that aren't marked as new. - */ - for (i = 0; i < nr_servers; i++) { - tmp = ntohl(uvldb->serverFlags[i]); - if (!(tmp & AFS_VLSF_DONTUSE) && - (tmp & AFS_VLSF_NEWREPSITE)) - new_only = true; - } - vlflags = ntohl(uvldb->flags); for (i = 0; i < nr_servers; i++) { struct afs_uuid__xdr *xdr; struct afs_uuid *uuid; + u32 tmp = ntohl(uvldb->serverFlags[i]); int j; int n = entry->nr_servers; - tmp = ntohl(uvldb->serverFlags[i]); - if (tmp & AFS_VLSF_DONTUSE || - (new_only && !(tmp & AFS_VLSF_NEWREPSITE))) - continue; if (tmp & AFS_VLSF_RWVOL) { entry->fs_mask[n] |= AFS_VOL_VTM_RW; if (vlflags & AFS_VLF_BACKEXISTS) @@ -82,6 +68,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) for (j = 0; j < 6; j++) uuid->node[j] = (u8)ntohl(xdr->node[j]); + entry->vlsf_flags[n] = tmp; entry->addr_version[n] = ntohl(uvldb->serverUnique[i]); entry->nr_servers++; } @@ -106,12 +93,6 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) return 0; } -static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call) -{ - kfree(call->ret_vldb); - afs_flat_call_destructor(call); -} - /* * VL.GetEntryByNameU operation type. 
*/ @@ -119,7 +100,7 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = { .name = "VL.GetEntryByNameU", .op = afs_VL_GetEntryByNameU, .deliver = afs_deliver_vl_get_entry_by_name_u, - .destructor = afs_destroy_vl_get_entry_by_name_u, + .destructor = afs_flat_call_destructor, }; /* @@ -155,6 +136,8 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc, call->key = vc->key; call->ret_vldb = entry; call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* Marshall the parameters */ bp = call->request; @@ -165,8 +148,17 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc, memset((void *)bp + volnamesz, 0, padsz); trace_afs_make_vl_call(call); - afs_make_call(&vc->ac, call, GFP_KERNEL); - return (struct afs_vldb_entry *)afs_wait_for_call_to_complete(call, &vc->ac); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + afs_put_call(call); + if (vc->call_error) { + kfree(entry); + return ERR_PTR(vc->call_error); + } + return entry; } /* @@ -208,7 +200,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) count = ntohl(*bp); nentries = min(nentries, count); - alist = afs_alloc_addrlist(nentries, FS_SERVICE, AFS_FS_PORT); + alist = afs_alloc_addrlist(nentries); if (!alist) return -ENOMEM; alist->version = uniquifier; @@ -230,9 +222,13 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) alist = call->ret_alist; bp = call->buffer; count = min(call->count, 4U); - for (i = 0; i < count; i++) - if (alist->nr_addrs < call->count2) - afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT); + for (i = 0; i < count; i++) { + if (alist->nr_addrs < call->count2) { + ret = afs_merge_fs_addr4(call->net, alist, *bp++, AFS_FS_PORT); + if (ret < 0) + return ret; + } + } call->count -= count; if (call->count > 0) @@ -245,12 +241,6 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) return 0; } -static void afs_vl_get_addrs_u_destructor(struct afs_call *call) -{ - afs_put_addrlist(call->ret_alist); - return afs_flat_call_destructor(call); -} - /* * VL.GetAddrsU operation type. 
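The wait-and-collect sequence introduced above for VL.GetEntryByNameU is repeated by every synchronous VL RPC wrapper in the rest of this file. Condensed into one place (hypothetical helper; the real wrappers open-code it so each can pick out its own call->ret_* field and free it on error):

	static void afs_vl_collect_result(struct afs_vl_cursor *vc,
					  struct afs_call *call)
	{
		/* Park until the call finishes, then copy the outcome into the
		 * cursor so the rotation code can prioritise errors and update
		 * the preferred address. */
		afs_wait_for_call_to_complete(call);
		vc->call_abort_code = call->abort_code;
		vc->call_error      = call->error;
		vc->call_responded  = call->responded;
	}

Each wrapper then drops the call with afs_put_call() and, if vc->call_error is set, discards its partially built result and returns ERR_PTR(vc->call_error).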
*/ @@ -258,7 +248,7 @@ static const struct afs_call_type afs_RXVLGetAddrsU = { .name = "VL.GetAddrsU", .op = afs_VL_GetAddrsU, .deliver = afs_deliver_vl_get_addrs_u, - .destructor = afs_vl_get_addrs_u_destructor, + .destructor = afs_flat_call_destructor, }; /* @@ -269,6 +259,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, const uuid_t *uuid) { struct afs_ListAddrByAttributes__xdr *r; + struct afs_addr_list *alist; const struct afs_uuid *u = (const struct afs_uuid *)uuid; struct afs_call *call; struct afs_net *net = vc->cell->net; @@ -286,6 +277,8 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, call->key = vc->key; call->ret_alist = NULL; call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* Marshall the parameters */ bp = call->request; @@ -304,8 +297,18 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, r->uuid.node[i] = htonl(u->node[i]); trace_afs_make_vl_call(call); - afs_make_call(&vc->ac, call, GFP_KERNEL); - return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + alist = call->ret_alist; + afs_put_call(call); + if (vc->call_error) { + afs_put_addrlist(alist, afs_alist_trace_put_getaddru); + return ERR_PTR(vc->call_error); + } + return alist; } /* @@ -355,6 +358,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call) static void afs_destroy_vl_get_capabilities(struct afs_call *call) { + afs_put_addrlist(call->vl_probe, afs_alist_trace_put_vlgetcaps); afs_put_vlserver(call->net, call->vlserver); afs_flat_call_destructor(call); } @@ -378,7 +382,8 @@ static const struct afs_call_type afs_RXVLGetCapabilities = { * other end supports. 
*/ struct afs_call *afs_vl_get_capabilities(struct afs_net *net, - struct afs_addr_cursor *ac, + struct afs_addr_list *alist, + unsigned int addr_index, struct key *key, struct afs_vlserver *server, unsigned int server_index) @@ -395,6 +400,10 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net, call->key = key; call->vlserver = afs_get_vlserver(server); call->server_index = server_index; + call->peer = rxrpc_kernel_get_peer(alist->addrs[addr_index].peer); + call->vl_probe = afs_get_addrlist(alist, afs_alist_trace_get_vlgetcaps); + call->probe_index = addr_index; + call->service_id = server->service_id; call->upgrade = true; call->async = true; call->max_lifespan = AFS_PROBE_MAX_LIFESPAN; @@ -405,7 +414,7 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net, /* Can't take a ref on server */ trace_afs_make_vl_call(call); - afs_make_call(ac, call, GFP_KERNEL); + afs_make_call(call, GFP_KERNEL); return call; } @@ -450,7 +459,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) if (call->count > YFS_MAXENDPOINTS) return afs_protocol_error(call, afs_eproto_yvl_fsendpt_num); - alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT); + alist = afs_alloc_addrlist(call->count); if (!alist) return -ENOMEM; alist->version = uniquifier; @@ -488,14 +497,18 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) if (ntohl(bp[0]) != sizeof(__be32) * 2) return afs_protocol_error( call, afs_eproto_yvl_fsendpt4_len); - afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2])); + ret = afs_merge_fs_addr4(call->net, alist, bp[1], ntohl(bp[2])); + if (ret < 0) + return ret; bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) return afs_protocol_error( call, afs_eproto_yvl_fsendpt6_len); - afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5])); + ret = afs_merge_fs_addr6(call->net, alist, bp + 1, ntohl(bp[5])); + if (ret < 0) + return ret; bp += 6; break; default: @@ -610,7 +623,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = { .name = "YFSVL.GetEndpoints", .op = afs_YFSVL_GetEndpoints, .deliver = afs_deliver_yfsvl_get_endpoints, - .destructor = afs_vl_get_addrs_u_destructor, + .destructor = afs_flat_call_destructor, }; /* @@ -620,6 +633,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = { struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, const uuid_t *uuid) { + struct afs_addr_list *alist; struct afs_call *call; struct afs_net *net = vc->cell->net; __be32 *bp; @@ -635,6 +649,8 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, call->key = vc->key; call->ret_alist = NULL; call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* Marshall the parameters */ bp = call->request; @@ -643,8 +659,18 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */ trace_afs_make_vl_call(call); - afs_make_call(&vc->ac, call, GFP_KERNEL); - return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + alist = call->ret_alist; + afs_put_call(call); + if (vc->call_error) { + afs_put_addrlist(alist, afs_alist_trace_put_getaddru); + return ERR_PTR(vc->call_error); + } + return alist; } /* @@ -709,12 +735,6 @@ 
static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) return 0; } -static void afs_destroy_yfsvl_get_cell_name(struct afs_call *call) -{ - kfree(call->ret_str); - afs_flat_call_destructor(call); -} - /* * VL.GetCapabilities operation type */ @@ -722,7 +742,7 @@ static const struct afs_call_type afs_YFSVLGetCellName = { .name = "YFSVL.GetCellName", .op = afs_YFSVL_GetCellName, .deliver = afs_deliver_yfsvl_get_cell_name, - .destructor = afs_destroy_yfsvl_get_cell_name, + .destructor = afs_flat_call_destructor, }; /* @@ -737,6 +757,7 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) struct afs_call *call; struct afs_net *net = vc->cell->net; __be32 *bp; + char *cellname; _enter(""); @@ -747,6 +768,8 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) call->key = vc->key; call->ret_str = NULL; call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* marshall the parameters */ bp = call->request; @@ -754,6 +777,16 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) /* Can't take a ref on server */ trace_afs_make_vl_call(call); - afs_make_call(&vc->ac, call, GFP_KERNEL); - return (char *)afs_wait_for_call_to_complete(call, &vc->ac); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + cellname = call->ret_str; + afs_put_call(call); + if (vc->call_error) { + kfree(cellname); + return ERR_PTR(vc->call_error); + } + return cellname; } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 115c081a8e2c..020ecd45e476 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -11,6 +11,8 @@ static unsigned __read_mostly afs_volume_record_life = 60 * 60; +static void afs_destroy_volume(struct work_struct *work); + /* * Insert a volume into a cell. If there's an existing volume record, that is * returned instead with a ref held. 
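The afs_destroy_volume() forward declaration added in the hunk above exists because volume teardown now runs from a workqueue rather than synchronously in the put path. The shape of that path (shown in full later in this file's diff) is roughly the following; the sketch drops the refcount tracepoint the real code emits, and the helper name is illustrative:

	static void afs_put_volume_sketch(struct afs_volume *volume)
	{
		if (!volume)
			return;
		/* Last reference gone: defer the heavyweight teardown
		 * (detaching from servers, dropping the cell) to a worker. */
		if (refcount_dec_and_test(&volume->ref))
			schedule_work(&volume->destructor);
	}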
@@ -72,11 +74,11 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume) */ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, struct afs_vldb_entry *vldb, - unsigned long type_mask) + struct afs_server_list **_slist) { struct afs_server_list *slist; struct afs_volume *volume; - int ret = -ENOMEM; + int ret = -ENOMEM, i; volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); if (!volume) @@ -88,20 +90,30 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, volume->type = params->type; volume->type_force = params->force; volume->name_len = vldb->name_len; + volume->creation_time = TIME64_MIN; + volume->update_time = TIME64_MIN; refcount_set(&volume->ref, 1); INIT_HLIST_NODE(&volume->proc_link); + INIT_WORK(&volume->destructor, afs_destroy_volume); rwlock_init(&volume->servers_lock); + mutex_init(&volume->volsync_lock); + mutex_init(&volume->cb_check_lock); rwlock_init(&volume->cb_v_break_lock); + INIT_LIST_HEAD(&volume->open_mmaps); + init_rwsem(&volume->open_mmaps_lock); memcpy(volume->name, vldb->name, vldb->name_len + 1); - slist = afs_alloc_server_list(params->cell, params->key, vldb, type_mask); + for (i = 0; i < AFS_MAXTYPES; i++) + volume->vids[i] = vldb->vid[i]; + + slist = afs_alloc_server_list(volume, params->key, vldb); if (IS_ERR(slist)) { ret = PTR_ERR(slist); goto error_1; } - refcount_set(&slist->usage, 1); + *_slist = slist; rcu_assign_pointer(volume->servers, slist); trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc); return volume; @@ -117,18 +129,20 @@ error_0: * Look up or allocate a volume record. */ static struct afs_volume *afs_lookup_volume(struct afs_fs_context *params, - struct afs_vldb_entry *vldb, - unsigned long type_mask) + struct afs_vldb_entry *vldb) { + struct afs_server_list *slist; struct afs_volume *candidate, *volume; - candidate = afs_alloc_volume(params, vldb, type_mask); + candidate = afs_alloc_volume(params, vldb, &slist); if (IS_ERR(candidate)) return candidate; volume = afs_insert_volume_into_cell(params->cell, candidate); - if (volume != candidate) - afs_put_volume(params->net, candidate, afs_volume_trace_put_cell_dup); + if (volume == candidate) + afs_attach_volume_to_servers(volume, slist); + else + afs_put_volume(candidate, afs_volume_trace_put_cell_dup); return volume; } @@ -208,8 +222,7 @@ struct afs_volume *afs_create_volume(struct afs_fs_context *params) goto error; } - type_mask = 1UL << params->type; - volume = afs_lookup_volume(params, vldb, type_mask); + volume = afs_lookup_volume(params, vldb); error: kfree(vldb); @@ -219,16 +232,20 @@ error: /* * Destroy a volume record */ -static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) +static void afs_destroy_volume(struct work_struct *work) { + struct afs_volume *volume = container_of(work, struct afs_volume, destructor); + struct afs_server_list *slist = rcu_access_pointer(volume->servers); + _enter("%p", volume); #ifdef CONFIG_AFS_FSCACHE ASSERTCMP(volume->cache, ==, NULL); #endif + afs_detach_volume_from_servers(volume, slist); afs_remove_volume_from_cell(volume); - afs_put_serverlist(net, rcu_access_pointer(volume->servers)); + afs_put_serverlist(volume->cell->net, slist); afs_put_cell(volume->cell, afs_cell_trace_put_vol); trace_afs_volume(volume->vid, refcount_read(&volume->ref), afs_volume_trace_free); @@ -270,8 +287,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume, /* * Drop a reference on a volume record. 
*/ -void afs_put_volume(struct afs_net *net, struct afs_volume *volume, - enum afs_volume_trace reason) +void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason) { if (volume) { afs_volid_t vid = volume->vid; @@ -281,7 +297,7 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume, zero = __refcount_dec_and_test(&volume->ref, &r); trace_afs_volume(vid, r - 1, reason); if (zero) - afs_destroy_volume(net, volume); + schedule_work(&volume->destructor); } } @@ -362,8 +378,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) } /* See if the volume's server list got updated. */ - new = afs_alloc_server_list(volume->cell, key, - vldb, (1 << volume->type)); + new = afs_alloc_server_list(volume, key, vldb); if (IS_ERR(new)) { ret = PTR_ERR(new); goto error_vldb; @@ -382,11 +397,17 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) discard = old; } - volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; + /* Check more often if replication is ongoing. */ + if (new->ro_replicating) + volume->update_at = ktime_get_real_seconds() + 10 * 60; + else + volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; write_unlock(&volume->servers_lock); - ret = 0; + if (discard == old) + afs_reattach_volume_to_servers(volume, new, old); afs_put_serverlist(volume->cell->net, discard); + ret = 0; error_vldb: kfree(vldb); error: diff --git a/fs/afs/write.c b/fs/afs/write.c index 4a168781936b..61d34ad2ca7d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -242,7 +242,7 @@ static void afs_kill_pages(struct address_space *mapping, folio_clear_uptodate(folio); folio_end_writeback(folio); folio_lock(folio); - generic_error_remove_page(mapping, &folio->page); + generic_error_remove_folio(mapping, folio); folio_unlock(folio); folio_put(folio); @@ -366,7 +366,7 @@ static void afs_store_data_success(struct afs_operation *op) op->ctime = op->file[0].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[0]); - if (op->error == 0) { + if (!afs_op_error(op)) { if (!op->store.laundering) afs_pages_written_back(vnode, op->store.pos, op->store.size); afs_stat_v(vnode, n_stores); @@ -428,7 +428,7 @@ try_next_key: afs_wait_for_operation(op); - switch (op->error) { + switch (afs_op_error(op)) { case -EACCES: case -EPERM: case -ENOKEY: @@ -447,7 +447,7 @@ try_next_key: } afs_put_wb_key(wbk); - _leave(" = %d", op->error); + _leave(" = %d", afs_op_error(op)); return afs_put_operation(op); } @@ -559,8 +559,7 @@ static void afs_extend_writeback(struct address_space *mapping, if (!folio_clear_dirty_for_io(folio)) BUG(); - if (folio_start_writeback(folio)) - BUG(); + folio_start_writeback(folio); afs_folio_start_fscache(caching, folio); *_count -= folio_nr_pages(folio); @@ -595,8 +594,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, _enter(",%lx,%llx-%llx", folio_index(folio), start, end); - if (folio_start_writeback(folio)) - BUG(); + folio_start_writeback(folio); afs_folio_start_fscache(caching, folio); count -= folio_nr_pages(folio); diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 64b2c0224f62..e19f396aa370 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -75,7 +75,7 @@ static bool afs_make_acl(struct afs_operation *op, { struct afs_acl *acl; - acl = kmalloc(sizeof(*acl) + size, GFP_KERNEL); + acl = kmalloc(struct_size(acl, data, size), GFP_KERNEL); if (!acl) { afs_op_nomem(op); return false; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 
11571cca86c1..f521e66d3bf6 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -245,12 +245,15 @@ static void xdr_decode_YFSVolSync(const __be32 **_bp, struct afs_volsync *volsync) { struct yfs_xdr_YFSVolSync *x = (void *)*_bp; - u64 creation; + u64 creation, update; if (volsync) { creation = xdr_to_u64(x->vol_creation_date); do_div(creation, 10 * 1000 * 1000); volsync->creation = creation; + update = xdr_to_u64(x->vol_update_date); + do_div(update, 10 * 1000 * 1000); + volsync->update = update; } *_bp += xdr_size(x); @@ -490,6 +493,7 @@ void yfs_fs_fetch_data(struct afs_operation *op) bp = xdr_encode_u64(bp, req->len); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -572,6 +576,7 @@ void yfs_fs_create_file(struct afs_operation *op) bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */ yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -620,6 +625,7 @@ void yfs_fs_make_dir(struct afs_operation *op) bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -704,6 +710,7 @@ void yfs_fs_remove_file2(struct afs_operation *op) bp = xdr_encode_name(bp, name); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -773,6 +780,7 @@ void yfs_fs_remove_file(struct afs_operation *op) bp = xdr_encode_name(bp, name); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -814,6 +822,7 @@ void yfs_fs_remove_dir(struct afs_operation *op) bp = xdr_encode_name(bp, name); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -887,6 +896,7 @@ void yfs_fs_link(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call1(call, &vp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -968,6 +978,7 @@ void yfs_fs_symlink(struct afs_operation *op) bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -1047,6 +1058,7 @@ void yfs_fs_rename(struct afs_operation *op) bp = xdr_encode_name(bp, new_name); yfs_check_req(call, bp); + call->fid = orig_dvp->fid; trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); afs_make_op_call(op, call, GFP_NOFS); } @@ -1102,6 +1114,7 @@ void yfs_fs_store_data(struct afs_operation *op) bp = xdr_encode_u64(bp, op->store.i_size); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1158,6 +1171,7 @@ static void yfs_fs_setattr_size(struct afs_operation *op) bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */ yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1196,6 +1210,7 @@ void yfs_fs_setattr(struct afs_operation *op) bp = xdr_encode_YFS_StoreStatus(bp, attr); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1366,6 +1381,7 @@ void 
yfs_fs_get_volume_status(struct afs_operation *op) bp = xdr_encode_u64(bp, vp->fid.vid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1430,6 +1446,7 @@ void yfs_fs_set_lock(struct afs_operation *op) bp = xdr_encode_u32(bp, op->lock.type); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_calli(call, &vp->fid, op->lock.type); afs_make_op_call(op, call, GFP_NOFS); } @@ -1460,6 +1477,7 @@ void yfs_fs_extend_lock(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1490,6 +1508,7 @@ void yfs_fs_release_lock(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1556,6 +1575,7 @@ void yfs_fs_fetch_status(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1736,6 +1756,7 @@ void yfs_fs_inline_bulk_status(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &op->more_files[i].fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1898,6 +1919,7 @@ void yfs_fs_fetch_opaque_acl(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_KERNEL); } @@ -1948,6 +1970,7 @@ void yfs_fs_store_opaque_acl2(struct afs_operation *op) bp += size / sizeof(__be32); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_KERNEL); } @@ -239,7 +239,6 @@ static struct ctl_table aio_sysctls[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - {} }; static void __init aio_sysctl_init(void) @@ -266,7 +265,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) return ERR_CAST(inode); inode->i_mapping->a_ops = &aio_ctx_aops; - inode->i_mapping->private_data = ctx; + inode->i_mapping->i_private_data = ctx; inode->i_size = PAGE_SIZE * nr_pages; file = alloc_file_pseudo(inode, aio_mnt, "[aio]", @@ -316,10 +315,10 @@ static void put_aio_ring_file(struct kioctx *ctx) /* Prevent further access to the kioctx from migratepages */ i_mapping = aio_ring_file->f_mapping; - spin_lock(&i_mapping->private_lock); - i_mapping->private_data = NULL; + spin_lock(&i_mapping->i_private_lock); + i_mapping->i_private_data = NULL; ctx->aio_ring_file = NULL; - spin_unlock(&i_mapping->private_lock); + spin_unlock(&i_mapping->i_private_lock); fput(aio_ring_file); } @@ -422,9 +421,9 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, rc = 0; - /* mapping->private_lock here protects against the kioctx teardown. */ - spin_lock(&mapping->private_lock); - ctx = mapping->private_data; + /* mapping->i_private_lock here protects against the kioctx teardown. 
*/ + spin_lock(&mapping->i_private_lock); + ctx = mapping->i_private_data; if (!ctx) { rc = -EINVAL; goto out; @@ -476,7 +475,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, out_unlock: mutex_unlock(&ctx->ring_lock); out: - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return rc; } #else @@ -1106,6 +1105,11 @@ static inline void iocb_destroy(struct aio_kiocb *iocb) kmem_cache_free(kiocb_cachep, iocb); } +struct aio_waiter { + struct wait_queue_entry w; + size_t min_nr; +}; + /* aio_complete * Called when the io request on the given iocb is complete. */ @@ -1114,7 +1118,7 @@ static void aio_complete(struct aio_kiocb *iocb) struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; - unsigned tail, pos, head; + unsigned tail, pos, head, avail; unsigned long flags; /* @@ -1156,6 +1160,10 @@ static void aio_complete(struct aio_kiocb *iocb) ctx->completed_events++; if (ctx->completed_events > 1) refill_reqs_available(ctx, head, tail); + + avail = tail > head + ? tail - head + : tail + ctx->nr_events - head; spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); @@ -1166,7 +1174,7 @@ static void aio_complete(struct aio_kiocb *iocb) * from IRQ context. */ if (iocb->ki_eventfd) - eventfd_signal(iocb->ki_eventfd, 1); + eventfd_signal(iocb->ki_eventfd); /* * We have to order our ring_info tail store above and test @@ -1176,8 +1184,18 @@ static void aio_complete(struct aio_kiocb *iocb) */ smp_mb(); - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); + if (waitqueue_active(&ctx->wait)) { + struct aio_waiter *curr, *next; + unsigned long flags; + + spin_lock_irqsave(&ctx->wait.lock, flags); + list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry) + if (avail >= curr->min_nr) { + list_del_init_careful(&curr->w.entry); + wake_up_process(curr->w.private); + } + spin_unlock_irqrestore(&ctx->wait.lock, flags); + } } static inline void iocb_put(struct aio_kiocb *iocb) @@ -1290,7 +1308,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, ktime_t until) { - long ret = 0; + struct hrtimer_sleeper t; + struct aio_waiter w; + long ret = 0, ret2 = 0; /* * Note that aio_read_events() is being called as the conditional - i.e. @@ -1306,12 +1326,38 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. 
*/ - if (until == 0) - aio_read_events(ctx, min_nr, nr, event, &ret); - else - wait_event_interruptible_hrtimeout(ctx->wait, - aio_read_events(ctx, min_nr, nr, event, &ret), - until); + aio_read_events(ctx, min_nr, nr, event, &ret); + if (until == 0 || ret < 0 || ret >= min_nr) + return ret; + + hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + if (until != KTIME_MAX) { + hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns); + hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); + } + + init_wait(&w.w); + + while (1) { + unsigned long nr_got = ret; + + w.min_nr = min_nr - ret; + + ret2 = prepare_to_wait_event(&ctx->wait, &w.w, TASK_INTERRUPTIBLE); + if (!ret2 && !t.task) + ret2 = -ETIME; + + if (aio_read_events(ctx, min_nr, nr, event, &ret) || ret2) + break; + + if (nr_got == ret) + schedule(); + } + + finish_wait(&ctx->wait, &w.w); + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + return ret; } @@ -1498,7 +1544,7 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, size_t len = iocb->aio_nbytes; if (!vectored) { - ssize_t ret = import_single_range(rw, buf, len, *iovec, iter); + ssize_t ret = import_ubuf(rw, buf, len, iter); *iovec = NULL; return ret; } diff --git a/fs/attr.c b/fs/attr.c index bdf5deb06ea9..5a13f0c8495f 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -157,7 +157,7 @@ static bool chgrp_ok(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 038b3d2d9f57..39d8c84c16f4 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -73,12 +73,9 @@ done: /* p->d_lock held */ static struct dentry *positive_after(struct dentry *p, struct dentry *child) { - if (child) - child = list_next_entry(child, d_child); - else - child = list_first_entry(&p->d_subdirs, struct dentry, d_child); + child = child ? d_next_sibling(child) : d_first_child(p); - list_for_each_entry_from(child, &p->d_subdirs, d_child) { + hlist_for_each_entry_from(child, d_sib) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { dget_dlock(child); diff --git a/fs/backing-file.c b/fs/backing-file.c new file mode 100644 index 000000000000..a681f38d84d8 --- /dev/null +++ b/fs/backing-file.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Common helpers for stackable filesystems and backing files. + * + * Forked from fs/overlayfs/file.c. + * + * Copyright (C) 2017 Red Hat, Inc. + * Copyright (C) 2023 CTERA Networks. + */ + +#include <linux/fs.h> +#include <linux/backing-file.h> +#include <linux/splice.h> +#include <linux/mm.h> + +#include "internal.h" + +/** + * backing_file_open - open a backing file for kernel internal use + * @user_path: path that the user requested to open + * @flags: open flags + * @real_path: path of the backing file + * @cred: credentials for open + * + * Open a backing file for a stackable filesystem (e.g., overlayfs). + * @user_path may be on the stackable filesystem and @real_path on the + * underlying filesystem. In this case, we want to be able to return the + * @user_path of the stackable filesystem.
This is done by embedding the + * returned file into a container structure that also stores the stacked + * file's path, which can be retrieved using backing_file_user_path(). + */ +struct file *backing_file_open(const struct path *user_path, int flags, + const struct path *real_path, + const struct cred *cred) +{ + struct file *f; + int error; + + f = alloc_empty_backing_file(flags, cred); + if (IS_ERR(f)) + return f; + + path_get(user_path); + *backing_file_user_path(f) = *user_path; + error = vfs_open(real_path, f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + + return f; +} +EXPORT_SYMBOL_GPL(backing_file_open); + +struct backing_aio { + struct kiocb iocb; + refcount_t ref; + struct kiocb *orig_iocb; + /* used for aio completion */ + void (*end_write)(struct file *); + struct work_struct work; + long res; +}; + +static struct kmem_cache *backing_aio_cachep; + +#define BACKING_IOCB_MASK \ + (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) + +static rwf_t iocb_to_rw_flags(int flags) +{ + return (__force rwf_t)(flags & BACKING_IOCB_MASK); +} + +static void backing_aio_put(struct backing_aio *aio) +{ + if (refcount_dec_and_test(&aio->ref)) { + fput(aio->iocb.ki_filp); + kmem_cache_free(backing_aio_cachep, aio); + } +} + +static void backing_aio_cleanup(struct backing_aio *aio, long res) +{ + struct kiocb *iocb = &aio->iocb; + struct kiocb *orig_iocb = aio->orig_iocb; + + if (aio->end_write) + aio->end_write(orig_iocb->ki_filp); + + orig_iocb->ki_pos = iocb->ki_pos; + backing_aio_put(aio); +} + +static void backing_aio_rw_complete(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + struct kiocb *orig_iocb = aio->orig_iocb; + + if (iocb->ki_flags & IOCB_WRITE) + kiocb_end_write(iocb); + + backing_aio_cleanup(aio, res); + orig_iocb->ki_complete(orig_iocb, res); +} + +static void backing_aio_complete_work(struct work_struct *work) +{ + struct backing_aio *aio = container_of(work, struct backing_aio, work); + + backing_aio_rw_complete(&aio->iocb, aio->res); +} + +static void backing_aio_queue_completion(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + + /* + * Punt to a work queue to serialize updates of mtime/size. 
+ */ + aio->res = res; + INIT_WORK(&aio->work, backing_aio_complete_work); + queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq, + &aio->work); +} + +static int backing_aio_init_wq(struct kiocb *iocb) +{ + struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; + + if (sb->s_dio_done_wq) + return 0; + + return sb_init_dio_done_wq(sb); +} + + +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + struct backing_aio *aio = NULL; + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf); + } else { + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_complete = backing_aio_rw_complete; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_read(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_read_iter); + +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + ret = file_remove_privs(ctx->user_file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + /* + * Stacked filesystems don't support deferred completions, don't copy + * this property in case it is set by the issuer. 
+ */ + flags &= ~IOCB_DIO_CALLER_COMP; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); + if (ctx->end_write) + ctx->end_write(ctx->user_file); + } else { + struct backing_aio *aio; + + ret = backing_aio_init_wq(iocb); + if (ret) + goto out; + + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + aio->end_write = ctx->end_write; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_flags = flags; + aio->iocb.ki_complete = backing_aio_queue_completion; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_write(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_write_iter); + +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) + return -EIO; + + old_cred = override_creds(ctx->cred); + ret = vfs_splice_read(in, ppos, pipe, len, flags); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_read); + +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) + return -EIO; + + ret = file_remove_privs(ctx->user_file); + if (ret) + return ret; + + old_cred = override_creds(ctx->cred); + file_start_write(out); + ret = iter_file_splice_write(pipe, out, ppos, len, flags); + file_end_write(out); + revert_creds(old_cred); + + if (ctx->end_write) + ctx->end_write(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_write); + +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + int ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) || + WARN_ON_ONCE(ctx->user_file != vma->vm_file)) + return -EIO; + + if (!file->f_op->mmap) + return -ENODEV; + + vma_set_file(vma, file); + + old_cred = override_creds(ctx->cred); + ret = call_mmap(vma->vm_file, vma); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_mmap); + +static int __init backing_aio_init(void) +{ + backing_aio_cachep = kmem_cache_create("backing_aio", + sizeof(struct backing_aio), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!backing_aio_cachep) + return -ENOMEM; + + return 0; +} +fs_initcall(backing_aio_init); diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index fddc7be58022..5cdfef3b551a 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -50,14 +50,6 @@ config BCACHEFS_POSIX_ACL depends on BCACHEFS_FS select FS_POSIX_ACL -config BCACHEFS_DEBUG_TRANSACTIONS - bool "bcachefs runtime info" - depends on BCACHEFS_FS - help - This makes the list of running btree transactions available in debugfs. - - This is a highly useful debugging feature but does add a small amount of overhead. 
- config BCACHEFS_DEBUG bool "bcachefs debugging" depends on BCACHEFS_FS @@ -85,6 +77,16 @@ config BCACHEFS_NO_LATENCY_ACCT help This disables device latency tracking and time stats, only for performance testing +config BCACHEFS_SIX_OPTIMISTIC_SPIN + bool "Optimistic spinning for six locks" + depends on BCACHEFS_FS + depends on SMP + default y + help + Instead of immediately sleeping when attempting to take a six lock that + is held by another thread, spin for a short while, as long as the + thread owning the lock is running. + config MEAN_AND_VARIANCE_UNIT_TEST tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index b81268418174..7423a3557c68 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -82,6 +82,7 @@ bcachefs-y := \ super-io.o \ sysfs.o \ tests.o \ + thread_with_file.o \ trace.o \ two_state_shared_lock.o \ util.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1fec0e67891f..a09b9d00226a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -261,10 +261,8 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: - bkey_fsck_err_on(a.v->dirty_sectors || - a.v->cached_sectors || - a.v->stripe, c, err, - alloc_key_empty_but_have_data, + bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe, + c, err, alloc_key_empty_but_have_data, "empty data type free but have data"); break; case BCH_DATA_sb: @@ -272,22 +270,21 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: - bkey_fsck_err_on(!a.v->dirty_sectors, c, err, - alloc_key_dirty_sectors_0, + bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v), + c, err, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", bch2_data_types[a.v->data_type]); break; case BCH_DATA_cached: bkey_fsck_err_on(!a.v->cached_sectors || - a.v->dirty_sectors || - a.v->stripe, c, err, - alloc_key_cached_inconsistency, + bch2_bucket_sectors_dirty(*a.v) || + a.v->stripe, + c, err, alloc_key_cached_inconsistency, "data type inconsistency"); bkey_fsck_err_on(!a.v->io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, - c, err, - alloc_key_cached_but_read_time_zero, + c, err, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; case BCH_DATA_stripe: @@ -537,18 +534,12 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke int bch2_bucket_gens_init(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a; struct bkey_i_bucket_gens g; bool have_bucket_gens_key = false; - unsigned offset; - struct bpos pos; - u8 gen; int ret; - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: @@ -556,13 +547,14 @@ int bch2_bucket_gens_init(struct bch_fs *c) if (!bch2_dev_bucket_exists(c, k.k->p)) continue; - gen = bch2_alloc_to_v4(k, &a)->gen; - pos = alloc_gens_pos(iter.pos, &offset); + struct bch_alloc_v4 a; + u8 gen = bch2_alloc_to_v4(k, &a)->gen; + unsigned offset; + struct bpos pos = alloc_gens_pos(iter.pos, &offset); if (have_bucket_gens_key && 
bkey_cmp(iter.pos, pos)) { ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, + BCH_TRANS_COMMIT_no_enospc, bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); if (ret) break; @@ -576,45 +568,37 @@ int bch2_bucket_gens_init(struct bch_fs *c) } g.v.gens[offset] = gen; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); if (have_bucket_gens_key && !ret) ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, + BCH_TRANS_COMMIT_no_enospc, bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } int bch2_alloc_read(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; int ret; down_read(&c->gc_lock); if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { - const struct bch_bucket_gens *g; - u64 b; - - for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; if (k.k->type != KEY_TYPE_bucket_gens) continue; - g = bkey_s_c_to_bucket_gens(k).v; + const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; /* * Not a fsck error because this is checked/repaired by @@ -623,19 +607,17 @@ int bch2_alloc_read(struct bch_fs *c) if (!bch2_dev_exists2(c, k.k->p.inode)) continue; - ca = bch_dev_bkey_exists(c, k.k->p.inode); + struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); - for (b = max_t(u64, ca->mi.first_bucket, start); + for (u64 b = max_t(u64, ca->mi.first_bucket, start); b < min_t(u64, ca->mi.nbuckets, end); b++) *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } else { - struct bch_alloc_v4 a; - - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: @@ -643,19 +625,18 @@ int bch2_alloc_read(struct bch_fs *c) if (!bch2_dev_bucket_exists(c, k.k->p)) continue; - ca = bch_dev_bkey_exists(c, k.k->p.inode); + struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } bch2_trans_put(trans); up_read(&c->gc_lock); - if (ret) - bch_err_fn(c, ret); - + bch_err_fn(c, ret); return ret; } @@ -768,83 +749,177 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, return ret; } -int bch2_trans_mark_alloc(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +int bch2_trigger_alloc(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { struct bch_fs *c = trans->c; - struct bch_alloc_v4 old_a_convert, *new_a; - const struct bch_alloc_v4 *old_a; - u64 old_lru, new_lru; int ret = 0; - /* - * Deletion only happens in the device removal path, with - * BTREE_TRIGGER_NORUN: - */ - BUG_ON(new->k.type != KEY_TYPE_alloc_v4); + if 
(bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, + "alloc key for invalid device or bucket")) + return -EIO; - old_a = bch2_alloc_to_v4(old, &old_a_convert); - new_a = &bkey_i_to_alloc_v4(new)->v; + struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); - new_a->data_type = alloc_data_type(*new_a, new_a->data_type); + struct bch_alloc_v4 old_a_convert; + const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - if (new_a->dirty_sectors > old_a->dirty_sectors || - new_a->cached_sectors > old_a->cached_sectors) { - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); - SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); - SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); - } + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; - if (data_type_is_empty(new_a->data_type) && - BCH_ALLOC_V4_NEED_INC_GEN(new_a) && - !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { - new_a->gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); - } + new_a->data_type = alloc_data_type(*new_a, new_a->data_type); - if (old_a->data_type != new_a->data_type || - (new_a->data_type == BCH_DATA_free && - alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { - ret = bch2_bucket_do_index(trans, old, old_a, false) ?: - bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true); - if (ret) - return ret; - } + if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) { + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); + SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); + } + + if (data_type_is_empty(new_a->data_type) && + BCH_ALLOC_V4_NEED_INC_GEN(new_a) && + !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { + new_a->gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + } - if (new_a->data_type == BCH_DATA_cached && - !new_a->io_time[READ]) - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + if (old_a->data_type != new_a->data_type || + (new_a->data_type == BCH_DATA_free && + alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { + ret = bch2_bucket_do_index(trans, old, old_a, false) ?: + bch2_bucket_do_index(trans, new.s_c, new_a, true); + if (ret) + return ret; + } - old_lru = alloc_lru_idx_read(*old_a); - new_lru = alloc_lru_idx_read(*new_a); + if (new_a->data_type == BCH_DATA_cached && + !new_a->io_time[READ]) + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - if (old_lru != new_lru) { - ret = bch2_lru_change(trans, new->k.p.inode, - bucket_to_u64(new->k.p), - old_lru, new_lru); - if (ret) - return ret; + u64 old_lru = alloc_lru_idx_read(*old_a); + u64 new_lru = alloc_lru_idx_read(*new_a); + if (old_lru != new_lru) { + ret = bch2_lru_change(trans, new.k->p.inode, + bucket_to_u64(new.k->p), + old_lru, new_lru); + if (ret) + return ret; + } + + new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, + bch_dev_bkey_exists(c, new.k->p.inode)); + if (old_a->fragmentation_lru != new_a->fragmentation_lru) { + ret = bch2_lru_change(trans, + BCH_LRU_FRAGMENTATION_START, + bucket_to_u64(new.k->p), + old_a->fragmentation_lru, new_a->fragmentation_lru); + if (ret) + return ret; + } + + if (old_a->gen != new_a->gen) { + ret = bch2_bucket_gen_update(trans, new.k->p, 
new_a->gen); + if (ret) + return ret; + } + + /* + * need to know if we're getting called from the invalidate path or + * not: + */ + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + old_a->cached_sectors) { + ret = bch2_update_cached_sectors_list(trans, new.k->p.inode, + -((s64) old_a->cached_sectors)); + if (ret) + return ret; + } } - new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, - bch_dev_bkey_exists(c, new->k.p.inode)); + if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) { + struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; + u64 journal_seq = trans->journal_res.seq; + u64 bucket_journal_seq = new_a->journal_seq; - if (old_a->fragmentation_lru != new_a->fragmentation_lru) { - ret = bch2_lru_change(trans, - BCH_LRU_FRAGMENTATION_START, - bucket_to_u64(new->k.p), - old_a->fragmentation_lru, new_a->fragmentation_lru); - if (ret) - return ret; + if ((flags & BTREE_TRIGGER_INSERT) && + data_type_is_empty(old_a->data_type) != + data_type_is_empty(new_a->data_type) && + new.k->type == KEY_TYPE_alloc_v4) { + struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; + + /* + * If the btree updates referring to a bucket weren't flushed + * before the bucket became empty again, then we don't have + * to wait on a journal flush before we can reuse the bucket: + */ + v->journal_seq = bucket_journal_seq = + data_type_is_empty(new_a->data_type) && + (journal_seq == v->journal_seq || + bch2_journal_noflush_seq(&c->journal, v->journal_seq)) + ? 0 : journal_seq; + } + + if (!data_type_is_empty(old_a->data_type) && + data_type_is_empty(new_a->data_type) && + bucket_journal_seq) { + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + bucket_journal_seq); + if (ret) { + bch2_fs_fatal_error(c, + "error setting bucket_needs_journal_commit: %i", ret); + return ret; + } + } + + percpu_down_read(&c->mark_lock); + if (new_a->gen != old_a->gen) + *bucket_gen(ca, new.k->p.offset) = new_a->gen; + + bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); + + if (new_a->data_type == BCH_DATA_free && + (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) + closure_wake_up(&c->freelist_wait); + + if (new_a->data_type == BCH_DATA_need_discard && + (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) + bch2_do_discards(c); + + if (old_a->data_type != BCH_DATA_cached && + new_a->data_type == BCH_DATA_cached && + should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) + bch2_do_invalidates(c); + + if (new_a->data_type == BCH_DATA_need_gc_gens) + bch2_do_gc_gens(c); + percpu_up_read(&c->mark_lock); } - if (old_a->gen != new_a->gen) { - ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen); - if (ret) - return ret; + if ((flags & BTREE_TRIGGER_GC) && + (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) { + struct bch_alloc_v4 new_a_convert; + const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert); + + percpu_down_read(&c->mark_lock); + struct bucket *g = gc_bucket(ca, new.k->p.offset); + + bucket_lock(g); + + g->gen_valid = 1; + g->gen = new_a->gen; + g->data_type = new_a->data_type; + g->stripe = new_a->stripe; + g->stripe_redundancy = new_a->stripe_redundancy; + g->dirty_sectors = new_a->dirty_sectors; + g->cached_sectors = new_a->cached_sectors; + + bucket_unlock(g); + percpu_up_read(&c->mark_lock); } return 0; @@ -869,8 +944,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct
bpos bch2_trans_copy_iter(&iter2, iter); - if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX)) - end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p)); + struct btree_path *path = btree_iter_path(iter->trans, iter); + if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) + end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); @@ -898,7 +974,6 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos static bool next_bucket(struct bch_fs *c, struct bpos *bucket) { struct bch_dev *ca; - unsigned iter; if (bch2_dev_bucket_exists(c, *bucket)) return true; @@ -916,8 +991,7 @@ static bool next_bucket(struct bch_fs *c, struct bpos *bucket) } rcu_read_lock(); - iter = bucket->inode; - ca = __bch2_next_dev(c, &iter, NULL); + ca = __bch2_next_dev_idx(c, bucket->inode, NULL); if (ca) *bucket = POS(ca->dev_idx, ca->mi.first_bucket); rcu_read_unlock(); @@ -1158,9 +1232,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, unsigned i, gens_offset, gens_end_offset; int ret; - if (c->sb.version < bcachefs_metadata_version_bucket_gens) - return 0; - bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); k = bch2_btree_iter_peek_slot(bucket_gens_iter); @@ -1212,7 +1283,7 @@ fsck_err: return ret; } -static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans, +static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter) { struct bch_fs *c = trans->c; @@ -1267,28 +1338,10 @@ delete: ret = bch2_btree_delete_extent_at(trans, iter, iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW); + BCH_TRANS_COMMIT_no_enospc); goto out; } -static int bch2_check_discard_freespace_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end) -{ - if (!btree_id_is_extents(iter->btree_id)) { - return __bch2_check_discard_freespace_key(trans, iter); - } else { - int ret = 0; - - while (!bkey_eq(iter->pos, end) && - !(ret = btree_trans_too_many_iters(trans) ?: - __bch2_check_discard_freespace_key(trans, iter))) - bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); - - return ret; - } -} - /* * We've already checked that generation numbers in the bucket_gens btree are * valid for buckets that exist; this just checks for keys for nonexistent @@ -1422,8 +1475,7 @@ int bch2_check_alloc_info(struct bch_fs *c) } ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; @@ -1442,23 +1494,50 @@ bkey_err: if (ret < 0) goto err; - ret = for_each_btree_key2(trans, iter, + ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: - for_each_btree_key2(trans, iter, - BTREE_ID_freespace, POS_MIN, - BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: - for_each_btree_key_commit(trans, iter, + bch2_check_discard_freespace_key(trans, &iter)); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH); + while (1) { + bch2_trans_begin(trans); + k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + + ret = bkey_err(k) ?: + bch2_check_discard_freespace_key(trans, &iter); + if (bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + if (ret) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + + bch_err(c, "while checking %s", buf.buf); + printbuf_exit(&buf); + break; + } + + bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + } + bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err; + + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_bucket_gens_key(trans, &iter, k)); err: bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1486,6 +1565,27 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (a->data_type != BCH_DATA_cached) return 0; + if (fsck_err_on(!a->io_time[READ], c, + alloc_key_cached_but_read_time_zero, + "cached bucket with read_time 0\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + struct bkey_i_alloc_v4 *a_mut = + bch2_alloc_to_v4_mut(trans, alloc_k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + goto err; + + a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + ret = bch2_trans_update(trans, alloc_iter, + &a_mut->k_i, BTREE_TRIGGER_NORUN); + if (ret) + goto err; + + a = &a_mut->v; + } + lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, lru_pos(alloc_k.k->p.inode, bucket_to_u64(alloc_k.k->p), @@ -1494,41 +1594,18 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) return ret; - if (fsck_err_on(!a->io_time[READ], c, - alloc_key_cached_but_read_time_zero, - "cached bucket with read_time 0\n" - " %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || - fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, + if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, alloc_key_to_missing_lru_entry, "missing lru entry\n" " %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - u64 read_time = a->io_time[READ] ?: - atomic64_read(&c->io_clock[READ].now); - ret = bch2_lru_set(trans, alloc_k.k->p.inode, bucket_to_u64(alloc_k.k->p), - read_time); + a->io_time[READ]); if (ret) goto err; - - if (a->io_time[READ] != read_time) { - struct bkey_i_alloc_v4 *a_mut = - bch2_alloc_to_v4_mut(trans, alloc_k); - ret = PTR_ERR_OR_ZERO(a_mut); - if (ret) - goto err; - - a_mut->v.io_time[READ] = read_time; - ret = bch2_trans_update(trans, alloc_iter, - &a_mut->k_i, BTREE_TRIGGER_NORUN); - if (ret) - goto err; - } } err: fsck_err: @@ -1539,17 +1616,12 @@ fsck_err: int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_alloc_to_lru_ref(trans, &iter))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1655,11 +1727,11 @@ write: ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; - this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); + count_event(c, bucket_discard); (*discarded)++; out: (*seen)++; @@ -1672,8 +1744,6 @@ out: static void bch2_do_discards_work(struct work_struct *work) { struct bch_fs *c = 
container_of(work, struct bch_fs, discard_work); - struct btree_iter iter; - struct bkey_s_c k; u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1684,8 +1754,8 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key2(trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, + for_each_btree_key(trans, iter, + BTREE_ID_need_discard, POS_MIN, 0, k, bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &seen, &open, @@ -1760,7 +1830,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, BTREE_TRIGGER_BUCKET_INVALIDATE) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; @@ -1795,22 +1865,18 @@ err: static void bch2_do_invalidates_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); - struct bch_dev *ca; struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - unsigned i; int ret = 0; - ret = bch2_btree_write_buffer_flush(trans); + ret = bch2_btree_write_buffer_tryflush(trans); if (ret) goto err; - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, + ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, lru_pos(ca->dev_idx, 0, 0), lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), BTREE_ITER_INTENT, k, @@ -1884,8 +1950,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, ret = bch2_bucket_do_index(trans, k, a, true) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; @@ -1905,8 +1970,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; @@ -1937,8 +2001,6 @@ bkey_err: int bch2_fs_freespace_init(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; int ret = 0; bool doing_init = false; @@ -1947,7 +2009,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) * every mount: */ - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { if (ca->mi.freespace_initialized) continue; @@ -2007,15 +2069,13 @@ out: void bch2_recalc_capacity(struct bch_fs *c) { - struct bch_dev *ca; u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; - unsigned i; lockdep_assert_held(&c->state_lock); - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; ra_pages += bdi->ra_pages; @@ -2023,7 +2083,7 @@ void bch2_recalc_capacity(struct bch_fs *c) bch2_set_ra_pages(c, ra_pages); - for_each_rw_member(ca, c, i) { + for_each_rw_member(c, ca) { u64 dev_reserve = 0; /* @@ -2079,11 +2139,9 @@ void bch2_recalc_capacity(struct bch_fs *c) u64 bch2_min_rw_member_capacity(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; u64 ret = U64_MAX; - for_each_rw_member(ca, c, i) + for_each_rw_member(c, ca) ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); return ret; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 73faf99a222a..e7f7e842ee1b 100644 --- 
a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -71,6 +71,24 @@ static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type; } +static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a) +{ + return a.dirty_sectors + a.cached_sectors; +} + +static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) +{ + return a.dirty_sectors; +} + +static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca, + struct bch_alloc_v4 a) +{ + int d = bch2_bucket_sectors_dirty(a); + + return d ? max(0, ca->mi.bucket_size - d) : 0; +} + static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; @@ -90,10 +108,11 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, struct bch_dev *ca) { if (!data_type_movable(a.data_type) || - a.dirty_sectors >= ca->mi.bucket_size) + !bch2_bucket_sectors_fragmented(ca, a)) return 0; - return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size); + u64 d = bch2_bucket_sectors_dirty(a); + return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) @@ -163,24 +182,21 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v1_invalid, \ .val_to_text = bch2_alloc_to_text, \ - .trans_trigger = bch2_trans_mark_alloc, \ - .atomic_trigger = bch2_mark_alloc, \ + .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ - .trans_trigger = bch2_trans_mark_alloc, \ - .atomic_trigger = bch2_mark_alloc, \ + .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v3_invalid, \ .val_to_text = bch2_alloc_to_text, \ - .trans_trigger = bch2_trans_mark_alloc, \ - .atomic_trigger = bch2_mark_alloc, \ + .trigger = bch2_trigger_alloc, \ .min_val_size = 16, \ }) @@ -188,8 +204,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .key_invalid = bch2_alloc_v4_invalid, \ .val_to_text = bch2_alloc_to_text, \ .swab = bch2_alloc_v4_swab, \ - .trans_trigger = bch2_trans_mark_alloc, \ - .atomic_trigger = bch2_mark_alloc, \ + .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ }) @@ -213,8 +228,8 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *); -int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_do_discards(struct bch_fs *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 0e6157982607..b0ff47998a94 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -69,11 +69,8 @@ const char * const bch2_watermarks[] = { void bch2_reset_alloc_cursors(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) + for_each_member_device_rcu(c, ca, NULL) ca->alloc_cursor = 0; rcu_read_unlock(); } @@ -239,9 +236,8 @@ static struct open_bucket 
*__try_alloc_bucket(struct bch_fs *c, struct bch_dev * if (cl) closure_wait(&c->open_buckets_wait, cl); - if (!c->blocked_allocate_open_bucket) - c->blocked_allocate_open_bucket = local_clock(); - + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], + &c->blocked_allocate_open_bucket, true); spin_unlock(&c->freelist_lock); return ERR_PTR(-BCH_ERR_open_buckets_empty); } @@ -267,19 +263,11 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ca->nr_open_buckets++; bch2_open_bucket_hash_add(c, ob); - if (c->blocked_allocate_open_bucket) { - bch2_time_stats_update( - &c->times[BCH_TIME_blocked_allocate_open_bucket], - c->blocked_allocate_open_bucket); - c->blocked_allocate_open_bucket = 0; - } + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], + &c->blocked_allocate_open_bucket, false); - if (c->blocked_allocate) { - bch2_time_stats_update( - &c->times[BCH_TIME_blocked_allocate], - c->blocked_allocate); - c->blocked_allocate = 0; - } + track_event_change(&c->times[BCH_TIME_blocked_allocate], + &c->blocked_allocate, false); spin_unlock(&c->freelist_lock); return ob; @@ -377,9 +365,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); if (!ob) - iter.path->preserve = false; + set_btree_iter_dontneed(&iter); err: - if (iter.trans && iter.path) + if (iter.path) set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -447,7 +435,7 @@ again: ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); next: - citer.path->preserve = false; + set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); if (ob) break; @@ -502,7 +490,7 @@ again: ob = try_alloc_bucket(trans, ca, watermark, alloc_cursor, s, k, cl); if (ob) { - iter.path->preserve = false; + set_btree_iter_dontneed(&iter); break; } } @@ -567,8 +555,8 @@ again: goto again; } - if (!c->blocked_allocate) - c->blocked_allocate = local_clock(); + track_event_change(&c->times[BCH_TIME_blocked_allocate], + &c->blocked_allocate, true); ob = ERR_PTR(-BCH_ERR_freelist_empty); goto err; @@ -697,11 +685,9 @@ static int add_new_bucket(struct bch_fs *c, bch_dev_bkey_exists(c, ob->dev)->mi.durability; BUG_ON(*nr_effective >= nr_replicas); - BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) - ? 
durability : 1; + *nr_effective += durability; *have_cache |= !durability; ob_push(c, ptrs, ob); @@ -972,8 +958,8 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, devs = target_rw_devs(c, wp->data_type, target); /* Don't allocate from devices we already have pointers to: */ - for (i = 0; i < devs_have->nr; i++) - __clear_bit(devs_have->devs[i], devs.d); + darray_for_each(*devs_have, i) + __clear_bit(*i, devs.d); open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->dev, devs.d); diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 23c0834a97a4..e358a2ffffde 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -3,6 +3,7 @@ #include "bbpos.h" #include "alloc_background.h" #include "backpointers.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_update.h" #include "btree_update_interior.h" @@ -136,15 +137,30 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bkey_i_backpointer *bp_k, + struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) { struct btree_iter bp_iter; struct bkey_s_c k; + struct bkey_i_backpointer *bp_k; int ret; + bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); + ret = PTR_ERR_OR_ZERO(bp_k); + if (ret) + return ret; + + bkey_backpointer_init(&bp_k->k_i); + bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k->v = bp; + + if (!insert) { + bp_k->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k->k, 0); + } + k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, BTREE_ITER_INTENT| @@ -375,39 +391,32 @@ fsck_err: /* verify that every backpointer has a corresponding alloc key */ int bch2_check_btree_backpointers(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_btree_backpointer(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } -struct bpos_level { - unsigned level; - struct bpos pos; -}; - static int check_bp_exists(struct btree_trans *trans, struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, struct bpos bucket_start, struct bpos bucket_end, - struct bpos_level *last_flushed) + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct btree_iter bp_iter = { NULL }; struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; + struct bkey_buf tmp; int ret; + bch2_bkey_buf_init(&tmp); + if (bpos_lt(bucket, bucket_start) || bpos_gt(bucket, bucket_end)) return 0; @@ -424,13 +433,22 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - if (last_flushed->level != bp.level || - !bpos_eq(last_flushed->pos, orig_k.k->p)) { - last_flushed->level = bp.level; - last_flushed->pos = orig_k.k->p; + if (!bpos_eq(orig_k.k->p, last_flushed->k->k.p) || + bkey_bytes(orig_k.k) != bkey_bytes(&last_flushed->k->k) || + memcmp(orig_k.v, &last_flushed->k->v, bkey_val_bytes(orig_k.k))) { + bch2_bkey_buf_reassemble(&tmp, c, orig_k); + + if (bp.level) { + bch2_trans_unlock(trans); + bch2_btree_interior_updates_flush(c); + } - ret = bch2_btree_write_buffer_flush_sync(trans) ?: - 
-BCH_ERR_transaction_restart_write_buffer_flush; + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + + bch2_bkey_buf_copy(last_flushed, c, tmp.k); + ret = -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } goto missing; @@ -439,6 +457,7 @@ out: err: fsck_err: bch2_trans_iter_exit(trans, &bp_iter); + bch2_bkey_buf_exit(&tmp, c); printbuf_exit(&buf); return ret; missing: @@ -448,8 +467,7 @@ missing: prt_printf(&buf, "\nbp pos "); bch2_bpos_to_text(&buf, bp_iter.pos); - if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers || - c->opts.reconstruct_alloc || + if (c->opts.reconstruct_alloc || fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); @@ -457,25 +475,18 @@ missing: } static int check_extent_to_backpointers(struct btree_trans *trans, - struct btree_iter *iter, + enum btree_id btree, unsigned level, struct bpos bucket_start, struct bpos bucket_end, - struct bpos_level *last_flushed) + struct bkey_buf *last_flushed, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs; const union bch_extent_entry *entry; struct extent_ptr_decoded p; - struct bkey_s_c k; int ret; - k = bch2_btree_iter_peek_all_levels(iter); - ret = bkey_err(k); - if (ret) - return ret; - if (!k.k) - return 0; - ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bpos bucket_pos; @@ -484,7 +495,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, + bch2_extent_ptr_to_bp(c, btree, level, k, p, &bucket_pos, &bp); ret = check_bp_exists(trans, bucket_pos, bp, k, @@ -501,44 +512,33 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, enum btree_id btree_id, struct bpos bucket_start, struct bpos bucket_end, - struct bpos_level *last_flushed) + struct bkey_buf *last_flushed, + int *level) { struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, btree_id); struct btree_iter iter; struct btree *b; struct bkey_s_c k; - struct bkey_ptrs_c ptrs; - struct extent_ptr_decoded p; - const union bch_extent_entry *entry; int ret; - - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0); +retry: + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, + 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); b = bch2_btree_iter_peek_node(&iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; - BUG_ON(b != btree_node_root(c, b)); - - k = bkey_i_to_s_c(&b->key); - ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos; - struct bch_backpointer bp; - - if (p.ptr.cached) - continue; + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(trans, &iter); + goto retry; + } - bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1, - k, p, &bucket_pos, &bp); + *level = b->c.level; - ret = check_bp_exists(trans, bucket_pos, bp, k, + k = bkey_i_to_s_c(&b->key); + ret = check_extent_to_backpointers(trans, btree_id, b->c.level + 1, bucket_start, bucket_end, - last_flushed); - if (ret) - goto err; - } + last_flushed, k); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -616,43 +616,60 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; enum btree_id btree_id; - struct bpos_level last_flushed = { UINT_MAX, POS_MIN }; + struct bkey_s_c k; + struct bkey_buf last_flushed; int 
ret = 0; - for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { - unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1; - - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, - depth, - BTREE_ITER_ALL_LEVELS| - BTREE_ITER_PREFETCH); - - do { - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_extent_to_backpointers(trans, &iter, - bucket_start, bucket_end, - &last_flushed)); - if (ret) - break; - } while (!bch2_btree_iter_advance(&iter)); - - bch2_trans_iter_exit(trans, &iter); + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); - if (ret) - break; + for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { + int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1; ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_enospc, check_btree_root_to_backpointers(trans, btree_id, bucket_start, bucket_end, - &last_flushed)); + &last_flushed, &level)); if (ret) - break; + return ret; + + while (level >= depth) { + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, + level, + BTREE_ITER_PREFETCH); + while (1) { + bch2_trans_begin(trans); + k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + ret = bkey_err(k) ?: + check_extent_to_backpointers(trans, btree_id, level, + bucket_start, bucket_end, + &last_flushed, k) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + if (ret) + break; + if (bpos_eq(iter.pos, SPOS_MAX)) + break; + bch2_btree_iter_advance(&iter); + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + --level; + } } - return ret; + + bch2_bkey_buf_exit(&last_flushed, c); + return 0; } static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, @@ -746,8 +763,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) } bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -801,13 +817,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) { - struct btree_iter iter; - struct bkey_s_c k; struct bpos last_flushed_pos = SPOS_MAX; return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_one_backpointer(trans, start, end, bkey_s_c_to_backpointer(k), &last_flushed_pos)); @@ -854,7 +868,6 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index ab866feeaf66..737e2396ade7 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -63,7 +63,7 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, return ret; } -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *, +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, @@ -72,28 +72,21 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, struct bkey_s_c orig_k, bool insert) { - struct bch_fs *c = trans->c; - struct bkey_i_backpointer *bp_k; - int ret; + if 
(unlikely(bch2_backpointers_no_use_write_buffer)) + return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert); - bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); - ret = PTR_ERR_OR_ZERO(bp_k); - if (ret) - return ret; + struct bkey_i_backpointer bp_k; - bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); - bp_k->v = bp; + bkey_backpointer_init(&bp_k.k_i); + bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k.v = bp; if (!insert) { - bp_k->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k->k, 0); + bp_k.k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k.k, 0); } - if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert); - - return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i); + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); } static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b62737fdf5ab..dac383e37181 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -193,6 +193,7 @@ #include <linux/mutex.h> #include <linux/percpu-refcount.h> #include <linux/percpu-rwsem.h> +#include <linux/refcount.h> #include <linux/rhashtable.h> #include <linux/rwsem.h> #include <linux/semaphore.h> @@ -223,9 +224,11 @@ #define race_fault(...) dynamic_fault("bcachefs:race") +#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]) + #define trace_and_count(_c, _name, ...) \ do { \ - this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \ + count_event(_c, _name); \ trace_##_name(__VA_ARGS__); \ } while (0) @@ -262,46 +265,76 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") +__printf(2, 3) +void __bch2_print(struct bch_fs *c, const char *fmt, ...); + +#define maybe_dev_to_fs(_c) _Generic((_c), \ + struct bch_dev *: ((struct bch_dev *) (_c))->fs, \ + struct bch_fs *: (_c)) + +#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__) + +#define bch2_print_ratelimited(_c, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + \ + if (__ratelimit(&_rs)) \ + bch2_print(_c, __VA_ARGS__); \ +} while (0) + #define bch_info(c, fmt, ...) \ - printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_notice(c, fmt, ...) \ - printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn(c, fmt, ...) \ - printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn_ratelimited(c, fmt, ...) \ - printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err(c, fmt, ...) \ - printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err_dev(ca, fmt, ...) \ - printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) #define bch_err_dev_offset(ca, _offset, fmt, ...) 
\ - printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) #define bch_err_inum(c, _inum, fmt, ...) \ - printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) #define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ - printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) #define bch_err_ratelimited(c, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err_dev_ratelimited(ca, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) #define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) #define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) #define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + +static inline bool should_print_err(int err) +{ + return err && !bch2_err_matches(err, BCH_ERR_transaction_restart); +} #define bch_err_fn(_c, _ret) \ do { \ - if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + if (should_print_err(_ret)) \ bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ } while (0) +#define bch_err_fn_ratelimited(_c, _ret) \ +do { \ + if (should_print_err(_ret)) \ + bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ +} while (0) + #define bch_err_msg(_c, _ret, _msg, ...) 
\ do { \ - if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + if (should_print_err(_ret)) \ bch_err(_c, "%s(): error " _msg " %s", __func__, \ ##__VA_ARGS__, bch2_err_str(_ret)); \ } while (0) @@ -392,6 +425,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(btree_node_merge) \ x(btree_node_sort) \ x(btree_node_read) \ + x(btree_node_read_done) \ x(btree_interior_update_foreground) \ x(btree_interior_update_total) \ x(btree_gc) \ @@ -401,9 +435,12 @@ BCH_DEBUG_PARAMS_DEBUG() x(journal_flush_write) \ x(journal_noflush_write) \ x(journal_flush_seq) \ - x(blocked_journal) \ + x(blocked_journal_low_on_space) \ + x(blocked_journal_low_on_pin) \ + x(blocked_journal_max_in_flight) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ + x(blocked_write_buffer_full) \ x(nocow_lock_contended) enum bch_time_stats { @@ -428,6 +465,7 @@ enum bch_time_stats { #include "replicas_types.h" #include "subvolume_types.h" #include "super_types.h" +#include "thread_with_file_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ #define GC_MERGE_NODES 4U @@ -564,32 +602,35 @@ struct bch_dev { struct io_count __percpu *io_done; }; -enum { - /* startup: */ - BCH_FS_STARTED, - BCH_FS_MAY_GO_RW, - BCH_FS_RW, - BCH_FS_WAS_RW, - - /* shutdown: */ - BCH_FS_STOPPING, - BCH_FS_EMERGENCY_RO, - BCH_FS_GOING_RO, - BCH_FS_WRITE_DISABLE_COMPLETE, - BCH_FS_CLEAN_SHUTDOWN, - - /* fsck passes: */ - BCH_FS_FSCK_DONE, - BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ - BCH_FS_NEED_ANOTHER_GC, - - BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, - - /* errors: */ - BCH_FS_ERROR, - BCH_FS_TOPOLOGY_ERROR, - BCH_FS_ERRORS_FIXED, - BCH_FS_ERRORS_NOT_FIXED, +/* + * initial_gc_unfixed + * error + * topology error + */ + +#define BCH_FS_FLAGS() \ + x(started) \ + x(may_go_rw) \ + x(rw) \ + x(was_rw) \ + x(stopping) \ + x(emergency_ro) \ + x(going_ro) \ + x(write_disable_complete) \ + x(clean_shutdown) \ + x(fsck_running) \ + x(initial_gc_unfixed) \ + x(need_another_gc) \ + x(need_delete_dead_snapshots) \ + x(error) \ + x(topology_error) \ + x(errors_fixed) \ + x(errors_not_fixed) + +enum bch_fs_flags { +#define x(n) BCH_FS_##n, + BCH_FS_FLAGS() +#undef x }; struct btree_debug { @@ -599,10 +640,11 @@ struct btree_debug { #define BCH_TRANSACTIONS_NR 128 struct btree_transaction_stats { + struct bch2_time_stats duration; struct bch2_time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; - unsigned wb_updates_size; + unsigned journal_entries_size; unsigned max_mem; char *max_paths_text; }; @@ -664,7 +706,8 @@ struct btree_trans_buf { x(invalidate) \ x(delete_dead_snapshots) \ x(snapshot_delete_pagecache) \ - x(sysfs) + x(sysfs) \ + x(btree_write_buffer) enum bch_write_ref { #define x(n) BCH_WRITE_REF_##n, @@ -689,6 +732,8 @@ struct bch_fs { struct super_block *vfs_sb; dev_t dev; char name[40]; + struct stdio_redirect *stdio; + struct task_struct *stdio_filter; /* ro/rw, add/remove/resize devices: */ struct rw_semaphore state_lock; @@ -699,6 +744,13 @@ struct bch_fs { #else struct percpu_ref writes; #endif + /* + * Analagous to c->writes, for asynchronous ops that don't necessarily + * need fs to be read-write + */ + refcount_t ro_ref; + wait_queue_head_t ro_ref_wait; + struct work_struct read_only_work; struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; @@ -1002,10 +1054,21 @@ struct bch_fs { /* RECOVERY */ u64 journal_replay_seq_start; u64 journal_replay_seq_end; + /* + * Two different uses: + * "Has this fsck pass?" - i.e. 
should this type of error be an + * emergency read-only + * And, in certain situations fsck will rewind to an earlier pass: used + * for signaling to the toplevel code which pass we want to run now. + */ enum bch_recovery_pass curr_recovery_pass; /* bitmap of explicitly enabled recovery passes: */ u64 recovery_passes_explicit; + /* bitmask of recovery passes that we actually ran */ u64 recovery_passes_complete; + /* never rewinds version of curr_recovery_pass */ + enum bch_recovery_pass recovery_pass_done; + struct semaphore online_fsck_mutex; /* DEBUG JUNK */ struct dentry *fs_debug_dir; @@ -1065,10 +1128,20 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) #endif } +static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + return !test_bit(BCH_FS_going_ro, &c->flags) && + atomic_long_inc_not_zero(&c->writes[ref]); +#else + return percpu_ref_tryget(&c->writes); +#endif +} + static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) { #ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_GOING_RO, &c->flags) && + return !test_bit(BCH_FS_going_ro, &c->flags) && atomic_long_inc_not_zero(&c->writes[ref]); #else return percpu_ref_tryget_live(&c->writes); @@ -1087,13 +1160,27 @@ static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) if (atomic_long_read(&c->writes[i])) return; - set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); #else percpu_ref_put(&c->writes); #endif } +static inline bool bch2_ro_ref_tryget(struct bch_fs *c) +{ + if (test_bit(BCH_FS_stopping, &c->flags)) + return false; + + return refcount_inc_not_zero(&c->ro_ref); +} + +static inline void bch2_ro_ref_put(struct bch_fs *c) +{ + if (refcount_dec_and_test(&c->ro_ref)) + wake_up(&c->ro_ref_wait); +} + static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) { #ifndef NO_BCACHEFS_FS @@ -1158,6 +1245,15 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) return dev < c->sb.nr_devices && c->devs[dev]; } +static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) +{ + struct stdio_redirect *stdio = c->stdio; + + if (c->stdio_filter && c->stdio_filter != current) + stdio = NULL; + return stdio; +} + #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index fe78e87603fc..0d5ac4184fbc 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -307,6 +307,13 @@ struct bkey_i { struct bch_val v; }; +#define POS_KEY(_pos) \ +((struct bkey) { \ + .u64s = BKEY_U64s, \ + .format = KEY_FORMAT_CURRENT, \ + .p = _pos, \ +}) + #define KEY(_inode, _offset, _size) \ ((struct bkey) { \ .u64s = BKEY_U64s, \ @@ -1296,6 +1303,7 @@ struct bch_member { __le64 errors[BCH_MEMBER_ERROR_NR]; __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; __le64 errors_reset_time; + __le64 seq; }; #define BCH_MEMBER_V1_BYTES 56 @@ -1442,7 +1450,7 @@ struct bch_sb_field_replicas_v0 { struct bch_replicas_entry_v0 entries[]; } __packed __aligned(8); -struct bch_replicas_entry { +struct bch_replicas_entry_v1 { __u8 data_type; __u8 nr_devs; __u8 nr_required; @@ -1454,7 +1462,7 @@ struct bch_replicas_entry { struct bch_sb_field_replicas { struct bch_sb_field field; - struct bch_replicas_entry entries[]; + struct bch_replicas_entry_v1 entries[]; } __packed 
__aligned(8); /* BCH_SB_FIELD_quota: */ @@ -1571,7 +1579,9 @@ struct bch_sb_field_disk_groups { x(write_super, 73) \ x(trans_restart_would_deadlock_recursion_limit, 74) \ x(trans_restart_write_buffer_flush, 75) \ - x(trans_restart_split_race, 76) + x(trans_restart_split_race, 76) \ + x(write_buffer_flush_slowpath, 77) \ + x(write_buffer_flush_sync, 78) enum bch_persistent_counters { #define x(t, n, ...) BCH_COUNTER_##t, @@ -1662,69 +1672,41 @@ struct bch_sb_field_downgrade { #define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10))) #define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0) -#define RECOVERY_PASS_ALL_FSCK (1ULL << 63) - /* * field 1: version name * field 2: BCH_VERSION(major, minor) * field 3: recovery passess required on upgrade */ #define BCH_METADATA_VERSIONS() \ - x(bkey_renumber, BCH_VERSION(0, 10), \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_btree_change, BCH_VERSION(0, 11), \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot, BCH_VERSION(0, 12), \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_backpointers, BCH_VERSION(0, 13), \ - RECOVERY_PASS_ALL_FSCK) \ - x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot_2, BCH_VERSION(0, 15), \ - BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \ - BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \ - RECOVERY_PASS_ALL_FSCK) \ - x(reflink_p_fix, BCH_VERSION(0, 16), \ - BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \ - x(subvol_dirent, BCH_VERSION(0, 17), \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_v2, BCH_VERSION(0, 18), \ - RECOVERY_PASS_ALL_FSCK) \ - x(freespace, BCH_VERSION(0, 19), \ - RECOVERY_PASS_ALL_FSCK) \ - x(alloc_v4, BCH_VERSION(0, 20), \ - RECOVERY_PASS_ALL_FSCK) \ - x(new_data_types, BCH_VERSION(0, 21), \ - RECOVERY_PASS_ALL_FSCK) \ - x(backpointers, BCH_VERSION(0, 22), \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_v3, BCH_VERSION(0, 23), \ - RECOVERY_PASS_ALL_FSCK) \ - x(unwritten_extents, BCH_VERSION(0, 24), \ - RECOVERY_PASS_ALL_FSCK) \ - x(bucket_gens, BCH_VERSION(0, 25), \ - BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ - RECOVERY_PASS_ALL_FSCK) \ - x(lru_v2, BCH_VERSION(0, 26), \ - RECOVERY_PASS_ALL_FSCK) \ - x(fragmentation_lru, BCH_VERSION(0, 27), \ - RECOVERY_PASS_ALL_FSCK) \ - x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot_trees, BCH_VERSION(0, 29), \ - RECOVERY_PASS_ALL_FSCK) \ - x(major_minor, BCH_VERSION(1, 0), \ - 0) \ - x(snapshot_skiplists, BCH_VERSION(1, 1), \ - BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ - x(deleted_inodes, BCH_VERSION(1, 2), \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \ - x(rebalance_work, BCH_VERSION(1, 3), \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) + x(bkey_renumber, BCH_VERSION(0, 10)) \ + x(inode_btree_change, BCH_VERSION(0, 11)) \ + x(snapshot, BCH_VERSION(0, 12)) \ + x(inode_backpointers, BCH_VERSION(0, 13)) \ + x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \ + x(snapshot_2, BCH_VERSION(0, 15)) \ + x(reflink_p_fix, BCH_VERSION(0, 16)) \ + x(subvol_dirent, BCH_VERSION(0, 17)) \ + x(inode_v2, BCH_VERSION(0, 18)) \ + x(freespace, BCH_VERSION(0, 19)) \ + x(alloc_v4, BCH_VERSION(0, 20)) \ + x(new_data_types, BCH_VERSION(0, 21)) \ + x(backpointers, BCH_VERSION(0, 22)) \ + x(inode_v3, BCH_VERSION(0, 23)) \ + x(unwritten_extents, BCH_VERSION(0, 24)) \ + x(bucket_gens, BCH_VERSION(0, 25)) \ + x(lru_v2, BCH_VERSION(0, 26)) \ + x(fragmentation_lru, BCH_VERSION(0, 27)) \ + x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \ + x(snapshot_trees, BCH_VERSION(0, 29)) \ + x(major_minor, BCH_VERSION(1, 0)) \ + 
x(snapshot_skiplists, BCH_VERSION(1, 1)) \ + x(deleted_inodes, BCH_VERSION(1, 2)) \ + x(rebalance_work, BCH_VERSION(1, 3)) \ + x(member_seq, BCH_VERSION(1, 4)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, -#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n, +#define x(t, n) bcachefs_metadata_version_##t = n, BCH_METADATA_VERSIONS() #undef x bcachefs_metadata_version_max @@ -1786,7 +1768,8 @@ struct bch_sb { __le32 time_base_hi; __le32 time_precision; - __le64 flags[8]; + __le64 flags[7]; + __le64 write_time; __le64 features[2]; __le64 compat[2]; @@ -2153,7 +2136,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(clock, 7) \ x(dev_usage, 8) \ x(log, 9) \ - x(overwrite, 10) + x(overwrite, 10) \ + x(write_buffer_keys, 11) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -2162,6 +2146,19 @@ enum { BCH_JSET_ENTRY_NR }; +static inline bool jset_entry_is_key(struct jset_entry *e) +{ + switch (e->type) { + case BCH_JSET_ENTRY_btree_keys: + case BCH_JSET_ENTRY_btree_root: + case BCH_JSET_ENTRY_overwrite: + case BCH_JSET_ENTRY_write_buffer_keys: + return true; + } + + return false; +} + /* * Journal sequence numbers can be blacklisted: bsets record the max sequence * number of all the journal entries they contain updates for, so that on @@ -2203,7 +2200,7 @@ struct jset_entry_usage { struct jset_entry_data_usage { struct jset_entry entry; __le64 v; - struct bch_replicas_entry r; + struct bch_replicas_entry_v1 r; } __packed; struct jset_entry_clock { @@ -2224,8 +2221,8 @@ struct jset_entry_dev_usage { __le32 dev; __u32 pad; - __le64 buckets_ec; - __le64 _buckets_unavailable; /* No longer used */ + __le64 _buckets_ec; /* No longer used */ + __le64 _buckets_unavailable; /* No longer used */ struct jset_entry_dev_usage_type d[]; }; @@ -2239,7 +2236,7 @@ static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage struct jset_entry_log { struct jset_entry entry; u8 d[]; -} __packed; +} __packed __aligned(8); /* * On disk format for a journal entry: diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index f05881f7e113..4b8fba754b1c 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -81,6 +81,11 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) #define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) +#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2) + +#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) +#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) + /* ioctl below act on a particular file, not the filesystem as a whole: */ #define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) @@ -173,12 +178,18 @@ struct bch_ioctl_disk_set_state { __u64 dev; }; +#define BCH_DATA_OPS() \ + x(scrub, 0) \ + x(rereplicate, 1) \ + x(migrate, 2) \ + x(rewrite_old_nodes, 3) \ + x(drop_extra_replicas, 4) + enum bch_data_ops { - BCH_DATA_OP_SCRUB = 0, - BCH_DATA_OP_REREPLICATE = 1, - BCH_DATA_OP_MIGRATE = 2, - BCH_DATA_OP_REWRITE_OLD_NODES = 3, - BCH_DATA_OP_NR = 4, +#define x(t, n) BCH_DATA_OP_##t = n, + BCH_DATA_OPS() +#undef x + BCH_DATA_OP_NR }; /* @@ -237,7 +248,7 @@ struct bch_ioctl_data_event { struct bch_replicas_usage { __u64 sectors; - struct bch_replicas_entry r; + struct bch_replicas_entry_v1 r; } __packed; static inline struct bch_replicas_usage * @@ -268,7 +279,7 @@ struct bch_ioctl_fs_usage { __u32 
replica_entries_bytes; __u32 pad; - struct bch_replicas_usage replicas[0]; + struct bch_replicas_usage replicas[]; }; /* @@ -292,7 +303,20 @@ struct bch_ioctl_dev_usage { __u64 buckets; __u64 sectors; __u64 fragmented; - } d[BCH_DATA_NR]; + } d[10]; +}; + +struct bch_ioctl_dev_usage_v2 { + __u64 dev; + __u32 flags; + __u8 state; + __u8 nr_data_types; + __u8 pad[6]; + + __u32 bucket_size; + __u64 nr_buckets; + + struct bch_ioctl_dev_usage_type d[]; }; /* @@ -365,4 +389,24 @@ struct bch_ioctl_subvolume { #define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) #define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) +/* + * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command, + * but with the kernel's implementation of fsck: + */ +struct bch_ioctl_fsck_offline { + __u64 flags; + __u64 opts; /* string */ + __u64 nr_devs; + __u64 devs[] __counted_by(nr_devs); +}; + +/* + * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command, + * but with the kernel's implementation of fsck: + */ +struct bch_ioctl_fsck_online { + __u64 flags; + __u64 opts; /* string */ +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 3a370b7087ac..ee82283722b7 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -28,10 +28,8 @@ struct bkey_ops { void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); - int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, unsigned); - int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); + int (*trigger)(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); @@ -78,84 +76,86 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -static inline int bch2_mark_key(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); - - return ops->atomic_trigger - ? 
ops->atomic_trigger(trans, btree, level, old, new, flags) - : 0; -} - enum btree_update_flags { __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, __BTREE_UPDATE_NOJOURNAL, - __BTREE_UPDATE_PREJOURNAL, __BTREE_UPDATE_KEY_CACHE_RECLAIM, - __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ - + __BTREE_TRIGGER_NORUN, + __BTREE_TRIGGER_TRANSACTIONAL, __BTREE_TRIGGER_INSERT, __BTREE_TRIGGER_OVERWRITE, - __BTREE_TRIGGER_GC, __BTREE_TRIGGER_BUCKET_INVALIDATE, - __BTREE_TRIGGER_NOATOMIC, }; #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) #define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) -#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL) #define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) +/* Don't run triggers at all */ #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) +/* + * If set, we're running transactional triggers as part of a transaction commit: + * triggers may generate new updates + * + * If cleared, and either BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE are set, + * we're running atomic triggers during a transaction commit: we have our + * journal reservation, we're holding btree node write locks, and we know the + * transaction is going to commit (returning an error here is a fatal error, + * causing us to go emergency read-only) + */ +#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL) + +/* @new is entering the btree */ #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) + +/* @old is leaving the btree */ #define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) +/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */ #define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) + +/* signal from bucket invalidate path to alloc trigger */ #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) -static inline int bch2_trans_mark_key(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +static inline int bch2_key_trigger(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { - const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type); + const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); - return ops->trans_trigger - ? ops->trans_trigger(trans, btree_id, level, old, new, flags) + return ops->trigger + ? 
ops->trigger(trans, btree, level, old, new, flags) : 0; } -static inline int bch2_trans_mark_old(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, unsigned flags) +static inline int bch2_key_trigger_old(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, unsigned flags) { struct bkey_i deleted; bkey_init(&deleted.k); deleted.k.p = old.k->p; - return bch2_trans_mark_key(trans, btree_id, level, old, &deleted, - BTREE_TRIGGER_OVERWRITE|flags); + return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted), + BTREE_TRIGGER_OVERWRITE|flags); } -static inline int bch2_trans_mark_new(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_i *new, unsigned flags) +static inline int bch2_key_trigger_new(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s new, unsigned flags) { struct bkey_i deleted; bkey_init(&deleted.k); - deleted.k.p = new->k.p; + deleted.k.p = new.k->p; - return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, - BTREE_TRIGGER_INSERT|flags); + return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, + BTREE_TRIGGER_INSERT|flags); } void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index bb73ba9017b0..74bf8eb90a4c 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -68,6 +68,12 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, _k = _n) { _n = bkey_p_next(_k); + if (!_k->u64s) { + printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set, + _k->_data - i->_data); + break; + } + k = bkey_disassemble(b, _k, &uk); printbuf_reset(&buf); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 79495cd7a794..8e2488a4b58d 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -500,19 +500,21 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc) * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. 
*/ -void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) +void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; if (bc->alloc_lock == current) { - trace_and_count(c, btree_cache_cannibalize_unlock, c); + trace_and_count(c, btree_cache_cannibalize_unlock, trans); bc->alloc_lock = NULL; closure_wake_up(&bc->alloc_wait); } } -int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) +int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct task_struct *old; @@ -521,7 +523,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; if (!cl) { - trace_and_count(c, btree_cache_cannibalize_lock_fail, c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; } @@ -535,11 +537,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; } - trace_and_count(c, btree_cache_cannibalize_lock_fail, c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); return -BCH_ERR_btree_cache_cannibalize_lock_blocked; success: - trace_and_count(c, btree_cache_cannibalize_lock, c); + trace_and_count(c, btree_cache_cannibalize_lock, trans); return 0; } @@ -673,7 +675,7 @@ err: mutex_unlock(&bc->lock); - trace_and_count(c, btree_cache_cannibalize, c); + trace_and_count(c, btree_cache_cannibalize, trans); goto out; } @@ -717,12 +719,6 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, if (IS_ERR(b)) return b; - /* - * Btree nodes read in from disk should not have the accessed bit set - * initially, so that linear scans don't thrash the cache: - */ - clear_btree_node_accessed(b); - bkey_copy(&b->key, k); if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { /* raced with another fill: */ @@ -749,7 +745,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, if (path && sync) bch2_trans_unlock_noassert(trans); - bch2_btree_node_read(c, b, sync); + bch2_btree_node_read(trans, b, sync); if (!sync) return NULL; @@ -1039,7 +1035,7 @@ retry: goto retry; if (IS_ERR(b) && - !bch2_btree_cache_cannibalize_lock(c, NULL)) + !bch2_btree_cache_cannibalize_lock(trans, NULL)) goto retry; if (IS_ERR(b)) @@ -1087,7 +1083,7 @@ lock_node: EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); btree_check_header(c, b); out: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return b; } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index cfb80b201d61..4e1af5882052 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -17,8 +17,8 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); -void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); -int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); +void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); +int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 30ab78a24517..49b4ade758c3 100644 --- a/fs/bcachefs/btree_gc.c +++ 
b/fs/bcachefs/btree_gc.c @@ -41,6 +41,14 @@ #define DROP_THIS_NODE 10 #define DROP_PREV_NODE 11 +static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) +{ + return (struct bkey_s) {{{ + (struct bkey *) k.k, + (struct bch_val *) k.v + }}}; +} + static bool should_restart_for_topology_repair(struct bch_fs *c) { return c->opts.fix_errors != FSCK_FIX_no && @@ -108,7 +116,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; } else { - set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + set_bit(BCH_FS_initial_gc_unfixed, &c->flags); } } } @@ -134,7 +142,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; } else { - set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + set_bit(BCH_FS_initial_gc_unfixed, &c->flags); } } @@ -414,10 +422,9 @@ again: continue; } - if (ret) { - bch_err_msg(c, ret, "getting btree node"); + bch_err_msg(c, ret, "getting btree node"); + if (ret) break; - } ret = btree_repair_node_boundaries(c, b, prev, cur); @@ -482,10 +489,9 @@ again: false); ret = PTR_ERR_OR_ZERO(cur); - if (ret) { - bch_err_msg(c, ret, "getting btree node"); + bch_err_msg(c, ret, "getting btree node"); + if (ret) goto err; - } ret = bch2_btree_repair_topology_recurse(trans, cur); six_unlock_read(&cur->c.lock); @@ -619,7 +625,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id g->data_type = 0; g->dirty_sectors = 0; g->cached_sectors = 0; - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_need_another_gc, &c->flags); } else { do_update = true; } @@ -664,7 +670,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (data_type == BCH_DATA_btree) { g->data_type = data_type; - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_need_another_gc, &c->flags); } else { do_update = true; } @@ -707,8 +713,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { - bch_err_msg(c, ret, "allocating new key"); ret = -BCH_ERR_ENOMEM_gc_repair_key; + bch_err_msg(c, ret, "allocating new key"); goto err; } @@ -807,9 +813,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, struct bch_fs *c = trans->c; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; - unsigned flags = - BTREE_TRIGGER_GC| - (initial ? 
BTREE_TRIGGER_NOATOMIC : 0); int ret = 0; deleted.p = k->k->p; @@ -831,11 +834,10 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, } ret = commit_do(trans, NULL, NULL, 0, - bch2_mark_key(trans, btree_id, level, old, *k, flags)); + bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); fsck_err: err: - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -996,7 +998,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b /* Continue marking when opted to not * fix the error: */ ret = 0; - set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + set_bit(BCH_FS_initial_gc_unfixed, &c->flags); continue; } } else if (ret) { @@ -1068,8 +1070,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, fsck_err: six_unlock_read(&b->c.lock); - if (ret < 0) - bch_err_fn(c, ret); + bch_err_fn(c, ret); printbuf_exit(&buf); return ret; } @@ -1105,10 +1106,8 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) : bch2_gc_btree(trans, i, initial, metadata_only); } - if (ret < 0) - bch_err_fn(c, ret); - bch2_trans_put(trans); + bch_err_fn(c, ret); return ret; } @@ -1159,13 +1158,10 @@ static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, static void bch2_mark_superblocks(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - mutex_lock(&c->sb_lock); gc_pos_set(c, gc_phase(GC_PHASE_SB)); - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); mutex_unlock(&c->sb_lock); } @@ -1190,13 +1186,10 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) static void bch2_gc_free(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - genradix_free(&c->reflink_gc_table); genradix_free(&c->gc_stripes); - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); @@ -1218,7 +1211,7 @@ static int bch2_gc_done(struct bch_fs *c, bool verify = !metadata_only && !c->opts.reconstruct_alloc && (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); - unsigned i, dev; + unsigned i; int ret = 0; percpu_down_write(&c->mark_lock); @@ -1230,14 +1223,14 @@ static int bch2_gc_done(struct bch_fs *c, , ##__VA_ARGS__, dst->_f, src->_f))) \ dst->_f = src->_f #define copy_dev_field(_err, _f, _msg, ...) \ - copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) + copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__) #define copy_fs_field(_err, _f, _msg, ...) 
\ copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) for (i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); - for_each_member_device(ca, c, dev) { + __for_each_member_device(c, ca) { struct bch_dev_usage *dst = ca->usage_base; struct bch_dev_usage *src = (void *) bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, @@ -1251,9 +1244,6 @@ static int bch2_gc_done(struct bch_fs *c, copy_dev_field(dev_usage_fragmented_wrong, d[i].fragmented, "%s fragmented", bch2_data_types[i]); } - - copy_dev_field(dev_usage_buckets_ec_wrong, - buckets_ec, "buckets_ec"); } { @@ -1284,7 +1274,7 @@ static int bch2_gc_done(struct bch_fs *c, } for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); if (metadata_only && @@ -1307,8 +1297,7 @@ static int bch2_gc_done(struct bch_fs *c, fsck_err: if (ca) percpu_ref_put(&ca->ref); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); percpu_up_write(&c->mark_lock); printbuf_exit(&buf); @@ -1317,9 +1306,6 @@ fsck_err: static int bch2_gc_start(struct bch_fs *c) { - struct bch_dev *ca = NULL; - unsigned i; - BUG_ON(c->usage_gc); c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), @@ -1329,7 +1315,7 @@ static int bch2_gc_start(struct bch_fs *c) return -BCH_ERR_ENOMEM_gc_start; } - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { BUG_ON(ca->usage_gc); ca->usage_gc = alloc_percpu(struct bch_dev_usage); @@ -1348,10 +1334,7 @@ static int bch2_gc_start(struct bch_fs *c) static int bch2_gc_reset(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { free_percpu(ca->usage_gc); ca->usage_gc = NULL; } @@ -1389,9 +1372,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, enum bch_data_type type; int ret; - if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets))) - return 1; - old = bch2_alloc_to_v4(k, &old_convert); new = *old; @@ -1488,52 +1468,36 @@ fsck_err: static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; - unsigned i; int ret = 0; - for_each_member_device(ca, c, i) { - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_alloc_write_key(trans, &iter, k, metadata_only)); - - if (ret < 0) { - bch_err_fn(c, ret); + for_each_member_device(c, ca) { + ret = bch2_trans_run(c, + for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + POS(ca->dev_idx, ca->mi.nbuckets - 1), + BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + bch2_alloc_write_key(trans, &iter, k, metadata_only))); + if (ret) { percpu_ref_put(&ca->ref); break; } } - bch2_trans_put(trans); - return ret < 0 ? 
ret : 0; + bch_err_fn(c, ret); + return ret; } static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { - struct bch_dev *ca; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bucket *g; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - unsigned i; - int ret; - - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { percpu_ref_put(&ca->ref); bch_err(c, "error allocating ca->buckets[gc]"); - ret = -BCH_ERR_ENOMEM_gc_alloc_start; - goto err; + return -BCH_ERR_ENOMEM_gc_alloc_start; } buckets->first_bucket = ca->mi.first_bucket; @@ -1541,42 +1505,38 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) rcu_assign_pointer(ca->buckets_gc, buckets); } - ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ - ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = gc_bucket(ca, k.k->p.offset); - - a = bch2_alloc_to_v4(k, &a_convert); - - g->gen_valid = 1; - g->gen = a->gen; - - if (metadata_only && - (a->data_type == BCH_DATA_user || - a->data_type == BCH_DATA_cached || - a->data_type == BCH_DATA_parity)) { - g->data_type = a->data_type; - g->dirty_sectors = a->dirty_sectors; - g->cached_sectors = a->cached_sectors; - g->stripe = a->stripe; - g->stripe_redundancy = a->stripe_redundancy; - } + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + struct bucket *g = gc_bucket(ca, k.k->p.offset); - 0; - })); -err: - bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + + g->gen_valid = 1; + g->gen = a->gen; + + if (metadata_only && + (a->data_type == BCH_DATA_user || + a->data_type == BCH_DATA_cached || + a->data_type == BCH_DATA_parity)) { + g->data_type = a->data_type; + g->dirty_sectors = a->dirty_sectors; + g->cached_sectors = a->cached_sectors; + g->stripe = a->stripe; + g->stripe_redundancy = a->stripe_redundancy; + } + + 0; + }))); + bch_err_fn(c, ret); return ret; } static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) { - struct bch_dev *ca; - unsigned i; - - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bucket_array *buckets = gc_bucket_array(ca); struct bucket *g; @@ -1634,7 +1594,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, if (!r->refcount) new->k.type = KEY_TYPE_deleted; else - *bkey_refcount(new) = cpu_to_le64(r->refcount); + *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); } fsck_err: printbuf_exit(&buf); @@ -1643,64 +1603,52 @@ fsck_err: static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; size_t idx = 0; - int ret = 0; if (metadata_only) return 0; - trans = bch2_trans_get(c); - - ret = for_each_btree_key_commit(trans, iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_gc_write_reflink_key(trans, &iter, k, &idx)); - + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_gc_write_reflink_key(trans, &iter, k, &idx))); 
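/*
 * Aside (not part of the patch): the loop bodies handed to the
 * for_each_btree_key*() helpers in the hunks above are GCC statement
 * expressions - ({ ... }) - so one body can run several statements and
 * still evaluate to an error code the macro checks, e.g. ({ ...; 0; }).
 * A minimal standalone sketch of that pattern, with hypothetical names;
 * this is not the bcachefs macro itself:
 */
#include <errno.h>
#include <stdio.h>

#define for_each_item(_arr, _nr, _elem, _body)				\
({									\
	int _ret = 0;							\
	for (size_t _i = 0; _i < (_nr) && !_ret; _i++) {		\
		(_elem) = (_arr)[_i];					\
		_ret = (_body);						\
	}								\
	_ret;								\
})

int main(void)
{
	int nums[] = { 1, 2, 3, 4 };
	int v;

	/* the body both does work and yields 0 or an error; nonzero stops the loop */
	int ret = for_each_item(nums, 4, v, ({
		printf("%d\n", v);
		v == 3 ? -EINVAL : 0;
	}));

	return ret ? 1 : 0;
}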
c->reflink_gc_nr = 0; - bch2_trans_put(trans); return ret; } static int bch2_gc_reflink_start(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct reflink_gc *r; - int ret = 0; if (metadata_only) return 0; - trans = bch2_trans_get(c); c->reflink_gc_nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - const __le64 *refcount = bkey_refcount_c(k); + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ + const __le64 *refcount = bkey_refcount_c(k); - if (!refcount) - continue; + if (!refcount) + continue; - r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, - GFP_KERNEL); - if (!r) { - ret = -BCH_ERR_ENOMEM_gc_reflink_start; - break; - } + struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, + c->reflink_gc_nr++, GFP_KERNEL); + if (!r) { + ret = -BCH_ERR_ENOMEM_gc_reflink_start; + break; + } - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - } - bch2_trans_iter_exit(trans, &iter); + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + 0; + }))); - bch2_trans_put(trans); + bch_err_fn(c, ret); return ret; } @@ -1768,24 +1716,15 @@ fsck_err: static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - if (metadata_only) return 0; - trans = bch2_trans_get(c); - - ret = for_each_btree_key_commit(trans, iter, - BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_gc_write_stripes_key(trans, &iter, k)); - - bch2_trans_put(trans); - return ret; + return bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_gc_write_stripes_key(trans, &iter, k))); } static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) @@ -1848,7 +1787,7 @@ again: #endif c->gc_count++; - if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + if (test_bit(BCH_FS_need_another_gc, &c->flags) || (!iter && bch2_test_restart_gc)) { if (iter++ > 2) { bch_info(c, "Unable to fix bucket gens, looping"); @@ -1860,7 +1799,7 @@ again: * XXX: make sure gens we fixed got saved */ bch_info(c, "Second GC pass needed, restarting:"); - clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + clear_bit(BCH_FS_need_another_gc, &c->flags); __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); bch2_gc_stripes_reset(c, metadata_only); @@ -1900,9 +1839,7 @@ out: * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); - - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1912,7 +1849,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; struct bkey_i *u; int ret; @@ -1970,12 +1906,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i int bch2_gc_gens(struct bch_fs *c) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; u64 b, start_time = local_clock(); - unsigned i; int ret; /* @@ -1988,9 +1919,8 @@ int bch2_gc_gens(struct bch_fs *c) trace_and_count(c, gc_gens_start, c); down_read(&c->gc_lock); - trans = bch2_trans_get(c); - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { 
struct bucket_gens *gens = bucket_gens(ca); BUG_ON(ca->oldest_gen); @@ -2007,33 +1937,31 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen[b] = gens->b[b]; } - for (i = 0; i < BTREE_ID_NR; i++) + for (unsigned i = 0; i < BTREE_ID_NR; i++) if (btree_type_has_ptrs(i)) { c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; - ret = for_each_btree_key_commit(trans, iter, i, - POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, - k, - NULL, NULL, - BTREE_INSERT_NOFAIL, - gc_btree_gens_key(trans, &iter, k)); - if (ret && !bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, i, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + gc_btree_gens_key(trans, &iter, k))); if (ret) goto err; } - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, - BTREE_ITER_PREFETCH, - k, - NULL, NULL, - BTREE_INSERT_NOFAIL, - bch2_alloc_write_oldest_gen(trans, &iter, k)); - if (ret && !bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, + POS_MIN, + BTREE_ITER_PREFETCH, + k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + bch2_alloc_write_oldest_gen(trans, &iter, k))); if (ret) goto err; @@ -2045,14 +1973,15 @@ int bch2_gc_gens(struct bch_fs *c) bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); trace_and_count(c, gc_gens_end, c); err: - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { kvfree(ca->oldest_gen); ca->oldest_gen = NULL; } - bch2_trans_put(trans); up_read(&c->gc_lock); mutex_unlock(&c->gc_gens_lock); + if (!bch2_err_matches(ret, EROFS)) + bch_err_fn(c, ret); return ret; } @@ -2062,7 +1991,6 @@ static int bch2_gc_thread(void *arg) struct io_clock *clock = &c->io_clock[WRITE]; unsigned long last = atomic64_read(&clock->now); unsigned last_kick = atomic_read(&c->kick_gc); - int ret; set_freezable(); @@ -2102,11 +2030,8 @@ static int bch2_gc_thread(void *arg) #if 0 ret = bch2_gc(c, false, false); #else - ret = bch2_gc_gens(c); + bch2_gc_gens(c); #endif - if (ret < 0) - bch_err_fn(c, ret); - debug_check_no_locks_held(); } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5a720f0cd5a6..33db48e2153f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -524,7 +524,8 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_printf(out, "at btree "); bch2_btree_pos_to_text(out, c, b); - prt_printf(out, "\n node offset %u", b->written); + prt_printf(out, "\n node offset %u/%u", + b->written, btree_ptr_sectors_written(&b->key)); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); prt_str(out, ": "); @@ -830,6 +831,23 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, (rw == WRITE ? 
bch2_bkey_val_invalid(c, k, READ, err) : 0); } +static bool __bkey_valid(struct bch_fs *c, struct btree *b, + struct bset *i, struct bkey_packed *k) +{ + if (bkey_p_next(k) > vstruct_last(i)) + return false; + + if (k->format > KEY_FORMAT_CURRENT) + return false; + + struct printbuf buf = PRINTBUF; + struct bkey tmp; + struct bkey_s u = __bkey_disassemble(b, k, &tmp); + bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf); + printbuf_exit(&buf); + return ret; +} + static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bset *i, int write, bool have_retry, bool *saw_error) @@ -845,6 +863,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, k != vstruct_last(i);) { struct bkey_s u; struct bkey tmp; + unsigned next_good_key; if (btree_err_on(bkey_p_next(k) > vstruct_last(i), -BCH_ERR_btree_node_read_err_fixable, @@ -859,12 +878,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, btree_node_bkey_bad_format, - "invalid bkey format %u", k->format)) { - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_p_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - continue; - } + "invalid bkey format %u", k->format)) + goto drop_this_key; /* XXX: validate k->u64s */ if (!write) @@ -885,11 +900,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, c, NULL, b, i, btree_node_bad_bkey, "invalid bkey: %s", buf.buf); - - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_p_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - continue; + goto drop_this_key; } if (write) @@ -906,21 +917,45 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, prt_printf(&buf, " > "); bch2_bkey_to_text(&buf, u.k); - bch2_dump_bset(c, b, i, 0); - if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, btree_node_bkey_out_of_order, - "%s", buf.buf)) { - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_p_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - continue; - } + "%s", buf.buf)) + goto drop_this_key; } prev = k; k = bkey_p_next(k); + continue; +drop_this_key: + next_good_key = k->u64s; + + if (!next_good_key || + (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN && + version >= bcachefs_metadata_version_snapshot)) { + /* + * only do scanning if bch2_bkey_compat() has nothing to + * do + */ + + if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { + for (next_good_key = 1; + next_good_key < (u64 *) vstruct_last(i) - (u64 *) k; + next_good_key++) + if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) + goto got_good_key; + + } + + /* + * didn't find a good key, have to truncate the rest of + * the bset + */ + next_good_key = (u64 *) vstruct_last(i) - (u64 *) k; + } +got_good_key: + le16_add_cpu(&i->u64s, -next_good_key); + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); } fsck_err: printbuf_exit(&buf); @@ -934,7 +969,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct sort_iter *iter; struct btree_node *sorted; struct bkey_packed *k; - struct bch_extent_ptr *ptr; struct bset *i; bool used_mempool, blacklisted; bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && @@ -943,6 +977,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, unsigned ptr_written = btree_ptr_sectors_written(&b->key); struct printbuf buf = PRINTBUF; int ret = 0, retry_read = 0, write = READ; + u64 start_time = 
local_clock(); b->version_ondisk = U16_MAX; /* We might get called multiple times on read retry: */ @@ -968,12 +1003,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bch_btree_ptr_v2 *bp = &bkey_i_to_btree_ptr_v2(&b->key)->v; + bch2_bpos_to_text(&buf, b->data->min_key); + prt_str(&buf, "-"); + bch2_bpos_to_text(&buf, b->data->max_key); + btree_err_on(b->data->keys.seq != bp->seq, -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, btree_node_bad_seq, - "got wrong btree node (seq %llx want %llx)", - b->data->keys.seq, bp->seq); + "got wrong btree node (want %llx got %llx)\n" + "got btree %s level %llu pos %s", + bp->seq, b->data->keys.seq, + bch2_btree_id_str(BTREE_NODE_ID(b->data)), + BTREE_NODE_LEVEL(b->data), + buf.buf); } else { btree_err_on(!b->data->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, @@ -999,8 +1042,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, nonce = btree_nonce(i, b->written << 9); - csum_bad = bch2_crc_cmp(b->data->csum, - csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data)); + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + csum_bad = bch2_crc_cmp(b->data->csum, csum); if (csum_bad) bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); @@ -1008,7 +1051,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, bset_bad_csum, - "invalid checksum"); + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), + buf.buf)); ret = bset_encrypt(c, i, b->written << 9); if (bch2_fs_fatal_err_on(ret, c, @@ -1037,8 +1083,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, b->written << 9); - csum_bad = bch2_crc_cmp(bne->csum, - csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne)); + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + csum_bad = bch2_crc_cmp(bne->csum, csum); if (csum_bad) bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); @@ -1046,7 +1092,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, bset_bad_csum, - "invalid checksum"); + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), + buf.buf)); ret = bset_encrypt(c, i, b->written << 9); if (bch2_fs_fatal_err_on(ret, c, @@ -1202,6 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, out: mempool_free(iter, &c->fill_iter); printbuf_exit(&buf); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); return retry_read; fsck_err: if (ret == -BCH_ERR_btree_node_read_err_want_retry || @@ -1575,16 +1625,17 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool return 0; } -void bch2_btree_node_read(struct bch_fs *c, struct btree *b, +void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bool sync) { + struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; int ret; - trace_and_count(c, btree_node_read, c, b); + trace_and_count(c, btree_node_read, trans, b); if (bch2_verify_all_btree_replicas && !btree_node_read_all_replicas(c, b, sync)) @@ -1637,7 +1688,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, if (sync) { submit_bio_wait(bio); - + bch2_latency_acct(ca, rb->start_time, READ); btree_node_read_work(&rb->work); } else { 
submit_bio(bio); @@ -1663,12 +1714,12 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, closure_init_stack(&cl); do { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); closure_sync(&cl); } while (ret); b = bch2_btree_node_mem_alloc(trans, level != 0); - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); BUG_ON(IS_ERR(b)); @@ -1677,7 +1728,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, set_btree_node_read_in_flight(b); - bch2_btree_node_read(c, b, true); + bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { bch2_btree_node_hash_remove(&c->btree_cache, b); @@ -1789,8 +1840,10 @@ static void btree_node_write_work(struct work_struct *work) bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { + ret = -BCH_ERR_btree_write_all_failed; goto err; + } if (wbio->wbio.first_btree_write) { if (wbio->wbio.failed.nr) { @@ -1800,9 +1853,9 @@ static void btree_node_write_work(struct work_struct *work) ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, BCH_WATERMARK_reclaim| - BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW, + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw, !wbio->wbio.failed.nr)); if (ret) goto err; @@ -1885,7 +1938,6 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, static void btree_write_submit(struct work_struct *work) { struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); - struct bch_extent_ptr *ptr; BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; bkey_copy(&tmp.k, &wbio->key); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index e0d7fa5b1dfb..e251cb6b965f 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -130,7 +130,7 @@ void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, struct btree *, bool, bool *); -void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); +void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index da594e006769..fa298289e016 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -13,6 +13,7 @@ #include "error.h" #include "extents.h" #include "journal.h" +#include "journal_io.h" #include "replicas.h" #include "snapshot.h" #include "trace.h" @@ -21,8 +22,8 @@ #include <linux/prefetch.h> static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); -static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, - struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, + btree_path_idx_t, btree_path_idx_t); static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) { @@ -33,7 +34,8 @@ static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) #endif } -static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); +static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t); +static void 
bch2_trans_srcu_lock(struct btree_trans *); static inline int __btree_path_cmp(const struct btree_path *l, enum btree_id r_btree_id, @@ -239,8 +241,9 @@ static void bch2_btree_path_verify(struct btree_trans *trans, void bch2_trans_verify_paths(struct btree_trans *trans) { struct btree_path *path; + unsigned iter; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, iter) bch2_btree_path_verify(trans, path); } @@ -250,7 +253,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) BUG_ON(iter->btree_id >= BTREE_ID_NR); - BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); + BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached); BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); @@ -260,8 +263,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) - bch2_btree_path_verify(trans, iter->update_path); - bch2_btree_path_verify(trans, iter->path); + bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); + bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); } static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) @@ -330,12 +333,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos, bool key_cache) { struct btree_path *path; - unsigned idx; + struct trans_for_each_path_inorder_iter iter; struct printbuf buf = PRINTBUF; btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, idx) { + trans_for_each_path_inorder(trans, path, iter) { int cmp = cmp_int(path->btree_id, id) ?: cmp_int(path->cached, key_cache); @@ -415,8 +418,9 @@ void bch2_btree_path_fix_key_modified(struct btree_trans *trans, struct bkey_packed *where) { struct btree_path *path; + unsigned i; - trans_for_each_path_with_node(trans, b, path) { + trans_for_each_path_with_node(trans, b, path, i) { __bch2_btree_path_fix_key_modified(path, b, where); bch2_btree_path_verify_level(trans, path, b->c.level); } @@ -523,6 +527,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, { struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); struct btree_path *linked; + unsigned i; if (node_iter != &path->l[b->c.level].iter) { __bch2_btree_node_iter_fix(path, b, node_iter, t, @@ -532,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, bch2_btree_node_iter_verify(node_iter, b); } - trans_for_each_path_with_node(trans, b, linked) { + trans_for_each_path_with_node(trans, b, linked, i) { __bch2_btree_node_iter_fix(linked, b, &linked->l[b->c.level].iter, t, where, clobber_u64s, new_u64s); @@ -647,7 +652,6 @@ void bch2_btree_path_level_init(struct btree_trans *trans, static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; trans_for_each_update(trans, i) if (!i->cached && @@ -655,7 +659,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str i->btree_id == b->c.btree_id && bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && bpos_cmp(i->k->k.p, b->data->max_key) <= 0) { - i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v; if (unlikely(trans->journal_replay_not_finished)) { struct bkey_i *j_k = @@ -674,14 +678,22 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str * A btree node is being replaced - update the 
iterator to point to the new * node: */ -void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) +void bch2_trans_node_add(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { - struct btree_path *path; + struct btree_path *prev; + + BUG_ON(!btree_path_pos_in_node(path, b)); + + while ((prev = prev_btree_path(trans, path)) && + btree_path_pos_in_node(prev, b)) + path = prev; - trans_for_each_path(trans, path) - if (path->uptodate == BTREE_ITER_UPTODATE && - !path->cached && - btree_path_pos_in_node(path, b)) { + for (; + path && btree_path_pos_in_node(path, b); + path = next_btree_path(trans, path)) + if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) { enum btree_node_locked_type t = btree_lock_want(path, b->c.level); @@ -704,8 +716,9 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) { struct btree_path *path; + unsigned i; - trans_for_each_path_with_node(trans, b, path) + trans_for_each_path_with_node(trans, b, path, i) __btree_path_level_init(path, b->c.level); bch2_trans_revalidate_updates_in_node(trans, b); @@ -781,7 +794,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + unsigned nr = test_bit(BCH_FS_started, &c->flags) ? (path->level > 1 ? 0 : 2) : (path->level > 1 ? 1 : 16); bool was_locked = btree_node_locked(path, path->level); @@ -816,7 +829,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p struct bch_fs *c = trans->c; struct bkey_s_c k; struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + unsigned nr = test_bit(BCH_FS_started, &c->flags) ? (path->level > 1 ? 0 : 2) : (path->level > 1 ? 
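/*
 * Illustrative sketch, not bcachefs code: the reworked bch2_trans_node_add()
 * above no longer scans every path; it steps backwards with prev_btree_path()
 * to the first sorted path whose position lies in the new node, then walks
 * forwards while that is still true.  Below, the same back-then-forward walk
 * over a sorted array of made-up integer "positions".
 */
#include <stdio.h>

static void visit_range(const int *sorted, int n, int start, int lo, int hi)
{
	/* back up to the first element still inside [lo, hi]... */
	while (start > 0 && sorted[start - 1] >= lo)
		start--;

	/* ...then visit forwards until we leave the range */
	for (; start < n && sorted[start] <= hi; start++)
		printf("visit %d\n", sorted[start]);
}

int main(void)
{
	int pos[] = { 1, 4, 6, 9, 13 };

	visit_range(pos, 5, 2, 3, 10);	/* visits 4, 6, 9 */
	return 0;
}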
1 : 16); bool was_locked = btree_node_locked(path, path->level); @@ -884,7 +897,8 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, bch2_bkey_buf_reassemble(out, c, k); - if (flags & BTREE_ITER_PREFETCH) + if ((flags & BTREE_ITER_PREFETCH) && + c->opts.btree_node_prefetch) ret = btree_path_prefetch_j(trans, path, &jiter); bch2_btree_and_journal_iter_exit(&jiter); @@ -916,7 +930,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans, bch2_bkey_buf_unpack(&tmp, c, l->b, bch2_btree_node_iter_peek(&l->iter, l->b)); - if (flags & BTREE_ITER_PREFETCH) { + if ((flags & BTREE_ITER_PREFETCH) && + c->opts.btree_node_prefetch) { ret = btree_path_prefetch(trans, path); if (ret) goto err; @@ -953,7 +968,8 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) struct bch_fs *c = trans->c; struct btree_path *path; unsigned long trace_ip = _RET_IP_; - int i, ret = 0; + unsigned i; + int ret = 0; if (trans->in_traverse_all) return -BCH_ERR_transaction_restart_in_traverse_all; @@ -963,7 +979,7 @@ retry_all: trans->restarted = 0; trans->last_restarted_ip = 0; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) path->should_be_locked = false; btree_trans_sort_paths(trans); @@ -977,7 +993,7 @@ retry_all: closure_init_stack(&cl); do { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); closure_sync(&cl); } while (ret); } @@ -985,16 +1001,16 @@ retry_all: /* Now, redo traversals in correct order: */ i = 0; while (i < trans->nr_sorted) { - path = trans->paths + trans->sorted[i]; + btree_path_idx_t idx = trans->sorted[i]; /* * Traversing a path can cause another path to be added at about * the same position: */ - if (path->uptodate) { - __btree_path_get(path, false); - ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_); - __btree_path_put(path, false); + if (trans->paths[idx].uptodate) { + __btree_path_get(&trans->paths[idx], false); + ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_); + __btree_path_put(&trans->paths[idx], false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, ENOMEM)) @@ -1013,7 +1029,7 @@ retry_all: * then failed to relock a path - that's fine. */ err: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); trans->in_traverse_all = false; @@ -1099,10 +1115,11 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, * stashed in the iterator and returned from bch2_trans_exit(). 
*/ int bch2_btree_path_traverse_one(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path_idx, unsigned flags, unsigned long trace_ip) { + struct btree_path *path = &trans->paths[path_idx]; unsigned depth_want = path->level; int ret = -((int) trans->restarted); @@ -1126,6 +1143,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out; } + path = &trans->paths[path_idx]; + if (unlikely(path->level >= BTREE_MAX_DEPTH)) goto out; @@ -1188,39 +1207,38 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path } } -static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, - bool intent) +static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src, + bool intent) { - struct btree_path *new = btree_path_alloc(trans, src); - - btree_path_copy(trans, new, src); - __btree_path_get(new, intent); + btree_path_idx_t new = btree_path_alloc(trans, src); + btree_path_copy(trans, trans->paths + new, trans->paths + src); + __btree_path_get(trans->paths + new, intent); return new; } __flatten -struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent, - unsigned long ip) +btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, + btree_path_idx_t path, bool intent, unsigned long ip) { - __btree_path_put(path, intent); + __btree_path_put(trans->paths + path, intent); path = btree_path_clone(trans, path, intent); - path->preserve = false; + trans->paths[path].preserve = false; return path; } -struct btree_path * __must_check +btree_path_idx_t __must_check __bch2_btree_path_set_pos(struct btree_trans *trans, - struct btree_path *path, struct bpos new_pos, - bool intent, unsigned long ip, int cmp) + btree_path_idx_t path_idx, struct bpos new_pos, + bool intent, unsigned long ip) { - unsigned level = path->level; + int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos); bch2_trans_verify_not_in_restart(trans); - EBUG_ON(!path->ref); + EBUG_ON(!trans->paths[path_idx].ref); - path = bch2_btree_path_make_mut(trans, path, intent, ip); + path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip); + struct btree_path *path = trans->paths + path_idx; path->pos = new_pos; trans->paths_sorted = false; @@ -1231,7 +1249,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, goto out; } - level = btree_path_up_until_good_node(trans, path, cmp); + unsigned level = btree_path_up_until_good_node(trans, path, cmp); if (btree_path_node(path, level)) { struct btree_path_level *l = &path->l[level]; @@ -1261,7 +1279,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, } out: bch2_btree_path_verify(trans, path); - return path; + return path_idx; } /* Btree path: main interface: */ @@ -1296,19 +1314,16 @@ static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btr return NULL; } -static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) +static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path) { - __bch2_btree_path_unlock(trans, path); - btree_path_list_remove(trans, path); - trans->paths_allocated &= ~(1ULL << path->idx); + __bch2_btree_path_unlock(trans, trans->paths + path); + btree_path_list_remove(trans, trans->paths + path); + __clear_bit(path, trans->paths_allocated); } -void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) +void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) { 
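/*
 * Illustrative sketch, not bcachefs code: trans->paths_allocated is now an
 * ordinary bitmap, so freeing a path (__bch2_path_free() above) is a
 * __clear_bit() and allocating one is a find_first_zero_bit() plus
 * __set_bit().  A single-word userspace version of that bookkeeping, with
 * invented helper names:
 */
#include <stdio.h>

#define NR_SLOTS 64

static unsigned long alloc_map;			/* one bit per slot */

static int slot_alloc(void)
{
	for (unsigned i = 0; i < NR_SLOTS; i++)
		if (!(alloc_map & (1UL << i))) {
			alloc_map |= 1UL << i;	/* __set_bit() */
			return i;
		}
	return -1;				/* table full */
}

static void slot_free(unsigned i)
{
	alloc_map &= ~(1UL << i);		/* __clear_bit() */
}

int main(void)
{
	int a = slot_alloc(), b = slot_alloc();	/* 0, 1 */

	slot_free(a);
	printf("b=%d next=%d\n", b, slot_alloc());	/* b=1 next=0 */
	return 0;
}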
- struct btree_path *dup; - - EBUG_ON(trans->paths + path->idx != path); - EBUG_ON(!path->ref); + struct btree_path *path = trans->paths + path_idx, *dup; if (!__btree_path_put(path, intent)) return; @@ -1330,16 +1345,13 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte dup->should_be_locked |= path->should_be_locked; } - __bch2_path_free(trans, path); + __bch2_path_free(trans, path_idx); } -static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path, +static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path, bool intent) { - EBUG_ON(trans->paths + path->idx != path); - EBUG_ON(!path->ref); - - if (!__btree_path_put(path, intent)) + if (!__btree_path_put(trans->paths + path, intent)) return; __bch2_path_free(trans, path); @@ -1362,9 +1374,6 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; - prt_printf(buf, "transaction updates for %s journal seq %llu", trans->fn, trans->journal_res.seq); prt_newline(buf); @@ -1388,16 +1397,10 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) prt_newline(buf); } - trans_for_each_wb_update(trans, wb) { - prt_printf(buf, "update: btree=%s wb=1 %pS", - bch2_btree_id_str(wb->btree), - (void *) i->ip_allocated); - prt_newline(buf); - - prt_printf(buf, " new "); - bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k)); - prt_newline(buf); - } + for (struct jset_entry *e = trans->journal_entries; + e != btree_trans_journal_entries_top(trans); + e = vstruct_next(e)) + bch2_journal_entry_to_text(buf, trans->c, e); printbuf_indent_sub(buf, 2); } @@ -1412,11 +1415,12 @@ void bch2_dump_trans_updates(struct btree_trans *trans) printbuf_exit(&buf); } -noinline __cold -void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) +static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) { + struct btree_path *path = trans->paths + path_idx; + prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", - path->idx, path->ref, path->intent_ref, + path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 
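/*
 * Illustrative sketch, not the real jset_entry layout: bch2_trans_updates_to_text()
 * above now prints pending keys by walking trans->journal_entries with
 * vstruct_next(), i.e. by stepping over variable-sized, u64-aligned records.
 * The record format below is invented; the point is only that a size field in
 * the header tells you where the next record starts.
 */
#include <stdio.h>
#include <stdint.h>

struct rec {
	uint16_t u64s;		/* payload size in 8-byte units */
	uint16_t type;
	uint32_t pad;
	uint64_t data[];
};

static struct rec *rec_next(struct rec *r)
{
	return (struct rec *) ((uint64_t *) (r + 1) + r->u64s);
}

int main(void)
{
	uint64_t buf[32];
	struct rec *r = (struct rec *) buf, *end;

	r->u64s = 2; r->type = 1; r->data[0] = 10; r->data[1] = 11;
	r = rec_next(r);
	r->u64s = 1; r->type = 2; r->data[0] = 20;
	end = rec_next(r);

	for (r = (struct rec *) buf; r != end; r = rec_next(r))
		printf("type %u, %u u64s of payload\n", r->type, r->u64s);
	return 0;
}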
'S' : ' ', bch2_btree_id_str(path->btree_id), @@ -1434,14 +1438,13 @@ static noinline __cold void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, bool nosort) { - struct btree_path *path; - unsigned idx; + struct trans_for_each_path_inorder_iter iter; if (!nosort) btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, idx) - bch2_btree_path_to_text(out, path); + trans_for_each_path_idx_inorder(trans, iter) + bch2_btree_path_to_text(out, trans, iter.path_idx); } noinline __cold @@ -1473,17 +1476,14 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) { struct btree_transaction_stats *s = btree_trans_stats(trans); struct printbuf buf = PRINTBUF; - - if (!s) - return; + size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths); bch2_trans_paths_to_text(&buf, trans); if (!buf.allocation_failure) { mutex_lock(&s->lock); - if (s->nr_max_paths < hweight64(trans->paths_allocated)) { - s->nr_max_paths = trans->nr_max_paths = - hweight64(trans->paths_allocated); + if (nr > s->nr_max_paths) { + s->nr_max_paths = nr; swap(s->max_paths_text, buf.buf); } mutex_unlock(&s->lock); @@ -1491,64 +1491,121 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) printbuf_exit(&buf); - trans->nr_max_paths = hweight64(trans->paths_allocated); + trans->nr_paths_max = nr; +} + +noinline __cold +int __bch2_btree_trans_too_many_iters(struct btree_trans *trans) +{ + if (trace_trans_restart_too_many_iters_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_trans_paths_to_text(&buf, trans); + trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf); + printbuf_exit(&buf); + } + + count_event(trans->c, trans_restart_too_many_iters); + + return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); } static noinline void btree_path_overflow(struct btree_trans *trans) { bch2_dump_trans_paths_updates(trans); - panic("trans path overflow\n"); + bch_err(trans->c, "trans path overflow"); } -static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, - struct btree_path *pos) +static noinline void btree_paths_realloc(struct btree_trans *trans) { - struct btree_path *path; - unsigned idx; + unsigned nr = trans->nr_paths * 2; + + void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + + sizeof(struct btree_trans_paths) + + nr * sizeof(struct btree_path) + + nr * sizeof(btree_path_idx_t) + 8 + + nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL); + + unsigned long *paths_allocated = p; + memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long)); + p += BITS_TO_LONGS(nr) * sizeof(unsigned long); + + p += sizeof(struct btree_trans_paths); + struct btree_path *paths = p; + *trans_paths_nr(paths) = nr; + memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path)); + p += nr * sizeof(struct btree_path); + + btree_path_idx_t *sorted = p; + memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t)); + p += nr * sizeof(btree_path_idx_t) + 8; + + struct btree_insert_entry *updates = p; + memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry)); + + unsigned long *old = trans->paths_allocated; - if (unlikely(trans->paths_allocated == - ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) - btree_path_overflow(trans); + rcu_assign_pointer(trans->paths_allocated, paths_allocated); + rcu_assign_pointer(trans->paths, paths); + rcu_assign_pointer(trans->sorted, sorted); + rcu_assign_pointer(trans->updates, 
updates); - idx = __ffs64(~trans->paths_allocated); + trans->nr_paths = nr; + + if (old != trans->_paths_allocated) + kfree_rcu_mightsleep(old); +} + +static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans, + btree_path_idx_t pos) +{ + btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths); + + if (unlikely(idx == trans->nr_paths)) { + if (trans->nr_paths == BTREE_ITER_MAX) { + btree_path_overflow(trans); + return 0; + } + + btree_paths_realloc(trans); + } /* * Do this before marking the new path as allocated, since it won't be * initialized yet: */ - if (unlikely(idx > trans->nr_max_paths)) + if (unlikely(idx > trans->nr_paths_max)) bch2_trans_update_max_paths(trans); - trans->paths_allocated |= 1ULL << idx; + __set_bit(idx, trans->paths_allocated); - path = &trans->paths[idx]; - path->idx = idx; + struct btree_path *path = &trans->paths[idx]; path->ref = 0; path->intent_ref = 0; path->nodes_locked = 0; - path->alloc_seq++; - btree_path_list_add(trans, pos, path); + btree_path_list_add(trans, pos, idx); trans->paths_sorted = false; - return path; + return idx; } -struct btree_path *bch2_path_get(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - unsigned locks_want, unsigned level, - unsigned flags, unsigned long ip) +btree_path_idx_t bch2_path_get(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, + unsigned flags, unsigned long ip) { - struct btree_path *path, *path_pos = NULL; + struct btree_path *path; bool cached = flags & BTREE_ITER_CACHED; bool intent = flags & BTREE_ITER_INTENT; - int i; + struct trans_for_each_path_inorder_iter iter; + btree_path_idx_t path_pos = 0, path_idx; bch2_trans_verify_not_in_restart(trans); bch2_trans_verify_locks(trans); btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, i) { + trans_for_each_path_inorder(trans, path, iter) { if (__btree_path_cmp(path, btree_id, cached, @@ -1556,18 +1613,19 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, level) > 0) break; - path_pos = path; + path_pos = iter.path_idx; } if (path_pos && - path_pos->cached == cached && - path_pos->btree_id == btree_id && - path_pos->level == level) { - __btree_path_get(path_pos, intent); - path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + trans->paths[path_pos].cached == cached && + trans->paths[path_pos].btree_id == btree_id && + trans->paths[path_pos].level == level) { + __btree_path_get(trans->paths + path_pos, intent); + path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + path = trans->paths + path_idx; } else { - path = btree_path_alloc(trans, path_pos); - path_pos = NULL; + path_idx = btree_path_alloc(trans, path_pos); + path = trans->paths + path_idx; __btree_path_get(path, intent); path->pos = pos; @@ -1578,7 +1636,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path->level = level; path->locks_want = locks_want; path->nodes_locked = 0; - for (i = 0; i < ARRAY_SIZE(path->l); i++) + for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++) path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); #ifdef TRACK_PATH_ALLOCATED path->ip_allocated = ip; @@ -1604,7 +1662,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, if (locks_want > path->locks_want) bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); - return path; + return path_idx; } struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) @@ -1659,9 +1717,10 @@ 
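/*
 * Illustrative sketch, not bcachefs code: btree_paths_realloc() above doubles
 * the path tables by making one allocation and carving it into the allocated
 * bitmap, the paths array, the sorted[] index array and the updates array.
 * A stripped-down userspace version of that layout, with invented names:
 */
#include <stdio.h>
#include <stdlib.h>

struct path  { int level; };
struct entry { int btree_id; };

static void *alloc_tables(unsigned nr, unsigned long **bitmap,
			  struct path **paths, struct entry **updates)
{
	size_t bitmap_bytes = ((nr + 63) / 64) * sizeof(unsigned long);
	char *p = calloc(1, bitmap_bytes +
			    nr * sizeof(struct path) +
			    nr * sizeof(struct entry));

	if (!p)
		return NULL;
	*bitmap  = (unsigned long *) p;
	*paths   = (struct path *) (p + bitmap_bytes);
	*updates = (struct entry *) (*paths + nr);
	return p;
}

int main(void)
{
	unsigned long *bitmap;
	struct path *paths;
	struct entry *updates;
	void *mem = alloc_tables(64, &bitmap, &paths, &updates);

	if (!mem)
		return 1;
	bitmap[0] |= 1UL << 3;		/* mark slot 3 allocated */
	paths[3].level = 1;
	updates[3].btree_id = 2;
	printf("%d %d\n", paths[3].level, updates[3].btree_id);
	free(mem);
	return 0;
}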
__bch2_btree_iter_traverse(struct btree_iter *iter) int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; int ret; - iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, + iter->path = bch2_btree_path_set_pos(trans, iter->path, btree_iter_search_key(iter), iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -1670,7 +1729,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) if (ret) return ret; - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(trans->paths + iter->path); return 0; } @@ -1682,14 +1741,15 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) struct btree *b = NULL; int ret; - EBUG_ON(iter->path->cached); + EBUG_ON(trans->paths[iter->path].cached); bch2_btree_iter_verify(iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) goto err; - b = btree_path_node(iter->path, iter->path->level); + struct btree_path *path = btree_iter_path(trans, iter); + b = btree_path_node(path, path->level); if (!b) goto out; @@ -1701,7 +1761,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1726,14 +1786,15 @@ struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - struct btree_path *path = iter->path; struct btree *b = NULL; int ret; + EBUG_ON(trans->paths[iter->path].cached); bch2_trans_verify_not_in_restart(trans); - EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); + struct btree_path *path = btree_iter_path(trans, iter); + /* already at end? 
*/ if (!btree_path_node(path, path->level)) return NULL; @@ -1763,17 +1824,19 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) * Haven't gotten to the end of the parent node: go back down to * the next child node */ - path = iter->path = - bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), - iter->flags & BTREE_ITER_INTENT, - btree_iter_ip_allocated(iter)); + iter->path = bch2_btree_path_set_pos(trans, iter->path, + bpos_successor(iter->pos), + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + path = btree_iter_path(trans, iter); btree_path_set_level_down(trans, path, iter->min_depth); - ret = bch2_btree_path_traverse(trans, path, iter->flags); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) goto err; + path = btree_iter_path(trans, iter); b = path->l[path->level].b; } @@ -1783,8 +1846,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(iter->path); - BUG_ON(iter->path->uptodate); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + EBUG_ON(btree_iter_path(trans, iter)->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1799,23 +1862,15 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { - if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { - struct bpos pos = iter->k.p; - bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS - ? bpos_eq(pos, SPOS_MAX) - : bkey_eq(pos, SPOS_MAX)); - - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); - return ret; - } else { - if (!btree_path_node(iter->path, iter->path->level)) - return true; + struct bpos pos = iter->k.p; + bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_eq(pos, SPOS_MAX) + : bkey_eq(pos, SPOS_MAX)); - iter->advanced = true; - return false; - } + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; } inline bool bch2_btree_iter_rewind(struct btree_iter *iter) @@ -1832,58 +1887,70 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) } static noinline -struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) +void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k) { - struct btree_insert_entry *i; - struct bkey_i *ret = NULL; + struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key; - trans_for_each_update(iter->trans, i) { - if (i->btree_id < iter->btree_id) - continue; - if (i->btree_id > iter->btree_id) - break; - if (bpos_lt(i->k->k.p, iter->path->pos)) - continue; - if (i->key_cache_already_flushed) - continue; - if (!ret || bpos_lt(i->k->k.p, ret->k.p)) - ret = i->k; - } + trans_for_each_update(trans, i) + if (!i->key_cache_already_flushed && + i->btree_id == iter->btree_id && + bpos_le(i->k->k.p, iter->pos) && + bpos_ge(i->k->k.p, k->k ? 
k->k->p : end)) { + iter->k = i->k->k; + *k = bkey_i_to_s_c(i->k); + } +} - return ret; +static noinline +void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k) +{ + struct btree_path *path = btree_iter_path(trans, iter); + struct bpos end = path_l(path)->b->key.k.p; + + trans_for_each_update(trans, i) + if (!i->key_cache_already_flushed && + i->btree_id == iter->btree_id && + bpos_ge(i->k->k.p, path->pos) && + bpos_le(i->k->k.p, k->k ? k->k->p : end)) { + iter->k = i->k->k; + *k = bkey_i_to_s_c(i->k); + } } -static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) +static noinline +void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k) { - return iter->flags & BTREE_ITER_WITH_UPDATES - ? __bch2_btree_trans_peek_updates(iter) - : NULL; + trans_for_each_update(trans, i) + if (!i->key_cache_already_flushed && + i->btree_id == iter->btree_id && + bpos_eq(i->k->k.p, iter->pos)) { + iter->k = i->k->k; + *k = bkey_i_to_s_c(i->k); + } } static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, struct btree_iter *iter, struct bpos end_pos) { - struct bkey_i *k; - - if (bpos_lt(iter->path->pos, iter->journal_pos)) - iter->journal_idx = 0; + struct btree_path *path = btree_iter_path(trans, iter); - k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, - iter->path->level, - iter->path->pos, - end_pos, - &iter->journal_idx); - - iter->journal_pos = k ? k->k.p : end_pos; - return k; + return bch2_journal_keys_peek_upto(trans->c, iter->btree_id, + path->level, + path->pos, + end_pos, + &iter->journal_idx); } static noinline struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, struct btree_iter *iter) { - struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos); + struct btree_path *path = btree_iter_path(trans, iter); + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos); if (k) { iter->k = k->k; @@ -1898,9 +1965,10 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { + struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek(trans, iter, - k.k ? k.k->p : path_l(iter->path)->b->key.k.p); + k.k ? 
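/*
 * Illustrative sketch, not bcachefs code: the peek_updates helpers above
 * overlay the transaction's pending updates on whatever the btree returned,
 * keeping the pending key closest to the iterator position in the direction
 * of iteration (an exact match for the slot variant).  The forward case is
 * sketched below with plain integers and invented names.
 */
#include <stdio.h>

struct update { int pos; int val; };

static const struct update *peek_updates(const struct update *u, int nr,
					 int pos, int end)
{
	const struct update *best = NULL;

	for (int i = 0; i < nr; i++)
		if (u[i].pos >= pos && u[i].pos <= end &&
		    (!best || u[i].pos < best->pos))
			best = &u[i];
	return best;
}

int main(void)
{
	struct update pending[] = { { 12, 1 }, { 7, 2 }, { 30, 3 } };
	const struct update *u = peek_updates(pending, 3, 5, 20);

	printf("overlay key at %d, val %d\n", u->pos, u->val);	/* 7, 2 */
	return 0;
}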
k.k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; @@ -1943,13 +2011,13 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED) ?: - bch2_btree_path_relock(trans, iter->path, _THIS_IP_); + bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_); if (unlikely(ret)) return bkey_s_c_err(ret); - btree_path_set_should_be_locked(iter->key_cache_path); + btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); - k = bch2_btree_path_peek_slot(iter->key_cache_path, &u); + k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); if (k.k && !bkey_err(k)) { iter->k = u; k.k = &iter->k; @@ -1960,11 +2028,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; - struct bkey_i *next_update; struct bkey_s_c k, k2; int ret; - EBUG_ON(iter->path->cached); + EBUG_ON(btree_iter_path(trans, iter)->cached); bch2_btree_iter_verify(iter); while (1) { @@ -1982,7 +2049,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - l = path_l(iter->path); + struct btree_path *path = btree_iter_path(trans, iter); + l = path_l(path); if (unlikely(!l->b)) { /* No btree nodes at requested level: */ @@ -1991,7 +2059,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(path); k = btree_path_level_peek_all(trans->c, l, &iter->k); @@ -2009,14 +2077,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) k = btree_trans_peek_journal(trans, iter, k); - next_update = btree_trans_peek_updates(iter); - - if (next_update && - bpos_le(next_update->k.p, - k.k ? 
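/*
 * Illustrative sketch, not bcachefs code: btree_trans_peek_journal() above asks
 * the journal-keys overlay for anything up to the key the btree produced (or
 * the end of the leaf) and, on a hit, that key wins because it can only be at
 * or before the btree's key.  Types and names below are made up.
 */
#include <stdio.h>
#include <stdbool.h>

struct kv { bool valid; int pos; const char *src; };

static struct kv peek_merged(struct kv btree, struct kv journal)
{
	if (journal.valid && (!btree.valid || journal.pos <= btree.pos))
		return journal;
	return btree;
}

int main(void)
{
	struct kv b = { true, 10, "btree" };
	struct kv j = { true,  7, "journal" };
	struct kv k = peek_merged(b, j);

	printf("pos %d from %s\n", k.pos, k.src);	/* pos 7 from journal */
	return 0;
}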
k.k->p : l->b->key.k.p)) { - iter->k = next_update->k; - k = bkey_i_to_s_c(next_update); - } + if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + trans->nr_updates)) + bch2_btree_trans_peek_updates(trans, iter, &k); if (k.k && bkey_deleted(k.k)) { /* @@ -2066,13 +2129,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e struct bpos iter_pos; int ret; - EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = NULL; + iter->update_path = 0; } bch2_btree_iter_verify_entry_exit(iter); @@ -2098,10 +2160,10 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e goto end; if (iter->update_path && - !bkey_eq(iter->update_path->pos, k.k->p)) { + !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = NULL; + iter->update_path = 0; } if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && @@ -2121,7 +2183,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * advance, same as on exit for iter->path, but only up * to snapshot */ - __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT); iter->update_path = iter->path; iter->update_path = bch2_btree_path_set_pos(trans, @@ -2177,14 +2239,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out_no_locked: if (iter->update_path) { - ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); + ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_); if (unlikely(ret)) k = bkey_s_c_err(ret); else - btree_path_set_should_be_locked(iter->update_path); + btree_path_set_should_be_locked(trans->paths + iter->update_path); } if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) @@ -2206,103 +2268,6 @@ end: } /** - * bch2_btree_iter_peek_all_levels() - returns the first key greater than or - * equal to iterator's current position, returning keys from every level of the - * btree. For keys at different levels of the btree that compare equal, the key - * from the lower level (leaf) is returned first. - * @iter: iterator to peek from - * - * Returns: key if found, or an error extractable with bkey_err(). - */ -struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) -{ - struct btree_trans *trans = iter->trans; - struct bkey_s_c k; - int ret; - - EBUG_ON(iter->path->cached); - bch2_btree_iter_verify(iter); - BUG_ON(iter->path->level < iter->min_depth); - BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); - EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); - - while (1) { - iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, - iter->flags & BTREE_ITER_INTENT, - btree_iter_ip_allocated(iter)); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); - k = bkey_s_c_err(ret); - goto out_no_locked; - } - - /* Already at end? 
*/ - if (!btree_path_node(iter->path, iter->path->level)) { - k = bkey_s_c_null; - goto out_no_locked; - } - - k = btree_path_level_peek_all(trans->c, - &iter->path->l[iter->path->level], &iter->k); - - /* Check if we should go up to the parent node: */ - if (!k.k || - (iter->advanced && - bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) { - iter->pos = path_l(iter->path)->b->key.k.p; - btree_path_set_level_up(trans, iter->path); - iter->advanced = false; - continue; - } - - /* - * Check if we should go back down to a leaf: - * If we're not in a leaf node, we only return the current key - * if it exactly matches iter->pos - otherwise we first have to - * go back to the leaf: - */ - if (iter->path->level != iter->min_depth && - (iter->advanced || - !k.k || - !bpos_eq(iter->pos, k.k->p))) { - btree_path_set_level_down(trans, iter->path, iter->min_depth); - iter->pos = bpos_successor(iter->pos); - iter->advanced = false; - continue; - } - - /* Check if we should go to the next key: */ - if (iter->path->level == iter->min_depth && - iter->advanced && - k.k && - bpos_eq(iter->pos, k.k->p)) { - iter->pos = bpos_successor(iter->pos); - iter->advanced = false; - continue; - } - - if (iter->advanced && - iter->path->level == iter->min_depth && - !bpos_eq(k.k->p, iter->pos)) - iter->advanced = false; - - BUG_ON(iter->advanced); - BUG_ON(!k.k); - break; - } - - iter->pos = k.k->p; - btree_path_set_should_be_locked(iter->path); -out_no_locked: - bch2_btree_iter_verify(iter); - - return k; -} - -/** * bch2_btree_iter_next() - returns first key greater than iterator's current * position * @iter: iterator to peek from @@ -2328,14 +2293,14 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; - struct btree_path *saved_path = NULL; struct bkey_s_c k; struct bkey saved_k; const struct bch_val *saved_v; + btree_path_idx_t saved_path = 0; int ret; - EBUG_ON(iter->path->cached || iter->path->level); - EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); + EBUG_ON(btree_iter_path(trans, iter)->cached || + btree_iter_path(trans, iter)->level); if (iter->flags & BTREE_ITER_WITH_JOURNAL) return bkey_s_c_err(-EIO); @@ -2359,14 +2324,18 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) goto out_no_locked; } - k = btree_path_level_peek(trans, iter->path, - &iter->path->l[0], &iter->k); + struct btree_path *path = btree_iter_path(trans, iter); + + k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) ? 
bpos_ge(bkey_start_pos(k.k), search_key) : bpos_gt(k.k->p, search_key))) - k = btree_path_level_prev(trans, iter->path, - &iter->path->l[0], &iter->k); + k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); + + if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + trans->nr_updates)) + bch2_btree_trans_peek_prev_updates(trans, iter, &k); if (likely(k.k)) { if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { @@ -2382,13 +2351,13 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) bch2_path_put_nokeep(trans, iter->path, iter->flags & BTREE_ITER_INTENT); iter->path = saved_path; - saved_path = NULL; + saved_path = 0; iter->k = saved_k; k.v = saved_v; goto got_key; } - if (bch2_snapshot_is_ancestor(iter->trans->c, + if (bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { if (saved_path) @@ -2396,6 +2365,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) iter->flags & BTREE_ITER_INTENT); saved_path = btree_path_clone(trans, iter->path, iter->flags & BTREE_ITER_INTENT); + path = btree_iter_path(trans, iter); saved_k = *k.k; saved_v = k.v; } @@ -2412,10 +2382,11 @@ got_key: continue; } + btree_path_set_should_be_locked(path); break; - } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) { + } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { /* Advance to previous leaf node: */ - search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); + search_key = bpos_predecessor(path->l[0].b->data->min_key); } else { /* Start of btree: */ bch2_btree_iter_set_pos(iter, POS_MIN); @@ -2432,8 +2403,6 @@ got_key: if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) iter->pos.snapshot = iter->snapshot; - - btree_path_set_should_be_locked(iter->path); out_no_locked: if (saved_path) bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); @@ -2468,8 +2437,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); - EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); + EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && @@ -2493,13 +2461,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if ((iter->flags & BTREE_ITER_CACHED) || !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { - struct bkey_i *next_update; + k = bkey_s_c_null; - if ((next_update = btree_trans_peek_updates(iter)) && - bpos_eq(next_update->k.p, iter->pos)) { - iter->k = next_update->k; - k = bkey_i_to_s_c(next_update); - goto out; + if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + trans->nr_updates)) { + bch2_btree_trans_peek_slot_updates(trans, iter, &k); + if (k.k) + goto out; } if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && @@ -2514,7 +2482,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out_no_locked; } - k = bch2_btree_path_peek_slot(iter->path, &iter->k); + k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); if (unlikely(!k.k)) goto out_no_locked; } else { @@ -2524,7 +2492,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->flags & BTREE_ITER_IS_EXTENTS) end.offset = U64_MAX; - EBUG_ON(iter->path->level); + EBUG_ON(btree_iter_path(trans, iter)->level); if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter iter2; 
@@ -2570,7 +2538,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } out: - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2617,17 +2585,17 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans) struct btree_path *path; unsigned i; - BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); + BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1); - trans_for_each_path(trans, path) { + trans_for_each_path(trans, path, i) { BUG_ON(path->sorted_idx >= trans->nr_sorted); - BUG_ON(trans->sorted[path->sorted_idx] != path->idx); + BUG_ON(trans->sorted[path->sorted_idx] != i); } for (i = 0; i < trans->nr_sorted; i++) { unsigned idx = trans->sorted[i]; - EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); + BUG_ON(!test_bit(idx, trans->paths_allocated)); BUG_ON(trans->paths[idx].sorted_idx != i); } } @@ -2635,12 +2603,12 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans) static void btree_trans_verify_sorted(struct btree_trans *trans) { struct btree_path *path, *prev = NULL; - unsigned i; + struct trans_for_each_path_inorder_iter iter; if (!bch2_debug_check_iterators) return; - trans_for_each_path_inorder(trans, path, i) { + trans_for_each_path_inorder(trans, path, iter) { if (prev && btree_path_cmp(prev, path) > 0) { __bch2_dump_trans_paths_updates(trans, true); panic("trans paths out of order!\n"); @@ -2697,42 +2665,40 @@ out: static inline void btree_path_list_remove(struct btree_trans *trans, struct btree_path *path) { - unsigned i; - EBUG_ON(path->sorted_idx >= trans->nr_sorted); #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS trans->nr_sorted--; memmove_u64s_down_small(trans->sorted + path->sorted_idx, trans->sorted + path->sorted_idx + 1, - DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, + sizeof(u64) / sizeof(btree_path_idx_t))); #else array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); #endif - for (i = path->sorted_idx; i < trans->nr_sorted; i++) + for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) trans->paths[trans->sorted[i]].sorted_idx = i; - - path->sorted_idx = U8_MAX; } static inline void btree_path_list_add(struct btree_trans *trans, - struct btree_path *pos, - struct btree_path *path) + btree_path_idx_t pos, + btree_path_idx_t path_idx) { - unsigned i; + struct btree_path *path = trans->paths + path_idx; - path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted; + path->sorted_idx = pos ? 
trans->paths[pos].sorted_idx + 1 : trans->nr_sorted; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, trans->sorted + path->sorted_idx, - DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, + sizeof(u64) / sizeof(btree_path_idx_t))); trans->nr_sorted++; - trans->sorted[path->sorted_idx] = path->idx; + trans->sorted[path->sorted_idx] = path_idx; #else - array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx); #endif - for (i = path->sorted_idx; i < trans->nr_sorted; i++) + for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) trans->paths[trans->sorted[i]].sorted_idx = i; btree_trans_verify_sorted_refs(trans); @@ -2749,9 +2715,10 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, iter->flags & BTREE_ITER_INTENT); - iter->path = NULL; - iter->update_path = NULL; - iter->key_cache_path = NULL; + iter->path = 0; + iter->update_path = 0; + iter->key_cache_path = 0; + iter->trans = NULL; } void bch2_trans_iter_init_outlined(struct btree_trans *trans, @@ -2782,41 +2749,46 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, iter->min_depth = depth; - BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); - BUG_ON(iter->path->level != depth); - BUG_ON(iter->min_depth != depth); + struct btree_path *path = btree_iter_path(trans, iter); + BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(path->level != depth); + BUG_ON(iter->min_depth != depth); } void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) { + struct btree_trans *trans = src->trans; + *dst = *src; if (src->path) - __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); if (src->update_path) - __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); - dst->key_cache_path = NULL; + __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT); + dst->key_cache_path = 0; } void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { + struct bch_fs *c = trans->c; unsigned new_top = trans->mem_top + size; - size_t old_bytes = trans->mem_bytes; - size_t new_bytes = roundup_pow_of_two(new_top); + unsigned old_bytes = trans->mem_bytes; + unsigned new_bytes = roundup_pow_of_two(new_top); int ret; void *new_mem; void *p; - trans->mem_max = max(trans->mem_max, new_top); - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + struct btree_transaction_stats *s = btree_trans_stats(trans); + s->max_mem = max(s->max_mem, new_bytes); + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; kfree(trans->mem); } @@ -2836,7 +2808,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trans->mem_bytes = new_bytes; if (old_bytes) { - trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); return 
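/*
 * Illustrative sketch, not bcachefs code: btree_path_list_add() above shifts
 * the sorted[] index array (the memmove size is now expressed in terms of
 * sizeof(btree_path_idx_t) rather than a bare 8) and then refreshes every
 * path's sorted_idx back-pointer.  A plain-C version of that bookkeeping with
 * invented names:
 */
#include <stdio.h>
#include <string.h>

#define NR 8

struct path { unsigned sorted_idx; };

static struct path paths[NR];
static unsigned char sorted[NR];	/* path indices in iteration order */
static unsigned nr_sorted;

static void list_add_at(unsigned pos, unsigned char path_idx)
{
	memmove(sorted + pos + 1, sorted + pos, nr_sorted - pos);
	sorted[pos] = path_idx;
	nr_sorted++;

	for (unsigned i = pos; i < nr_sorted; i++)
		paths[sorted[i]].sorted_idx = i;
}

int main(void)
{
	list_add_at(0, 3);
	list_add_at(0, 5);
	list_add_at(1, 1);

	for (unsigned i = 0; i < nr_sorted; i++)
		printf("sorted[%u] = %u (back-pointer %u)\n",
		       i, sorted[i], paths[sorted[i]].sorted_idx);
	return 0;
}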
ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } @@ -2858,8 +2830,9 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans) if (trans->srcu_held) { struct bch_fs *c = trans->c; struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->cached && !btree_node_locked(path, 0)) path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); @@ -2869,7 +2842,7 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans) } } -void bch2_trans_srcu_lock(struct btree_trans *trans) +static void bch2_trans_srcu_lock(struct btree_trans *trans) { if (!trans->srcu_held) { trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier); @@ -2891,14 +2864,16 @@ void bch2_trans_srcu_lock(struct btree_trans *trans) u32 bch2_trans_begin(struct btree_trans *trans) { struct btree_path *path; + unsigned i; u64 now; bch2_trans_reset_updates(trans); trans->restart_count++; trans->mem_top = 0; + trans->journal_entries = NULL; - trans_for_each_path(trans, path) { + trans_for_each_path(trans, path, i) { path->should_be_locked = false; /* @@ -2915,15 +2890,21 @@ u32 bch2_trans_begin(struct btree_trans *trans) * iterators if we do that */ if (!path->ref && !path->preserve) - __bch2_path_free(trans, path); + __bch2_path_free(trans, i); else path->preserve = false; } now = local_clock(); + + if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) && + time_after64(now, trans->last_begin_time + 10)) + __bch2_time_stats_update(&btree_trans_stats(trans)->duration, + trans->last_begin_time, now); + if (!trans->restarted && (need_resched() || - now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { + time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) { drop_locks_do(trans, (cond_resched(), 0)); now = local_clock(); } @@ -2942,32 +2923,11 @@ u32 bch2_trans_begin(struct btree_trans *trans) return trans->restart_count; } -static struct btree_trans *bch2_trans_alloc(struct bch_fs *c) -{ - struct btree_trans *trans; - - if (IS_ENABLED(__KERNEL__)) { - trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); - if (trans) - return trans; - } - - trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); - /* - * paths need to be zeroed, bch2_check_for_deadlock looks at - * paths in other threads - */ - memset(&trans->paths, 0, sizeof(trans->paths)); - return trans; -} - -const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; +const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" }; unsigned bch2_trans_get_fn_idx(const char *fn) { - unsigned i; - - for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) + for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) if (!bch2_btree_transaction_fns[i] || bch2_btree_transaction_fns[i] == fn) { bch2_btree_transaction_fns[i] = fn; @@ -2975,76 +2935,92 @@ unsigned bch2_trans_get_fn_idx(const char *fn) } pr_warn_once("BCH_TRANSACTIONS_NR not big enough!"); - return i; + return 0; } struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) __acquires(&c->btree_trans_barrier) { struct btree_trans *trans; - struct btree_transaction_stats *s; - trans = bch2_trans_alloc(c); - - memset(trans, 0, sizeof(*trans)); - trans->c = c; - trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) - ? 
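/*
 * Illustrative userspace sketch, not bcachefs code: __bch2_trans_kmalloc()
 * above is a bump allocator over trans->mem that grows the buffer to the next
 * power of two whenever a request does not fit.  A minimal analogue, without
 * the GFP_NOWAIT / unlock-and-retry / mempool fallbacks:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct arena {
	char   *mem;
	size_t  top, bytes;
};

static size_t roundup_pow_of_two(size_t n)
{
	size_t r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

static void *arena_alloc(struct arena *a, size_t size)
{
	size_t new_top = a->top + size;

	if (new_top > a->bytes) {
		size_t new_bytes = roundup_pow_of_two(new_top);
		char *new_mem = realloc(a->mem, new_bytes);

		if (!new_mem)
			return NULL;
		a->mem = new_mem;
		a->bytes = new_bytes;
	}

	void *p = a->mem + a->top;
	a->top = new_top;
	return memset(p, 0, size);
}

int main(void)
{
	struct arena a = { 0 };

	arena_alloc(&a, 40);
	arena_alloc(&a, 100);
	printf("top %zu, capacity %zu\n", a.top, a.bytes);	/* 140, 256 */
	free(a.mem);
	return 0;
}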
bch2_btree_transaction_fns[fn_idx] : NULL; - trans->last_begin_time = local_clock(); - trans->fn_idx = fn_idx; - trans->locking_wait.task = current; - trans->journal_replay_not_finished = - unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && - atomic_inc_not_zero(&c->journal_keys.ref); - closure_init_stack(&trans->ref); - - s = btree_trans_stats(trans); - if (s && s->max_mem) { - unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); - - trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); - - if (!unlikely(trans->mem)) { - trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); - trans->mem_bytes = BTREE_TRANS_MEM_MAX; - } else { - trans->mem_bytes = expected_mem_bytes; + if (IS_ENABLED(__KERNEL__)) { + trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); + if (trans) { + memset(trans, 0, offsetof(struct btree_trans, list)); + goto got_trans; } } - if (s) { - trans->nr_max_paths = s->nr_max_paths; - trans->wb_updates_size = s->wb_updates_size; - } - - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - trans->srcu_lock_time = jiffies; - trans->srcu_held = true; + trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); + memset(trans, 0, sizeof(*trans)); + closure_init_stack(&trans->ref); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + seqmutex_lock(&c->btree_trans_lock); + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct btree_trans *pos; + pid_t pid = current->pid; + + trans->locking_wait.task = current; - seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(pos, &c->btree_trans_list, list) { + struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task); /* * We'd much prefer to be stricter here and completely * disallow multiple btree_trans in the same thread - * but the data move path calls bch2_write when we * already have a btree_trans initialized. 
*/ - BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid && + BUG_ON(pos_task && + pid == pos_task->pid && bch2_trans_locked(pos)); - if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { + if (pos_task && pid < pos_task->pid) { list_add_tail(&trans->list, &pos->list); goto list_add_done; } } - list_add_tail(&trans->list, &c->btree_trans_list); + } + list_add_tail(&trans->list, &c->btree_trans_list); list_add_done: - seqmutex_unlock(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); +got_trans: + trans->c = c; + trans->last_begin_time = local_clock(); + trans->fn_idx = fn_idx; + trans->locking_wait.task = current; + trans->journal_replay_not_finished = + unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && + atomic_inc_not_zero(&c->journal_keys.ref); + trans->nr_paths = ARRAY_SIZE(trans->_paths); + trans->paths_allocated = trans->_paths_allocated; + trans->sorted = trans->_sorted; + trans->paths = trans->_paths; + trans->updates = trans->_updates; + + *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL; + + trans->paths_allocated[0] = 1; + + if (fn_idx < BCH_TRANSACTIONS_NR) { + trans->fn = bch2_btree_transaction_fns[fn_idx]; + + struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx]; + + if (s->max_mem) { + unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); + + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); + if (likely(trans->mem)) + trans->mem_bytes = expected_mem_bytes; + } + + trans->nr_paths_max = s->nr_max_paths; + trans->journal_entries_size = s->journal_entries_size; } + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; + trans->srcu_held = true; return trans; } @@ -3053,14 +3029,15 @@ static void check_btree_paths_leaked(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG struct bch_fs *c = trans->c; struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->ref) goto leaked; return; leaked: bch_err(c, "btree paths leaked from %s!", trans->fn); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->ref) printk(KERN_ERR " btree %s %pS\n", bch2_btree_id_str(path->btree_id), @@ -3073,26 +3050,14 @@ leaked: void bch2_trans_put(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { - struct btree_insert_entry *i; struct bch_fs *c = trans->c; - struct btree_transaction_stats *s = btree_trans_stats(trans); bch2_trans_unlock(trans); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { - seqmutex_lock(&c->btree_trans_lock); - list_del(&trans->list); - seqmutex_unlock(&c->btree_trans_lock); - } - - closure_sync(&trans->ref); - - if (s) - s->max_mem = max(s->max_mem, trans->mem_max); - trans_for_each_update(trans, i) - __btree_path_put(i->path, true); - trans->nr_updates = 0; + __btree_path_put(trans->paths + i->path, true); + trans->nr_updates = 0; + trans->locking_wait.task = NULL; check_btree_paths_leaked(trans); @@ -3101,8 +3066,6 @@ void bch2_trans_put(struct btree_trans *trans) srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); } - kfree(trans->extra_journal_entries.data); - if (trans->fs_usage_deltas) { if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == REPLICAS_DELTA_LIST_MAX) @@ -3115,6 +3078,13 @@ void bch2_trans_put(struct btree_trans *trans) if (unlikely(trans->journal_replay_not_finished)) bch2_journal_keys_put(c); + unsigned long *paths_allocated = trans->paths_allocated; + trans->paths_allocated = NULL; + 
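/*
 * Illustrative sketch, not bcachefs code: in the debug build, __bch2_trans_get()
 * above walks c->btree_trans_list and inserts the new transaction before the
 * first entry with a larger pid, keeping the per-filesystem list ordered.  The
 * same ordered insert on a toy singly-linked list (names invented):
 */
#include <stdio.h>
#include <stddef.h>

struct trans {
	int pid;
	struct trans *next;
};

static void trans_list_add(struct trans **head, struct trans *new)
{
	struct trans **pp = head;

	while (*pp && (*pp)->pid < new->pid)
		pp = &(*pp)->next;
	new->next = *pp;
	*pp = new;
}

int main(void)
{
	struct trans a = { 30, NULL }, b = { 10, NULL }, c = { 20, NULL };
	struct trans *head = NULL;

	trans_list_add(&head, &a);
	trans_list_add(&head, &b);
	trans_list_add(&head, &c);

	for (struct trans *t = head; t; t = t->next)
		printf("pid %d\n", t->pid);	/* 10 20 30 */
	return 0;
}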
trans->paths = NULL; + + if (paths_allocated != trans->_paths_allocated) + kfree_rcu_mightsleep(paths_allocated); + if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) mempool_free(trans->mem, &c->btree_trans_mem_pool); else @@ -3123,8 +3093,16 @@ void bch2_trans_put(struct btree_trans *trans) /* Userspace doesn't have a real percpu implementation: */ if (IS_ENABLED(__KERNEL__)) trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); - if (trans) + + if (trans) { + closure_sync(&trans->ref); + + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + mempool_free(trans, &c->btree_trans_pool); + } } static void __maybe_unused @@ -3152,24 +3130,38 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) { - struct btree_path *path; struct btree_bkey_cached_common *b; static char lock_types[] = { 'r', 'i', 'w' }; + struct task_struct *task = READ_ONCE(trans->locking_wait.task); unsigned l, idx; + /* before rcu_read_lock(): */ + bch2_printbuf_make_room(out, 4096); + if (!out->nr_tabstops) { printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 32); } - prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); + prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn); + + /* trans->paths is rcu protected vs. freeing */ + rcu_read_lock(); + out->atomic++; + + struct btree_path *paths = rcu_dereference(trans->paths); + if (!paths) + goto out; + + unsigned long *paths_allocated = trans_paths_allocated(paths); - trans_for_each_path_safe(trans, path, idx) { + trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) { + struct btree_path *path = paths + idx; if (!path->nodes_locked) continue; prt_printf(out, " path %u %c l=%u %s:", - path->idx, + idx, path->cached ? 
'c' : 'b', path->level, bch2_btree_id_str(path->btree_id)); @@ -3197,6 +3189,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) bch2_btree_bkey_cached_common_to_text(out, b); prt_newline(out); } +out: + --out->atomic; + rcu_read_unlock(); } void bch2_fs_btree_iter_exit(struct bch_fs *c) @@ -3205,15 +3200,26 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) struct btree_trans *trans; int cpu; + if (c->btree_trans_bufs) + for_each_possible_cpu(cpu) { + struct btree_trans *trans = + per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; + + if (trans) { + closure_sync(&trans->ref); + + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + } + kfree(trans); + } + free_percpu(c->btree_trans_bufs); + trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); if (trans) panic("%s leaked btree_trans\n", trans->fn); - if (c->btree_trans_bufs) - for_each_possible_cpu(cpu) - kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans); - free_percpu(c->btree_trans_bufs); - for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { @@ -3234,6 +3240,7 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { + bch2_time_stats_init(&s->duration); bch2_time_stats_init(&s->lock_hold_times); mutex_init(&s->lock); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index eaffced4c132..da2b74fa63fc 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -63,60 +63,57 @@ static inline void btree_trans_sort_paths(struct btree_trans *trans) __bch2_btree_trans_sort_paths(trans); } -static inline struct btree_path * -__trans_next_path(struct btree_trans *trans, unsigned idx) +static inline unsigned long *trans_paths_nr(struct btree_path *paths) { - u64 l; - - if (idx == BTREE_ITER_MAX) - return NULL; - - l = trans->paths_allocated >> idx; - if (!l) - return NULL; - - idx += __ffs64(l); - EBUG_ON(idx >= BTREE_ITER_MAX); - EBUG_ON(trans->paths[idx].idx != idx); - return &trans->paths[idx]; + return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths; } -#define trans_for_each_path_from(_trans, _path, _start) \ - for (_path = __trans_next_path((_trans), _start); \ - (_path); \ - _path = __trans_next_path((_trans), (_path)->idx + 1)) - -#define trans_for_each_path(_trans, _path) \ - trans_for_each_path_from(_trans, _path, 0) - -static inline struct btree_path * -__trans_next_path_safe(struct btree_trans *trans, unsigned *idx) +static inline unsigned long *trans_paths_allocated(struct btree_path *paths) { - u64 l; + unsigned long *v = trans_paths_nr(paths); + return v - BITS_TO_LONGS(*v); +} - if (*idx == BTREE_ITER_MAX) - return NULL; +#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\ + for (_idx = _start; \ + (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \ + _idx++) - l = trans->paths_allocated >> *idx; - if (!l) - return NULL; +static inline struct btree_path * +__trans_next_path(struct btree_trans *trans, unsigned *idx) +{ + unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG; + /* + * Open coded find_next_bit(), because + * - this is fast path, we can't afford the function call + * - and we know that nr_paths is a multiple of BITS_PER_LONG, + */ + while (*idx < trans->nr_paths) { + unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1)); + if (v) { + *idx += 
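/*
 * Illustrative sketch, not bcachefs code: trans_paths_nr() above recovers the
 * array length from a header stored immediately before the paths array
 * (container_of on paths[0]), so code that only holds the paths pointer --
 * including the RCU readers in bch2_btree_trans_to_text() -- can still find
 * the bounds.  A userspace equivalent with invented names:
 */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct path { int level; };

struct paths_hdr {
	unsigned long nr;
	struct path   paths[];
};

static unsigned long *paths_nr(struct path *paths)
{
	struct paths_hdr *h = (struct paths_hdr *)
		((char *) paths - offsetof(struct paths_hdr, paths));
	return &h->nr;
}

int main(void)
{
	struct paths_hdr *h = calloc(1, sizeof(*h) + 8 * sizeof(struct path));

	if (!h)
		return 1;
	h->nr = 8;

	struct path *paths = h->paths;		/* only this pointer gets passed around */

	printf("nr = %lu\n", *paths_nr(paths));	/* 8 */
	free(h);
	return 0;
}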
__ffs(v); + return trans->paths + *idx; + } + + *idx += BITS_PER_LONG; + *idx &= ~(BITS_PER_LONG - 1); + w++; + } - *idx += __ffs64(l); - EBUG_ON(*idx >= BTREE_ITER_MAX); - return &trans->paths[*idx]; + return NULL; } /* * This version is intended to be safe for use on a btree_trans that is owned by * another thread, for bch2_btree_trans_to_text(); */ -#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \ +#define trans_for_each_path_from(_trans, _path, _idx, _start) \ for (_idx = _start; \ - (_path = __trans_next_path_safe((_trans), &_idx)); \ + (_path = __trans_next_path((_trans), &_idx)); \ _idx++) -#define trans_for_each_path_safe(_trans, _path, _idx) \ - trans_for_each_path_safe_from(_trans, _path, _idx, 0) +#define trans_for_each_path(_trans, _path, _idx) \ + trans_for_each_path_from(_trans, _path, _idx, 1) static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) { @@ -138,10 +135,23 @@ static inline struct btree_path *prev_btree_path(struct btree_trans *trans, stru : NULL; } -#define trans_for_each_path_inorder(_trans, _path, _i) \ - for (_i = 0; \ - ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ - _i++) +#define trans_for_each_path_idx_inorder(_trans, _iter) \ + for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \ + (_iter.path_idx = trans->sorted[_iter.sorted_idx], \ + _iter.sorted_idx < (_trans)->nr_sorted); \ + _iter.sorted_idx++) + +struct trans_for_each_path_inorder_iter { + btree_path_idx_t sorted_idx; + btree_path_idx_t path_idx; +}; + +#define trans_for_each_path_inorder(_trans, _path, _iter) \ + for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \ + (_iter.path_idx = trans->sorted[_iter.sorted_idx], \ + _path = (_trans)->paths + _iter.path_idx, \ + _iter.sorted_idx < (_trans)->nr_sorted); \ + _iter.sorted_idx++) #define trans_for_each_path_inorder_reverse(_trans, _path, _i) \ for (_i = trans->nr_sorted - 1; \ @@ -157,67 +167,65 @@ static inline bool __path_has_node(const struct btree_path *path, static inline struct btree_path * __trans_next_path_with_node(struct btree_trans *trans, struct btree *b, - unsigned idx) + unsigned *idx) { - struct btree_path *path = __trans_next_path(trans, idx); + struct btree_path *path; - while (path && !__path_has_node(path, b)) - path = __trans_next_path(trans, path->idx + 1); + while ((path = __trans_next_path(trans, idx)) && + !__path_has_node(path, b)) + (*idx)++; return path; } -#define trans_for_each_path_with_node(_trans, _b, _path) \ - for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ - (_path); \ - _path = __trans_next_path_with_node((_trans), (_b), \ - (_path)->idx + 1)) +#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \ + for (_iter = 1; \ + (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\ + _iter++) -struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, - bool, unsigned long); +btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t, + bool, unsigned long); -static inline struct btree_path * __must_check +static inline btree_path_idx_t __must_check bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent, + btree_path_idx_t path, bool intent, unsigned long ip) { - if (path->ref > 1 || path->preserve) + if (trans->paths[path].ref > 1 || + trans->paths[path].preserve) path = __bch2_btree_path_make_mut(trans, path, intent, ip); - path->should_be_locked = false; + 
trans->paths[path].should_be_locked = false; return path; } -struct btree_path * __must_check -__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, - struct bpos, bool, unsigned long, int); +btree_path_idx_t __must_check +__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t, + struct bpos, bool, unsigned long); -static inline struct btree_path * __must_check +static inline btree_path_idx_t __must_check bch2_btree_path_set_pos(struct btree_trans *trans, - struct btree_path *path, struct bpos new_pos, - bool intent, unsigned long ip) + btree_path_idx_t path, struct bpos new_pos, + bool intent, unsigned long ip) { - int cmp = bpos_cmp(new_pos, path->pos); - - return cmp - ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp) + return !bpos_eq(new_pos, trans->paths[path].pos) + ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip) : path; } -int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *, +int __must_check bch2_btree_path_traverse_one(struct btree_trans *, + btree_path_idx_t, unsigned, unsigned long); static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, - struct btree_path *path, unsigned flags) + btree_path_idx_t path, unsigned flags) { - if (path->uptodate < BTREE_ITER_NEED_RELOCK) + if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) return 0; return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_); } -int __must_check bch2_btree_path_traverse(struct btree_trans *, - struct btree_path *, unsigned); -struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, +btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned, unsigned long); struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); @@ -269,7 +277,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); -void bch2_path_put(struct btree_trans *, struct btree_path *, bool); +void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool); int bch2_trans_relock(struct btree_trans *); int bch2_trans_relock_notrace(struct btree_trans *); @@ -335,7 +343,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans, void bch2_trans_downgrade(struct btree_trans *); -void bch2_trans_node_add(struct btree_trans *trans, struct btree *); +void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); @@ -348,8 +356,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *); - static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { return bch2_btree_iter_peek_upto(iter, SPOS_MAX); @@ -376,10 +382,12 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { + struct btree_trans *trans = iter->trans; + if (unlikely(iter->update_path)) - bch2_path_put(iter->trans, iter->update_path, + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = NULL; + 
iter->update_path = 0; if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) new_pos.snapshot = iter->snapshot; @@ -408,9 +416,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, unsigned btree_id, unsigned flags) { - if (flags & BTREE_ITER_ALL_LEVELS) - flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; - if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && btree_id_is_extents(btree_id)) flags |= BTREE_ITER_IS_EXTENTS; @@ -450,14 +455,16 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, unsigned flags, unsigned long ip) { - memset(iter, 0, sizeof(*iter)); - iter->trans = trans; - iter->btree_id = btree_id; - iter->flags = flags; - iter->snapshot = pos.snapshot; - iter->pos = pos; - iter->k.p = pos; - + iter->trans = trans; + iter->update_path = 0; + iter->key_cache_path = 0; + iter->btree_id = btree_id; + iter->min_depth = 0; + iter->flags = flags; + iter->snapshot = pos.snapshot; + iter->pos = pos; + iter->k = POS_KEY(pos); + iter->journal_idx = 0; #ifdef CONFIG_BCACHEFS_DEBUG iter->ip_allocated = ip; #endif @@ -489,8 +496,10 @@ void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); static inline void set_btree_iter_dontneed(struct btree_iter *iter) { - if (!iter->trans->restarted) - iter->path->preserve = false; + struct btree_trans *trans = iter->trans; + + if (!trans->restarted) + btree_iter_path(trans, iter)->preserve = false; } void *__bch2_trans_kmalloc(struct btree_trans *, size_t); @@ -512,7 +521,7 @@ static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) { - size = roundup(size, 8); + size = round_up(size, 8); if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; @@ -581,7 +590,6 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, KEY_TYPE_##_type, sizeof(*_val), _val) void bch2_trans_srcu_unlock(struct btree_trans *); -void bch2_trans_srcu_lock(struct btree_trans *); u32 bch2_trans_begin(struct btree_trans *); @@ -606,8 +614,6 @@ u32 bch2_trans_begin(struct btree_trans *); static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) { - BUG_ON(flags & BTREE_ITER_ALL_LEVELS); - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek_prev(iter); } @@ -615,8 +621,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter * static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) : - flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + return flags & BTREE_ITER_SLOTS ? 
bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek(iter); } @@ -633,61 +638,34 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * return bch2_btree_iter_peek_slot(iter); } +int __bch2_btree_trans_too_many_iters(struct btree_trans *); + static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) { - trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); - } + if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8) + return __bch2_btree_trans_too_many_iters(trans); return 0; } -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); - -static inline struct bkey_s_c -__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_s_c k; - - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_type(iter, flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); - - return k; -} - -static inline struct bkey_s_c -__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end, - unsigned flags) -{ - struct bkey_s_c k; - - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_upto_type(iter, end, flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); - - return k; -} - +/* + * goto instead of loop, so that when used inside for_each_btree_key2() + * break/continue work correctly + */ #define lockrestart_do(_trans, _do) \ ({ \ + __label__ transaction_restart; \ u32 _restart_count; \ int _ret2; \ +transaction_restart: \ + _restart_count = bch2_trans_begin(_trans); \ + _ret2 = (_do); \ \ - do { \ - _restart_count = bch2_trans_begin(_trans); \ - _ret2 = (_do); \ - } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \ + if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \ + goto transaction_restart; \ \ if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ - \ _ret2; \ }) @@ -716,91 +694,56 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) -#define for_each_btree_key2(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ +#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _do) \ ({ \ + struct btree_iter _iter; \ + struct bkey_s_c _k; \ int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - while (1) { \ - u32 _restart_count = bch2_trans_begin(_trans); \ - \ - _ret3 = 0; \ - (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ - if (!(_k).k) \ - break; \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ + _end, (_flags)); \ + if (!(_k).k) \ + break; \ \ - _ret3 = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ - continue; \ - if (_ret3) \ - break; \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - if (!bch2_btree_iter_advance(&(_iter))) \ - break; \ - } \ + bkey_err(_k) ?: (_do); \ + })); \ + } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ }) -#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _do) \ -({ \ - int _ret3 = 
0; \ - \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - while (1) { \ - u32 _restart_count = bch2_trans_begin(_trans); \ - \ - _ret3 = 0; \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\ - if (!(_k).k) \ - break; \ - \ - _ret3 = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ - continue; \ - if (_ret3) \ - break; \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - if (!bch2_btree_iter_advance(&(_iter))) \ - break; \ - } \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) +#define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ + for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \ + SPOS_MAX, _flags, _k, _do) #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ + struct btree_iter _iter; \ + struct bkey_s_c _k; \ int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - while (1) { \ - u32 _restart_count = bch2_trans_begin(_trans); \ - (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ - if (!(_k).k) { \ - _ret3 = 0; \ - break; \ - } \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \ + (_flags)); \ + if (!(_k).k) \ + break; \ \ - _ret3 = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ - continue; \ - if (_ret3) \ - break; \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - if (!bch2_btree_iter_rewind(&(_iter))) \ - break; \ - } \ + bkey_err(_k) ?: (_do); \ + })); \ + } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ @@ -810,7 +753,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _start, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ _do) \ - for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) @@ -826,32 +769,31 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _start, _end, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ _do) \ - for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ + for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) -#define for_each_btree_key(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); -#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \ - &(_iter), _end, _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) +static inline struct bkey_s_c +__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) +{ + struct bkey_s_c k; -#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ 
+ while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_type(iter, flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(trans); + + return k; +} + +#define for_each_btree_key_old(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) @@ -863,24 +805,20 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) -#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ - for (; \ - (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) - -#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ - for (; \ - (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) - #define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ for (; \ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) +#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ + SPOS_MAX, _flags, _k, _ret) + +#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ + for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) + #define drop_locks_do(_trans, _do) \ ({ \ bch2_trans_unlock(_trans); \ @@ -912,10 +850,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _p; \ }) -/* new multiple iterator interface: */ - void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); -void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index ec52f50d249d..719a94a84950 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -73,6 +73,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys, return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); } +/* Returns first non-overwritten key >= search key: */ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos pos, struct bpos end_pos, size_t *idx) @@ -86,12 +87,26 @@ search: if (!*idx) *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + while (*idx && + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { + --(*idx); + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + while ((k = *idx < keys->nr ? 
idx_to_key(keys, *idx) : NULL)) { if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) return NULL; - if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && - !k->overwritten) + if (k->overwritten) { + (*idx)++; + continue; + } + + if (__journal_key_cmp(btree_id, level, pos, k) <= 0) return k->k; (*idx)++; @@ -162,7 +177,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); - BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); if (idx < keys->size && journal_key_cmp(&n, &keys->d[idx]) == 0) { @@ -452,9 +467,7 @@ static void __journal_keys_sort(struct journal_keys *keys) src = dst = keys->d; while (src < keys->d + keys->nr) { while (src + 1 < keys->d + keys->nr && - src[0].btree_id == src[1].btree_id && - src[0].level == src[1].level && - bpos_eq(src[0].k->k.p, src[1].k->k.p)) + !journal_key_cmp(src, src + 1)) src++; *dst++ = *src++; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 1b7a5668df7c..74e52fd28abe 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -630,7 +630,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, if (ret) goto out; - ck = (void *) c_iter.path->l[0].b; + ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; if (!ck) goto out; @@ -645,22 +645,29 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, if (journal_seq && ck->journal.seq != journal_seq) goto out; + trans->journal_res.seq = ck->journal.seq; + /* - * Since journal reclaim depends on us making progress here, and the - * allocator/copygc depend on journal reclaim making progress, we need - * to be using alloc reserves: + * If we're at the end of the journal, we really want to free up space + * in the journal right away - we don't want to pin that old journal + * sequence number with a new btree node write, we want to re-journal + * the update */ + if (ck->journal.seq == journal_last_seq(j)) + commit_flags |= BCH_WATERMARK_reclaim; + + if (ck->journal.seq != journal_last_seq(j) || + j->watermark == BCH_WATERMARK_stripe) + commit_flags |= BCH_TRANS_COMMIT_no_journal_res; + ret = bch2_btree_iter_traverse(&b_iter) ?: bch2_trans_update(trans, &b_iter, ck->k, BTREE_UPDATE_KEY_CACHE_RECLAIM| BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - (ck->journal.seq == journal_last_seq(j) - ? 
BCH_WATERMARK_reclaim - : 0)| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| commit_flags); bch2_fs_fatal_err_on(ret && @@ -673,7 +680,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, bch2_journal_pin_drop(j, &ck->journal); - BUG_ON(!btree_node_locked(c_iter.path, 0)); + struct btree_path *path = btree_iter_path(trans, &c_iter); + BUG_ON(!btree_node_locked(path, 0)); if (!evict) { if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { @@ -682,19 +690,20 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, } } else { struct btree_path *path2; + unsigned i; evict: - trans_for_each_path(trans, path2) - if (path2 != c_iter.path) + trans_for_each_path(trans, path2, i) + if (path2 != path) __bch2_btree_path_unlock(trans, path2); - bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c); + bch2_btree_node_lock_write_nofail(trans, path, &ck->c); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_dec(&c->btree_key_cache.nr_dirty); } - mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED); + mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); bkey_cached_evict(&c->btree_key_cache, ck); bkey_cached_free_fast(&c->btree_key_cache, ck); } @@ -732,9 +741,9 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, } six_unlock_read(&ck->c.lock); - ret = commit_do(trans, NULL, NULL, 0, + ret = lockrestart_do(trans, btree_key_cache_flush_pos(trans, key, seq, - BTREE_INSERT_JOURNAL_RECLAIM, false)); + BCH_TRANS_COMMIT_journal_reclaim, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); @@ -742,28 +751,12 @@ unlock: return ret; } -/* - * Flush and evict a key from the key cache: - */ -int bch2_btree_key_cache_flush(struct btree_trans *trans, - enum btree_id id, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct bkey_cached_key key = { id, pos }; - - /* Fastpath - assume it won't be found: */ - if (!bch2_btree_key_cache_find(c, id, pos)) - return 0; - - return btree_key_cache_flush_pos(trans, key, 0, 0, true); -} - bool bch2_btree_insert_key_cached(struct btree_trans *trans, unsigned flags, struct btree_insert_entry *insert_entry) { struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) insert_entry->path->l[0].b; + struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b; struct bkey_i *insert = insert_entry->k; bool kick_reclaim = false; @@ -773,7 +766,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); set_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_inc(&c->btree_key_cache.nr_dirty); @@ -1000,7 +993,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) if (atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal) && - test_bit(BCH_FS_WAS_RW, &c->flags)) + test_bit(BCH_FS_was_rw, &c->flags)) panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", atomic_long_read(&bc->nr_dirty)); diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index be3acde2caa0..e6b2cd0dd2c1 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -31,8 +31,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, struct btree_insert_entry *); -int bch2_btree_key_cache_flush(struct btree_trans *, - 
enum btree_id, struct bpos); void bch2_btree_key_cache_drop(struct btree_trans *, struct btree_path *); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 3d48834d091f..2d1c95c42f24 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -32,13 +32,14 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, { struct btree_path *path; struct six_lock_count ret; + unsigned i; memset(&ret, 0, sizeof(ret)); if (IS_ERR_OR_NULL(b)) return ret; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path != skip && &path->l[level].b->c == b) { int t = btree_node_locked_type(path, level); @@ -85,8 +86,14 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) prt_printf(out, "Found lock cycle (%u entries):", g->nr); prt_newline(out); - for (i = g->g; i < g->g + g->nr; i++) + for (i = g->g; i < g->g + g->nr; i++) { + struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); + if (!task) + continue; + bch2_btree_trans_to_text(out, i->trans); + bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1); + } } static noinline void print_chain(struct printbuf *out, struct lock_graph *g) @@ -94,9 +101,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + g->nr; i++) { + struct task_struct *task = i->trans->locking_wait.task; if (i != g->g) prt_str(out, "<- "); - prt_printf(out, "%u ", i->trans->locking_wait.task->pid); + prt_printf(out, "%u ", task ?task->pid : 0); } prt_newline(out); } @@ -142,10 +150,27 @@ static bool lock_graph_remove_non_waiters(struct lock_graph *g) return false; } +static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + + count_event(c, trans_restart_would_deadlock); + + if (trace_trans_restart_would_deadlock_enabled()) { + struct printbuf buf = PRINTBUF; + + buf.atomic++; + print_cycle(&buf, g); + + trace_trans_restart_would_deadlock(trans, buf.buf); + printbuf_exit(&buf); + } +} + static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { - trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); + trace_would_deadlock(g, i->trans); return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); } else { i->trans->lock_must_abort = true; @@ -202,7 +227,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) prt_printf(&buf, "backtrace:"); prt_newline(&buf); printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2); printbuf_indent_sub(&buf, 2); prt_newline(&buf); } @@ -262,27 +287,40 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) struct lock_graph g; struct trans_waiting_for_lock *top; struct btree_bkey_cached_common *b; - struct btree_path *path; - unsigned path_idx; - int ret; + btree_path_idx_t path_idx; + int ret = 0; + + g.nr = 0; if (trans->lock_must_abort) { if (cycle) return -1; - trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_); + trace_would_deadlock(&g, trans); return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); } - g.nr = 0; lock_graph_down(&g, trans); + + /* trans->paths is rcu protected vs. 
freeing */ + rcu_read_lock(); + if (cycle) + cycle->atomic++; next: if (!g.nr) - return 0; + goto out; top = &g.g[g.nr - 1]; - trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) { + struct btree_path *paths = rcu_dereference(top->trans->paths); + if (!paths) + goto up; + + unsigned long *paths_allocated = trans_paths_allocated(paths); + + trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), + path_idx, top->path_idx) { + struct btree_path *path = paths + path_idx; if (!path->nodes_locked) continue; @@ -348,18 +386,23 @@ next: ret = lock_graph_descend(&g, trans, cycle); if (ret) - return ret; + goto out; goto next; } raw_spin_unlock(&b->lock.wait_lock); } } - +up: if (g.nr > 1 && cycle) print_chain(cycle, &g); lock_graph_up(&g); goto next; +out: + if (cycle) + --cycle->atomic; + rcu_read_unlock(); + return ret; } int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) @@ -398,7 +441,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, struct btree_bkey_cached_common *b) { struct btree_path *linked; - unsigned i; + unsigned i, iter; int ret; /* @@ -412,7 +455,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, * already taken are no longer needed: */ - trans_for_each_path(trans, linked) { + trans_for_each_path(trans, linked, iter) { if (!linked->nodes_locked) continue; @@ -624,8 +667,6 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, unsigned new_locks_want, struct get_locks_fail *f) { - struct btree_path *linked; - if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f)) return true; @@ -648,8 +689,11 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, * before interior nodes - now that's handled by * bch2_btree_path_traverse_all(). 
*/ - if (!path->cached && !trans->in_traverse_all) - trans_for_each_path(trans, linked) + if (!path->cached && !trans->in_traverse_all) { + struct btree_path *linked; + unsigned i; + + trans_for_each_path(trans, linked, i) if (linked != path && linked->cached == path->cached && linked->btree_id == path->btree_id && @@ -657,6 +701,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, linked->locks_want = new_locks_want; btree_path_get_locks(trans, linked, true, NULL); } + } return false; } @@ -665,7 +710,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - unsigned l; + unsigned l, old_locks_want = path->locks_want; if (trans->restarted) return; @@ -689,8 +734,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, bch2_btree_path_verify_locks(path); - path->downgrade_seq++; - trace_path_downgrade(trans, _RET_IP_, path); + trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); } /* Btree transaction locking: */ @@ -698,22 +742,24 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, void bch2_trans_downgrade(struct btree_trans *trans) { struct btree_path *path; + unsigned i; if (trans->restarted) return; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) bch2_btree_path_downgrade(trans, path); } int bch2_trans_relock(struct btree_trans *trans) { struct btree_path *path; + unsigned i; if (unlikely(trans->restarted)) return -((int) trans->restarted); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->should_be_locked && !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); @@ -725,11 +771,12 @@ int bch2_trans_relock(struct btree_trans *trans) int bch2_trans_relock_notrace(struct btree_trans *trans) { struct btree_path *path; + unsigned i; if (unlikely(trans->restarted)) return -((int) trans->restarted); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->should_be_locked && !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); @@ -740,16 +787,18 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) void bch2_trans_unlock_noassert(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) __bch2_btree_path_unlock(trans, path); } void bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) __bch2_btree_path_unlock(trans, path); } @@ -762,8 +811,9 @@ void bch2_trans_unlock_long(struct btree_trans *trans) bool bch2_trans_locked(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->nodes_locked) return true; return false; @@ -809,8 +859,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path) void bch2_trans_verify_locks(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) bch2_btree_path_verify_locks(path); } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 11b0a2c8cd69..cc5500a957a1 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -122,12 +122,9 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, struct 
btree_path *path, unsigned level) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - struct btree_transaction_stats *s = btree_trans_stats(trans); - - if (s) - __bch2_time_stats_update(&s->lock_hold_times, - path->l[level].lock_taken_time, - local_clock()); + __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times, + path->l[level].lock_taken_time, + local_clock()); #endif } @@ -175,6 +172,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat struct btree *b) { struct btree_path *linked; + unsigned i; EBUG_ON(path->l[b->c.level].b != b); EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); @@ -182,7 +180,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - trans_for_each_path_with_node(trans, b, linked) + trans_for_each_path_with_node(trans, b, linked, i) linked->l[b->c.level].lock_seq++; six_unlock_write(&b->c.lock); @@ -242,8 +240,9 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, enum btree_node_locked_type want) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (&path->l[level].b->c == b && btree_node_locked_type(path, level) >= want) { six_lock_increment(&b->lock, (enum six_lock_type) want); @@ -263,7 +262,6 @@ static inline int btree_node_lock(struct btree_trans *trans, int ret = 0; EBUG_ON(level >= BTREE_MAX_DEPTH); - EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 12907beda98c..90eb8065ff2d 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -12,6 +12,7 @@ #include "errcode.h" #include "error.h" #include "journal.h" +#include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" #include "snapshot.h" @@ -23,7 +24,7 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert #ifdef CONFIG_BCACHEFS_DEBUG struct bch_fs *c = trans->c; struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u); if (unlikely(trans->journal_replay_not_finished)) { struct bkey_i *j_k = @@ -41,23 +42,23 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert #endif } -static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) +static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i) { - return i->path->l + i->level; + return (trans->paths + i->path)->l + i->level; } static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { return i != trans->updates && - insert_l(&i[0])->b == insert_l(&i[-1])->b; + insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b; } static inline bool same_leaf_as_next(struct btree_trans *trans, struct btree_insert_entry *i) { return i + 1 < trans->updates + trans->nr_updates && - insert_l(&i[0])->b == insert_l(&i[1])->b; + insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b; } inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, @@ -84,7 +85,7 @@ static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre if (same_leaf_as_prev(trans, i)) continue; - 
bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b); } trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); @@ -93,19 +94,17 @@ static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre static inline int bch2_trans_lock_write(struct btree_trans *trans) { - struct btree_insert_entry *i; - EBUG_ON(trans->write_locked); trans_for_each_update(trans, i) { if (same_leaf_as_prev(trans, i)) continue; - if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c)) return trans_lock_write_fail(trans, i); if (!i->cached) - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b); } trans->write_locked = true; @@ -115,12 +114,10 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans) static inline void bch2_trans_unlock_write(struct btree_trans *trans) { if (likely(trans->write_locked)) { - struct btree_insert_entry *i; - trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->path, - insert_l(i)->b); + bch2_btree_node_unlock_write_inlined(trans, + trans->paths + i->path, insert_l(trans, i)->b); trans->write_locked = false; } } @@ -287,7 +284,7 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, bch2_btree_add_journal_pin(c, b, journal_seq); if (unlikely(!btree_node_dirty(b))) { - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); set_btree_node_dirty_acct(c, b); } @@ -311,10 +308,12 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); - BUG_ON(i->cached != i->path->cached); - BUG_ON(i->level != i->path->level); - BUG_ON(i->btree_id != i->path->btree_id); + struct btree_path *path = trans->paths + i->path; + + BUG_ON(!bpos_eq(i->k->k.p, path->pos)); + BUG_ON(i->cached != path->cached); + BUG_ON(i->level != path->level); + BUG_ON(i->btree_id != path->btree_id); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && @@ -361,8 +360,6 @@ noinline static int btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, struct btree_path *path, unsigned new_u64s) { - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; struct bkey_cached *ck = (void *) path->l[0].b; struct bkey_i *new_k; int ret; @@ -372,7 +369,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); if (!new_k) { - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(path->btree_id), new_u64s); return -BCH_ERR_ENOMEM_btree_key_cache_insert; } @@ -401,7 +398,6 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; - struct btree_insert_entry *i; unsigned new_u64s; struct bkey_i *new_k; @@ -409,7 +405,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags if (!test_bit(BKEY_CACHED_DIRTY, 
&ck->flags) && bch2_btree_key_cache_must_wait(c) && - !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) + !(flags & BCH_TRANS_COMMIT_journal_reclaim)) return -BCH_ERR_btree_insert_need_journal_reclaim; /* @@ -455,22 +451,15 @@ static int run_one_mem_trigger(struct btree_trans *trans, if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id))) return 0; - if (old_ops->atomic_trigger == new_ops->atomic_trigger) { - ret = bch2_mark_key(trans, i->btree_id, i->level, - old, bkey_i_to_s_c(new), + if (old_ops->trigger == new_ops->trigger) { + ret = bch2_key_trigger(trans, i->btree_id, i->level, + old, bkey_i_to_s(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); } else { - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - - _deleted.p = i->path->pos; - - ret = bch2_mark_key(trans, i->btree_id, i->level, - deleted, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key(trans, i->btree_id, i->level, - old, deleted, - BTREE_TRIGGER_OVERWRITE|flags); + ret = bch2_key_trigger_new(trans, i->btree_id, i->level, + bkey_i_to_s(new), flags) ?: + bch2_key_trigger_old(trans, i->btree_id, i->level, + old, flags); } return ret; @@ -488,6 +477,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ struct bkey_s_c old = { &old_k, i->old_v }; const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL; verify_update_old_key(trans, i); @@ -497,19 +487,18 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ if (!i->insert_trigger_run && !i->overwrite_trigger_run && - old_ops->trans_trigger == new_ops->trans_trigger) { + old_ops->trigger == new_ops->trigger) { i->overwrite_trigger_run = true; i->insert_trigger_run = true; - return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_OVERWRITE| - i->flags) ?: 1; + return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_OVERWRITE|flags) ?: 1; } else if (overwrite && !i->overwrite_trigger_run) { i->overwrite_trigger_run = true; - return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; + return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; } else if (!overwrite && !i->insert_trigger_run) { i->insert_trigger_run = true; - return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; + return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; } else { return 0; } @@ -551,7 +540,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + struct btree_insert_entry *btree_id_start = trans->updates; unsigned btree_id = 0; int ret = 0; @@ -598,7 +587,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; int ret = 0; trans_for_each_update(trans, i) { @@ -608,7 +596,7 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) */ BUG_ON(i->cached || i->level); - if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { + if (gc_visited(c, 
gc_pos_btree_node(insert_l(trans, i)->b))) { ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); if (ret) break; @@ -624,8 +612,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; struct btree_trans_commit_hook *h; unsigned u64s = 0; int ret; @@ -650,23 +636,21 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, u64s += i->k->k.u64s; ret = !i->cached - ? btree_key_can_insert(trans, insert_l(i)->b, u64s) - : btree_key_can_insert_cached(trans, flags, i->path, u64s); + ? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s) + : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s); if (ret) { *stopped_at = i; return ret; } - } - if (trans->nr_wb_updates && - trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) - return -BCH_ERR_btree_insert_need_flush_buffer; + i->k->k.needs_whiteout = false; + } /* * Don't get journal reservation until after we know insert will * succeed: */ - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { ret = bch2_trans_journal_res_get(trans, (flags & BCH_WATERMARK_MASK)| JOURNAL_RES_GET_NONBLOCK); @@ -675,8 +659,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (unlikely(trans->journal_transaction_names)) journal_transaction_name(trans); - } else { - trans->journal_res.seq = c->journal.replay_journal_seq; } /* @@ -685,7 +667,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { + !(flags & BCH_TRANS_COMMIT_no_journal_res)) { if (bch2_journal_seq_verify) trans_for_each_update(trans, i) i->k->k.version.lo = trans->journal_res.seq; @@ -698,14 +680,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) return -BCH_ERR_btree_insert_need_mark_replicas; - if (trans->nr_wb_updates) { - EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); - - ret = bch2_btree_insert_keys_write_buffer(trans); - if (ret) - goto revert_fs_usage; - } - h = trans->hooks; while (h) { ret = h->fn(trans, h); @@ -727,16 +701,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto fatal_err; } - if (unlikely(trans->extra_journal_entries.nr)) { - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->extra_journal_entries.data, - trans->extra_journal_entries.nr); - - trans->journal_res.offset += trans->extra_journal_entries.nr; - trans->journal_res.u64s -= trans->extra_journal_entries.nr; - } - - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { struct journal *j = &c->journal; struct jset_entry *entry; @@ -765,33 +730,27 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bkey_copy((struct bkey_i *) entry->start, i->k); } - trans_for_each_wb_update(trans, wb) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - wb->btree, 0, - wb->k.k.u64s); - bkey_copy((struct bkey_i *) entry->start, &wb->k); - } + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), + trans->journal_entries, + trans->journal_entries_u64s); + + trans->journal_res.offset += trans->journal_entries_u64s; + trans->journal_res.u64s 
-= trans->journal_entries_u64s; if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; } trans_for_each_update(trans, i) { - i->k->k.needs_whiteout = false; + struct btree_path *path = trans->paths + i->path; if (!i->cached) { - u64 seq = trans->journal_res.seq; - - if (i->flags & BTREE_UPDATE_PREJOURNAL) - seq = i->seq; - - bch2_btree_insert_key_leaf(trans, i->path, i->k, seq); + bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq); } else if (!i->key_cache_already_flushed) bch2_btree_insert_key_cached(trans, flags, i); else { - bch2_btree_key_cache_drop(trans, i->path); - btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_key_cache_drop(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); } } @@ -806,14 +765,8 @@ revert_fs_usage: static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; - trans_for_each_update(trans, i) bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); - - trans_for_each_wb_update(trans, wb) - bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); } static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, @@ -841,6 +794,33 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, return -EINVAL; } +static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans, + struct jset_entry *i) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "invalid bkey on insert from %s", trans->fn); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + bch2_journal_entry_to_text(&buf, c, i); + prt_newline(&buf); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + + bch2_inconsistent_error(c); + bch2_dump_trans_updates(trans); + + return -EINVAL; +} + +static int bch2_trans_commit_journal_pin_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + return 0; +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -849,7 +829,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; int ret = 0, u64s_delta = 0; trans_for_each_update(trans, i) { @@ -884,13 +863,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, - trans->journal_pin, NULL); + trans->journal_pin, + bch2_trans_commit_journal_pin_flush); /* * Drop journal reservation after dropping write locks, since dropping * the journal reservation may kick off a journal write: */ - bch2_journal_res_put(&c->journal, &trans->journal_res); + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) + bch2_journal_res_put(&c->journal, &trans->journal_res); return ret; } @@ -916,7 +897,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, case -BCH_ERR_btree_insert_btree_node_full: ret = bch2_btree_split_leaf(trans, i->path, flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); + trace_and_count(c, trans_restart_btree_node_split, trans, + trace_ip, trans->paths + i->path); break; case -BCH_ERR_btree_insert_need_mark_replicas: ret = drop_locks_do(trans, @@ -927,7 +909,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, * XXX: 
this should probably be a separate BTREE_INSERT_NONBLOCK * flag */ - if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && + if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; @@ -950,30 +932,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, ret = bch2_trans_relock(trans); break; - case -BCH_ERR_btree_insert_need_flush_buffer: { - struct btree_write_buffer *wb = &c->btree_write_buffer; - - ret = 0; - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_unlock(trans); - mutex_lock(&wb->flush_lock); - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_begin(trans); - ret = __bch2_btree_write_buffer_flush(trans, - flags|BTREE_INSERT_NOCHECK_RW, true); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - } else { - mutex_unlock(&wb->flush_lock); - ret = bch2_trans_relock(trans); - } - } - break; - } default: BUG_ON(ret >= 0); break; @@ -982,8 +940,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && - !(flags & BTREE_INSERT_NOWAIT) && - (flags & BTREE_INSERT_NOFAIL), c, + (flags & BCH_TRANS_COMMIT_no_enospc), c, "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); return ret; @@ -995,8 +952,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; int ret; - if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || - test_bit(BCH_FS_STARTED, &c->flags)) + if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) || + test_bit(BCH_FS_started, &c->flags)) return -BCH_ERR_erofs_trans_commit; ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); @@ -1016,7 +973,6 @@ static noinline int do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; int ret = 0; trans_for_each_update(trans, i) { @@ -1030,19 +986,14 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) { + struct btree_insert_entry *errored_at = NULL; struct bch_fs *c = trans->c; - struct btree_insert_entry *i = NULL; - struct btree_write_buffered_key *wb; int ret = 0; if (!trans->nr_updates && - !trans->nr_wb_updates && - !trans->extra_journal_entries.nr) + !trans->journal_entries_u64s) goto out_reset; - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); - ret = bch2_trans_commit_run_triggers(trans); if (ret) goto out_reset; @@ -1051,7 +1002,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct printbuf buf = PRINTBUF; enum bkey_invalid_flags invalid_flags = 0; - if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), @@ -1064,47 +1015,52 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) return ret; } - if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) { + enum bkey_invalid_flags invalid_flags = 0; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + invalid_flags 
|= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + + if (unlikely(bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, invalid_flags))) + ret = bch2_trans_commit_journal_entry_invalid(trans, i); + + if (ret) + return ret; + } + + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { ret = do_bch2_trans_commit_to_journal_replay(trans); goto out_reset; } - if (!(flags & BTREE_INSERT_NOCHECK_RW) && + if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { ret = bch2_trans_commit_get_rw_cold(trans, flags); if (ret) goto out_reset; } - if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && - mutex_trylock(&c->btree_write_buffer.flush_lock)) { - bch2_trans_begin(trans); - bch2_trans_unlock(trans); - - ret = __bch2_btree_write_buffer_flush(trans, - flags|BTREE_INSERT_NOCHECK_RW, true); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - goto out; - } - - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - trans->journal_u64s = trans->extra_journal_entries.nr; + trans->journal_u64s = trans->journal_entries_u64s; trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); trans_for_each_update(trans, i) { - EBUG_ON(!i->path->should_be_locked); + struct btree_path *path = trans->paths + i->path; - ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); + EBUG_ON(!path->should_be_locked); + + ret = bch2_btree_path_upgrade(trans, path, i->level + 1); if (unlikely(ret)) goto out; - EBUG_ON(!btree_node_intent_locked(i->path, i->level)); + EBUG_ON(!btree_node_intent_locked(path, i->level)); if (i->key_cache_already_flushed) continue; @@ -1120,22 +1076,21 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) trans->journal_u64s += jset_u64s(i->old_k.u64s); } - trans_for_each_wb_update(trans, wb) - trans->journal_u64s += jset_u64s(wb->k.k.u64s); - - if (trans->extra_journal_res) { + if (trans->extra_disk_res) { ret = bch2_disk_reservation_add(c, trans->disk_res, - trans->extra_journal_res, - (flags & BTREE_INSERT_NOFAIL) + trans->extra_disk_res, + (flags & BCH_TRANS_COMMIT_no_enospc) ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) goto err; } retry: + errored_at = NULL; bch2_trans_verify_not_in_restart(trans); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); + ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); /* make sure we didn't drop or screw up locks: */ bch2_trans_verify_locks(trans); @@ -1145,7 +1100,7 @@ retry: trace_and_count(c, transaction_commit, trans, _RET_IP_); out: - if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) + if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: if (!ret) @@ -1154,9 +1109,21 @@ out_reset: return ret; err: - ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); + ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_); if (ret) goto out; + /* + * We might have done another transaction commit in the error path - + * i.e. 
btree write buffer flush - which will have made use of + * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is + * how the journal sequence number to pin is passed in - so we must + * restart: + */ + if (flags & BCH_TRANS_COMMIT_no_journal_res) { + ret = -BCH_ERR_transaction_restart_nested; + goto out; + } + goto retry; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 60453ba86c4b..d530307046f4 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -185,33 +185,32 @@ struct btree_node_iter { * Iterate over all possible positions, synthesizing deleted keys for holes: */ static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0; -static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS = 1 << 1; /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 2; +static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1; /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 3; +static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2; /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 4; -static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 5; -static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 6; -static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; -static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 8; -static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; -static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; -static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; -static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; -static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 13; -static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; -static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; -#define __BTREE_ITER_FLAGS_END 16 +static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3; +static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4; +static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5; +static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6; +static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7; +static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8; +static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9; +static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; +static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11; +static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12; +static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13; +static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14; +#define __BTREE_ITER_FLAGS_END 15 enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -223,13 +222,12 @@ enum btree_path_uptodate { #define TRACK_PATH_ALLOCATED #endif +typedef u16 btree_path_idx_t; + struct btree_path { - u8 idx; - u8 sorted_idx; + btree_path_idx_t sorted_idx; u8 ref; u8 intent_ref; - u32 alloc_seq; - u32 downgrade_seq; /* btree_iter_copy starts here: */ struct bpos pos; @@ -283,13 +281,12 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path) */ struct btree_iter { struct btree_trans *trans; - struct 
btree_path *path; - struct btree_path *update_path; - struct btree_path *key_cache_path; + btree_path_idx_t path; + btree_path_idx_t update_path; + btree_path_idx_t key_cache_path; enum btree_id btree_id:8; - unsigned min_depth:3; - unsigned advanced:1; + u8 min_depth; /* btree_iter_copy starts here: */ u16 flags; @@ -306,7 +303,6 @@ struct btree_iter { /* BTREE_ITER_WITH_JOURNAL: */ size_t journal_idx; - struct bpos journal_pos; #ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; #endif @@ -354,16 +350,16 @@ struct btree_insert_entry { * to the size of the key being overwritten in the btree: */ u8 old_btree_u64s; + btree_path_idx_t path; struct bkey_i *k; - struct btree_path *path; - u64 seq; /* key being overwritten: */ struct bkey old_k; const struct bch_val *old_v; unsigned long ip_allocated; }; -#define BTREE_ITER_MAX 64 +#define BTREE_ITER_INITIAL 64 +#define BTREE_ITER_MAX (1U << 10) struct btree_trans_commit_hook; typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); @@ -377,25 +373,30 @@ struct btree_trans_commit_hook { #define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 +struct btree_trans_paths { + unsigned long nr_paths; + struct btree_path paths[]; +}; + struct btree_trans { struct bch_fs *c; - const char *fn; - struct closure ref; - struct list_head list; - u64 last_begin_time; - u8 lock_may_not_fail; - u8 lock_must_abort; - struct btree_bkey_cached_common *locking; - struct six_lock_waiter locking_wait; + unsigned long *paths_allocated; + struct btree_path *paths; + btree_path_idx_t *sorted; + struct btree_insert_entry *updates; - int srcu_idx; + void *mem; + unsigned mem_top; + unsigned mem_bytes; + btree_path_idx_t nr_sorted; + btree_path_idx_t nr_paths; + btree_path_idx_t nr_paths_max; u8 fn_idx; - u8 nr_sorted; u8 nr_updates; - u8 nr_wb_updates; - u8 wb_updates_size; + u8 lock_must_abort; + bool lock_may_not_fail:1; bool srcu_held:1; bool used_mempool:1; bool in_traverse_all:1; @@ -407,41 +408,56 @@ struct btree_trans { bool write_locked:1; enum bch_errcode restarted:16; u32 restart_count; + + u64 last_begin_time; unsigned long last_begin_ip; unsigned long last_restarted_ip; unsigned long srcu_lock_time; - /* - * For when bch2_trans_update notices we'll be splitting a compressed - * extent: - */ - unsigned extra_journal_res; - unsigned nr_max_paths; - - u64 paths_allocated; - - unsigned mem_top; - unsigned mem_max; - unsigned mem_bytes; - void *mem; - - u8 sorted[BTREE_ITER_MAX + 8]; - struct btree_path paths[BTREE_ITER_MAX]; - struct btree_insert_entry updates[BTREE_ITER_MAX]; - struct btree_write_buffered_key *wb_updates; + const char *fn; + struct btree_bkey_cached_common *locking; + struct six_lock_waiter locking_wait; + int srcu_idx; /* update path: */ + u16 journal_entries_u64s; + u16 journal_entries_size; + struct jset_entry *journal_entries; + struct btree_trans_commit_hook *hooks; - darray_u64 extra_journal_entries; struct journal_entry_pin *journal_pin; struct journal_res journal_res; u64 *journal_seq; struct disk_reservation *disk_res; unsigned journal_u64s; + unsigned extra_disk_res; /* XXX kill */ struct replicas_delta_list *fs_usage_deltas; + + /* Entries before this are zeroed out on every bch2_trans_get() call */ + + struct list_head list; + struct closure ref; + + unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)]; + struct btree_trans_paths trans_paths; + struct btree_path _paths[BTREE_ITER_INITIAL]; + btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4]; + struct btree_insert_entry 
_updates[BTREE_ITER_INITIAL]; }; +static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter) +{ + return trans->paths + iter->path; +} + +static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter) +{ + return iter->key_cache_path + ? trans->paths + iter->key_cache_path + : NULL; +} + #define BCH_BTREE_WRITE_TYPES() \ x(initial, 0) \ x(init_next_bset, 1) \ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 2fd3c8cc6f51..c3ff365acce9 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -24,7 +24,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, } static int __must_check -bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, +bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t, struct bkey_i *, enum btree_update_flags, unsigned long ip); @@ -200,7 +200,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, */ if (nr_splits > 1 && (compressed_sectors = bch2_bkey_sectors_compressed(old))) - trans->extra_journal_res += compressed_sectors * (nr_splits - 1); + trans->extra_disk_res += compressed_sectors * (nr_splits - 1); if (front_split) { update = bch2_bkey_make_mut_noupdate(trans, old); @@ -339,21 +339,22 @@ err: } static noinline int flush_new_cached_update(struct btree_trans *trans, - struct btree_path *path, struct btree_insert_entry *i, enum btree_update_flags flags, unsigned long ip) { - struct btree_path *btree_path; struct bkey k; int ret; - btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, btree_path, 0); + btree_path_idx_t path_idx = + bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, + BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, path_idx, 0); if (ret) goto out; + struct btree_path *btree_path = trans->paths + path_idx; + /* * The old key in the insert entry might actually refer to an existing * key in the btree that has been deleted from cache and not yet @@ -368,43 +369,34 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, i->flags |= BTREE_TRIGGER_NORUN; btree_path_set_should_be_locked(btree_path); - ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); + ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); out: - bch2_path_put(trans, btree_path, true); + bch2_path_put(trans, path_idx, true); return ret; } static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, +bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, struct bkey_i *k, enum btree_update_flags flags, unsigned long ip) { struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; - u64 seq = 0; int cmp; + struct btree_path *path = trans->paths + path_idx; EBUG_ON(!path->should_be_locked); - EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + EBUG_ON(trans->nr_updates >= trans->nr_paths); EBUG_ON(!bpos_eq(k->k.p, path->pos)); - /* - * The transaction journal res hasn't been allocated at this point. - * That occurs at commit time. Reuse the seq field to pass in the seq - * of a prejournaled key. 
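
The btree_types.h hunk above switches struct btree_iter, struct btree_insert_entry and struct btree_trans from struct btree_path pointers to btree_path_idx_t (u16) indices into trans->paths, and the paths array goes from a fixed-size member to an inline BTREE_ITER_INITIAL allocation behind a pointer, with BTREE_ITER_MAX raised to 1 << 10. Indices are the natural handle once the array can be reallocated mid-transaction (a later hunk notes that btree_split and merge may both reallocate it): a saved pointer would dangle after the array moves, while an index stays valid as long as the pointer is re-derived at each use, which is what the new btree_iter_path() helper and the trans->paths + idx arithmetic in the following hunks do. A minimal stand-alone sketch of that pattern, using illustrative names rather than bcachefs API:

#include <stdint.h>
#include <stdlib.h>

struct path { int level; };

struct trans {
	struct path	*paths;
	uint16_t	nr, capacity;
};

/*
 * Growing the array may move it in memory: a raw 'struct path *' held
 * across this call would dangle, but a uint16_t index stays valid.
 * (Error handling and the transaction-lifetime allocator are omitted.)
 */
static uint16_t path_alloc(struct trans *t)
{
	if (t->nr == t->capacity) {
		t->capacity = t->capacity ? t->capacity * 2 : 8;
		t->paths = realloc(t->paths, t->capacity * sizeof(*t->paths));
	}
	return t->nr++;
}

/* Re-derive the pointer at each use, as btree_iter_path() does above. */
static struct path *path_deref(struct trans *t, uint16_t idx)
{
	return &t->paths[idx];
}

The apparent trade-off is an extra add on every dereference in exchange for not having to size the path table for the worst case up front.
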
- */ - if (flags & BTREE_UPDATE_PREJOURNAL) - seq = trans->journal_res.seq; - n = (struct btree_insert_entry) { .flags = flags, .bkey_type = __btree_node_type(path->level, path->btree_id), .btree_id = path->btree_id, .level = path->level, .cached = path->cached, - .path = path, + .path = path_idx, .k = k, - .seq = seq, .ip_allocated = ip, }; @@ -418,7 +410,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, * Pending updates are kept sorted: first, find position of new update, * then delete/trim any updates the new update overwrites: */ - trans_for_each_update(trans, i) { + for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) { cmp = btree_insert_entry_cmp(&n, i); if (cmp <= 0) break; @@ -432,7 +424,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, i->cached = n.cached; i->k = n.k; i->path = n.path; - i->seq = n.seq; i->ip_allocated = n.ip_allocated; } else { array_insert_item(trans->updates, trans->nr_updates, @@ -452,7 +443,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, } } - __btree_path_get(i->path, true); + __btree_path_get(trans->paths + i->path, true); /* * If a key is present in the key cache, it must also exist in the @@ -462,7 +453,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, * work: */ if (path->cached && bkey_deleted(&i->old_k)) - return flush_new_cached_update(trans, path, i, flags, ip); + return flush_new_cached_update(trans, i, flags, ip); return 0; } @@ -471,9 +462,11 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, struct btree_iter *iter, struct btree_path *path) { - if (!iter->key_cache_path || - !iter->key_cache_path->should_be_locked || - !bpos_eq(iter->key_cache_path->pos, iter->pos)) { + struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter); + + if (!key_cache_path || + !key_cache_path->should_be_locked || + !bpos_eq(key_cache_path->pos, iter->pos)) { struct bkey_cached *ck; int ret; @@ -488,19 +481,18 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, iter->flags & BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - BTREE_ITER_CACHED); + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED); if (unlikely(ret)) return ret; - ck = (void *) iter->key_cache_path->l[0].b; + ck = (void *) trans->paths[iter->key_cache_path].l[0].b; if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); } - btree_path_set_should_be_locked(iter->key_cache_path); + btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); } return 0; @@ -509,7 +501,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { - struct btree_path *path = iter->update_path ?: iter->path; + btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; if (iter->flags & BTREE_ITER_IS_EXTENTS) @@ -529,6 +521,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter /* * Ensure that updates to cached btrees go to the key cache: */ + struct btree_path *path = trans->paths + path_idx; if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && !path->cached && !path->level && @@ 
-537,27 +530,15 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter if (ret) return ret; - path = iter->key_cache_path; + path_idx = iter->key_cache_path; } - return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); + return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_); } -/* - * Add a transaction update for a key that has already been journaled. - */ -int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, - struct btree_iter *iter, struct bkey_i *k, - enum btree_update_flags flags) -{ - trans->journal_res.seq = seq; - return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| - BTREE_UPDATE_PREJOURNAL); -} - -static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) +int bch2_btree_insert_clone_trans(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) { struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k)); int ret = PTR_ERR_OR_ZERO(n); @@ -568,60 +549,30 @@ static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans, return bch2_btree_insert_trans(trans, btree, n, 0); } -int __must_check bch2_trans_update_buffered(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) +struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) { - struct btree_write_buffered_key *i; - int ret; - - EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); - EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); - - if (unlikely(trans->journal_replay_not_finished)) - return bch2_btree_insert_clone_trans(trans, btree, k); - - trans_for_each_wb_update(trans, i) { - if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { - bkey_copy(&i->k, k); - return 0; - } - } + unsigned new_top = trans->journal_entries_u64s + u64s; + unsigned old_size = trans->journal_entries_size; - if (!trans->wb_updates || - trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_write_buffered_key *u; + if (new_top > trans->journal_entries_size) { + trans->journal_entries_size = roundup_pow_of_two(new_top); - if (trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_transaction_stats *s = btree_trans_stats(trans); - - BUG_ON(trans->wb_updates_size > U8_MAX / 2); - trans->wb_updates_size = max(1, trans->wb_updates_size * 2); - if (s) - s->wb_updates_size = trans->wb_updates_size; - } - - u = bch2_trans_kmalloc_nomemzero(trans, - trans->wb_updates_size * - sizeof(struct btree_write_buffered_key)); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - if (trans->nr_wb_updates) - memcpy(u, trans->wb_updates, trans->nr_wb_updates * - sizeof(struct btree_write_buffered_key)); - trans->wb_updates = u; + btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size; } - trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { - .btree = btree, - }; + struct jset_entry *n = + bch2_trans_kmalloc_nomemzero(trans, + trans->journal_entries_size * sizeof(u64)); + if (IS_ERR(n)) + return ERR_CAST(n); - bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); - trans->nr_wb_updates++; + if (trans->journal_entries) + memcpy(n, trans->journal_entries, old_size * sizeof(u64)); + trans->journal_entries = n; - return 0; + struct jset_entry *e = btree_trans_journal_entries_top(trans); + trans->journal_entries_u64s = new_top; + return e; } int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, @@ -733,20 +684,6 @@ int 
bch2_btree_delete_at(struct btree_trans *trans, return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); } -int bch2_btree_delete_at_buffered(struct btree_trans *trans, - enum btree_id btree, struct bpos pos) -{ - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.p = pos; - return bch2_trans_update_buffered(trans, btree, k); -} - int bch2_btree_delete(struct btree_trans *trans, enum btree_id btree, struct bpos pos, unsigned update_flags) @@ -809,7 +746,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: bch2_trans_commit(trans, &disk_res, journal_seq, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(trans->c, &disk_res); err: /* @@ -851,56 +788,26 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, struct bpos pos, bool set) { - struct bkey_i *k; - int ret = 0; + struct bkey_i k; - k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); - ret = PTR_ERR_OR_ZERO(k); - if (unlikely(ret)) - return ret; + bkey_init(&k.k); + k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k.k.p = pos; - bkey_init(&k->k); - k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = pos; - - return bch2_trans_update_buffered(trans, btree, k); + return bch2_trans_update_buffered(trans, btree, &k); } -__printf(2, 0) -static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) +static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s) { - struct printbuf buf = PRINTBUF; - struct jset_entry_log *l; - unsigned u64s; - int ret; - - prt_vprintf(&buf, fmt, args); - ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - goto err; - - u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - - ret = darray_make_room(entries, jset_u64s(u64s)); + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); + int ret = PTR_ERR_OR_ZERO(e); if (ret) - goto err; + return ret; - l = (void *) &darray_top(*entries); - l->entry.u64s = cpu_to_le16(u64s); - l->entry.btree_id = 0; - l->entry.level = 1; - l->entry.type = BCH_JSET_ENTRY_log; - l->entry.pad[0] = 0; - l->entry.pad[1] = 0; - l->entry.pad[2] = 0; - memcpy(l->d, buf.buf, buf.pos); - while (buf.pos & 7) - l->d[buf.pos++] = '\0'; - - entries->nr += jset_u64s(u64s); -err: - printbuf_exit(&buf); - return ret; + struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); + journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); + memcpy(l->d, buf->buf, buf->pos); + return 0; } __printf(3, 0) @@ -908,16 +815,32 @@ static int __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, va_list args) { - int ret; + struct printbuf buf = PRINTBUF; + prt_vprintf(&buf, fmt, args); + + unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); + prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos); + + int ret = buf.allocation_failure ? 
-BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + goto err; if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { - ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); + ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); + if (ret) + goto err; + + struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); + journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); + memcpy(l->d, buf.buf, buf.pos); + c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_LAZY_RW|commit_flags, - __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args)); + BCH_TRANS_COMMIT_lazy_rw|commit_flags, + __bch2_trans_log_msg(trans, &buf, u64s)); } - +err: + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 9816d2286540..b9382b7b288b 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -21,42 +21,32 @@ void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, struct bkey_i *, u64); -enum btree_insert_flags { +#define BCH_TRANS_COMMIT_FLAGS() \ + x(no_enospc, "don't check for enospc") \ + x(no_check_rw, "don't attempt to take a ref on c->writes") \ + x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \ + x(no_journal_res, "don't take a journal reservation, instead " \ + "pin journal entry referred to by trans->journal_res.seq") \ + x(journal_reclaim, "operation required for journal reclaim; may return error" \ + "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\ + +enum __bch_trans_commit_flags { /* First bits for bch_watermark: */ - __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS, - __BTREE_INSERT_NOCHECK_RW, - __BTREE_INSERT_LAZY_RW, - __BTREE_INSERT_JOURNAL_REPLAY, - __BTREE_INSERT_JOURNAL_RECLAIM, - __BTREE_INSERT_NOWAIT, - __BTREE_INSERT_GC_LOCK_HELD, - __BCH_HASH_SET_MUST_CREATE, - __BCH_HASH_SET_MUST_REPLACE, + __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS, +#define x(n, ...) __BCH_TRANS_COMMIT_##n, + BCH_TRANS_COMMIT_FLAGS() +#undef x }; -/* Don't check for -ENOSPC: */ -#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL) - -#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW) -#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW) - -/* Insert is for journal replay - don't get journal reservations: */ -#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY) - -/* Insert is being called from journal reclaim path: */ -#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM) - -/* Don't block on allocation failure (for new btree nodes: */ -#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT) -#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD) - -#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE) -#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE) +enum bch_trans_commit_flags { +#define x(n, ...) 
BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n), + BCH_TRANS_COMMIT_FLAGS() +#undef x +}; int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, @@ -74,6 +64,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); +static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, + enum btree_id btree, struct bpos pos) +{ + return bch2_btree_bit_mod(trans, btree, pos, false); +} + int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, struct bpos, struct bpos); @@ -105,10 +101,44 @@ int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); -int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *, - struct bkey_i *, enum btree_update_flags); -int __must_check bch2_trans_update_buffered(struct btree_trans *, - enum btree_id, struct bkey_i *); + +struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); + +static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) +{ + return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); +} + +static inline struct jset_entry * +bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) +{ + if (!trans->journal_entries || + trans->journal_entries_u64s + u64s > trans->journal_entries_size) + return __bch2_trans_jset_entry_alloc(trans, u64s); + + struct jset_entry *e = btree_trans_journal_entries_top(trans); + trans->journal_entries_u64s += u64s; + return e; +} + +int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); + +static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + if (unlikely(trans->journal_replay_not_finished)) + return bch2_btree_insert_clone_trans(trans, btree, k); + + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); + int ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; + + journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s); + bkey_copy(e->start, k); + return 0; +} void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); @@ -157,28 +187,19 @@ static inline int bch2_trans_commit(struct btree_trans *trans, bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) #define trans_for_each_update(_trans, _i) \ - for ((_i) = (_trans)->updates; \ + for (struct btree_insert_entry *_i = (_trans)->updates; \ (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -#define trans_for_each_wb_update(_trans, _i) \ - for ((_i) = (_trans)->wb_updates; \ - (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \ - (_i)++) - static inline void bch2_trans_reset_updates(struct btree_trans *trans) { - struct btree_insert_entry *i; - trans_for_each_update(trans, i) bch2_path_put(trans, i->path, true); - trans->extra_journal_res = 0; trans->nr_updates = 0; - trans->nr_wb_updates = 0; - trans->wb_updates = NULL; + 
trans->journal_entries_u64s = 0; trans->hooks = NULL; - trans->extra_journal_entries.nr = 0; + trans->extra_disk_res = 0; if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 239fcc3c7c99..44f9dfa28a09 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -25,24 +25,24 @@ #include <linux/random.h> static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - struct btree_path *, struct btree *, + btree_path_idx_t, struct btree *, struct keylist *, unsigned); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, - enum btree_id btree_id, - unsigned level, - struct bpos pos) +static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) { - struct btree_path *path; - - path = bch2_path_get(trans, btree_id, pos, level + 1, level, + btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, BTREE_ITER_NOPRESERVE| BTREE_ITER_INTENT, _RET_IP_); - path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_); + path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); + + struct btree_path *path = trans->paths + path_idx; bch2_btree_path_downgrade(trans, path); __bch2_btree_path_unlock(trans, path); - return path; + return path_idx; } /* Debug code: */ @@ -164,9 +164,11 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, /* Btree node freeing/allocation: */ -static void __btree_node_free(struct bch_fs *c, struct btree *b) +static void __btree_node_free(struct btree_trans *trans, struct btree *b) { - trace_and_count(c, btree_node_free, c, b); + struct bch_fs *c = trans->c; + + trace_and_count(c, btree_node_free, trans, b); BUG_ON(btree_node_write_blocked(b)); BUG_ON(btree_node_dirty(b)); @@ -188,15 +190,15 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - unsigned level = b->c.level; + unsigned i, level = b->c.level; bch2_btree_node_lock_write_nofail(trans, path, &b->c); bch2_btree_node_hash_remove(&c->btree_cache, b); - __btree_node_free(c, b); + __btree_node_free(trans, b); six_unlock_write(&b->c.lock); mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->l[level].b == b) { btree_node_unlock(trans, path, level); path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); @@ -210,7 +212,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, struct bch_fs *c = as->c; struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; struct btree_path *path; - unsigned level = b->c.level; + unsigned i, level = b->c.level; BUG_ON(!list_empty(&b->write_blocked)); BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); @@ -233,7 +235,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, six_unlock_intent(&b->c.lock); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->l[level].b == b) { btree_node_unlock(trans, path, level); path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); @@ -363,7 +365,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); BUG_ON(ret); - 
trace_and_count(c, btree_node_alloc, c, b); + trace_and_count(c, btree_node_alloc, trans, b); bch2_increment_clock(c, btree_sectors(c), WRITE); return b; } @@ -453,7 +455,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - __btree_node_free(c, b); + __btree_node_free(trans, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } @@ -466,7 +468,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, unsigned flags, struct closure *cl) { - struct bch_fs *c = as->c; struct btree *b; unsigned interior; int ret = 0; @@ -476,11 +477,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, /* * Protects reaping from the btree node cache and using the btree node * open bucket reserve: - * - * BTREE_INSERT_NOWAIT only applies to btree node allocation, not - * blocking on this lock: */ - ret = bch2_btree_cache_cannibalize_lock(c, cl); + ret = bch2_btree_cache_cannibalize_lock(trans, cl); if (ret) return ret; @@ -488,9 +486,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, struct prealloc_nodes *p = as->prealloc_nodes + interior; while (p->nr < nr_nodes[interior]) { - b = __bch2_btree_node_alloc(trans, &as->disk_res, - flags & BTREE_INSERT_NOWAIT ? NULL : cl, - interior, flags); + b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, + interior, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err; @@ -500,7 +497,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, } } err: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return ret; } @@ -559,24 +556,20 @@ static void btree_update_add_key(struct btree_update *as, static int btree_update_nodes_written_trans(struct btree_trans *trans, struct btree_update *as) { - struct bkey_i *k; - int ret; - - ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s); + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s); + int ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; - memcpy(&darray_top(trans->extra_journal_entries), - as->journal_entries, - as->journal_u64s * sizeof(u64)); - trans->extra_journal_entries.nr += as->journal_u64s; + memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64)); trans->journal_pin = &as->journal; for_each_keylist_key(&as->old_keys, k) { unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); + ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k), + BTREE_TRIGGER_TRANSACTIONAL); if (ret) return ret; } @@ -584,7 +577,8 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, for_each_keylist_key(&as->new_keys, k) { unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); + ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k), + BTREE_TRIGGER_TRANSACTIONAL); if (ret) return ret; } @@ -645,9 +639,9 @@ static void btree_update_nodes_written(struct btree_update *as) */ ret = commit_do(trans, &as->disk_res, &journal_seq, BCH_WATERMARK_reclaim| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_JOURNAL_RECLAIM, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_journal_reclaim, btree_update_nodes_written_trans(trans, as)); bch2_trans_unlock(trans); @@ -655,10 +649,11 @@ static void btree_update_nodes_written(struct 
btree_update *as) "%s(): error %s", __func__, bch2_err_str(ret)); err: if (as->b) { - struct btree_path *path; b = as->b; - path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); + btree_path_idx_t path_idx = get_unlocked_mut_path(trans, + as->btree_id, b->c.level, b->key.k.p); + struct btree_path *path = trans->paths + path_idx; /* * @b is the node we did the final insert into: * @@ -728,7 +723,7 @@ err: btree_node_write_if_need(c, b, SIX_LOCK_intent); btree_node_unlock(trans, path, b->c.level); - bch2_path_put(trans, path, true); + bch2_path_put(trans, path_idx, true); } bch2_journal_pin_drop(&c->journal, &as->journal); @@ -815,6 +810,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) mutex_unlock(&c->btree_interior_update_lock); } +static int bch2_update_reparent_journal_pin_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + return 0; +} + static void btree_update_reparent(struct btree_update *as, struct btree_update *child) { @@ -825,7 +826,8 @@ static void btree_update_reparent(struct btree_update *as, child->b = NULL; child->mode = BTREE_INTERIOR_UPDATING_AS; - bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); + bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, + bch2_update_reparent_journal_pin_flush); } static void btree_update_updated_root(struct btree_update *as, struct btree *b) @@ -934,6 +936,12 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b b->ob.v[--b->ob.nr]; } +static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + return 0; +} + /* * @b is being split/rewritten: it may have pointers to not-yet-written btree * nodes and thus outstanding btree_updates - redirect @b's @@ -985,11 +993,13 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, * when the new nodes are persistent and reachable on disk: */ w = btree_current_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, + bch2_btree_update_will_free_node_journal_pin_flush); bch2_journal_pin_drop(&c->journal, &w->journal); w = btree_prev_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, + bch2_btree_update_will_free_node_journal_pin_flush); bch2_journal_pin_drop(&c->journal, &w->journal); mutex_unlock(&c->btree_interior_update_lock); @@ -1039,7 +1049,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, struct bch_fs *c = trans->c; struct btree_update *as; u64 start_time = local_clock(); - int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) ? 
BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; @@ -1057,7 +1067,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (!(flags & BTREE_INSERT_JOURNAL_RECLAIM) && + if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < c->journal.watermark) { struct journal_res res = { 0 }; @@ -1094,9 +1104,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); - else if (!down_read_trylock(&c->gc_lock)) { + if (!down_read_trylock(&c->gc_lock)) { ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); if (ret) { up_read(&c->gc_lock); @@ -1110,7 +1118,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->c = c; as->start_time = start_time; as->mode = BTREE_INTERIOR_NO_UPDATE; - as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->took_gc_lock = true; as->btree_id = path->btree_id; as->update_level = update_level; INIT_LIST_HEAD(&as->list); @@ -1153,7 +1161,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * flag */ if (bch2_err_matches(ret, ENOSPC) && - (flags & BTREE_INSERT_JOURNAL_RECLAIM) && + (flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark != BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; goto err; @@ -1183,6 +1191,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, return as; err: bch2_btree_update_free(as, trans); + if (!bch2_err_matches(ret, ENOSPC) && + !bch2_err_matches(ret, EROFS)) + bch_err_fn_ratelimited(c, ret); return ERR_PTR(ret); } @@ -1214,7 +1225,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct bch_fs *c = as->c; struct btree *old; - trace_and_count(c, btree_node_set_root, c, b); + trace_and_count(c, btree_node_set_root, trans, b); old = btree_node_root(c, b); @@ -1445,10 +1456,12 @@ static void __btree_split_node(struct btree_update *as, */ static void btree_split_insert_keys(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path_idx, struct btree *b, struct keylist *keys) { + struct btree_path *path = trans->paths + path_idx; + if (!bch2_keylist_empty(keys) && bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { struct btree_node_iter node_iter; @@ -1462,25 +1475,25 @@ static void btree_split_insert_keys(struct btree_update *as, } static int btree_split(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, struct btree *b, + btree_path_idx_t path, struct btree *b, struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; - struct btree *parent = btree_node_parent(path, b); + struct btree *parent = btree_node_parent(trans->paths + path, b); struct btree *n1, *n2 = NULL, *n3 = NULL; - struct btree_path *path1 = NULL, *path2 = NULL; + btree_path_idx_t path1 = 0, path2 = 0; u64 start_time = local_clock(); int ret = 0; BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1)); + BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); bch2_btree_interior_update_will_free_node(as, b); if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { struct btree *n[2]; - trace_and_count(c, btree_node_split, c, b); + trace_and_count(c, btree_node_split, trans, b); n[0] = n1 = 
bch2_btree_node_alloc(as, trans, b->c.level); n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); @@ -1501,15 +1514,15 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path1, n1); + mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path1, n1); - path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); + path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p); six_lock_increment(&n2->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path2, n2); + mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path2, n2); /* * Note that on recursive parent_keys == keys, so we @@ -1526,11 +1539,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n3); six_unlock_write(&n3->c.lock); - path2->locks_want++; - BUG_ON(btree_node_locked(path2, n3->c.level)); + trans->paths[path2].locks_want++; + BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level)); six_lock_increment(&n3->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path2, n3); + mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path2, n3); n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; @@ -1538,7 +1551,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); } } else { - trace_and_count(c, btree_node_compact, c, b); + trace_and_count(c, btree_node_compact, trans, b); n1 = bch2_btree_node_alloc_replacement(as, trans, b); @@ -1551,10 +1564,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n1); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path1, n1); + mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path1, n1); if (parent) bch2_keylist_add(&as->parent_keys, &n1->key); @@ -1568,10 +1581,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (ret) goto err; } else if (n3) { - bch2_btree_set_root(as, trans, path, n3); + bch2_btree_set_root(as, trans, trans->paths + path, n3); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, trans, path, n1); + bch2_btree_set_root(as, trans, trans->paths + path, n1); } if (n3) { @@ -1591,13 +1604,13 @@ static int 
btree_split(struct btree_update *as, struct btree_trans *trans, * node after another thread has locked and updated the new node, thus * seeing stale data: */ - bch2_btree_node_free_inmem(trans, path, b); + bch2_btree_node_free_inmem(trans, trans->paths + path, b); if (n3) - bch2_trans_node_add(trans, n3); + bch2_trans_node_add(trans, trans->paths + path, n3); if (n2) - bch2_trans_node_add(trans, n2); - bch2_trans_node_add(trans, n1); + bch2_trans_node_add(trans, trans->paths + path2, n2); + bch2_trans_node_add(trans, trans->paths + path1, n1); if (n3) six_unlock_intent(&n3->c.lock); @@ -1606,11 +1619,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_intent(&n1->c.lock); out: if (path2) { - __bch2_btree_path_unlock(trans, path2); + __bch2_btree_path_unlock(trans, trans->paths + path2); bch2_path_put(trans, path2, true); } if (path1) { - __bch2_btree_path_unlock(trans, path1); + __bch2_btree_path_unlock(trans, trans->paths + path1); bch2_path_put(trans, path1, true); } @@ -1638,13 +1651,14 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct keylist *keys) { struct btree_path *linked; + unsigned i; __bch2_btree_insert_keys_interior(as, trans, path, b, path->l[b->c.level].iter, keys); btree_update_updated_node(as, b); - trans_for_each_path_with_node(trans, b, linked) + trans_for_each_path_with_node(trans, b, linked, i) bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); bch2_trans_verify_paths(trans); @@ -1655,7 +1669,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * * @as: btree_update object * @trans: btree_trans object - * @path: path that points to current node + * @path_idx: path that points to current node * @b: node to insert keys into * @keys: list of keys to insert * @flags: transaction commit flags @@ -1667,10 +1681,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * for leaf nodes -- inserts into interior nodes have to be atomic. 
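
The BCH_TRANS_COMMIT_* flags used throughout these hunks (BCH_TRANS_COMMIT_no_enospc, BCH_TRANS_COMMIT_journal_reclaim, and so on) come from the BCH_TRANS_COMMIT_FLAGS() x-macro added to btree_update.h earlier in this patch: one list generates both the bit-position enum and the BIT() masks, with the positions starting at BCH_WATERMARK_BITS so the low bits stay reserved for the watermark. A compilable toy version of the same pattern, with a reduced, made-up flag list rather than the real one:

#include <stdio.h>

#define BIT(nr)	(1UL << (nr))

/*
 * One list drives both enums below, so a flag's bit position and its
 * mask can never drift apart. (The kernel version also offsets the
 * positions past the watermark bits.)
 */
#define COMMIT_FLAGS()					\
	x(no_enospc,	"don't check for enospc")	\
	x(lazy_rw,	"go read-write if needed")	\
	x(journal_reclaim, "called from journal reclaim")

enum __commit_flag_bits {
#define x(n, ...)	__COMMIT_##n,
	COMMIT_FLAGS()
#undef x
};

enum commit_flags {
#define x(n, ...)	COMMIT_##n = BIT(__COMMIT_##n),
	COMMIT_FLAGS()
#undef x
};

int main(void)
{
	printf("no_enospc=%#x lazy_rw=%#x journal_reclaim=%#x\n",
	       COMMIT_no_enospc, COMMIT_lazy_rw, COMMIT_journal_reclaim);
	return 0;
}
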
*/ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, struct btree *b, + btree_path_idx_t path_idx, struct btree *b, struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; + struct btree_path *path = trans->paths + path_idx; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -1723,19 +1738,22 @@ split: return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } - return btree_split(as, trans, path, b, keys, flags); + return btree_split(as, trans, path_idx, b, keys, flags); } int bch2_btree_split_leaf(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path, unsigned flags) { - struct btree *b = path_l(path)->b; + /* btree_split & merge may both cause paths array to be reallocated */ + + struct btree *b = path_l(trans->paths + path)->b; struct btree_update *as; unsigned l; int ret = 0; - as = bch2_btree_update_start(trans, path, path->level, + as = bch2_btree_update_start(trans, trans->paths + path, + trans->paths[path].level, true, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1748,20 +1766,21 @@ int bch2_btree_split_leaf(struct btree_trans *trans, bch2_btree_update_done(as, trans); - for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++) + for (l = trans->paths[path].level + 1; + btree_node_intent_locked(&trans->paths[path], l) && !ret; + l++) ret = bch2_foreground_maybe_merge(trans, path, l, flags); return ret; } int __bch2_foreground_maybe_merge(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path, unsigned level, unsigned flags, enum btree_node_sibling sib) { struct bch_fs *c = trans->c; - struct btree_path *sib_path = NULL, *new_path = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; @@ -1769,13 +1788,15 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, struct btree *b, *m, *n, *prev, *next, *parent; struct bpos sib_pos; size_t sib_u64s; + enum btree_id btree = trans->paths[path].btree_id; + btree_path_idx_t sib_path = 0, new_path = 0; u64 start_time = local_clock(); int ret = 0; - BUG_ON(!path->should_be_locked); - BUG_ON(!btree_node_locked(path, level)); + BUG_ON(!trans->paths[path].should_be_locked); + BUG_ON(!btree_node_locked(&trans->paths[path], level)); - b = path->l[level].b; + b = trans->paths[path].l[level].b; if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) { @@ -1787,18 +1808,18 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, ? 
bpos_predecessor(b->data->min_key) : bpos_successor(b->data->max_key); - sib_path = bch2_path_get(trans, path->btree_id, sib_pos, + sib_path = bch2_path_get(trans, btree, sib_pos, U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; - btree_path_set_should_be_locked(sib_path); + btree_path_set_should_be_locked(trans->paths + sib_path); - m = sib_path->l[level].b; + m = trans->paths[sib_path].l[level].b; - if (btree_node_parent(path, b) != - btree_node_parent(sib_path, m)) { + if (btree_node_parent(trans->paths + path, b) != + btree_node_parent(trans->paths + sib_path, m)) { b->sib_u64s[sib] = U16_MAX; goto out; } @@ -1851,14 +1872,14 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) goto out; - parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, level, false, - BTREE_INSERT_NOFAIL|flags); + parent = btree_node_parent(trans->paths + path, b); + as = bch2_btree_update_start(trans, trans->paths + path, level, false, + BCH_TRANS_COMMIT_no_enospc|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; - trace_and_count(c, btree_node_merge, c, b); + trace_and_count(c, btree_node_merge, trans, b); bch2_btree_interior_update_will_free_node(as, b); bch2_btree_interior_update_will_free_node(as, m); @@ -1882,10 +1903,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); + new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, new_path, n); + mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + new_path, n); bkey_init(&delete.k); delete.k.p = prev->key.k.p; @@ -1903,10 +1924,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); - bch2_btree_node_free_inmem(trans, path, b); - bch2_btree_node_free_inmem(trans, sib_path, m); + bch2_btree_node_free_inmem(trans, trans->paths + path, b); + bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m); - bch2_trans_node_add(trans, n); + bch2_trans_node_add(trans, trans->paths + path, n); bch2_trans_verify_paths(trans); @@ -1934,16 +1955,16 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_path *new_path = NULL; struct btree *n, *parent; struct btree_update *as; + btree_path_idx_t new_path = 0; int ret; - flags |= BTREE_INSERT_NOFAIL; + flags |= BCH_TRANS_COMMIT_no_enospc; - parent = btree_node_parent(iter->path, b); - as = bch2_btree_update_start(trans, iter->path, b->c.level, - false, flags); + struct btree_path *path = btree_iter_path(trans, iter); + parent = btree_node_parent(path, b); + as = bch2_btree_update_start(trans, path, b->c.level, false, flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto out; @@ -1958,27 +1979,27 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); - 
bch2_btree_path_level_init(trans, new_path, n); + mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + new_path, n); - trace_and_count(c, btree_node_rewrite, c, b); + trace_and_count(c, btree_node_rewrite, trans, b); if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - ret = bch2_btree_insert_node(as, trans, iter->path, parent, - &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, iter->path, + parent, &as->parent_keys, flags); if (ret) goto err; } else { - bch2_btree_set_root(as, trans, iter->path, n); + bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n); } bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); - bch2_btree_node_free_inmem(trans, iter->path, b); + bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b); - bch2_trans_node_add(trans, n); + bch2_trans_node_add(trans, trans->paths + iter->path, n); six_unlock_intent(&n->c.lock); bch2_btree_update_done(as, trans); @@ -2047,8 +2068,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) ret = bch2_trans_do(c, NULL, NULL, 0, async_btree_node_rewrite_trans(trans, a)); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2071,7 +2091,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) a->seq = b->data->keys.seq; INIT_WORK(&a->work, async_btree_node_rewrite_work); - if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { mutex_lock(&c->pending_node_rewrites_lock); list_add(&a->list, &c->pending_node_rewrites); mutex_unlock(&c->pending_node_rewrites_lock); @@ -2079,15 +2099,15 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) } if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { - if (test_bit(BCH_FS_STARTED, &c->flags)) { + if (test_bit(BCH_FS_started, &c->flags)) { bch_err(c, "%s: error getting c->writes ref", __func__); kfree(a); return; } ret = bch2_fs_read_write_early(c); + bch_err_msg(c, ret, "going read-write"); if (ret) { - bch_err_msg(c, ret, "going read-write"); kfree(a); return; } @@ -2138,13 +2158,12 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, int ret; if (!skip_triggers) { - ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, - bkey_i_to_s_c(&b->key), 0); - if (ret) - return ret; - - ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1, - new_key, 0); + ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s_c(&b->key), + BTREE_TRIGGER_TRANSACTIONAL) ?: + bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s(new_key), + BTREE_TRIGGER_TRANSACTIONAL); if (ret) return ret; } @@ -2156,7 +2175,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BUG_ON(ret); } - parent = btree_node_parent(iter->path, b); + parent = btree_node_parent(btree_iter_path(trans, iter), b); if (parent) { bch2_trans_copy_iter(&iter2, iter); @@ -2164,10 +2183,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, iter2.flags & BTREE_ITER_INTENT, _THIS_IP_); - BUG_ON(iter2.path->level != b->c.level); - BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p)); + struct btree_path *path2 = btree_iter_path(trans, &iter2); + BUG_ON(path2->level != b->c.level); + BUG_ON(!bpos_eq(path2->pos, new_key->k.p)); - btree_path_set_level_up(trans, iter2.path); + 
btree_path_set_level_up(trans, path2); trans->paths_sorted = false; @@ -2178,23 +2198,23 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, } else { BUG_ON(btree_node_root(c, b) != b); - ret = darray_make_room(&trans->extra_journal_entries, + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(new_key->k.u64s)); + ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; - journal_entry_set((void *) &darray_top(trans->extra_journal_entries), + journal_entry_set(e, BCH_JSET_ENTRY_btree_root, b->c.btree_id, b->c.level, new_key, new_key->k.u64s); - trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); } ret = bch2_trans_commit(trans, NULL, NULL, commit_flags); if (ret) goto err; - bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c); + bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c); if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -2209,7 +2229,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bkey_copy(&b->key, new_key); } - bch2_btree_node_unlock_write(trans, iter->path, b); + bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b); out: bch2_trans_iter_exit(trans, &iter2); return ret; @@ -2228,7 +2248,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite { struct bch_fs *c = trans->c; struct btree *new_hash = NULL; - struct btree_path *path = iter->path; + struct btree_path *path = btree_iter_path(trans, iter); struct closure cl; int ret = 0; @@ -2243,7 +2263,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite * btree_iter_traverse(): */ if (btree_ptr_hash_val(new_key) != b->hash_val) { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); if (ret) { ret = drop_locks_do(trans, (closure_sync(&cl), 0)); if (ret) @@ -2267,7 +2287,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite six_unlock_intent(&new_hash->c.lock); } closure_sync(&cl); - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return ret; } @@ -2286,7 +2306,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, goto out; /* has node been freed? 
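
The darray-based trans->extra_journal_entries and the separate wb_updates array are both folded into one growable buffer of jset entries in this patch: bch2_trans_jset_entry_alloc() only bumps trans->journal_entries_u64s on the fast path, while __bch2_trans_jset_entry_alloc() rounds the needed size up to a power of two, copies the old contents into a fresh transaction-memory allocation, and returns the new top of the buffer; callers such as __bch2_btree_node_update_key() above then journal_entry_set() directly into the returned slot. A small self-contained sketch of that bump-allocation scheme, with hypothetical names standing in for the trans fields:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct entry_buf {
	uint64_t	*entries;	/* appended entries, sized in u64s */
	unsigned	used_u64s;
	unsigned	size_u64s;
};

static unsigned roundup_pow2(unsigned v)	/* stand-in for roundup_pow_of_two() */
{
	unsigned r = 1;

	while (r < v)
		r <<= 1;
	return r;
}

/*
 * Reserve u64s worth of space and return a pointer to the reserved
 * region (the "top" of the buffer). On the slow path the buffer grows
 * to the next power of two and the old contents are copied over; the
 * patch allocates from transaction memory rather than the heap.
 */
static uint64_t *entry_alloc(struct entry_buf *b, unsigned u64s)
{
	if (b->used_u64s + u64s > b->size_u64s) {
		unsigned new_size = roundup_pow2(b->used_u64s + u64s);
		uint64_t *n = calloc(new_size, sizeof(*n));

		if (!n)
			return NULL;
		if (b->entries)
			memcpy(n, b->entries, b->used_u64s * sizeof(*n));
		free(b->entries);
		b->entries   = n;
		b->size_u64s = new_size;
	}

	uint64_t *e = b->entries + b->used_u64s;

	b->used_u64s += u64s;
	return e;
}
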
*/ - if (iter.path->l[b->c.level].b != b) { + if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); goto out; @@ -2328,12 +2348,12 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) closure_init_stack(&cl); do { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); closure_sync(&cl); } while (ret); b = bch2_btree_node_mem_alloc(trans, false); - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); set_btree_node_fake(b); set_btree_node_need_rewrite(b); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index a6668992a272..adfc62083844 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -117,16 +117,17 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, struct btree *, struct bkey_format); -int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); +int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned); -int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, +int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t, unsigned, unsigned, enum btree_node_sibling); static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path_idx, unsigned level, unsigned flags, enum btree_node_sibling sib) { + struct btree_path *path = trans->paths + path_idx; struct btree *b; EBUG_ON(!btree_node_locked(path, level)); @@ -135,11 +136,11 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) return 0; - return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); + return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib); } static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path, unsigned level, unsigned flags) { diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 4e6241db518b..5c1169c78daf 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -7,45 +7,144 @@ #include "btree_write_buffer.h" #include "error.h" #include "journal.h" +#include "journal_io.h" #include "journal_reclaim.h" -#include <linux/sort.h> +#include <linux/prefetch.h> -static int btree_write_buffered_key_cmp(const void *_l, const void *_r) +static int bch2_btree_write_buffer_journal_flush(struct journal *, + struct journal_entry_pin *, u64); + +static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); + +static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) +{ + return (cmp_int(l->hi, r->hi) ?: + cmp_int(l->mi, r->mi) ?: + cmp_int(l->lo, r->lo)) >= 0; +} + +static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) +{ +#ifdef CONFIG_X86_64 + int cmp; + + asm("mov (%[l]), %%rax;" + "sub (%[r]), %%rax;" + "mov 8(%[l]), %%rax;" + "sbb 8(%[r]), %%rax;" + "mov 16(%[l]), %%rax;" + "sbb 16(%[r]), %%rax;" + : "=@ccae" (cmp) + : [l] "r" (l), [r] "r" (r) + : "rax", "cc"); + + EBUG_ON(cmp != __wb_key_ref_cmp(l, r)); + return cmp; +#else + return __wb_key_ref_cmp(l, r); +#endif +} + +/* Compare excluding idx, the low 24 bits: */ +static inline bool wb_key_eq(const void *_l, const void 
*_r) { - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; + const struct wb_key_ref *l = _l; + const struct wb_key_ref *r = _r; - return cmp_int(l->btree, r->btree) ?: - bpos_cmp(l->k.k.p, r->k.k.p) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset); + return !((l->hi ^ r->hi)| + (l->mi ^ r->mi)| + ((l->lo >> 24) ^ (r->lo >> 24))); } -static int btree_write_buffered_journal_cmp(const void *_l, const void *_r) +static noinline void wb_sort(struct wb_key_ref *base, size_t num) { - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; + size_t n = num, a = num / 2; + + if (!a) /* num < 2 || size == 0 */ + return; + + for (;;) { + size_t b, c, d; + + if (a) /* Building heap: sift down --a */ + --a; + else if (--n) /* Sorting: Extract root to --n */ + swap(base[0], base[n]); + else /* Sort complete */ + break; - return cmp_int(l->journal_seq, r->journal_seq); + /* + * Sift element at "a" down into heap. This is the + * "bottom-up" variant, which significantly reduces + * calls to cmp_func(): we find the sift-down path all + * the way to the leaves (one compare per level), then + * backtrack to find where to insert the target element. + * + * Because elements tend to sift down close to the leaves, + * this uses fewer compares than doing two per level + * on the way down. (A bit more than half as many on + * average, 3/4 worst-case.) + */ + for (b = a; c = 2*b + 1, (d = c + 1) < n;) + b = wb_key_ref_cmp(base + c, base + d) ? c : d; + if (d == n) /* Special case last leaf with no sibling */ + b = c; + + /* Now backtrack from "b" to the correct location for "a" */ + while (b != a && wb_key_ref_cmp(base + a, base + b)) + b = (b - 1) / 2; + c = b; /* Where "a" belongs */ + while (b != a) { /* Shift it into place */ + b = (b - 1) / 2; + swap(base[b], base[c]); + } + } +} + +static noinline int wb_flush_one_slowpath(struct btree_trans *trans, + struct btree_iter *iter, + struct btree_write_buffered_key *wb) +{ + struct btree_path *path = btree_iter_path(trans, iter); + + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + + trans->journal_res.seq = wb->journal_seq; + + return bch2_trans_update(trans, iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_journal_res| + BCH_TRANS_COMMIT_journal_reclaim); } -static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, - struct btree_iter *iter, - struct btree_write_buffered_key *wb, - unsigned commit_flags, - bool *write_locked, - size_t *fast) +static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter, + struct btree_write_buffered_key *wb, + bool *write_locked, size_t *fast) { struct bch_fs *c = trans->c; struct btree_path *path; int ret; + EBUG_ON(!wb->journal_seq); + EBUG_ON(!c->btree_write_buffer.flushing.pin.seq); + EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); + ret = bch2_btree_iter_traverse(iter); if (ret) return ret; - path = iter->path; + /* + * We can't clone a path that has write locks: unshare it now, before + * set_pos and traverse(): + */ + if (btree_iter_path(trans, iter)->ref > 1) + iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_); + + path = btree_iter_path(trans, iter); if (!*write_locked) { ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); @@ -56,52 +155,14 @@ static int 
bch2_btree_write_buffer_flush_one(struct btree_trans *trans, *write_locked = true; } - if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) { - bch2_btree_node_unlock_write(trans, path, path->l[0].b); + if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) { *write_locked = false; - goto trans_commit; + return wb_flush_one_slowpath(trans, iter, wb); } bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); (*fast)++; - - if (path->ref > 1) { - /* - * We can't clone a path that has write locks: if the path is - * shared, unlock before set_pos(), traverse(): - */ - bch2_btree_node_unlock_write(trans, path, path->l[0].b); - *write_locked = false; - } return 0; -trans_commit: - return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, - commit_flags| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RECLAIM); -} - -static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) -{ - union btree_write_buffer_state old, new; - u64 v = READ_ONCE(wb->state.v); - - do { - old.v = new.v = v; - - new.nr = 0; - new.idx++; - } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); - - while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1) - cpu_relax(); - - smp_mb(); - - return old; } /* @@ -124,41 +185,87 @@ btree_write_buffered_insert(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), BTREE_ITER_CACHED|BTREE_ITER_INTENT); + trans->journal_res.seq = wb->journal_seq; + ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_update(trans, &iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); return ret; } -int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags, - bool locked) +static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) +{ + struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer); + struct journal *j = &c->journal; + + if (!wb->inc.keys.nr) + return; + + bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); + + darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr)); + darray_resize(&wb->sorted, wb->flushing.keys.size); + + if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) { + swap(wb->flushing.keys, wb->inc.keys); + goto out; + } + + size_t nr = min(darray_room(wb->flushing.keys), + wb->sorted.size - wb->flushing.keys.nr); + nr = min(nr, wb->inc.keys.nr); + + memcpy(&darray_top(wb->flushing.keys), + wb->inc.keys.data, + sizeof(wb->inc.keys.data[0]) * nr); + + memmove(wb->inc.keys.data, + wb->inc.keys.data + nr, + sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr)); + + wb->flushing.keys.nr += nr; + wb->inc.keys.nr -= nr; +out: + if (!wb->inc.keys.nr) + bch2_journal_pin_drop(j, &wb->inc.pin); + else + bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin, + bch2_btree_write_buffer_journal_flush); + + if (j->watermark) { + spin_lock(&j->lock); + bch2_journal_set_watermark(j); + spin_unlock(&j->lock); + } + + BUG_ON(wb->sorted.size < wb->flushing.keys.nr); +} + +static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct journal *j = 
&c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; - struct journal_entry_pin pin; - struct btree_write_buffered_key *i, *keys; struct btree_iter iter = { NULL }; - size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; + size_t skipped = 0, fast = 0, slowpath = 0; bool write_locked = false; - union btree_write_buffer_state s; int ret = 0; - memset(&pin, 0, sizeof(pin)); - - if (!locked && !mutex_trylock(&wb->flush_lock)) - return 0; - - bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL); - bch2_journal_pin_drop(j, &wb->journal_pin); + bch2_trans_unlock(trans); + bch2_trans_begin(trans); - s = btree_write_buffer_switch(wb); - keys = wb->keys[s.idx]; - nr = s.nr; + mutex_lock(&wb->inc.lock); + move_keys_from_inc_to_flushing(wb); + mutex_unlock(&wb->inc.lock); - if (race_fault()) - goto slowpath; + for (size_t i = 0; i < wb->flushing.keys.nr; i++) { + wb->sorted.data[i].idx = i; + wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree; + memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos)); + } + wb->sorted.nr = wb->flushing.keys.nr; /* * We first sort so that we can detect and skip redundant updates, and @@ -168,208 +275,373 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f * However, since we're not flushing in the order they appear in the * journal we won't be able to drop our journal pin until everything is * flushed - which means this could deadlock the journal if we weren't - * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail + * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail * if it would block taking a journal reservation. * * If that happens, simply skip the key so we can optimistically insert * as many keys as possible in the fast path. 
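The redundant-update skipping described above happens just below: wb_sort() orders the packed wb_key_ref array by (btree, pos, insertion index), so repeated updates to the same position end up adjacent, oldest first, and the loop keeps only the newest one while giving it the smaller journal_seq so the pin accounting stays conservative. A standalone sketch of that coalescing step, using assumed simplified types (wb_entry_sk) rather than the patch's, is below.

struct wb_entry_sk {
	unsigned long long pos;		/* stand-in for (btree, bpos) */
	unsigned long long journal_seq;	/* 0 means "skip at flush time" */
};

/* entries already sorted by pos; equal positions appear in insertion order */
static void coalesce_sk(struct wb_entry_sk *e, unsigned long nr)
{
	for (unsigned long i = 0; i + 1 < nr; i++) {
		if (e[i].pos != e[i + 1].pos)
			continue;

		/* survivor keeps the smaller journal_seq of the pair */
		if (e[i + 1].journal_seq > e[i].journal_seq)
			e[i + 1].journal_seq = e[i].journal_seq;

		/* older update is superseded; the flush loop skips seq == 0 */
		e[i].journal_seq = 0;
	}
}

int main(void)
{
	struct wb_entry_sk e[] = {
		{ .pos = 10, .journal_seq = 5 },	/* older update to pos 10 */
		{ .pos = 10, .journal_seq = 9 },	/* newer update to pos 10 */
		{ .pos = 20, .journal_seq = 7 },
	};

	coalesce_sk(e, 3);

	/* e[0] is skipped, e[1] carries seq 5, e[2] is untouched */
	return !(e[0].journal_seq == 0 && e[1].journal_seq == 5 &&
		 e[2].journal_seq == 7);
}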
*/ - sort(keys, nr, sizeof(keys[0]), - btree_write_buffered_key_cmp, NULL); + wb_sort(wb->sorted.data, wb->sorted.nr); + + darray_for_each(wb->sorted, i) { + struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; + + for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) + prefetch(&wb->flushing.keys.data[n->idx]); + + BUG_ON(!k->journal_seq); + + if (i + 1 < &darray_top(wb->sorted) && + wb_key_eq(i, i + 1)) { + struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; - for (i = keys; i < keys + nr; i++) { - if (i + 1 < keys + nr && - i[0].btree == i[1].btree && - bpos_eq(i[0].k.k.p, i[1].k.k.p)) { skipped++; - i->journal_seq = 0; + n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); + k->journal_seq = 0; continue; } - if (write_locked && - (iter.path->btree_id != i->btree || - bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) { - bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); - write_locked = false; + if (write_locked) { + struct btree_path *path = btree_iter_path(trans, &iter); + + if (path->btree_id != i->btree || + bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + write_locked = false; + } } - if (!iter.path || iter.path->btree_id != i->btree) { + if (!iter.path || iter.btree_id != k->btree) { bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, + bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); } - bch2_btree_iter_set_pos(&iter, i->k.k.p); - iter.path->preserve = false; + bch2_btree_iter_set_pos(&iter, k->k.k.p); + btree_iter_path(trans, &iter)->preserve = false; do { - ret = bch2_btree_write_buffer_flush_one(trans, &iter, i, - commit_flags, &write_locked, &fast); + if (race_fault()) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + break; + } + + ret = wb_flush_one(trans, &iter, k, &write_locked, &fast); if (!write_locked) bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); - if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { + if (!ret) { + k->journal_seq = 0; + } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { slowpath++; - continue; - } - if (ret) + ret = 0; + } else break; - - i->journal_seq = 0; } - if (write_locked) - bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); + if (write_locked) { + struct btree_path *path = btree_iter_path(trans, &iter); + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + } bch2_trans_iter_exit(trans, &iter); - trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); - - if (slowpath) - goto slowpath; + if (ret) + goto err; + if (slowpath) { + /* + * Flush in the order they were present in the journal, so that + * we can release journal pins: + * The fastpath zapped the seq of keys that were successfully flushed so + * we can skip those here. 
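The reason journal order matters here is journal-pin bookkeeping: the write buffer's pin keeps the journal from reclaiming entries whose keys are not yet in the btree, and flushing in seq order lets that pin be advanced past each entry as soon as its keys are durable (the bch2_journal_pin_update() call in the loop below) instead of holding the whole range until the buffer is empty. A rough conceptual sketch follows, with an assumed, simplified pin representation rather than bcachefs's journal_entry_pin machinery.

struct pin_sk {
	unsigned long long seq;		/* 0 == unused/dropped */
};

/* journal entries below the oldest pinned seq can be reclaimed */
static unsigned long long reclaim_horizon_sk(const struct pin_sk *pins,
					     unsigned long nr,
					     unsigned long long newest_seq)
{
	unsigned long long horizon = newest_seq + 1;

	for (unsigned long i = 0; i < nr; i++)
		if (pins[i].seq && pins[i].seq < horizon)
			horizon = pins[i].seq;
	return horizon;
}

int main(void)
{
	struct pin_sk pins[] = { { .seq = 12 }, { .seq = 0 } };

	/* once the keys from entry 12 are flushed, advance the pin to the
	 * next unflushed seq, as the slowpath loop below does per key */
	pins[0].seq = 15;

	return reclaim_horizon_sk(pins, 2, 20) == 15 ? 0 : 1;
}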
+ */ + trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); + + darray_for_each(wb->flushing.keys, i) { + if (!i->journal_seq) + continue; + + bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); + + bch2_trans_begin(trans); + + ret = commit_do(trans, NULL, NULL, + BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_journal_res| + BCH_TRANS_COMMIT_journal_reclaim, + btree_write_buffered_insert(trans, i)); + if (ret) + goto err; + } + } +err: bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); -out: - bch2_journal_pin_drop(j, &pin); - mutex_unlock(&wb->flush_lock); + trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0); + bch2_journal_pin_drop(j, &wb->flushing.pin); + wb->flushing.keys.nr = 0; return ret; -slowpath: - trace_write_buffer_flush_slowpath(trans, i - keys, nr); +} - /* - * Now sort the rest by journal seq and bump the journal pin as we go. - * The slowpath zapped the seq of keys that were successfully flushed so - * we can skip those here. - */ - sort(keys, nr, sizeof(keys[0]), - btree_write_buffered_journal_cmp, - NULL); +static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) +{ + struct journal *j = &c->journal; + struct journal_buf *buf; + int ret = 0; - commit_flags &= ~BCH_WATERMARK_MASK; - commit_flags |= BCH_WATERMARK_reclaim; + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { + ret = bch2_journal_keys_to_write_buffer(c, buf); + mutex_unlock(&j->buf_lock); + } - for (i = keys; i < keys + nr; i++) { - if (!i->journal_seq) - continue; + return ret; +} - if (i->journal_seq > pin.seq) { - struct journal_entry_pin pin2; +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) +{ + struct bch_fs *c = trans->c; + struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret = 0, fetch_from_journal_err; - memset(&pin2, 0, sizeof(pin2)); + do { + bch2_trans_unlock(trans); - bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL); - bch2_journal_pin_drop(j, &pin); - bch2_journal_pin_copy(j, &pin, &pin2, NULL); - bch2_journal_pin_drop(j, &pin2); - } + fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); - ret = commit_do(trans, NULL, NULL, - commit_flags| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RECLAIM, - btree_write_buffered_insert(trans, i)); - if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) - break; - } + /* + * On memory allocation failure, bch2_btree_write_buffer_flush_locked() + * is not guaranteed to empty wb->inc: + */ + mutex_lock(&wb->flushing.lock); + ret = bch2_btree_write_buffer_flush_locked(trans); + mutex_unlock(&wb->flushing.lock); + } while (!ret && + (fetch_from_journal_err || + (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || + (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); - goto out; + return ret; } -int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) +static int bch2_btree_write_buffer_journal_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) { - bch2_trans_unlock(trans); - mutex_lock(&trans->c->btree_write_buffer.flush_lock); - return __bch2_btree_write_buffer_flush(trans, 0, true); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq)); } -int bch2_btree_write_buffer_flush(struct btree_trans *trans) +int bch2_btree_write_buffer_flush_sync(struct 
btree_trans *trans) { - return __bch2_btree_write_buffer_flush(trans, 0, false); + struct bch_fs *c = trans->c; + + trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); + + return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal)); } -static int bch2_btree_write_buffer_journal_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) +int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret = 0; - mutex_lock(&wb->flush_lock); + if (mutex_trylock(&wb->flushing.lock)) { + ret = bch2_btree_write_buffer_flush_locked(trans); + mutex_unlock(&wb->flushing.lock); + } - return bch2_trans_run(c, - __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true)); + return ret; } -static inline u64 btree_write_buffer_ref(int idx) +int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) { - return ((union btree_write_buffer_state) { - .ref0 = idx == 0, - .ref1 = idx == 1, - }).v; + struct bch_fs *c = trans->c; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) + return -BCH_ERR_erofs_no_writes; + + int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + return ret; } -int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans) +static void bch2_btree_write_buffer_flush_work(struct work_struct *work) { - struct bch_fs *c = trans->c; + struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_write_buffered_key *i; - union btree_write_buffer_state old, new; - int ret = 0; - u64 v; + int ret; - trans_for_each_wb_update(trans, i) { - EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + mutex_lock(&wb->flushing.lock); + do { + ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); + } while (!ret && bch2_btree_write_buffer_should_flush(c)); + mutex_unlock(&wb->flushing.lock); + + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); +} - i->journal_seq = trans->journal_res.seq; - i->journal_offset = trans->journal_res.offset; +int bch2_journal_key_to_wb_slowpath(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret; +retry: + ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL); + if (!ret && dst->wb == &wb->flushing) + ret = darray_resize(&wb->sorted, wb->flushing.keys.size); + + if (unlikely(ret)) { + if (dst->wb == &c->btree_write_buffer.flushing) { + mutex_unlock(&dst->wb->lock); + dst->wb = &c->btree_write_buffer.inc; + bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin, + bch2_btree_write_buffer_journal_flush); + goto retry; + } + + return ret; } - preempt_disable(); - v = READ_ONCE(wb->state.v); - do { - old.v = new.v = v; + dst->room = darray_room(dst->wb->keys); + if (dst->wb == &wb->flushing) + dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); + BUG_ON(!dst->room); + BUG_ON(!dst->seq); + + struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); + wb_k->journal_seq = dst->seq; + wb_k->btree = btree; + bkey_copy(&wb_k->k, k); + dst->wb->keys.nr++; + dst->room--; + return 0; +} + +void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq) +{ + struct btree_write_buffer *wb = 
&c->btree_write_buffer; + + if (mutex_trylock(&wb->flushing.lock)) { + mutex_lock(&wb->inc.lock); + move_keys_from_inc_to_flushing(wb); - new.v += btree_write_buffer_ref(new.idx); - new.nr += trans->nr_wb_updates; - if (new.nr > wb->size) { - ret = -BCH_ERR_btree_insert_need_flush_buffer; - goto out; + /* + * Attempt to skip wb->inc, and add keys directly to + * wb->flushing, saving us a copy later: + */ + + if (!wb->inc.keys.nr) { + dst->wb = &wb->flushing; + } else { + mutex_unlock(&wb->flushing.lock); + dst->wb = &wb->inc; } - } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + } else { + mutex_lock(&wb->inc.lock); + dst->wb = &wb->inc; + } - memcpy(wb->keys[new.idx] + old.nr, - trans->wb_updates, - sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); + dst->room = darray_room(dst->wb->keys); + if (dst->wb == &wb->flushing) + dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); + dst->seq = seq; - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin, + bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, bch2_btree_write_buffer_journal_flush); +} + +void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + if (!dst->wb->keys.nr) + bch2_journal_pin_drop(&c->journal, &dst->wb->pin); + + if (bch2_btree_write_buffer_should_flush(c) && + __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && + !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + + if (dst->wb == &wb->flushing) + mutex_unlock(&wb->flushing.lock); + mutex_unlock(&wb->inc.lock); +} + +static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) +{ + struct journal_keys_to_wb dst; + struct jset_entry *entry; + struct bkey_i *k; + int ret = 0; + + bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); - atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter); + for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { + jset_entry_for_each_key(entry, k) { + ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); + if (ret) + goto out; + } + + entry->type = BCH_JSET_ENTRY_btree_keys; + } + + buf->need_flush_to_write_buffer = false; out: - preempt_enable(); + bch2_journal_keys_to_write_buffer_end(c, &dst); + return ret; +} + +static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) +{ + if (wb->keys.size >= new_size) + return 0; + + if (!mutex_trylock(&wb->lock)) + return -EINTR; + + int ret = darray_resize(&wb->keys, new_size); + mutex_unlock(&wb->lock); return ret; } +int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb_keys_resize(&wb->flushing, new_size) ?: + wb_keys_resize(&wb->inc, new_size); +} + void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; - BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal)); + BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && + !bch2_journal_error(&c->journal)); - kvfree(wb->keys[1]); - kvfree(wb->keys[0]); + darray_exit(&wb->sorted); + darray_exit(&wb->flushing.keys); + darray_exit(&wb->inc.keys); } int bch2_fs_btree_write_buffer_init(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; - mutex_init(&wb->flush_lock); - wb->size = 
c->opts.btree_write_buffer_size; + mutex_init(&wb->inc.lock); + mutex_init(&wb->flushing.lock); + INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); - wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL); - wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); - if (!wb->keys[0] || !wb->keys[1]) - return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init; + /* Will be resized by journal as needed: */ + unsigned initial_size = 1 << 16; - return 0; + return darray_make_room(&wb->inc.keys, initial_size) ?: + darray_make_room(&wb->flushing.keys, initial_size) ?: + darray_make_room(&wb->sorted, initial_size); } diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index 322df1c8304e..eebcd2b15249 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -2,12 +2,59 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H #define _BCACHEFS_BTREE_WRITE_BUFFER_H -int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool); +#include "bkey.h" + +static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4; +} + +static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4; +} + +struct btree_trans; int bch2_btree_write_buffer_flush_sync(struct btree_trans *); -int bch2_btree_write_buffer_flush(struct btree_trans *); +int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); +int bch2_btree_write_buffer_tryflush(struct btree_trans *); + +struct journal_keys_to_wb { + struct btree_write_buffer_keys *wb; + size_t room; + u64 seq; +}; + +int bch2_journal_key_to_wb_slowpath(struct bch_fs *, + struct journal_keys_to_wb *, + enum btree_id, struct bkey_i *); + +static inline int bch2_journal_key_to_wb(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) +{ + EBUG_ON(!dst->seq); + + if (unlikely(!dst->room)) + return bch2_journal_key_to_wb_slowpath(c, dst, btree, k); + + struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); + wb_k->journal_seq = dst->seq; + wb_k->btree = btree; + bkey_copy(&wb_k->k, k); + dst->wb->keys.nr++; + dst->room--; + return 0; +} -int bch2_btree_insert_keys_write_buffer(struct btree_trans *); +void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); +void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); +int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); int bch2_fs_btree_write_buffer_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h index 99993ba77aea..9b9433de9c36 100644 --- a/fs/bcachefs/btree_write_buffer_types.h +++ b/fs/bcachefs/btree_write_buffer_types.h @@ -2,43 +2,56 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H +#include "darray.h" #include "journal_types.h" #define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 #define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) -struct btree_write_buffered_key { - u64 journal_seq; - unsigned journal_offset; - enum btree_id btree; - __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); -}; - -union btree_write_buffer_state { +struct 
wb_key_ref { +union { struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned idx:24; + u8 pos[sizeof(struct bpos)]; + enum btree_id btree:8; +#else + enum btree_id btree:8; + u8 pos[sizeof(struct bpos)]; + unsigned idx:24; +#endif + } __packed; struct { - u64 nr:23; - u64 idx:1; - u64 ref0:20; - u64 ref1:20; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + u64 lo; + u64 mi; + u64 hi; +#else + u64 hi; + u64 mi; + u64 lo; +#endif }; }; +}; -struct btree_write_buffer { - struct mutex flush_lock; - struct journal_entry_pin journal_pin; +struct btree_write_buffered_key { + enum btree_id btree:8; + u64 journal_seq:56; + __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); +}; - union btree_write_buffer_state state; - size_t size; +struct btree_write_buffer_keys { + DARRAY(struct btree_write_buffered_key) keys; + struct journal_entry_pin pin; + struct mutex lock; +}; - struct btree_write_buffered_key *keys[2]; +struct btree_write_buffer { + DARRAY(struct wb_key_ref) sorted; + struct btree_write_buffer_keys inc; + struct btree_write_buffer_keys flushing; + struct work_struct flush_work; }; #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5a91d3189fcf..d83ea0e53df3 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -47,27 +47,23 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, void bch2_fs_usage_initialize(struct bch_fs *c) { - struct bch_fs_usage *usage; - struct bch_dev *ca; - unsigned i; - percpu_down_write(&c->mark_lock); - usage = c->usage_base; + struct bch_fs_usage *usage = c->usage_base; - for (i = 0; i < ARRAY_SIZE(c->usage); i++) + for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); - for (i = 0; i < BCH_REPLICAS_MAX; i++) + for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) usage->reserved += usage->persistent_reserved[i]; - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + for (unsigned i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); } - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bch_dev_usage dev = bch2_dev_usage_read(ca); usage->hidden += (dev.d[BCH_DATA_sb].buckets + @@ -158,8 +154,7 @@ retry: void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) { - struct bch_dev *ca; - unsigned i, u64s = fs_usage_u64s(c); + unsigned u64s = fs_usage_u64s(c); BUG_ON(idx >= ARRAY_SIZE(c->usage)); @@ -171,7 +166,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) { + for_each_member_device_rcu(c, ca, NULL) { u64s = dev_usage_u64s(); acc_u64s_percpu((u64 *) ca->usage_base, @@ -214,7 +209,7 @@ void bch2_fs_usage_to_text(struct printbuf *out, } for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); prt_printf(out, "\t"); @@ -277,18 +272,34 @@ void bch2_dev_usage_init(struct bch_dev *ca) ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; } -static inline int bucket_sectors_fragmented(struct bch_dev *ca, - struct bch_alloc_v4 a) +void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) { - return a.dirty_sectors - ? 
max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) - : 0; + prt_tab(out); + prt_str(out, "buckets"); + prt_tab_rjust(out); + prt_str(out, "sectors"); + prt_tab_rjust(out); + prt_str(out, "fragmented"); + prt_tab_rjust(out); + prt_newline(out); + + for (unsigned i = 0; i < BCH_DATA_NR; i++) { + prt_str(out, bch2_data_types[i]); + prt_tab(out); + prt_u64(out, usage->d[i].buckets); + prt_tab_rjust(out); + prt_u64(out, usage->d[i].sectors); + prt_tab_rjust(out); + prt_u64(out, usage->d[i].fragmented); + prt_tab_rjust(out); + prt_newline(out); + } } -static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bch_alloc_v4 old, - struct bch_alloc_v4 new, - u64 journal_seq, bool gc) +void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + const struct bch_alloc_v4 *old, + const struct bch_alloc_v4 *new, + u64 journal_seq, bool gc) { struct bch_fs_usage *fs_usage; struct bch_dev_usage *u; @@ -296,56 +307,51 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); - if (data_type_is_hidden(old.data_type)) + if (data_type_is_hidden(old->data_type)) fs_usage->hidden -= ca->mi.bucket_size; - if (data_type_is_hidden(new.data_type)) + if (data_type_is_hidden(new->data_type)) fs_usage->hidden += ca->mi.bucket_size; u = dev_usage_ptr(ca, journal_seq, gc); - u->d[old.data_type].buckets--; - u->d[new.data_type].buckets++; - - u->buckets_ec -= (int) !!old.stripe; - u->buckets_ec += (int) !!new.stripe; + u->d[old->data_type].buckets--; + u->d[new->data_type].buckets++; - u->d[old.data_type].sectors -= old.dirty_sectors; - u->d[new.data_type].sectors += new.dirty_sectors; + u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old); + u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new); - u->d[BCH_DATA_cached].sectors += new.cached_sectors; - u->d[BCH_DATA_cached].sectors -= old.cached_sectors; + u->d[BCH_DATA_cached].sectors += new->cached_sectors; + u->d[BCH_DATA_cached].sectors -= old->cached_sectors; - u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); - u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); + u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old); + u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new); preempt_enable(); } -static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, - struct bucket old, struct bucket new, - u64 journal_seq, bool gc) +static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) { - struct bch_alloc_v4 old_a = { - .gen = old.gen, - .data_type = old.data_type, - .dirty_sectors = old.dirty_sectors, - .cached_sectors = old.cached_sectors, - .stripe = old.stripe, - }; - struct bch_alloc_v4 new_a = { - .gen = new.gen, - .data_type = new.data_type, - .dirty_sectors = new.dirty_sectors, - .cached_sectors = new.cached_sectors, - .stripe = new.stripe, + return (struct bch_alloc_v4) { + .gen = b.gen, + .data_type = b.data_type, + .dirty_sectors = b.dirty_sectors, + .cached_sectors = b.cached_sectors, + .stripe = b.stripe, }; +} + +void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, + struct bucket *old, struct bucket *new) +{ + struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old); + struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new); - bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); + bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true); } static inline int __update_replicas(struct bch_fs *c, struct 
bch_fs_usage *fs_usage, - struct bch_replicas_entry *r, + struct bch_replicas_entry_v1 *r, s64 sectors) { int idx = bch2_replicas_entry_idx(c, r); @@ -358,9 +364,9 @@ static inline int __update_replicas(struct bch_fs *c, return 0; } -static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, - struct bch_replicas_entry *r, s64 sectors, - unsigned journal_seq, bool gc) +int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k, + struct bch_replicas_entry_v1 *r, s64 sectors, + unsigned journal_seq, bool gc) { struct bch_fs_usage *fs_usage; int idx, ret = 0; @@ -407,7 +413,7 @@ static inline int update_cached_sectors(struct bch_fs *c, bch2_replicas_entry_cached(&r.e, dev); - return update_replicas(c, k, &r.e, sectors, journal_seq, gc); + return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc); } static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, @@ -453,9 +459,9 @@ int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more) __replicas_deltas_realloc(trans, more, _gfp)); } -static inline int update_replicas_list(struct btree_trans *trans, - struct bch_replicas_entry *r, - s64 sectors) +int bch2_update_replicas_list(struct btree_trans *trans, + struct bch_replicas_entry_v1 *r, + s64 sectors) { struct replicas_delta_list *d; struct replicas_delta *n; @@ -481,139 +487,13 @@ static inline int update_replicas_list(struct btree_trans *trans, return 0; } -static inline int update_cached_sectors_list(struct btree_trans *trans, - unsigned dev, s64 sectors) +int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors) { struct bch_replicas_padded r; bch2_replicas_entry_cached(&r.e, dev); - return update_replicas_list(trans, &r.e, sectors); -} - -int bch2_mark_alloc(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - bool gc = flags & BTREE_TRIGGER_GC; - u64 journal_seq = trans->journal_res.seq; - u64 bucket_journal_seq; - struct bch_fs *c = trans->c; - struct bch_alloc_v4 old_a_convert, new_a_convert; - const struct bch_alloc_v4 *old_a, *new_a; - struct bch_dev *ca; - int ret = 0; - - /* - * alloc btree is read in by bch2_alloc_read, not gc: - */ - if ((flags & BTREE_TRIGGER_GC) && - !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) - return 0; - - if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, - "alloc key for invalid device or bucket")) - return -EIO; - - ca = bch_dev_bkey_exists(c, new.k->p.inode); - - old_a = bch2_alloc_to_v4(old, &old_a_convert); - new_a = bch2_alloc_to_v4(new, &new_a_convert); - - bucket_journal_seq = new_a->journal_seq; - - if ((flags & BTREE_TRIGGER_INSERT) && - data_type_is_empty(old_a->data_type) != - data_type_is_empty(new_a->data_type) && - new.k->type == KEY_TYPE_alloc_v4) { - struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; - - EBUG_ON(!journal_seq); - - /* - * If the btree updates referring to a bucket weren't flushed - * before the bucket became empty again, then the we don't have - * to wait on a journal flush before we can reuse the bucket: - */ - v->journal_seq = bucket_journal_seq = - data_type_is_empty(new_a->data_type) && - (journal_seq == v->journal_seq || - bch2_journal_noflush_seq(&c->journal, v->journal_seq)) - ? 
0 : journal_seq; - } - - if (!data_type_is_empty(old_a->data_type) && - data_type_is_empty(new_a->data_type) && - bucket_journal_seq) { - ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - new.k->p.inode, new.k->p.offset, - bucket_journal_seq); - if (ret) { - bch2_fs_fatal_error(c, - "error setting bucket_needs_journal_commit: %i", ret); - return ret; - } - } - - percpu_down_read(&c->mark_lock); - if (!gc && new_a->gen != old_a->gen) - *bucket_gen(ca, new.k->p.offset) = new_a->gen; - - bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc); - - if (gc) { - struct bucket *g = gc_bucket(ca, new.k->p.offset); - - bucket_lock(g); - - g->gen_valid = 1; - g->gen = new_a->gen; - g->data_type = new_a->data_type; - g->stripe = new_a->stripe; - g->stripe_redundancy = new_a->stripe_redundancy; - g->dirty_sectors = new_a->dirty_sectors; - g->cached_sectors = new_a->cached_sectors; - - bucket_unlock(g); - } - percpu_up_read(&c->mark_lock); - - /* - * need to know if we're getting called from the invalidate path or - * not: - */ - - if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && - old_a->cached_sectors) { - ret = update_cached_sectors(c, new, ca->dev_idx, - -((s64) old_a->cached_sectors), - journal_seq, gc); - if (ret) { - bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", - __func__); - return ret; - } - } - - if (new_a->data_type == BCH_DATA_free && - (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) - closure_wake_up(&c->freelist_wait); - - if (new_a->data_type == BCH_DATA_need_discard && - (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) - bch2_do_discards(c); - - if (old_a->data_type != BCH_DATA_cached && - new_a->data_type == BCH_DATA_cached && - should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_do_invalidates(c); - - if (new_a->data_type == BCH_DATA_need_gc_gens) - bch2_do_gc_gens(c); - - return 0; + return bch2_update_replicas_list(trans, &r.e, sectors); } int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -658,31 +538,27 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, goto err; } - g->data_type = data_type; g->dirty_sectors += sectors; new = *g; err: bucket_unlock(g); if (!ret) - bch2_dev_usage_update_m(c, ca, old, new, 0, true); + bch2_dev_usage_update_m(c, ca, &old, &new); percpu_up_read(&c->mark_lock); return ret; } -static int check_bucket_ref(struct btree_trans *trans, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 b_gen, u8 bucket_data_type, - u32 dirty_sectors, u32 cached_sectors) +int bch2_check_bucket_ref(struct btree_trans *trans, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 b_gen, u8 bucket_data_type, + u32 bucket_sectors) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); - u32 bucket_sectors = !ptr->cached - ? 
dirty_sectors - : cached_sectors; struct printbuf buf = PRINTBUF; int ret = 0; @@ -777,508 +653,6 @@ err: goto out; } -static int mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c k, - unsigned ptr_idx, - unsigned flags) -{ - struct bch_fs *c = trans->c; - u64 journal_seq = trans->journal_res.seq; - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned nr_data = s->nr_blocks - s->nr_redundant; - bool parity = ptr_idx >= nr_data; - enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; - s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; - const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket old, new, *g; - struct printbuf buf = PRINTBUF; - int ret = 0; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - /* * XXX doesn't handle deletion */ - - percpu_down_read(&c->mark_lock); - g = PTR_GC_BUCKET(ca, ptr); - - if (g->dirty_sectors || - (g->stripe && g->stripe != k.k->p.offset)) { - bch2_fs_inconsistent(c, - "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EINVAL; - goto err; - } - - bucket_lock(g); - old = *g; - - ret = check_bucket_ref(trans, k, ptr, sectors, data_type, - g->gen, g->data_type, - g->dirty_sectors, g->cached_sectors); - if (ret) - goto err; - - g->data_type = data_type; - g->dirty_sectors += sectors; - - g->stripe = k.k->p.offset; - g->stripe_redundancy = s->nr_redundant; - new = *g; -err: - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); - percpu_up_read(&c->mark_lock); - printbuf_exit(&buf); - return ret; -} - -static int __mark_pointer(struct btree_trans *trans, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 bucket_gen, u8 *bucket_data_type, - u32 *dirty_sectors, u32 *cached_sectors) -{ - u32 *dst_sectors = !ptr->cached - ? 
dirty_sectors - : cached_sectors; - int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, - bucket_gen, *bucket_data_type, - *dirty_sectors, *cached_sectors); - - if (ret) - return ret; - - *dst_sectors += sectors; - - if (!*dirty_sectors && !*cached_sectors) - *bucket_data_type = 0; - else if (*bucket_data_type != BCH_DATA_stripe) - *bucket_data_type = ptr_data_type; - - return 0; -} - -static int bch2_mark_pointer(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - struct extent_ptr_decoded p, - s64 sectors, - unsigned flags) -{ - u64 journal_seq = trans->journal_res.seq; - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket old, new, *g; - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); - u8 bucket_data_type; - int ret = 0; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - percpu_down_read(&c->mark_lock); - g = PTR_GC_BUCKET(ca, &p.ptr); - bucket_lock(g); - old = *g; - - bucket_data_type = g->data_type; - ret = __mark_pointer(trans, k, &p.ptr, sectors, - data_type, g->gen, - &bucket_data_type, - &g->dirty_sectors, - &g->cached_sectors); - if (!ret) - g->data_type = bucket_data_type; - - new = *g; - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); - percpu_up_read(&c->mark_lock); - - return ret; -} - -static int bch2_mark_stripe_ptr(struct btree_trans *trans, - struct bkey_s_c k, - struct bch_extent_stripe_ptr p, - enum bch_data_type data_type, - s64 sectors, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bch_replicas_padded r; - struct gc_stripe *m; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); - if (!m) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", - (u64) p.idx); - return -BCH_ERR_ENOMEM_mark_stripe_ptr; - } - - mutex_lock(&c->ec_stripes_heap_lock); - - if (!m || !m->alive) { - mutex_unlock(&c->ec_stripes_heap_lock); - bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", - (u64) p.idx); - bch2_inconsistent_error(c); - return -EIO; - } - - m->block_sectors[p.block] += sectors; - - r = m->r; - mutex_unlock(&c->ec_stripes_heap_lock); - - r.e.data_type = data_type; - update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); - - return 0; -} - -static int __mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) -{ - u64 journal_seq = trans->journal_res.seq; - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bch_replicas_padded r; - enum bch_data_type data_type = bkey_is_btree_ptr(k.k) - ? BCH_DATA_btree - : BCH_DATA_user; - s64 sectors = bkey_is_btree_ptr(k.k) - ? 
btree_sectors(c) - : k.k->size; - s64 dirty_sectors = 0; - bool stale; - int ret; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - r.e.data_type = data_type; - r.e.nr_devs = 0; - r.e.nr_required = 1; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = ptr_disk_sectors(sectors, p); - - if (flags & BTREE_TRIGGER_OVERWRITE) - disk_sectors = -disk_sectors; - - ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags); - if (ret < 0) - return ret; - - stale = ret > 0; - - if (p.ptr.cached) { - if (!stale) { - ret = update_cached_sectors(c, k, p.ptr.dev, - disk_sectors, journal_seq, true); - if (ret) { - bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", - __func__); - return ret; - } - } - } else if (!p.has_ec) { - dirty_sectors += disk_sectors; - r.e.devs[r.e.nr_devs++] = p.ptr.dev; - } else { - ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, - disk_sectors, flags); - if (ret) - return ret; - - /* - * There may be other dirty pointers in this extent, but - * if so they're not required for mounting if we have an - * erasure coded pointer in this extent: - */ - r.e.nr_required = 0; - } - } - - if (r.e.nr_devs) { - ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); - if (ret) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); - printbuf_exit(&buf); - return ret; - } - } - - return 0; -} - -int bch2_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags); -} - -int bch2_mark_stripe(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - bool gc = flags & BTREE_TRIGGER_GC; - u64 journal_seq = trans->journal_res.seq; - struct bch_fs *c = trans->c; - u64 idx = new.k->p.offset; - const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(old).v : NULL; - const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe - ? 
bkey_s_c_to_stripe(new).v : NULL; - unsigned i; - int ret; - - BUG_ON(gc && old_s); - - if (!gc) { - struct stripe *m = genradix_ptr(&c->stripes, idx); - - if (!m) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - bch2_bkey_val_to_text(&buf1, c, old); - bch2_bkey_val_to_text(&buf2, c, new); - bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" - "old %s\n" - "new %s", idx, buf1.buf, buf2.buf); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - bch2_inconsistent_error(c); - return -1; - } - - if (!new_s) { - bch2_stripes_heap_del(c, m, idx); - - memset(m, 0, sizeof(*m)); - } else { - m->sectors = le16_to_cpu(new_s->sectors); - m->algorithm = new_s->algorithm; - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - m->blocks_nonempty = 0; - - for (i = 0; i < new_s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); - - if (!old_s) - bch2_stripes_heap_insert(c, m, idx); - else - bch2_stripes_heap_update(c, m, idx); - } - } else { - struct gc_stripe *m = - genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); - - if (!m) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", - idx); - return -BCH_ERR_ENOMEM_mark_stripe; - } - /* - * This will be wrong when we bring back runtime gc: we should - * be unmarking the old key and then marking the new key - */ - m->alive = true; - m->sectors = le16_to_cpu(new_s->sectors); - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - - for (i = 0; i < new_s->nr_blocks; i++) - m->ptrs[i] = new_s->ptrs[i]; - - bch2_bkey_to_replicas(&m->r.e, new); - - /* - * gc recalculates this field from stripe ptr - * references: - */ - memset(m->block_sectors, 0, sizeof(m->block_sectors)); - - for (i = 0; i < new_s->nr_blocks; i++) { - ret = mark_stripe_bucket(trans, new, i, flags); - if (ret) - return ret; - } - - ret = update_replicas(c, new, &m->r.e, - ((s64) m->sectors * m->nr_redundant), - journal_seq, gc); - if (ret) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, new); - bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); - printbuf_exit(&buf); - return ret; - } - } - - return 0; -} - -static int __mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage; - unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - s64 sectors = (s64) k.k->size; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - if (flags & BTREE_TRIGGER_OVERWRITE) - sectors = -sectors; - sectors *= replicas; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - - fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); - replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(fs_usage->persistent_reserved)); - - fs_usage->reserved += sectors; - fs_usage->persistent_reserved[replicas - 1] += sectors; - - preempt_enable(); - percpu_up_read(&c->mark_lock); - - return 0; -} - -int bch2_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags); -} - -static s64 __bch2_mark_reflink_p(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 start, u64 end, - u64 *idx, unsigned flags, size_t r_idx) -{ - struct bch_fs *c = trans->c; - struct reflink_gc *r; - int add = !(flags & 
BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - u64 next_idx = end; - s64 ret = 0; - struct printbuf buf = PRINTBUF; - - if (r_idx >= c->reflink_gc_nr) - goto not_found; - - r = genradix_ptr(&c->reflink_gc_table, r_idx); - next_idx = min(next_idx, r->offset - r->size); - if (*idx < next_idx) - goto not_found; - - BUG_ON((s64) r->refcount + add < 0); - - r->refcount += add; - *idx = r->offset; - return 0; -not_found: - if (fsck_err(c, reflink_p_to_missing_reflink_v, - "pointer to missing indirect extent\n" - " %s\n" - " missing range %llu-%llu", - (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), - *idx, next_idx)) { - struct bkey_i_error *new; - - new = bch2_trans_kmalloc(trans, sizeof(*new)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - bkey_init(&new->k); - new->k.type = KEY_TYPE_error; - new->k.p = bkey_start_pos(p.k); - new->k.p.offset += *idx - start; - bch2_key_resize(&new->k, next_idx - *idx); - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, - BTREE_TRIGGER_NORUN); - } - - *idx = next_idx; -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int __mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - struct reflink_gc *ref; - size_t l, r, m; - u64 idx = le64_to_cpu(p.v->idx), start = idx; - u64 end = le64_to_cpu(p.v->idx) + p.k->size; - int ret = 0; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) { - idx -= le32_to_cpu(p.v->front_pad); - end += le32_to_cpu(p.v->back_pad); - } - - l = 0; - r = c->reflink_gc_nr; - while (l < r) { - m = l + (r - l) / 2; - - ref = genradix_ptr(&c->reflink_gc_table, m); - if (ref->offset <= idx) - l = m + 1; - else - r = m; - } - - while (idx < end && !ret) - ret = __bch2_mark_reflink_p(trans, p, start, end, - &idx, flags, l++); - - return ret; -} - -int bch2_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags); -} - void bch2_trans_fs_usage_revert(struct btree_trans *trans, struct replicas_delta_list *deltas) { @@ -1398,92 +772,184 @@ need_mark: return -1; } -/* trans_mark: */ +/* KEY_TYPE_extent: */ -static inline int bch2_trans_mark_pointer(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - unsigned flags) +static int __mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 *bucket_data_type, + u32 *dirty_sectors, u32 *cached_sectors) +{ + u32 *dst_sectors = !ptr->cached + ? 
dirty_sectors + : cached_sectors; + int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, + bucket_gen, *bucket_data_type, *dst_sectors); + + if (ret) + return ret; + + *dst_sectors += sectors; + + if (!*dirty_sectors && !*cached_sectors) + *bucket_data_type = 0; + else if (*bucket_data_type != BCH_DATA_stripe) + *bucket_data_type = ptr_data_type; + + return 0; +} + +static int bch2_trigger_pointer(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + s64 *sectors, + unsigned flags) { bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; struct bpos bucket; struct bch_backpointer bp; - s64 sectors; - int ret; bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); - sectors = bp.bucket_len; - if (!insert) - sectors = -sectors; + *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len); - a = bch2_trans_start_alloc_update(trans, &iter, bucket); - if (IS_ERR(a)) - return PTR_ERR(a); - - ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, - a->v.gen, &a->v.data_type, - &a->v.dirty_sectors, &a->v.cached_sectors) ?: - bch2_trans_update(trans, &iter, &a->k_i, 0); - bch2_trans_iter_exit(trans, &iter); + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct btree_iter iter; + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket); + int ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; - if (ret) - return ret; + ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type, + a->v.gen, &a->v.data_type, + &a->v.dirty_sectors, &a->v.cached_sectors) ?: + bch2_trans_update(trans, &iter, &a->k_i, 0); + bch2_trans_iter_exit(trans, &iter); - if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); if (ret) return ret; + + if (!p.ptr.cached) { + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); + if (ret) + return ret; + } + } + + if (flags & BTREE_TRIGGER_GC) { + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + + percpu_down_read(&c->mark_lock); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + bucket_lock(g); + struct bucket old = *g; + + u8 bucket_data_type = g->data_type; + int ret = __mark_pointer(trans, k, &p.ptr, *sectors, + data_type, g->gen, + &bucket_data_type, + &g->dirty_sectors, + &g->cached_sectors); + if (ret) { + bucket_unlock(g); + percpu_up_read(&c->mark_lock); + return ret; + } + + g->data_type = bucket_data_type; + struct bucket new = *g; + bucket_unlock(g); + bch2_dev_usage_update_m(c, ca, &old, &new); + percpu_up_read(&c->mark_lock); } return 0; } -static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, - struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type) +static int bch2_trigger_stripe_ptr(struct btree_trans *trans, + struct bkey_s_c k, + struct extent_ptr_decoded p, + enum bch_data_type data_type, + s64 sectors, unsigned flags) { - struct btree_iter iter; - struct bkey_i_stripe *s; - struct bch_replicas_padded r; - int ret = 0; + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct btree_iter iter; + struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_WITH_UPDATES, stripe); + int ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, + "pointer to nonexistent stripe %llu", + (u64) 
p.ec.idx); + goto err; + } - s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_WITH_UPDATES, stripe); - ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, - "pointer to nonexistent stripe %llu", - (u64) p.ec.idx); - goto err; - } + if (!bch2_ptr_matches_stripe(&s->v, p)) { + bch2_trans_inconsistent(trans, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); + ret = -EIO; + goto err; + } - if (!bch2_ptr_matches_stripe(&s->v, p)) { - bch2_trans_inconsistent(trans, - "stripe pointer doesn't match stripe %llu", - (u64) p.ec.idx); - ret = -EIO; - goto err; + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + + sectors); + + struct bch_replicas_padded r; + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); + r.e.data_type = data_type; + ret = bch2_update_replicas_list(trans, &r.e, sectors); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } - stripe_blockcount_set(&s->v, p.ec.block, - stripe_blockcount_get(&s->v, p.ec.block) + - sectors); + if (flags & BTREE_TRIGGER_GC) { + struct bch_fs *c = trans->c; - bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); - r.e.data_type = data_type; - ret = update_replicas_list(trans, &r.e, sectors); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + (u64) p.ec.idx); + return -BCH_ERR_ENOMEM_mark_stripe_ptr; + } + + mutex_lock(&c->ec_stripes_heap_lock); + + if (!m || !m->alive) { + mutex_unlock(&c->ec_stripes_heap_lock); + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s", + (u64) p.ec.idx, buf.buf); + printbuf_exit(&buf); + bch2_inconsistent_error(c); + return -EIO; + } + + m->block_sectors[p.ec.block] += sectors; + + struct bch_replicas_padded r = m->r; + mutex_unlock(&c->ec_stripes_heap_lock); + + r.e.data_type = data_type; + bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); + } + + return 0; } -static int __trans_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) +static int __trigger_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { + bool gc = flags & BTREE_TRIGGER_GC; struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1492,11 +958,7 @@ static int __trans_mark_extent(struct btree_trans *trans, enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? BCH_DATA_btree : BCH_DATA_user; - s64 sectors = bkey_is_btree_ptr(k.k) - ? 
btree_sectors(c) - : k.k->size; s64 dirty_sectors = 0; - bool stale; int ret = 0; r.e.data_type = data_type; @@ -1504,21 +966,20 @@ static int __trans_mark_extent(struct btree_trans *trans, r.e.nr_required = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = ptr_disk_sectors(sectors, p); - - if (flags & BTREE_TRIGGER_OVERWRITE) - disk_sectors = -disk_sectors; - - ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); + s64 disk_sectors; + ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags); if (ret < 0) return ret; - stale = ret > 0; + bool stale = ret > 0; if (p.ptr.cached) { if (!stale) { - ret = update_cached_sectors_list(trans, p.ptr.dev, - disk_sectors); + ret = !gc + ? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors) + : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true); + bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors", + __func__); if (ret) return ret; } @@ -1526,324 +987,122 @@ static int __trans_mark_extent(struct btree_trans *trans, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - ret = bch2_trans_mark_stripe_ptr(trans, p, - disk_sectors, data_type); + ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); if (ret) return ret; + /* + * There may be other dirty pointers in this extent, but + * if so they're not required for mounting if we have an + * erasure coded pointer in this extent: + */ r.e.nr_required = 0; } } - if (r.e.nr_devs) - ret = update_replicas_list(trans, &r.e, dirty_sectors); - - return ret; -} - -int bch2_trans_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) -{ - struct bch_fs *c = trans->c; - int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) - - (int) bch2_bkey_needs_rebalance(c, old); + if (r.e.nr_devs) { + ret = !gc + ? bch2_update_replicas_list(trans, &r.e, dirty_sectors) + : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true); + if (unlikely(ret && gc)) { + struct printbuf buf = PRINTBUF; - if (mod) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0); + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); + printbuf_exit(&buf); + } if (ret) return ret; } - return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags); -} - -static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned idx, bool deleting) -{ - struct bch_fs *c = trans->c; - const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; - enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity : 0; - s64 sectors = data_type ? 
le16_to_cpu(s.v->sectors) : 0; - int ret = 0; - - if (deleting) - sectors = -sectors; - - a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); - if (IS_ERR(a)) - return PTR_ERR(a); - - ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, - a->v.gen, a->v.data_type, - a->v.dirty_sectors, a->v.cached_sectors); - if (ret) - goto err; - - if (!deleting) { - if (bch2_trans_inconsistent_on(a->v.stripe || - a->v.stripe_redundancy, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], - a->v.dirty_sectors, - a->v.stripe, s.k->p.offset)) { - ret = -EIO; - goto err; - } - - if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], - a->v.dirty_sectors, - s.k->p.offset)) { - ret = -EIO; - goto err; - } - - a->v.stripe = s.k->p.offset; - a->v.stripe_redundancy = s.v->nr_redundant; - a->v.data_type = BCH_DATA_stripe; - } else { - if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || - a->v.stripe_redundancy != s.v->nr_redundant, trans, - "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", - iter.pos.inode, iter.pos.offset, a->v.gen, - s.k->p.offset, a->v.stripe)) { - ret = -EIO; - goto err; - } - - a->v.stripe = 0; - a->v.stripe_redundancy = 0; - a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); - } - - a->v.dirty_sectors += sectors; - if (data_type) - a->v.data_type = !deleting ? data_type : 0; - - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return 0; } -int bch2_trans_mark_stripe(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +int bch2_trigger_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { - const struct bch_stripe *old_s = NULL; - struct bch_stripe *new_s = NULL; - struct bch_replicas_padded r; - unsigned i, nr_blocks; - int ret = 0; - - if (old.k->type == KEY_TYPE_stripe) - old_s = bkey_s_c_to_stripe(old).v; - if (new->k.type == KEY_TYPE_stripe) - new_s = &bkey_i_to_stripe(new)->v; - - /* - * If the pointers aren't changing, we don't need to do anything: - */ - if (new_s && old_s && - new_s->nr_blocks == old_s->nr_blocks && - new_s->nr_redundant == old_s->nr_redundant && - !memcmp(old_s->ptrs, new_s->ptrs, - new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); + struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); + unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; + unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start; + + /* if pointers aren't changing - nothing to do: */ + if (new_ptrs_bytes == old_ptrs_bytes && + !memcmp(new_ptrs.start, + old_ptrs.start, + new_ptrs_bytes)) return 0; - BUG_ON(new_s && old_s && - (new_s->nr_blocks != old_s->nr_blocks || - new_s->nr_redundant != old_s->nr_redundant)); - - nr_blocks = new_s ? 
new_s->nr_blocks : old_s->nr_blocks; - - if (new_s) { - s64 sectors = le16_to_cpu(new_s->sectors); - - bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); - ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); - if (ret) - return ret; - } + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct bch_fs *c = trans->c; + int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) - + (int) bch2_bkey_needs_rebalance(c, old); - if (old_s) { - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); - - bch2_bkey_to_replicas(&r.e, old); - ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); - if (ret) - return ret; - } - - for (i = 0; i < nr_blocks; i++) { - if (new_s && old_s && - !memcmp(&new_s->ptrs[i], - &old_s->ptrs[i], - sizeof(new_s->ptrs[i]))) - continue; - - if (new_s) { - ret = bch2_trans_mark_stripe_bucket(trans, - bkey_i_to_s_c_stripe(new), i, false); - if (ret) - break; - } - - if (old_s) { - ret = bch2_trans_mark_stripe_bucket(trans, - bkey_s_c_to_stripe(old), i, true); + if (mod) { + int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0); if (ret) - break; + return ret; } } - return ret; + if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC)) + return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags); + + return 0; } -static int __trans_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) +/* KEY_TYPE_reservation */ + +static int __trigger_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { + struct bch_fs *c = trans->c; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - s64 sectors = (s64) k.k->size; - struct replicas_delta_list *d; - int ret; + s64 sectors = (s64) k.k->size * replicas; if (flags & BTREE_TRIGGER_OVERWRITE) sectors = -sectors; - sectors *= replicas; - ret = bch2_replicas_deltas_realloc(trans, 0); - if (ret) - return ret; - - d = trans->fs_usage_deltas; - replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(d->persistent_reserved)); - - d->persistent_reserved[replicas - 1] += sectors; - return 0; -} - -int bch2_trans_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) -{ - return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags); -} - -static int trans_mark_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 *idx, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i *k; - __le64 *refcount; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; - struct printbuf buf = PRINTBUF; - int ret; + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + int ret = bch2_replicas_deltas_realloc(trans, 0); + if (ret) + return ret; - k = bch2_bkey_get_mut_noupdate(trans, &iter, - BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_WITH_UPDATES); - ret = PTR_ERR_OR_ZERO(k); - if (ret) - goto err; + struct replicas_delta_list *d = trans->fs_usage_deltas; + replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved)); - refcount = bkey_refcount(k); - if (!refcount) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "nonexistent indirect extent at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; + d->persistent_reserved[replicas - 1] += sectors; } - if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "indirect extent refcount underflow at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; - } + if (flags & BTREE_TRIGGER_GC) { + percpu_down_read(&c->mark_lock); + preempt_disable(); - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; - u64 pad; + struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc); - pad = max_t(s64, le32_to_cpu(v->front_pad), - le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); - BUG_ON(pad > U32_MAX); - v->front_pad = cpu_to_le32(pad); + replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved)); + fs_usage->reserved += sectors; + fs_usage->persistent_reserved[replicas - 1] += sectors; - pad = max_t(s64, le32_to_cpu(v->back_pad), - k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); - BUG_ON(pad > U32_MAX); - v->back_pad = cpu_to_le32(pad); + preempt_enable(); + percpu_up_read(&c->mark_lock); } - le64_add_cpu(refcount, add); - - bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, k, 0); - if (ret) - goto err; - - *idx = k->k.p.offset; -err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; + return 0; } -static int __trans_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) +int bch2_trigger_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx, end_idx; - int ret = 0; - - idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); - end_idx = le64_to_cpu(p.v->idx) + p.k->size + - le32_to_cpu(p.v->back_pad); - - while (idx < end_idx && !ret) - ret = trans_mark_reflink_p_segment(trans, p, &idx, flags); - return ret; + return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags); } -int bch2_trans_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) -{ - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v; - - v->front_pad = v->back_pad = 0; - } - - return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags); -} +/* Mark superblocks: */ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, @@ -1974,17 +1233,13 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, 
ret); return ret; } int bch2_trans_mark_dev_sbs(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { int ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { percpu_ref_put(&ca->ref); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 21f6cb356921..2c95cc5d86be 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -203,6 +203,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) } void bch2_dev_usage_init(struct bch_dev *); +void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *); static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) { @@ -301,6 +302,12 @@ u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); +void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *, + const struct bch_alloc_v4 *, + const struct bch_alloc_v4 *, u64, bool); +void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *, + struct bucket *, struct bucket *); + /* key/bucket marking: */ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, @@ -315,44 +322,40 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, : c->usage[journal_seq & JOURNAL_BUF_MASK]); } +int bch2_update_replicas(struct bch_fs *, struct bkey_s_c, + struct bch_replicas_entry_v1 *, s64, + unsigned, bool); +int bch2_update_replicas_list(struct btree_trans *, + struct bch_replicas_entry_v1 *, s64); +int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64); int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); void bch2_fs_usage_initialize(struct bch_fs *); +int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c, + const struct bch_extent_ptr *, + s64, enum bch_data_type, u8, u8, u32); + int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); - -int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); - -#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ +int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); +int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); + +#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ ({ \ int ret = 
0; \ \ if (_old.k->type) \ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \ if (!ret && _new.k->type) \ - ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \ + ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\ ret; \ }) -#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \ - mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags) - void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 2a9dab9006ef..783f71017204 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -33,8 +33,6 @@ struct bucket_gens { }; struct bch_dev_usage { - u64 buckets_ec; - struct { u64 buckets; u64 sectors; /* _compressed_ sectors: */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 4bb88aefed12..226b39c17667 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -7,22 +7,27 @@ #include "chardev.h" #include "journal.h" #include "move.h" +#include "recovery.h" #include "replicas.h" #include "super.h" #include "super-io.h" +#include "thread_with_file.h" -#include <linux/anon_inodes.h> #include <linux/cdev.h> #include <linux/device.h> -#include <linux/file.h> #include <linux/fs.h> #include <linux/ioctl.h> -#include <linux/kthread.h> #include <linux/major.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/uaccess.h> +__must_check +static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) +{ + return copy_to_user(to, from, n) ? 
-EFAULT : 0; +} + /* returns with ref on ca->ref */ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, unsigned flags) @@ -132,8 +137,106 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg } #endif +struct fsck_thread { + struct thread_with_stdio thr; + struct bch_fs *c; + char **devs; + size_t nr_devs; + struct bch_opts opts; +}; + +static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) +{ + struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); + if (thr->devs) + for (size_t i = 0; i < thr->nr_devs; i++) + kfree(thr->devs[i]); + kfree(thr->devs); + kfree(thr); +} + +static int bch2_fsck_offline_thread_fn(void *arg) +{ + struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); + + thr->thr.thr.ret = PTR_ERR_OR_ZERO(c); + if (!thr->thr.thr.ret) + bch2_fs_stop(c); + + thread_with_stdio_done(&thr->thr); + return 0; +} + +static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) +{ + struct bch_ioctl_fsck_offline arg; + struct fsck_thread *thr = NULL; + u64 *devs = NULL; + long ret = 0; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) || + !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) || + !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) { + ret = -ENOMEM; + goto err; + } + + thr->opts = bch2_opts_empty(); + thr->nr_devs = arg.nr_devs; + + if (copy_from_user(devs, &user_arg->devs[0], + array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) { + ret = -EINVAL; + goto err; + } + + for (size_t i = 0; i < arg.nr_devs; i++) { + thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX); + ret = PTR_ERR_OR_ZERO(thr->devs[i]); + if (ret) + goto err; + } + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(NULL, &thr->opts, optstr); + kfree(optstr); + + if (ret) + goto err; + } + + opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + + ret = bch2_run_thread_with_stdio(&thr->thr, + bch2_fsck_thread_exit, + bch2_fsck_offline_thread_fn); +err: + if (ret < 0) { + if (thr) + bch2_fsck_thread_exit(&thr->thr); + pr_err("ret %s", bch2_err_str(ret)); + } + kfree(devs); + return ret; +} + static long bch2_global_ioctl(unsigned cmd, void __user *arg) { + long ret; + switch (cmd) { #if 0 case BCH_IOCTL_ASSEMBLE: @@ -141,18 +244,25 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg) case BCH_IOCTL_INCREMENTAL: return bch2_ioctl_incremental(arg); #endif + case BCH_IOCTL_FSCK_OFFLINE: { + ret = bch2_ioctl_fsck_offline(arg); + break; + } default: - return -ENOTTY; + ret = -ENOTTY; + break; } + + if (ret < 0) + ret = bch2_err_class(ret); + return ret; } static long bch2_ioctl_query_uuid(struct bch_fs *c, struct bch_ioctl_query_uuid __user *user_arg) { - if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid, - sizeof(c->sb.user_uuid))) - return -EFAULT; - return 0; + return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid, + sizeof(c->sb.user_uuid)); } #if 0 @@ -295,31 +405,27 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, } struct bch_data_ctx { + struct thread_with_file thr; + struct bch_fs *c; struct bch_ioctl_data arg; struct bch_move_stats stats; - - int ret; - - struct 
task_struct *thread; }; static int bch2_data_thread(void *arg) { - struct bch_data_ctx *ctx = arg; - - ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); + struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); + ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); ctx->stats.data_type = U8_MAX; return 0; } static int bch2_data_job_release(struct inode *inode, struct file *file) { - struct bch_data_ctx *ctx = file->private_data; + struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - kthread_stop(ctx->thread); - put_task_struct(ctx->thread); + bch2_thread_with_file_exit(&ctx->thr); kfree(ctx); return 0; } @@ -327,7 +433,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file) static ssize_t bch2_data_job_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { - struct bch_data_ctx *ctx = file->private_data; + struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); struct bch_fs *c = ctx->c; struct bch_ioctl_data_event e = { .type = BCH_DATA_EVENT_PROGRESS, @@ -341,10 +447,7 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, if (len < sizeof(e)) return -EINVAL; - if (copy_to_user(buf, &e, sizeof(e))) - return -EFAULT; - - return sizeof(e); + return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e); } static const struct file_operations bcachefs_data_ops = { @@ -356,10 +459,8 @@ static const struct file_operations bcachefs_data_ops = { static long bch2_ioctl_data(struct bch_fs *c, struct bch_ioctl_data arg) { - struct bch_data_ctx *ctx = NULL; - struct file *file = NULL; - unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; - int ret, fd = -1; + struct bch_data_ctx *ctx; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -374,36 +475,11 @@ static long bch2_ioctl_data(struct bch_fs *c, ctx->c = c; ctx->arg = arg; - ctx->thread = kthread_create(bch2_data_thread, ctx, - "bch-data/%s", c->name); - if (IS_ERR(ctx->thread)) { - ret = PTR_ERR(ctx->thread); - goto err; - } - - ret = get_unused_fd_flags(flags); + ret = bch2_run_thread_with_file(&ctx->thr, + &bcachefs_data_ops, + bch2_data_thread); if (ret < 0) - goto err; - fd = ret; - - file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto err; - } - - fd_install(fd, file); - - get_task_struct(ctx->thread); - wake_up_process(ctx->thread); - - return fd; -err: - if (fd >= 0) - put_unused_fd(fd); - if (!IS_ERR_OR_NULL(ctx->thread)) - kthread_stop(ctx->thread); - kfree(ctx); + kfree(ctx); return ret; } @@ -417,7 +493,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, unsigned i; int ret = 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) @@ -444,7 +520,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, dst_end = (void *) arg->replicas + replica_entries_bytes; for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *src_e = + struct bch_replicas_entry_v1 *src_e = cpu_replicas_entry(&c->replicas, i); /* check that we have enough space for one replicas entry */ @@ -474,14 +550,15 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, if (ret) goto err; - if (copy_to_user(user_arg, arg, - sizeof(*arg) + arg->replica_entries_bytes)) - ret = -EFAULT; + + ret = copy_to_user_errcode(user_arg, arg, + sizeof(*arg) + arg->replica_entries_bytes); err: kfree(arg); return ret; } +/* obsolete, 
didn't allow for new data types: */ static long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_ioctl_dev_usage __user *user_arg) { @@ -490,7 +567,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_dev *ca; unsigned i; - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -511,7 +588,6 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - arg.buckets_ec = src.buckets_ec; for (i = 0; i < BCH_DATA_NR; i++) { arg.d[i].buckets = src.d[i].buckets; @@ -521,10 +597,58 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, percpu_ref_put(&ca->ref); - if (copy_to_user(user_arg, &arg, sizeof(arg))) + return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); +} + +static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, + struct bch_ioctl_dev_usage_v2 __user *user_arg) +{ + struct bch_ioctl_dev_usage_v2 arg; + struct bch_dev_usage src; + struct bch_dev *ca; + int ret = 0; + + if (!test_bit(BCH_FS_started, &c->flags)) + return -EINVAL; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) return -EFAULT; - return 0; + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad[0] || + arg.pad[1] || + arg.pad[2]) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + src = bch2_dev_usage_read(ca); + + arg.state = ca->mi.state; + arg.bucket_size = ca->mi.bucket_size; + arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR); + arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; + + ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); + if (ret) + goto err; + + for (unsigned i = 0; i < arg.nr_data_types; i++) { + struct bch_ioctl_dev_usage_type t = { + .buckets = src.d[i].buckets, + .sectors = src.d[i].sectors, + .fragmented = src.d[i].fragmented, + }; + + ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t)); + if (ret) + goto err; + } +err: + percpu_ref_put(&ca->ref); + return ret; } static long bch2_ioctl_read_super(struct bch_fs *c, @@ -561,9 +685,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c, goto err; } - if (copy_to_user((void __user *)(unsigned long)arg.sb, sb, - vstruct_bytes(sb))) - ret = -EFAULT; + ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb, + vstruct_bytes(sb)); err: if (!IS_ERR_OR_NULL(ca)) percpu_ref_put(&ca->ref); @@ -575,8 +698,6 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, struct bch_ioctl_disk_get_idx arg) { dev_t dev = huge_decode_dev(arg.dev); - struct bch_dev *ca; - unsigned i; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -584,10 +705,10 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!dev) return -EINVAL; - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) if (ca->dev == dev) { percpu_ref_put(&ca->io_ref); - return i; + return ca->dev_idx; } return -BCH_ERR_ENOENT_dev_idx_not_found; @@ -642,6 +763,97 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } +static int bch2_fsck_online_thread_fn(void *arg) +{ + struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct bch_fs *c = thr->c; + + c->stdio_filter = current; + c->stdio = &thr->thr.stdio; + + /* + * XXX: can we figure out a way to do this without mucking with c->opts? 
+ */ + unsigned old_fix_errors = c->opts.fix_errors; + if (opt_defined(thr->opts, fix_errors)) + c->opts.fix_errors = thr->opts.fix_errors; + else + c->opts.fix_errors = FSCK_FIX_ask; + + c->opts.fsck = true; + set_bit(BCH_FS_fsck_running, &c->flags); + + c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + int ret = bch2_run_online_recovery_passes(c); + + clear_bit(BCH_FS_fsck_running, &c->flags); + bch_err_fn(c, ret); + + c->stdio = NULL; + c->stdio_filter = NULL; + c->opts.fix_errors = old_fix_errors; + + thread_with_stdio_done(&thr->thr); + + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + return 0; +} + +static long bch2_ioctl_fsck_online(struct bch_fs *c, + struct bch_ioctl_fsck_online arg) +{ + struct fsck_thread *thr = NULL; + long ret = 0; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!bch2_ro_ref_tryget(c)) + return -EROFS; + + if (down_trylock(&c->online_fsck_mutex)) { + bch2_ro_ref_put(c); + return -EAGAIN; + } + + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; + } + + thr->c = c; + thr->opts = bch2_opts_empty(); + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(c, &thr->opts, optstr); + kfree(optstr); + + if (ret) + goto err; + } + + ret = bch2_run_thread_with_stdio(&thr->thr, + bch2_fsck_thread_exit, + bch2_fsck_online_thread_fn); +err: + if (ret < 0) { + bch_err_fn(c, ret); + if (thr) + bch2_fsck_thread_exit(&thr->thr); + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + } + return ret; +} + #define BCH_IOCTL(_name, _argtype) \ do { \ _argtype i; \ @@ -663,6 +875,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) return bch2_ioctl_fs_usage(c, arg); case BCH_IOCTL_DEV_USAGE: return bch2_ioctl_dev_usage(c, arg); + case BCH_IOCTL_DEV_USAGE_V2: + return bch2_ioctl_dev_usage_v2(c, arg); #if 0 case BCH_IOCTL_START: BCH_IOCTL(start, struct bch_ioctl_start); @@ -675,7 +889,7 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); } - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; switch (cmd) { @@ -695,7 +909,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); case BCH_IOCTL_DISK_RESIZE_JOURNAL: BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); - + case BCH_IOCTL_FSCK_ONLINE: + BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); default: return -ENOTTY; } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 13998388c545..1b8c2c1016dc 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -45,6 +45,29 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\ }) +static inline void bch2_csum_to_text(struct printbuf *out, + enum bch_csum_type type, + struct bch_csum csum) +{ + const u8 *p = (u8 *) &csum; + unsigned bytes = type < BCH_CSUM_NR ? 
bch_crc_bytes[type] : 16; + + for (unsigned i = 0; i < bytes; i++) + prt_hex_byte(out, p[i]); +} + +static inline void bch2_csum_err_msg(struct printbuf *out, + enum bch_csum_type type, + struct bch_csum expected, + struct bch_csum got) +{ + prt_printf(out, "checksum error: got "); + bch2_csum_to_text(out, type, got); + prt_str(out, " should be "); + bch2_csum_to_text(out, type, expected); + prt_printf(out, " type %s", bch2_csum_types[type]); +} + int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); int bch2_request_key(struct bch_sb *, struct bch_key *); #ifndef __KERNEL__ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 51af8ea230ed..33df8cf86bd8 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -572,10 +572,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); - /* - * ZSTD is lying: if we allocate the size of the workspace it says it - * requires, it returns memory allocation errors - */ c->zstd_workspace_size = zstd_cctx_workspace_bound(¶ms.cParams); struct { diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index e367c625f057..4b340d13caac 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -20,6 +20,7 @@ struct { \ #define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) typedef DARRAY(char) darray_char; +typedef DARRAY(char *) darray_str; int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t); @@ -81,11 +82,14 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, #define darray_remove_item(_d, _pos) \ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) +#define __darray_for_each(_d, _i) \ + for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) + #define darray_for_each(_d, _i) \ - for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) + for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each_reverse(_d, _i) \ - for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) #define darray_init(_d) \ do { \ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 37d6ecae8c30..6f13477ff652 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -267,6 +267,20 @@ restart_drop_extra_replicas: goto out; } + if (trace_data_update_enabled()) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + trace_data_update(c, buf.buf); + printbuf_exit(&buf); + } + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, @@ -278,8 +292,8 @@ restart_drop_extra_replicas: BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, &op->res, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| m->data_opts.btree_insert_flags); if (!ret) { bch2_btree_iter_set_pos(&iter, next_pos); @@ -300,14 +314,14 @@ next: } continue; nowork: - if (m->stats && m->stats) { + if (m->stats) { BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->stats->keys_raced); atomic64_add(k.k->p.offset - iter.pos.offset, &m->stats->sectors_raced); } - 
this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]); + count_event(c, move_extent_fail); bch2_btree_iter_advance(&iter); goto next; @@ -342,7 +356,6 @@ void bch2_data_update_exit(struct data_update *update) struct bch_fs *c = update->op.c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) { if (c->opts.nocow_enabled) @@ -363,7 +376,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, struct bio *bio = &update->op.wbio.bio; struct bkey_i_extent *e; struct write_point *wp; - struct bch_extent_ptr *ptr; struct closure cl; struct btree_iter iter; struct bkey_s_c k; @@ -404,6 +416,8 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, continue; } + bch_err_fn_ratelimited(c, ret); + if (ret) return; @@ -476,7 +490,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, return bch2_trans_relock(trans) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } int bch2_data_update_init(struct btree_trans *trans, @@ -493,7 +507,6 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - const struct bch_extent_ptr *ptr; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; unsigned ptrs_locked = 0; int ret = 0; @@ -639,7 +652,6 @@ done: void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 57c5128db173..d6418948495f 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -366,35 +366,23 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - ret = flush_buf(i); - if (ret) - return ret; - - trans = bch2_trans_get(i->c); - ret = for_each_btree_key2(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ - bch2_bkey_val_to_text(&i->buf, i->c, k); - prt_newline(&i->buf); - drop_locks_do(trans, flush_buf(i)); - })); - i->from = iter.pos; - - bch2_trans_put(trans); - - if (!ret) - ret = flush_buf(i); - - return ret ?: i->ret; + return flush_buf(i) ?: + bch2_trans_run(i->c, + for_each_btree_key(trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + bch2_bkey_val_to_text(&i->buf, i->c, k); + prt_newline(&i->buf); + bch2_trans_unlock(trans); + i->from = bpos_successor(iter.pos); + flush_buf(i); + }))) ?: + i->ret; } static const struct file_operations btree_debug_ops = { @@ -462,44 +450,32 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - ret = flush_buf(i); - if (ret) - return ret; - - trans = bch2_trans_get(i->c); - - ret = for_each_btree_key2(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ - struct btree_path_level *l = 
&iter.path->l[0]; - struct bkey_packed *_k = - bch2_btree_node_iter_peek(&l->iter, l->b); - - if (bpos_gt(l->b->key.k.p, i->prev_node)) { - bch2_btree_node_to_text(&i->buf, i->c, l->b); - i->prev_node = l->b->key.k.p; - } - - bch2_bfloat_to_text(&i->buf, l->b, _k); - drop_locks_do(trans, flush_buf(i)); - })); - i->from = iter.pos; - - bch2_trans_put(trans); - - if (!ret) - ret = flush_buf(i); - - return ret ?: i->ret; + return flush_buf(i) ?: + bch2_trans_run(i->c, + for_each_btree_key(trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + struct btree_path_level *l = + &btree_iter_path(trans, &iter)->l[0]; + struct bkey_packed *_k = + bch2_btree_node_iter_peek(&l->iter, l->b); + + if (bpos_gt(l->b->key.k.p, i->prev_node)) { + bch2_btree_node_to_text(&i->buf, i->c, l->b); + i->prev_node = l->b->key.k.p; + } + + bch2_bfloat_to_text(&i->buf, l->b, _k); + bch2_trans_unlock(trans); + i->from = bpos_successor(iter.pos); + flush_buf(i); + }))) ?: + i->ret; } static const struct file_operations bfloat_failed_debug_ops = { @@ -616,7 +592,6 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; -#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -632,7 +607,9 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (trans->locking_wait.task->pid <= i->iter) + struct task_struct *task = READ_ONCE(trans->locking_wait.task); + + if (!task || task->pid <= i->iter) continue; closure_get(&trans->ref); @@ -650,11 +627,11 @@ restart: prt_printf(&i->buf, "backtrace:"); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task); + bch2_prt_task_backtrace(&i->buf, task, 0); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = trans->locking_wait.task->pid; + i->iter = task->pid; closure_put(&trans->ref); @@ -678,7 +655,6 @@ static const struct file_operations btree_transactions_ops = { .release = bch2_dump_release, .read = bch2_btree_transactions_read, }; -#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) @@ -717,7 +693,7 @@ static const struct file_operations journal_pins_ops = { .read = bch2_journal_pins_read, }; -static int lock_held_stats_open(struct inode *inode, struct file *file) +static int btree_transaction_stats_open(struct inode *inode, struct file *file) { struct bch_fs *c = inode->i_private; struct dump_iter *i; @@ -727,7 +703,7 @@ static int lock_held_stats_open(struct inode *inode, struct file *file) if (!i) return -ENOMEM; - i->iter = 0; + i->iter = 1; i->c = c; i->buf = PRINTBUF; file->private_data = i; @@ -735,7 +711,7 @@ static int lock_held_stats_open(struct inode *inode, struct file *file) return 0; } -static int lock_held_stats_release(struct inode *inode, struct file *file) +static int btree_transaction_stats_release(struct inode *inode, struct file *file) { struct dump_iter *i = file->private_data; @@ -745,8 +721,8 @@ static int lock_held_stats_release(struct inode *inode, struct file *file) return 0; } -static ssize_t lock_held_stats_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, + size_t size, 
loff_t *ppos) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; @@ -779,6 +755,13 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, prt_printf(&i->buf, "Max mem used: %u", s->max_mem); prt_newline(&i->buf); + prt_printf(&i->buf, "Transaction duration:"); + prt_newline(&i->buf); + + printbuf_indent_add(&i->buf, 2); + bch2_time_stats_to_text(&i->buf, &s->duration); + printbuf_indent_sub(&i->buf, 2); + if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { prt_printf(&i->buf, "Lock hold times:"); prt_newline(&i->buf); @@ -810,11 +793,11 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, return i->ret; } -static const struct file_operations lock_held_stats_op = { - .owner = THIS_MODULE, - .open = lock_held_stats_open, - .release = lock_held_stats_release, - .read = lock_held_stats_read, +static const struct file_operations btree_transaction_stats_op = { + .owner = THIS_MODULE, + .open = btree_transaction_stats_open, + .release = btree_transaction_stats_release, + .read = btree_transaction_stats_read, }; static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, @@ -835,7 +818,9 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (trans->locking_wait.task->pid <= i->iter) + struct task_struct *task = READ_ONCE(trans->locking_wait.task); + + if (!task || task->pid <= i->iter) continue; closure_get(&trans->ref); @@ -850,7 +835,7 @@ restart: bch2_check_for_deadlock(trans, &i->buf); - i->iter = trans->locking_wait.task->pid; + i->iter = task->pid; closure_put(&trans->ref); @@ -897,16 +882,14 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, c->btree_debug, &cached_btree_nodes_ops); -#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, c->btree_debug, &btree_transactions_ops); -#endif debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, - c, &lock_held_stats_op); + c, &btree_transaction_stats_op); debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, c->btree_debug, &btree_deadlock_ops); @@ -947,8 +930,6 @@ void bch2_debug_exit(void) int __init bch2_debug_init(void) { - int ret = 0; - bch_debug = debugfs_create_dir("bcachefs", NULL); - return ret; + return 0; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 2bfff0da7000..4ae1e9f002a0 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -65,7 +65,7 @@ static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) const struct qstr l_name = bch2_dirent_get_name(l); const struct qstr *r_name = _r; - return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len); + return !qstr_eq(l_name, *r_name); } static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) @@ -75,7 +75,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) const struct qstr l_name = bch2_dirent_get_name(l); const struct qstr r_name = bch2_dirent_get_name(r); - return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len); + return !qstr_eq(l_name, r_name); } static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) @@ -198,10 +198,39 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, return dirent; } +int 
bch2_dirent_create_snapshot(struct btree_trans *trans, + u64 dir, u32 snapshot, + const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + u64 *dir_offset, + bch_str_hash_flags_t str_hash_flags) +{ + subvol_inum zero_inum = { 0 }; + struct bkey_i_dirent *dirent; + int ret; + + dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum); + ret = PTR_ERR_OR_ZERO(dirent); + if (ret) + return ret; + + dirent->k.p.inode = dir; + dirent->k.p.snapshot = snapshot; + + ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info, + zero_inum, snapshot, + &dirent->k_i, str_hash_flags, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + *dir_offset = dirent->k.p.offset; + + return ret; +} + int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, - u64 *dir_offset, int flags) + u64 *dir_offset, + bch_str_hash_flags_t str_hash_flags) { struct bkey_i_dirent *dirent; int ret; @@ -212,7 +241,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir, &dirent->k_i, flags); + dir, &dirent->k_i, str_hash_flags); *dir_offset = dirent->k.p.offset; return ret; @@ -470,17 +499,11 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct qstr *name, subvol_inum *inum) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - int ret; -retry: - bch2_trans_begin(trans); + struct btree_iter iter = { NULL }; - ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, - name, inum, 0); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (!ret) - bch2_trans_iter_exit(trans, &iter); + int ret = lockrestart_do(trans, + __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); + bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); return ret; } diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 1e3431990abd..21ffeb78f02e 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -35,9 +35,14 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); +int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32, + const struct bch_hash_info *, u8, + const struct qstr *, u64, u64 *, + bch_str_hash_flags_t); int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, int); + const struct qstr *, u64, u64 *, + bch_str_hash_flags_t); static inline unsigned vfs_d_type(unsigned type) { diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 4d0cb0ccff32..06a7df529b40 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -89,19 +89,14 @@ err: void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) { - struct bch_disk_groups_cpu *g; - struct bch_dev *ca; - int i; - unsigned iter; - out->atomic++; rcu_read_lock(); - g = rcu_dereference(c->disk_groups); + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); if (!g) goto out; - for (i = 0; i < g->nr; i++) { + for (unsigned i = 0; i < g->nr; i++) { if (i) prt_printf(out, " "); @@ -111,7 +106,7 @@ void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) } prt_printf(out, "[parent %d devs", g->entries[i].parent); - for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs) + for_each_member_device_rcu(c, ca, 
&g->entries[i].devs) prt_printf(out, " %s", ca->name); prt_printf(out, "]"); } @@ -562,7 +557,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) : NULL; if (ca && percpu_ref_tryget(&ca->io_ref)) { - prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); + prt_printf(out, "/dev/%s", ca->name); percpu_ref_put(&ca->io_ref); } else if (ca) { prt_printf(out, "offline device %u", t.dev); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 2a77de18c004..d802bc63c8d0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -3,6 +3,7 @@ /* erasure coding */ #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "backpointers.h" #include "bkey_buf.h" @@ -156,12 +157,311 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, } } +/* Triggers: */ + +static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) +{ + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity : 0; + s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; + int ret = 0; + + if (deleting) + sectors = -sectors; + + a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); + + ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, + a->v.gen, a->v.data_type, + a->v.dirty_sectors); + if (ret) + goto err; + + if (!deleting) { + if (bch2_trans_inconsistent_on(a->v.stripe || + a->v.stripe_redundancy, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + a->v.dirty_sectors, + a->v.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } + + if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + a->v.dirty_sectors, + s.k->p.offset)) { + ret = -EIO; + goto err; + } + + a->v.stripe = s.k->p.offset; + a->v.stripe_redundancy = s.v->nr_redundant; + a->v.data_type = BCH_DATA_stripe; + } else { + if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || + a->v.stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", + iter.pos.inode, iter.pos.offset, a->v.gen, + s.k->p.offset, a->v.stripe)) { + ret = -EIO; + goto err; + } + + a->v.stripe = 0; + a->v.stripe_redundancy = 0; + a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); + } + + a->v.dirty_sectors += sectors; + if (data_type) + a->v.data_type = !deleting ? data_type : 0; + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c k, + unsigned ptr_idx, + unsigned flags) +{ + struct bch_fs *c = trans->c; + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; + s64 sectors = parity ? 
le16_to_cpu(s->sectors) : 0; + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket old, new, *g; + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + /* * XXX doesn't handle deletion */ + + percpu_down_read(&c->mark_lock); + g = PTR_GC_BUCKET(ca, ptr); + + if (g->dirty_sectors || + (g->stripe && g->stripe != k.k->p.offset)) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EINVAL; + goto err; + } + + bucket_lock(g); + old = *g; + + ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type, + g->gen, g->data_type, + g->dirty_sectors); + if (ret) + goto err; + + g->data_type = data_type; + g->dirty_sectors += sectors; + + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; + new = *g; +err: + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, &old, &new); + percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); + return ret; +} + +int bch2_trigger_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s _new, + unsigned flags) +{ + struct bkey_s_c new = _new.s_c; + struct bch_fs *c = trans->c; + u64 idx = new.k->p.offset; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(new).v : NULL; + + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + /* + * If the pointers aren't changing, we don't need to do anything: + */ + if (new_s && old_s && + new_s->nr_blocks == old_s->nr_blocks && + new_s->nr_redundant == old_s->nr_redundant && + !memcmp(old_s->ptrs, new_s->ptrs, + new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; + + BUG_ON(new_s && old_s && + (new_s->nr_blocks != old_s->nr_blocks || + new_s->nr_redundant != old_s->nr_redundant)); + + if (new_s) { + s64 sectors = le16_to_cpu(new_s->sectors); + + struct bch_replicas_padded r; + bch2_bkey_to_replicas(&r.e, new); + int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + if (ret) + return ret; + } + + if (old_s) { + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + + struct bch_replicas_padded r; + bch2_bkey_to_replicas(&r.e, old); + int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + if (ret) + return ret; + } + + unsigned nr_blocks = new_s ? 
new_s->nr_blocks : old_s->nr_blocks; + for (unsigned i = 0; i < nr_blocks; i++) { + if (new_s && old_s && + !memcmp(&new_s->ptrs[i], + &old_s->ptrs[i], + sizeof(new_s->ptrs[i]))) + continue; + + if (new_s) { + int ret = bch2_trans_mark_stripe_bucket(trans, + bkey_s_c_to_stripe(new), i, false); + if (ret) + return ret; + } + + if (old_s) { + int ret = bch2_trans_mark_stripe_bucket(trans, + bkey_s_c_to_stripe(old), i, true); + if (ret) + return ret; + } + } + } + + if (!(flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))) { + struct stripe *m = genradix_ptr(&c->stripes, idx); + + if (!m) { + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + + bch2_bkey_val_to_text(&buf1, c, old); + bch2_bkey_val_to_text(&buf2, c, new); + bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" + "old %s\n" + "new %s", idx, buf1.buf, buf2.buf); + printbuf_exit(&buf2); + printbuf_exit(&buf1); + bch2_inconsistent_error(c); + return -1; + } + + if (!new_s) { + bch2_stripes_heap_del(c, m, idx); + + memset(m, 0, sizeof(*m)); + } else { + m->sectors = le16_to_cpu(new_s->sectors); + m->algorithm = new_s->algorithm; + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + m->blocks_nonempty = 0; + + for (unsigned i = 0; i < new_s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); + + if (!old_s) + bch2_stripes_heap_insert(c, m, idx); + else + bch2_stripes_heap_update(c, m, idx); + } + } + + if (flags & BTREE_TRIGGER_GC) { + struct gc_stripe *m = + genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + idx); + return -BCH_ERR_ENOMEM_mark_stripe; + } + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key + */ + m->alive = true; + m->sectors = le16_to_cpu(new_s->sectors); + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + + for (unsigned i = 0; i < new_s->nr_blocks; i++) + m->ptrs[i] = new_s->ptrs[i]; + + bch2_bkey_to_replicas(&m->r.e, new); + + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(m->block_sectors, 0, sizeof(m->block_sectors)); + + for (unsigned i = 0; i < new_s->nr_blocks; i++) { + int ret = mark_stripe_bucket(trans, new, i, flags); + if (ret) + return ret; + } + + int ret = bch2_update_replicas(c, new, &m->r.e, + ((s64) m->sectors * m->nr_redundant), + 0, true); + if (ret) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, new); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + printbuf_exit(&buf); + return ret; + } + } + + return 0; +} + /* returns blocknr in stripe that we matched: */ static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, struct bkey_s_c k, unsigned *block) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; unsigned i, nr_data = s->nr_blocks - s->nr_redundant; bkey_for_each_ptr(ptrs, ptr) @@ -791,28 +1091,22 @@ static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - struct btree_trans *trans = bch2_trans_get(c); - int ret; - u64 idx; while (1) { mutex_lock(&c->ec_stripes_heap_lock); - idx = stripe_idx_to_delete(c); + u64 idx = stripe_idx_to_delete(c); mutex_unlock(&c->ec_stripes_heap_lock); if (!idx) break; - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, - ec_stripe_delete(trans, idx)); - if (ret) { - 
bch_err_fn(c, ret); + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + ec_stripe_delete(trans, idx)); + bch_err_fn(c, ret); + if (ret) break; - } } - bch2_trans_put(trans); - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } @@ -983,8 +1277,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b while (1) { ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, ec_stripe_update_extent(trans, bucket_pos, bucket.gen, s, &bp_pos)); if (ret) @@ -1005,7 +1299,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret = 0; - ret = bch2_btree_write_buffer_flush(trans); + ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; @@ -1121,21 +1415,20 @@ static void ec_stripe_create(struct ec_stripe_new *s) } ret = bch2_trans_do(c, &s->res, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, ec_stripe_key_update(trans, bkey_i_to_stripe(&s->new_stripe.key), !s->have_existing_stripe)); + bch_err_msg(c, ret, "creating stripe key"); if (ret) { - bch_err(c, "error creating stripe: error creating stripe key"); goto err; } ret = ec_stripe_update_extents(c, &s->new_stripe); - if (ret) { - bch_err_msg(c, ret, "creating stripe: error updating pointers"); + bch_err_msg(c, ret, "error updating extents"); + if (ret) goto err; - } err: bch2_disk_reservation_put(c, &s->res); @@ -1250,18 +1543,17 @@ static int unsigned_cmp(const void *_l, const void *_r) static unsigned pick_blocksize(struct bch_fs *c, struct bch_devs_mask *devs) { - struct bch_dev *ca; - unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; + unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX]; struct { unsigned nr, size; } cur = { 0, 0 }, best = { 0, 0 }; - for_each_member_device_rcu(ca, c, i, devs) + for_each_member_device_rcu(c, ca, devs) sizes[nr++] = ca->mi.bucket_size; sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); - for (i = 0; i < nr; i++) { + for (unsigned i = 0; i < nr; i++) { if (sizes[i] != cur.size) { if (cur.nr > best.nr) best = cur; @@ -1344,8 +1636,6 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, enum bch_watermark watermark) { struct ec_stripe_head *h; - struct bch_dev *ca; - unsigned i; h = kzalloc(sizeof(*h), GFP_KERNEL); if (!h) @@ -1362,13 +1652,13 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, rcu_read_lock(); h->devs = target_rw_devs(c, BCH_DATA_user, target); - for_each_member_device_rcu(ca, c, i, &h->devs) + for_each_member_device_rcu(c, ca, &h->devs) if (!ca->mi.durability) - __clear_bit(i, h->devs.d); + __clear_bit(ca->dev_idx, h->devs.d); h->blocksize = pick_blocksize(c, &h->devs); - for_each_member_device_rcu(ca, c, i, &h->devs) + for_each_member_device_rcu(c, ca, &h->devs) if (ca->mi.bucket_size == h->blocksize) h->nr_active_devs++; @@ -1415,7 +1705,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - if (test_bit(BCH_FS_GOING_RO, &c->flags)) { + if (test_bit(BCH_FS_going_ro, &c->flags)) { h = ERR_PTR(-BCH_ERR_erofs_no_writes); goto found; } @@ -1833,44 +2123,32 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - const struct bch_stripe *s; - struct stripe *m; - unsigned i; - int ret; - - for_each_btree_key(trans, iter, 
BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->type != KEY_TYPE_stripe) - continue; - - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - break; - - s = bkey_s_c_to_stripe(k).v; - - m = genradix_ptr(&c->stripes, k.k->p.offset); - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->blocks_nonempty = 0; + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ + if (k.k->type != KEY_TYPE_stripe) + continue; - for (i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + if (ret) + break; - bch2_stripes_heap_insert(c, m, k.k->p.offset); - } - bch2_trans_iter_exit(trans, &iter); + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - bch2_trans_put(trans); + struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->blocks_nonempty = 0; - if (ret) - bch_err_fn(c, ret); + for (unsigned i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); + bch2_stripes_heap_insert(c, m, k.k->p.offset); + 0; + }))); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 7d0237c9819f..f4369b02e805 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -12,13 +12,14 @@ int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_stripe ((struct bkey_ops) { \ .key_invalid = bch2_stripe_invalid, \ .val_to_text = bch2_stripe_to_text, \ .swab = bch2_ptr_swab, \ - .trans_trigger = bch2_trans_mark_stripe, \ - .atomic_trigger = bch2_mark_stripe, \ + .trigger = bch2_trigger_stripe, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index e2b02a82de32..976426da3a12 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -5,7 +5,7 @@ #include "bcachefs_format.h" struct bch_replicas_padded { - struct bch_replicas_entry e; + struct bch_replicas_entry_v1 e; u8 pad[BCH_BKEY_PTRS_MAX]; }; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 9ce29681eec9..8c40c2067a04 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -73,7 +73,6 @@ x(ENOMEM, ENOMEM_fsck_add_nlink) \ x(ENOMEM, ENOMEM_journal_key_insert) \ x(ENOMEM, ENOMEM_journal_keys_sort) \ - x(ENOMEM, ENOMEM_journal_replay) \ x(ENOMEM, ENOMEM_read_superblock_clean) \ x(ENOMEM, ENOMEM_fs_alloc) \ x(ENOMEM, ENOMEM_fs_name_alloc) \ @@ -152,7 +151,6 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ x(0, backpointer_to_overwritten_btree_node) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ @@ -172,10 +170,12 @@ x(EINVAL, device_size_too_small) \ x(EINVAL, device_not_a_member_of_filesystem) \ x(EINVAL, device_has_been_removed) \ + x(EINVAL, device_splitbrain) \ x(EINVAL, device_already_online) \ x(EINVAL, 
insufficient_devices_to_start) \ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ + x(EINVAL, opt_parse_error) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -224,6 +224,8 @@ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ + x(EIO, sb_not_downgraded) \ + x(EIO, btree_write_all_failed) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ @@ -235,6 +237,7 @@ x(BCH_ERR_nopromote, nopromote_unwritten) \ x(BCH_ERR_nopromote, nopromote_congested) \ x(BCH_ERR_nopromote, nopromote_in_flight) \ + x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) enum bch_errcode { diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 25cf78a7b946..d32c8bebe46c 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -2,12 +2,13 @@ #include "bcachefs.h" #include "error.h" #include "super.h" +#include "thread_with_file.h" #define FSCK_ERR_RATELIMIT_NR 10 bool bch2_inconsistent_error(struct bch_fs *c) { - set_bit(BCH_FS_ERROR, &c->flags); + set_bit(BCH_FS_error, &c->flags); switch (c->opts.errors) { case BCH_ON_ERROR_continue: @@ -26,8 +27,8 @@ bool bch2_inconsistent_error(struct bch_fs *c) void bch2_topology_error(struct bch_fs *c) { - set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + set_bit(BCH_FS_topology_error, &c->flags); + if (!test_bit(BCH_FS_fsck_running, &c->flags)) bch2_inconsistent_error(c); } @@ -69,40 +70,66 @@ enum ask_yn { YN_ALLYES, }; +static enum ask_yn parse_yn_response(char *buf) +{ + buf = strim(buf); + + if (strlen(buf) == 1) + switch (buf[0]) { + case 'n': + return YN_NO; + case 'y': + return YN_YES; + case 'N': + return YN_ALLNO; + case 'Y': + return YN_ALLYES; + } + return -1; +} + #ifdef __KERNEL__ -#define bch2_fsck_ask_yn() YN_NO +static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) +{ + struct stdio_redirect *stdio = c->stdio; + + if (c->stdio_filter && c->stdio_filter != current) + stdio = NULL; + + if (!stdio) + return YN_NO; + + char buf[100]; + int ret; + + do { + bch2_print(c, " (y,n, or Y,N for all errors of this type) "); + + int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); + if (r < 0) + return YN_NO; + buf[r] = '\0'; + } while ((ret = parse_yn_response(buf)) < 0); + + return ret; +} #else #include "tools-util.h" -enum ask_yn bch2_fsck_ask_yn(void) +static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) { char *buf = NULL; size_t buflen = 0; - bool ret; + int ret; - while (true) { + do { fputs(" (y,n, or Y,N for all errors of this type) ", stdout); fflush(stdout); if (getline(&buf, &buflen, stdin) < 0) die("error reading from standard input"); - - strim(buf); - if (strlen(buf) != 1) - continue; - - switch (buf[0]) { - case 'n': - return YN_NO; - case 'y': - return YN_YES; - case 'N': - return YN_ALLNO; - case 'Y': - return YN_ALLYES; - } - } + } while ((ret = parse_yn_response(buf)) < 0); free(buf); return ret; @@ -114,7 +141,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) { struct fsck_err_state *s; - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + if (!test_bit(BCH_FS_fsck_running, &c->flags)) return NULL; list_for_each_entry(s, &c->fsck_error_msgs, list) @@ -152,7 +179,8 @@ int bch2_fsck_err(struct bch_fs *c, struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; - if 
(test_bit(err, c->sb.errors_silent)) + if ((flags & FSCK_CAN_FIX) && + test_bit(err, c->sb.errors_silent)) return -BCH_ERR_fsck_fix; bch2_sb_error_count(c, err); @@ -196,7 +224,7 @@ int bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { + if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); @@ -221,10 +249,13 @@ int bch2_fsck_err(struct bch_fs *c, int ask; prt_str(out, ": fix?"); - bch2_print_string_as_lines(KERN_ERR, out->buf); + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s", out->buf); + else + bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - ask = bch2_fsck_ask_yn(); + ask = bch2_fsck_ask_yn(c); if (ask >= YN_ALLNO && s) s->fix = ask == YN_ALLNO @@ -253,10 +284,14 @@ int bch2_fsck_err(struct bch_fs *c, !(flags & FSCK_CAN_IGNORE))) ret = -BCH_ERR_fsck_errors_not_fixed; - if (print) - bch2_print_string_as_lines(KERN_ERR, out->buf); + if (print) { + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s\n", out->buf); + else + bch2_print_string_as_lines(KERN_ERR, out->buf); + } - if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) && + if (test_bit(BCH_FS_fsck_running, &c->flags) && (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore)) bch_err(c, "Unable to continue, halting"); @@ -274,10 +309,10 @@ int bch2_fsck_err(struct bch_fs *c, bch2_inconsistent_error(c); if (ret == -BCH_ERR_fsck_fix) { - set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + set_bit(BCH_FS_errors_fixed, &c->flags); } else { - set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); - set_bit(BCH_FS_ERROR, &c->flags); + set_bit(BCH_FS_errors_not_fixed, &c->flags); + set_bit(BCH_FS_error, &c->flags); } return ret; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 21af6fb8cecf..b9033bb4f11c 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -100,7 +100,7 @@ static int count_iters_for_insert(struct btree_trans *trans, return ret2 ?: ret; } -#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) +#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3) int bch2_extent_atomic_end(struct btree_trans *trans, struct btree_iter *iter, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 9d8afcb5979a..82ec056f4cdb 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -843,7 +843,6 @@ void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) if (ptr->dev == dev) @@ -855,7 +854,6 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && @@ -1065,7 +1063,6 @@ static int extent_ptr_invalid(struct bch_fs *c, struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr2; u64 bucket; u32 bucket_offset; struct bch_dev *ca; @@ -1307,7 +1304,6 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, } incompressible: if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { - const struct bch_extent_ptr *ptr; unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { 
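
[Editor's note, not part of the patch: the extents.c hunks above and the extents.h hunk below delete the per-function "const struct bch_extent_ptr *ptr;" locals because __bkey_for_each_ptr() is changed to declare its own cursor with typeof() inside the for-init. A minimal standalone C sketch of that pattern follows; the ptr_list / ptr_list_for_each names are invented for illustration and the advance logic is simplified relative to the real macro.]

    /*
     * Sketch only: a foreach macro that declares its own iteration
     * variable, mirroring the __bkey_for_each_ptr() change below.
     * Not bcachefs code; uses the GNU C typeof() extension as the
     * kernel does.
     */
    #include <stdio.h>

    struct ptr_list {
            unsigned nr;
            int data[8];
    };

    /*
     * The cursor _p is declared in the for-init via typeof(), so callers
     * no longer need a separate declaration -- which is why the hunks
     * above remove those locals.
     */
    #define ptr_list_for_each(_list, _p)                            \
            for (typeof(&(_list)->data[0]) _p = (_list)->data;      \
                 _p < (_list)->data + (_list)->nr;                  \
                 _p++)

    int main(void)
    {
            struct ptr_list l = { .nr = 3, .data = { 10, 20, 30 } };

            ptr_list_for_each(&l, p)        /* p is scoped to the loop */
                    printf("%d\n", *p);

            return 0;
    }

[Scoping the cursor to the loop appears to be the point of the change: the iterator can no longer be used (or left unused) outside the loop body, which is what the many deleted declarations in this patch reflect.]
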
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index a2ce8a3be13c..a855c94d43dd 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -300,7 +300,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) bkey_extent_entry_for_each_from(_p, _entry, _p.start) #define __bkey_for_each_ptr(_start, _end, _ptr) \ - for ((_ptr) = (_start); \ + for (typeof(_start) (_ptr) = (_start); \ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ (_ptr)++) @@ -415,8 +415,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .key_invalid = bch2_btree_ptr_invalid, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ - .trans_trigger = bch2_trans_mark_extent, \ - .atomic_trigger = bch2_mark_extent, \ + .trigger = bch2_trigger_extent, \ }) #define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ @@ -424,8 +423,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ - .trans_trigger = bch2_trans_mark_extent, \ - .atomic_trigger = bch2_mark_extent, \ + .trigger = bch2_trigger_extent, \ .min_val_size = 40, \ }) @@ -439,8 +437,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ .key_merge = bch2_extent_merge, \ - .trans_trigger = bch2_trans_mark_extent, \ - .atomic_trigger = bch2_mark_extent, \ + .trigger = bch2_trigger_extent, \ }) /* KEY_TYPE_reservation: */ @@ -454,8 +451,7 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .key_invalid = bch2_reservation_invalid, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ - .trans_trigger = bch2_trans_mark_reservation, \ - .atomic_trigger = bch2_mark_reservation, \ + .trigger = bch2_trigger_reservation, \ .min_val_size = 8, \ }) @@ -547,7 +543,6 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) static inline bool bkey_extent_is_unwritten(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) if (ptr->unwritten) @@ -565,10 +560,9 @@ static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(p, ptr) - ret.devs[ret.nr++] = ptr->dev; + ret.data[ret.nr++] = ptr->dev; return ret; } @@ -577,11 +571,10 @@ static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(p, ptr) if (!ptr->cached) - ret.devs[ret.nr++] = ptr->dev; + ret.data[ret.nr++] = ptr->dev; return ret; } @@ -590,11 +583,10 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(p, ptr) if (ptr->cached) - ret.devs[ret.nr++] = ptr->dev; + ret.data[ret.nr++] = ptr->dev; return ret; } diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 05429c9631cd..9637f636e32d 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -261,11 +261,11 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, #define eytzinger0_find(base, nr, size, _cmp, 
search) \ ({ \ - void *_base = (base); \ - void *_search = (search); \ - size_t _nr = (nr); \ - size_t _size = (size); \ - size_t _i = 0; \ + void *_base = (base); \ + const void *_search = (search); \ + size_t _nr = (nr); \ + size_t _size = (size); \ + size_t _i = 0; \ int _res; \ \ while (_i < _nr && \ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 4496cf91a4c1..1c1ea0f0c692 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -166,10 +166,8 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - new_inode->bi_dir = dir_u->bi_inum; - new_inode->bi_dir_offset = dir_offset; - } + new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; } inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; @@ -228,10 +226,8 @@ int bch2_link_trans(struct btree_trans *trans, if (ret) goto err; - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - inode_u->bi_dir = dir.inum; - inode_u->bi_dir_offset = dir_offset; - } + inode_u->bi_dir = dir.inum; + inode_u->bi_dir_offset = dir_offset; ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); @@ -414,21 +410,19 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - src_inode_u->bi_dir = dst_dir_u->bi_inum; - src_inode_u->bi_dir_offset = dst_offset; + src_inode_u->bi_dir = dst_dir_u->bi_inum; + src_inode_u->bi_dir_offset = dst_offset; - if (mode == BCH_RENAME_EXCHANGE) { - dst_inode_u->bi_dir = src_dir_u->bi_inum; - dst_inode_u->bi_dir_offset = src_offset; - } + if (mode == BCH_RENAME_EXCHANGE) { + dst_inode_u->bi_dir = src_dir_u->bi_inum; + dst_inode_u->bi_dir_offset = src_offset; + } - if (mode == BCH_RENAME_OVERWRITE && - dst_inode_u->bi_dir == dst_dir_u->bi_inum && - dst_inode_u->bi_dir_offset == src_offset) { - dst_inode_u->bi_dir = 0; - dst_inode_u->bi_dir_offset = 0; - } + if (mode == BCH_RENAME_OVERWRITE && + dst_inode_u->bi_dir == dst_dir_u->bi_inum && + dst_inode_u->bi_dir_offset == src_offset) { + dst_inode_u->bi_dir = 0; + dst_inode_u->bi_dir_offset = 0; } if (mode == BCH_RENAME_OVERWRITE) { diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 52f0e7acda3d..73c12e565af5 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -52,26 +52,20 @@ struct readpages_iter { static int readpages_iter_init(struct readpages_iter *iter, struct readahead_control *ractl) { - struct folio **fi; - int ret; - - memset(iter, 0, sizeof(*iter)); + struct folio *folio; - iter->mapping = ractl->mapping; + *iter = (struct readpages_iter) { ractl->mapping }; - ret = bch2_filemap_get_contig_folios_d(iter->mapping, - ractl->_index << PAGE_SHIFT, - (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, - 0, mapping_gfp_mask(iter->mapping), - &iter->folios); - if (ret) - return ret; + while ((folio = __readahead_folio(ractl))) { + if (!bch2_folio_create(folio, GFP_KERNEL) || + darray_push(&iter->folios, folio)) { + bch2_folio_release(folio); + ractl->_nr_pages += folio_nr_pages(folio); + ractl->_index -= folio_nr_pages(folio); + return iter->folios.nr ? 
0 : -ENOMEM; + } - darray_for_each(iter->folios, fi) { - ractl->_nr_pages -= 1U << folio_order(*fi); - __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); - folio_put(*fi); - folio_put(*fi); + folio_put(folio); } return 0; @@ -273,12 +267,12 @@ void bch2_readahead(struct readahead_control *ractl) struct btree_trans *trans = bch2_trans_get(c); struct folio *folio; struct readpages_iter readpages_iter; - int ret; bch2_inode_opts_get(&opts, c, &inode->ei_inode); - ret = readpages_iter_init(&readpages_iter, ractl); - BUG_ON(ret); + int ret = readpages_iter_init(&readpages_iter, ractl); + if (ret) + return; bch2_pagecache_add_get(inode); @@ -638,7 +632,7 @@ do_io: /* Check for writing past i_size: */ WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), + !test_bit(BCH_FS_emergency_ro, &c->flags), "writing past i_size: %llu > %llu (unrounded %llu)\n", bio_end_sector(&w->io->op.wbio.bio) << 9, round_up(i_size, block_bytes(c)), @@ -826,7 +820,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; folios fs; - struct folio **fi, *f; + struct folio *f; unsigned copied = 0, f_offset, f_copied; u64 end = pos + len, f_pos, f_len; loff_t last_folio_pos = inode->v.i_size; diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 84e20c3ada6c..fdd57c5785c9 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -77,9 +77,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) bch2_inode_opts_get(&opts, c, &inode->ei_inode); - if ((offset|iter->count) & (block_bytes(c) - 1)) - return -EINVAL; - ret = min_t(loff_t, iter->count, max_t(loff_t, 0, i_size_read(&inode->v) - offset)); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index b0e8144ec550..98bd5babab19 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -192,13 +192,17 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret, ret2, ret3; + int ret; ret = file_write_and_wait_range(file, start, end); - ret2 = sync_inode_metadata(&inode->v, 1); - ret3 = bch2_flush_inode(c, inode); - - return bch2_err_class(ret ?: ret2 ?: ret3); + if (ret) + goto out; + ret = sync_inode_metadata(&inode->v, 1); + if (ret) + goto out; + ret = bch2_flush_inode(c, inode); +out: + return bch2_err_class(ret); } /* truncate: */ @@ -861,7 +865,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, abs(pos_src - pos_dst) < len) return -EINVAL; - bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + lock_two_nondirectories(&src->v, &dst->v); + bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst); inode_dio_wait(&src->v); inode_dio_wait(&dst->v); @@ -914,7 +919,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, ret = bch2_flush_inode(c, dst); err: bch2_quota_reservation_put(c, dst, "a_res); - bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst); + unlock_two_nondirectories(&src->v, &dst->v); return bch2_err_class(ret); } diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 14d5cc6f90d7..946cc610eef5 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -285,34 +285,26 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) bch_notice(c, "shutdown by ioctl type 
%u", flags); - down_write(&c->vfs_sb->s_umount); - switch (flags) { case FSOP_GOING_FLAGS_DEFAULT: - ret = freeze_bdev(c->vfs_sb->s_bdev); + ret = bdev_freeze(c->vfs_sb->s_bdev); if (ret) - goto err; - + break; bch2_journal_flush(&c->journal); - c->vfs_sb->s_flags |= SB_RDONLY; bch2_fs_emergency_read_only(c); - thaw_bdev(c->vfs_sb->s_bdev); + bdev_thaw(c->vfs_sb->s_bdev); break; - case FSOP_GOING_FLAGS_LOGFLUSH: bch2_journal_flush(&c->journal); fallthrough; - case FSOP_GOING_FLAGS_NOLOGFLUSH: - c->vfs_sb->s_flags |= SB_RDONLY; bch2_fs_emergency_read_only(c); break; default: ret = -EINVAL; break; } -err: - up_write(&c->vfs_sb->s_umount); + return ret; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 49da8db1d9e9..ec419b8e2c43 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -93,7 +93,7 @@ retry: BTREE_ITER_INTENT) ?: (set ? set(trans, inode, &inode_u, p) : 0) ?: bch2_inode_write(trans, &iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); /* * the btree node lock protects inode->ei_inode, not ei_update_lock; @@ -455,7 +455,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_enospc, bch2_unlink_trans(trans, inode_inum(dir), &dir_u, &inode_u, &dentry->d_name, @@ -729,7 +729,7 @@ retry: ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); btree_err: bch2_trans_iter_exit(trans, &inode_iter); @@ -1012,15 +1012,13 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; if (!dir_emit_dots(file, ctx)) return 0; - ret = bch2_readdir(c, inode_inum(inode), ctx); - if (ret) - bch_err_fn(c, ret); + int ret = bch2_readdir(c, inode_inum(inode), ctx); + bch_err_fn(c, ret); return bch2_err_class(ret); } @@ -1131,7 +1129,7 @@ static const struct address_space_operations bch_address_space_operations = { #ifdef CONFIG_MIGRATION .migrate_folio = filemap_migrate_folio, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; struct bcachefs_fid { @@ -1500,7 +1498,7 @@ static void bch2_evict_inode(struct inode *vinode) void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) { - struct bch_inode_info *inode, **i; + struct bch_inode_info *inode; DARRAY(struct bch_inode_info *) grabbed; bool clean_pass = false, this_pass_clean; @@ -1626,43 +1624,18 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static char **split_devs(const char *_dev_name, unsigned *nr) -{ - char *dev_name = NULL, **devs = NULL, *s; - size_t i = 0, nr_devs = 0; - - dev_name = kstrdup(_dev_name, GFP_KERNEL); - if (!dev_name) - return NULL; - - for (s = dev_name; s; s = strchr(s + 1, ':')) - nr_devs++; - - devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); - if (!devs) { - kfree(dev_name); - return NULL; - } - - while ((s = strsep(&dev_name, ":"))) - devs[i++] = s; - - *nr = nr_devs; - return devs; -} - static int bch2_remount(struct super_block *sb, int *flags, char *data) { struct bch_fs *c = sb->s_fs_info; struct bch_opts opts = bch2_opts_empty(); int ret; - opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); - ret = bch2_parse_mount_opts(c, &opts, 
data); if (ret) goto err; + opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); + if (opts.read_only != c->opts.read_only) { down_write(&c->state_lock); @@ -1696,11 +1669,9 @@ err: static int bch2_show_devname(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; - struct bch_dev *ca; - unsigned i; bool first = true; - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { if (!first) seq_putc(seq, ':'); first = false; @@ -1770,7 +1741,7 @@ static int bch2_unfreeze(struct super_block *sb) struct bch_fs *c = sb->s_fs_info; int ret; - if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + if (test_bit(BCH_FS_emergency_ro, &c->flags)) return 0; down_write(&c->state_lock); @@ -1805,17 +1776,18 @@ static int bch2_noset_super(struct super_block *s, void *data) return -EBUSY; } +typedef DARRAY(struct bch_fs *) darray_fs; + static int bch2_test_super(struct super_block *s, void *data) { struct bch_fs *c = s->s_fs_info; - struct bch_fs **devs = data; - unsigned i; + darray_fs *d = data; if (!c) return false; - for (i = 0; devs[i]; i++) - if (c != devs[i]) + darray_for_each(*d, i) + if (c != *i) return false; return true; } @@ -1824,13 +1796,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct bch_fs *c; - struct bch_dev *ca; struct super_block *sb; struct inode *vinode; struct bch_opts opts = bch2_opts_empty(); - char **devs; - struct bch_fs **devs_to_fs = NULL; - unsigned i, nr_devs; int ret; opt_set(opts, read_only, (flags & SB_RDONLY) != 0); @@ -1842,25 +1810,25 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (!dev_name || strlen(dev_name) == 0) return ERR_PTR(-EINVAL); - devs = split_devs(dev_name, &nr_devs); - if (!devs) - return ERR_PTR(-ENOMEM); + darray_str devs; + ret = bch2_split_devs(dev_name, &devs); + if (ret) + return ERR_PTR(ret); - devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); - if (!devs_to_fs) { - sb = ERR_PTR(-ENOMEM); - goto got_sb; + darray_fs devs_to_fs = {}; + darray_for_each(devs, i) { + ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); + if (ret) { + sb = ERR_PTR(ret); + goto got_sb; + } } - for (i = 0; i < nr_devs; i++) - devs_to_fs[i] = bch2_path_to_fs(devs[i]); - - sb = sget(fs_type, bch2_test_super, bch2_noset_super, - flags|SB_NOSEC, devs_to_fs); + sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs); if (!IS_ERR(sb)) goto got_sb; - c = bch2_fs_open(devs, nr_devs, opts); + c = bch2_fs_open(devs.data, devs.nr, opts); if (IS_ERR(c)) { sb = ERR_CAST(c); goto got_sb; @@ -1880,9 +1848,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) bch2_fs_stop(c); got_sb: - kfree(devs_to_fs); - kfree(devs[0]); - kfree(devs); + darray_exit(&devs_to_fs); + bch2_darray_str_exit(&devs); if (IS_ERR(sb)) { ret = PTR_ERR(sb); @@ -1923,7 +1890,7 @@ got_sb: sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { struct block_device *bdev = ca->disk_sb.bdev; /* XXX: create an anonymous device for multi device filesystems */ @@ -1944,10 +1911,9 @@ got_sb: vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); - if (ret) { - bch_err_msg(c, ret, "mounting: error getting root inode"); + bch_err_msg(c, ret, "mounting: error getting root inode"); + if (ret) goto err_put_super; - } sb->s_root = d_make_root(vinode); if (!sb->s_root) { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 
5edf1d4b9e6b..c3af7225ff69 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -77,9 +77,8 @@ static inline int ptrcmp(void *l, void *r) } enum bch_inode_lock_op { - INODE_LOCK = (1U << 0), - INODE_PAGECACHE_BLOCK = (1U << 1), - INODE_UPDATE_LOCK = (1U << 2), + INODE_PAGECACHE_BLOCK = (1U << 0), + INODE_UPDATE_LOCK = (1U << 1), }; #define bch2_lock_inodes(_locks, ...) \ @@ -91,8 +90,6 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_LOCK) \ - down_write_nested(&a[i]->v.i_rwsem, i); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ bch2_pagecache_block_get(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ @@ -109,8 +106,6 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_LOCK) \ - up_write(&a[i]->v.i_rwsem); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ bch2_pagecache_block_put(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e0c5cd119acc..4f0ecd605675 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -20,8 +20,6 @@ #include <linux/bsearch.h> #include <linux/dcache.h> /* struct qstr */ -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } - /* * XXX: this is handling transaction restarts without returning * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: @@ -29,19 +27,16 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, u32 snapshot) { - struct btree_iter iter; - struct bkey_s_c k; u64 sectors = 0; - int ret; - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, SPOS(inum, 0, snapshot), POS(inum, U64_MAX), - 0, k, ret) + 0, k, ({ if (bkey_extent_is_allocation(k.k)) sectors += k.k->size; - - bch2_trans_iter_exit(trans, &iter); + 0; + })); return ret ?: sectors; } @@ -49,45 +44,23 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, u32 snapshot) { - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_dirent d; u64 subdirs = 0; - int ret; - - for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - POS(inum, U64_MAX), - 0, k, ret) { - if (k.k->type != KEY_TYPE_dirent) - continue; - d = bkey_s_c_to_dirent(k); - if (d.v->d_type == DT_DIR) + int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ({ + if (k.k->type == KEY_TYPE_dirent && + bkey_s_c_to_dirent(k).v->d_type == DT_DIR) subdirs++; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); return ret ?: subdirs; } -static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, - u32 *subvol) -{ - struct bch_snapshot s; - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, - POS(0, snapshot), 0, - snapshot, &s); - if (!ret) - *subvol = le32_to_cpu(s.subvol); - else if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot %u not found", snapshot); - return ret; - -} - -static int __subvol_lookup(struct btree_trans *trans, u32 subvol, - u32 *snapshot, u64 *inum) +static int subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) { struct bch_subvolume s; int ret; @@ -99,12 +72,6 @@ static int __subvol_lookup(struct btree_trans *trans, u32 subvol, return ret; } -static int subvol_lookup(struct btree_trans *trans, u32 subvol, - u32 *snapshot, u64 *inum) -{ - return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, 
inum)); -} - static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode) { @@ -132,7 +99,7 @@ err: return ret; } -static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, +static int lookup_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode, u32 *snapshot) { @@ -157,13 +124,6 @@ err: return ret; } -static int lookup_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode, - u32 *snapshot) -{ - return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); -} - static int __lookup_dirent(struct btree_trans *trans, struct bch_hash_info hash_info, subvol_inum dir, struct qstr *name, @@ -207,12 +167,9 @@ static int fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) { - int ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - __write_inode(trans, inode, snapshot)); - if (ret) - bch_err_fn(trans->c, ret); + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __write_inode(trans, inode, snapshot)); + bch_err_fn(trans->c, ret); return ret; } @@ -242,35 +199,43 @@ err: } /* Get lost+found, create if it doesn't exist: */ -static int lookup_lostfound(struct btree_trans *trans, u32 subvol, +static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, struct bch_inode_unpacked *lostfound) { struct bch_fs *c = trans->c; - struct bch_inode_unpacked root; - struct bch_hash_info root_hash_info; struct qstr lostfound_str = QSTR("lost+found"); - subvol_inum root_inum = { .subvol = subvol }; u64 inum = 0; unsigned d_type = 0; - u32 snapshot; int ret; - ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); + struct bch_snapshot_tree st; + ret = bch2_snapshot_tree_lookup(trans, + bch2_snapshot_tree(c, snapshot), &st); + if (ret) + return ret; + + subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; + u32 subvol_snapshot; + + ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol), + &subvol_snapshot, &root_inum.inum); + bch_err_msg(c, ret, "looking up root subvol"); if (ret) return ret; - ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot); + struct bch_inode_unpacked root_inode; + struct bch_hash_info root_hash_info; + ret = lookup_inode(trans, root_inum.inum, &root_inode, &snapshot); + bch_err_msg(c, ret, "looking up root inode"); if (ret) return ret; - root_hash_info = bch2_hash_info_init(c, &root); + root_hash_info = bch2_hash_info_init(c, &root_inode); ret = __lookup_dirent(trans, root_hash_info, root_inum, - &lostfound_str, &inum, &d_type); - if (bch2_err_matches(ret, ENOENT)) { - bch_notice(c, "creating lost+found"); + &lostfound_str, &inum, &d_type); + if (bch2_err_matches(ret, ENOENT)) goto create_lostfound; - } bch_err_fn(c, ret); if (ret) @@ -285,20 +250,50 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, * The bch2_check_dirents pass has already run, dangling dirents * shouldn't exist here: */ - return __lookup_inode(trans, inum, lostfound, &snapshot); + return lookup_inode(trans, inum, lostfound, &snapshot); create_lostfound: + /* + * XXX: we could have a nicer log message here if we had a nice way to + * walk backpointers to print a path + */ + bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot)); + + u64 now = bch2_current_time(c); + struct btree_iter lostfound_iter = { NULL }; + u64 cpu = raw_smp_processor_id(); + bch2_inode_init_early(c, lostfound); + bch2_inode_init_late(lostfound, 
now, 0, 0, S_IFDIR|0700, 0, &root_inode); + lostfound->bi_dir = root_inode.bi_inum; + + root_inode.bi_nlink++; + + ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); + ret = bch2_btree_iter_traverse(&lostfound_iter); + if (ret) + goto err; - ret = bch2_create_trans(trans, root_inum, &root, - lostfound, &lostfound_str, - 0, 0, S_IFDIR|0700, 0, NULL, NULL, - (subvol_inum) { }, 0); + ret = bch2_dirent_create_snapshot(trans, + root_inode.bi_inum, snapshot, &root_hash_info, + mode_to_type(lostfound->bi_mode), + &lostfound_str, + lostfound->bi_inum, + &lostfound->bi_dir_offset, + BCH_HASH_SET_MUST_CREATE) ?: + bch2_inode_write_flags(trans, &lostfound_iter, lostfound, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +err: bch_err_msg(c, ret, "creating lost+found"); + bch2_trans_iter_exit(trans, &lostfound_iter); return ret; } -static int __reattach_inode(struct btree_trans *trans, +static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 inode_snapshot) { @@ -307,14 +302,9 @@ static int __reattach_inode(struct btree_trans *trans, char name_buf[20]; struct qstr name; u64 dir_offset = 0; - u32 subvol; int ret; - ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol); - if (ret) - return ret; - - ret = lookup_lostfound(trans, subvol, &lostfound); + ret = lookup_lostfound(trans, inode_snapshot, &lostfound); if (ret) return ret; @@ -331,15 +321,12 @@ static int __reattach_inode(struct btree_trans *trans, snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); name = (struct qstr) QSTR(name_buf); - ret = bch2_dirent_create(trans, - (subvol_inum) { - .subvol = subvol, - .inum = lostfound.bi_inum, - }, - &dir_hash, - inode_d_type(inode), - &name, inode->bi_inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + ret = bch2_dirent_create_snapshot(trans, + lostfound.bi_inum, inode_snapshot, + &dir_hash, + inode_d_type(inode), + &name, inode->bi_inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); if (ret) return ret; @@ -349,18 +336,6 @@ static int __reattach_inode(struct btree_trans *trans, return __write_inode(trans, inode, inode_snapshot); } -static int reattach_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 inode_snapshot) -{ - int ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - __reattach_inode(trans, inode, inode_snapshot)); - bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum); - return ret; -} - static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { @@ -405,7 +380,7 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s }; int ret = 0; - darray_for_each(s->ids, i) { + __darray_for_each(s->ids, i) { if (i->id == id) return 0; if (i->id > id) @@ -422,7 +397,7 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, enum btree_id btree_id, struct bpos pos) { - struct snapshots_seen_entry *i, n = { + struct snapshots_seen_entry n = { .id = pos.snapshot, .equiv = bch2_snapshot_equiv(c, pos.snapshot), }; @@ -448,7 +423,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, bch2_btree_id_str(btree_id), pos.inode, pos.offset, i->id, n.id, n.equiv); - set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); return 
bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); } } @@ -593,14 +568,13 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - u32 restart_count = trans->restart_count; int ret; w->recalculate_sums = false; w->inodes.nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->p.offset != inum) break; @@ -613,8 +587,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, return ret; w->first_this_inode = true; - - return trans_was_restarted(trans, restart_count); + return 0; } static struct inode_walker_entry * @@ -625,7 +598,7 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, snapshot = bch2_snapshot_equiv(c, snapshot); - darray_for_each(w->inodes, i) + __darray_for_each(w->inodes, i) if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) goto found; @@ -667,11 +640,8 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, if (ret) return ERR_PTR(ret); } else if (bkey_cmp(w->last_pos, pos)) { - struct inode_walker_entry *i; - darray_for_each(w->inodes, i) i->seen_this_pos = false; - } w->last_pos = pos; @@ -756,9 +726,7 @@ static int hash_redo_key(struct btree_trans *trans, k.k->p.snapshot, tmp, BCH_HASH_SET_MUST_CREATE, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } static int hash_check_key(struct btree_trans *trans, @@ -826,6 +794,18 @@ fsck_err: goto out; } +static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + bch2_trans_iter_exit(trans, &iter); + return k.k->type == KEY_TYPE_set; +} + static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -867,7 +847,7 @@ static int check_inode(struct btree_trans *trans, c, inode_snapshot_mismatch, "inodes in different snapshots don't match")) { bch_err(c, "repair not implemented yet"); - return -EINVAL; + return -BCH_ERR_fsck_repair_unimplemented; } if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && @@ -890,14 +870,22 @@ static int check_inode(struct btree_trans *trans, return 0; } + if (u.bi_flags & BCH_INODE_unlinked) { + ret = check_inode_deleted_list(trans, k.k->p); + if (ret < 0) + return ret; + + fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list, + "inode %llu:%u unlinked, but not on deleted list", + u.bi_inum, k.k->p.snapshot); + ret = 0; + } + if (u.bi_flags & BCH_INODE_unlinked && (!c->sb.clean || fsck_err(c, inode_unlinked_but_clean, "filesystem marked clean, but inode %llu unlinked", u.bi_inum))) { - bch2_trans_unlock(trans); - bch2_fs_lazy_rw(c); - ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck deleting inode"); return ret; @@ -910,9 +898,6 @@ static int check_inode(struct btree_trans *trans, u.bi_inum))) { bch_verbose(c, "truncating inode %llu", u.bi_inum); - bch2_trans_unlock(trans); - bch2_fs_lazy_rw(c); - /* * XXX: need to truncate partial blocks too here - or ideally * just switch units to bytes and that issue goes away @@ -976,27 +961,22 @@ fsck_err: 
return ret; } -noinline_for_stack int bch2_check_inodes(struct bch_fs *c) { bool full = c->opts.fsck; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; struct bch_inode_unpacked prev = { 0 }; struct snapshots_seen s; - struct bkey_s_c k; - int ret; snapshots_seen_init(&s); - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, - POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_inode(trans, &iter, k, &prev, &s, full)); + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_inode(trans, &iter, k, &prev, &s, full))); snapshots_seen_exit(&s); - bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } @@ -1023,29 +1003,9 @@ static bool dirent_points_to_inode(struct bkey_s_c_dirent d, : le64_to_cpu(d.v->d_inum) == inode->bi_inum; } -static int inode_backpointer_exists(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c_dirent d; - int ret; - - d = dirent_get_by_pos(trans, &iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); - ret = bkey_err(d); - if (ret) - return bch2_err_matches(ret, ENOENT) ? 0 : ret; - - ret = dirent_points_to_inode(d, inode); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; u32 restart_count = trans->restart_count; int ret = 0; s64 count2; @@ -1094,11 +1054,8 @@ struct extent_ends { static void extent_ends_reset(struct extent_ends *extent_ends) { - struct extent_end *i; - darray_for_each(extent_ends->e, i) snapshots_seen_exit(&i->seen); - extent_ends->e.nr = 0; } @@ -1130,7 +1087,7 @@ static int extent_ends_at(struct bch_fs *c, if (!n.seen.ids.data) return -BCH_ERR_ENOMEM_fsck_extent_ends_at; - darray_for_each(extent_ends->e, i) { + __darray_for_each(extent_ends->e, i) { if (i->snapshot == k.k->p.snapshot) { snapshots_seen_exit(&i->seen); *i = n; @@ -1220,13 +1177,12 @@ static int overlapping_extents_found(struct btree_trans *trans, swap(k1, k2); } - trans->extra_journal_res += bch2_bkey_sectors_compressed(k2); + trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); ret = bch2_trans_update_extent_overwrite(trans, old_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, k1, k2) ?: - bch2_trans_commit(trans, &res, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(c, &res); if (ret) @@ -1270,7 +1226,6 @@ static int check_overlapping_extents(struct btree_trans *trans, bool *fixed) { struct bch_fs *c = trans->c; - struct extent_end *i; int ret = 0; /* transaction restart, running again */ @@ -1451,32 +1406,28 @@ int bch2_check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct snapshots_seen s; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct extent_ends extent_ends; struct disk_reservation res = { 0 }; - int ret = 0; snapshots_seen_init(&s); extent_ends_init(&extent_ends); - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - &res, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ - bch2_disk_reservation_put(c, &res); - 
check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: - check_extent_overbig(trans, &iter, k); - })) ?: - check_i_sectors(trans, &w); + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + &res, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ + bch2_disk_reservation_put(c, &res); + check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: + check_extent_overbig(trans, &iter, k); + })) ?: + check_i_sectors(trans, &w)); bch2_disk_reservation_put(c, &res); extent_ends_exit(&extent_ends); inode_walker_exit(&w); snapshots_seen_exit(&s); - bch2_trans_put(trans); bch_err_fn(c, ret); return ret; @@ -1484,24 +1435,19 @@ int bch2_check_extents(struct bch_fs *c) int bch2_check_indirect_extents(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct disk_reservation res = { 0 }; - int ret = 0; - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, - POS_MIN, - BTREE_ITER_PREFETCH, k, - &res, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ - bch2_disk_reservation_put(c, &res); - check_extent_overbig(trans, &iter, k); - })); + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, + POS_MIN, + BTREE_ITER_PREFETCH, k, + &res, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ + bch2_disk_reservation_put(c, &res); + check_extent_overbig(trans, &iter, k); + }))); bch2_disk_reservation_put(c, &res); - bch2_trans_put(trans); - bch_err_fn(c, ret); return ret; } @@ -1509,7 +1455,6 @@ int bch2_check_indirect_extents(struct bch_fs *c) static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; u32 restart_count = trans->restart_count; int ret = 0; s64 count2; @@ -1553,8 +1498,8 @@ static int check_dirent_target(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_i_dirent *n; - bool backpointer_exists = true; struct printbuf buf = PRINTBUF; + struct btree_iter bp_iter = { NULL }; int ret = 0; if (!target->bi_dir && @@ -1568,25 +1513,37 @@ static int check_dirent_target(struct btree_trans *trans, } if (!inode_points_to_dirent(target, d)) { - ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); - if (ret < 0) + struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, + SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) goto err; - backpointer_exists = ret; + bool backpointer_exists = !ret; ret = 0; + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + if (backpointer_exists) + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, c, inode_dir_multiple_links, - "directory %llu with multiple links", - target->bi_inum)) { + "directory %llu:%u with multiple links\n%s", + target->bi_inum, target_snapshot, buf.buf)) { ret = __remove_dirent(trans, d.k->p); goto out; } + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ if (fsck_err_on(backpointer_exists && !target->bi_nlink, c, inode_multiple_links_but_nlink_0, - "inode %llu type %s has multiple links but i_nlink 0", - target->bi_inum, bch2_d_types[d.v->d_type])) { + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], 
buf.buf)) { target->bi_nlink++; target->bi_flags &= ~BCH_INODE_unlinked; @@ -1636,13 +1593,12 @@ static int check_dirent_target(struct btree_trans *trans, d = dirent_i_to_s_c(n); } - if (d.v->d_type == DT_SUBVOL && - target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && - (c->sb.version < bcachefs_metadata_version_subvol_dirent || - fsck_err(c, dirent_d_parent_subvol_wrong, - "dirent has wrong d_parent_subvol field: got %u, should be %u", - le32_to_cpu(d.v->d_parent_subvol), - target->bi_parent_subvol))) { + if (fsck_err_on(d.v->d_type == DT_SUBVOL && + target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol), + c, dirent_d_parent_subvol_wrong, + "dirent has wrong d_parent_subvol field: got %u, should be %u", + le32_to_cpu(d.v->d_parent_subvol), + target->bi_parent_subvol)) { n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ret = PTR_ERR_OR_ZERO(n); if (ret) @@ -1660,6 +1616,7 @@ static int check_dirent_target(struct btree_trans *trans, out: err: fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); printbuf_exit(&buf); bch_err_fn(c, ret); return ret; @@ -1701,7 +1658,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - BUG_ON(!iter->path->should_be_locked); + BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); ret = PTR_ERR_OR_ZERO(i); @@ -1754,7 +1711,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, u32 target_snapshot; u64 target_inum; - ret = __subvol_lookup(trans, target_subvol, + ret = subvol_lookup(trans, target_subvol, &target_snapshot, &target_inum); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -1766,7 +1723,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - ret = __lookup_inode(trans, target_inum, + ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -1842,22 +1799,18 @@ int bch2_check_dirents(struct bch_fs *c) struct inode_walker target = inode_walker_init(); struct snapshots_seen s; struct bch_hash_info hash_info; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; snapshots_seen_init(&s); - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, - k, - NULL, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)); + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s))); - bch2_trans_put(trans); snapshots_seen_exit(&s); inode_walker_exit(&dir); inode_walker_exit(&target); @@ -1908,8 +1861,6 @@ int bch2_check_xattrs(struct bch_fs *c) { struct inode_walker inode = inode_walker_init(); struct bch_hash_info hash_info; - struct btree_iter iter; - struct bkey_s_c k; int ret = 0; ret = bch2_trans_run(c, @@ -1918,7 +1869,7 @@ int bch2_check_xattrs(struct bch_fs *c) BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_enospc, check_xattr(trans, &iter, k, &hash_info, &inode))); bch_err_fn(c, ret); return ret; @@ -1932,7 +1883,7 @@ static int check_root_trans(struct btree_trans *trans) u64 inum; int ret; - ret = 
__subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); + ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -1948,18 +1899,13 @@ static int check_root_trans(struct btree_trans *trans) root_subvol.v.flags = 0; root_subvol.v.snapshot = cpu_to_le32(snapshot); root_subvol.v.inode = cpu_to_le64(inum); - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, - &root_subvol.k_i, 0)); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0); bch_err_msg(c, ret, "writing root subvol"); if (ret) goto err; - } - ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); + ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -1983,11 +1929,7 @@ fsck_err: /* Get root directory, create if it doesn't exist: */ int bch2_check_root(struct bch_fs *c) { - int ret; - - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_root_trans(trans)); bch_err_fn(c, ret); return ret; @@ -2002,13 +1944,10 @@ typedef DARRAY(struct pathbuf_entry) pathbuf; static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) { - struct pathbuf_entry *i; - darray_for_each(*p, i) if (i->inum == inum && i->snapshot == snapshot) return true; - return false; } @@ -2057,10 +1996,10 @@ static int check_path(struct btree_trans *trans, break; } - ret = lockrestart_do(trans, - PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, - parent_snapshot))).k)); + d = dirent_get_by_pos(trans, &dirent_iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, + parent_snapshot)); + ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) break; @@ -2097,13 +2036,12 @@ static int check_path(struct btree_trans *trans, ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); if (ret) { /* Should have been caught in dirents pass */ - bch_err(c, "error looking up parent directory: %i", ret); + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error looking up parent directory: %i", ret); break; } if (path_is_dup(p, inode->bi_inum, snapshot)) { - struct pathbuf_entry *i; - /* XXX print path */ bch_err(c, "directory structure loop"); @@ -2111,20 +2049,19 @@ static int check_path(struct btree_trans *trans, pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode->bi_inum, snapshot); - if (!fsck_err(c, dir_loop, - "directory structure loop")) + if (!fsck_err(c, dir_loop, "directory structure loop")) return 0; - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - remove_backpointer(trans, inode)); - if (ret) { - bch_err(c, "error removing dirent: %i", ret); + ret = remove_backpointer(trans, inode); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_msg(c, ret, "removing dirent"); + if (ret) break; - } ret = reattach_inode(trans, inode, snapshot); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum); + break; } } fsck_err: @@ -2139,37 +2076,28 @@ fsck_err: */ int bch2_check_directory_structure(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct bch_inode_unpacked u; pathbuf path = { 0, }; int ret; - 
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (!bkey_is_inode(k.k)) - continue; + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + if (!bkey_is_inode(k.k)) + continue; - ret = bch2_inode_unpack(k, &u); - if (ret) { - /* Should have been caught earlier in fsck: */ - bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); - break; - } + BUG_ON(bch2_inode_unpack(k, &u)); - if (u.bi_flags & BCH_INODE_unlinked) - continue; + if (u.bi_flags & BCH_INODE_unlinked) + continue; - ret = check_path(trans, &path, &u, iter.pos.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); + check_path(trans, &path, &u, iter.pos.snapshot); + }))); darray_exit(&path); + bch_err_fn(c, ret); return ret; } @@ -2255,47 +2183,39 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, struct nlink_table *t, u64 start, u64 *end) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bch_inode_unpacked u; - int ret = 0; - - for_each_btree_key(trans, iter, BTREE_ID_inodes, - POS(0, start), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (!bkey_is_inode(k.k)) - continue; - - /* Should never fail, checked by bch2_inode_invalid: */ - BUG_ON(bch2_inode_unpack(k, &u)); - - /* - * Backpointer and directory structure checks are sufficient for - * directories, since they can't have hardlinks: - */ - if (S_ISDIR(u.bi_mode)) - continue; + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_inodes, + POS(0, start), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + if (!bkey_is_inode(k.k)) + continue; - if (!u.bi_nlink) - continue; + /* Should never fail, checked by bch2_inode_invalid: */ + struct bch_inode_unpacked u; + BUG_ON(bch2_inode_unpack(k, &u)); - ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); - if (ret) { - *end = k.k->p.offset; - ret = 0; - break; - } + /* + * Backpointer and directory structure checks are sufficient for + * directories, since they can't have hardlinks: + */ + if (S_ISDIR(u.bi_mode)) + continue; - } - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); + if (!u.bi_nlink) + continue; - if (ret) - bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); + if (ret) { + *end = k.k->p.offset; + ret = 0; + break; + } + 0; + }))); + bch_err_fn(c, ret); return ret; } @@ -2303,42 +2223,34 @@ noinline_for_stack static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_trans *trans = bch2_trans_get(c); struct snapshots_seen s; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_dirent d; - int ret; snapshots_seen_init(&s); - for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); - if (ret) - break; - - switch (k.k->type) { - case KEY_TYPE_dirent: - d = bkey_s_c_to_dirent(k); + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + ret = 
snapshots_seen_update(c, &s, iter.btree_id, k.k->p); + if (ret) + break; - if (d.v->d_type != DT_DIR && - d.v->d_type != DT_SUBVOL) - inc_link(c, &s, links, range_start, range_end, - le64_to_cpu(d.v->d_inum), - bch2_snapshot_equiv(c, d.k->p.snapshot)); - break; - } - } - bch2_trans_iter_exit(trans, &iter); + if (k.k->type == KEY_TYPE_dirent) { + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - if (ret) - bch_err(c, "error in fsck: btree error %i while walking dirents", ret); + if (d.v->d_type != DT_DIR && + d.v->d_type != DT_SUBVOL) + inc_link(c, &s, links, range_start, range_end, + le64_to_cpu(d.v->d_inum), + bch2_snapshot_equiv(c, d.k->p.snapshot)); + } + 0; + }))); - bch2_trans_put(trans); snapshots_seen_exit(&s); + + bch_err_fn(c, ret); return ret; } @@ -2389,19 +2301,16 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_iter iter; - struct bkey_s_c k; size_t idx = 0; - int ret = 0; - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS(0, range_start), BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); if (ret < 0) { - bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); return ret; } @@ -2447,7 +2356,6 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, { struct bkey_s_c_reflink_p p; struct bkey_i_reflink_p *u; - int ret; if (k.k->type != KEY_TYPE_reflink_p) return 0; @@ -2458,7 +2366,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, return 0; u = bch2_trans_kmalloc(trans, sizeof(*u)); - ret = PTR_ERR_OR_ZERO(u); + int ret = PTR_ERR_OR_ZERO(u); if (ret) return ret; @@ -2471,19 +2379,15 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, int bch2_fix_reflink_p(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) return 0; - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS_MIN, BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, fix_reflink_p_key(trans, &iter, k))); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 9309cfeecd8d..37dce96f48ac 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -561,64 +561,46 @@ static inline bool bkey_is_deleted_inode(struct bkey_s_c k) return bkey_inode_flags(k) & BCH_INODE_unlinked; } -int bch2_trans_mark_inode(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) +int bch2_trigger_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_s new, + unsigned flags) { - int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); - bool old_deleted = bkey_is_deleted_inode(old); - bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new)); + s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); - if (nr) { - int ret = bch2_replicas_deltas_realloc(trans, 0); - struct replicas_delta_list *d = 
trans->fs_usage_deltas; + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (nr) { + int ret = bch2_replicas_deltas_realloc(trans, 0); + if (ret) + return ret; - if (ret) - return ret; - - d->nr_inodes += nr; - } + trans->fs_usage_deltas->nr_inodes += nr; + } - if (old_deleted != new_deleted) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted); - if (ret) - return ret; + bool old_deleted = bkey_is_deleted_inode(old); + bool new_deleted = bkey_is_deleted_inode(new.s_c); + if (old_deleted != new_deleted) { + int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted); + if (ret) + return ret; + } } - return 0; -} + if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) { + BUG_ON(!trans->journal_res.seq); -int bch2_mark_inode(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage; - u64 journal_seq = trans->journal_res.seq; - - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; - - BUG_ON(!journal_seq); - BUG_ON(new.k->type != KEY_TYPE_inode_v3); - - v->bi_journal_seq = cpu_to_le64(journal_seq); + bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } if (flags & BTREE_TRIGGER_GC) { - percpu_down_read(&c->mark_lock); - preempt_disable(); - - fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); - fs_usage->nr_inodes += bkey_is_inode(new.k); - fs_usage->nr_inodes -= bkey_is_inode(old.k); + struct bch_fs *c = trans->c; - preempt_enable(); + percpu_down_read(&c->mark_lock); + this_cpu_add(c->usage_gc->nr_inodes, nr); percpu_up_read(&c->mark_lock); } + return 0; } @@ -831,7 +813,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, &delete, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; @@ -894,7 +876,7 @@ retry: ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -1058,7 +1040,7 @@ retry: ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -1155,51 +1137,48 @@ delete: int bch2_delete_dead_inodes(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; bool need_another_pass; int ret; again: need_another_pass = false; - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - /* * Weird transaction restart handling here because on successful delete, * bch2_inode_rm_snapshot() will return a nested transaction restart, * but we can't retry because the btree write buffer won't have been * flushed and we'd spin: */ - for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass)); - if (ret < 0) - break; - - if (ret) { - if (!test_bit(BCH_FS_RW, 
&c->flags)) { - bch2_trans_unlock(trans); - bch2_fs_lazy_rw(c); - } - + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); + if (ret > 0) { bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - break; + /* + * We don't want to loop here: a transaction restart + * error here means we handled a transaction restart and + * we're actually done, but if we loop we'll retry the + * same key because the write buffer hasn't been flushed + * yet + */ + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } } - } - bch2_trans_iter_exit(trans, &iter); - if (!ret && need_another_pass) + ret; + })); + + if (!ret && need_another_pass) { + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; goto again; + } err: bch2_trans_put(trans); - return ret; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 88818a332b1e..b63f312581cf 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -17,32 +17,27 @@ int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ .val_to_text = bch2_inode_to_text, \ - .trans_trigger = bch2_trans_mark_inode, \ - .atomic_trigger = bch2_mark_inode, \ + .trigger = bch2_trigger_inode, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ .key_invalid = bch2_inode_v2_invalid, \ .val_to_text = bch2_inode_to_text, \ - .trans_trigger = bch2_trans_mark_inode, \ - .atomic_trigger = bch2_mark_inode, \ + .trigger = bch2_trigger_inode, \ .min_val_size = 32, \ }) #define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ .key_invalid = bch2_inode_v3_invalid, \ .val_to_text = bch2_inode_to_text, \ - .trans_trigger = bch2_trans_mark_inode, \ - .atomic_trigger = bch2_mark_inode, \ + .trigger = bch2_trigger_inode, \ .min_val_size = 48, \ }) diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index bebc11444ef5..ca6d5f516aa2 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -34,8 +34,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, struct open_buckets open_buckets = { 0 }; struct bkey_s_c k; struct bkey_buf old, new; - unsigned sectors_allocated = 0; - bool have_reservation = false; + unsigned sectors_allocated = 0, new_replicas; bool unwritten = opts.nocow && c->sb.version >= bcachefs_metadata_version_unwritten_extents; int ret; @@ -50,28 +49,20 @@ int bch2_extent_fallocate(struct btree_trans *trans, return ret; sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); + new_replicas = max(0, (int) opts.data_replicas - + (int) bch2_bkey_nr_ptrs_fully_allocated(k)); - if (!have_reservation) { - unsigned new_replicas = - max(0, (int) opts.data_replicas - - (int) 
bch2_bkey_nr_ptrs_fully_allocated(k)); - /* - * Get a disk reservation before (in the nocow case) calling - * into the allocator: - */ - ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); - if (unlikely(ret)) - goto err; - - bch2_bkey_buf_reassemble(&old, c, k); - } + /* + * Get a disk reservation before (in the nocow case) calling + * into the allocator: + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) + goto err_noprint; - if (have_reservation) { - if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) - goto err; + bch2_bkey_buf_reassemble(&old, c, k); - bch2_key_resize(&new.k->k, sectors); - } else if (!unwritten) { + if (!unwritten) { struct bkey_i_reservation *reservation; bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); @@ -83,7 +74,6 @@ int bch2_extent_fallocate(struct btree_trans *trans, struct bkey_i_extent *e; struct bch_devs_list devs_have; struct write_point *wp; - struct bch_extent_ptr *ptr; devs_have.nr = 0; @@ -118,14 +108,17 @@ int bch2_extent_fallocate(struct btree_trans *trans, ptr->unwritten = true; } - have_reservation = true; - ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, 0, i_sectors_delta, true); err: if (!ret && sectors_allocated) bch2_increment_clock(c, sectors_allocated, WRITE); - + if (should_print_err(ret)) + bch_err_inum_offset_ratelimited(c, + inum.inum, + iter->pos.offset << 9, + "%s(): error: %s", __func__, bch2_err_str(ret)); +err_noprint: bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); bch2_bkey_buf_exit(&new, c); @@ -256,7 +249,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, u64 new_i_size = le64_to_cpu(op->v.new_i_size); int ret; - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, truncate_set_isize(trans, inum, new_i_size)); if (ret) goto err; @@ -378,7 +371,7 @@ case LOGGED_OP_FINSERT_start: op->v.state = LOGGED_OP_FINSERT_shift_extents; if (insert) { - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, adjust_i_size(trans, inum, src_offset, len) ?: bch2_logged_op_update(trans, &op->k_i)); if (ret) @@ -390,7 +383,7 @@ case LOGGED_OP_FINSERT_start: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto err; - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_logged_op_update(trans, &op->k_i)); } @@ -455,7 +448,7 @@ case LOGGED_OP_FINSERT_shift_extents: bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: - bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc); btree_err: bch2_disk_reservation_put(c, &disk_res); @@ -470,12 +463,12 @@ btree_err: op->v.state = LOGGED_OP_FINSERT_finish; if (!insert) { - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, adjust_i_size(trans, inum, src_offset, shift) ?: bch2_logged_op_update(trans, &op->k_i)); } else { /* We need an inode update to update bi_journal_seq for fsync: */ - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, adjust_i_size(trans, inum, 0, 0) ?: bch2_logged_op_update(trans, 
&op->k_i)); } diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 36763865facd..3c574d8873a1 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -80,7 +80,7 @@ struct promote_op { struct bpos pos; struct data_update write; - struct bio_vec bi_inline_vecs[0]; /* must be last */ + struct bio_vec bi_inline_vecs[]; /* must be last */ }; static const struct rhashtable_params bch_promote_params = { @@ -172,11 +172,13 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, int ret; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) - return NULL; + return ERR_PTR(-BCH_ERR_nopromote_no_writes); - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); - if (!op) + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL); + if (!op) { + ret = -BCH_ERR_nopromote_enomem; goto err; + } op->start_time = local_clock(); op->pos = pos; @@ -187,24 +189,29 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, */ *rbio = kzalloc(sizeof(struct bch_read_bio) + sizeof(struct bio_vec) * pages, - GFP_NOFS); - if (!*rbio) + GFP_KERNEL); + if (!*rbio) { + ret = -BCH_ERR_nopromote_enomem; goto err; + } rbio_init(&(*rbio)->bio, opts); bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); - if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, - GFP_NOFS)) + if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { + ret = -BCH_ERR_nopromote_enomem; goto err; + } (*rbio)->bounce = true; (*rbio)->split = true; (*rbio)->kmalloc = true; if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, - bch_promote_params)) + bch_promote_params)) { + ret = -BCH_ERR_nopromote_in_flight; goto err; + } bio = &op->write.op.wbio.bio; bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); @@ -223,9 +230,8 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, * -BCH_ERR_ENOSPC_disk_reservation: */ if (ret) { - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); + BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params)); goto err; } @@ -239,7 +245,7 @@ err: *rbio = NULL; kfree(op); bch2_write_ref_put(c, BCH_WRITE_REF_promote); - return NULL; + return ERR_PTR(ret); } noinline @@ -274,10 +280,9 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, ? 
BTREE_ID_reflink : BTREE_ID_extents, k, pos, pick, opts, sectors, rbio); - if (!promote) { - ret = -BCH_ERR_nopromote_enomem; + ret = PTR_ERR_OR_ZERO(promote); + if (ret) goto nopromote; - } *bounce = true; *read_full = promote_full; @@ -526,7 +531,7 @@ out: static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { - bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_rbio_narrow_crcs(trans, rbio)); } @@ -637,12 +642,17 @@ csum_err: goto out; } + struct printbuf buf = PRINTBUF; + buf.atomic++; + prt_str(&buf, "data "); + bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); + bch_err_inum_offset_ratelimited(ca, rbio->read_pos.inode, rbio->read_pos.offset << 9, - "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", - rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, - csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); + "data %s", buf.buf); + printbuf_exit(&buf); + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 8c8cb1541ac9..33c0e783d546 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -316,8 +316,8 @@ int bch2_extent_update(struct btree_trans *trans, i_sectors_delta) ?: bch2_trans_update(trans, iter, k, 0) ?: bch2_trans_commit(trans, disk_res, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc); if (unlikely(ret)) return ret; @@ -396,17 +396,14 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, bool nocow) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - const struct bch_extent_ptr *ptr; struct bch_write_bio *n; - struct bch_dev *ca; BUG_ON(c->opts.nochanges); bkey_for_each_ptr(ptrs, ptr) { - BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || - !c->devs[ptr->dev]); + BUG_ON(!bch2_dev_exists2(c, ptr->dev)); - ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); if (to_entry(ptr + 1) < ptrs.end) { n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, @@ -1109,16 +1106,14 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, static inline void bch2_nocow_write_unlock(struct bch_write_op *op) { struct bch_fs *c = op->c; - const struct bch_extent_ptr *ptr; - struct bkey_i *k; for_each_keylist_key(&op->insert_keys, k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); bkey_for_each_ptr(ptrs, ptr) bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), - BUCKET_NOCOW_LOCK_UPDATE); + PTR_BUCKET_POS(c, ptr), + BUCKET_NOCOW_LOCK_UPDATE); } } @@ -1128,25 +1123,20 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, struct bkey_s_c k, u64 new_i_size) { - struct bkey_i *new; - struct bkey_ptrs ptrs; - struct bch_extent_ptr *ptr; - int ret; - if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { /* trace this */ return 0; } - new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; bch2_cut_front(bkey_start_pos(&orig->k), new); bch2_cut_back(orig->k.p, new); - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_for_each_ptr(ptrs, ptr) ptr->unwritten = 0; @@ -1167,16 +1157,12 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) { struct 
bch_fs *c = op->c; struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i *orig; - struct bkey_s_c k; - int ret; for_each_keylist_key(&op->insert_keys, orig) { - ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_INTENT, k, - NULL, NULL, BTREE_INSERT_NOFAIL, ({ + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); @@ -1228,10 +1214,7 @@ static void bch2_nocow_write(struct bch_write_op *op) struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; - struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr; DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; - struct bucket_to_lock *i; u32 snapshot; struct bucket_to_lock *stale_at; int ret; @@ -1273,7 +1256,7 @@ retry: break; /* Get iorefs before dropping btree locks: */ - ptrs = bch2_bkey_ptrs_c(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { struct bpos b = PTR_BUCKET_POS(c, ptr); struct nocow_lock_bucket *l = @@ -1464,6 +1447,10 @@ err: op->flags |= BCH_WRITE_DONE; if (ret < 0) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s(): error: %s", __func__, bch2_err_str(ret)); op->error = ret; break; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 8cf238be6213..8538ef34f62b 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -10,6 +10,7 @@ #include "bkey_methods.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "error.h" #include "journal.h" @@ -184,6 +185,8 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + trace_journal_entry_close(c, vstruct_bytes(buf->data)); + sectors = vstruct_blocks_plus(buf->data, c->block_bits, buf->u64s_reserved) << c->block_bits; BUG_ON(sectors > buf->sectors); @@ -330,6 +333,7 @@ static int journal_entry_open(struct journal *j) buf->must_flush = false; buf->separate_flush = false; buf->flush_time = 0; + buf->need_flush_to_write_buffer = true; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); @@ -363,11 +367,6 @@ static int journal_entry_open(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - if (j->res_get_blocked_start) - bch2_time_stats_update(j->blocked_time, - j->res_get_blocked_start); - j->res_get_blocked_start = 0; - mod_delayed_work(c->io_complete_wq, &j->write_work, msecs_to_jiffies(c->opts.journal_flush_delay)); @@ -467,15 +466,12 @@ retry: __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ret = journal_entry_open(j); - if (ret == JOURNAL_ERR_max_in_flight) + if (ret == JOURNAL_ERR_max_in_flight) { + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], + &j->max_in_flight_start, true); trace_and_count(c, journal_entry_full, c); -unlock: - if ((ret && ret != JOURNAL_ERR_insufficient_devices) && - !j->res_get_blocked_start) { - j->res_get_blocked_start = local_clock() ?: 1; - trace_and_count(c, journal_full, c); } - +unlock: can_discard = j->can_discard; spin_unlock(&j->lock); @@ -774,6 +770,48 @@ void bch2_journal_block(struct journal *j) journal_quiesce(j); } +static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) 
+{ + struct journal_buf *ret = NULL; + + mutex_lock(&j->buf_lock); + spin_lock(&j->lock); + max_seq = min(max_seq, journal_cur_seq(j)); + + for (u64 seq = journal_last_unwritten_seq(j); + seq <= max_seq; + seq++) { + unsigned idx = seq & JOURNAL_BUF_MASK; + struct journal_buf *buf = j->buf + idx; + + if (buf->need_flush_to_write_buffer) { + if (seq == journal_cur_seq(j)) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + + union journal_res_state s; + s.v = atomic64_read_acquire(&j->reservations.counter); + + ret = journal_state_count(s, idx) + ? ERR_PTR(-EAGAIN) + : buf; + break; + } + } + + spin_unlock(&j->lock); + if (IS_ERR_OR_NULL(ret)) + mutex_unlock(&j->buf_lock); + return ret; +} + +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +{ + struct journal_buf *ret; + + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); + return ret; +} + /* allocate journal on a device: */ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, @@ -955,8 +993,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, break; } - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); unlock: up_write(&c->state_lock); return ret; @@ -986,17 +1023,13 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); err: - if (ret) - bch_err_fn(ca, ret); + bch_err_fn(ca, ret); return ret; } int bch2_fs_journal_alloc(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { if (ca->journal.nr) continue; @@ -1225,6 +1258,7 @@ int bch2_fs_journal_init(struct journal *j) static struct lock_class_key res_key; unsigned i; + mutex_init(&j->buf_lock); spin_lock_init(&j->lock); spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); @@ -1260,10 +1294,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state s; - struct bch_dev *ca; unsigned long now = jiffies; - u64 seq; - unsigned i; + u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; if (!out->nr_tabstops) printbuf_tabstop_push(out, 24); @@ -1275,20 +1307,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); - prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); - prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); + prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); + prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); - prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); - prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); + prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); + prt_printf(out, "average write size:\t"); + prt_human_readable_u64(out, nr_writes 
? div64_u64(j->entry_bytes_written, nr_writes) : 0); + prt_newline(out); + prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); - prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) + prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); + prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); prt_printf(out, "current entry:\t\t"); switch (s.cur_entry_offset) { @@ -1305,10 +1340,10 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_newline(out); - for (seq = journal_cur_seq(j); + for (u64 seq = journal_cur_seq(j); seq >= journal_last_unwritten_seq(j); --seq) { - i = seq & JOURNAL_BUF_MASK; + unsigned i = seq & JOURNAL_BUF_MASK; prt_printf(out, "unwritten entry:"); prt_tab(out); @@ -1352,8 +1387,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) j->space[journal_space_total].next_entry, j->space[journal_space_total].total); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_journal]) { + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) @@ -1362,7 +1396,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) if (!ja->nr) continue; - prt_printf(out, "dev %u:\n", i); + prt_printf(out, "dev %u:\n", ca->dev_idx); prt_printf(out, "\tnr\t\t%u\n", ja->nr); prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 2f768e11aec9..4544ce24bb8a 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -119,7 +119,6 @@ static inline void journal_wake(struct journal *j) { wake_up(&j->wait); closure_wake_up(&j->async_wait); - closure_wake_up(&j->preres_wait); } static inline struct journal_buf *journal_cur_buf(struct journal *j) @@ -239,8 +238,6 @@ bch2_journal_add_entry(struct journal *j, struct journal_res *res, static inline bool journal_entry_empty(struct jset *j) { - struct jset_entry *i; - if (j->seq != j->last_seq) return false; @@ -426,6 +423,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 3eb6c3f62a81..b0f4dd491e12 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -4,6 +4,7 @@ #include "alloc_foreground.h" #include "btree_io.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "checksum.h" #include "disk_groups.h" @@ -26,11 +27,15 @@ static struct nonce journal_nonce(const struct jset *jset) 
}}; } -static bool jset_csum_good(struct bch_fs *c, struct jset *j) +static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) { - return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && - !bch2_crc_cmp(j->csum, - csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); + if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { + *csum = (struct bch_csum) {}; + return false; + } + + *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); + return !bch2_crc_cmp(j->csum, *csum); } static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) @@ -687,8 +692,6 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs le64_to_cpu(u->d[i].sectors), le64_to_cpu(u->d[i].fragmented)); } - - prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); } static int journal_entry_log_validate(struct bch_fs *c, @@ -725,6 +728,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, READ); +} + +static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, @@ -768,7 +787,6 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, static int jset_validate_entries(struct bch_fs *c, struct jset *jset, enum bkey_invalid_flags flags) { - struct jset_entry *entry; unsigned version = le32_to_cpu(jset->version); int ret = 0; @@ -920,6 +938,7 @@ static int journal_read_bucket(struct bch_dev *ca, u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), end = offset + ca->mi.bucket_size; bool saw_bad = false, csum_good; + struct printbuf err = PRINTBUF; int ret = 0; pr_debug("reading %u", bucket); @@ -952,7 +971,7 @@ reread: * found on a different device, and missing or * no journal entries will be handled later */ - return 0; + goto out; } j = buf->data; @@ -969,12 +988,12 @@ reread: ret = journal_read_buf_realloc(buf, vstruct_bytes(j)); if (ret) - return ret; + goto err; } goto reread; case JOURNAL_ENTRY_NONE: if (!saw_bad) - return 0; + goto out; /* * On checksum error we don't really trust the size * field of the journal entry we read, so try reading @@ -983,7 +1002,7 @@ reread: sectors = block_sectors(c); goto next_block; default: - return ret; + goto err; } /* @@ -993,20 +1012,28 @@ reread: * bucket: */ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - return 0; + goto out; ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - csum_good = jset_csum_good(c, j); + enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); + struct bch_csum csum; + csum_good = jset_csum_good(c, j, &csum); + if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, - "journal checksum error")) + "%s", + (printbuf_reset(&err), + prt_str(&err, "journal "), + bch2_csum_err_msg(&err, csum_type, j->csum, csum), + err.buf))) saw_bad = true; ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, vstruct_end(j) - (void *) j->encrypted_start); bch2_fs_fatal_err_on(ret, c, - "error decrypting journal entry: %i", ret); + "error decrypting journal entry: %s", + 
bch2_err_str(ret)); mutex_lock(&jlist->lock); ret = journal_entry_add(c, ca, (struct journal_ptr) { @@ -1025,7 +1052,7 @@ reread: case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: break; default: - return ret; + goto err; } next_block: pr_debug("next"); @@ -1034,7 +1061,11 @@ next_block: j = ((void *) j) + (sectors << 9); } - return 0; +out: + ret = 0; +err: + printbuf_exit(&err); + return ret; } static CLOSURE_CALLBACK(bch2_journal_read_device) @@ -1156,8 +1187,6 @@ int bch2_journal_read(struct bch_fs *c, struct journal_list jlist; struct journal_replay *i, **_i, *prev = NULL; struct genradix_iter radix_iter; - struct bch_dev *ca; - unsigned iter; struct printbuf buf = PRINTBUF; bool degraded = false, last_write_torn = false; u64 seq; @@ -1168,7 +1197,7 @@ int bch2_journal_read(struct bch_fs *c, jlist.last_seq = 0; jlist.ret = 0; - for_each_member_device(ca, c, iter) { + for_each_member_device(c, ca) { if (!c->opts.fsck && !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) continue; @@ -1334,7 +1363,7 @@ int bch2_journal_read(struct bch_fs *c, continue; for (ptr = 0; ptr < i->nr_ptrs; ptr++) { - ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); if (!i->ptrs[ptr].csum_good) bch_err_dev_offset(ca, i->ptrs[ptr].sector, @@ -1505,6 +1534,8 @@ done: static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + /* we aren't holding j->lock: */ unsigned new_size = READ_ONCE(j->buf_size_want); void *new_buf; @@ -1512,6 +1543,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (buf->buf_size >= new_size) return; + size_t btree_write_buffer_size = new_size / 64; + + if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) + return; + new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1604,6 +1640,9 @@ static CLOSURE_CALLBACK(journal_write_done) bch2_journal_reclaim_fast(j); bch2_journal_space_available(j); + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], + &j->max_in_flight_start, false); + closure_wake_up(&w->wait); journal_wake(j); @@ -1656,7 +1695,6 @@ static CLOSURE_CALLBACK(do_journal_write) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); - struct bch_extent_ptr *ptr; struct bio *bio; unsigned sectors = vstruct_sectors(w->data, c->block_bits); @@ -1700,11 +1738,13 @@ static CLOSURE_CALLBACK(do_journal_write) static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset_entry *start, *end, *i, *next, *prev = NULL; + struct jset_entry *start, *end; struct jset *jset = w->data; + struct journal_keys_to_wb wb = { NULL }; unsigned sectors, bytes, u64s; - bool validate_before_checksum = false; unsigned long btree_roots_have = 0; + bool validate_before_checksum = false; + u64 seq = le64_to_cpu(jset->seq); int ret; /* @@ -1715,7 +1755,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) * If we wanted to be really fancy here, we could sort all the keys in * the jset and drop keys that were overwritten - probably not worth it: */ - vstruct_for_each_safe(jset, i, next) { + vstruct_for_each(jset, i) { unsigned u64s = le16_to_cpu(i->u64s); /* Empty entry: */ @@ -1732,40 +1772,40 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) * to c->btree_roots we 
have to get any missing btree roots and * add them to this journal entry: */ - if (i->type == BCH_JSET_ENTRY_btree_root) { + switch (i->type) { + case BCH_JSET_ENTRY_btree_root: bch2_journal_entry_to_btree_root(c, i); __set_bit(i->btree_id, &btree_roots_have); + break; + case BCH_JSET_ENTRY_write_buffer_keys: + EBUG_ON(!w->need_flush_to_write_buffer); + + if (!wb.wb) + bch2_journal_keys_to_write_buffer_start(c, &wb, seq); + + struct bkey_i *k; + jset_entry_for_each_key(i, k) { + ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); + if (ret) { + bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer"); + bch2_journal_keys_to_write_buffer_end(c, &wb); + return ret; + } + } + i->type = BCH_JSET_ENTRY_btree_keys; + break; } - - /* Can we merge with previous entry? */ - if (prev && - i->btree_id == prev->btree_id && - i->level == prev->level && - i->type == prev->type && - i->type == BCH_JSET_ENTRY_btree_keys && - le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { - memmove_u64s_down(vstruct_next(prev), - i->_data, - u64s); - le16_add_cpu(&prev->u64s, u64s); - continue; - } - - /* Couldn't merge, move i into new position (after prev): */ - prev = prev ? vstruct_next(prev) : jset->start; - if (i != prev) - memmove_u64s_down(prev, i, jset_u64s(u64s)); } - prev = prev ? vstruct_next(prev) : jset->start; - jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); + if (wb.wb) + bch2_journal_keys_to_write_buffer_end(c, &wb); + w->need_flush_to_write_buffer = false; start = end = vstruct_last(jset); end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); - bch2_journal_super_entries_add_common(c, &end, - le64_to_cpu(jset->seq)); + bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1788,7 +1828,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) - j->last_empty_seq = le64_to_cpu(jset->seq); + j->last_empty_seq = seq; if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; @@ -1847,7 +1887,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * (!w->must_flush && (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { - w->noflush = true; + w->noflush = true; SET_JSET_NO_FLUSH(w->data, true); w->data->last_seq = 0; w->last_seq = 0; @@ -1866,12 +1906,11 @@ CLOSURE_CALLBACK(bch2_journal_write) { closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; struct bio *bio; struct printbuf journal_debug_buf = PRINTBUF; - unsigned i, nr_rw_members = 0; + unsigned nr_rw_members = 0; int ret; BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); @@ -1884,12 +1923,16 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + mutex_lock(&j->buf_lock); journal_buf_realloc(j, w); ret = bch2_journal_write_prep(j, w); + mutex_unlock(&j->buf_lock); if (ret) goto err; + j->entry_bytes_written += vstruct_bytes(w->data); + while (1) { spin_lock(&j->lock); ret = journal_write_alloc(j, w); @@ -1927,7 +1970,7 @@ CLOSURE_CALLBACK(bch2_journal_write) if (c->opts.nochanges) goto no_io; - for_each_rw_member(ca, c, i) + for_each_rw_member(c, ca) nr_rw_members++; if (nr_rw_members > 1) @@ -1944,7 +1987,7 @@ 
CLOSURE_CALLBACK(bch2_journal_write) goto err; if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { - for_each_rw_member(ca, c, i) { + for_each_rw_member(c, ca) { percpu_ref_get(&ca->io_ref); bio = ca->journal.bio; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ec712104addb..820d25e19e5f 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "errcode.h" #include "error.h" @@ -50,17 +51,24 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -static inline void journal_set_watermark(struct journal *j, bool low_on_space) +void bch2_journal_set_watermark(struct journal *j) { - unsigned watermark = BCH_WATERMARK_stripe; - - if (low_on_space) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - if (fifo_free(&j->pin) < j->pin.size / 4) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (watermark == j->watermark) - return; + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool low_on_space = j->space[journal_space_clean].total * 4 <= + j->space[journal_space_total].total; + bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4; + bool low_on_wb = bch2_btree_write_buffer_must_wait(c); + unsigned watermark = low_on_space || low_on_pin || low_on_wb + ? BCH_WATERMARK_reclaim + : BCH_WATERMARK_stripe; + + if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], + &j->low_on_space_start, low_on_space) || + track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], + &j->low_on_pin_start, low_on_pin) || + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], + &j->write_buffer_full_start, low_on_wb)) + trace_and_count(c, journal_full, c); swap(watermark, j->watermark); if (watermark > j->watermark) @@ -128,15 +136,13 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne enum journal_space_from from) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - unsigned i, pos, nr_devs = 0; + unsigned pos, nr_devs = 0; struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_journal]) { + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { if (!ca->journal.nr) continue; @@ -165,19 +171,17 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne void bch2_journal_space_available(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; unsigned clean, clean_ondisk, total; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); - unsigned i, nr_online = 0, nr_devs_want; + unsigned nr_online = 0, nr_devs_want; bool can_discard = false; int ret = 0; lockdep_assert_held(&j->lock); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_journal]) { + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; if (!ja->nr) @@ -208,7 +212,7 @@ void bch2_journal_space_available(struct journal *j) nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); - for (i = 0; i < journal_space_nr; i++) + for (unsigned i = 0; i < journal_space_nr; i++) j->space[i] = __journal_space_available(j, nr_devs_want, i); clean_ondisk 
= j->space[journal_space_clean_ondisk].total; @@ -226,7 +230,7 @@ void bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - journal_set_watermark(j, clean * 4 <= total); + bch2_journal_set_watermark(j); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; @@ -255,12 +259,10 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja) void bch2_journal_do_discards(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - unsigned iter; mutex_lock(&j->discard_lock); - for_each_rw_member(ca, c, iter) { + for_each_rw_member(c, ca) { struct journal_device *ja = &ca->journal; while (should_discard_bucket(j, ja)) { @@ -299,6 +301,7 @@ void bch2_journal_reclaim_fast(struct journal *j) * all btree nodes got written out */ while (!fifo_empty(&j->pin) && + j->pin.front <= j->seq_ondisk && !atomic_read(&fifo_peek_front(&j->pin).count)) { j->pin.front++; popped = true; @@ -367,15 +370,36 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) return JOURNAL_PIN_other; } -void bch2_journal_pin_set(struct journal *j, u64 seq, +static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) + journal_pin_flush_fn flush_fn, + enum journal_pin_type type) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + /* + * flush_fn is how we identify journal pins in debugfs, so must always + * exist, even if it doesn't do anything: + */ + BUG_ON(!flush_fn); + + atomic_inc(&pin_list->count); + pin->seq = seq; + pin->flush = flush_fn; + list_add(&pin->list, &pin_list->list[type]); +} + +void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *dst, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) { - struct journal_entry_pin_list *pin_list; bool reclaim; spin_lock(&j->lock); + u64 seq = READ_ONCE(src->seq); + if (seq < journal_last_seq(j)) { /* * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on @@ -387,18 +411,34 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, return; } - pin_list = journal_seq_pin(j, seq); + reclaim = __journal_pin_drop(j, dst); - reclaim = __journal_pin_drop(j, pin); + bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); - atomic_inc(&pin_list->count); - pin->seq = seq; - pin->flush = flush_fn; + if (reclaim) + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); - if (flush_fn) - list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]); - else - list_add(&pin->list, &pin_list->flushed); + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + journal_wake(j); +} + +void bch2_journal_pin_set(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + bool reclaim; + + spin_lock(&j->lock); + + BUG_ON(seq < journal_last_seq(j)); + + reclaim = __journal_pin_drop(j, pin); + + bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -537,13 +577,11 @@ static size_t journal_flush_pins(struct journal *j, static u64 journal_seq_to_flush(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; u64 seq_to_flush = 0; - unsigned iter; spin_lock(&j->lock); - for_each_rw_member(ca, c, iter) { + for_each_rw_member(c, ca) { struct journal_device 
*ja = &ca->journal; unsigned nr_buckets, bucket_to_flush; @@ -747,10 +785,9 @@ int bch2_journal_reclaim_start(struct journal *j) p = kthread_create(bch2_journal_reclaim_thread, j, "bch-reclaim/%s", c->name); ret = PTR_ERR_OR_ZERO(p); - if (ret) { - bch_err_msg(c, ret, "creating journal reclaim thread"); + bch_err_msg(c, ret, "creating journal reclaim thread"); + if (ret) return ret; - } get_task_struct(p); j->reclaim_thread = p; @@ -796,6 +833,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) { + /* time_stats this */ bool did_work = false; if (!test_bit(JOURNAL_STARTED, &j->flags)) diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 494d1a6eddb0..ec84c3345281 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -16,6 +16,7 @@ static inline void journal_reclaim_kick(struct journal *j) unsigned bch2_journal_dev_buckets_available(struct journal *, struct journal_device *, enum journal_space_from); +void bch2_journal_set_watermark(struct journal *); void bch2_journal_space_available(struct journal *); static inline bool journal_pin_active(struct journal_entry_pin *pin) @@ -47,17 +48,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq, bch2_journal_pin_set(j, seq, pin, flush_fn); } -static inline void bch2_journal_pin_copy(struct journal *j, - struct journal_entry_pin *dst, - struct journal_entry_pin *src, - journal_pin_flush_fn flush_fn) -{ - /* Guard against racing with journal_pin_drop(src): */ - u64 seq = READ_ONCE(src->seq); - - if (seq) - bch2_journal_pin_add(j, seq, dst, flush_fn); -} +void bch2_journal_pin_copy(struct journal *, + struct journal_entry_pin *, + struct journal_entry_pin *, + journal_pin_flush_fn); static inline void bch2_journal_pin_update(struct journal *j, u64 seq, struct journal_entry_pin *pin, diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index f9d9aa95bf3a..0200e299cfbb 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -267,7 +267,7 @@ retry: while (!(ret = PTR_ERR_OR_ZERO(b)) && b && - !test_bit(BCH_FS_STOPPING, &c->flags)) + !test_bit(BCH_FS_stopping, &c->flags)) b = bch2_btree_iter_next_node(&iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index a756b69582e3..38817c7a0851 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -36,6 +36,7 @@ struct journal_buf { bool noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ bool separate_flush; + bool need_flush_to_write_buffer; }; /* @@ -182,6 +183,12 @@ struct journal { darray_u64 early_journal_entries; /* + * Protects journal_buf->data, when accessing without a journal + * reservation: for synchronization between the btree write buffer code + * and the journal write path: + */ + struct mutex buf_lock; + /* + * Two journal entries -- one is currently open for new entries, the + * other is possibly being written out. 
*/ @@ -195,7 +202,6 @@ struct journal { /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; - struct closure_waitlist preres_wait; struct closure io; struct delayed_work write_work; @@ -262,15 +268,19 @@ struct journal { unsigned long last_flush_write; - u64 res_get_blocked_start; u64 write_start_time; u64 nr_flush_writes; u64 nr_noflush_writes; + u64 entry_bytes_written; + + u64 low_on_space_start; + u64 low_on_pin_start; + u64 max_in_flight_start; + u64 write_buffer_full_start; struct bch2_time_stats *flush_write_time; struct bch2_time_stats *noflush_write_time; - struct bch2_time_stats *blocked_time; struct bch2_time_stats *flush_seq_time; #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c index 5699cd4873c8..1b828bddd11b 100644 --- a/fs/bcachefs/keylist.c +++ b/fs/bcachefs/keylist.c @@ -43,8 +43,6 @@ void bch2_keylist_pop_front(struct keylist *l) #ifdef CONFIG_BCACHEFS_DEBUG void bch2_verify_keylist_sorted(struct keylist *l) { - struct bkey_i *k; - for_each_keylist_key(l, k) BUG_ON(bkey_next(k) != l->top && bpos_ge(k->k.p, bkey_next(k)->k.p)); diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h index fe759c7031e0..e687e0e9aede 100644 --- a/fs/bcachefs/keylist.h +++ b/fs/bcachefs/keylist.h @@ -50,18 +50,16 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l) } #define for_each_keylist_key(_keylist, _k) \ - for (_k = (_keylist)->keys; \ + for (struct bkey_i *_k = (_keylist)->keys; \ _k != (_keylist)->top; \ _k = bkey_next(_k)) static inline u64 keylist_sectors(struct keylist *keys) { - struct bkey_i *k; u64 ret = 0; for_each_keylist_key(keys, k) ret += k->k.size; - return ret; } diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 8640f7dee0de..ad598105c587 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -54,16 +54,12 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, int bch2_resume_logged_ops(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, - for_each_btree_key2(trans, iter, - BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k, + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, + BTREE_ID_logged_ops, POS_MIN, + BTREE_ITER_PREFETCH, k, resume_logged_op(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -85,13 +81,13 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) { - return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_logged_op_start(trans, k)); } void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) { - int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0)); /* * This needs to be a fatal error because we've left an unfinished diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index a5cc0ed195d6..7a4ca5a28b3e 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -147,18 +147,13 @@ fsck_err: int bch2_check_lrus(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; struct bpos last_flushed_pos = POS_MIN; - int ret = 0; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_lru, POS_MIN, 
BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, bch2_check_lru_key(trans, &iter, k, &last_flushed_pos))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c index 1f0801e2e565..bf0ef668fd38 100644 --- a/fs/bcachefs/mean_and_variance.c +++ b/fs/bcachefs/mean_and_variance.c @@ -62,6 +62,7 @@ EXPORT_SYMBOL_GPL(u128_div); /** * mean_and_variance_get_mean() - get mean from @s + * @s: mean and variance number of samples and their sums */ s64 mean_and_variance_get_mean(struct mean_and_variance s) { @@ -71,6 +72,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); /** * mean_and_variance_get_variance() - get variance from @s1 + * @s1: mean and variance number of samples and sums * * see linked pdf equation 12. */ @@ -89,6 +91,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); /** * mean_and_variance_get_stddev() - get standard deviation from @s + * @s: mean and variance number of samples and their sums */ u32 mean_and_variance_get_stddev(struct mean_and_variance s) { @@ -98,8 +101,8 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); /** * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() - * @s1: .. - * @s2: .. + * @s: mean and variance number of samples and their sums + * @x: new value to include in the &mean_and_variance_weighted * * see linked pdf: function derived from equations 140-143 where alpha = 2^w. * values are stored bitshifted for performance and added precision. @@ -129,6 +132,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); /** * mean_and_variance_weighted_get_mean() - get mean from @s + * @s: mean and variance number of samples and their sums */ s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) { @@ -138,6 +142,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); /** * mean_and_variance_weighted_get_variance() -- get variance from @s + * @s: mean and variance number of samples and their sums */ u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) { @@ -148,6 +153,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); /** * mean_and_variance_weighted_get_stddev() - get standard deviation from @s + * @s: mean and variance number of samples and their sums */ u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) { diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h index 647505010b39..b2be565bb8f2 100644 --- a/fs/bcachefs/mean_and_variance.h +++ b/fs/bcachefs/mean_and_variance.h @@ -12,9 +12,12 @@ /* * u128_u: u128 user mode, because not all architectures support a real int128 * type + * + * We don't use this version in userspace, because in userspace we link with + * Rust and rustc has issues with u128. 
*/ -#ifdef __SIZEOF_INT128__ +#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) typedef struct { unsigned __int128 v; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index e3a51f6d6c9b..5623cee3ef86 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -79,8 +79,6 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; enum btree_id id; int ret = 0; @@ -90,7 +88,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); if (ret) break; @@ -145,10 +143,9 @@ retry: continue; } - if (ret) { - bch_err_msg(c, ret, "updating btree node key"); + bch_err_msg(c, ret, "updating btree node key"); + if (ret) break; - } next: bch2_btree_iter_next_node(&iter); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 54830ee0ed88..7a33319dcd16 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -27,6 +27,13 @@ #include <linux/ioprio.h> #include <linux/kthread.h> +const char * const bch2_data_ops_strs[] = { +#define x(t, n, ...) [n] = #t, + BCH_DATA_OPS() +#undef x + NULL +}; + static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k) { if (trace_move_extent_enabled()) { @@ -63,7 +70,7 @@ struct moving_io { struct data_update write; /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[0]; + struct bio_vec bi_inline_vecs[]; }; static void move_free(struct moving_io *io) @@ -152,7 +159,7 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) atomic_read(&ctxt->write_sectors) != sectors_pending); } -static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) +void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) { move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); bch2_trans_unlock_long(ctxt->trans); @@ -211,7 +218,7 @@ void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) trace_move_data(c, stats); } -void bch2_move_stats_init(struct bch_move_stats *stats, char *name) +void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) { memset(stats, 0, sizeof(*stats)); stats->data_type = BCH_DATA_user; @@ -342,7 +349,8 @@ err: bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; - this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]); + count_event(c, move_extent_start_fail); + if (trace_move_extent_start_fail_enabled()) { struct printbuf buf = PRINTBUF; @@ -364,13 +372,10 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, int ret = 0; if (io_opts->cur_inum != extent_k.k->p.inode) { - struct btree_iter iter; - struct bkey_s_c k; - io_opts->d.nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ({ if (k.k->p.offset != extent_k.k->p.inode) break; @@ -383,11 +388,8 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; bch2_inode_opts_get(&e.io_opts, trans->c, &inode); - ret = darray_push(&io_opts->d, e); - if (ret) - break; - 
} - bch2_trans_iter_exit(trans, &iter); + darray_push(&io_opts->d, e); + })); io_opts->cur_inum = extent_k.k->p.inode; } @@ -395,12 +397,10 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - if (extent_k.k->p.snapshot) { - struct snapshot_io_opts_entry *i; + if (extent_k.k->p.snapshot) darray_for_each(io_opts->d, i) if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) return &i->io_opts; - } return &io_opts->fs_io_opts; } @@ -628,7 +628,7 @@ int bch2_move_data(struct bch_fs *c, return ret; } -int __bch2_evacuate_bucket(struct moving_context *ctxt, +int bch2_evacuate_bucket(struct moving_context *ctxt, struct move_bucket_in_flight *bucket_in_flight, struct bpos bucket, int gen, struct data_update_opts _data_opts) @@ -664,21 +664,19 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); bch2_trans_iter_exit(trans, &iter); - if (ret) { - bch_err_msg(c, ret, "looking up alloc key"); + bch_err_msg(c, ret, "looking up alloc key"); + if (ret) goto err; - } a = bch2_alloc_to_v4(k, &a_convert); - dirty_sectors = a->dirty_sectors; + dirty_sectors = bch2_bucket_sectors_dirty(*a); bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; fragmentation = a->fragmentation_lru; - ret = bch2_btree_write_buffer_flush(trans); - if (ret) { - bch_err_msg(c, ret, "flushing btree write buffer"); + ret = bch2_btree_write_buffer_tryflush(trans); + bch_err_msg(c, ret, "flushing btree write buffer"); + if (ret) goto err; - } while (!(ret = bch2_move_ratelimit(ctxt))) { if (is_kthread && kthread_should_stop()) @@ -697,9 +695,6 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, break; if (!bp.level) { - const struct bch_extent_ptr *ptr; - unsigned i = 0; - k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -722,6 +717,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, data_opts.target = io_opts.background_target; data_opts.rewrite_ptrs = 0; + unsigned i = 0; bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { if (ptr->dev == bucket.inode) { data_opts.rewrite_ptrs |= 1U << i; @@ -789,31 +785,13 @@ err: return ret; } -int bch2_evacuate_bucket(struct bch_fs *c, - struct bpos bucket, int gen, - struct data_update_opts data_opts, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc) -{ - struct moving_context ctxt; - int ret; - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts); - bch2_moving_ctxt_exit(&ctxt); - - return ret; -} - typedef bool (*move_btree_pred)(struct bch_fs *, void *, struct btree *, struct bch_io_opts *, struct data_update_opts *); static int bch2_move_btree(struct bch_fs *c, - enum btree_id start_btree_id, struct bpos start_pos, - enum btree_id end_btree_id, struct bpos end_pos, + struct bbpos start, + struct bbpos end, move_btree_pred pred, void *arg, struct bch_move_stats *stats) { @@ -823,7 +801,7 @@ static int bch2_move_btree(struct bch_fs *c, struct btree_trans *trans; struct btree_iter iter; struct btree *b; - enum btree_id id; + enum btree_id btree; struct data_update_opts data_opts; int ret = 0; @@ -834,15 +812,15 @@ static int bch2_move_btree(struct bch_fs *c, stats->data_type = BCH_DATA_btree; - for (id = start_btree_id; - id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); - id++) { - stats->pos = 
BBPOS(id, POS_MIN); + for (btree = start.btree; + btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); + btree ++) { + stats->pos = BBPOS(btree, POS_MIN); - if (!bch2_btree_id_root(c, id)->b) + if (!bch2_btree_id_root(c, btree)->b) continue; - bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: ret = 0; @@ -852,8 +830,8 @@ retry: if (kthread && kthread_should_stop()) break; - if ((cmp_int(id, end_btree_id) ?: - bpos_cmp(b->key.k.p, end_pos)) > 0) + if ((cmp_int(btree, end.btree) ?: + bpos_cmp(b->key.k.p, end.pos)) > 0) break; stats->pos = BBPOS(iter.btree_id, iter.pos); @@ -910,7 +888,6 @@ static bool migrate_pred(struct bch_fs *c, void *arg, struct data_update_opts *data_opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; struct bch_ioctl_data *op = arg; unsigned i = 0; @@ -990,8 +967,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) int ret; ret = bch2_move_btree(c, - 0, POS_MIN, - BTREE_ID_NR, SPOS_MAX, + BBPOS_MIN, + BBPOS_MAX, rewrite_old_nodes_pred, c, stats); if (!ret) { mutex_lock(&c->sb_lock); @@ -1006,71 +983,101 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) return ret; } +static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + unsigned durability = bch2_bkey_durability(c, k); + unsigned replicas = bkey_is_btree_ptr(k.k) + ? c->opts.metadata_replicas + : io_opts->data_replicas; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned i = 0; + + bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { + unsigned d = bch2_extent_ptr_durability(c, &p); + + if (d && durability - d >= replicas) { + data_opts->kill_ptrs |= BIT(i); + durability -= d; + } + + i++; + } + + return data_opts->kill_ptrs != 0; +} + +static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, struct bch_ioctl_data op) { + struct bbpos start = BBPOS(op.start_btree, op.start_pos); + struct bbpos end = BBPOS(op.end_btree, op.end_pos); int ret = 0; + if (op.op >= BCH_DATA_OP_NR) + return -EINVAL; + + bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); + switch (op.op) { - case BCH_DATA_OP_REREPLICATE: - bch2_move_stats_init(stats, "rereplicate"); + case BCH_DATA_OP_rereplicate: stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); - - ret = bch2_move_btree(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, + ret = bch2_move_btree(c, start, end, rereplicate_btree_pred, c, stats) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - - ret = bch2_move_data(c, - (struct bbpos) { op.start_btree, op.start_pos }, - (struct bbpos) { op.end_btree, op.end_pos }, + ret = bch2_move_data(c, start, end, NULL, stats, writepoint_hashed((unsigned long) current), true, rereplicate_pred, c) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; - - bch2_move_stats_exit(stats, c); break; - case BCH_DATA_OP_MIGRATE: + case BCH_DATA_OP_migrate: if (op.migrate.dev >= c->sb.nr_devices) return -EINVAL; - bch2_move_stats_init(stats, "migrate"); stats->data_type = BCH_DATA_journal; ret = 
bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - - ret = bch2_move_btree(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, + ret = bch2_move_btree(c, start, end, migrate_btree_pred, &op, stats) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - - ret = bch2_move_data(c, - (struct bbpos) { op.start_btree, op.start_pos }, - (struct bbpos) { op.end_btree, op.end_pos }, + ret = bch2_move_data(c, start, end, NULL, stats, writepoint_hashed((unsigned long) current), true, migrate_pred, &op) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; - - bch2_move_stats_exit(stats, c); break; - case BCH_DATA_OP_REWRITE_OLD_NODES: - bch2_move_stats_init(stats, "rewrite_old_nodes"); + case BCH_DATA_OP_rewrite_old_nodes: ret = bch2_scan_old_btree_nodes(c, stats); - bch2_move_stats_exit(stats, c); + break; + case BCH_DATA_OP_drop_extra_replicas: + ret = bch2_move_btree(c, start, end, + drop_extra_replicas_btree_pred, c, stats) ?: ret; + ret = bch2_move_data(c, start, end, NULL, stats, + writepoint_hashed((unsigned long) current), + true, + drop_extra_replicas_pred, c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; break; default: ret = -EINVAL; } + bch2_move_stats_exit(stats, c); return ret; } diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 0906aa2d1de2..9baf3093a678 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -75,12 +75,15 @@ do { \ typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, struct bch_io_opts *, struct data_update_opts *); +extern const char * const bch2_data_ops_strs[]; + void bch2_moving_ctxt_exit(struct moving_context *); void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, struct bch_ratelimit *, struct bch_move_stats *, struct write_point_specifier, bool); struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); void bch2_moving_ctxt_do_pending_writes(struct moving_context *); +void bch2_moving_ctxt_flush_all(struct moving_context *); void bch2_move_ctxt_wait_for_io(struct moving_context *); int bch2_move_ratelimit(struct moving_context *); @@ -133,23 +136,17 @@ int bch2_move_data(struct bch_fs *, bool, move_pred_fn, void *); -int __bch2_evacuate_bucket(struct moving_context *, +int bch2_evacuate_bucket(struct moving_context *, struct move_bucket_in_flight *, struct bpos, int, struct data_update_opts); -int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, - struct data_update_opts, - struct bch_ratelimit *, - struct bch_move_stats *, - struct write_point_specifier, - bool); int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *); void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *); -void bch2_move_stats_init(struct bch_move_stats *, char *); +void bch2_move_stats_init(struct bch_move_stats *, const char *); void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index a84e79f79e5e..69e06a84dad4 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -91,7 +91,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, a = bch2_alloc_to_v4(k, &_a); b->k.gen = a->gen; - b->sectors = a->dirty_sectors; + b->sectors = bch2_bucket_sectors_dirty(*a); ret = data_type_movable(a->data_type) && a->fragmentation_lru && @@ -145,20 +145,21 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; - struct btree_iter 
iter; - struct bkey_s_c k; size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; int ret; move_buckets_wait(ctxt, buckets_in_flight, false); - ret = bch2_btree_write_buffer_flush(trans); - if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", + ret = bch2_btree_write_buffer_tryflush(trans); + if (bch2_err_matches(ret, EROFS)) + return ret; + + if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()", __func__, bch2_err_str(ret))) return ret; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, + ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ @@ -167,15 +168,23 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, saw++; - if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p))) + ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)); + if (ret2 < 0) + goto err; + + if (!ret2) not_movable++; else if (bucket_in_flight(buckets_in_flight, b.k)) in_flight++; else { - ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; - if (ret2 >= 0) - sectors += b.sectors; + ret2 = darray_push(buckets, b); + if (ret2) + goto err; + sectors += b.sectors; } + + ret2 = buckets->nr >= nr_to_get; +err: ret2; })); @@ -198,7 +207,6 @@ static int bch2_copygc(struct moving_context *ctxt, }; move_buckets buckets = { 0 }; struct move_bucket_in_flight *f; - struct move_bucket *i; u64 moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; @@ -221,7 +229,7 @@ static int bch2_copygc(struct moving_context *ctxt, break; } - ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, + ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, f->bucket.k.gen, data_opts); if (ret) goto err; @@ -259,19 +267,16 @@ err: */ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) { - struct bch_dev *ca; - unsigned dev_idx; s64 wait = S64_MAX, fragmented_allowed, fragmented; - unsigned i; - for_each_rw_member(ca, c, dev_idx) { + for_each_rw_member(c, ca) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * ca->mi.bucket_size) >> 1); fragmented = 0; - for (i = 0; i < BCH_DATA_NR; i++) + for (unsigned i = 0; i < BCH_DATA_NR; i++) if (data_type_movable(i)) fragmented += usage.d[i].fragmented; @@ -313,9 +318,9 @@ static int bch2_copygc_thread(void *arg) if (!buckets) return -ENOMEM; ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); + bch_err_msg(c, ret, "allocating copygc buckets in flight"); if (ret) { kfree(buckets); - bch_err_msg(c, ret, "allocating copygc buckets in flight"); return ret; } @@ -334,7 +339,8 @@ static int bch2_copygc_thread(void *arg) if (!c->copy_gc_enabled) { move_buckets_wait(&ctxt, buckets, true); - kthread_wait_freezable(c->copy_gc_enabled); + kthread_wait_freezable(c->copy_gc_enabled || + kthread_should_stop()); } if (unlikely(freezing(current))) { @@ -411,10 +417,9 @@ int bch2_copygc_start(struct bch_fs *c) t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); ret = PTR_ERR_OR_ZERO(t); - if (ret) { - bch_err_msg(c, ret, "creating copygc thread"); + bch_err_msg(c, ret, "creating copygc thread"); + if (ret) return ret; - } get_task_struct(t); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 8dd4046cca41..8e6f230eac38 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -279,14 
+279,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) if (err) prt_printf(err, "%s: not a multiple of 512", opt->attr.name); - return -EINVAL; + return -BCH_ERR_opt_parse_error; } if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { if (err) prt_printf(err, "%s: must be a power of two", opt->attr.name); - return -EINVAL; + return -BCH_ERR_opt_parse_error; } if (opt->fn.validate) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 8526f177450a..93a24fef4214 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -233,11 +233,6 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ - x(btree_write_buffer_size, u32, \ - OPT_FS|OPT_MOUNT, \ - OPT_UINT(16, (1U << 20) - 1), \ - BCH2_NO_SB_OPT, 1U << 13, \ - NULL, "Number of btree write buffer entries") \ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ @@ -394,7 +389,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ - OPT_FS, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, NULL) \ @@ -419,6 +414,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Allocate the buckets_nouse bitmap") \ + x(stdio, u64, \ + 0, \ + OPT_UINT(0, S64_MAX), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Pointer to a struct stdio_redirect") \ x(project, u8, \ OPT_INODE, \ OPT_BOOL(), \ @@ -458,7 +458,13 @@ enum fsck_err_opts { OPT_UINT(0, BCH_REPLICAS_MAX), \ BCH2_NO_SB_OPT, 1, \ "n", "Data written to this device will be considered\n"\ - "to have already been replicated n times") + "to have already been replicated n times") \ + x(btree_node_prefetch, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "BTREE_ITER_PREFETCH causes btree nodes to be\n"\ + " prefetched sequentially") struct bch_opts { #define x(_name, _bits, ...)
unsigned _name##_defined:1; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index a54647c36b85..e68b34eab90a 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -599,14 +599,9 @@ advance: int bch2_fs_quota_read(struct bch_fs *c) { - struct bch_sb_field_quota *sb_quota; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - int ret; mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { mutex_unlock(&c->sb_lock); return -BCH_ERR_ENOSPC_sb_quota; @@ -615,19 +610,14 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_sb_quota_read(c); mutex_unlock(&c->sb_lock); - trans = bch2_trans_get(c); - - ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas, - POS_MIN, BTREE_ITER_PREFETCH, k, - __bch2_quota_set(c, k, NULL)) ?: - for_each_btree_key2(trans, iter, BTREE_ID_inodes, - POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - bch2_fs_quota_read_inode(trans, &iter, k)); - - bch2_trans_put(trans); - - if (ret) - bch_err_fn(c, ret); + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, + BTREE_ITER_PREFETCH, k, + __bch2_quota_set(c, k, NULL)) ?: + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + bch2_fs_quota_read_inode(trans, &iter, k))); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 3319190b8d9c..95f46cb3b5bd 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -69,7 +69,7 @@ err: int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) { - int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, __bch2_set_rebalance_needs_scan(trans, inum)); rebalance_wakeup(c); return ret; @@ -125,7 +125,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, extent_entry_drop(bkey_i_to_s(n), (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); - return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, @@ -171,6 +171,21 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, return bkey_s_c_null; } + if (trace_rebalance_extent_enabled()) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "target="); + bch2_target_to_text(&buf, c, r->target); + prt_str(&buf, " compression="); + struct bch_compression_opt opt = __bch2_compression_decode(r->compression); + prt_str(&buf, bch2_compression_opts[opt.type]); + prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, k); + + trace_rebalance_extent(c, buf.buf); + printbuf_exit(&buf); + } + return k; } @@ -273,7 +288,7 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) r->state = BCH_REBALANCE_scanning; ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: - commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_clear_rebalance_needs_scan(trans, inum, cookie)); bch2_move_stats_exit(&r->scan_stats, trans->c); @@ -317,8 +332,16 @@ static int do_rebalance(struct moving_context *ctxt) BTREE_ID_rebalance_work, POS_MIN, BTREE_ITER_ALL_SNAPSHOTS); - while (!bch2_move_ratelimit(ctxt) && - 
!kthread_wait_freezable(r->enabled)) { + while (!bch2_move_ratelimit(ctxt)) { + if (!r->enabled) { + bch2_moving_ctxt_flush_all(ctxt); + kthread_wait_freezable(r->enabled || + kthread_should_stop()); + } + + if (kthread_should_stop()) + break; + bch2_trans_begin(trans); ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter)); @@ -447,10 +470,9 @@ int bch2_rebalance_start(struct bch_fs *c) p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); ret = PTR_ERR_OR_ZERO(p); - if (ret) { - bch_err_msg(c, ret, "creating rebalance thread"); + bch_err_msg(c, ret, "creating rebalance thread"); + if (ret) return ret; - } get_task_struct(p); rcu_assign_pointer(c->rebalance.thread, p); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 5cf7d0532002..725214605a05 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -99,6 +99,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans, unsigned update_flags = BTREE_TRIGGER_NORUN; int ret; + if (k->overwritten) + return 0; + + trans->journal_res.seq = k->journal_seq; + /* * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to * keep the key cache coherent with the underlying btree. Nothing @@ -140,27 +145,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) static int bch2_journal_replay(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; - struct journal_key **keys_sorted, *k; + DARRAY(struct journal_key *) keys_sorted = { 0 }; struct journal *j = &c->journal; u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; - size_t i; + struct btree_trans *trans = bch2_trans_get(c); int ret = 0; - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; - - keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL); - if (!keys_sorted) - return -BCH_ERR_ENOMEM_journal_replay; - - for (i = 0; i < keys->nr; i++) - keys_sorted[i] = &keys->d[i]; - - sort(keys_sorted, keys->nr, - sizeof(keys_sorted[0]), - journal_sort_seq_cmp, NULL); - if (keys->nr) { ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", keys->nr, start_seq, end_seq); @@ -170,27 +161,67 @@ static int bch2_journal_replay(struct bch_fs *c) BUG_ON(!atomic_read(&keys->ref)); - for (i = 0; i < keys->nr; i++) { - k = keys_sorted[i]; + /* + * First, attempt to replay keys in sorted order. This is more + * efficient - better locality of btree access - but some might fail if + * that would cause a journal deadlock. + */ + for (size_t i = 0; i < keys->nr; i++) { + cond_resched(); + + struct journal_key *k = keys->d + i; + + /* Skip fastpath if we're low on space in the journal */ + ret = c->journal.watermark ? -1 : + commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_journal_reclaim| + (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), + bch2_journal_replay_key(trans, k)); + BUG_ON(!ret && !k->overwritten); + if (ret) { + ret = darray_push(&keys_sorted, k); + if (ret) + goto err; + } + } + /* + * Now, replay any remaining keys in the order in which they appear in + * the journal, unpinning those journal entries as we go: + */ + sort(keys_sorted.data, keys_sorted.nr, + sizeof(keys_sorted.data[0]), + journal_sort_seq_cmp, NULL); + + darray_for_each(keys_sorted, kp) { cond_resched(); + struct journal_key *k = *kp; + replay_now_at(j, k->journal_seq); - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL| - (!k->allocated - ? 
BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim - : 0), + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + (!k->allocated + ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim + : 0), bch2_journal_replay_key(trans, k)); - if (ret) { - bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", - bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret)); + bch_err_msg(c, ret, "while replaying key at btree %s level %u:", + bch2_btree_id_str(k->btree_id), k->level); + if (ret) goto err; - } + + BUG_ON(!k->overwritten); } + /* + * We need to put our btree_trans before calling flush_all_pins(), since + * that will use a btree_trans internally + */ + bch2_trans_put(trans); + trans = NULL; + if (!c->opts.keep_journal) bch2_journal_keys_put_initial(c); @@ -198,16 +229,14 @@ static int bch2_journal_replay(struct bch_fs *c) j->replay_journal_seq = 0; bch2_journal_set_replay_done(j); - bch2_journal_flush_all_pins(j); - ret = bch2_journal_error(j); - if (keys->nr && !ret) + if (keys->nr) bch2_journal_log_msg(c, "journal replay finished"); err: - kvfree(keys_sorted); - - if (ret) - bch_err_fn(c, ret); + if (trans) + bch2_trans_put(trans); + darray_exit(&keys_sorted); + bch_err_fn(c, ret); return ret; } @@ -275,8 +304,6 @@ static int journal_replay_entry_early(struct bch_fs *c, struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); - for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); @@ -317,14 +344,11 @@ static int journal_replay_entry_early(struct bch_fs *c, static int journal_replay_early(struct bch_fs *c, struct bch_sb_field_clean *clean) { - struct jset_entry *entry; - int ret; - if (clean) { - for (entry = clean->start; + for (struct jset_entry *entry = clean->start; entry != vstruct_end(&clean->field); entry = vstruct_next(entry)) { - ret = journal_replay_entry_early(c, entry); + int ret = journal_replay_entry_early(c, entry); if (ret) return ret; } @@ -339,7 +363,7 @@ static int journal_replay_early(struct bch_fs *c, continue; vstruct_for_each(&i->j, entry) { - ret = journal_replay_entry_early(c, entry); + int ret = journal_replay_entry_early(c, entry); if (ret) return ret; } @@ -435,8 +459,7 @@ static int bch2_initialize_subvolumes(struct bch_fs *c) ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -474,10 +497,9 @@ err: noinline_for_stack static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) { - int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, __bch2_fs_upgrade_for_subvolumes(trans)); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -495,7 +517,20 @@ static int bch2_check_allocations(struct bch_fs *c) static int bch2_set_may_go_rw(struct bch_fs *c) { - set_bit(BCH_FS_MAY_GO_RW, &c->flags); + struct journal_keys *keys = &c->journal_keys; + + /* + * After we go RW, the journal keys buffer can't be modified (except for + * setting journal_key->overwritten: it will be accessed by multiple + * threads + */ + move_gap(keys->d, keys->nr, keys->size, 
keys->gap, keys->nr); + keys->gap = keys->nr; + + set_bit(BCH_FS_may_go_rw, &c->flags); + + if (keys->nr || c->opts.fsck || !c->sb.clean) + return bch2_fs_read_write_early(c); return 0; } @@ -589,17 +624,15 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_version_to_text(&buf, new_version); prt_newline(&buf); - u64 recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); - if (recovery_passes) { - if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK) - prt_str(&buf, "fsck required"); - else { - prt_str(&buf, "running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, recovery_passes); - } - - c->recovery_passes_explicit |= recovery_passes; - c->opts.fix_errors = FSCK_FIX_yes; + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __le64 passes = ext->recovery_passes_required[0]; + bch2_sb_set_upgrade(c, old_version, new_version); + passes = ext->recovery_passes_required[0] & ~passes; + + if (passes) { + prt_str(&buf, " running recovery passes: "); + prt_bitflags(&buf, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } bch_info(c, "%s", buf.buf); @@ -625,7 +658,7 @@ u64 bch2_fsck_recovery_passes(void) static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass; + struct recovery_pass_fn *p = recovery_pass_fns + pass; if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) return false; @@ -642,39 +675,62 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { + struct recovery_pass_fn *p = recovery_pass_fns + pass; int ret; - c->curr_recovery_pass = pass; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_CONT " done\n"); - if (should_run_recovery_pass(c, pass)) { - struct recovery_pass_fn *p = recovery_pass_fns + pass; + return 0; +} - if (!(p->when & PASS_SILENT)) - printk(KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - ret = p->fn(c); - if (ret) - return ret; - if (!(p->when & PASS_SILENT)) - printk(KERN_CONT " done\n"); +static int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; - c->recovery_passes_complete |= BIT_ULL(pass); + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + if (should_run_recovery_pass(c, c->curr_recovery_pass)) { + unsigned pass = c->curr_recovery_pass; + + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || + (ret && c->curr_recovery_pass < pass)) + continue; + if (ret) + break; + + c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); + } + c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); } - return 0; + return ret; } -static int bch2_run_recovery_passes(struct bch_fs *c) +int bch2_run_online_recovery_passes(struct bch_fs *c) { int ret = 0; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { + struct recovery_pass_fn *p = recovery_pass_fns + i; + + if (!(p->when & PASS_ONLINE)) + continue; + + ret = 
bch2_run_recovery_pass(c, i); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { + i = c->curr_recovery_pass; continue; + } if (ret) break; - c->curr_recovery_pass++; } return ret; @@ -779,6 +835,9 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + if (c->opts.fsck) + set_bit(BCH_FS_fsck_running, &c->flags); + ret = bch2_blacklist_table_initialize(c); if (ret) { bch_err(c, "error initializing blacklist table"); @@ -919,13 +978,17 @@ use_clean: if (ret) goto err; + clear_bit(BCH_FS_fsck_running, &c->flags); + /* If we fixed errors, verify that fs is actually clean now: */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - test_bit(BCH_FS_ERRORS_FIXED, &c->flags) && - !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) && - !test_bit(BCH_FS_ERROR, &c->flags)) { + test_bit(BCH_FS_errors_fixed, &c->flags) && + !test_bit(BCH_FS_errors_not_fixed, &c->flags) && + !test_bit(BCH_FS_error, &c->flags)) { + bch2_flush_fsck_errs(c); + bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); - clear_bit(BCH_FS_ERRORS_FIXED, &c->flags); + clear_bit(BCH_FS_errors_fixed, &c->flags); c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; @@ -933,13 +996,13 @@ use_clean: if (ret) goto err; - if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) || - test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + if (test_bit(BCH_FS_errors_fixed, &c->flags) || + test_bit(BCH_FS_errors_not_fixed, &c->flags)) { bch_err(c, "Second fsck run was not clean"); - set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); + set_bit(BCH_FS_errors_not_fixed, &c->flags); } - set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + set_bit(BCH_FS_errors_fixed, &c->flags); } if (enabled_qtypes(c)) { @@ -958,13 +1021,13 @@ use_clean: write_sb = true; } - if (!test_bit(BCH_FS_ERROR, &c->flags) && + if (!test_bit(BCH_FS_error, &c->flags) && !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) { c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); write_sb = true; } - if (!test_bit(BCH_FS_ERROR, &c->flags)) { + if (!test_bit(BCH_FS_error, &c->flags)) { struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); if (ext && (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || @@ -976,8 +1039,8 @@ use_clean: } if (c->opts.fsck && - !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + !test_bit(BCH_FS_error, &c->flags) && + !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); write_sb = true; @@ -993,8 +1056,12 @@ use_clean: bch2_move_stats_init(&stats, "recovery"); - bch_info(c, "scanning for old btree nodes"); - ret = bch2_fs_read_write(c) ?: + struct printbuf buf = PRINTBUF; + bch2_version_to_text(&buf, c->sb.version_min); + bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf); + printbuf_exit(&buf); + + ret = bch2_fs_read_write_early(c) ?: bch2_scan_old_btree_nodes(c, &stats); if (ret) goto err; @@ -1007,7 +1074,6 @@ use_clean: ret = 0; out: - set_bit(BCH_FS_FSCK_DONE, &c->flags); bch2_flush_fsck_errs(c); if (!c->opts.keep_journal && @@ -1015,13 +1081,14 @@ out: bch2_journal_keys_put_initial(c); kfree(clean); - if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) { + if (!ret && + test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && + !c->opts.nochanges) { bch2_fs_read_write_early(c); 
bch2_delete_dead_snapshots_async(c); } - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; err: fsck_err: @@ -1034,8 +1101,6 @@ int bch2_fs_initialize(struct bch_fs *c) struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; struct qstr lostfound = QSTR("lost+found"); - struct bch_dev *ca; - unsigned i; int ret; bch_notice(c, "initializing new filesystem"); @@ -1054,13 +1119,12 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_unlock(&c->sb_lock); c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - set_bit(BCH_FS_FSCK_DONE, &c->flags); + set_bit(BCH_FS_may_go_rw, &c->flags); - for (i = 0; i < BTREE_ID_NR; i++) + for (unsigned i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) bch2_dev_usage_init(ca); ret = bch2_fs_journal_alloc(c); @@ -1088,7 +1152,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) ca->new_fs_bucket_idx = 0; ret = bch2_fs_freespace_init(c); @@ -1112,10 +1176,9 @@ int bch2_fs_initialize(struct bch_fs *c) packed_inode.inode.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0); - if (ret) { - bch_err_msg(c, ret, "creating root directory"); + bch_err_msg(c, ret, "creating root directory"); + if (ret) goto err; - } bch2_inode_init_early(c, &lostfound_inode); @@ -1126,10 +1189,11 @@ int bch2_fs_initialize(struct bch_fs *c) &lostfound, 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { 0 }, 0)); - if (ret) { - bch_err_msg(c, ret, "creating lost+found"); + bch_err_msg(c, ret, "creating lost+found"); + if (ret) goto err; - } + + c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1; if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); @@ -1138,10 +1202,9 @@ int bch2_fs_initialize(struct bch_fs *c) } ret = bch2_journal_flush(&c->journal); - if (ret) { - bch_err_msg(c, ret, "writing first journal entry"); + bch_err_msg(c, ret, "writing first journal entry"); + if (ret) goto err; - } mutex_lock(&c->sb_lock); SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); @@ -1152,6 +1215,6 @@ int bch2_fs_initialize(struct bch_fs *c) return 0; err: - bch_err_fn(ca, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 3a554b0751d0..4e9d24719b2e 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -31,6 +31,7 @@ static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, } } +int bch2_run_online_recovery_passes(struct bch_fs *); u64 bch2_fsck_recovery_passes(void); int bch2_fs_recovery(struct bch_fs *); diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index d37c6fd30e38..fa0c8efd2a1b 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -6,6 +6,7 @@ #define PASS_FSCK BIT(1) #define PASS_UNCLEAN BIT(2) #define PASS_ALWAYS BIT(3) +#define PASS_ONLINE BIT(4) /* * Passes may be reordered, but the second field is a persistent identifier and @@ -22,18 +23,18 @@ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_FSCK) \ - x(check_lrus, 11, PASS_FSCK) \ - x(check_btree_backpointers, 12, PASS_FSCK) \ - x(check_backpointers_to_extents, 13, PASS_FSCK) \ - x(check_extents_to_backpointers, 14, PASS_FSCK) \ - x(check_alloc_to_lru_refs, 15, PASS_FSCK) \ + x(check_alloc_info, 10, 
PASS_ONLINE|PASS_FSCK) \ + x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ + x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ + x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \ + x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ + x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ x(bucket_gens_init, 17, 0) \ - x(check_snapshot_trees, 18, PASS_FSCK) \ - x(check_snapshots, 19, PASS_FSCK) \ - x(check_subvols, 20, PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_FSCK) \ + x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ + x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ + x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 22, 0) \ x(resume_logged_ops, 23, PASS_ALWAYS) \ x(check_inodes, 24, PASS_FSCK) \ @@ -41,8 +42,8 @@ x(check_indirect_extents, 26, PASS_FSCK) \ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, PASS_FSCK) \ - x(check_directory_structure, 30, PASS_FSCK) \ + x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ x(fix_reflink_p, 33, 0) \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 37d16e04e671..faa5d3670058 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -3,6 +3,7 @@ #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" +#include "error.h" #include "extents.h" #include "inode.h" #include "io_misc.h" @@ -33,15 +34,14 @@ int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + int ret = 0; - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && - le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { - prt_printf(err, "idx < front_pad (%llu < %u)", - le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); - return -EINVAL; - } - - return 0; + bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), + c, err, reflink_p_front_pad_bad, + "idx < front_pad (%llu < %u)", + le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); +fsck_err: + return ret; } void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, @@ -73,6 +73,184 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r return true; } +static int trans_trigger_reflink_p_segment(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i *k; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; + struct printbuf buf = PRINTBUF; + int ret; + + k = bch2_bkey_get_mut_noupdate(trans, &iter, + BTREE_ID_reflink, POS(0, *idx), + BTREE_ITER_WITH_UPDATES); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + refcount = bkey_refcount(bkey_i_to_s(k)); + if (!refcount) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, + "nonexistent indirect extent at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, + "indirect extent refcount underflow at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; + u64 pad; + + pad = max_t(s64, le32_to_cpu(v->front_pad), + le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); + BUG_ON(pad > U32_MAX); + v->front_pad = cpu_to_le32(pad); + + pad = max_t(s64, le32_to_cpu(v->back_pad), + k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); + BUG_ON(pad > U32_MAX); + v->back_pad = cpu_to_le32(pad); + } + + le64_add_cpu(refcount, add); + + bch2_btree_iter_set_pos_to_extent_start(&iter); + ret = bch2_trans_update(trans, &iter, k, 0); + if (ret) + goto err; + + *idx = k->k.p.offset; +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags, size_t r_idx) +{ + struct bch_fs *c = trans->c; + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + u64 start = le64_to_cpu(p.v->idx); + u64 end = le64_to_cpu(p.v->idx) + p.k->size; + u64 next_idx = end + le32_to_cpu(p.v->back_pad); + s64 ret = 0; + struct printbuf buf = PRINTBUF; + + if (r_idx >= c->reflink_gc_nr) + goto not_found; + + r = genradix_ptr(&c->reflink_gc_table, r_idx); + next_idx = min(next_idx, r->offset - r->size); + if (*idx < next_idx) + goto not_found; + + BUG_ON((s64) r->refcount + add < 0); + + r->refcount += add; + *idx = r->offset; + return 0; +not_found: + if (fsck_err(c, reflink_p_to_missing_reflink_v, + "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + *idx, next_idx)) { + struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + if (next_idx <= start) { + bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx); + } else if (*idx >= end) { + bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end); + } else { + bkey_error_init(update); + update->k.p = p.k->p; + update->k.p.offset = next_idx; + update->k.size = next_idx - *idx; + set_bkey_val_u64s(&update->k, 0); + } + + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN); + } + + *idx = next_idx; +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int __trigger_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + int ret = 0; + + u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); + u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); + + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + while (idx < end && !ret) + ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags); + 
} + + if (flags & BTREE_TRIGGER_GC) { + size_t l = 0, r = c->reflink_gc_nr; + + while (l < r) { + size_t m = l + (r - l) / 2; + struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m); + if (ref->offset <= idx) + l = m + 1; + else + r = m; + } + + while (idx < end && !ret) + ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++); + } + + return ret; +} + +int bch2_trigger_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_s new, + unsigned flags) +{ + if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && + (flags & BTREE_TRIGGER_INSERT)) { + struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v; + + v->front_pad = v->back_pad = 0; + } + + return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags); +} + /* indirect extents */ int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -104,32 +282,26 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r } #endif -static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags) +static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags) { if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) { - new->k.type = KEY_TYPE_deleted; - new->k.size = 0; - set_bkey_val_u64s(&new->k, 0);; + new.k->type = KEY_TYPE_deleted; + new.k->size = 0; + set_bkey_val_u64s(new.k, 0); *flags &= ~BTREE_TRIGGER_INSERT; } } int bch2_trans_mark_reflink_v(struct btree_trans *trans, enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, + struct bkey_s_c old, struct bkey_s new, unsigned flags) { - check_indirect_extent_deleting(new, &flags); - - if (old.k->type == KEY_TYPE_reflink_v && - new->k.type == KEY_TYPE_reflink_v && - old.k->u64s == new->k.u64s && - !memcmp(bkey_s_c_to_reflink_v(old).v->start, - bkey_i_to_reflink_v(new)->v.start, - bkey_val_bytes(&new->k) - 8)) - return 0; + if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && + (flags & BTREE_TRIGGER_INSERT)) + check_indirect_extent_deleting(new, &flags); - return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); + return bch2_trigger_extent(trans, btree_id, level, old, new, flags); } /* indirect inline data */ @@ -154,7 +326,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, + struct bkey_s_c old, struct bkey_s new, unsigned flags) { check_indirect_extent_deleting(new, &flags); @@ -197,7 +369,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); - refcount = bkey_refcount(r_v); + refcount = bkey_refcount(bkey_i_to_s(r_v)); *refcount = 0; memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); @@ -398,7 +570,7 @@ s64 bch2_remap_range(struct bch_fs *c, inode_u.bi_size = new_i_size; ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); } bch2_trans_iter_exit(trans, &inode_iter); diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 8ccf3f9c4939..8ee778ec0022 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -9,13 +9,14 @@ int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, 
struct bkey_s, struct bkey_s_c); +int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ .val_to_text = bch2_reflink_p_to_text, \ .key_merge = bch2_reflink_p_merge, \ - .trans_trigger = bch2_trans_mark_reflink_p, \ - .atomic_trigger = bch2_mark_reflink_p, \ + .trigger = bch2_trigger_reflink_p, \ .min_val_size = 16, \ }) @@ -24,14 +25,13 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, unsigned); + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ - .trans_trigger = bch2_trans_mark_reflink_v, \ - .atomic_trigger = bch2_mark_extent, \ + .trigger = bch2_trans_mark_reflink_v, \ .min_val_size = 8, \ }) @@ -41,13 +41,13 @@ void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_indirect_inline_data(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ .val_to_text = bch2_indirect_inline_data_to_text, \ - .trans_trigger = bch2_trans_mark_indirect_inline_data, \ + .trigger = bch2_trans_mark_indirect_inline_data, \ .min_val_size = 8, \ }) @@ -63,13 +63,13 @@ static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) } } -static inline __le64 *bkey_refcount(struct bkey_i *k) +static inline __le64 *bkey_refcount(struct bkey_s k) { - switch (k->k.type) { + switch (k.k->type) { case KEY_TYPE_reflink_v: - return &bkey_i_to_reflink_v(k)->v.refcount; + return &bkey_s_to_reflink_v(k).v->refcount; case KEY_TYPE_indirect_inline_data: - return &bkey_i_to_indirect_inline_data(k)->v.refcount; + return &bkey_s_to_indirect_inline_data(k).v->refcount; default: return NULL; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 2008fe8bf706..92ba56ef1fc8 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -11,7 +11,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, /* Replicas tracking - in memory: */ -static void verify_replicas_entry(struct bch_replicas_entry *e) +static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) { #ifdef CONFIG_BCACHEFS_DEBUG unsigned i; @@ -26,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e) #endif } -void bch2_replicas_entry_sort(struct bch_replicas_entry *e) +void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) { bubble_sort(e->devs, e->nr_devs, u8_cmp); } @@ -53,7 +53,7 @@ static void bch2_replicas_entry_v0_to_text(struct printbuf *out, } void bch2_replicas_entry_to_text(struct printbuf *out, - struct bch_replicas_entry *e) + struct bch_replicas_entry_v1 *e) { unsigned i; @@ -68,7 +68,7 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } -int bch2_replicas_entry_validate(struct bch_replicas_entry *r, +int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, struct bch_sb *sb, struct printbuf *err) { @@ -98,7 +98,7 @@ bad: void bch2_cpu_replicas_to_text(struct printbuf *out, struct 
bch_replicas_cpu *r) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; bool first = true; for_each_cpu_replicas_entry(r, e) { @@ -111,7 +111,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out, } static void extent_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry *r) + struct bch_replicas_entry_v1 *r) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -131,7 +131,7 @@ static void extent_to_replicas(struct bkey_s_c k, } static void stripe_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry *r) + struct bch_replicas_entry_v1 *r) { struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); const struct bch_extent_ptr *ptr; @@ -144,7 +144,7 @@ static void stripe_to_replicas(struct bkey_s_c k, r->devs[r->nr_devs++] = ptr->dev; } -void bch2_bkey_to_replicas(struct bch_replicas_entry *e, +void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, struct bkey_s_c k) { e->nr_devs = 0; @@ -169,12 +169,10 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, bch2_replicas_entry_sort(e); } -void bch2_devlist_to_replicas(struct bch_replicas_entry *e, +void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, enum bch_data_type data_type, struct bch_devs_list devs) { - unsigned i; - BUG_ON(!data_type || data_type == BCH_DATA_sb || data_type >= BCH_DATA_NR); @@ -183,8 +181,8 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, e->nr_devs = 0; e->nr_required = 1; - for (i = 0; i < devs.nr; i++) - e->devs[e->nr_devs++] = devs.devs[i]; + darray_for_each(devs, i) + e->devs[e->nr_devs++] = *i; bch2_replicas_entry_sort(e); } @@ -192,7 +190,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, static struct bch_replicas_cpu cpu_replicas_add_entry(struct bch_fs *c, struct bch_replicas_cpu *old, - struct bch_replicas_entry *new_entry) + struct bch_replicas_entry_v1 *new_entry) { unsigned i; struct bch_replicas_cpu new = { @@ -225,7 +223,7 @@ cpu_replicas_add_entry(struct bch_fs *c, } static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { int idx, entry_size = replicas_entry_bytes(search); @@ -243,7 +241,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, } int bch2_replicas_entry_idx(struct bch_fs *c, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { bch2_replicas_entry_sort(search); @@ -251,13 +249,13 @@ int bch2_replicas_entry_idx(struct bch_fs *c, } static bool __replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { return __replicas_entry_idx(r, search) >= 0; } bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { bool marked; @@ -374,7 +372,7 @@ err: static unsigned reserve_journal_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; unsigned journal_res_u64s = 0; /* nr_inodes: */ @@ -399,7 +397,7 @@ static unsigned reserve_journal_replicas(struct bch_fs *c, noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_entry *new_entry) + struct bch_replicas_entry_v1 *new_entry) { struct bch_replicas_cpu new_r, new_gc; int ret = 0; @@ -464,7 +462,7 @@ err: goto out; } -int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) +int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) { return 
likely(bch2_replicas_marked(c, r)) ? 0 : bch2_mark_replicas_slowpath(c, r); @@ -515,7 +513,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; unsigned i = 0; lockdep_assert_held(&c->replicas_gc_lock); @@ -590,7 +588,7 @@ retry: } for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); if (e->data_type == BCH_DATA_journal || @@ -621,7 +619,7 @@ retry: } int bch2_replicas_set_usage(struct bch_fs *c, - struct bch_replicas_entry *r, + struct bch_replicas_entry_v1 *r, u64 sectors) { int ret, idx = bch2_replicas_entry_idx(c, r); @@ -654,7 +652,7 @@ static int __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, struct bch_replicas_cpu *cpu_r) { - struct bch_replicas_entry *e, *dst; + struct bch_replicas_entry_v1 *e, *dst; unsigned nr = 0, entry_size = 0, idx = 0; for_each_replicas_entry(sb_r, e) { @@ -692,7 +690,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, nr++; } - entry_size += sizeof(struct bch_replicas_entry) - + entry_size += sizeof(struct bch_replicas_entry_v1) - sizeof(struct bch_replicas_entry_v0); cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); @@ -703,7 +701,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, cpu_r->entry_size = entry_size; for_each_replicas_entry(sb_r, e) { - struct bch_replicas_entry *dst = + struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++); dst->data_type = e->data_type; @@ -747,7 +745,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, { struct bch_sb_field_replicas_v0 *sb_r; struct bch_replicas_entry_v0 *dst; - struct bch_replicas_entry *src; + struct bch_replicas_entry_v1 *src; size_t bytes; bytes = sizeof(struct bch_sb_field_replicas); @@ -785,7 +783,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry *dst, *src; + struct bch_replicas_entry_v1 *dst, *src; bool need_v1 = false; size_t bytes; @@ -836,7 +834,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, memcmp, NULL); for (i = 0; i < cpu_r->nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); int ret = bch2_replicas_entry_validate(e, sb, err); @@ -844,7 +842,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, return ret; if (i + 1 < cpu_r->nr) { - struct bch_replicas_entry *n = + struct bch_replicas_entry_v1 *n = cpu_replicas_entry(cpu_r, i + 1); BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); @@ -881,7 +879,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out, struct bch_sb_field *f) { struct bch_sb_field_replicas *r = field_to_type(f, replicas); - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; bool first = true; for_each_replicas_entry(r, e) { @@ -943,7 +941,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned flags, bool print) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; bool ret = true; percpu_down_read(&c->mark_lock); @@ -1003,7 +1001,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) replicas_v0 = bch2_sb_field_get(sb, replicas_v0); if (replicas) { - struct bch_replicas_entry *r; + 
struct bch_replicas_entry_v1 *r; for_each_replicas_entry(replicas, r) for (i = 0; i < r->nr_devs; i++) diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index f70a642775d1..654a4b26d3a3 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -6,28 +6,28 @@ #include "eytzinger.h" #include "replicas_types.h" -void bch2_replicas_entry_sort(struct bch_replicas_entry *); +void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); void bch2_replicas_entry_to_text(struct printbuf *, - struct bch_replicas_entry *); -int bch2_replicas_entry_validate(struct bch_replicas_entry *, + struct bch_replicas_entry_v1 *); +int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *, struct bch_sb *, struct printbuf *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); -static inline struct bch_replicas_entry * +static inline struct bch_replicas_entry_v1 * cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) { return (void *) r->entries + r->entry_size * i; } int bch2_replicas_entry_idx(struct bch_fs *, - struct bch_replicas_entry *); + struct bch_replicas_entry_v1 *); -void bch2_devlist_to_replicas(struct bch_replicas_entry *, +void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *, enum bch_data_type, struct bch_devs_list); -bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); +bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *); int bch2_mark_replicas(struct bch_fs *, - struct bch_replicas_entry *); + struct bch_replicas_entry_v1 *); static inline struct replicas_delta * replicas_delta_next(struct replicas_delta *d) @@ -37,9 +37,9 @@ replicas_delta_next(struct replicas_delta *d) int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); -void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); +void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c); -static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, +static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e, unsigned dev) { e->data_type = BCH_DATA_cached; @@ -59,7 +59,7 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned); int bch2_replicas_gc2(struct bch_fs *); int bch2_replicas_set_usage(struct bch_fs *, - struct bch_replicas_entry *, + struct bch_replicas_entry_v1 *, u64); #define for_each_cpu_replicas_entry(_r, _i) \ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h index 5cfff489bbc3..ac90d142c4e8 100644 --- a/fs/bcachefs/replicas_types.h +++ b/fs/bcachefs/replicas_types.h @@ -5,12 +5,12 @@ struct bch_replicas_cpu { unsigned nr; unsigned entry_size; - struct bch_replicas_entry *entries; + struct bch_replicas_entry_v1 *entries; }; struct replicas_delta { s64 delta; - struct bch_replicas_entry r; + struct bch_replicas_entry_v1 r; } __packed; struct replicas_delta_list { @@ -21,7 +21,7 @@ struct replicas_delta_list { u64 nr_inodes; u64 persistent_reserved[BCH_REPLICAS_MAX]; struct {} memset_end; - struct replicas_delta d[0]; + struct replicas_delta d[]; }; #endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index c76ad8ea5e4a..9632f36f5f31 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -191,13 +191,10 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end, u64 journal_seq) { - struct bch_dev *ca; - unsigned i, dev; - percpu_down_read(&c->mark_lock); if (!journal_seq) { - for (i = 0; i < ARRAY_SIZE(c->usage); 
i++) + for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); } else { bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); @@ -223,7 +220,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->v = cpu_to_le64(atomic64_read(&c->key_version)); } - for (i = 0; i < BCH_REPLICAS_MAX; i++) { + for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { struct jset_entry_usage *u = container_of(jset_entry_init(end, sizeof(*u)), struct jset_entry_usage, entry); @@ -234,8 +231,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); } - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + for (unsigned i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); struct jset_entry_data_usage *u = container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), @@ -247,7 +244,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, "embedded variable length struct"); } - for_each_member_device(ca, c, dev) { + for_each_member_device(c, ca) { unsigned b = sizeof(struct jset_entry_dev_usage) + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; struct jset_entry_dev_usage *u = @@ -255,10 +252,9 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_dev_usage, entry); u->entry.type = BCH_JSET_ENTRY_dev_usage; - u->dev = cpu_to_le32(dev); - u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); + u->dev = cpu_to_le32(ca->dev_idx); - for (i = 0; i < BCH_DATA_NR; i++) { + for (unsigned i = 0; i < BCH_DATA_NR; i++) { u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); @@ -267,7 +263,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, percpu_up_read(&c->mark_lock); - for (i = 0; i < 2; i++) { + for (unsigned i = 0; i < 2; i++) { struct jset_entry_clock *clock = container_of(jset_entry_init(end, sizeof(*clock)), struct jset_entry_clock, entry); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 4919237bbe73..441dcb1bf160 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -12,33 +12,105 @@ #include "sb-errors.h" #include "super-io.h" +#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63) + /* - * Downgrade table: - * When dowgrading past certain versions, we need to run certain recovery passes - * and fix certain errors: + * Upgrade, downgrade tables - run certain recovery passes, fix certain errors * * x(version, recovery_passes, errors...) 
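As a sketch of how the new x-macro table expands, taking the deleted_inodes entry of UPGRADE_TABLE() as an example (this is just the preprocessor output implied by the two "#define x" blocks in the patch, not additional code):

	static const u16 upgrade_deleted_inodes_errors[] = {
		BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list
	};

	/* ...and the corresponding entry generated inside upgrade_table[]: */
	{
		.recovery_passes	= BIT_ULL(BCH_RECOVERY_PASS_check_inodes),
		.version		= bcachefs_metadata_version_deleted_inodes,
		.nr_errors		= ARRAY_SIZE(upgrade_deleted_inodes_errors),
		.errors			= upgrade_deleted_inodes_errors,
	},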
*/ +#define UPGRADE_TABLE() \ + x(backpointers, \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_v3, \ + RECOVERY_PASS_ALL_FSCK) \ + x(unwritten_extents, \ + RECOVERY_PASS_ALL_FSCK) \ + x(bucket_gens, \ + BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ + RECOVERY_PASS_ALL_FSCK) \ + x(lru_v2, \ + RECOVERY_PASS_ALL_FSCK) \ + x(fragmentation_lru, \ + RECOVERY_PASS_ALL_FSCK) \ + x(no_bps_in_alloc_keys, \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_trees, \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_skiplists, \ + BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \ + BCH_FSCK_ERR_snapshot_bad_depth, \ + BCH_FSCK_ERR_snapshot_bad_skiplist) \ + x(deleted_inodes, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ + x(rebalance_work, \ + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) #define DOWNGRADE_TABLE() -struct downgrade_entry { +struct upgrade_downgrade_entry { u64 recovery_passes; u16 version; u16 nr_errors; const u16 *errors; }; -#define x(ver, passes, ...) static const u16 ver_##errors[] = { __VA_ARGS__ }; +#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ }; +UPGRADE_TABLE() +#undef x + +static const struct upgrade_downgrade_entry upgrade_table[] = { +#define x(ver, passes, ...) { \ + .recovery_passes = passes, \ + .version = bcachefs_metadata_version_##ver,\ + .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \ + .errors = upgrade_##ver##_errors, \ +}, +UPGRADE_TABLE() +#undef x +}; + +void bch2_sb_set_upgrade(struct bch_fs *c, + unsigned old_version, + unsigned new_version) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + for (const struct upgrade_downgrade_entry *i = upgrade_table; + i < upgrade_table + ARRAY_SIZE(upgrade_table); + i++) + if (i->version > old_version && i->version <= new_version) { + u64 passes = i->recovery_passes; + + if (passes & RECOVERY_PASS_ALL_FSCK) + passes |= bch2_fsck_recovery_passes(); + passes &= ~RECOVERY_PASS_ALL_FSCK; + + ext->recovery_passes_required[0] |= + cpu_to_le64(bch2_recovery_passes_to_stable(passes)); + + for (const u16 *e = i->errors; + e < i->errors + i->nr_errors; + e++) { + __set_bit(*e, c->sb.errors_silent); + ext->errors_silent[*e / 64] |= cpu_to_le64(BIT_ULL(*e % 64)); + } + } +} + +#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ }; DOWNGRADE_TABLE() #undef x -static const struct downgrade_entry downgrade_table[] = { +static const struct upgrade_downgrade_entry downgrade_table[] = { #define x(ver, passes, ...) 
{ \ .recovery_passes = passes, \ .version = bcachefs_metadata_version_##ver,\ - .nr_errors = ARRAY_SIZE(ver_##errors), \ - .errors = ver_##errors, \ + .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \ + .errors = downgrade_##ver##_errors, \ }, DOWNGRADE_TABLE() #undef x @@ -118,7 +190,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) darray_char table = {}; int ret = 0; - for (const struct downgrade_entry *src = downgrade_table; + for (const struct upgrade_downgrade_entry *src = downgrade_table; src < downgrade_table + ARRAY_SIZE(downgrade_table); src++) { if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h index bc48fd2ca70e..57e6c916fc73 100644 --- a/fs/bcachefs/sb-downgrade.h +++ b/fs/bcachefs/sb-downgrade.h @@ -5,6 +5,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade; int bch2_sb_downgrade_update(struct bch_fs *); +void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned); void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned); #endif /* _BCACHEFS_SB_DOWNGRADE_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 3504c2d09c29..c08aacdfd073 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -248,7 +248,9 @@ x(root_inode_not_dir, 240) \ x(dir_loop, 241) \ x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) + x(hash_table_key_wrong_offset, 243) \ + x(unlinked_inode_not_on_deleted_list, 244) \ + x(reflink_p_front_pad_bad, 245) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index bed0f857fe5b..a44a238bf8b5 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -235,6 +235,11 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "(never)"); prt_newline(out); + prt_printf(out, "Last superblock write:"); + prt_tab(out); + prt_u64(out, le64_to_cpu(m.seq)); + prt_newline(out); + prt_printf(out, "State:"); prt_tab(out); prt_printf(out, "%s", @@ -259,6 +264,11 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "(none)"); prt_newline(out); + prt_str(out, "Durability:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? 
BCH_MEMBER_DURABILITY(&m) - 1 : 1); + prt_newline(out); + prt_printf(out, "Discard:"); prt_tab(out); prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m)); @@ -353,14 +363,12 @@ const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = { void bch2_sb_members_from_cpu(struct bch_fs *c) { struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - struct bch_dev *ca; - unsigned i, e; rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) { - struct bch_member *m = __bch2_members_v2_get_mut(mi, i); + for_each_member_device_rcu(c, ca, NULL) { + struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); - for (e = 0; e < BCH_MEMBER_ERROR_NR; e++) + for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++) m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); } rcu_read_unlock(); diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 03613e3eb8e3..be0a94183271 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_SB_MEMBERS_H #define _BCACHEFS_SB_MEMBERS_H +#include "darray.h" + extern char * const bch2_member_error_strs[]; static inline struct bch_member * @@ -47,23 +49,18 @@ static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, unsigned dev) { - unsigned i; - - for (i = 0; i < devs.nr; i++) - if (devs.devs[i] == dev) + darray_for_each(devs, i) + if (*i == dev) return true; - return false; } static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, unsigned dev) { - unsigned i; - - for (i = 0; i < devs->nr; i++) - if (devs->devs[i] == dev) { - array_remove_item(devs->devs, devs->nr, i); + darray_for_each(*devs, i) + if (*i == dev) { + darray_remove_item(devs, i); return; } } @@ -72,40 +69,48 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, unsigned dev) { if (!bch2_dev_list_has_dev(*devs, dev)) { - BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); - devs->devs[devs->nr++] = dev; + BUG_ON(devs->nr >= ARRAY_SIZE(devs->data)); + devs->data[devs->nr++] = dev; } } static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) { - return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; + return (struct bch_devs_list) { .nr = 1, .data[0] = dev }; } -static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, - const struct bch_devs_mask *mask) +static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx, + const struct bch_devs_mask *mask) { struct bch_dev *ca = NULL; - while ((*iter = mask - ? find_next_bit(mask->d, c->sb.nr_devices, *iter) - : *iter) < c->sb.nr_devices && - !(ca = rcu_dereference_check(c->devs[*iter], + while ((idx = mask + ? find_next_bit(mask->d, c->sb.nr_devices, idx) + : idx) < c->sb.nr_devices && + !(ca = rcu_dereference_check(c->devs[idx], lockdep_is_held(&c->state_lock)))) - (*iter)++; + idx++; return ca; } -#define for_each_member_device_rcu(ca, c, iter, mask) \ - for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) +static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca, + const struct bch_devs_mask *mask) +{ + return __bch2_next_dev_idx(c, ca ? 
ca->dev_idx + 1 : 0, mask); +} -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) +#define for_each_member_device_rcu(_c, _ca, _mask) \ + for (struct bch_dev *_ca = NULL; \ + (_ca = __bch2_next_dev((_c), _ca, (_mask)));) + +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) { - struct bch_dev *ca; + if (ca) + percpu_ref_put(&ca->ref); rcu_read_lock(); - if ((ca = __bch2_next_dev(c, iter, NULL))) + if ((ca = __bch2_next_dev(c, ca, NULL))) percpu_ref_get(&ca->ref); rcu_read_unlock(); @@ -115,41 +120,42 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter /* * If you break early, you must drop your ref on the current device */ -#define for_each_member_device(ca, c, iter) \ - for ((iter) = 0; \ - (ca = bch2_get_next_dev(c, &(iter))); \ - percpu_ref_put(&ca->ref), (iter)++) +#define __for_each_member_device(_c, _ca) \ + for (; (_ca = bch2_get_next_dev(_c, _ca));) + +#define for_each_member_device(_c, _ca) \ + for (struct bch_dev *_ca = NULL; \ + (_ca = bch2_get_next_dev(_c, _ca));) static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, - unsigned *iter, - int state_mask) + struct bch_dev *ca, + unsigned state_mask) { - struct bch_dev *ca; + if (ca) + percpu_ref_put(&ca->io_ref); rcu_read_lock(); - while ((ca = __bch2_next_dev(c, iter, NULL)) && + while ((ca = __bch2_next_dev(c, ca, NULL)) && (!((1 << ca->mi.state) & state_mask) || !percpu_ref_tryget(&ca->io_ref))) - (*iter)++; + ; rcu_read_unlock(); return ca; } -#define __for_each_online_member(ca, c, iter, state_mask) \ - for ((iter) = 0; \ - (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ - percpu_ref_put(&ca->io_ref), (iter)++) +#define __for_each_online_member(_c, _ca, state_mask) \ + for (struct bch_dev *_ca = NULL; \ + (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));) -#define for_each_online_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, ~0) +#define for_each_online_member(c, ca) \ + __for_each_online_member(c, ca, ~0) -#define for_each_rw_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) +#define for_each_rw_member(c, ca) \ + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw)) -#define for_each_readable_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, \ - (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) +#define for_each_readable_member(c, ca) \ + __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro)) /* * If a key exists that references a device, the device won't be going away and @@ -175,11 +181,9 @@ static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) { struct bch_devs_mask devs; - struct bch_dev *ca; - unsigned i; memset(&devs, 0, sizeof(devs)); - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) __set_bit(ca->dev_idx, devs.d); return devs; } diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 97790445e67a..3a494c5d1247 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -324,101 +324,57 @@ bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, } EXPORT_SYMBOL_GPL(six_relock_ip); -#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER +#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN -static inline bool six_can_spin_on_owner(struct six_lock *lock) +static inline bool six_owner_running(struct six_lock *lock) { - struct task_struct *owner; - bool ret; - - if (need_resched()) - return false; - 
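A minimal usage sketch of the reworked member-device iterator above, assuming a made-up wants_device() predicate for illustration; the macro now declares the struct bch_dev pointer itself and takes a percpu ref on each device it yields, so an early break must drop that ref (as the existing comment notes):

	for_each_member_device(c, ca) {
		if (wants_device(ca)) {
			/* breaking early: drop the ref the iterator took for us */
			percpu_ref_put(&ca->ref);
			break;
		}
	}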
+ /* + * When there's no owner, we might have preempted between the owner + * acquiring the lock and setting the owner field. If we're an RT task + * that will live-lock because we won't let the owner complete. + */ rcu_read_lock(); - owner = READ_ONCE(lock->owner); - ret = !owner || owner_on_cpu(owner); + struct task_struct *owner = READ_ONCE(lock->owner); + bool ret = owner ? owner_on_cpu(owner) : !rt_task(current); rcu_read_unlock(); return ret; } -static inline bool six_spin_on_owner(struct six_lock *lock, - struct task_struct *owner, - u64 end_time) +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait, + enum six_lock_type type) { - bool ret = true; unsigned loop = 0; - - rcu_read_lock(); - while (lock->owner == owner) { - /* - * Ensure we emit the owner->on_cpu, dereference _after_ - * checking lock->owner still matches owner. If that fails, - * owner might point to freed memory. If it still matches, - * the rcu_read_lock() ensures the memory stays valid. - */ - barrier(); - - if (!owner_on_cpu(owner) || need_resched()) { - ret = false; - break; - } - - if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { - six_set_bitmask(lock, SIX_LOCK_NOSPIN); - ret = false; - break; - } - - cpu_relax(); - } - rcu_read_unlock(); - - return ret; -} - -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -{ - struct task_struct *task = current; u64 end_time; if (type == SIX_LOCK_write) return false; - preempt_disable(); - if (!six_can_spin_on_owner(lock)) - goto fail; + if (lock->wait_list.next != &wait->list) + return false; - if (!osq_lock(&lock->osq)) - goto fail; + if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN) + return false; + preempt_disable(); end_time = sched_clock() + 10 * NSEC_PER_USEC; - while (1) { - struct task_struct *owner; - + while (!need_resched() && six_owner_running(lock)) { /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. + * Ensures that writes to the waitlist entry happen after we see + * wait->lock_acquired: pairs with the smp_store_release in + * __six_lock_wakeup */ - owner = READ_ONCE(lock->owner); - if (owner && !six_spin_on_owner(lock, owner, end_time)) - break; - - if (do_six_trylock(lock, type, false)) { - osq_unlock(&lock->osq); + if (smp_load_acquire(&wait->lock_acquired)) { preempt_enable(); return true; } - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) + if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { + six_set_bitmask(lock, SIX_LOCK_NOSPIN); break; + } /* * The cpu_relax() call is a compiler barrier which forces @@ -429,24 +385,15 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type cpu_relax(); } - osq_unlock(&lock->osq); -fail: preempt_enable(); - - /* - * If we fell out of the spin path because of need_resched(), - * reschedule now, before we try-lock again. This avoids getting - * scheduled out right after we obtained the lock. 
- */ - if (need_resched()) - schedule(); - return false; } -#else /* CONFIG_SIX_LOCK_SPIN_ON_OWNER */ +#else /* CONFIG_LOCK_SPIN_ON_OWNER */ -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait, + enum six_lock_type type) { return false; } @@ -470,9 +417,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, trace_contention_begin(lock, 0); lock_contended(&lock->dep_map, ip); - if (six_optimistic_spin(lock, type)) - goto out; - wait->task = current; wait->lock_want = type; wait->lock_acquired = false; @@ -510,6 +454,9 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, ret = 0; } + if (six_optimistic_spin(lock, wait, type)) + goto out; + while (1) { set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 4c268b0b8316..68d46fd7f391 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -15,7 +15,7 @@ * will have to take write locks for the full duration of the operation. * * But by adding an intent state, which is exclusive with other intent locks but - * not with readers, we can take intent locks at thte start of the operation, + * not with readers, we can take intent locks at the start of the operation, * and then take write locks only for the actual update to each individual * nodes, without deadlocking. * @@ -65,8 +65,8 @@ * * Reentrancy: * - * Six locks are not by themselves reentrent, but have counters for both the - * read and intent states that can be used to provide reentrency by an upper + * Six locks are not by themselves reentrant, but have counters for both the + * read and intent states that can be used to provide reentrancy by an upper * layer that tracks held locks. 
If a lock is known to already be held in the * read or intent state, six_lock_increment() can be used to bump the "lock * held in this state" counter, increasing the number of unlock calls that @@ -127,10 +127,6 @@ #include <linux/sched.h> #include <linux/types.h> -#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER -#include <linux/osq_lock.h> -#endif - enum six_lock_type { SIX_LOCK_read, SIX_LOCK_intent, @@ -143,9 +139,6 @@ struct six_lock { unsigned intent_lock_recurse; struct task_struct *owner; unsigned __percpu *readers; -#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER - struct optimistic_spin_queue osq; -#endif raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 5dac038f0851..56af937523ff 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -123,7 +123,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) struct snapshot_table *t; bool ret; - EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); + EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots); rcu_read_lock(); t = rcu_dereference(c->snapshots); @@ -276,7 +276,7 @@ static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) mutex_unlock(&c->snapshot_table_lock); } -int bch2_mark_snapshot(struct btree_trans *trans, +static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) @@ -318,7 +318,7 @@ int bch2_mark_snapshot(struct btree_trans *trans, __set_is_ancestor_bitmap(c, id); if (BCH_SNAPSHOT_DELETED(s.v)) { - set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) bch2_delete_dead_snapshots_async(c); } @@ -330,6 +330,14 @@ err: return ret; } +int bch2_mark_snapshot(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) +{ + return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags); +} + int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s) { @@ -459,7 +467,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - struct bkey_s_c_subvolume s; bool found = false; int ret; @@ -468,7 +475,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, if (k.k->type != KEY_TYPE_subvolume) continue; - s = bkey_s_c_to_subvolume(k); + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) continue; if (!BCH_SUBVOLUME_SNAP(s.v)) { @@ -582,19 +589,13 @@ fsck_err: */ int bch2_check_snapshot_trees(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_snapshot_tree(trans, &iter, k))); - - if (ret) - bch_err(c, "error %i checking snapshot trees", ret); + bch_err_fn(c, ret); return ret; } @@ -813,11 +814,10 @@ static int check_snapshot(struct btree_trans *trans, real_depth = bch2_snapshot_depth(c, parent_id); - if (le32_to_cpu(s.depth) != real_depth && - (c->sb.version_upgrade_complete < 
bcachefs_metadata_version_snapshot_skiplists || - fsck_err(c, snapshot_bad_depth, - "snapshot with incorrect depth field, should be %u:\n %s", - real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, + c, snapshot_bad_depth, + "snapshot with incorrect depth field, should be %u:\n %s", + real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) @@ -831,11 +831,9 @@ static int check_snapshot(struct btree_trans *trans, if (ret < 0) goto err; - if (!ret && - (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || - fsck_err(c, snapshot_bad_skiplist, - "snapshot with bad skiplist field:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + if (fsck_err_on(!ret, c, snapshot_bad_skiplist, + "snapshot with bad skiplist field:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) @@ -856,22 +854,17 @@ fsck_err: int bch2_check_snapshots(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - /* * We iterate backwards as checking/fixing the depth field requires that * the parent's depth already be correct: */ - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_reverse_commit(trans, iter, - BTREE_ID_snapshots, POS_MAX, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + BTREE_ID_snapshots, POS_MAX, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_snapshot(trans, &iter, k))); + bch_err_fn(c, ret); return ret; } @@ -1067,7 +1060,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); if (ret) goto err; @@ -1315,7 +1308,6 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct bch_fs *c = trans->c; u32 nr_deleted_ancestors = 0; struct bkey_i_snapshot *s; - u32 *i; int ret; if (k.k->type != KEY_TYPE_snapshot) @@ -1368,23 +1360,19 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, int bch2_delete_dead_snapshots(struct bch_fs *c) { struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_snapshot snap; snapshot_id_list deleted = { 0 }; snapshot_id_list deleted_interior = { 0 }; - u32 *i, id; + u32 id; int ret = 0; - if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) + if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) { + if (!test_bit(BCH_FS_started, &c->flags)) { ret = bch2_fs_read_write_early(c); - if (ret) { - bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); + bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); + if (ret) return ret; - } } trans = bch2_trans_get(c); @@ -1397,37 +1385,29 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) POS_MIN, 0, k, NULL, NULL, 0, bch2_delete_redundant_snapshot(trans, k)); - if (ret) { - bch_err_msg(c, ret, "deleting redundant snapshots"); + bch_err_msg(c, ret, "deleting redundant snapshots"); + if (ret) goto err; - } - ret = 
for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, bch2_snapshot_set_equiv(trans, k)); - if (ret) { - bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); + bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); + if (ret) goto err; - } - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ({ if (k.k->type != KEY_TYPE_snapshot) continue; - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v)) { - ret = snapshot_list_add(c, &deleted, k.k->p.offset); - if (ret) - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) { - bch_err_msg(c, ret, "walking snapshots"); + BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v) + ? snapshot_list_add(c, &deleted, k.k->p.offset) + : 0; + })); + bch_err_msg(c, ret, "walking snapshots"); + if (ret) goto err; - } for (id = 0; id < BTREE_ID_NR; id++) { struct bpos last_pos = POS_MIN; @@ -1449,36 +1429,36 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - &res, NULL, BTREE_INSERT_NOFAIL, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - &res, NULL, BTREE_INSERT_NOFAIL, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, move_key_to_correct_snapshot(trans, &iter, k)); bch2_disk_reservation_put(c, &res); darray_exit(&equiv_seen); - if (ret) { - bch_err_msg(c, ret, "deleting keys from dying snapshots"); + bch_err_msg(c, ret, "deleting keys from dying snapshots"); + if (ret) goto err; - } } bch2_trans_unlock(trans); down_write(&c->snapshot_create_lock); - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ({ u32 snapshot = k.k->p.offset; u32 equiv = bch2_snapshot_equiv(c, snapshot); - if (equiv != snapshot) - snapshot_list_add(c, &deleted_interior, snapshot); - } - bch2_trans_iter_exit(trans, &iter); + equiv != snapshot + ? 
snapshot_list_add(c, &deleted_interior, snapshot) + : 0; + })); + bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err_create_lock; @@ -1489,7 +1469,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) */ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_INTENT, k, - NULL, NULL, BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); if (ret) goto err_create_lock; @@ -1497,19 +1477,17 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) darray_for_each(deleted, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, *i)); - if (ret) { - bch_err_msg(c, ret, "deleting snapshot %u", *i); + bch_err_msg(c, ret, "deleting snapshot %u", *i); + if (ret) goto err_create_lock; - } } darray_for_each(deleted_interior, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, *i)); - if (ret) { - bch_err_msg(c, ret, "deleting snapshot %u", *i); + bch_err_msg(c, ret, "deleting snapshot %u", *i); + if (ret) goto err_create_lock; - } } err_create_lock: up_write(&c->snapshot_create_lock); @@ -1517,8 +1495,7 @@ err: darray_exit(&deleted_interior); darray_exit(&deleted); bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1680,7 +1657,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct if (BCH_SNAPSHOT_DELETED(snap.v) || bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { - set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); return 0; } @@ -1689,21 +1666,16 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct int bch2_snapshots_read(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - ret = bch2_trans_run(c, - for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: bch2_snapshot_set_equiv(trans, k) ?: bch2_check_snapshot_needs_deletion(trans, k)) ?: - for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index f09a22f44239..7c66ffc06385 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -22,12 +22,12 @@ void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ .val_to_text = bch2_snapshot_to_text, \ - .atomic_trigger = bch2_mark_snapshot, \ + .trigger = bch2_mark_snapshot, \ .min_val_size = 24, \ }) @@ -202,8 +202,6 @@ static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { - u32 *i; - 
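For reference, a minimal sketch of the darray_for_each() pattern these helpers are converted to (the same conversion appears in bch2_dev_list_has_dev and bch2_devlist_to_replicas above): the macro declares the cursor itself, which is why the bare "u32 *i" locals are dropped, and the cursor is a pointer to each element. Assuming s points at a populated snapshot_id_list:

	/* i is declared by the macro as a pointer to each element (u32 * here) */
	darray_for_each(*s, i)
		pr_info("snapshot id %u\n", *i);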
darray_for_each(*s, i) if (*i == id) return true; @@ -212,8 +210,6 @@ static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) { - u32 *i; - darray_for_each(*s, i) if (bch2_snapshot_is_ancestor(c, id, *i)) return true; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index ae21a8cca1b4..89fdb7c21134 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -15,6 +15,16 @@ #include <crypto/hash.h> #include <crypto/sha2.h> +typedef unsigned __bitwise bch_str_hash_flags_t; + +enum bch_str_hash_flags { + __BCH_HASH_SET_MUST_CREATE, + __BCH_HASH_SET_MUST_REPLACE, +}; + +#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE) +#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE) + static inline enum bch_str_hash_type bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { @@ -246,7 +256,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, struct bkey_i *insert, - int flags, + bch_str_hash_flags_t str_hash_flags, int update_flags) { struct btree_iter iter, slot = { NULL }; @@ -269,7 +279,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans, } if (!slot.path && - !(flags & BCH_HASH_SET_MUST_REPLACE)) + !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) bch2_trans_copy_iter(&slot, &iter); if (k.k->type != KEY_TYPE_hash_whiteout) @@ -287,16 +297,16 @@ found: found = true; not_found: - if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { + if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; - } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { + } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) { ret = -EEXIST; } else { if (!found && slot.path) swap(iter, slot); insert->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, insert, 0); + ret = bch2_trans_update(trans, &iter, insert, update_flags); } goto out; @@ -307,7 +317,8 @@ int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, - struct bkey_i *insert, int flags) + struct bkey_i *insert, + bch_str_hash_flags_t str_hash_flags) { u32 snapshot; int ret; @@ -319,7 +330,7 @@ int bch2_hash_set(struct btree_trans *trans, insert->k.p.inode = inum.inum; return bch2_hash_set_snapshot(trans, desc, info, inum, - snapshot, insert, flags, 0); + snapshot, insert, str_hash_flags, 0); } static __always_inline diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 22b34a8e4d6e..7c67c28d3ef8 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -37,11 +37,8 @@ static int check_subvol(struct btree_trans *trans, return ret; if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { - bch2_fs_lazy_rw(c); - ret = bch2_subvolume_delete(trans, iter->pos.offset); - if (ret) - bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); return ret ?: -BCH_ERR_transaction_restart_nested; } @@ -82,17 +79,12 @@ fsck_err: int bch2_check_subvols(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - 
check_subvol(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol(trans, &iter, k))); + bch_err_fn(c, ret); return ret; } @@ -228,8 +220,6 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, */ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) { - struct btree_iter iter; - struct bkey_s_c k; struct bch_subvolume s; return lockrestart_do(trans, @@ -237,7 +227,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d BTREE_ITER_CACHED, &s)) ?: for_each_btree_key_commit(trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_subvolume_reparent(trans, &iter, k, subvolid_to_delete, le32_to_cpu(s.parent))); } @@ -274,7 +264,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { return bch2_subvolumes_reparent(trans, subvolid) ?: - commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_subvolume_delete(trans, subvolid)); } @@ -299,10 +289,9 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor for (id = s.data; id < s.data + s.nr; id++) { ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); - if (ret) { - bch_err_msg(c, ret, "deleting subvolume %u", *id); + bch_err_msg(c, ret, "deleting subvolume %u", *id); + if (ret) break; - } } darray_exit(&s); diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 2d2e66a4e468..ae644adfc391 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -20,7 +20,11 @@ struct snapshot_t { }; struct snapshot_table { +#ifndef RUST_BINDGEN DECLARE_FLEX_ARRAY(struct snapshot_t, s); +#else + struct snapshot_t s[0]; +#endif }; typedef struct { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 4c98d8cc2a79..6d3db5cce5f6 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -30,14 +30,12 @@ static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { struct bch2_metadata_version { u16 version; const char *name; - u64 recovery_passes; }; static const struct bch2_metadata_version bch2_metadata_versions[] = { -#define x(n, v, _recovery_passes) { \ +#define x(n, v) { \ .version = v, \ .name = #n, \ - .recovery_passes = _recovery_passes, \ }, BCH_METADATA_VERSIONS() #undef x @@ -70,24 +68,6 @@ unsigned bch2_latest_compatible_version(unsigned v) return v; } -u64 bch2_upgrade_recovery_passes(struct bch_fs *c, - unsigned old_version, - unsigned new_version) -{ - u64 ret = 0; - - for (const struct bch2_metadata_version *i = bch2_metadata_versions; - i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); - i++) - if (i->version > old_version && i->version <= new_version) { - if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK) - ret |= bch2_fsck_recovery_passes(); - ret |= i->recovery_passes; - } - - return ret &= ~RECOVERY_PASS_ALL_FSCK; -} - const char * const bch2_sb_fields[] = { #define x(name, nr) #name, BCH_SB_FIELDS() @@ -101,8 +81,6 @@ static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, enum bch_sb_field_type type) { - struct bch_sb_field *f; - /* XXX: need locking around superblock to access optional fields */ vstruct_for_each(sb, 
f) @@ -164,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb, void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); - if (!IS_ERR_OR_NULL(sb->bdev)) - blkdev_put(sb->bdev, sb->holder); + if (!IS_ERR_OR_NULL(sb->bdev_handle)) + bdev_release(sb->bdev_handle); kfree(sb->holder); kfree(sb->sb_name); @@ -192,8 +170,12 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; if (new_bytes > max_bytes) { - pr_err("%pg: superblock too big: want %zu but have %llu", - sb->bdev, new_bytes, max_bytes); + struct printbuf buf = PRINTBUF; + + prt_bdevname(&buf, sb->bdev); + prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes); + pr_err("%s", buf.buf); + printbuf_exit(&buf); return -BCH_ERR_ENOSPC_sb; } } @@ -241,14 +223,12 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, if (sb->fs_sb) { struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); - struct bch_dev *ca; - unsigned i; lockdep_assert_held(&c->sb_lock); /* XXX: we're not checking that offline device have enough space */ - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { struct bch_sb_handle *dev_sb = &ca->disk_sb; if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { @@ -368,7 +348,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, int rw) { struct bch_sb *sb = disk_sb->sb; - struct bch_sb_field *f; struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; u16 block_size; @@ -514,8 +493,6 @@ static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned static void bch2_sb_update(struct bch_fs *c) { struct bch_sb *src = c->disk_sb.sb; - struct bch_dev *ca; - unsigned i; lockdep_assert_held(&c->sb_lock); @@ -546,7 +523,7 @@ static void bch2_sb_update(struct bch_fs *c) le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, sizeof(c->sb.errors_silent) * 8); - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); ca->mi = bch2_mi_to_cpu(&m); } @@ -571,6 +548,7 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) dst->time_base_lo = src->time_base_lo; dst->time_base_hi = src->time_base_hi; dst->time_precision = src->time_precision; + dst->write_time = src->write_time; memcpy(dst->flags, src->flags, sizeof(dst->flags)); memcpy(dst->features, src->features, sizeof(dst->features)); @@ -634,7 +612,6 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) { - struct bch_csum csum; size_t bytes; int ret; reread: @@ -650,7 +627,9 @@ reread: if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { - prt_printf(err, "Not a bcachefs superblock"); + prt_str(err, "Not a bcachefs superblock (got magic "); + pr_uuid(err, sb->sb->magic.b); + prt_str(err, ")"); return -BCH_ERR_invalid_sb_magic; } @@ -673,17 +652,16 @@ reread: goto reread; } - if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { + enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb); + if (csum_type >= BCH_CSUM_NR) { prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); return -BCH_ERR_invalid_sb_csum_type; } /* XXX: verify MACs */ - csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), - null_nonce(), sb->sb); - + struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb); if (bch2_crc_cmp(csum, 
sb->sb->csum)) { - prt_printf(err, "bad checksum"); + bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum); return -BCH_ERR_invalid_sb_csum; } @@ -692,12 +670,13 @@ reread: return 0; } -int bch2_read_super(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb) +static int __bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb, bool ignore_notbchfs_msg) { u64 offset = opt_get(*opts, sb); struct bch_sb_layout layout; struct printbuf err = PRINTBUF; + struct printbuf err2 = PRINTBUF; __le64 *i; int ret; #ifndef __KERNEL__ @@ -725,21 +704,22 @@ retry: if (!opt_get(*opts, nochanges)) sb->mode |= BLK_OPEN_WRITE; - sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (IS_ERR(sb->bdev) && - PTR_ERR(sb->bdev) == -EACCES && + sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (IS_ERR(sb->bdev_handle) && + PTR_ERR(sb->bdev_handle) == -EACCES && opt_get(*opts, read_only)) { sb->mode &= ~BLK_OPEN_WRITE; - sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (!IS_ERR(sb->bdev)) + sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (!IS_ERR(sb->bdev_handle)) opt_set(*opts, nochanges, true); } - if (IS_ERR(sb->bdev)) { - ret = PTR_ERR(sb->bdev); + if (IS_ERR(sb->bdev_handle)) { + ret = PTR_ERR(sb->bdev_handle); goto out; } + sb->bdev = sb->bdev_handle->bdev; ret = bch2_sb_realloc(sb, 0); if (ret) { @@ -760,8 +740,14 @@ retry: if (opt_defined(*opts, sb)) goto err; - printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n", + prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", path, err.buf); + if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) + printk(KERN_INFO "%s", err2.buf); + else + printk(KERN_ERR "%s", err2.buf); + + printbuf_exit(&err2); printbuf_reset(&err); /* @@ -837,6 +823,20 @@ err_no_print: goto out; } +int bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) +{ + return __bch2_read_super(path, opts, sb, false); +} + +/* provide a silenced version for mount.bcachefs */ + +int bch2_read_super_silent(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) +{ + return __bch2_read_super(path, opts, sb, true); +} + /* write superblock: */ static void write_super_endio(struct bio *bio) @@ -905,9 +905,8 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) int bch2_write_super(struct bch_fs *c) { struct closure *cl = &c->sb_write; - struct bch_dev *ca; struct printbuf err = PRINTBUF; - unsigned i, sb = 0, nr_wrote; + unsigned sb = 0, nr_wrote; struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; @@ -929,9 +928,14 @@ int bch2_write_super(struct bch_fs *c) le64_add_cpu(&c->disk_sb.sb->seq, 1); - if (test_bit(BCH_FS_ERROR, &c->flags)) + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + for_each_online_member(c, ca) + __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq; + c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds()); + + if (test_bit(BCH_FS_error, &c->flags)) SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); - if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) + if (test_bit(BCH_FS_topology_error, &c->flags)) SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); @@ -942,10 
+946,10 @@ int bch2_write_super(struct bch_fs *c) bch2_sb_errors_from_cpu(c); bch2_sb_downgrade_update(c); - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) bch2_sb_from_fs(c, ca); - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { printbuf_reset(&err); ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); @@ -966,16 +970,28 @@ int bch2_write_super(struct bch_fs *c) if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) goto out; - for_each_online_member(ca, c, i) { + if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "attempting to write superblock that wasn't version downgraded ("); + bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version)); + prt_str(&buf, " > "); + bch2_version_to_text(&buf, bcachefs_metadata_version_current); + prt_str(&buf, ")"); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + return -BCH_ERR_sb_not_downgraded; + } + + for_each_online_member(c, ca) { __set_bit(ca->dev_idx, sb_written.d); ca->sb_write_error = 0; } - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) read_back_super(c, ca); closure_sync(cl); - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { if (ca->sb_write_error) continue; @@ -1002,7 +1018,7 @@ int bch2_write_super(struct bch_fs *c) do { wrote = false; - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) if (!ca->sb_write_error && sb < ca->disk_sb.sb->layout.nr_superblocks) { write_one_super(c, ca, sb); @@ -1012,7 +1028,7 @@ int bch2_write_super(struct bch_fs *c) sb++; } while (wrote); - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { if (ca->sb_write_error) __clear_bit(ca->dev_idx, sb_written.d); else @@ -1024,7 +1040,7 @@ int bch2_write_super(struct bch_fs *c) can_mount_with_written = bch2_have_enough_devs(c, sb_written, degraded_flags, false); - for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) + for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++) sb_written.d[i] = ~sb_written.d[i]; can_mount_without_written = @@ -1073,13 +1089,22 @@ bool bch2_check_version_downgrade(struct bch_fs *c) /* * Downgrade, if superblock is at a higher version than currently * supported: + * + * c->sb will be checked before we write the superblock, so update it as + * well: */ - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) { SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - if (c->sb.version > bcachefs_metadata_version_current) + c->sb.version_upgrade_complete = bcachefs_metadata_version_current; + } + if (c->sb.version > bcachefs_metadata_version_current) { c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - if (c->sb.version_min > bcachefs_metadata_version_current) + c->sb.version = bcachefs_metadata_version_current; + } + if (c->sb.version_min > bcachefs_metadata_version_current) { c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); + c->sb.version_min = bcachefs_metadata_version_current; + } c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); return ret; } @@ -1172,8 +1197,8 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, return ret; } -void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) +void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + struct 
bch_sb_field *f) { unsigned type = le32_to_cpu(f->type); const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); @@ -1181,6 +1206,15 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); + if (ops->to_text) + ops->to_text(out, sb, f); +} + +void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + unsigned type = le32_to_cpu(f->type); + if (type < BCH_SB_FIELD_NR) prt_printf(out, "%s", bch2_sb_fields[type]); else @@ -1189,11 +1223,7 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, " (size %zu):", vstruct_bytes(f)); prt_newline(out); - if (ops->to_text) { - printbuf_indent_add(out, 2); - ops->to_text(out, sb, f); - printbuf_indent_sub(out, 2); - } + __bch2_sb_field_to_text(out, sb, f); } void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) @@ -1222,7 +1252,6 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bool print_layout, unsigned fields) { - struct bch_sb_field *f; u64 fields_have = 0; unsigned nr_devices = 0; @@ -1242,6 +1271,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, pr_uuid(out, sb->uuid.b); prt_newline(out); + prt_printf(out, "Magic number:"); + prt_tab(out); + pr_uuid(out, sb->magic.b); + prt_newline(out); + prt_str(out, "Device index:"); prt_tab(out); prt_printf(out, "%u", sb->dev_idx); @@ -1280,6 +1314,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "%llu", le64_to_cpu(sb->seq)); prt_newline(out); + prt_printf(out, "Time of last write:"); + prt_tab(out); + bch2_prt_datetime(out, le64_to_cpu(sb->write_time)); + prt_newline(out); + prt_printf(out, "Superblock size:"); prt_tab(out); prt_printf(out, "%zu", vstruct_bytes(sb)); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index e41e5de531a0..95e80e06316b 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -19,10 +19,6 @@ static inline bool bch2_version_compatible(u16 version) void bch2_version_to_text(struct printbuf *, unsigned); unsigned bch2_latest_compatible_version(unsigned); -u64 bch2_upgrade_recovery_passes(struct bch_fs *c, - unsigned, - unsigned); - static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) { return le32_to_cpu(f->u64s) * sizeof(u64); @@ -84,6 +80,7 @@ void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); +int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); void __bch2_check_set_feature(struct bch_fs *, unsigned); @@ -96,6 +93,8 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) bool bch2_check_version_downgrade(struct bch_fs *); void bch2_sb_upgrade(struct bch_fs *, unsigned); +void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 818ec467a06b..9dbc35940197 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -79,6 +79,36 @@ MODULE_SOFTDEP("pre: chacha20"); MODULE_SOFTDEP("pre: poly1305"); MODULE_SOFTDEP("pre: xxhash"); +const char * const bch2_fs_flag_strs[] = 
{ +#define x(n) #n, + BCH_FS_FLAGS() +#undef x + NULL +}; + +void __bch2_print(struct bch_fs *c, const char *fmt, ...) +{ + struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); + + va_list args; + va_start(args, fmt); + if (likely(!stdio)) { + vprintk(fmt, args); + } else { + unsigned long flags; + + if (fmt[0] == KERN_SOH[0]) + fmt += 2; + + spin_lock_irqsave(&stdio->output_lock, flags); + prt_vprintf(&stdio->output_buf, fmt, args); + spin_unlock_irqrestore(&stdio->output_lock, flags); + + wake_up(&stdio->output_wait); + } + va_end(args); +} + #define KTYPE(type) \ static const struct attribute_group type ## _group = { \ .attrs = type ## _files \ @@ -134,14 +164,12 @@ static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); struct bch_fs *bch2_dev_to_fs(dev_t dev) { struct bch_fs *c; - struct bch_dev *ca; - unsigned i; mutex_lock(&bch_fs_list_lock); rcu_read_lock(); list_for_each_entry(c, &bch_fs_list, list) - for_each_member_device_rcu(ca, c, i, NULL) + for_each_member_device_rcu(c, ca, NULL) if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { closure_get(&c->cl); goto found; @@ -182,14 +210,13 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) static void bch2_dev_usage_journal_reserve(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i, nr = 0, u64s = + unsigned nr = 0, u64s = ((sizeof(struct jset_entry_dev_usage) + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / sizeof(u64); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) + for_each_member_device_rcu(c, ca, NULL) nr++; rcu_read_unlock(); @@ -216,8 +243,7 @@ static void bch2_dev_usage_journal_reserve(struct bch_fs *c) static void __bch2_fs_read_only(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i, clean_passes = 0; + unsigned clean_passes = 0; u64 seq = 0; bch2_fs_ec_stop(c); @@ -246,14 +272,14 @@ static void __bch2_fs_read_only(struct bch_fs *c) journal_cur_seq(&c->journal)); if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + !test_bit(BCH_FS_emergency_ro, &c->flags)) + set_bit(BCH_FS_clean_shutdown, &c->flags); bch2_fs_journal_stop(&c->journal); /* * After stopping journal: */ - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) bch2_dev_allocator_remove(c, ca); } @@ -262,25 +288,27 @@ static void bch2_writes_disabled(struct percpu_ref *writes) { struct bch_fs *c = container_of(writes, struct bch_fs, writes); - set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); } #endif void bch2_fs_read_only(struct bch_fs *c) { - if (!test_bit(BCH_FS_RW, &c->flags)) { + if (!test_bit(BCH_FS_rw, &c->flags)) { bch2_journal_reclaim_stop(&c->journal); return; } - BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags)); + + bch_verbose(c, "going read-only"); /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: */ - set_bit(BCH_FS_GOING_RO, &c->flags); + set_bit(BCH_FS_going_ro, &c->flags); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_kill(&c->writes); #else @@ -300,33 +328,42 @@ void bch2_fs_read_only(struct bch_fs *c) * that going RO is complete: */ wait_event(bch2_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || - test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + test_bit(BCH_FS_write_disable_complete, &c->flags) || + test_bit(BCH_FS_emergency_ro, &c->flags)); + + bool 
writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags); + if (writes_disabled) + bch_verbose(c, "finished waiting for writes to stop"); __bch2_fs_read_only(c); wait_event(bch2_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + test_bit(BCH_FS_write_disable_complete, &c->flags)); + + if (!writes_disabled) + bch_verbose(c, "finished waiting for writes to stop"); - clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - clear_bit(BCH_FS_GOING_RO, &c->flags); + clear_bit(BCH_FS_write_disable_complete, &c->flags); + clear_bit(BCH_FS_going_ro, &c->flags); + clear_bit(BCH_FS_rw, &c->flags); if (!bch2_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && - test_bit(BCH_FS_STARTED, &c->flags) && - test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && + !test_bit(BCH_FS_error, &c->flags) && + !test_bit(BCH_FS_emergency_ro, &c->flags) && + test_bit(BCH_FS_started, &c->flags) && + test_bit(BCH_FS_clean_shutdown, &c->flags) && !c->opts.norecovery) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_read(&c->btree_cache.dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); - BUG_ON(c->btree_write_buffer.state.nr); + BUG_ON(c->btree_write_buffer.inc.keys.nr); + BUG_ON(c->btree_write_buffer.flushing.keys.nr); bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); + } else { + bch_verbose(c, "done going read-only, filesystem not clean"); } - - clear_bit(BCH_FS_RW, &c->flags); } static void bch2_fs_read_only_work(struct work_struct *work) @@ -346,7 +383,7 @@ static void bch2_fs_read_only_async(struct bch_fs *c) bool bch2_fs_emergency_read_only(struct bch_fs *c) { - bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); bch2_journal_halt(&c->journal); bch2_fs_read_only_async(c); @@ -383,28 +420,16 @@ static int bch2_fs_read_write_late(struct bch_fs *c) static int __bch2_fs_read_write(struct bch_fs *c, bool early) { - struct bch_dev *ca; - unsigned i; int ret; - if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); return -BCH_ERR_erofs_unfixed_errors; } - if (test_bit(BCH_FS_RW, &c->flags)) + if (test_bit(BCH_FS_rw, &c->flags)) return 0; - if (c->opts.norecovery) - return -BCH_ERR_erofs_norecovery; - - /* - * nochanges is used for fsck -n mode - we have to allow going rw - * during recovery for that to work: - */ - if (c->opts.nochanges && (!early || c->opts.read_only)) - return -BCH_ERR_erofs_nochanges; - bch_info(c, "going read-write"); ret = bch2_sb_members_v2_init(c); @@ -415,7 +440,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; - clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + clear_bit(BCH_FS_clean_shutdown, &c->flags); /* * First journal write must be a flush write: after a clean shutdown we @@ -425,17 +450,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) */ set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); - for_each_rw_member(ca, c, i) + for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - set_bit(BCH_FS_RW, &c->flags); - set_bit(BCH_FS_WAS_RW, &c->flags); + set_bit(BCH_FS_rw, &c->flags); + set_bit(BCH_FS_was_rw, &c->flags); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_reinit(&c->writes); #else - for (i = 0; i < BCH_WRITE_REF_NR; i++) { + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { 
BUG_ON(atomic_long_read(&c->writes[i])); atomic_long_inc(&c->writes[i]); } @@ -463,7 +488,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_do_pending_node_rewrites(c); return 0; err: - if (test_bit(BCH_FS_RW, &c->flags)) + if (test_bit(BCH_FS_rw, &c->flags)) bch2_fs_read_only(c); else __bch2_fs_read_only(c); @@ -472,6 +497,12 @@ err: int bch2_fs_read_write(struct bch_fs *c) { + if (c->opts.norecovery) + return -BCH_ERR_erofs_norecovery; + + if (c->opts.nochanges) + return -BCH_ERR_erofs_nochanges; + return __bch2_fs_read_write(c, false); } @@ -558,12 +589,9 @@ static void bch2_fs_release(struct kobject *kobj) void __bch2_fs_stop(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - bch_verbose(c, "shutting down"); - set_bit(BCH_FS_STOPPING, &c->flags); + set_bit(BCH_FS_stopping, &c->flags); cancel_work_sync(&c->journal_seq_blacklist_gc_work); @@ -571,7 +599,7 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_read_only(c); up_write(&c->state_lock); - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); @@ -582,6 +610,9 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_debug_exit(c); bch2_fs_chardev_exit(c); + bch2_ro_ref_put(c); + wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref)); + kobject_put(&c->counters_kobj); kobject_put(&c->time_stats); kobject_put(&c->opts_dir); @@ -590,7 +621,7 @@ void __bch2_fs_stop(struct bch_fs *c) /* btree prefetch might have kicked off reads in the background: */ bch2_btree_flush_all_reads(c); - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) cancel_work_sync(&ca->io_error_work); cancel_work_sync(&c->read_only_work); @@ -629,8 +660,6 @@ void bch2_fs_stop(struct bch_fs *c) static int bch2_fs_online(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; int ret = 0; lockdep_assert_held(&bch_fs_list_lock); @@ -651,7 +680,9 @@ static int bch2_fs_online(struct bch_fs *c) ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: +#endif kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: bch2_opts_create_sysfs_files(&c->opts_dir); if (ret) { @@ -661,7 +692,7 @@ static int bch2_fs_online(struct bch_fs *c) down_write(&c->state_lock); - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { ret = bch2_dev_sysfs_online(c, ca); if (ret) { bch_err(c, "error creating sysfs objects"); @@ -690,6 +721,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto out; } + c->stdio = (void *)(unsigned long) opts.stdio; + __module_get(THIS_MODULE); closure_init(&c->cl, NULL); @@ -710,6 +743,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); + refcount_set(&c->ro_ref, 1); + init_waitqueue_head(&c->ro_ref_wait); + sema_init(&c->online_fsck_mutex, 1); + init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); atomic_set(&c->journal_keys.ref, 1); @@ -763,7 +800,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; - c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; 
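The ro_ref machinery added above follows a common refcount-plus-waitqueue drain pattern: bch2_fs_alloc() starts c->ro_ref at 1, and __bch2_fs_stop() drops that initial reference and then sleeps on c->ro_ref_wait until every remaining holder has put theirs before continuing to tear down the kobjects. Below is a minimal userspace sketch of that drain pattern only, assuming C11 atomics and pthreads as stand-ins for refcount_t and the kernel waitqueue; the obj_get/obj_put/worker names are invented for the example and are not bcachefs APIs.

/* Sketch: one initial reference, teardown drops it and waits for zero. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

struct obj {
	atomic_int	ref;		/* plays the role of c->ro_ref */
	pthread_mutex_t	lock;
	pthread_cond_t	ref_wait;	/* plays the role of c->ro_ref_wait */
};

static void obj_get(struct obj *o)
{
	atomic_fetch_add(&o->ref, 1);
}

static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->ref, 1) == 1) {	/* that was the last ref */
		pthread_mutex_lock(&o->lock);
		pthread_cond_broadcast(&o->ref_wait);	/* wake the teardown waiter */
		pthread_mutex_unlock(&o->lock);
	}
}

static void *worker(void *p)
{
	struct obj *o = p;

	usleep(10000);		/* pretend to use the object for a while */
	obj_put(o);
	return NULL;
}

int main(void)
{
	struct obj o = {
		.ref		= 1,	/* initial ref, like refcount_set(&c->ro_ref, 1) */
		.lock		= PTHREAD_MUTEX_INITIALIZER,
		.ref_wait	= PTHREAD_COND_INITIALIZER,
	};
	pthread_t t;

	obj_get(&o);			/* a late user takes a reference */
	pthread_create(&t, NULL, worker, &o);

	/* teardown: drop the initial ref, then wait for the count to drain,
	 * mirroring bch2_ro_ref_put() + wait_event(c->ro_ref_wait, ...) */
	obj_put(&o);
	pthread_mutex_lock(&o.lock);
	while (atomic_load(&o.ref))
		pthread_cond_wait(&o.ref_wait, &o.lock);
	pthread_mutex_unlock(&o.lock);

	pthread_join(t, NULL);
	printf("all references dropped, safe to free\n");
	return 0;
}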
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; bch2_fs_btree_cache_init_early(&c->btree_cache); @@ -832,7 +868,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", - WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || + WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG @@ -946,16 +982,14 @@ static void print_mount_opts(struct bch_fs *c) int bch2_fs_start(struct bch_fs *c) { - struct bch_dev *ca; time64_t now = ktime_get_real_seconds(); - unsigned i; int ret; print_mount_opts(c); down_write(&c->state_lock); - BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); + BUG_ON(test_bit(BCH_FS_started, &c->flags)); mutex_lock(&c->sb_lock); @@ -965,12 +999,12 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - for_each_online_member(ca, c, i) - bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now); + for_each_online_member(c, ca) + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); mutex_unlock(&c->sb_lock); - for_each_rw_member(ca, c, i) + for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); @@ -990,12 +1024,12 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - set_bit(BCH_FS_STARTED, &c->flags); + set_bit(BCH_FS_started, &c->flags); - if (c->opts.read_only || c->opts.nochanges) { + if (c->opts.read_only) { bch2_fs_read_only(c); } else { - ret = !test_bit(BCH_FS_RW, &c->flags) + ret = !test_bit(BCH_FS_rw, &c->flags) ? bch2_fs_read_write(c) : bch2_fs_read_write_late(c); if (ret) @@ -1003,12 +1037,13 @@ int bch2_fs_start(struct bch_fs *c) } ret = 0; -out: +err: + if (ret) + bch_err_msg(c, ret, "starting filesystem"); + else + bch_verbose(c, "done starting filesystem"); up_write(&c->state_lock); return ret; -err: - bch_err_msg(c, ret, "starting filesystem"); - goto out; } static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) @@ -1025,20 +1060,83 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) return 0; } -static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) +static int bch2_dev_in_fs(struct bch_sb_handle *fs, + struct bch_sb_handle *sb) { - struct bch_sb *newest = - le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? 
fs : sb; + if (fs == sb) + return 0; - if (!uuid_equal(&fs->uuid, &sb->uuid)) + if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) return -BCH_ERR_device_not_a_member_of_filesystem; - if (!bch2_dev_exists(newest, sb->dev_idx)) + if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx)) return -BCH_ERR_device_has_been_removed; - if (fs->block_size != sb->block_size) + if (fs->sb->block_size != sb->sb->block_size) return -BCH_ERR_mismatched_block_size; + if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq || + le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq) + return 0; + + if (fs->sb->seq == sb->sb->seq && + fs->sb->write_time != sb->sb->write_time) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Split brain detected between "); + prt_bdevname(&buf, sb->bdev); + prt_str(&buf, " and "); + prt_bdevname(&buf, fs->bdev); + prt_char(&buf, ':'); + prt_newline(&buf); + prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq)); + prt_newline(&buf); + + prt_bdevname(&buf, fs->bdev); + prt_char(&buf, ' '); + bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));; + prt_newline(&buf); + + prt_bdevname(&buf, sb->bdev); + prt_char(&buf, ' '); + bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; + prt_newline(&buf); + + prt_printf(&buf, "Not using older sb"); + + pr_err("%s", buf.buf); + printbuf_exit(&buf); + return -BCH_ERR_device_splitbrain; + } + + struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); + u64 seq_from_fs = le64_to_cpu(m.seq); + u64 seq_from_member = le64_to_cpu(sb->sb->seq); + + if (seq_from_fs && seq_from_fs < seq_from_member) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Split brain detected between "); + prt_bdevname(&buf, sb->bdev); + prt_str(&buf, " and "); + prt_bdevname(&buf, fs->bdev); + prt_char(&buf, ':'); + prt_newline(&buf); + + prt_bdevname(&buf, fs->bdev); + prt_str(&buf, "believes seq of "); + prt_bdevname(&buf, sb->bdev); + prt_printf(&buf, " to be %llu, but ", seq_from_fs); + prt_bdevname(&buf, sb->bdev); + prt_printf(&buf, " has %llu\n", seq_from_member); + prt_str(&buf, "Not using "); + prt_bdevname(&buf, sb->bdev); + + pr_err("%s", buf.buf); + printbuf_exit(&buf); + return -BCH_ERR_device_splitbrain; + } + return 0; } @@ -1284,9 +1382,14 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) bch2_dev_sysfs_online(c, ca); + struct printbuf name = PRINTBUF; + prt_bdevname(&name, ca->disk_sb.bdev); + if (c->sb.nr_devices == 1) - snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev); - snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev); + strlcpy(c->name, name.buf, sizeof(c->name)); + strlcpy(ca->name, name.buf, sizeof(ca->name)); + + printbuf_exit(&name); rebalance_wakeup(c); return 0; @@ -1307,8 +1410,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_devs_mask new_online_devs; - struct bch_dev *ca2; - int i, nr_rw = 0, required; + int nr_rw = 0, required; lockdep_assert_held(&c->state_lock); @@ -1320,7 +1422,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, return true; /* do we have enough devices to write to? 
*/ - for_each_member_device(ca2, c, i) + for_each_member_device(c, ca2) if (ca2 != ca) nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; @@ -1468,9 +1570,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) BTREE_TRIGGER_NORUN, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, BTREE_TRIGGER_NORUN, NULL); - if (ret) - bch_err_msg(c, ret, "removing dev alloc info"); - + bch_err_msg(c, ret, "removing dev alloc info"); return ret; } @@ -1497,34 +1597,29 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); ret = bch2_dev_data_drop(c, ca->dev_idx, flags); - if (ret) { - bch_err_msg(ca, ret, "dropping data"); + bch_err_msg(ca, ret, "dropping data"); + if (ret) goto err; - } ret = bch2_dev_remove_alloc(c, ca); - if (ret) { - bch_err_msg(ca, ret, "deleting alloc info"); + bch_err_msg(ca, ret, "deleting alloc info"); + if (ret) goto err; - } ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); - if (ret) { - bch_err_msg(ca, ret, "flushing journal"); + bch_err_msg(ca, ret, "flushing journal"); + if (ret) goto err; - } ret = bch2_journal_flush(&c->journal); - if (ret) { - bch_err(ca, "journal error"); + bch_err(ca, "journal error"); + if (ret) goto err; - } ret = bch2_replicas_gc2(c); - if (ret) { - bch_err_msg(ca, ret, "in replicas_gc2()"); + bch_err_msg(ca, ret, "in replicas_gc2()"); + if (ret) goto err; - } data = bch2_dev_has_data(c, ca); if (data) { @@ -1596,10 +1691,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path) int ret; ret = bch2_read_super(path, &opts, &sb); - if (ret) { - bch_err_msg(c, ret, "reading super"); + bch_err_msg(c, ret, "reading super"); + if (ret) goto err; - } dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); @@ -1612,10 +1706,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) } ret = bch2_dev_may_add(sb.sb, c); - if (ret) { - bch_err_fn(c, ret); + if (ret) goto err; - } ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { @@ -1630,19 +1722,17 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err; ret = bch2_dev_journal_alloc(ca); - if (ret) { - bch_err_msg(c, ret, "allocating journal"); + bch_err_msg(c, ret, "allocating journal"); + if (ret) goto err; - } down_write(&c->state_lock); mutex_lock(&c->sb_lock); ret = bch2_sb_from_fs(c, ca); - if (ret) { - bch_err_msg(c, ret, "setting up new superblock"); + bch_err_msg(c, ret, "setting up new superblock"); + if (ret) goto err_unlock; - } if (dynamic_fault("bcachefs:add:no_slot")) goto no_slot; @@ -1681,10 +1771,9 @@ have_slot: if (BCH_MEMBER_GROUP(&dev_mi)) { ret = __bch2_dev_group_set(c, ca, label.buf); - if (ret) { - bch_err_msg(c, ret, "creating new label"); + bch_err_msg(c, ret, "creating new label"); + if (ret) goto err_unlock; - } } bch2_write_super(c); @@ -1693,16 +1782,14 @@ have_slot: bch2_dev_usage_journal_reserve(c); ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) { - bch_err_msg(ca, ret, "marking new superblock"); + bch_err_msg(ca, ret, "marking new superblock"); + if (ret) goto err_late; - } ret = bch2_fs_freespace_init(c); - if (ret) { - bch_err_msg(ca, ret, "initializing free space"); + bch_err_msg(ca, ret, "initializing free space"); + if (ret) goto err_late; - } ca->new_fs_bucket_idx = 0; @@ -1721,6 +1808,7 @@ err: bch2_free_super(&sb); printbuf_exit(&label); printbuf_exit(&errbuf); + bch_err_fn(c, ret); return ret; err_late: up_write(&c->state_lock); @@ -1747,11 +1835,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; - ret = bch2_dev_in_fs(c->disk_sb.sb, 
sb.sb); - if (ret) { - bch_err_msg(c, ret, "bringing %s online", path); + ret = bch2_dev_in_fs(&c->disk_sb, &sb); + bch_err_msg(c, ret, "bringing %s online", path); + if (ret) goto err; - } ret = bch2_dev_attach_bdev(c, &sb); if (ret) @@ -1760,10 +1847,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ca = bch_dev_locked(c, dev_idx); ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) { - bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); + bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); + if (ret) goto err; - } if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); @@ -1842,10 +1928,9 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) } ret = bch2_dev_buckets_resize(c, ca, nbuckets); - if (ret) { - bch_err_msg(ca, ret, "resizing buckets"); + bch_err_msg(ca, ret, "resizing buckets"); + if (ret) goto err; - } ret = bch2_trans_mark_dev_sb(c, ca); if (ret) @@ -1879,28 +1964,30 @@ err: /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { - struct bch_dev *ca; - unsigned i; - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) - if (!strcmp(name, ca->name)) - goto found; - ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); -found: + for_each_member_device_rcu(c, ca, NULL) + if (!strcmp(name, ca->name)) { + rcu_read_unlock(); + return ca; + } rcu_read_unlock(); - - return ca; + return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); } /* Filesystem open: */ +static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) +{ + return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?: + cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); +} + struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_opts opts) { DARRAY(struct bch_sb_handle) sbs = { 0 }; struct bch_fs *c = NULL; - struct bch_sb_handle *sb, *best = NULL; + struct bch_sb_handle *best = NULL; struct printbuf errbuf = PRINTBUF; int ret = 0; @@ -1926,20 +2013,27 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, BUG_ON(darray_push(&sbs, sb)); } + if (opts.nochanges && !opts.read_only) { + ret = -BCH_ERR_erofs_nochanges; + goto err_print; + } + darray_for_each(sbs, sb) - if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq)) + if (!best || sb_cmp(sb->sb, best->sb) > 0) best = sb; darray_for_each_reverse(sbs, sb) { - if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) { - pr_info("%pg has been removed, skipping", sb->bdev); + ret = bch2_dev_in_fs(best, sb); + + if (ret == -BCH_ERR_device_has_been_removed || + ret == -BCH_ERR_device_splitbrain) { bch2_free_super(sb); darray_remove_item(&sbs, sb); best -= best > sb; + ret = 0; continue; } - ret = bch2_dev_in_fs(best->sb, sb->sb); if (ret) goto err_print; } diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index bf762df18012..dada09331d2e 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -8,6 +8,8 @@ #include <linux/math64.h> +extern const char * const bch2_fs_flag_strs[]; + struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); @@ -37,8 +39,8 @@ int bch2_fs_read_write_early(struct bch_fs *); */ static inline void bch2_fs_lazy_rw(struct bch_fs *c) { - if (!test_bit(BCH_FS_RW, &c->flags) && - !test_bit(BCH_FS_WAS_RW, &c->flags)) + if (!test_bit(BCH_FS_rw, &c->flags) && + !test_bit(BCH_FS_was_rw, &c->flags)) bch2_fs_read_write_early(c); } diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 
9c1fd4ca2b10..0e5a14fc8e7f 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -4,6 +4,7 @@ struct bch_sb_handle { struct bch_sb *sb; + struct bdev_handle *bdev_handle; struct block_device *bdev; char *sb_name; struct bio *bio; @@ -22,7 +23,7 @@ struct bch_devs_mask { struct bch_devs_list { u8 nr; - u8 devs[BCH_BKEY_PTRS_MAX]; + u8 data[BCH_BKEY_PTRS_MAX]; }; struct bch_member_cpu { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index f3cb7115b530..8ed52319ff68 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -145,6 +145,7 @@ rw_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); +read_attribute(flags); read_attribute(bucket_size); read_attribute(first_bucket); read_attribute(nbuckets); @@ -255,19 +256,18 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; enum btree_id id; - u64 nr_uncompressed_extents = 0, - nr_compressed_extents = 0, - nr_incompressible_extents = 0, - uncompressed_sectors = 0, - incompressible_sectors = 0, - compressed_sectors_compressed = 0, - compressed_sectors_uncompressed = 0; + struct compression_type_stats { + u64 nr_extents; + u64 sectors_compressed; + u64 sectors_uncompressed; + } s[BCH_COMPRESSION_TYPE_NR]; + u64 compressed_incompressible = 0; int ret = 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) + memset(s, 0, sizeof(s)); + + if (!test_bit(BCH_FS_started, &c->flags)) return -EPERM; trans = bch2_trans_get(c); @@ -276,39 +276,33 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (!btree_type_has_ptrs(id)) continue; - ret = for_each_btree_key2(trans, iter, id, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + ret = for_each_btree_key(trans, iter, id, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS, k, ({ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bool compressed = false, uncompressed = false, incompressible = false; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - switch (p.crc.compression_type) { - case BCH_COMPRESSION_TYPE_none: - uncompressed = true; - uncompressed_sectors += k.k->size; - break; - case BCH_COMPRESSION_TYPE_incompressible: - incompressible = true; - incompressible_sectors += k.k->size; - break; - default: - compressed_sectors_compressed += - p.crc.compressed_size; - compressed_sectors_uncompressed += - p.crc.uncompressed_size; - compressed = true; - break; + bool compressed = false, incompressible = false; + + bkey_for_each_crc(k.k, ptrs, crc, entry) { + incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible; + compressed |= crc_is_compressed(crc); + + if (crc_is_compressed(crc)) { + s[crc.compression_type].nr_extents++; + s[crc.compression_type].sectors_compressed += crc.compressed_size; + s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size; } } - if (incompressible) - nr_incompressible_extents++; - else if (uncompressed) - nr_uncompressed_extents++; - else if (compressed) - nr_compressed_extents++; + compressed_incompressible += compressed && incompressible; + + if (!compressed) { + unsigned t = incompressible ? 
BCH_COMPRESSION_TYPE_incompressible : 0; + + s[t].nr_extents++; + s[t].sectors_compressed += k.k->size; + s[t].sectors_uncompressed += k.k->size; + } 0; })); } @@ -318,26 +312,45 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (ret) return ret; - prt_printf(out, "uncompressed:\n"); - prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents); - prt_printf(out, " size: "); - prt_human_readable_u64(out, uncompressed_sectors << 9); - prt_printf(out, "\n"); + prt_str(out, "type"); + printbuf_tabstop_push(out, 12); + prt_tab(out); - prt_printf(out, "compressed:\n"); - prt_printf(out, " nr extents: %llu\n", nr_compressed_extents); - prt_printf(out, " compressed size: "); - prt_human_readable_u64(out, compressed_sectors_compressed << 9); - prt_printf(out, "\n"); - prt_printf(out, " uncompressed size: "); - prt_human_readable_u64(out, compressed_sectors_uncompressed << 9); - prt_printf(out, "\n"); + prt_str(out, "compressed"); + printbuf_tabstop_push(out, 16); + prt_tab_rjust(out); + + prt_str(out, "uncompressed"); + printbuf_tabstop_push(out, 16); + prt_tab_rjust(out); + + prt_str(out, "average extent size"); + printbuf_tabstop_push(out, 24); + prt_tab_rjust(out); + prt_newline(out); + + for (unsigned i = 0; i < ARRAY_SIZE(s); i++) { + prt_str(out, bch2_compression_types[i]); + prt_tab(out); + + prt_human_readable_u64(out, s[i].sectors_compressed << 9); + prt_tab_rjust(out); + + prt_human_readable_u64(out, s[i].sectors_uncompressed << 9); + prt_tab_rjust(out); + + prt_human_readable_u64(out, s[i].nr_extents + ? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents) + : 0); + prt_tab_rjust(out); + prt_newline(out); + } + + if (compressed_incompressible) { + prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible); + prt_newline(out); + } - prt_printf(out, "incompressible:\n"); - prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents); - prt_printf(out, " size: "); - prt_human_readable_u64(out, incompressible_sectors << 9); - prt_printf(out, "\n"); return 0; } @@ -370,6 +383,9 @@ SHOW(bch2_fs) sysfs_print(minor, c->minor); sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + if (attr == &sysfs_flags) + prt_bitflags(out, bch2_fs_flag_strs, c->flags); + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); if (attr == &sysfs_btree_write_stats) @@ -483,12 +499,12 @@ STORE(bch2_fs) /* Debugging: */ - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EPERM; /* Debugging: */ - if (!test_bit(BCH_FS_RW, &c->flags)) + if (!test_bit(BCH_FS_rw, &c->flags)) return -EROFS; if (attr == &sysfs_prune_cache) { @@ -620,6 +636,7 @@ STORE(bch2_fs_internal) SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { + &sysfs_flags, &sysfs_journal_debug, &sysfs_btree_updates, &sysfs_btree_cache, @@ -786,32 +803,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); - prt_tab(out); - prt_str(out, "buckets"); - prt_tab_rjust(out); - prt_str(out, "sectors"); - prt_tab_rjust(out); - prt_str(out, "fragmented"); - prt_tab_rjust(out); - prt_newline(out); - - for (i = 0; i < BCH_DATA_NR; i++) { - prt_str(out, bch2_data_types[i]); - prt_tab(out); - prt_u64(out, stats.d[i].buckets); - prt_tab_rjust(out); - prt_u64(out, stats.d[i].sectors); - prt_tab_rjust(out); - prt_u64(out, stats.d[i].fragmented); - prt_tab_rjust(out); - prt_newline(out); - } - - prt_str(out, "ec"); - prt_tab(out); - 
prt_u64(out, stats.buckets_ec); - prt_tab_rjust(out); - prt_newline(out); + bch2_dev_usage_to_text(out, &stats); prt_newline(out); diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 2fc9e60c754b..b3fe9fc57747 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -107,9 +107,6 @@ err: static int test_iterate(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; - struct bkey_s_c k; u64 i; int ret = 0; @@ -127,49 +124,43 @@ static int test_iterate(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); bch_err_msg(c, ret, "insert error"); if (ret) - goto err; + return ret; } pr_info("iterating forwards"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i++); - 0; - })); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(k.k->p.offset != i++); + 0; + }))); bch_err_msg(c, ret, "error iterating forwards"); if (ret) - goto err; + return ret; BUG_ON(i != nr); pr_info("iterating backwards"); - ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, - SPOS(0, U64_MAX, U32_MAX), 0, k, - ({ + ret = bch2_trans_run(c, + for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, + SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != --i); 0; - })); + }))); bch_err_msg(c, ret, "error iterating backwards"); if (ret) - goto err; + return ret; BUG_ON(i); -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; + return 0; } static int test_iterate_extents(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; - struct bkey_s_c k; u64 i; int ret = 0; @@ -188,51 +179,45 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); bch_err_msg(c, ret, "insert error"); if (ret) - goto err; + return ret; } pr_info("iterating forwards"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i); - i = k.k->p.offset; - 0; - })); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; + 0; + }))); bch_err_msg(c, ret, "error iterating forwards"); if (ret) - goto err; + return ret; BUG_ON(i != nr); pr_info("iterating backwards"); - ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, - SPOS(0, U64_MAX, U32_MAX), 0, k, - ({ + ret = bch2_trans_run(c, + for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, + SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); i = bkey_start_offset(k.k); 0; - })); + }))); bch_err_msg(c, ret, "error iterating backwards"); if (ret) - goto err; + return ret; BUG_ON(i); -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; + return 0; } static int test_iterate_slots(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; - struct bkey_s_c k; u64 i; int ret = 0; @@ -250,57 +235,48 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); bch_err_msg(c, ret, "insert error"); if (ret) - goto err; + return ret; } 
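The test conversions in this hunk and the ones that follow replace open-coded bch2_trans_get()/bch2_trans_put() pairs, explicit iterators and goto-based cleanup with bch2_trans_run(), which brings a transaction named trans into scope for the wrapped expression and releases it afterwards, so error paths can simply return ret. The sketch below shows only that wrap-an-expression-in-acquire/release shape, using a GNU statement expression as kernel code does; ctx_get/ctx_put/with_ctx are invented names for the example and this is not the bch2_trans_run() definition.

#include <stdio.h>
#include <stdlib.h>

struct ctx { int uses; };

static struct ctx *ctx_get(void)
{
	struct ctx *c = calloc(1, sizeof(*c));

	if (!c)
		abort();	/* keep the sketch simple */
	return c;
}

static void ctx_put(struct ctx *c)
{
	printf("context released after %d uses\n", c->uses);
	free(c);
}

/*
 * Evaluate _expr with a local "ctx" in scope and always release it; the
 * deliberately unhygienic "ctx" name mirrors how bch2_trans_run() provides
 * "trans" to the expression it wraps.
 */
#define with_ctx(_expr)						\
({								\
	struct ctx *ctx = ctx_get();				\
	int _ret = (_expr);					\
	ctx_put(ctx);						\
	_ret;							\
})

static int do_work(struct ctx *ctx, int n)
{
	ctx->uses++;
	return n < 0 ? -1 : 0;	/* pretend negative input is an error */
}

int main(void)
{
	/* The caller never sees the acquire/release pair and can just
	 * return the result, which is what lets the refactored tests drop
	 * their goto err / bch2_trans_put() cleanup blocks. */
	int ret = with_ctx(do_work(ctx, 42));

	printf("ret = %d\n", ret);
	return ret ? 1 : 0;
}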
pr_info("iterating forwards"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i); - i += 2; - 0; - })); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(k.k->p.offset != i); + i += 2; + 0; + }))); bch_err_msg(c, ret, "error iterating forwards"); if (ret) - goto err; + return ret; BUG_ON(i != nr * 2); pr_info("iterating forwards by slots"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ - if (i >= nr * 2) - break; + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i >= nr * 2) + break; - BUG_ON(k.k->p.offset != i); - BUG_ON(bkey_deleted(k.k) != (i & 1)); + BUG_ON(k.k->p.offset != i); + BUG_ON(bkey_deleted(k.k) != (i & 1)); - i++; - 0; - })); - if (ret < 0) { - bch_err_msg(c, ret, "error iterating forwards by slots"); - goto err; - } - ret = 0; -err: - bch2_trans_put(trans); + i++; + 0; + }))); + bch_err_msg(c, ret, "error iterating forwards by slots"); return ret; } static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; - struct bkey_s_c k; u64 i; int ret = 0; @@ -319,50 +295,45 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); bch_err_msg(c, ret, "insert error"); if (ret) - goto err; + return ret; } pr_info("iterating forwards"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i + 8); - BUG_ON(k.k->size != 8); - i += 16; - 0; - })); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; + 0; + }))); bch_err_msg(c, ret, "error iterating forwards"); if (ret) - goto err; + return ret; BUG_ON(i != nr); pr_info("iterating forwards by slots"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ - if (i == nr) - break; - BUG_ON(bkey_deleted(k.k) != !(i % 16)); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i == nr) + break; + BUG_ON(bkey_deleted(k.k) != !(i % 16)); - BUG_ON(bkey_start_offset(k.k) != i); - BUG_ON(k.k->size != 8); - i = k.k->p.offset; - 0; - })); + BUG_ON(bkey_start_offset(k.k) != i); + BUG_ON(k.k->size != 8); + i = k.k->p.offset; + 0; + }))); bch_err_msg(c, ret, "error iterating forwards by slots"); - if (ret) - goto err; - ret = 0; -err: - bch2_trans_put(trans); - return 0; + return ret; } /* @@ -736,8 +707,6 @@ static int rand_delete(struct bch_fs *c, u64 nr) static int seq_insert(struct bch_fs *c, u64 nr) { - struct btree_iter iter; - struct bkey_s_c k; struct bkey_i_cookie insert; bkey_cookie_init(&insert.k_i); @@ -756,11 +725,8 @@ static int seq_insert(struct bch_fs *c, u64 nr) static int seq_lookup(struct bch_fs *c, u64 nr) { - struct btree_iter iter; - struct bkey_s_c k; - return bch2_trans_run(c, - for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, + 
for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, 0)); @@ -768,9 +734,6 @@ static int seq_lookup(struct bch_fs *c, u64 nr) static int seq_overwrite(struct bch_fs *c, u64 nr) { - struct btree_iter iter; - struct bkey_s_c k; - return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c new file mode 100644 index 000000000000..b1c867aa2b58 --- /dev/null +++ b/fs/bcachefs/thread_with_file.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "printbuf.h" +#include "thread_with_file.h" + +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/kthread.h> +#include <linux/pagemap.h> +#include <linux/poll.h> + +void bch2_thread_with_file_exit(struct thread_with_file *thr) +{ + if (thr->task) { + kthread_stop(thr->task); + put_task_struct(thr->task); + } +} + +int bch2_run_thread_with_file(struct thread_with_file *thr, + const struct file_operations *fops, + int (*fn)(void *)) +{ + struct file *file = NULL; + int ret, fd = -1; + unsigned fd_flags = O_CLOEXEC; + + if (fops->read && fops->write) + fd_flags |= O_RDWR; + else if (fops->read) + fd_flags |= O_RDONLY; + else if (fops->write) + fd_flags |= O_WRONLY; + + char name[TASK_COMM_LEN]; + get_task_comm(name, current); + + thr->ret = 0; + thr->task = kthread_create(fn, thr, "%s", name); + ret = PTR_ERR_OR_ZERO(thr->task); + if (ret) + return ret; + + ret = get_unused_fd_flags(fd_flags); + if (ret < 0) + goto err; + fd = ret; + + file = anon_inode_getfile(name, fops, thr, fd_flags); + ret = PTR_ERR_OR_ZERO(file); + if (ret) + goto err; + + fd_install(fd, file); + get_task_struct(thr->task); + wake_up_process(thr->task); + return fd; +err: + if (fd >= 0) + put_unused_fd(fd); + if (thr->task) + kthread_stop(thr->task); + return ret; +} + +static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr) +{ + return thr->stdio.output_buf.pos || + thr->output2.nr || + thr->thr.done; +} + +static ssize_t thread_with_stdio_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + size_t copied = 0, b; + int ret = 0; + + if ((file->f_flags & O_NONBLOCK) && + !thread_with_stdio_has_output(thr)) + return -EAGAIN; + + ret = wait_event_interruptible(thr->stdio.output_wait, + thread_with_stdio_has_output(thr)); + if (ret) + return ret; + + if (thr->thr.done) + return 0; + + while (len) { + ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos); + if (ret) + break; + + spin_lock_irq(&thr->stdio.output_lock); + b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos); + + memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b); + memmove(thr->stdio.output_buf.buf, + thr->stdio.output_buf.buf + b, + thr->stdio.output_buf.pos - b); + + thr->output2.nr += b; + thr->stdio.output_buf.pos -= b; + spin_unlock_irq(&thr->stdio.output_lock); + + b = min(len, thr->output2.nr); + if (!b) + break; + + b -= copy_to_user(buf, thr->output2.data, b); + if (!b) { + ret = -EFAULT; + break; + } + + copied += b; + buf += b; + len -= b; + + memmove(thr->output2.data, + thr->output2.data + b, + thr->output2.nr - b); + thr->output2.nr -= b; + } + + return copied ?: ret; +} + +static int thread_with_stdio_release(struct inode *inode, struct file *file) +{ + struct thread_with_stdio 
*thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + bch2_thread_with_file_exit(&thr->thr); + printbuf_exit(&thr->stdio.input_buf); + printbuf_exit(&thr->stdio.output_buf); + darray_exit(&thr->output2); + thr->exit(thr); + return 0; +} + +#define WRITE_BUFFER 4096 + +static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr) +{ + return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done; +} + +static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, + size_t len, loff_t *ppos) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + struct printbuf *buf = &thr->stdio.input_buf; + size_t copied = 0; + ssize_t ret = 0; + + while (len) { + if (thr->thr.done) { + ret = -EPIPE; + break; + } + + size_t b = len - fault_in_readable(ubuf, len); + if (!b) { + ret = -EFAULT; + break; + } + + spin_lock(&thr->stdio.input_lock); + if (buf->pos < WRITE_BUFFER) + bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos)); + b = min(len, printbuf_remaining_size(buf)); + + if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) { + ubuf += b; + len -= b; + copied += b; + buf->pos += b; + } + spin_unlock(&thr->stdio.input_lock); + + if (b) { + wake_up(&thr->stdio.input_wait); + } else { + if ((file->f_flags & O_NONBLOCK)) { + ret = -EAGAIN; + break; + } + + ret = wait_event_interruptible(thr->stdio.input_wait, + thread_with_stdio_has_input_space(thr)); + if (ret) + break; + } + } + + return copied ?: ret; +} + +static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + poll_wait(file, &thr->stdio.output_wait, wait); + poll_wait(file, &thr->stdio.input_wait, wait); + + __poll_t mask = 0; + + if (thread_with_stdio_has_output(thr)) + mask |= EPOLLIN; + if (thread_with_stdio_has_input_space(thr)) + mask |= EPOLLOUT; + if (thr->thr.done) + mask |= EPOLLHUP|EPOLLERR; + return mask; +} + +static const struct file_operations thread_with_stdio_fops = { + .release = thread_with_stdio_release, + .read = thread_with_stdio_read, + .write = thread_with_stdio_write, + .poll = thread_with_stdio_poll, + .llseek = no_llseek, +}; + +int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, + void (*exit)(struct thread_with_stdio *), + int (*fn)(void *)) +{ + thr->stdio.input_buf = PRINTBUF; + thr->stdio.input_buf.atomic++; + spin_lock_init(&thr->stdio.input_lock); + init_waitqueue_head(&thr->stdio.input_wait); + + thr->stdio.output_buf = PRINTBUF; + thr->stdio.output_buf.atomic++; + spin_lock_init(&thr->stdio.output_lock); + init_waitqueue_head(&thr->stdio.output_wait); + + darray_init(&thr->output2); + thr->exit = exit; + + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); +} + +int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len) +{ + wait_event(stdio->input_wait, + stdio->input_buf.pos || stdio->done); + + if (stdio->done) + return -1; + + spin_lock(&stdio->input_lock); + int ret = min(len, stdio->input_buf.pos); + stdio->input_buf.pos -= ret; + memcpy(buf, stdio->input_buf.buf, ret); + memmove(stdio->input_buf.buf, + stdio->input_buf.buf + ret, + stdio->input_buf.pos); + spin_unlock(&stdio->input_lock); + + wake_up(&stdio->input_wait); + return ret; +} + +int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len) +{ + wait_event(stdio->input_wait, + 
stdio->input_buf.pos || stdio->done); + + if (stdio->done) + return -1; + + spin_lock(&stdio->input_lock); + int ret = min(len, stdio->input_buf.pos); + char *n = memchr(stdio->input_buf.buf, '\n', ret); + if (n) + ret = min(ret, n + 1 - stdio->input_buf.buf); + stdio->input_buf.pos -= ret; + memcpy(buf, stdio->input_buf.buf, ret); + memmove(stdio->input_buf.buf, + stdio->input_buf.buf + ret, + stdio->input_buf.pos); + spin_unlock(&stdio->input_lock); + + wake_up(&stdio->input_wait); + return ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h new file mode 100644 index 000000000000..05879c5048c8 --- /dev/null +++ b/fs/bcachefs/thread_with_file.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_THREAD_WITH_FILE_H +#define _BCACHEFS_THREAD_WITH_FILE_H + +#include "thread_with_file_types.h" + +struct task_struct; + +struct thread_with_file { + struct task_struct *task; + int ret; + bool done; +}; + +void bch2_thread_with_file_exit(struct thread_with_file *); +int bch2_run_thread_with_file(struct thread_with_file *, + const struct file_operations *, + int (*fn)(void *)); + +struct thread_with_stdio { + struct thread_with_file thr; + struct stdio_redirect stdio; + DARRAY(char) output2; + void (*exit)(struct thread_with_stdio *); +}; + +static inline void thread_with_stdio_done(struct thread_with_stdio *thr) +{ + thr->thr.done = true; + thr->stdio.done = true; + wake_up(&thr->stdio.input_wait); + wake_up(&thr->stdio.output_wait); +} + +int bch2_run_thread_with_stdio(struct thread_with_stdio *, + void (*exit)(struct thread_with_stdio *), + int (*fn)(void *)); +int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); +int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); + +#endif /* _BCACHEFS_THREAD_WITH_FILE_H */ diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h new file mode 100644 index 000000000000..90b5e645e98c --- /dev/null +++ b/fs/bcachefs/thread_with_file_types.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H +#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H + +struct stdio_redirect { + spinlock_t output_lock; + wait_queue_head_t output_wait; + struct printbuf output_buf; + + spinlock_t input_lock; + wait_queue_head_t input_wait; + struct printbuf input_buf; + bool done; +}; + +#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index fd49b63562c3..c94876b3bb06 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -32,22 +32,68 @@ DECLARE_EVENT_CLASS(bpos, TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot) ); -DECLARE_EVENT_CLASS(bkey, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k), +DECLARE_EVENT_CLASS(fs_str, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str), TP_STRUCT__entry( - __string(k, k ) + __field(dev_t, dev ) + __string(str, str ) ), TP_fast_assign( - __assign_str(k, k); + __entry->dev = c->dev; + __assign_str(str, str); ), - TP_printk("%s", __get_str(k)) + TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) ); -DECLARE_EVENT_CLASS(btree_node, +DECLARE_EVENT_CLASS(trans_str, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str), + TP_ARGS(trans, caller_ip, str), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __string(str, str ) + 
), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __assign_str(str, str); + ), + + TP_printk("%d,%d %s %pS %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str)) +); + +DECLARE_EVENT_CLASS(trans_str_nocaller, + TP_PROTO(struct btree_trans *trans, const char *str), + TP_ARGS(trans, str), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + __string(str, str ) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(str, str); + ), + + TP_printk("%d,%d %s %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->trans_fn, __get_str(str)) +); + +DECLARE_EVENT_CLASS(btree_node_nofs, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b), @@ -72,6 +118,33 @@ DECLARE_EVENT_CLASS(btree_node, __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) ); +DECLARE_EVENT_CLASS(btree_node, + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + __field(u8, level ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->level = b->c.level; + __entry->btree_id = b->c.btree_id; + TRACE_BPOS_assign(pos, b->key.k.p); + ), + + TP_printk("%d,%d %s %u %s %llu:%llu:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn, + __entry->level, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) +); + DECLARE_EVENT_CLASS(bch_fs, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -87,6 +160,23 @@ DECLARE_EVENT_CLASS(bch_fs, TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) ); +DECLARE_EVENT_CLASS(btree_trans, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + ), + + TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn) +); + DECLARE_EVENT_CLASS(bio, TP_PROTO(struct bio *bio), TP_ARGS(bio), @@ -188,6 +278,25 @@ DEFINE_EVENT(bch_fs, journal_entry_full, TP_ARGS(c) ); +TRACE_EVENT(journal_entry_close, + TP_PROTO(struct bch_fs *c, unsigned bytes), + TP_ARGS(c, bytes), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u32, bytes ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->bytes = bytes; + ), + + TP_printk("%d,%d entry bytes %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->bytes) +); + DEFINE_EVENT(bio, journal_write, TP_PROTO(struct bio *bio), TP_ARGS(bio) @@ -286,36 +395,36 @@ TRACE_EVENT(btree_cache_scan, __entry->nr_to_scan, __entry->can_free, __entry->ret) ); -DEFINE_EVENT(btree_node, btree_cache_reap, +DEFINE_EVENT(btree_node_nofs, btree_cache_reap, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock, 
+ TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); /* Btree */ DEFINE_EVENT(btree_node, btree_node_read, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); TRACE_EVENT(btree_node_write, @@ -339,13 +448,13 @@ TRACE_EVENT(btree_node_write, ); DEFINE_EVENT(btree_node, btree_node_alloc, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_free, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); TRACE_EVENT(btree_reserve_get_fail, @@ -377,28 +486,28 @@ TRACE_EVENT(btree_reserve_get_fail, ); DEFINE_EVENT(btree_node, btree_node_compact, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_merge, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_split, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_rewrite, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_set_root, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); TRACE_EVENT(btree_path_relock_fail, @@ -717,22 +826,22 @@ TRACE_EVENT(bucket_evacuate, __entry->dev_idx, __entry->bucket) ); -DEFINE_EVENT(bkey, move_extent, +DEFINE_EVENT(fs_str, move_extent, TP_PROTO(struct bch_fs *c, const char *k), TP_ARGS(c, k) ); -DEFINE_EVENT(bkey, move_extent_read, +DEFINE_EVENT(fs_str, move_extent_read, TP_PROTO(struct bch_fs *c, const char *k), TP_ARGS(c, k) ); -DEFINE_EVENT(bkey, move_extent_write, +DEFINE_EVENT(fs_str, move_extent_write, TP_PROTO(struct bch_fs *c, const char *k), TP_ARGS(c, k) ); -DEFINE_EVENT(bkey, move_extent_finish, +DEFINE_EVENT(fs_str, move_extent_finish, TP_PROTO(struct bch_fs *c, const char *k), TP_ARGS(c, k) ); @@ -754,7 +863,7 @@ TRACE_EVENT(move_extent_fail, TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) ); -DEFINE_EVENT(bkey, move_extent_start_fail, +DEFINE_EVENT(fs_str, move_extent_start_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); @@ -987,10 +1096,11 @@ DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, +DEFINE_EVENT(trans_str, trans_restart_too_many_iters, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + unsigned long caller_ip, + const char *paths), + TP_ARGS(trans, caller_ip, paths) ); 
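The fs_str and trans_str event classes above take a preformatted string rather than a raw key, so a caller renders whatever it wants traced into a printbuf and passes buf.buf to the generated tracepoint. A minimal sketch of that calling convention for one of the fs_str events follows; it assumes the usual bcachefs printbuf helpers (PRINTBUF, bch2_bkey_val_to_text(), printbuf_exit()) and the tracepoint symbols auto-generated from the definitions above (trace_move_extent(), trace_move_extent_enabled()), and is illustrative rather than code from this patch:

	/* Render the key as text, then emit it through the fs_str-class tracepoint. */
	static void note_move_extent(struct bch_fs *c, struct bkey_s_c k)
	{
		struct printbuf buf = PRINTBUF;

		if (!trace_move_extent_enabled())
			return;

		bch2_bkey_val_to_text(&buf, c, k);	/* key rendered into buf.buf */
		trace_move_extent(c, buf.buf);		/* DEFINE_EVENT(fs_str, move_extent, ...) */
		printbuf_exit(&buf);			/* free the printbuf's heap buffer */
	}

The same pattern applies to the trans_str events: format the path list or lock cycle into a printbuf first, then hand the string to the tracepoint together with the btree_trans.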
DECLARE_EVENT_CLASS(transaction_restart_iter, @@ -1056,8 +1166,6 @@ TRACE_EVENT(trans_restart_upgrade, __field(u8, level ) __field(u32, path_seq ) __field(u32, node_seq ) - __field(u32, path_alloc_seq ) - __field(u32, downgrade_seq) TRACE_BPOS_entries(pos) ), @@ -1070,12 +1178,10 @@ TRACE_EVENT(trans_restart_upgrade, __entry->level = f->l; __entry->path_seq = path->l[f->l].lock_seq; __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq; - __entry->path_alloc_seq = path->alloc_seq; - __entry->downgrade_seq = path->downgrade_seq; TRACE_BPOS_assign(pos, path->pos) ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u", __entry->trans_fn, (void *) __entry->caller_ip, bch2_btree_id_str(__entry->btree_id), @@ -1086,9 +1192,7 @@ TRACE_EVENT(trans_restart_upgrade, __entry->new_locks_want, __entry->level, __entry->path_seq, - __entry->node_seq, - __entry->path_alloc_seq, - __entry->downgrade_seq) + __entry->node_seq) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, @@ -1160,10 +1264,10 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_event, trans_restart_would_deadlock, +DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + const char *cycle), + TP_ARGS(trans, cycle) ); DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit, @@ -1252,22 +1356,37 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, TRACE_EVENT(path_downgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path), + struct btree_path *path, + unsigned old_locks_want), + TP_ARGS(trans, caller_ip, path, old_locks_want), TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) + __field(unsigned, old_locks_want ) + __field(unsigned, new_locks_want ) + __field(unsigned, btree ) + TRACE_BPOS_entries(pos) ), TP_fast_assign( strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; + __entry->old_locks_want = old_locks_want; + __entry->new_locks_want = path->locks_want; + __entry->btree = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); ), - TP_printk("%s %pS", + TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u", __entry->trans_fn, - (void *) __entry->caller_ip) + (void *) __entry->caller_ip, + __entry->old_locks_want, + __entry->new_locks_want, + bch2_btree_id_str(__entry->btree), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) ); DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, @@ -1298,21 +1417,48 @@ TRACE_EVENT(write_buffer_flush, __entry->nr, __entry->size, __entry->skipped, __entry->fast) ); +TRACE_EVENT(write_buffer_flush_sync, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), + TP_ARGS(trans, caller_ip), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + ), + + TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) +); + TRACE_EVENT(write_buffer_flush_slowpath, - TP_PROTO(struct btree_trans *trans, size_t nr, size_t size), - TP_ARGS(trans, nr, 
size), + TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total), + TP_ARGS(trans, slowpath, total), TP_STRUCT__entry( - __field(size_t, nr ) - __field(size_t, size ) + __field(size_t, slowpath ) + __field(size_t, total ) ), TP_fast_assign( - __entry->nr = nr; - __entry->size = size; + __entry->slowpath = slowpath; + __entry->total = total; ), - TP_printk("%zu/%zu", __entry->nr, __entry->size) + TP_printk("%zu/%zu", __entry->slowpath, __entry->total) +); + +DEFINE_EVENT(fs_str, rebalance_extent, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, data_update, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); #endif /* _TRACE_BCACHEFS_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 84b142fcc3df..c2ef7cddaa4f 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -267,7 +267,7 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) console_unlock(); } -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr) { #ifdef CONFIG_STACKTRACE unsigned nr_entries = 0; @@ -282,7 +282,7 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) return -1; do { - nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0); + nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1); } while (nr_entries == stack->size && !(ret = darray_make_room(stack, stack->size * 2))); @@ -297,24 +297,74 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) { - unsigned long *i; - darray_for_each(*stack, i) { prt_printf(out, "[<0>] %pB", (void *) *i); prt_newline(out); } } -int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task) +int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr) { bch_stacktrace stack = { 0 }; - int ret = bch2_save_backtrace(&stack, task); + int ret = bch2_save_backtrace(&stack, task, skipnr + 1); bch2_prt_backtrace(out, &stack); darray_exit(&stack); return ret; } +#ifndef __KERNEL__ +#include <time.h> +void bch2_prt_datetime(struct printbuf *out, time64_t sec) +{ + time_t t = sec; + char buf[64]; + ctime_r(&t, buf); + strim(buf); + prt_str(out, buf); +} +#else +void bch2_prt_datetime(struct printbuf *out, time64_t sec) +{ + char buf[64]; + snprintf(buf, sizeof(buf), "%ptT", &sec); + prt_u64(out, sec); +} +#endif + +static const struct time_unit { + const char *name; + u64 nsecs; +} time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "eon", U64_MAX }, +}; + +static const struct time_unit *pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +void bch2_pr_time_units(struct printbuf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); +} + /* time stats: */ #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT @@ -359,6 +409,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); stats->max_duration = max(stats->max_duration, duration); stats->min_duration = 
min(stats->min_duration, duration); + stats->total_duration += duration; bch2_quantiles_update(&stats->quantiles, duration); } @@ -372,29 +423,33 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, } } +static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct bch2_time_stat_buffer *b) +{ + for (struct bch2_time_stat_buffer_entry *i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + bch2_time_stats_update_one(stats, i->start, i->end); + b->nr = 0; +} + static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, struct bch2_time_stat_buffer *b) { - struct bch2_time_stat_buffer_entry *i; unsigned long flags; spin_lock_irqsave(&stats->lock, flags); - for (i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - bch2_time_stats_update_one(stats, i->start, i->end); + __bch2_time_stats_clear_buffer(stats, b); spin_unlock_irqrestore(&stats->lock, flags); - - b->nr = 0; } void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) { unsigned long flags; - WARN_RATELIMIT(!stats->min_duration || !stats->min_freq, - "time_stats: min_duration = %llu, min_freq = %llu", - stats->min_duration, stats->min_freq); + WARN_ONCE(!stats->duration_stats_weighted.weight || + !stats->freq_stats_weighted.weight, + "uninitialized time_stats"); if (!stats->buffer) { spin_lock_irqsave(&stats->lock, flags); @@ -423,40 +478,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) preempt_enable(); } } -#endif - -static const struct time_unit { - const char *name; - u64 nsecs; -} time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "eon", U64_MAX }, -}; - -static const struct time_unit *pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} - -void bch2_pr_time_units(struct printbuf *out, u64 ns) -{ - const struct time_unit *u = pick_time_units(ns); - - prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); -} static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { @@ -467,26 +488,6 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) prt_printf(out, "%s", u->name); } -#ifndef __KERNEL__ -#include <time.h> -void bch2_prt_datetime(struct printbuf *out, time64_t sec) -{ - time_t t = sec; - char buf[64]; - ctime_r(&t, buf); - prt_str(out, buf); -} -#else -void bch2_prt_datetime(struct printbuf *out, time64_t sec) -{ - char buf[64]; - snprintf(buf, sizeof(buf), "%ptT", &sec); - prt_u64(out, sec); -} -#endif - -#define TABSTOP_SIZE 12 - static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) { prt_str(out, name); @@ -495,12 +496,24 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 prt_newline(out); } +#define TABSTOP_SIZE 12 + void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { const struct time_unit *u; s64 f_mean = 0, d_mean = 0; u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; int i; + + if (stats->buffer) { + int cpu; + + spin_lock_irq(&stats->lock); + for_each_possible_cpu(cpu) + __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); + spin_unlock_irq(&stats->lock); + } + /* * avoid divide by zero */ @@ -546,6 +559,7 @@ void bch2_time_stats_to_text(struct printbuf *out, 
struct bch2_time_stats *stats pr_name_and_units(out, "min:", stats->min_duration); pr_name_and_units(out, "max:", stats->max_duration); + pr_name_and_units(out, "total:", stats->total_duration); prt_printf(out, "mean:"); prt_tab(out); @@ -603,6 +617,9 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats last_q = q; } } +#else +void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {} +#endif void bch2_time_stats_exit(struct bch2_time_stats *stats) { @@ -1157,3 +1174,37 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) return ret; } + +void bch2_darray_str_exit(darray_str *d) +{ + darray_for_each(*d, i) + kfree(*i); + darray_exit(d); +} + +int bch2_split_devs(const char *_dev_name, darray_str *ret) +{ + darray_init(ret); + + char *dev_name = kstrdup(_dev_name, GFP_KERNEL), *s = dev_name; + if (!dev_name) + return -ENOMEM; + + while ((s = strsep(&dev_name, ":"))) { + char *p = kstrdup(s, GFP_KERNEL); + if (!p) + goto err; + + if (darray_push(ret, p)) { + kfree(p); + goto err; + } + } + + kfree(dev_name); + return 0; +err: + bch2_darray_str_exit(ret); + kfree(dev_name); + return -ENOMEM; +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b701f7fe0784..c75fc31915d3 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -347,9 +347,18 @@ void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); void bch2_print_string_as_lines(const char *prefix, const char *lines); typedef DARRAY(unsigned long) bch_stacktrace; -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *); +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned); void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); -int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *); +int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned); + +static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) +{ +#ifdef __KERNEL__ + prt_printf(out, "%pg", bdev); +#else + prt_str(out, bdev->name); +#endif +} #define NR_QUANTILES 15 #define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) @@ -374,8 +383,9 @@ struct bch2_time_stat_buffer { struct bch2_time_stats { spinlock_t lock; /* all fields are in nanoseconds */ - u64 max_duration; u64 min_duration; + u64 max_duration; + u64 total_duration; u64 max_freq; u64 min_freq; u64 last_event; @@ -390,15 +400,39 @@ struct bch2_time_stats { #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); -#else -static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} -#endif static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) { __bch2_time_stats_update(stats, start, local_clock()); } +static inline bool track_event_change(struct bch2_time_stats *stats, + u64 *start, bool v) +{ + if (v != !!*start) { + if (!v) { + bch2_time_stats_update(stats, *start); + *start = 0; + } else { + *start = local_clock() ?: 1; + return true; + } + } + + return false; +} +#else +static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {} +static inline bool track_event_change(struct bch2_time_stats *stats, + u64 *start, bool v) +{ + bool ret = v && !*start; + *start = v; + return ret; +} +#endif + void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); void 
bch2_time_stats_exit(struct bch2_time_stats *); @@ -831,4 +865,14 @@ static inline int cmp_le32(__le32 l, __le32 r) #include <linux/uuid.h> +#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +static inline bool qstr_eq(const struct qstr l, const struct qstr r) +{ + return l.len == r.len && !memcmp(l.name, r.name, l.len); +} + +void bch2_darray_str_exit(darray_str *); +int bch2_split_devs(const char *, darray_str *); + #endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h index a6561b4b36a6..2ad338e282da 100644 --- a/fs/bcachefs/vstructs.h +++ b/fs/bcachefs/vstructs.h @@ -48,14 +48,14 @@ ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) #define vstruct_for_each(_s, _i) \ - for (_i = (_s)->start; \ + for (typeof(&(_s)->start[0]) _i = (_s)->start; \ _i < vstruct_last(_s); \ _i = vstruct_next(_i)) -#define vstruct_for_each_safe(_s, _i, _t) \ - for (_i = (_s)->start; \ - _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ - _i = _t) +#define vstruct_for_each_safe(_s, _i) \ + for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \ + _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \ + _i = _next) #define vstruct_idx(_s, _idx) \ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index a93d76df8ed8..2b4dda047450 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -671,9 +671,6 @@ static struct dentry *befs_get_parent(struct dentry *child) parent = befs_iget(child->d_sb, (unsigned long)befs_ino->i_parent.start); - if (IS_ERR(parent)) - return ERR_CAST(parent); - return d_obtain_alias(parent); } diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index fbc4ae80a4b2..c375e22c4c0c 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -275,11 +275,6 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino) dprintf("name=%s, namelen=%d\n", name, namelen); - if (!namelen) - return -ENOENT; - if (namelen > BFS_NAMELEN) - return -ENAMETOOLONG; - sblock = BFS_I(dir)->i_sblock; eblock = BFS_I(dir)->i_eblock; for (block = sblock; block <= eblock; block++) { diff --git a/fs/bfs/file.c b/fs/bfs/file.c index adc2230079c6..a778411574a9 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -11,6 +11,7 @@ */ #include <linux/fs.h> +#include <linux/mpage.h> #include <linux/buffer_head.h> #include "bfs.h" @@ -150,9 +151,10 @@ out: return err; } -static int bfs_writepage(struct page *page, struct writeback_control *wbc) +static int bfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page, bfs_get_block, wbc); + return mpage_writepages(mapping, wbc, bfs_get_block); } static int bfs_read_folio(struct file *file, struct folio *folio) @@ -190,9 +192,10 @@ const struct address_space_operations bfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = bfs_read_folio, - .writepage = bfs_writepage, + .writepages = bfs_writepages, .write_begin = bfs_write_begin, .write_end = generic_write_end, + .migrate_folio = buffer_migrate_folio, .bmap = bfs_bmap, }; diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 206cf1612c1d..1925a0919ca6 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -27,7 +27,7 @@ static bool check_setget_bounds(const struct extent_buffer *eb, void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) { token->eb = eb; - token->kaddr = page_address(eb->pages[0]); + token->kaddr = folio_address(eb->folios[0]); token->offset = 0; } @@ 
-50,7 +50,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e * an offset into the extent buffer page array, cast to a specific type. This * gives us all the type checking. * - * The extent buffer pages stored in the array pages do not form a contiguous + * The extent buffer pages stored in the array folios may not form a contiguous * phyusical range, but the API functions assume the linear offset to the range * from 0 to metadata node size. */ @@ -60,28 +60,30 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_page_index(member_offset); \ - const unsigned long oip = get_eb_offset_in_page(token->eb, \ - member_offset); \ + const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ + const unsigned long oil = get_eb_offset_in_folio(token->eb, \ + member_offset);\ + const int unit_size = folio_size(token->eb->folios[0]); \ + const int unit_shift = folio_shift(token->eb->folios[0]); \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ - const int part = PAGE_SIZE - oip; \ + const int part = unit_size - oil; \ \ ASSERT(token); \ ASSERT(token->kaddr); \ ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ if (token->offset <= member_offset && \ - member_offset + size <= token->offset + PAGE_SIZE) { \ - return get_unaligned_le##bits(token->kaddr + oip); \ + member_offset + size <= token->offset + unit_size) { \ + return get_unaligned_le##bits(token->kaddr + oil); \ } \ - token->kaddr = page_address(token->eb->pages[idx]); \ - token->offset = idx << PAGE_SHIFT; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \ - return get_unaligned_le##bits(token->kaddr + oip); \ + token->kaddr = folio_address(token->eb->folios[idx]); \ + token->offset = idx << unit_shift; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \ + return get_unaligned_le##bits(token->kaddr + oil); \ \ - memcpy(lebytes, token->kaddr + oip, part); \ - token->kaddr = page_address(token->eb->pages[idx + 1]); \ - token->offset = (idx + 1) << PAGE_SHIFT; \ + memcpy(lebytes, token->kaddr + oil, part); \ + token->kaddr = folio_address(token->eb->folios[idx + 1]); \ + token->offset = (idx + 1) << unit_shift; \ memcpy(lebytes + part, token->kaddr, size - part); \ return get_unaligned_le##bits(lebytes); \ } \ @@ -89,19 +91,21 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ - const unsigned long idx = get_eb_page_index(member_offset); \ - char *kaddr = page_address(eb->pages[idx]); \ + const unsigned long idx = get_eb_folio_index(eb, member_offset);\ + const unsigned long oil = get_eb_offset_in_folio(eb, \ + member_offset);\ + const int unit_size = folio_size(eb->folios[0]); \ + char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ - const int part = PAGE_SIZE - oip; \ + const int part = unit_size - oil; \ u8 lebytes[sizeof(u##bits)]; \ \ ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \ - return get_unaligned_le##bits(kaddr + oip); \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \ + return get_unaligned_le##bits(kaddr + oil); \ \ - memcpy(lebytes, kaddr + oip, part); \ - kaddr = 
page_address(eb->pages[idx + 1]); \ + memcpy(lebytes, kaddr + oil, part); \ + kaddr = folio_address(eb->folios[idx + 1]); \ memcpy(lebytes + part, kaddr, size - part); \ return get_unaligned_le##bits(lebytes); \ } \ @@ -110,53 +114,59 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_page_index(member_offset); \ - const unsigned long oip = get_eb_offset_in_page(token->eb, \ - member_offset); \ + const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ + const unsigned long oil = get_eb_offset_in_folio(token->eb, \ + member_offset);\ + const int unit_size = folio_size(token->eb->folios[0]); \ + const int unit_shift = folio_shift(token->eb->folios[0]); \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ - const int part = PAGE_SIZE - oip; \ + const int part = unit_size - oil; \ \ ASSERT(token); \ ASSERT(token->kaddr); \ ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ if (token->offset <= member_offset && \ - member_offset + size <= token->offset + PAGE_SIZE) { \ - put_unaligned_le##bits(val, token->kaddr + oip); \ + member_offset + size <= token->offset + unit_size) { \ + put_unaligned_le##bits(val, token->kaddr + oil); \ return; \ } \ - token->kaddr = page_address(token->eb->pages[idx]); \ - token->offset = idx << PAGE_SHIFT; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ - put_unaligned_le##bits(val, token->kaddr + oip); \ + token->kaddr = folio_address(token->eb->folios[idx]); \ + token->offset = idx << unit_shift; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || \ + oil + size <= unit_size) { \ + put_unaligned_le##bits(val, token->kaddr + oil); \ return; \ } \ put_unaligned_le##bits(val, lebytes); \ - memcpy(token->kaddr + oip, lebytes, part); \ - token->kaddr = page_address(token->eb->pages[idx + 1]); \ - token->offset = (idx + 1) << PAGE_SHIFT; \ + memcpy(token->kaddr + oil, lebytes, part); \ + token->kaddr = folio_address(token->eb->folios[idx + 1]); \ + token->offset = (idx + 1) << unit_shift; \ memcpy(token->kaddr, lebytes + part, size - part); \ } \ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ - const unsigned long idx = get_eb_page_index(member_offset); \ - char *kaddr = page_address(eb->pages[idx]); \ + const unsigned long idx = get_eb_folio_index(eb, member_offset);\ + const unsigned long oil = get_eb_offset_in_folio(eb, \ + member_offset);\ + const int unit_size = folio_size(eb->folios[0]); \ + char *kaddr = folio_address(eb->folios[idx]); \ const int size = sizeof(u##bits); \ - const int part = PAGE_SIZE - oip; \ + const int part = unit_size - oil; \ u8 lebytes[sizeof(u##bits)]; \ \ ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ - put_unaligned_le##bits(val, kaddr + oip); \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || \ + oil + size <= unit_size) { \ + put_unaligned_le##bits(val, kaddr + oil); \ return; \ } \ \ put_unaligned_le##bits(val, lebytes); \ - memcpy(kaddr + oip, lebytes, part); \ - kaddr = page_address(eb->pages[idx + 1]); \ + memcpy(kaddr + oil, lebytes, part); \ + kaddr = folio_address(eb->folios[idx + 1]); \ memcpy(kaddr, lebytes + part, size - part); \ } diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h 
index aa0844535644..ed7aa32972ad 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -90,14 +90,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ - const type *p = page_address(eb->pages[0]) + \ + const type *p = folio_address(eb->folios[0]) + \ offset_in_page(eb->start); \ return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ + type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \ put_unaligned_le##bits(val, &p->member); \ } diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 4f3b693a16b1..928f512cdb4a 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -194,6 +194,12 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); int mirror = repair_bbio->mirror_num; + /* + * We can only trigger this for data bio, which doesn't support larger + * folios yet. + */ + ASSERT(folio_order(page_folio(bv->bv_page)) == 0); + if (repair_bbio->bio.bi_status || !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); @@ -215,7 +221,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, btrfs_repair_io_failure(fs_info, btrfs_ino(inode), repair_bbio->file_offset, fs_info->sectorsize, repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, - bv->bv_page, bv->bv_offset, mirror); + page_folio(bv->bv_page), bv->bv_offset, mirror); } while (mirror != fbio->bbio->mirror_num); done: @@ -626,7 +632,7 @@ static bool should_async_write(struct btrfs_bio *bbio) /* * Submit bio to an async queue. * - * Return true if the work has been succesfuly submitted, else false. + * Return true if the work has been successfully submitted, else false. */ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, struct btrfs_io_context *bioc, @@ -767,8 +773,8 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) * freeing the bio. */ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) + u64 length, u64 logical, struct folio *folio, + unsigned int folio_offset, int mirror_num) { struct btrfs_io_stripe smap = { 0 }; struct bio_vec bvec; @@ -799,7 +805,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - __bio_add_page(&bio, page, length, pg_offset); + ret = bio_add_folio(&bio, folio, length, folio_offset); + ASSERT(ret); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? 
*/ diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index ca79decee060..bbaed317161a 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -105,7 +105,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num); + u64 length, u64 logical, struct folio *folio, + unsigned int folio_offset, int mirror_num); #endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6e5dc68ff661..a9be9ac99222 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -168,7 +168,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) cache); kfree(cache->free_space_ctl); - kfree(cache->physical_map); + btrfs_free_chunk_map(cache->physical_map); kfree(cache); } } @@ -1047,7 +1047,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans, } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - u64 group_start, struct extent_map *em) + struct btrfs_chunk_map *map) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path; @@ -1059,10 +1059,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int index; int factor; struct btrfs_caching_control *caching_ctl = NULL; - bool remove_em; + bool remove_map; bool remove_rsv = false; - block_group = btrfs_lookup_block_group(fs_info, group_start); + block_group = btrfs_lookup_block_group(fs_info, map->start); BUG_ON(!block_group); BUG_ON(!block_group->ro); @@ -1252,7 +1252,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * entries because we already removed them all when we called * btrfs_remove_free_space_cache(). * - * And we must not remove the extent map from the fs_info->mapping_tree + * And we must not remove the chunk map from the fs_info->mapping_tree * to prevent the same logical address range and physical device space * ranges from being reused for a new block group. This is needed to * avoid races with trimming and scrub. @@ -1268,19 +1268,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * in place until the extents have been discarded completely when * the transaction commit has completed. 
*/ - remove_em = (atomic_read(&block_group->frozen) == 0); + remove_map = (atomic_read(&block_group->frozen) == 0); spin_unlock(&block_group->lock); - if (remove_em) { - struct extent_map_tree *em_tree; - - em_tree = &fs_info->mapping_tree; - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - /* once for the tree */ - free_extent_map(em); - } + if (remove_map) + btrfs_remove_chunk_map(fs_info, map); out: /* Once for the lookup reference */ @@ -1295,15 +1287,12 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset) { struct btrfs_root *root = btrfs_block_group_root(fs_info); - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; unsigned int num_items; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - ASSERT(em && em->start == chunk_offset); + map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); + ASSERT(map != NULL); + ASSERT(map->start == chunk_offset); /* * We need to reserve 3 + N units from the metadata space info in order @@ -1324,9 +1313,8 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( * more device items and remove one chunk item), but this is done at * btrfs_remove_chunk() through a call to check_system_chunk(). */ - map = em->map_lookup; num_items = 3 + map->num_stripes; - free_extent_map(em); + btrfs_free_chunk_map(map); return btrfs_start_transaction_fallback_global_rsv(root, num_items); } @@ -1927,8 +1915,7 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct btrfs_path *path) { - struct extent_map_tree *em_tree; - struct extent_map *em; + struct btrfs_chunk_map *map; struct btrfs_block_group_item bg; struct extent_buffer *leaf; int slot; @@ -1938,23 +1925,20 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, slot = path->slots[0]; leaf = path->nodes[0]; - em_tree = &fs_info->mapping_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, key->objectid, key->offset); - read_unlock(&em_tree->lock); - if (!em) { + map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset); + if (!map) { btrfs_err(fs_info, "logical %llu len %llu found bg but no related chunk", key->objectid, key->offset); return -ENOENT; } - if (em->start != key->objectid || em->len != key->offset) { + if (map->start != key->objectid || map->chunk_len != key->offset) { btrfs_err(fs_info, "block group %llu len %llu mismatch with chunk %llu len %llu", - key->objectid, key->offset, em->start, em->len); + key->objectid, key->offset, map->start, map->chunk_len); ret = -EUCLEAN; - goto out_free_em; + goto out_free_map; } read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot), @@ -1962,16 +1946,16 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, flags = btrfs_stack_block_group_flags(&bg) & BTRFS_BLOCK_GROUP_TYPE_MASK; - if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(fs_info, "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", key->objectid, key->offset, flags, - (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type)); + (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type)); ret = -EUCLEAN; } -out_free_em: - free_extent_map(em); 
+out_free_map: + btrfs_free_chunk_map(map); return ret; } @@ -2024,8 +2008,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 **logical, int *naddrs, int *stripe_len) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 *buf; u64 bytenr; u64 data_stripe_length; @@ -2033,14 +2016,13 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, int i, nr = 0; int ret = 0; - em = btrfs_get_chunk_map(fs_info, chunk_start, 1); - if (IS_ERR(em)) + map = btrfs_get_chunk_map(fs_info, chunk_start, 1); + if (IS_ERR(map)) return -EIO; - map = em->map_lookup; - data_stripe_length = em->orig_block_len; + data_stripe_length = map->stripe_size; io_stripe_size = BTRFS_STRIPE_LEN; - chunk_start = em->start; + chunk_start = map->start; /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -2094,7 +2076,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, *naddrs = nr; *stripe_len = io_stripe_size; out: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -2199,49 +2181,47 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( */ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct btrfs_block_group *bg; u64 start = 0; int ret = 0; while (1) { - read_lock(&map_tree->lock); + struct btrfs_chunk_map *map; + struct btrfs_block_group *bg; + /* - * lookup_extent_mapping will return the first extent map - * intersecting the range, so setting @len to 1 is enough to + * btrfs_find_chunk_map() will return the first chunk map + * intersecting the range, so setting @length to 1 is enough to * get the first chunk. 
*/ - em = lookup_extent_mapping(map_tree, start, 1); - read_unlock(&map_tree->lock); - if (!em) + map = btrfs_find_chunk_map(fs_info, start, 1); + if (!map) break; - bg = btrfs_lookup_block_group(fs_info, em->start); + bg = btrfs_lookup_block_group(fs_info, map->start); if (!bg) { btrfs_err(fs_info, "chunk start=%llu len=%llu doesn't have corresponding block group", - em->start, em->len); + map->start, map->chunk_len); ret = -EUCLEAN; - free_extent_map(em); + btrfs_free_chunk_map(map); break; } - if (bg->start != em->start || bg->length != em->len || + if (bg->start != map->start || bg->length != map->chunk_len || (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != - (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(fs_info, "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", - em->start, em->len, - em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, + map->start, map->chunk_len, + map->type & BTRFS_BLOCK_GROUP_TYPE_MASK, bg->start, bg->length, bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); ret = -EUCLEAN; - free_extent_map(em); + btrfs_free_chunk_map(map); btrfs_put_block_group(bg); break; } - start = em->start + em->len; - free_extent_map(em); + start = map->start + map->chunk_len; + btrfs_free_chunk_map(map); btrfs_put_block_group(bg); } return ret; @@ -2369,28 +2349,25 @@ error: static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct rb_node *node; int ret = 0; - for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { - struct extent_map *em; - struct map_lookup *map; + for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { + struct btrfs_chunk_map *map; struct btrfs_block_group *bg; - em = rb_entry(node, struct extent_map, rb_node); - map = em->map_lookup; - bg = btrfs_create_block_group_cache(fs_info, em->start); + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + bg = btrfs_create_block_group_cache(fs_info, map->start); if (!bg) { ret = -ENOMEM; break; } /* Fill dummy cache as FULL */ - bg->length = em->len; + bg->length = map->chunk_len; bg->flags = map->type; bg->cached = BTRFS_CACHE_FINISHED; - bg->used = em->len; + bg->used = map->chunk_len; bg->flags = map->type; ret = btrfs_add_block_group_cache(fs_info, bg); /* @@ -2618,19 +2595,14 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_device *device; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 dev_offset; - u64 stripe_size; int i; int ret = 0; - em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); - if (IS_ERR(em)) - return PTR_ERR(em); - - map = em->map_lookup; - stripe_size = em->orig_block_len; + map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); + if (IS_ERR(map)) + return PTR_ERR(map); /* * Take the device list mutex to prevent races with the final phase of @@ -2647,13 +2619,13 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans, dev_offset = map->stripes[i].physical; ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, - stripe_size); + map->stripe_size); if (ret) break; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -2910,7 +2882,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, goto unlock_out; /* - * Skip chunk alloction if the bg is SYSTEM, 
this is to avoid system + * Skip chunk allocation if the bg is SYSTEM, this is to avoid system * chunk allocation storm to exhaust the system chunk array. Otherwise * we still want to try our best to mark the block group read-only. */ @@ -4406,8 +4378,6 @@ void btrfs_freeze_block_group(struct btrfs_block_group *cache) void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct extent_map_tree *em_tree; - struct extent_map *em; bool cleanup; spin_lock(&block_group->lock); @@ -4416,17 +4386,16 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) spin_unlock(&block_group->lock); if (cleanup) { - em_tree = &fs_info->mapping_tree; - write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, block_group->start, - 1); - BUG_ON(!em); /* logic error, can't happen */ - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* once for us and once for the tree */ - free_extent_map(em); - free_extent_map(em); + struct btrfs_chunk_map *map; + + map = btrfs_find_chunk_map(fs_info, block_group->start, 1); + /* Logic error, can't happen. */ + ASSERT(map); + + btrfs_remove_chunk_map(fs_info, map); + + /* Once for our lookup reference. */ + btrfs_free_chunk_map(map); /* * We may have left one free space entry and other possible diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 2bdbcb834f95..c4a1f01cc1c2 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -5,6 +5,8 @@ #include "free-space-cache.h" +struct btrfs_chunk_map; + enum btrfs_disk_cache_state { BTRFS_DC_WRITTEN, BTRFS_DC_ERROR, @@ -243,7 +245,7 @@ struct btrfs_block_group { u64 zone_unusable; u64 zone_capacity; u64 meta_write_pointer; - struct map_lookup *physical_map; + struct btrfs_chunk_map *physical_map; struct list_head active_bg_list; struct work_struct zone_finish_work; struct extent_buffer *last_eb; @@ -297,7 +299,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - u64 group_start, struct extent_map *em); + struct btrfs_chunk_map *map); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_unused(struct btrfs_block_group *bg); void btrfs_reclaim_bgs_work(struct work_struct *work); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5572ae52444e..7f7c5a92d2b8 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -69,6 +69,8 @@ enum { BTRFS_INODE_VERITY_IN_PROGRESS, /* Set when this inode is a free space inode. */ BTRFS_INODE_FREE_SPACE_INODE, + /* Set when there are no capabilities in XATTs for the inode. */ + BTRFS_INODE_NO_CAP_XATTR, }; /* in memory btrfs inode */ @@ -107,9 +109,11 @@ struct btrfs_inode { /* * Keep track of where the inode has extent items mapped in order to - * make sure the i_size adjustments are accurate + * make sure the i_size adjustments are accurate. Not required when the + * filesystem is NO_HOLES, the status can't be set while mounted as + * it's a mkfs-time feature. 
*/ - struct extent_io_tree file_extent_tree; + struct extent_io_tree *file_extent_tree; /* held while logging the inode in tree-log.c */ struct mutex log_mutex; @@ -487,7 +491,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 end); + u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 19b22b4653c8..193168214eeb 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/log2.h> +#include <linux/shrinker.h> #include <crypto/hash.h> #include "misc.h" #include "ctree.h" @@ -163,13 +164,107 @@ static int compression_decompress(int type, struct list_head *ws, static void btrfs_free_compressed_pages(struct compressed_bio *cb) { for (unsigned int i = 0; i < cb->nr_pages; i++) - put_page(cb->compressed_pages[i]); + btrfs_free_compr_page(cb->compressed_pages[i]); kfree(cb->compressed_pages); } static int btrfs_decompress_bio(struct compressed_bio *cb); -static void end_compressed_bio_read(struct btrfs_bio *bbio) +/* + * Global cache of last unused pages for compression/decompression. + */ +static struct btrfs_compr_pool { + struct shrinker *shrinker; + spinlock_t lock; + struct list_head list; + int count; + int thresh; +} compr_pool; + +static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc) +{ + int ret; + + /* + * We must not read the values more than once if 'ret' gets expanded in + * the return statement so we don't accidentally return a negative + * number, even if the first condition finds it positive. + */ + ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh); + + return ret > 0 ? ret : 0; +} + +static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc) +{ + struct list_head remove; + struct list_head *tmp, *next; + int freed; + + if (compr_pool.count == 0) + return SHRINK_STOP; + + INIT_LIST_HEAD(&remove); + + /* For now, just simply drain the whole list. 
*/ + spin_lock(&compr_pool.lock); + list_splice_init(&compr_pool.list, &remove); + freed = compr_pool.count; + compr_pool.count = 0; + spin_unlock(&compr_pool.lock); + + list_for_each_safe(tmp, next, &remove) { + struct page *page = list_entry(tmp, struct page, lru); + + ASSERT(page_ref_count(page) == 1); + put_page(page); + } + + return freed; +} + +/* + * Common wrappers for page allocation from compression wrappers + */ +struct page *btrfs_alloc_compr_page(void) +{ + struct page *page = NULL; + + spin_lock(&compr_pool.lock); + if (compr_pool.count > 0) { + page = list_first_entry(&compr_pool.list, struct page, lru); + list_del_init(&page->lru); + compr_pool.count--; + } + spin_unlock(&compr_pool.lock); + + if (page) + return page; + + return alloc_page(GFP_NOFS); +} + +void btrfs_free_compr_page(struct page *page) +{ + bool do_free = false; + + spin_lock(&compr_pool.lock); + if (compr_pool.count > compr_pool.thresh) { + do_free = true; + } else { + list_add(&page->lru, &compr_pool.list); + compr_pool.count++; + } + spin_unlock(&compr_pool.lock); + + if (!do_free) + return; + + ASSERT(page_ref_count(page) == 1); + put_page(page); +} + +static void end_bbio_comprssed_read(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); blk_status_t status = bbio->bio.bi_status; @@ -211,8 +306,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) for (i = 0; i < ret; i++) { struct folio *folio = fbatch.folios[i]; - btrfs_page_clamp_clear_writeback(fs_info, &folio->page, - cb->start, cb->len); + btrfs_folio_clamp_clear_writeback(fs_info, folio, + cb->start, cb->len); } folio_batch_release(&fbatch); } @@ -242,7 +337,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) * This also calls the writeback end hooks for the file pages so that metadata * and checksums can be updated in the file. */ -static void end_compressed_bio_write(struct btrfs_bio *bbio) +static void end_bbio_comprssed_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; @@ -289,7 +384,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, cb = alloc_compressed_bio(inode, ordered->file_offset, REQ_OP_WRITE | write_flags, - end_compressed_bio_write); + end_bbio_comprssed_write); cb->start = ordered->file_offset; cb->len = ordered->num_bytes; cb->compressed_pages = compressed_pages; @@ -446,7 +541,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. 
*/ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, page, cur, add_size); + btrfs_subpage_start_reader(fs_info, page_folio(page), + cur, add_size); put_page(page); cur += add_size; } @@ -489,11 +585,11 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) goto out; } - ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); + ASSERT(extent_map_is_compressed(em)); compressed_len = em->block_len; cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, - end_compressed_bio_read); + end_bbio_comprssed_read); cb->start = em->orig_start; em_len = em->len; @@ -501,7 +597,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) cb->len = bbio->bio.bi_iter.bi_size; cb->compressed_len = compressed_len; - cb->compress_type = em->compress_type; + cb->compress_type = extent_map_compression(em); cb->orig_bbio = bbio; free_extent_map(em); @@ -513,7 +609,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) goto out_free_bio; } - ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); + ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages, 0); if (ret2) { ret = BLK_STS_RESOURCE; goto out_free_compressed_pages; @@ -960,15 +1056,36 @@ int __init btrfs_init_compress(void) offsetof(struct compressed_bio, bbio.bio), BIOSET_NEED_BVECS)) return -ENOMEM; + + compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages"); + if (!compr_pool.shrinker) + return -ENOMEM; + btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); zstd_init_workspace_manager(); + + spin_lock_init(&compr_pool.lock); + INIT_LIST_HEAD(&compr_pool.list); + compr_pool.count = 0; + /* 128K / 4K = 32, for 8 threads is 256 pages. */ + compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8; + compr_pool.shrinker->count_objects = btrfs_compr_pool_count; + compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan; + compr_pool.shrinker->batch = 32; + compr_pool.shrinker->seeks = DEFAULT_SEEKS; + shrinker_register(compr_pool.shrinker); + return 0; } void __cold btrfs_exit_compress(void) { + /* For now scan drains all pages and does not touch the parameters. 
*/ + btrfs_compr_pool_scan(NULL, NULL); + shrinker_free(compr_pool.shrinker); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 03bb9d143fa7..93cc92974dee 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -32,6 +32,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 +struct page; + struct compressed_bio { /* Number of compressed pages in the array */ unsigned int nr_pages; @@ -96,6 +98,9 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); +struct page *btrfs_alloc_compr_page(void); +void btrfs_free_compr_page(struct page *page); + enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 35c1d24d4a78..e65e012bac55 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -370,33 +370,41 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, /* * check if the tree block can be shared by multiple trees */ -int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) +bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf) { + const u64 buf_gen = btrfs_header_generation(buf); + /* * Tree blocks not in shareable trees and tree roots are never shared. * If a block was allocated after the last snapshot and the block was * not allocated by tree relocation, we know the block is not shared. */ - if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && - buf != root->node && - (btrfs_header_generation(buf) <= - btrfs_root_last_snapshot(&root->root_item) || - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) { - if (buf != root->commit_root) - return 1; - /* - * An extent buffer that used to be the commit root may still be - * shared because the tree height may have increased and it - * became a child of a higher level root. This can happen when - * snapshotting a subvolume created in the current transaction. - */ - if (btrfs_header_generation(buf) == trans->transid) - return 1; - } - return 0; + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + return false; + + if (buf == root->node) + return false; + + if (buf_gen > btrfs_root_last_snapshot(&root->root_item) && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) + return false; + + if (buf != root->commit_root) + return true; + + /* + * An extent buffer that used to be the commit root may still be shared + * because the tree height may have increased and it became a child of a + * higher level root. This can happen when snapshotting a subvolume + * created in the current transaction. 
+ */ + if (buf_gen == trans->transid) + return true; + + return false; } static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, @@ -812,7 +820,8 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, } while (low < high) { - unsigned long oip; + const int unit_size = folio_size(eb->folios[0]); + unsigned long oil; unsigned long offset; struct btrfs_disk_key *tmp; struct btrfs_disk_key unaligned; @@ -820,14 +829,14 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, mid = (low + high) / 2; offset = p + mid * item_size; - oip = offset_in_page(offset); + oil = get_eb_offset_in_folio(eb, offset); - if (oip + key_size <= PAGE_SIZE) { - const unsigned long idx = get_eb_page_index(offset); - char *kaddr = page_address(eb->pages[idx]); + if (oil + key_size <= unit_size) { + const unsigned long idx = get_eb_folio_index(eb, offset); + char *kaddr = folio_address(eb->folios[idx]); - oip = get_eb_offset_in_page(eb, offset); - tmp = (struct btrfs_disk_key *)(kaddr + oip); + oil = get_eb_offset_in_folio(eb, offset); + tmp = (struct btrfs_disk_key *)(kaddr + oil); } else { read_extent_buffer(eb, &unaligned, offset, key_size); tmp = &unaligned; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 196c005c31f6..70e828d33177 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -212,8 +212,6 @@ struct btrfs_root { u64 last_trans; - u32 type; - u64 free_objectid; struct btrfs_key defrag_progress; @@ -224,18 +222,15 @@ struct btrfs_root { struct list_head root_list; - spinlock_t log_extents_lock[2]; - struct list_head logged_list[2]; - spinlock_t inode_lock; /* red-black tree that keeps track of in-memory inodes */ struct rb_root inode_tree; /* - * radix tree that keeps track of delayed nodes of every inode, - * protected by inode_lock + * Xarray that keeps track of delayed nodes of every inode, protected + * by @inode_lock. */ - struct radix_tree_root delayed_nodes_tree; + struct xarray delayed_nodes; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later @@ -561,9 +556,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); -int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf); +bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf); int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 5244561e2016..c276b136ab63 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -775,7 +775,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * this em, as either we don't care about the generation, or the * merged extent map will be rejected anyway. 
*/ - if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && + if (em && (em->flags & EXTENT_FLAG_MERGED) && newer_than && em->generation >= newer_than) { free_extent_map(em); em = NULL; @@ -802,7 +802,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, const struct extent_map *em) { - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + if (extent_map_is_compressed(em)) return BTRFS_MAX_COMPRESSED; return fs_info->max_extent_size; } @@ -828,7 +828,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, /* No more em or hole */ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) goto out; - if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) + if (next->flags & EXTENT_FLAG_PREALLOC) goto out; /* * If the next extent is at its max capacity, defragging current extent @@ -996,10 +996,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode, em->len <= inode->root->fs_info->max_inline) goto next; - /* Skip hole/delalloc/preallocated extents */ + /* Skip holes and preallocated extents. */ if (em->block_start == EXTENT_MAP_HOLE || - em->block_start == EXTENT_MAP_DELALLOC || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + (em->flags & EXTENT_FLAG_PREALLOC)) goto next; /* Skip older extent */ @@ -1190,7 +1189,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, /* Update the page status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { ClearPageChecked(pages[i]); - btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); + btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 7381241334e8..08102883f560 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -71,7 +71,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( } spin_lock(&root->inode_lock); - node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + node = xa_load(&root->delayed_nodes, ino); if (node) { if (btrfs_inode->delayed_node) { @@ -83,9 +83,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( /* * It's possible that we're racing into the middle of removing - * this node from the radix tree. In this case, the refcount + * this node from the xarray. In this case, the refcount * was zero and it should never go back to one. Just return - * NULL like it was never in the radix at all; our release + * NULL like it was never in the xarray at all; our release * function is in the process of removing it. * * Some implementations of refcount_inc refuse to bump the @@ -93,7 +93,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * here, refcount_inc() may decide to just WARN_ONCE() instead * of actually bumping the refcount. * - * If this node is properly in the radix, we want to bump the + * If this node is properly in the xarray, we want to bump the * refcount twice, once for the inode and once for this get * operation. 
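 *
 * (Editorial note, not part of the kernel diff: the creation path converted
 * just below, btrfs_get_or_create_delayed_node(), follows the common xarray
 * pattern of reserving the slot without the lock held and filling it under
 * the lock. A condensed sketch, reusing the names from this patch:
 *
 *	if (xa_reserve(&root->delayed_nodes, ino, GFP_NOFS))
 *		return ERR_PTR(-ENOMEM);
 *	spin_lock(&root->inode_lock);
 *	if (xa_load(&root->delayed_nodes, ino)) {
 *		spin_unlock(&root->inode_lock);	// lost the race, retry the lookup
 *		goto again;
 *	}
 *	xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
 *	spin_unlock(&root->inode_lock);
 *
 * GFP_ATOMIC is acceptable for the store because the slot was already
 * reserved, so no allocation should happen under the spinlock.)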
*/ @@ -120,6 +120,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( struct btrfs_root *root = btrfs_inode->root; u64 ino = btrfs_ino(btrfs_inode); int ret; + void *ptr; again: node = btrfs_get_delayed_node(btrfs_inode); @@ -131,26 +132,30 @@ again: return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); - /* cached in the btrfs inode and can be accessed */ + /* Cached in the inode and can be accessed. */ refcount_set(&node->refs, 2); - ret = radix_tree_preload(GFP_NOFS); - if (ret) { + /* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */ + ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS); + if (ret == -ENOMEM) { kmem_cache_free(delayed_node_cache, node); - return ERR_PTR(ret); + return ERR_PTR(-ENOMEM); } - spin_lock(&root->inode_lock); - ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); - if (ret == -EEXIST) { + ptr = xa_load(&root->delayed_nodes, ino); + if (ptr) { + /* Somebody inserted it, go back and read it. */ spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, node); - radix_tree_preload_end(); + node = NULL; goto again; } + ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC); + ASSERT(xa_err(ptr) != -EINVAL); + ASSERT(xa_err(ptr) != -ENOMEM); + ASSERT(ptr == NULL); btrfs_inode->delayed_node = node; spin_unlock(&root->inode_lock); - radix_tree_preload_end(); return node; } @@ -269,8 +274,7 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. */ ASSERT(refcount_read(&delayed_node->refs) == 0); - radix_tree_delete(&root->delayed_nodes_tree, - delayed_node->inode_id); + xa_erase(&root->delayed_nodes, delayed_node->inode_id); spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, delayed_node); } @@ -1036,14 +1040,33 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) goto out; - path->slots[0]++; - if (path->slots[0] >= btrfs_header_nritems(leaf)) - goto search; -again: + /* + * Now we're going to delete the INODE_REF/EXTREF, which should be the + * only one ref left. Check if the next item is an INODE_REF/EXTREF. + * + * But if we're the last item already, release and search for the last + * INODE_REF/EXTREF. 
+ */ + if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) { + key.objectid = node->inode_id; + key.type = BTRFS_INODE_EXTREF_KEY; + key.offset = (u64)-1; + + btrfs_release_path(path); + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto err_out; + ASSERT(ret > 0); + ASSERT(path->slots[0] > 0); + ret = 0; + path->slots[0]--; + leaf = path->nodes[0]; + } else { + path->slots[0]++; + } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != node->inode_id) goto out; - if (key.type != BTRFS_INODE_REF_KEY && key.type != BTRFS_INODE_EXTREF_KEY) goto out; @@ -1070,22 +1093,6 @@ err_out: btrfs_abort_transaction(trans, ret); return ret; - -search: - btrfs_release_path(path); - - key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = -1; - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto err_out; - ASSERT(ret); - - ret = 0; - leaf = path->nodes[0]; - path->slots[0]--; - goto again; } static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, @@ -2035,34 +2042,36 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { - u64 inode_id = 0; + unsigned long index = 0; struct btrfs_delayed_node *delayed_nodes[8]; - int i, n; while (1) { + struct btrfs_delayed_node *node; + int count; + spin_lock(&root->inode_lock); - n = radix_tree_gang_lookup(&root->delayed_nodes_tree, - (void **)delayed_nodes, inode_id, - ARRAY_SIZE(delayed_nodes)); - if (!n) { + if (xa_empty(&root->delayed_nodes)) { spin_unlock(&root->inode_lock); - break; + return; } - inode_id = delayed_nodes[n - 1]->inode_id + 1; - for (i = 0; i < n; i++) { + count = 0; + xa_for_each_start(&root->delayed_nodes, index, node, index) { /* * Don't increase refs in case the node is dead and * about to be removed from the tree in the loop below */ - if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) - delayed_nodes[i] = NULL; + if (refcount_inc_not_zero(&node->refs)) { + delayed_nodes[count] = node; + count++; + } + if (count >= ARRAY_SIZE(delayed_nodes)) + break; } spin_unlock(&root->inode_lock); + index++; - for (i = 0; i < n; i++) { - if (!delayed_nodes[i]) - continue; + for (int i = 0; i < count; i++) { __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f9544fda38e9..1502d664c892 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -550,8 +550,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, u64 physical) { struct btrfs_fs_info *fs_info = cache->fs_info; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 chunk_offset = cache->start; int num_extents, cur_extent; int i; @@ -567,9 +566,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, } spin_unlock(&cache->lock); - em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - ASSERT(!IS_ERR(em)); - map = em->map_lookup; + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + ASSERT(!IS_ERR(map)); num_extents = 0; cur_extent = 0; @@ -583,7 +581,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, cur_extent = i; } - free_extent_map(em); + btrfs_free_chunk_map(map); if (num_extents > 1 && cur_extent < num_extents - 1) { /* @@ -812,25 +810,23 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct 
extent_map *em; - struct map_lookup *map; u64 start = 0; int i; - write_lock(&em_tree->lock); + write_lock(&fs_info->mapping_tree_lock); do { - em = lookup_extent_mapping(em_tree, start, (u64)-1); - if (!em) + struct btrfs_chunk_map *map; + + map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX); + if (!map) break; - map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; - start = em->start + em->len; - free_extent_map(em); + start = map->start + map->chunk_len; + btrfs_free_chunk_map(map); } while (start); - write_unlock(&em_tree->lock); + write_unlock(&fs_info->mapping_tree_lock); } static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 62cb97f7c94f..c6907d533fe8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -74,20 +74,37 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) static void csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; - const int num_pages = num_extent_pages(buf); - const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); + int num_pages; + u32 first_page_part; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; int i; shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start); + + if (buf->addr) { + /* Pages are contiguous, handle them as a big one. */ + kaddr = buf->addr; + first_page_part = fs_info->nodesize; + num_pages = 1; + } else { + kaddr = folio_address(buf->folios[0]); + first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); + num_pages = num_extent_pages(buf); + } + crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, first_page_part - BTRFS_CSUM_SIZE); + /* + * Multiple single-page folios case would reach here. + * + * nodesize <= PAGE_SIZE and large folio all handled by above + * crypto_shash_update() already. + */ for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { - kaddr = page_address(buf->pages[i]); + kaddr = folio_address(buf->folios[i]); crypto_shash_update(shash, kaddr, PAGE_SIZE); } memset(result, 0, BTRFS_CSUM_SIZE); @@ -166,20 +183,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i, num_pages = num_extent_pages(eb); + int num_folios = num_extent_folios(eb); int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - u64 start = max_t(u64, eb->start, page_offset(p)); - u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE); + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + u64 start = max_t(u64, eb->start, folio_pos(folio)); + u64 end = min_t(u64, eb->start + eb->len, + folio_pos(folio) + folio_size(folio)); u32 len = end - start; ret = btrfs_repair_io_failure(fs_info, 0, start, len, - start, p, offset_in_page(start), mirror_num); + start, folio, offset_in_folio(folio, start), + mirror_num); if (ret) break; } @@ -254,15 +273,20 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) return BLK_STS_IOERR; - if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { - WARN_ON_ONCE(found_start != 0); + /* + * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't + * checksum it but zero-out its content. 
This is done to preserve + * ordering of I/O without unnecessarily writing out data. + */ + if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) { + memzero_extent_buffer(eb, 0, eb->len); return BLK_STS_OK; } if (WARN_ON_ONCE(found_start != eb->start)) return BLK_STS_IOERR; - if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start, - eb->len))) + if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0], + eb->start, eb->len))) return BLK_STS_IOERR; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, @@ -371,8 +395,8 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, } csum_tree_block(eb, result); - header_csum = page_address(eb->pages[0]) + - get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum)); + header_csum = folio_address(eb->folios[0]) + + get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum)); if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, @@ -639,7 +663,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; - INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); + /* GFP flags are compatible with XA_FLAGS_*. */ + xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); btrfs_init_root_block_rsv(root); @@ -650,14 +675,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&root->ordered_extents); INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->reloc_dirty_list); - INIT_LIST_HEAD(&root->logged_list[0]); - INIT_LIST_HEAD(&root->logged_list[1]); spin_lock_init(&root->inode_lock); spin_lock_init(&root->delalloc_lock); spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); - spin_lock_init(&root->log_extents_lock[0]); - spin_lock_init(&root->log_extents_lock[1]); spin_lock_init(&root->qgroup_meta_rsv_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); @@ -2618,9 +2639,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) */ btrfs_set_super_log_root(sb, 0); - /* We can't trust the free space cache either */ - btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); - btrfs_warn(fs_info, "try to load backup roots slot %d", i); ret = read_backup_root(fs_info, i); backup_index = ret; @@ -2724,7 +2742,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->allocated_ebs); spin_lock_init(&fs_info->eb_leak_lock); #endif - extent_map_tree_init(&fs_info->mapping_tree); + fs_info->mapping_tree = RB_ROOT_CACHED; + rwlock_init(&fs_info->mapping_tree_lock); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); @@ -2794,6 +2813,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; + /* Default compress algorithm when user does -o compress */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE; spin_lock_init(&fs_info->swapfile_pins_lock); @@ -2931,17 +2953,6 @@ out: } /* - * Some options only have meaning at mount time and shouldn't persist across - * remounts, or be displayed. Clear these at the end of mount and remount - * code paths. 
- */ -void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) -{ - btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); - btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); -} - -/* * Mounting logic specific to read-write file systems. Shared by open_ctree * and btrfs_remount when remounting from read-only to read-write. */ @@ -2953,7 +2964,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (btrfs_test_opt(fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - rebuild_free_space_tree = true; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + btrfs_warn(fs_info, + "'clear_cache' option is ignored with extent tree v2"); + else + rebuild_free_space_tree = true; } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { btrfs_warn(fs_info, "free space tree is invalid"); @@ -3276,13 +3291,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) WRITE_ONCE(fs_info->fs_error, -EUCLEAN); - /* - * In the long term, we'll store the compression type in the super - * block, and it'll be used for per file compression control. - */ - fs_info->compress_type = BTRFS_COMPRESS_ZLIB; - - /* Set up fs_info before parsing mount options */ nodesize = btrfs_super_nodesize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); @@ -3296,28 +3304,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; - ret = btrfs_parse_options(fs_info, options, sb->s_flags); - if (ret) + /* + * Handle the space caching options appropriately now that we have the + * super block loaded and validated. + */ + btrfs_set_free_space_cache_settings(fs_info); + + if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) { + ret = -EINVAL; goto fail_alloc; + } ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); if (ret < 0) goto fail_alloc; + /* + * At this point our mount options are validated, if we set ->max_inline + * to something non-standard make sure we truncate it to sectorsize. + */ + fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); + if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; - /* - * V1 space cache has some hardcoded PAGE_SIZE usage, and is - * going to be deprecated. - * - * Force to use v2 cache for subpage case. - */ - btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); - btrfs_set_and_info(fs_info, FREE_SPACE_TREE, - "forcing free space tree for sector size %u with page size %lu", - sectorsize, PAGE_SIZE); - btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); @@ -3494,29 +3504,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_cleaner; } - if (!btrfs_test_opt(fs_info, NOSSD) && - !fs_info->fs_devices->rotating) { - btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); - } - - /* - * For devices supporting discard turn on discard=async automatically, - * unless it's already set or disabled. This could be turned off by - * nodiscard for the same mount. - * - * The zoned mode piggy backs on the discard functionality for - * resetting a zone. There is no reason to delay the zone reset as it is - * fast enough. So, do not enable async discard for zoned mode. 
- */ - if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || - btrfs_test_opt(fs_info, DISCARD_ASYNC) || - btrfs_test_opt(fs_info, NODISCARD)) && - fs_info->fs_devices->discardable && - !btrfs_is_zoned(fs_info)) { - btrfs_set_and_info(fs_info, DISCARD_ASYNC, - "auto enabling async discard"); - } - ret = btrfs_read_qgroup_config(fs_info); if (ret) goto fail_trans_kthread; @@ -3542,7 +3529,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } if (sb_rdonly(sb)) - goto clear_oneshot; + return 0; ret = btrfs_start_pre_rw_mount(fs_info); if (ret) { @@ -3570,8 +3557,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); -clear_oneshot: - btrfs_clear_oneshot_options(fs_info); return 0; fail_qgroup: @@ -3608,7 +3593,7 @@ fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); fail_alloc: - btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_mapping_tree_free(fs_info); iput(fs_info->btree_inode); fail: @@ -4391,7 +4376,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) iput(fs_info->btree_inode); - btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_mapping_tree_free(fs_info); btrfs_close_devices(fs_info->fs_devices); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 50dab8f639dc..9413726b329b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -37,9 +37,6 @@ struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, int level); -void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, - struct extent_buffer *buf); -void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index ea149be28dff..e3ee5449cc4a 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -58,12 +58,13 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - struct btrfs_inode *inode = tree->inode; + const struct btrfs_inode *inode; u64 isize; - if (!inode) + if (tree->owner != IO_TREE_INODE_IO) return; + inode = extent_io_tree_to_inode_const(tree); isize = i_size_read(&inode->vfs_inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { btrfs_debug_rl(inode->root->fs_info, @@ -78,31 +79,46 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif + /* - * For the file_extent_tree, we want to hold the inode lock when we lookup and - * update the disk_i_size, but lockdep will complain because our io_tree we hold - * the tree lock and get the inode lock when setting delalloc. These two things - * are unrelated, so make a class for the file_extent_tree so we don't get the - * two locking patterns mixed up. + * The only tree allowed to set the inode is IO_TREE_INODE_IO. */ -static struct lock_class_key file_extent_tree_class; +static bool is_inode_io_tree(const struct extent_io_tree *tree) +{ + return tree->owner == IO_TREE_INODE_IO; +} + +/* Return the inode if it's valid for the given tree, otherwise NULL. 
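 *
 * (Editorial note, not part of the kernel diff: after this change the tree
 * keeps either an fs_info or an inode pointer in a union, discriminated by
 * the existing ->owner field; see the extent-io-tree.h hunk further down:
 *
 *	union {
 *		struct btrfs_fs_info *fs_info;	// any owner except IO_TREE_INODE_IO
 *		struct btrfs_inode *inode;	// only when owner == IO_TREE_INODE_IO
 *	};
 *	u8 owner;
 *
 * The accessors added here are the intended way to read that union; for an
 * inode-owned tree the fs_info is reached as inode->root->fs_info, which is
 * what extent_io_tree_to_fs_info() does.)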
*/ +struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree) +{ + if (tree->owner == IO_TREE_INODE_IO) + return tree->inode; + return NULL; +} + +/* Read-only access to the inode. */ +const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree) +{ + if (tree->owner == IO_TREE_INODE_IO) + return tree->inode; + return NULL; +} -struct tree_entry { - u64 start; - u64 end; - struct rb_node rb_node; -}; +/* For read-only access to fs_info. */ +const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree) +{ + if (tree->owner == IO_TREE_INODE_IO) + return tree->inode->root->fs_info; + return tree->fs_info; +} void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner) { - tree->fs_info = fs_info; tree->state = RB_ROOT; spin_lock_init(&tree->lock); - tree->inode = NULL; + tree->fs_info = fs_info; tree->owner = owner; - if (owner == IO_TREE_INODE_FILE_EXTENT) - lockdep_set_class(&tree->lock, &file_extent_tree_class); } /* @@ -329,10 +345,14 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 return tree_search_for_insert(tree, offset, NULL, NULL); } -static void extent_io_tree_panic(struct extent_io_tree *tree, int err) +static void extent_io_tree_panic(const struct extent_io_tree *tree, + const struct extent_state *state, + const char *opname, + int err) { - btrfs_panic(tree->fs_info, err, - "locking error: extent tree was modified by another thread while locked"); + btrfs_panic(extent_io_tree_to_fs_info(tree), err, + "extent io tree error on %s state start %llu end %llu", + opname, state->start, state->end); } static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state) @@ -341,8 +361,9 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s prev = prev_state(state); if (prev && prev->end == state->start - 1 && prev->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, prev); + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), + state, prev); state->start = prev->start; rb_erase(&prev->rb_node, &tree->state); RB_CLEAR_NODE(&prev->rb_node); @@ -356,8 +377,9 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s next = next_state(state); if (next && next->start == state->end + 1 && next->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, next); + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), + state, next); state->end = next->end; rb_erase(&next->rb_node, &tree->state); RB_CLEAR_NODE(&next->rb_node); @@ -390,8 +412,8 @@ static void set_state_bits(struct extent_io_tree *tree, u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; - if (tree->inode) - btrfs_set_delalloc_extent(tree->inode, state, bits); + if (is_inode_io_tree(tree)) + btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); @@ -436,9 +458,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, if (state->end < entry->start) { if (try_merge && end == entry->start && state->state == entry->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, - state, entry); + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent( + extent_io_tree_to_inode(tree), + state, entry); entry->start = state->start; merge_prev_state(tree, 
entry); state->state = 0; @@ -448,9 +471,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, } else if (state->end > entry->end) { if (try_merge && entry->end == start && state->state == entry->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, - state, entry); + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent( + extent_io_tree_to_inode(tree), + state, entry); entry->end = state->end; merge_next_state(tree, entry); state->state = 0; @@ -458,9 +482,6 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, } node = &(*node)->rb_right; } else { - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - entry->start, entry->end, state->start, state->end); return ERR_PTR(-EEXIST); } } @@ -505,8 +526,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node *parent = NULL; struct rb_node **node; - if (tree->inode) - btrfs_split_delalloc_extent(tree->inode, orig, split); + if (is_inode_io_tree(tree)) + btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig, + split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -553,8 +575,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; - if (tree->inode) - btrfs_clear_delalloc_extent(tree->inode, state, bits); + if (is_inode_io_tree(tree)) + btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state, + bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); @@ -695,7 +718,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, start); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); prealloc = NULL; if (err) @@ -717,7 +740,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); if (wake) wake_up(&state->wq); @@ -939,6 +962,8 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, struct extent_state *state; int ret = 1; + ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES)); + spin_lock(&tree->lock); state = find_first_extent_bit_state(tree, start, bits); if (state) { @@ -1152,7 +1177,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, start); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); prealloc = NULL; if (err) @@ -1200,7 +1225,7 @@ hit_next: inserted_state = insert_state(tree, prealloc, bits, changeset); if (IS_ERR(inserted_state)) { err = PTR_ERR(inserted_state); - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, prealloc, "insert", err); } cache_state(inserted_state, cached_state); @@ -1228,7 +1253,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); @@ -1382,7 +1407,7 @@ hit_next: } err = split_state(tree, state, prealloc, start); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); prealloc = NULL; if (err) goto out; @@ -1430,7 +1455,7 @@ hit_next: inserted_state = insert_state(tree, prealloc, bits, NULL); if (IS_ERR(inserted_state)) { err = PTR_ERR(inserted_state); - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, 
prealloc, "insert", err); } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) @@ -1453,7 +1478,7 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 5602b0137fcd..ebe6390d65e9 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -87,9 +87,17 @@ enum { struct extent_io_tree { struct rb_root state; - struct btrfs_fs_info *fs_info; - /* Inode associated with this tree, or NULL. */ - struct btrfs_inode *inode; + /* + * The fs_info is needed for trace points, a tree attached to an inode + * needs the inode. + * + * owner == IO_TREE_INODE_IO - then inode is valid and fs_info can be + * accessed as inode->root->fs_info + */ + union { + struct btrfs_fs_info *fs_info; + struct btrfs_inode *inode; + }; /* Who owns this io tree, should be one of IO_TREE_* */ u8 owner; @@ -112,6 +120,10 @@ struct extent_state { #endif }; +struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree); +const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree); +const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree); + void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 01423670bc8a..f396aba92c57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3447,6 +3447,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_ref generic_ref = { 0 }; + struct btrfs_block_group *bg; int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, @@ -3460,67 +3461,64 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ } - if (last_ref && btrfs_header_generation(buf) == trans->transid) { - struct btrfs_block_group *cache; - bool must_pin = false; - - if (root_id != BTRFS_TREE_LOG_OBJECTID) { - ret = check_ref_cleanup(trans, buf->start); - if (!ret) { - btrfs_redirty_list_add(trans->transaction, buf); - goto out; - } - } + if (!last_ref) + return; - cache = btrfs_lookup_block_group(fs_info, buf->start); + if (btrfs_header_generation(buf) != trans->transid) + goto out; - if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(trans, cache, buf->start, buf->len, 1); - btrfs_put_block_group(cache); + if (root_id != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, buf->start); + if (!ret) goto out; - } + } - /* - * If there are tree mod log users we may have recorded mod log - * operations for this node. If we re-allocate this node we - * could replay operations on this node that happened when it - * existed in a completely different root. For example if it - * was part of root A, then was reallocated to root B, and we - * are doing a btrfs_old_search_slot(root b), we could replay - * operations that happened when the block was part of root A, - * giving us an inconsistent view of the btree. 
- * - * We are safe from races here because at this point no other - * node or root points to this extent buffer, so if after this - * check a new tree mod log user joins we will not have an - * existing log of operations on this node that we have to - * contend with. - */ - if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) - must_pin = true; + bg = btrfs_lookup_block_group(fs_info, buf->start); - if (must_pin || btrfs_is_zoned(fs_info)) { - btrfs_redirty_list_add(trans->transaction, buf); - pin_down_extent(trans, cache, buf->start, buf->len, 1); - btrfs_put_block_group(cache); - goto out; - } + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(trans, bg, buf->start, buf->len, 1); + btrfs_put_block_group(bg); + goto out; + } - WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + /* + * If there are tree mod log users we may have recorded mod log + * operations for this node. If we re-allocate this node we + * could replay operations on this node that happened when it + * existed in a completely different root. For example if it + * was part of root A, then was reallocated to root B, and we + * are doing a btrfs_old_search_slot(root b), we could replay + * operations that happened when the block was part of root A, + * giving us an inconsistent view of the btree. + * + * We are safe from races here because at this point no other + * node or root points to this extent buffer, so if after this + * check a new tree mod log user joins we will not have an + * existing log of operations on this node that we have to + * contend with. + */ - btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_free_reserved_bytes(cache, buf->len, 0); - btrfs_put_block_group(cache); - trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); + if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) + || btrfs_is_zoned(fs_info)) { + pin_down_extent(trans, bg, buf->start, buf->len, 1); + btrfs_put_block_group(bg); + goto out; } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(bg, buf->start, buf->len); + btrfs_free_reserved_bytes(bg, buf->len, 0); + btrfs_put_block_group(bg); + trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); + out: - if (last_ref) { - /* - * Deleting the buffer, clear the corrupt flag since it doesn't - * matter anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); - } + + /* + * Deleting the buffer, clear the corrupt flag since it doesn't + * matter anymore. 
+ */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); } /* Can return -ENOMEM */ @@ -5061,7 +5059,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, __btrfs_tree_lock(buf, nest); btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); - clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); + clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags); set_extent_buffer_uptodate(buf); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8f724c54fc8e..cfd2967f04a2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -184,22 +184,23 @@ static void process_one_page(struct btrfs_fs_info *fs_info, struct page *page, struct page *locked_page, unsigned long page_ops, u64 start, u64 end) { + struct folio *folio = page_folio(page); u32 len; ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); len = end + 1 - start; if (page_ops & PAGE_SET_ORDERED) - btrfs_page_clamp_set_ordered(fs_info, page, start, len); + btrfs_folio_clamp_set_ordered(fs_info, folio, start, len); if (page_ops & PAGE_START_WRITEBACK) { - btrfs_page_clamp_clear_dirty(fs_info, page, start, len); - btrfs_page_clamp_set_writeback(fs_info, page, start, len); + btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len); + btrfs_folio_clamp_set_writeback(fs_info, folio, start, len); } if (page_ops & PAGE_END_WRITEBACK) - btrfs_page_clamp_clear_writeback(fs_info, page, start, len); + btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); if (page != locked_page && (page_ops & PAGE_UNLOCK)) - btrfs_page_end_writer_lock(fs_info, page, start, len); + btrfs_folio_end_writer_lock(fs_info, folio, start, len); } static void __process_pages_contig(struct address_space *mapping, @@ -271,19 +272,20 @@ static noinline int lock_delalloc_pages(struct inode *inode, goto out; for (i = 0; i < found_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; + struct page *page = folio_page(folio, 0); u32 len = end + 1 - start; if (page == locked_page) continue; - if (btrfs_page_start_writer_lock(fs_info, page, start, - len)) + if (btrfs_folio_start_writer_lock(fs_info, folio, start, + len)) goto out; if (!PageDirty(page) || page->mapping != mapping) { - btrfs_page_end_writer_lock(fs_info, page, start, - len); + btrfs_folio_end_writer_lock(fs_info, folio, start, + len); goto out; } @@ -432,60 +434,65 @@ static bool btrfs_verify_page(struct page *page, u64 start) static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct folio *folio = page_folio(page); ASSERT(page_offset(page) <= start && start + len <= page_offset(page) + PAGE_SIZE); if (uptodate && btrfs_verify_page(page, start)) - btrfs_page_set_uptodate(fs_info, page, start, len); + btrfs_folio_set_uptodate(fs_info, folio, start, len); else - btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, page->mapping)) unlock_page(page); else - btrfs_subpage_end_reader(fs_info, page, start, len); + btrfs_subpage_end_reader(fs_info, folio, start, len); } /* - * after a writepage IO is done, we need to: - * clear the uptodate bits on error - * clear the writeback bits in the extent tree for this IO - * end_page_writeback if the page has no more pending IO + * After a write IO is done, we need to: + * + * - clear the uptodate bits on error + * - clear the 
writeback bits in the extent tree for the range + * - filio_end_writeback() if there is no more pending io for the folio * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct btrfs_bio *bbio) +static void end_bbio_data_write(struct btrfs_bio *bbio) { struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - struct inode *inode = page->mapping->host; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; - u64 start = page_offset(page) + bvec->bv_offset; - u32 len = bvec->bv_len; + u64 start = folio_pos(folio) + fi.offset; + u32 len = fi.length; + + /* Only order 0 (single page) folios are allowed for data. */ + ASSERT(folio_order(folio) == 0); /* Our read/write should always be sector aligned. */ - if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, - "partial page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else if (!IS_ALIGNED(bvec->bv_len, sectorsize)) + "partial page write in btrfs with offset %zu and length %zu", + fi.offset, fi.length); + else if (!IS_ALIGNED(fi.length, sectorsize)) btrfs_info(fs_info, - "incomplete page write with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); + "incomplete page write with offset %zu and length %zu", + fi.offset, fi.length); - btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); + btrfs_finish_ordered_extent(bbio->ordered, + folio_page(folio, 0), start, len, !error); if (error) - mapping_set_error(page->mapping, error); - btrfs_page_clear_writeback(fs_info, page, start, len); + mapping_set_error(folio->mapping, error); + btrfs_folio_clear_writeback(fs_info, folio, start, len); } bio_put(bio); @@ -562,98 +569,102 @@ update: static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) { - ASSERT(PageLocked(page)); - if (!btrfs_is_subpage(fs_info, page)) + struct folio *folio = page_folio(page); + + ASSERT(folio_test_locked(folio)); + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(PagePrivate(page)); - btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); + ASSERT(folio_test_private(folio)); + btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE); } /* - * after a readpage IO is done, we need to: - * clear the uptodate bits on error - * set the uptodate bits if things worked - * set the page up to date if all extents in the tree are uptodate - * clear the lock bit in the extent tree - * unlock the page if there are no other extents locked for it + * After a data read IO is done, we need to: + * + * - clear the uptodate bits on error + * - set the uptodate bits if things worked + * - set the folio up to date if all extents in the tree are uptodate + * - clear the lock bit in the extent tree + * - unlock the folio if there are no other extents locked for it * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. 
*/ -static void end_bio_extent_readpage(struct btrfs_bio *bbio) +static void end_bbio_data_read(struct btrfs_bio *bbio) { struct bio *bio = &bbio->bio; - struct bio_vec *bvec; struct processed_extent processed = { 0 }; + struct folio_iter fi; /* * The offset to the beginning of a bio, since one bio can never be * larger than UINT_MAX, u32 here is enough. */ u32 bio_offset = 0; - struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { + bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; - struct page *page = bvec->bv_page; - struct inode *inode = page->mapping->host; + struct folio *folio = fi.folio; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; u64 start; u64 end; u32 len; + /* For now only order 0 folios are supported for data. */ + ASSERT(folio_order(folio) == 0); btrfs_debug(fs_info, - "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - bio->bi_iter.bi_sector, bio->bi_status, + "%s: bi_sector=%llu, err=%d, mirror=%u", + __func__, bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); /* * We always issue full-sector reads, but if some block in a - * page fails to read, blk_update_request() will advance + * folio fails to read, blk_update_request() will advance * bv_offset and adjust bv_len to compensate. Print a warning * for unaligned offsets, and an error if they don't add up to * a full sector. */ - if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, - "partial page read in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len, - sectorsize)) + "partial page read in btrfs with offset %zu and length %zu", + fi.offset, fi.length); + else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize)) btrfs_info(fs_info, - "incomplete page read with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); + "incomplete page read with offset %zu and length %zu", + fi.offset, fi.length); - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; - len = bvec->bv_len; + start = folio_pos(folio) + fi.offset; + end = start + fi.length - 1; + len = fi.length; if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; + pgoff_t end_index = i_size >> folio_shift(folio); /* * Zero out the remaining part if this range straddles * i_size. * - * Here we should only zero the range inside the bvec, + * Here we should only zero the range inside the folio, * not touch anything else. * * NOTE: i_size is exclusive while end is inclusive. */ - if (page->index == end_index && i_size <= end) { - u32 zero_start = max(offset_in_page(i_size), - offset_in_page(start)); + if (folio_index(folio) == end_index && i_size <= end) { + u32 zero_start = max(offset_in_folio(folio, i_size), + offset_in_folio(folio, start)); + u32 zero_len = offset_in_folio(folio, end) + 1 - + zero_start; - zero_user_segment(page, zero_start, - offset_in_page(end) + 1); + folio_zero_range(folio, zero_start, zero_len); } } /* Update page status and unlock. 
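 *
 * (Editorial note, not part of the kernel diff: both completion handlers in
 * this hunk switch from bio_for_each_segment_all() to per-folio iteration.
 * The general shape, with a hypothetical per-range helper:
 *
 *	struct folio_iter fi;
 *
 *	bio_for_each_folio_all(fi, &bbio->bio) {
 *		struct folio *folio = fi.folio;
 *		u64 start = folio_pos(folio) + fi.offset;	// file position of this range
 *		u32 len = fi.length;				// contiguous bytes in the folio
 *
 *		example_finish_range(folio, start, len);	// hypothetical
 *	}
 *
 * fi.offset and fi.length are folio-relative size_t values, which is why the
 * alignment checks above print them with %zu and compare them against the
 * sector size.)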
*/ - end_page_read(page, uptodate, start, len); + end_page_read(folio_page(folio, 0), uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); @@ -672,19 +683,22 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) * @nr_pages: number of pages to allocate * @page_array: the array to fill with pages; any existing non-null entries in * the array will be skipped + * @extra_gfp: the extra GFP flags for the allocation. * * Return: 0 if all pages were able to be allocated; * -ENOMEM otherwise, the partially allocated pages would be freed and * the array slots zeroed */ -int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, + gfp_t extra_gfp) { unsigned int allocated; for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); + allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp, + nr_pages, page_array); if (allocated == nr_pages) return 0; @@ -707,6 +721,26 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) return 0; } +/* + * Populate needed folios for the extent buffer. + * + * For now, the folios populated are always in order 0 (aka, single page). + */ +static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp) +{ + struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 }; + int num_pages = num_extent_pages(eb); + int ret; + + ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp); + if (ret < 0) + return ret; + + for (int i = 0; i < num_pages; i++) + eb->folios[i] = page_folio(page_array[i]); + return 0; +} + static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, unsigned int pg_offset) @@ -861,9 +895,9 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, } while (size); } -static int attach_extent_buffer_page(struct extent_buffer *eb, - struct page *page, - struct btrfs_subpage *prealloc) +static int attach_extent_buffer_folio(struct extent_buffer *eb, + struct folio *folio, + struct btrfs_subpage *prealloc) { struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; @@ -874,65 +908,66 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, * For cloned or dummy extent buffers, their pages are not mapped and * will not race with any other ebs. 
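 *
 * (Editorial note, not part of the kernel diff: this function and the
 * set/clear_page_extent_mapped() helpers below move from the page-private
 * API to the folio equivalents. Condensed and slightly merged from the hunks
 * that follow:
 *
 *	struct folio *folio = page_folio(page);
 *
 *	if (!folio_test_private(folio))
 *		folio_attach_private(folio, eb);	// stash the eb in folio->private
 *	else
 *		WARN_ON(folio_get_private(folio) != eb);
 *
 * i.e. PagePrivate()/attach_page_private()/page->private become
 * folio_test_private()/folio_attach_private()/folio_get_private().)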
*/ - if (page->mapping) - lockdep_assert_held(&page->mapping->private_lock); + if (folio->mapping) + lockdep_assert_held(&folio->mapping->i_private_lock); if (fs_info->nodesize >= PAGE_SIZE) { - if (!PagePrivate(page)) - attach_page_private(page, eb); + if (!folio_test_private(folio)) + folio_attach_private(folio, eb); else - WARN_ON(page->private != (unsigned long)eb); + WARN_ON(folio_get_private(folio) != eb); return 0; } /* Already mapped, just free prealloc */ - if (PagePrivate(page)) { + if (folio_test_private(folio)) { btrfs_free_subpage(prealloc); return 0; } if (prealloc) /* Has preallocated memory for subpage */ - attach_page_private(page, prealloc); + folio_attach_private(folio, prealloc); else /* Do new allocation to attach subpage */ - ret = btrfs_attach_subpage(fs_info, page, - BTRFS_SUBPAGE_METADATA); + ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); return ret; } int set_page_extent_mapped(struct page *page) { + struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info; ASSERT(page->mapping); - if (PagePrivate(page)) + if (folio_test_private(folio)) return 0; fs_info = btrfs_sb(page->mapping->host->i_sb); - if (btrfs_is_subpage(fs_info, page)) - return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); + if (btrfs_is_subpage(fs_info, page->mapping)) + return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); - attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); return 0; } void clear_page_extent_mapped(struct page *page) { + struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info; ASSERT(page->mapping); - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return; fs_info = btrfs_sb(page->mapping->host->i_sb); - if (btrfs_is_subpage(fs_info, page)) - return btrfs_detach_subpage(fs_info, page); + if (btrfs_is_subpage(fs_info, page->mapping)) + return btrfs_detach_subpage(fs_info, folio); - detach_page_private(page); + folio_detach_private(folio); } static struct extent_map * @@ -1001,7 +1036,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, memzero_page(page, zero_offset, iosize); } } - bio_ctrl->end_io_func = end_bio_extent_readpage; + bio_ctrl->end_io_func = end_bbio_data_read; begin_page_read(fs_info, page); while (cur <= end) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; @@ -1027,8 +1062,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - compress_type = em->compress_type; + compress_type = extent_map_compression(em); iosize = min(extent_map_end(em) - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); @@ -1037,7 +1071,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, else disk_bytenr = em->block_start + extent_offset; block_start = em->block_start; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + if (em->flags & EXTENT_FLAG_PREALLOC) block_start = EXTENT_MAP_HOLE; /* @@ -1074,7 +1108,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * is a corner case so we prioritize correctness over * non-optimal behavior (submitting 2 bios for the same extent). 
*/ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && + if (compress_type != BTRFS_COMPRESS_NONE && prev_em_start && *prev_em_start != (u64)-1 && *prev_em_start != em->start) force_bio_submit = true; @@ -1240,7 +1274,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, struct page *page, u64 *start, u64 *end) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); struct btrfs_subpage_info *spi = fs_info->subpage_info; u64 orig_start = *start; /* Declare as unsigned long so we can use bitmap ops */ @@ -1252,7 +1287,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, * For regular sector size == page size case, since one page only * contains one sector, we return the page offset directly. */ - if (!btrfs_is_subpage(fs_info, page)) { + if (!btrfs_is_subpage(fs_info, page->mapping)) { *start = page_offset(page); *end = page_offset(page) + PAGE_SIZE; return; @@ -1305,7 +1340,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, return 1; } - bio_ctrl->end_io_func = end_bio_extent_writepage; + bio_ctrl->end_io_func = end_bbio_data_write; while (cur <= end) { u32 len = end - cur + 1; u64 disk_bytenr; @@ -1325,7 +1360,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. */ - btrfs_page_clear_dirty(fs_info, page, cur, len); + btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len); break; } @@ -1352,7 +1387,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, block_start = em->block_start; disk_bytenr = em->block_start + extent_offset; - ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(!extent_map_is_compressed(em)); ASSERT(block_start != EXTENT_MAP_HOLE); ASSERT(block_start != EXTENT_MAP_INLINE); @@ -1377,7 +1412,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * So clear subpage dirty bit here so next time we won't submit * page for range already written to disk. 
*/ - btrfs_page_clear_dirty(fs_info, page, cur, iosize); + btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize); submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, cur - page_offset(page)); @@ -1385,7 +1420,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, nr++; } - btrfs_page_assert_not_dirty(fs_info, page); + btrfs_folio_assert_not_dirty(fs_info, page_folio(page)); *nr_ret = nr; return 0; @@ -1607,24 +1642,23 @@ static struct extent_buffer *find_extent_buffer_nolock( return NULL; } -static void extent_buffer_write_end_io(struct btrfs_bio *bbio) +static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct bvec_iter_all iter_all; - struct bio_vec *bvec; + struct folio_iter fi; u32 bio_offset = 0; if (!uptodate) set_btree_ioerr(eb); - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + bio_for_each_folio_all(fi, &bbio->bio) { u64 start = eb->start + bio_offset; - struct page *page = bvec->bv_page; - u32 len = bvec->bv_len; + struct folio *folio = fi.folio; + u32 len = fi.length; - btrfs_page_clear_writeback(fs_info, page, start, len); + btrfs_folio_clear_writeback(fs_info, folio, start, len); bio_offset += len; } @@ -1673,36 +1707,44 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), - eb->fs_info, extent_buffer_write_end_io, eb); + eb->fs_info, end_bbio_meta_write, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, &bbio->bio); bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; if (fs_info->nodesize < PAGE_SIZE) { - struct page *p = eb->pages[0]; + struct folio *folio = eb->folios[0]; + bool ret; - lock_page(p); - btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len); - if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start, + folio_lock(folio); + btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len); + if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len)) { - clear_page_dirty_for_io(p); + folio_clear_dirty_for_io(folio); wbc->nr_to_write--; } - __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p)); - wbc_account_cgroup_owner(wbc, p, eb->len); - unlock_page(p); + ret = bio_add_folio(&bbio->bio, folio, eb->len, + eb->start - folio_pos(folio)); + ASSERT(ret); + wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len); + folio_unlock(folio); } else { - for (int i = 0; i < num_extent_pages(eb); i++) { - struct page *p = eb->pages[i]; - - lock_page(p); - clear_page_dirty_for_io(p); - set_page_writeback(p); - __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0); - wbc_account_cgroup_owner(wbc, p, PAGE_SIZE); - wbc->nr_to_write--; - unlock_page(p); + int num_folios = num_extent_folios(eb); + + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + bool ret; + + folio_lock(folio); + folio_clear_dirty_for_io(folio); + folio_start_writeback(folio); + ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ASSERT(ret); + wbc_account_cgroup_owner(wbc, folio_page(folio, 0), + folio_size(folio)); + wbc->nr_to_write -= folio_nr_pages(folio); + folio_unlock(folio); } } btrfs_submit_bio(bbio, 0); @@ -1725,6 +1767,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, static int 
submit_eb_subpage(struct page *page, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct folio *folio = page_folio(page); int submitted = 0; u64 page_start = page_offset(page); int bit_start = 0; @@ -1732,7 +1775,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) /* Lock and write each dirty extent buffers in the range */ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); struct extent_buffer *eb; unsigned long flags; u64 start; @@ -1741,16 +1784,16 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * Take private lock to ensure the subpage won't be detached * in the meantime. */ - spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); + if (!folio_test_private(folio)) { + spin_unlock(&page->mapping->i_private_lock); break; } spin_lock_irqsave(&subpage->lock, flags); if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); bit_start++; continue; } @@ -1764,7 +1807,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) */ eb = find_extent_buffer_nolock(fs_info, start); spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); /* * The eb has already reached 0 refs thus find_extent_buffer() @@ -1807,38 +1850,39 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) { struct writeback_control *wbc = ctx->wbc; struct address_space *mapping = page->mapping; + struct folio *folio = page_folio(page); struct extent_buffer *eb; int ret; - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc); - spin_lock(&mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); + if (!folio_test_private(folio)) { + spin_unlock(&mapping->i_private_lock); return 0; } - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); /* * Shouldn't happen and normally this would be a BUG_ON but no point * crashing the machine for something we can survive anyway. 
*/ if (WARN_ON(!eb)) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } if (eb == ctx->eb) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (!ret) return 0; @@ -2201,7 +2245,7 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page, cur, cur_len, !ret); mapping_set_error(page->mapping, ret); } - btrfs_page_unlock_writer(fs_info, page, cur, cur_len); + btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len); if (ret < 0) found_error = true; next_page: @@ -2352,7 +2396,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) write_unlock(&map->lock); break; } - if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { write_unlock(&map->lock); free_extent_map(em); @@ -2369,7 +2413,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) * extra reference on the em. */ if (list_empty(&em->list) || - test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + (em->flags & EXTENT_FLAG_LOGGING)) goto remove_em; /* * If it's in the list of modified extents, remove it @@ -3058,14 +3102,14 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) +static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_subpage *subpage; - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&folio->mapping->i_private_lock); - if (PagePrivate(page)) { - subpage = (struct btrfs_subpage *)page->private; + if (folio_test_private(folio)) { + subpage = folio_get_private(folio); if (atomic_read(&subpage->eb_refs)) return true; /* @@ -3078,21 +3122,21 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) return false; } -static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page) +static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio) { struct btrfs_fs_info *fs_info = eb->fs_info; const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); /* - * For mapped eb, we're going to change the page private, which should - * be done under the private_lock. + * For mapped eb, we're going to change the folio private, which should + * be done under the i_private_lock. */ if (mapped) - spin_lock(&page->mapping->private_lock); + spin_lock(&folio->mapping->i_private_lock); - if (!PagePrivate(page)) { + if (!folio_test_private(folio)) { if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return; } @@ -3101,66 +3145,58 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race * and have this page now attached to the new eb. So - * only clear page_private if it's still connected to + * only clear folio if it's still connected to * this eb. */ - if (PagePrivate(page) && - page->private == (unsigned long)eb) { + if (folio_test_private(folio) && folio_get_private(folio) == eb) { BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(PageDirty(page)); - BUG_ON(PageWriteback(page)); - /* - * We need to make sure we haven't be attached - * to a new eb. 
- */ - detach_page_private(page); + BUG_ON(folio_test_dirty(folio)); + BUG_ON(folio_test_writeback(folio)); + /* We need to make sure we haven't been attached to a new eb. */ + folio_detach_private(folio); } if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return; } /* - * For subpage, we can have dummy eb with page private. In this case, - * we can directly detach the private as such page is only attached to - * one dummy eb, no sharing. + * For subpage, we can have dummy eb with folio private attached. In + * this case, we can directly detach the private as such folio is only + * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, page); + btrfs_detach_subpage(fs_info, folio); return; } - btrfs_page_dec_eb_refs(fs_info, page); + btrfs_folio_dec_eb_refs(fs_info, folio); /* - * We can only detach the page private if there are no other ebs in the + * We can only detach the folio private if there are no other ebs in the * page range and no unfinished IO. */ - if (!page_range_has_eb(fs_info, page)) - btrfs_detach_subpage(fs_info, page); + if (!folio_range_has_eb(fs_info, folio)) + btrfs_detach_subpage(fs_info, folio); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); } /* Release all pages attached to the extent buffer */ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) { - int i; - int num_pages; - ASSERT(!extent_buffer_under_io(eb)); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *page = eb->pages[i]; + for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) { + struct folio *folio = eb->folios[i]; - if (!page) + if (!folio) continue; - detach_extent_buffer_page(eb, page); + detach_extent_buffer_folio(eb, folio); - /* One for when we allocated the page */ - put_page(page); + /* One for when we allocated the folio.
*/ + folio_put(folio); } } @@ -3198,9 +3234,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { - int i; struct extent_buffer *new; - int num_pages = num_extent_pages(src); + int num_folios = num_extent_folios(src); int ret; new = __alloc_extent_buffer(src->fs_info, src->start, src->len); @@ -3214,22 +3249,22 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - ret = btrfs_alloc_page_array(num_pages, new->pages); + ret = alloc_eb_folio_array(new, 0); if (ret) { btrfs_release_extent_buffer(new); return NULL; } - for (i = 0; i < num_pages; i++) { + for (int i = 0; i < num_folios; i++) { + struct folio *folio = new->folios[i]; int ret; - struct page *p = new->pages[i]; - ret = attach_extent_buffer_page(new, p, NULL); + ret = attach_extent_buffer_folio(new, folio, NULL); if (ret < 0) { btrfs_release_extent_buffer(new); return NULL; } - WARN_ON(PageDirty(p)); + WARN_ON(folio_test_dirty(folio)); } copy_extent_buffer_full(new, src); set_extent_buffer_uptodate(new); @@ -3241,23 +3276,20 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len) { struct extent_buffer *eb; - int num_pages; - int i; + int num_folios = 0; int ret; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; - num_pages = num_extent_pages(eb); - ret = btrfs_alloc_page_array(num_pages, eb->pages); + ret = alloc_eb_folio_array(eb, 0); if (ret) goto err; - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - ret = attach_extent_buffer_page(eb, p, NULL); + num_folios = num_extent_folios(eb); + for (int i = 0; i < num_folios; i++) { + ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) goto err; } @@ -3268,10 +3300,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return eb; err: - for (i = 0; i < num_pages; i++) { - if (eb->pages[i]) { - detach_extent_buffer_page(eb, eb->pages[i]); - __free_page(eb->pages[i]); + for (int i = 0; i < num_folios; i++) { + if (eb->folios[i]) { + detach_extent_buffer_folio(eb, eb->folios[i]); + __folio_put(eb->folios[i]); } } __free_extent_buffer(eb); @@ -3320,20 +3352,14 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } -static void mark_extent_buffer_accessed(struct extent_buffer *eb, - struct page *accessed) +static void mark_extent_buffer_accessed(struct extent_buffer *eb) { - int num_pages, i; + int num_folios= num_extent_folios(eb); check_buffer_tree_ref(eb); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - if (p != accessed) - mark_page_accessed(p); - } + for (int i = 0; i < num_folios; i++) + folio_mark_accessed(eb->folios[i]); } struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, @@ -3361,7 +3387,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, spin_lock(&eb->refs_lock); spin_unlock(&eb->refs_lock); } - mark_extent_buffer_accessed(eb, NULL); + mark_extent_buffer_accessed(eb); return eb; } @@ -3410,6 +3436,7 @@ free_eb: static struct extent_buffer *grab_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page) { + struct folio *folio = page_folio(page); struct extent_buffer *exists; /* @@ -3421,21 +3448,21 @@ static struct extent_buffer *grab_extent_buffer( return NULL; /* Page not yet attached to an extent 
buffer */ - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return NULL; /* * We could have already allocated an eb for this page and attached one * so lets see if we can get a ref on the existing eb, and if we can we * know it's good and we can just return that one, else we know we can - * just overwrite page->private. + * just overwrite folio private. */ - exists = (struct extent_buffer *)page->private; + exists = folio_get_private(folio); if (atomic_inc_not_zero(&exists->refs)) return exists; WARN_ON(PageDirty(page)); - detach_page_private(page); + folio_detach_private(folio); return NULL; } @@ -3469,19 +3496,88 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) return 0; } + +/* + * Return 0 if eb->folios[i] is attached to btree inode successfully. + * Return >0 if there is already another extent buffer for the range, + * and @found_eb_ret would be updated. + * Return -EAGAIN if the filemap has an existing folio but with different size + * than @eb. + * The caller needs to free the existing folios and retry using the same order. + */ +static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, + struct extent_buffer **found_eb_ret) +{ + + struct btrfs_fs_info *fs_info = eb->fs_info; + struct address_space *mapping = fs_info->btree_inode->i_mapping; + const unsigned long index = eb->start >> PAGE_SHIFT; + struct folio *existing_folio; + int ret; + + ASSERT(found_eb_ret); + + /* Caller should ensure the folio exists. */ + ASSERT(eb->folios[i]); + +retry: + ret = filemap_add_folio(mapping, eb->folios[i], index + i, + GFP_NOFS | __GFP_NOFAIL); + if (!ret) + return 0; + + existing_folio = filemap_lock_folio(mapping, index + i); + /* The page cache only exists for a very short time, just retry. */ + if (IS_ERR(existing_folio)) + goto retry; + + /* For now, we should only have single-page folios for btree inode. */ + ASSERT(folio_nr_pages(existing_folio) == 1); + + if (folio_size(existing_folio) != folio_size(eb->folios[0])) { + folio_unlock(existing_folio); + folio_put(existing_folio); + return -EAGAIN; + } + + if (fs_info->nodesize < PAGE_SIZE) { + /* + * We're going to reuse the existing page, can drop our page + * and subpage structure now. + */ + __free_page(folio_page(eb->folios[i], 0)); + eb->folios[i] = existing_folio; + } else { + struct extent_buffer *existing_eb; + + existing_eb = grab_extent_buffer(fs_info, + folio_page(existing_folio, 0)); + if (existing_eb) { + /* The extent buffer still exists, we can use it directly. */ + *found_eb_ret = existing_eb; + folio_unlock(existing_folio); + folio_put(existing_folio); + return 1; + } + /* The extent buffer no longer exists, we can reuse the folio. 
*/ + __free_page(folio_page(eb->folios[i], 0)); + eb->folios[i] = existing_folio; + } + return 0; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { unsigned long len = fs_info->nodesize; - int num_pages; - int i; - unsigned long index = start >> PAGE_SHIFT; + int num_folios; + int attached = 0; struct extent_buffer *eb; - struct extent_buffer *exists = NULL; - struct page *p; + struct extent_buffer *existing_eb = NULL; struct address_space *mapping = fs_info->btree_inode->i_mapping; struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; + bool page_contig = true; int uptodate = 1; int ret; @@ -3516,11 +3612,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); - num_pages = num_extent_pages(eb); - /* - * Preallocate page->private for subpage case, so that we won't - * allocate memory with private_lock nor page lock hold. + * Preallocate folio private for subpage case, so that we won't + * allocate memory with i_private_lock nor page lock hold. * * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. @@ -3528,47 +3622,89 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (fs_info->nodesize < PAGE_SIZE) { prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { - exists = ERR_CAST(prealloc); - goto free_eb; + ret = PTR_ERR(prealloc); + goto out; } } - for (i = 0; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); - if (!p) { - exists = ERR_PTR(-ENOMEM); - btrfs_free_subpage(prealloc); - goto free_eb; +reallocate: + /* Allocate all pages first. */ + ret = alloc_eb_folio_array(eb, __GFP_NOFAIL); + if (ret < 0) { + btrfs_free_subpage(prealloc); + goto out; + } + + num_folios = num_extent_folios(eb); + /* Attach all pages to the filemap. */ + for (int i = 0; i < num_folios; i++) { + struct folio *folio; + + ret = attach_eb_folio_to_filemap(eb, i, &existing_eb); + if (ret > 0) { + ASSERT(existing_eb); + goto out; } - spin_lock(&mapping->private_lock); - exists = grab_extent_buffer(fs_info, p); - if (exists) { - spin_unlock(&mapping->private_lock); - unlock_page(p); - put_page(p); - mark_extent_buffer_accessed(exists, p); - btrfs_free_subpage(prealloc); - goto free_eb; + /* + * TODO: Special handling for a corner case where the order of + * folios mismatch between the new eb and filemap. + * + * This happens when: + * + * - the new eb is using higher order folio + * + * - the filemap is still using 0-order folios for the range + * This can happen at the previous eb allocation, and we don't + * have higher order folio for the call. + * + * - the existing eb has already been freed + * + * In this case, we have to free the existing folios first, and + * re-allocate using the same order. + * Thankfully this is not going to happen yet, as we're still + * using 0-order folios. + */ + if (unlikely(ret == -EAGAIN)) { + ASSERT(0); + goto reallocate; } + attached++; + + /* + * Only after attach_eb_folio_to_filemap(), eb->folios[] is + * reliable, as we may choose to reuse the existing page cache + * and free the allocated page. 
+ */ + folio = eb->folios[i]; + spin_lock(&mapping->i_private_lock); /* Should not fail, as we have preallocated the memory */ - ret = attach_extent_buffer_page(eb, p, prealloc); + ret = attach_extent_buffer_folio(eb, folio, prealloc); ASSERT(!ret); /* * To inform we have extra eb under allocation, so that - * detach_extent_buffer_page() won't release the page private + * detach_extent_buffer_page() won't release the folio private * when the eb hasn't yet been inserted into radix tree. * * The ref will be decreased when the eb released the page, in * detach_extent_buffer_page(). * Thus needs no special handling in error path. */ - btrfs_page_inc_eb_refs(fs_info, p); - spin_unlock(&mapping->private_lock); + btrfs_folio_inc_eb_refs(fs_info, folio); + spin_unlock(&mapping->i_private_lock); + + WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); + + /* + * Check if the current page is physically contiguous with previous eb + * page. + * At this stage, either we allocated a large folio, thus @i + * would only be 0, or we fall back to per-page allocation. + */ + if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) + page_contig = false; - WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); - eb->pages[i] = p; - if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len)) + if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len)) uptodate = 0; /* @@ -3581,12 +3717,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + /* All pages are physically contiguous, can skip cross page handling. */ + if (page_contig) + eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); again: ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; - } + if (ret) + goto out; spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, @@ -3594,9 +3731,10 @@ again: spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; + ret = 0; + existing_eb = find_extent_buffer(fs_info, start); + if (existing_eb) + goto out; else goto again; } @@ -3609,19 +3747,46 @@ again: * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ - for (i = 0; i < num_pages; i++) - unlock_page(eb->pages[i]); + for (int i = 0; i < num_folios; i++) + unlock_page(folio_page(eb->folios[i], 0)); return eb; -free_eb: +out: WARN_ON(!atomic_dec_and_test(&eb->refs)); - for (i = 0; i < num_pages; i++) { - if (eb->pages[i]) - unlock_page(eb->pages[i]); + + /* + * Any attached folios need to be detached before we unlock them. This + * is because when we're inserting our new folios into the mapping, and + * then attaching our eb to that folio. If we fail to insert our folio + * we'll lookup the folio for that index, and grab that EB. We do not + * want that to grab this eb, as we're getting ready to free it. So we + * have to detach it first and then unlock it. + * + * We have to drop our reference and NULL it out here because in the + * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb. + * Below when we call btrfs_release_extent_buffer() we will call + * detach_extent_buffer_folio() on our remaining pages in the !subpage + * case. If we left eb->folios[i] populated in the subpage case we'd + * double put our reference and be super sad. 
+ */ + for (int i = 0; i < attached; i++) { + ASSERT(eb->folios[i]); + detach_extent_buffer_folio(eb, eb->folios[i]); + unlock_page(folio_page(eb->folios[i], 0)); + folio_put(eb->folios[i]); + eb->folios[i] = NULL; } + /* + * Now all pages of that extent buffer are unmapped, set UNMAPPED flag, + * so it can be cleaned up without utilizing page->mapping. + */ + set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); btrfs_release_extent_buffer(eb); - return exists; + if (ret < 0) + return ERR_PTR(ret); + ASSERT(existing_eb); + return existing_eb; } static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) @@ -3713,31 +3878,30 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } -static void btree_clear_page_dirty(struct page *page) +static void btree_clear_folio_dirty(struct folio *folio) { - ASSERT(PageDirty(page)); - ASSERT(PageLocked(page)); - clear_page_dirty_for_io(page); - xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) - __xa_clear_mark(&page->mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); - xa_unlock_irq(&page->mapping->i_pages); + ASSERT(folio_test_dirty(folio)); + ASSERT(folio_test_locked(folio)); + folio_clear_dirty_for_io(folio); + xa_lock_irq(&folio->mapping->i_pages); + if (!folio_test_dirty(folio)) + __xa_clear_mark(&folio->mapping->i_pages, + folio_index(folio), PAGECACHE_TAG_DIRTY); + xa_unlock_irq(&folio->mapping->i_pages); } static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page = eb->pages[0]; + struct folio *folio = eb->folios[0]; bool last; - /* btree_clear_page_dirty() needs page locked */ - lock_page(page); - last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start, - eb->len); + /* btree_clear_folio_dirty() needs page locked. */ + folio_lock(folio); + last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len); if (last) - btree_clear_page_dirty(page); - unlock_page(page); + btree_clear_folio_dirty(folio); + folio_unlock(folio); WARN_ON(atomic_read(&eb->refs) == 0); } @@ -3745,15 +3909,27 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i; - int num_pages; - struct page *page; + int num_folios; btrfs_assert_tree_write_locked(eb); if (trans && btrfs_header_generation(eb) != trans->transid) return; + /* + * Instead of clearing the dirty flag off of the buffer, mark it as + * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve + * write-ordering in zoned mode, without the need to later re-dirty + * the extent_buffer. + * + * The actual zeroout of the buffer will happen later in + * btree_csum_one_bio.
+ */ + if (btrfs_is_zoned(fs_info)) { + set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); + return; + } + if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) return; @@ -3763,30 +3939,29 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); - num_pages = num_extent_pages(eb); + num_folios = num_extent_folios(eb); + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - if (!PageDirty(page)) + if (!folio_test_dirty(folio)) continue; - lock_page(page); - btree_clear_page_dirty(page); - unlock_page(page); + folio_lock(folio); + btree_clear_folio_dirty(folio); + folio_unlock(folio); } WARN_ON(atomic_read(&eb->refs) == 0); } void set_extent_buffer_dirty(struct extent_buffer *eb) { - int i; - int num_pages; + int num_folios; bool was_dirty; check_buffer_tree_ref(eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_pages = num_extent_pages(eb); + num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); @@ -3805,34 +3980,32 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) * the above race. */ if (subpage) - lock_page(eb->pages[0]); - for (i = 0; i < num_pages; i++) - btrfs_page_set_dirty(eb->fs_info, eb->pages[i], - eb->start, eb->len); + lock_page(folio_page(eb->folios[0], 0)); + for (int i = 0; i < num_folios; i++) + btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], + eb->start, eb->len); if (subpage) - unlock_page(eb->pages[0]); + unlock_page(folio_page(eb->folios[0], 0)); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, eb->len, eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG - for (i = 0; i < num_pages; i++) - ASSERT(PageDirty(eb->pages[i])); + for (int i = 0; i < num_folios; i++) + ASSERT(folio_test_dirty(eb->folios[i])); #endif } void clear_extent_buffer_uptodate(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page; - int num_pages; - int i; + int num_folios = num_extent_folios(eb); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - if (!page) + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + + if (!folio) continue; /* @@ -3840,44 +4013,40 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) * btrfs_is_subpage() can not handle cloned/dummy metadata. */ if (fs_info->nodesize >= PAGE_SIZE) - ClearPageUptodate(page); + folio_clear_uptodate(folio); else - btrfs_subpage_clear_uptodate(fs_info, page, eb->start, - eb->len); + btrfs_subpage_clear_uptodate(fs_info, folio, + eb->start, eb->len); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page; - int num_pages; - int i; + int num_folios = num_extent_folios(eb); set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; /* * This is special handling for metadata subpage, as regular * btrfs_is_subpage() can not handle cloned/dummy metadata. 
*/ if (fs_info->nodesize >= PAGE_SIZE) - SetPageUptodate(page); + folio_mark_uptodate(folio); else - btrfs_subpage_set_uptodate(fs_info, page, eb->start, - eb->len); + btrfs_subpage_set_uptodate(fs_info, folio, + eb->start, eb->len); } } -static void extent_buffer_read_end_io(struct btrfs_bio *bbio) +static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct bvec_iter_all iter_all; - struct bio_vec *bvec; + struct folio_iter fi; u32 bio_offset = 0; eb->read_mirror = bbio->mirror_num; @@ -3893,15 +4062,15 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio) set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); } - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + bio_for_each_folio_all(fi, &bbio->bio) { + struct folio *folio = fi.folio; u64 start = eb->start + bio_offset; - struct page *page = bvec->bv_page; - u32 len = bvec->bv_len; + u32 len = fi.length; if (uptodate) - btrfs_page_set_uptodate(fs_info, page, start, len); + btrfs_folio_set_uptodate(fs_info, folio, start, len); else - btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_folio_clear_uptodate(fs_info, folio, start, len); bio_offset += len; } @@ -3917,8 +4086,8 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio) int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, struct btrfs_tree_parent_check *check) { - int num_pages = num_extent_pages(eb), i; struct btrfs_bio *bbio; + bool ret; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -3942,17 +4111,24 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_READ | REQ_META, eb->fs_info, - extent_buffer_read_end_io, eb); + end_bbio_meta_read, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); if (eb->fs_info->nodesize < PAGE_SIZE) { - __bio_add_page(&bbio->bio, eb->pages[0], eb->len, - eb->start - page_offset(eb->pages[0])); + ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len, + eb->start - folio_pos(eb->folios[0])); + ASSERT(ret); } else { - for (i = 0; i < num_pages; i++) - __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0); + int num_folios = num_extent_folios(eb); + + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + + ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ASSERT(ret); + } } btrfs_submit_bio(bbio, mirror_num); @@ -3999,29 +4175,33 @@ static inline int check_eb_range(const struct extent_buffer *eb, void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { + const int unit_size = folio_size(eb->folios[0]); size_t cur; size_t offset; - struct page *page; - char *kaddr; char *dst = (char *)dstv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); if (check_eb_range(eb, start, len)) { /* * Invalid range hit, reset the memory, so callers won't get - * some random garbage for their uninitialzed memory. + * some random garbage for their uninitialized memory. 
*/ memset(dstv, 0, len); return; } - offset = get_eb_offset_in_page(eb, start); + if (eb->addr) { + memcpy(dstv, eb->addr + start, len); + return; + } + + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = eb->pages[i]; + char *kaddr; - cur = min(len, (PAGE_SIZE - offset)); - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); memcpy(dst, kaddr + offset, cur); dst += cur; @@ -4035,24 +4215,29 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dstv, unsigned long start, unsigned long len) { + const int unit_size = folio_size(eb->folios[0]); size_t cur; size_t offset; - struct page *page; - char *kaddr; char __user *dst = (char __user *)dstv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = get_eb_offset_in_page(eb, start); + if (eb->addr) { + if (copy_to_user_nofault(dstv, eb->addr + start, len)) + ret = -EFAULT; + return ret; + } + + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = eb->pages[i]; + char *kaddr; - cur = min(len, (PAGE_SIZE - offset)); - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); if (copy_to_user_nofault(dst, kaddr + offset, cur)) { ret = -EFAULT; break; @@ -4070,25 +4255,25 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) { + const int unit_size = folio_size(eb->folios[0]); size_t cur; size_t offset; - struct page *page; char *kaddr; char *ptr = (char *)ptrv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); int ret = 0; if (check_eb_range(eb, start, len)) return -EINVAL; - offset = get_eb_offset_in_page(eb, start); - - while (len > 0) { - page = eb->pages[i]; + if (eb->addr) + return memcmp(ptrv, eb->addr + start, len); - cur = min(len, (PAGE_SIZE - offset)); + offset = get_eb_offset_in_folio(eb, start); - kaddr = page_address(page); + while (len > 0) { + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); ret = memcmp(ptr, kaddr + offset, cur); if (ret) break; @@ -4107,10 +4292,12 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, * For regular sector size == PAGE_SIZE case, check if @page is uptodate. * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. 
*/ -static void assert_eb_page_uptodate(const struct extent_buffer *eb, - struct page *page) +static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) { struct btrfs_fs_info *fs_info = eb->fs_info; + struct folio *folio = eb->folios[i]; + + ASSERT(folio); /* * If we are using the commit root we could potentially clear a page @@ -4124,11 +4311,14 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, return; if (fs_info->nodesize < PAGE_SIZE) { - if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page, + struct folio *folio = eb->folios[0]; + + ASSERT(i == 0); + if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, eb->start, eb->len))) - btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len); + btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len); } else { - WARN_ON(!PageUptodate(page)); + WARN_ON(!folio_test_uptodate(folio)); } } @@ -4136,29 +4326,34 @@ static void __write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len, bool use_memmove) { + const int unit_size = folio_size(eb->folios[0]); size_t cur; size_t offset; - struct page *page; char *kaddr; char *src = (char *)srcv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); /* For unmapped (dummy) ebs, no need to check their uptodate status. */ const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); - WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); - if (check_eb_range(eb, start, len)) return; - offset = get_eb_offset_in_page(eb, start); + if (eb->addr) { + if (use_memmove) + memmove(eb->addr + start, srcv, len); + else + memcpy(eb->addr + start, srcv, len); + return; + } + + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = eb->pages[i]; if (check_uptodate) - assert_eb_page_uptodate(eb, page); + assert_eb_folio_uptodate(eb, i); - cur = min(len, PAGE_SIZE - offset); - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); if (use_memmove) memmove(kaddr + offset, src, cur); else @@ -4180,16 +4375,21 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, static void memset_extent_buffer(const struct extent_buffer *eb, int c, unsigned long start, unsigned long len) { + const int unit_size = folio_size(eb->folios[0]); unsigned long cur = start; + if (eb->addr) { + memset(eb->addr + start, c, len); + return; + } + while (cur < start + len) { - unsigned long index = get_eb_page_index(cur); - unsigned int offset = get_eb_offset_in_page(eb, cur); - unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset); - struct page *page = eb->pages[index]; + unsigned long index = get_eb_folio_index(eb, cur); + unsigned int offset = get_eb_offset_in_folio(eb, cur); + unsigned int cur_len = min(start + len - cur, unit_size - offset); - assert_eb_page_uptodate(eb, page); - memset(page_address(page) + offset, c, cur_len); + assert_eb_folio_uptodate(eb, index); + memset(folio_address(eb->folios[index]) + offset, c, cur_len); cur += cur_len; } @@ -4206,15 +4406,16 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { + const int unit_size = folio_size(src->folios[0]); unsigned long cur = 0; ASSERT(dst->len == src->len); while (cur < src->len) { - unsigned long index = get_eb_page_index(cur); - unsigned long offset = get_eb_offset_in_page(src, cur); - unsigned 
long cur_len = min(src->len, PAGE_SIZE - offset); - void *addr = page_address(src->pages[index]) + offset; + unsigned long index = get_eb_folio_index(src, cur); + unsigned long offset = get_eb_offset_in_folio(src, cur); + unsigned long cur_len = min(src->len, unit_size - offset); + void *addr = folio_address(src->folios[index]) + offset; write_extent_buffer(dst, addr, cur, cur_len); @@ -4227,12 +4428,12 @@ void copy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { + const int unit_size = folio_size(dst->folios[0]); u64 dst_len = dst->len; size_t cur; size_t offset; - struct page *page; char *kaddr; - unsigned long i = get_eb_page_index(dst_offset); + unsigned long i = get_eb_folio_index(dst, dst_offset); if (check_eb_range(dst, dst_offset, len) || check_eb_range(src, src_offset, len)) @@ -4240,15 +4441,14 @@ void copy_extent_buffer(const struct extent_buffer *dst, WARN_ON(src->len != dst_len); - offset = get_eb_offset_in_page(dst, dst_offset); + offset = get_eb_offset_in_folio(dst, dst_offset); while (len > 0) { - page = dst->pages[i]; - assert_eb_page_uptodate(dst, page); + assert_eb_folio_uptodate(dst, i); - cur = min(len, (unsigned long)(PAGE_SIZE - offset)); + cur = min(len, (unsigned long)(unit_size - offset)); - kaddr = page_address(page); + kaddr = folio_address(dst->folios[i]); read_extent_buffer(src, kaddr + offset, src_offset, cur); src_offset += cur; @@ -4259,22 +4459,22 @@ void copy_extent_buffer(const struct extent_buffer *dst, } /* - * Calculate the page and offset of the byte containing the given bit number. + * Calculate the folio and offset of the byte containing the given bit number. * * @eb: the extent buffer * @start: offset of the bitmap item in the extent buffer * @nr: bit number - * @page_index: return index of the page in the extent buffer that contains + * @folio_index: return index of the folio in the extent buffer that contains * the given bit number - * @page_offset: return offset into the page given by page_index + * @folio_offset: return offset into the folio given by folio_index * * This helper hides the ugliness of finding the byte in an extent buffer which * contains a given bit. */ static inline void eb_bitmap_offset(const struct extent_buffer *eb, unsigned long start, unsigned long nr, - unsigned long *page_index, - size_t *page_offset) + unsigned long *folio_index, + size_t *folio_offset) { size_t byte_offset = BIT_BYTE(nr); size_t offset; @@ -4284,10 +4484,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. 
*/ - offset = start + offset_in_page(eb->start) + byte_offset; + offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset; - *page_index = offset >> PAGE_SHIFT; - *page_offset = offset_in_page(offset); + *folio_index = offset >> folio_shift(eb->folios[0]); + *folio_offset = offset_in_folio(eb->folios[0], offset); } /* @@ -4300,25 +4500,23 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, unsigned long nr) { - u8 *kaddr; - struct page *page; unsigned long i; size_t offset; + u8 *kaddr; eb_bitmap_offset(eb, start, nr, &i, &offset); - page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); + assert_eb_folio_uptodate(eb, i); + kaddr = folio_address(eb->folios[i]); return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) { - unsigned long index = get_eb_page_index(bytenr); + unsigned long index = get_eb_folio_index(eb, bytenr); if (check_eb_range(eb, bytenr, 1)) return NULL; - return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr); + return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr); } /* @@ -4403,19 +4601,30 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { + const int unit_size = folio_size(dst->folios[0]); unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || check_eb_range(dst, src_offset, len)) return; + if (dst->addr) { + const bool use_memmove = areas_overlap(src_offset, dst_offset, len); + + if (use_memmove) + memmove(dst->addr + dst_offset, dst->addr + src_offset, len); + else + memcpy(dst->addr + dst_offset, dst->addr + src_offset, len); + return; + } + while (cur_off < len) { unsigned long cur_src = cur_off + src_offset; - unsigned long pg_index = get_eb_page_index(cur_src); - unsigned long pg_off = get_eb_offset_in_page(dst, cur_src); + unsigned long folio_index = get_eb_folio_index(dst, cur_src); + unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src); unsigned long cur_len = min(src_offset + len - cur_src, - PAGE_SIZE - pg_off); - void *src_addr = page_address(dst->pages[pg_index]) + pg_off; + unit_size - folio_off); + void *src_addr = folio_address(dst->folios[folio_index]) + folio_off; const bool use_memmove = areas_overlap(src_offset + cur_off, dst_offset + cur_off, cur_len); @@ -4441,24 +4650,29 @@ void memmove_extent_buffer(const struct extent_buffer *dst, return; } + if (dst->addr) { + memmove(dst->addr + dst_offset, dst->addr + src_offset, len); + return; + } + while (len > 0) { unsigned long src_i; size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; + size_t dst_off_in_folio; + size_t src_off_in_folio; void *src_addr; bool use_memmove; - src_i = get_eb_page_index(src_end); + src_i = get_eb_folio_index(dst, src_end); - dst_off_in_page = get_eb_offset_in_page(dst, dst_end); - src_off_in_page = get_eb_offset_in_page(dst, src_end); + dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end); + src_off_in_folio = get_eb_offset_in_folio(dst, src_end); - cur = min_t(unsigned long, len, src_off_in_page + 1); - cur = min(cur, dst_off_in_page + 1); + cur = min_t(unsigned long, len, src_off_in_folio + 1); + cur = min(cur, dst_off_in_folio + 1); - src_addr = page_address(dst->pages[src_i]) + src_off_in_page - - cur + 1; + src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio 
- + cur + 1; use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, cur); @@ -4520,7 +4734,7 @@ static int try_release_subpage_extent_buffer(struct page *page) struct extent_buffer *eb = NULL; /* - * Unlike try_release_extent_buffer() which uses page->private + * Unlike try_release_extent_buffer() which uses folio private * to grab buffer, for subpage case we rely on radix tree, thus * we need to ensure radix tree consistency. * @@ -4560,43 +4774,44 @@ static int try_release_subpage_extent_buffer(struct page *page) /* * Here we don't care about the return value, we will always - * check the page private at the end. And + * check the folio private at the end. And * release_extent_buffer() will release the refs_lock. */ release_extent_buffer(eb); } /* - * Finally to check if we have cleared page private, as if we have - * released all ebs in the page, the page private should be cleared now. + * Finally to check if we have cleared folio private, as if we have + * released all ebs in the page, the folio private should be cleared now. */ - spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) + spin_lock(&page->mapping->i_private_lock); + if (!folio_test_private(page_folio(page))) ret = 1; else ret = 0; - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return ret; } int try_release_extent_buffer(struct page *page) { + struct folio *folio = page_folio(page); struct extent_buffer *eb; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return try_release_subpage_extent_buffer(page); /* - * We need to make sure nobody is changing page->private, as we rely on - * page->private as the pointer to extent buffer. + * We need to make sure nobody is changing folio private, as we rely on + * folio private as the pointer to extent buffer. */ - spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); + if (!folio_test_private(folio)) { + spin_unlock(&page->mapping->i_private_lock); return 1; } - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); BUG_ON(!eb); /* @@ -4607,10 +4822,10 @@ int try_release_extent_buffer(struct page *page) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return 0; } - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); /* * If tree ref isn't set then we know the ref on this eb is a real ref, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2171057a4477..46050500529b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -28,7 +28,8 @@ enum { EXTENT_BUFFER_IN_TREE, /* write IO error */ EXTENT_BUFFER_WRITE_ERR, - EXTENT_BUFFER_NO_CHECK, + /* Indicate the extent buffer is written zeroed out (for zoned) */ + EXTENT_BUFFER_ZONED_ZEROOUT, /* Indicate that extent buffer pages a being read */ EXTENT_BUFFER_READING, }; @@ -43,10 +44,10 @@ enum { }; /* - * page->private values. Every page that is controlled by the extent - * map has page->private set to one. + * Folio private values. Every page that is controlled by the extent map has + * folio private set to this value. 
*/ -#define EXTENT_PAGE_PRIVATE 1 +#define EXTENT_FOLIO_PRIVATE 1 /* * The extent buffer bitmap operations are done with byte granularity instead of @@ -77,6 +78,13 @@ struct extent_buffer { unsigned long len; unsigned long bflags; struct btrfs_fs_info *fs_info; + + /* + * The address where the eb can be accessed without any cross-page handling. + * This can be NULL if not possible. + */ + void *addr; + spinlock_t refs_lock; atomic_t refs; int read_mirror; @@ -86,7 +94,12 @@ struct extent_buffer { struct rw_semaphore lock; - struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; + /* + * Pointers to all the folios of the extent buffer. + * + * For now the folio is always order 0 (aka, a single page). + */ + struct folio *folios[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; pid_t lock_owner; @@ -108,29 +121,43 @@ struct btrfs_eb_write_context { * * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. */ -static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, - unsigned long offset) +static inline size_t get_eb_offset_in_folio(const struct extent_buffer *eb, + unsigned long offset) { /* - * For sectorsize == PAGE_SIZE case, eb->start will always be aligned - * to PAGE_SIZE, thus adding it won't cause any difference. + * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case + * 1.1) One large folio covering the whole eb + * The eb->start is aligned to folio size, thus adding it + * won't cause any difference. + * 1.2) Several page sized folios + * The eb->start is aligned to folio (page) size, thus + * adding it won't cause any difference. * - * For sectorsize < PAGE_SIZE, we must only read the data that belongs - * to the eb, thus we have to take the eb->start into consideration. + * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case + * In this case there would only be one page sized folio, and there + * may be several different extent buffers in the page/folio. + * We need to add eb->start to properly access the offset inside + * that eb. */ - return offset_in_page(offset + eb->start); + return offset_in_folio(eb->folios[0], offset + eb->start); } -static inline unsigned long get_eb_page_index(unsigned long offset) +static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb, + unsigned long offset) { /* - * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. + * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case + * 1.1) One large folio covering the whole eb. + * the folio_shift would be large enough to always make us + * return 0 as index. + * 1.2) Several page sized folios + * The folio_shift() would be PAGE_SHIFT, giving us the correct + * index. * - * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, - * and have ensured that all tree blocks are contained in one page, - * thus we always get index == 0. + * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case + * The folio would only be page sized, and always give us 0 as index. */ - return offset >> PAGE_SHIFT; + return offset >> folio_shift(eb->folios[0]); } /* @@ -230,6 +257,20 @@ static inline int num_extent_pages(const struct extent_buffer *eb) return (eb->len >> PAGE_SHIFT) ?: 1; } +/* + * This can only be determined at runtime by checking eb::folios[0]. + * + * As we can have either one large folio covering the whole eb + * (either nodesize <= PAGE_SIZE, or high order folio), or multiple + * single-paged folios. 
+ */ +static inline int num_extent_folios(const struct extent_buffer *eb) +{ + if (folio_order(eb->folios[0])) + return 1; + return num_extent_pages(eb); +} + static inline int extent_buffer_uptodate(const struct extent_buffer *eb) { return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); @@ -294,7 +335,8 @@ int extent_invalidate_folio(struct extent_io_tree *tree, void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf); -int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, + gfp_t extra_gfp); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a6d8368ed0ed..b61099bf97a8 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -50,7 +50,6 @@ struct extent_map *alloc_extent_map(void) if (!em) return NULL; RB_CLEAR_NODE(&em->rb_node); - em->compress_type = BTRFS_COMPRESS_NONE; refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; @@ -67,8 +66,6 @@ void free_extent_map(struct extent_map *em) if (refcount_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); - if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - kfree(em->map_lookup); kmem_cache_free(extent_map_cache, em); } } @@ -182,50 +179,50 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } -/* Check to see if two extent_map structs are adjacent and safe to merge. */ -static int mergable_maps(struct extent_map *prev, struct extent_map *next) +static inline u64 extent_map_block_end(const struct extent_map *em) { - if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) - return 0; + if (em->block_start + em->block_len < em->block_start) + return (u64)-1; + return em->block_start + em->block_len; +} - /* - * don't merge compressed extents, we need to know their - * actual size - */ - if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) - return 0; +static bool can_merge_extent_map(const struct extent_map *em) +{ + if (em->flags & EXTENT_FLAG_PINNED) + return false; + + /* Don't merge compressed extents, we need to know their actual size. */ + if (extent_map_is_compressed(em)) + return false; - if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || - test_bit(EXTENT_FLAG_LOGGING, &next->flags)) - return 0; + if (em->flags & EXTENT_FLAG_LOGGING) + return false; /* * We don't want to merge stuff that hasn't been written to the log yet * since it may not reflect exactly what is on disk, and that would be * bad. */ - if (!list_empty(&prev->list) || !list_empty(&next->list)) - return 0; - - ASSERT(next->block_start != EXTENT_MAP_DELALLOC && - prev->block_start != EXTENT_MAP_DELALLOC); - - if (prev->map_lookup || next->map_lookup) - ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) && - test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags)); - - if (extent_map_end(prev) == next->start && - prev->flags == next->flags && - prev->map_lookup == next->map_lookup && - ((next->block_start == EXTENT_MAP_HOLE && - prev->block_start == EXTENT_MAP_HOLE) || - (next->block_start == EXTENT_MAP_INLINE && - prev->block_start == EXTENT_MAP_INLINE) || - (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && - next->block_start == extent_map_block_end(prev)))) { - return 1; - } - return 0; + if (!list_empty(&em->list)) + return false; + + return true; +} + +/* Check to see if two extent_map structs are adjacent and safe to merge. 
*/ +static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next) +{ + if (extent_map_end(prev) != next->start) + return false; + + if (prev->flags != next->flags) + return false; + + if (next->block_start < EXTENT_MAP_LAST_BYTE - 1) + return next->block_start == extent_map_block_end(prev); + + /* HOLES and INLINE extents. */ + return next->block_start == prev->block_start; } static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) @@ -244,11 +241,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) if (refcount_read(&em->refs) > 2) return; + if (!can_merge_extent_map(em)) + return; + if (em->start != 0) { rb = rb_prev(&em->rb_node); if (rb) merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(merge, em)) { + if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { em->start = merge->start; em->orig_start = merge->orig_start; em->len += merge->len; @@ -257,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); - set_bit(EXTENT_FLAG_MERGED, &em->flags); + em->flags |= EXTENT_FLAG_MERGED; rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); @@ -268,14 +268,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) rb = rb_next(&em->rb_node); if (rb) merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(em, merge)) { + if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { em->len += merge->len; em->block_len += merge->block_len; rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); - set_bit(EXTENT_FLAG_MERGED, &em->flags); + em->flags |= EXTENT_FLAG_MERGED; free_extent_map(merge); } } @@ -283,7 +283,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) /* * Unpin an extent from the cache. * - * @tree: tree to unpin the extent in + * @inode: the inode from which we are unpinning an extent range * @start: logical offset in the file * @len: length of the extent * @gen: generation that this extent has been modified in @@ -292,9 +292,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) * to the generation that actually added the file item to the inode so we know * we need to sync this extent when we call fsync(). 
*/ -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, - u64 gen) +int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *tree = &inode->extent_tree; int ret = 0; struct extent_map *em; bool prealloc = false; @@ -302,19 +303,28 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); - WARN_ON(!em || em->start != start); - - if (!em) + if (WARN_ON(!em)) { + btrfs_warn(fs_info, +"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", + btrfs_ino(inode), btrfs_root_id(inode->root), + start, len, gen); goto out; + } + + if (WARN_ON(em->start != start)) + btrfs_warn(fs_info, +"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", + btrfs_ino(inode), btrfs_root_id(inode->root), + em->start, start, len, gen); em->generation = gen; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags &= ~EXTENT_FLAG_PINNED; em->mod_start = em->start; em->mod_len = em->len; - if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { + if (em->flags & EXTENT_FLAG_FILLING) { prealloc = true; - clear_bit(EXTENT_FLAG_FILLING, &em->flags); + em->flags &= ~EXTENT_FLAG_FILLING; } try_merge_map(tree, em); @@ -335,7 +345,7 @@ void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { lockdep_assert_held_write(&tree->lock); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + em->flags &= ~EXTENT_FLAG_LOGGING; if (extent_map_in_tree(em)) try_merge_map(tree, em); } @@ -348,45 +358,14 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree, em->mod_start = em->start; em->mod_len = em->len; + ASSERT(list_empty(&em->list)); + if (modified) - list_move(&em->list, &tree->modified_extents); + list_add(&em->list, &tree->modified_extents); else try_merge_map(tree, em); } -static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) -{ - struct map_lookup *map = em->map_lookup; - u64 stripe_size = em->orig_block_len; - int i; - - for (i = 0; i < map->num_stripes; i++) { - struct btrfs_io_stripe *stripe = &map->stripes[i]; - struct btrfs_device *device = stripe->dev; - - set_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, - bits | EXTENT_NOWAIT, NULL); - } -} - -static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) -{ - struct map_lookup *map = em->map_lookup; - u64 stripe_size = em->orig_block_len; - int i; - - for (i = 0; i < map->num_stripes; i++) { - struct btrfs_io_stripe *stripe = &map->stripes[i]; - struct btrfs_device *device = stripe->dev; - - __clear_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, - bits | EXTENT_NOWAIT, - NULL, NULL); - } -} - /* * Add new extent map to the extent tree * @@ -400,8 +379,8 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) * into the tree directly, with an additional reference taken, or a * reference dropped if the merge attempt was successful. 
*/ -int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em, int modified) +static int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, int modified) { int ret = 0; @@ -412,10 +391,6 @@ int add_extent_mapping(struct extent_map_tree *tree, goto out; setup_extent_mapping(tree, em, modified); - if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) { - extent_map_device_set_bits(em, CHUNK_ALLOCATED); - extent_map_device_clear_bits(em, CHUNK_TRIMMED); - } out: return ret; } @@ -495,12 +470,10 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { lockdep_assert_held_write(&tree->lock); - WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + WARN_ON(em->flags & EXTENT_FLAG_PINNED); rb_erase_cached(&em->rb_node, &tree->map); - if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); - if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - extent_map_device_clear_bits(em, CHUNK_ALLOCATED); RB_CLEAR_NODE(&em->rb_node); } @@ -511,9 +484,9 @@ static void replace_extent_mapping(struct extent_map_tree *tree, { lockdep_assert_held_write(&tree->lock); - WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); + WARN_ON(cur->flags & EXTENT_FLAG_PINNED); ASSERT(extent_map_in_tree(cur)); - if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) + if (!(cur->flags & EXTENT_FLAG_LOGGING)) list_del_init(&cur->list); rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map); RB_CLEAR_NODE(&cur->rb_node); @@ -576,7 +549,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, em->start = start; em->len = end - start; if (em->block_start < EXTENT_MAP_LAST_BYTE && - !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + !extent_map_is_compressed(em)) { em->block_start += start_diff; em->block_len = em->len; } @@ -626,8 +599,6 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, if (ret == -EEXIST) { struct extent_map *existing; - ret = 0; - existing = search_extent_mapping(em_tree, start, len); trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); @@ -681,8 +652,7 @@ static void drop_all_extent_maps_fast(struct extent_map_tree *tree) node = rb_first_cached(&tree->map); em = rb_entry(node, struct extent_map, rb_node); - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); remove_extent_mapping(tree, em); free_extent_map(em); cond_resched_rwlock_write(&tree->lock); @@ -758,19 +728,18 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, } } - if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) { start = em_end; goto next; } flags = em->flags; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); /* * In case we split the extent map, we want to preserve the * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want * it on the new extent maps. 
*/ - clear_bit(EXTENT_FLAG_LOGGING, &flags); + em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); modified = !list_empty(&em->list); /* @@ -781,7 +750,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, goto remove_em; gen = em->generation; - compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + compressed = extent_map_is_compressed(em); if (em->start < start) { if (!split) { @@ -814,7 +783,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->generation = gen; split->flags = flags; - split->compress_type = em->compress_type; replace_extent_mapping(em_tree, em, split, modified); free_extent_map(split); split = split2; @@ -831,7 +799,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->len = em_end - end; split->block_start = em->block_start; split->flags = flags; - split->compress_type = em->compress_type; split->generation = gen; if (em->block_start < EXTENT_MAP_LAST_BYTE) { @@ -997,14 +964,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, } ASSERT(em->len == len); - ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(!extent_map_is_compressed(em)); ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); - ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); + ASSERT(em->flags & EXTENT_FLAG_PINNED); + ASSERT(!(em->flags & EXTENT_FLAG_LOGGING)); ASSERT(!list_empty(&em->list)); flags = em->flags; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags &= ~EXTENT_FLAG_PINNED; /* First, replace the em with a new extent_map starting from * em->start */ split_pre->start = em->start; @@ -1015,7 +982,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, split_pre->orig_block_len = split_pre->block_len; split_pre->ram_bytes = split_pre->len; split_pre->flags = flags; - split_pre->compress_type = em->compress_type; split_pre->generation = em->generation; replace_extent_mapping(em_tree, em, split_pre, 1); @@ -1034,7 +1000,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, split_mid->orig_block_len = split_mid->block_len; split_mid->ram_bytes = split_mid->len; split_mid->flags = flags; - split_mid->compress_type = em->compress_type; split_mid->generation = em->generation; add_extent_mapping(em_tree, split_mid, 1); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 35d27c756e08..e380fc08bbe4 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -5,30 +5,33 @@ #include <linux/rbtree.h> #include <linux/refcount.h> +#include "compression.h" #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) #define EXTENT_MAP_INLINE ((u64)-2) -/* used only during fiemap calls */ -#define EXTENT_MAP_DELALLOC ((u64)-1) /* bits for the extent_map::flags field */ enum { /* this entry not yet on disk, don't free it */ - EXTENT_FLAG_PINNED, - EXTENT_FLAG_COMPRESSED, + ENUM_BIT(EXTENT_FLAG_PINNED), + ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB), + ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO), + ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD), /* pre-allocated extent */ - EXTENT_FLAG_PREALLOC, + ENUM_BIT(EXTENT_FLAG_PREALLOC), /* Logging this extent */ - EXTENT_FLAG_LOGGING, + ENUM_BIT(EXTENT_FLAG_LOGGING), /* Filling in a preallocated extent */ - EXTENT_FLAG_FILLING, - /* filesystem extent mapping type */ - EXTENT_FLAG_FS_MAPPING, + ENUM_BIT(EXTENT_FLAG_FILLING), /* This em is merged from two or more physically adjacent ems */ - 
EXTENT_FLAG_MERGED, + ENUM_BIT(EXTENT_FLAG_MERGED), }; +/* + * Keep this structure as compact as possible, as we can have really large + * amounts of allocated extent maps at any time. + */ struct extent_map { struct rb_node rb_node; @@ -49,11 +52,8 @@ struct extent_map { * For non-merged extents, it's from btrfs_file_extent_item::generation. */ u64 generation; - unsigned long flags; - /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ - struct map_lookup *map_lookup; + u32 flags; refcount_t refs; - unsigned int compress_type; struct list_head list; }; @@ -65,30 +65,57 @@ struct extent_map_tree { struct btrfs_inode; +static inline void extent_map_set_compression(struct extent_map *em, + enum btrfs_compression_type type) +{ + if (type == BTRFS_COMPRESS_ZLIB) + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; + else if (type == BTRFS_COMPRESS_LZO) + em->flags |= EXTENT_FLAG_COMPRESS_LZO; + else if (type == BTRFS_COMPRESS_ZSTD) + em->flags |= EXTENT_FLAG_COMPRESS_ZSTD; +} + +static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em) +{ + if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB) + return BTRFS_COMPRESS_ZLIB; + + if (em->flags & EXTENT_FLAG_COMPRESS_LZO) + return BTRFS_COMPRESS_LZO; + + if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD) + return BTRFS_COMPRESS_ZSTD; + + return BTRFS_COMPRESS_NONE; +} + +/* + * More efficient way to determine if extent is compressed, instead of using + * 'extent_map_compression() != BTRFS_COMPRESS_NONE'. + */ +static inline bool extent_map_is_compressed(const struct extent_map *em) +{ + return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB | + EXTENT_FLAG_COMPRESS_LZO | + EXTENT_FLAG_COMPRESS_ZSTD)) != 0; +} + static inline int extent_map_in_tree(const struct extent_map *em) { return !RB_EMPTY_NODE(&em->rb_node); } -static inline u64 extent_map_end(struct extent_map *em) +static inline u64 extent_map_end(const struct extent_map *em) { if (em->start + em->len < em->start) return (u64)-1; return em->start + em->len; } -static inline u64 extent_map_block_end(struct extent_map *em) -{ - if (em->block_start + em->block_len < em->block_start) - return (u64)-1; - return em->block_start + em->block_len; -} - void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); -int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em, int modified); void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, u64 new_logical); @@ -97,7 +124,7 @@ struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void __cold extent_map_exit(void); -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 45cae356e89b..81ac1d474bf1 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -59,7 +59,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz goto out_unlock; } - ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start, + ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start, &end, 
EXTENT_DIRTY); if (!ret && start == 0) i_size = min(i_size, end + 1); @@ -94,7 +94,7 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; - return set_extent_bit(&inode->file_extent_tree, start, start + len - 1, + return set_extent_bit(inode->file_extent_tree, start, start + len - 1, EXTENT_DIRTY, NULL); } @@ -123,7 +123,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; - return clear_extent_bit(&inode->file_extent_tree, start, + return clear_extent_bit(inode->file_extent_tree, start, start + len - 1, EXTENT_DIRTY, NULL); } @@ -1294,8 +1294,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, return; } if (compress_type != BTRFS_COMPRESS_NONE) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; + extent_map_set_compression(em, compress_type); em->block_start = bytenr; em->block_len = em->orig_block_len; } else { @@ -1303,7 +1302,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, em->block_start = bytenr; em->block_len = em->len; if (type == BTRFS_FILE_EXTENT_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->flags |= EXTENT_FLAG_PREALLOC; } } else if (type == BTRFS_FILE_EXTENT_INLINE) { em->block_start = EXTENT_MAP_INLINE; @@ -1315,9 +1314,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, */ em->orig_start = EXTENT_MAP_HOLE; em->block_len = (u64)-1; - em->compress_type = compress_type; - if (compress_type != BTRFS_COMPRESS_NONE) - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + extent_map_set_compression(em, compress_type); } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 32611a4edd6b..38dfcac47609 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -111,8 +111,8 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, * accessed as prepare_pages should have marked them accessed * in prepare_pages via find_or_create_page() */ - btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start, - block_len); + btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]), + block_start, block_len); unlock_page(pages[i]); put_page(pages[i]); } @@ -168,9 +168,12 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; - btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); - btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes); - btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); + btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p), + start_pos, num_bytes); + btrfs_folio_clamp_clear_checked(fs_info, page_folio(p), + start_pos, num_bytes); + btrfs_folio_clamp_set_dirty(fs_info, page_folio(p), + start_pos, num_bytes); } /* @@ -869,9 +872,9 @@ static int prepare_uptodate_page(struct inode *inode, * released. * * The private flag check is essential for subpage as we need - * to store extra bitmap using page->private. + * to store extra bitmap using folio private. 
*/ - if (page->mapping != inode->i_mapping || !PagePrivate(page)) { + if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { unlock_page(page); return -EAGAIN; } @@ -2150,7 +2153,6 @@ out: hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; hole_em->orig_block_len = 0; - hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; ret = btrfs_replace_extent_map_range(inode, hole_em, true); @@ -2839,7 +2841,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, if (em->block_start == EXTENT_MAP_HOLE) ret = RANGE_BOUNDARY_HOLE; - else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + else if (em->flags & EXTENT_FLAG_PREALLOC) ret = RANGE_BOUNDARY_PREALLOC_EXTENT; else ret = RANGE_BOUNDARY_WRITTEN_EXTENT; @@ -2879,8 +2881,7 @@ static int btrfs_zero_range(struct inode *inode, * extents and holes, we drop all the existing extents and allocate a * new prealloc extent, so that we get a larger contiguous disk extent. */ - if (em->start <= alloc_start && - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) { const u64 em_end = em->start + em->len; if (em_end >= offset + len) { @@ -2915,7 +2916,7 @@ static int btrfs_zero_range(struct inode *inode, goto out; } - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (em->flags & EXTENT_FLAG_PREALLOC) { free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); @@ -3136,7 +3137,7 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte = ALIGN(last_byte, blocksize); if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + !(em->flags & EXTENT_FLAG_PREALLOC))) { const u64 range_len = last_byte - cur_offset; ret = add_falloc_range(&reserve_list, cur_offset, range_len); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6f93c9a2c3e3..d372c7ce0e6b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -439,8 +439,8 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) for (i = 0; i < io_ctl->num_pages; i++) { if (io_ctl->pages[i]) { - btrfs_page_clear_checked(io_ctl->fs_info, - io_ctl->pages[i], + btrfs_folio_clear_checked(io_ctl->fs_info, + page_folio(io_ctl->pages[i]), page_offset(io_ctl->pages[i]), PAGE_SIZE); unlock_page(io_ctl->pages[i]); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 318df6f9d9cb..f8bb73d6ab68 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -188,6 +188,7 @@ enum { BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27), BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28), BTRFS_MOUNT_NODISCARD = (1UL << 29), + BTRFS_MOUNT_NOSPACECACHE = (1UL << 30), }; /* @@ -398,7 +399,8 @@ struct btrfs_fs_info { struct extent_io_tree excluded_extents; /* logical->physical extent mapping */ - struct extent_map_tree mapping_tree; + struct rb_root_cached mapping_tree; + rwlock_t mapping_tree_lock; /* * Block reservation for extent, checksum, root tree and delayed dir @@ -960,20 +962,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) -#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (!btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_set_opt(fs_info->mount_opt, opt); \ -} while (0) - -#define btrfs_clear_and_info(fs_info, opt, fmt, args...) 
\ -do { \ - if (btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_clear_opt(fs_info->mount_opt, opt); \ -} while (0) - static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) { /* Do it this way so we only ever do one test_bit in the normal case. */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fb3c3f43c3fa..809b11472a80 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -114,6 +114,15 @@ struct data_reloc_warn { int mirror_num; }; +/* + * For the file_extent_tree, we want to hold the inode lock when we lookup and + * update the disk_i_size, but lockdep will complain because our io_tree we hold + * the tree lock and get the inode lock when setting delalloc. These two things + * are unrelated, so make a class for the file_extent_tree so we don't get the + * two locking patterns mixed up. + */ +static struct lock_class_key file_extent_tree_class; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; @@ -447,8 +456,8 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ - btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, - offset, bytes); + btrfs_folio_clamp_clear_ordered(inode->root->fs_info, + page_folio(page), offset, bytes); put_page(page); } @@ -1037,7 +1046,7 @@ free_pages: if (pages) { for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); - put_page(pages[i]); + btrfs_free_compr_page(pages[i]); } kfree(pages); } @@ -1052,7 +1061,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); - put_page(async_extent->pages[i]); + btrfs_free_compr_page(async_extent->pages[i]); } kfree(async_extent->pages); async_extent->nr_pages = 0; @@ -2793,7 +2802,7 @@ out_page: PAGE_SIZE, !ret); clear_page_dirty_for_io(page); } - btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); + btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); @@ -2848,7 +2857,7 @@ int btrfs_writepage_cow_fixup(struct page *page) * page->mapping outside of the page lock. */ ihold(inode); - btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); + btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); fixup->page = page; @@ -3118,7 +3127,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset, + unpin_extent_cache(inode, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -3796,7 +3805,7 @@ cache_index: * cache. * * This is required for both inode re-read from disk and delayed inode - * in delayed_nodes_tree. + * in the delayed_nodes xarray. */ if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, @@ -4725,7 +4734,7 @@ again: /* * We unlock the page after the io is completed and then re-lock it * above. release_folio() could have come in between that and cleared - * PagePrivate(), but left the page in the mapping. 
Set the page mapped + * folio private, but left the page in the mapping. Set the page mapped * here to make sure it's properly set for the subpage stuff. */ ret = set_page_extent_mapped(page); @@ -4767,9 +4776,10 @@ again: memzero_page(page, (block_start - page_offset(page)) + offset, len); } - btrfs_page_clear_checked(fs_info, page, block_start, - block_end + 1 - block_start); - btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); + btrfs_folio_clear_checked(fs_info, page_folio(page), block_start, + block_end + 1 - block_start); + btrfs_folio_set_dirty(fs_info, page_folio(page), block_start, + block_end + 1 - block_start); unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) @@ -4889,7 +4899,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (!(em->flags & EXTENT_FLAG_PREALLOC)) { struct extent_map *hole_em; err = maybe_insert_hole(inode, cur_offset, hole_size); @@ -4917,7 +4927,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->block_len = 0; hole_em->orig_block_len = 0; hole_em->ram_bytes = hole_size; - hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = btrfs_get_fs_generation(fs_info); err = btrfs_replace_extent_map_range(inode, hole_em, true); @@ -6217,6 +6226,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, inode->i_generation = BTRFS_I(inode)->generation; /* + * We don't have any capability xattrs set here yet, shortcut any + * queries for the xattrs here. If we add them later via the inode + * security init path or any other path this flag will be cleared. + */ + set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + + /* * Subvolumes don't inherit flags from their parent directory. * Originally this was probably by accident, but we probably can't * change it now without compatibility issues. @@ -7258,13 +7274,11 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, em->orig_block_len = orig_block_len; em->ram_bytes = ram_bytes; em->generation = -1; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - if (type == BTRFS_ORDERED_PREALLOC) { - set_bit(EXTENT_FLAG_FILLING, &em->flags); - } else if (type == BTRFS_ORDERED_COMPRESSED) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } + em->flags |= EXTENT_FLAG_PINNED; + if (type == BTRFS_ORDERED_PREALLOC) + em->flags |= EXTENT_FLAG_FILLING; + else if (type == BTRFS_ORDERED_COMPRESSED) + extent_map_set_compression(em, compress_type); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { @@ -7304,10 +7318,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, * just use the extent. * */ - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + if ((em->flags & EXTENT_FLAG_PREALLOC) || ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + if (em->flags & EXTENT_FLAG_PREALLOC) type = BTRFS_ORDERED_PREALLOC; else type = BTRFS_ORDERED_NOCOW; @@ -7542,7 +7556,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. 
*/ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + if (extent_map_is_compressed(em) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); /* @@ -7638,7 +7652,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * that, since we have locked only the parts we are performing I/O in. */ if ((em->block_start == EXTENT_MAP_HOLE) || - (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else { @@ -7851,13 +7865,14 @@ static void btrfs_readahead(struct readahead_control *rac) static void wait_subpage_spinlock(struct page *page) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, page->mapping)) return; - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + subpage = folio_get_private(folio); /* * This may look insane as we just acquire the spinlock and release it, @@ -7995,7 +8010,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, page_end); ASSERT(range_end + 1 - cur < U32_MAX); range_len = range_end + 1 - cur; - if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) { + if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { /* * If Ordered (Private2) is cleared, it means endio has * already been executed for the range. @@ -8004,7 +8019,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, */ goto next; } - btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); + btrfs_folio_clear_ordered(fs_info, folio, cur, range_len); /* * IO on this page will never be started, so we need to account @@ -8074,7 +8089,7 @@ next: * did something wrong. 
*/ ASSERT(!folio_test_ordered(folio)); - btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); + btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); clear_page_extent_mapped(&folio->page); @@ -8098,6 +8113,7 @@ next: vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; + struct folio *folio = page_folio(page); struct inode *inode = file_inode(vmf->vma->vm_file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -8114,6 +8130,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) u64 page_end; u64 end; + ASSERT(folio_order(folio) == 0); + reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); @@ -8217,9 +8235,9 @@ again: if (zero_start != PAGE_SIZE) memzero_page(page, zero_start, PAGE_SIZE - zero_start); - btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); - btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); - btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); + btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); @@ -8462,10 +8480,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; + struct extent_io_tree *file_extent_tree = NULL; + + /* Self tests may pass a NULL fs_info. */ + if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) { + file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); + if (!file_extent_tree) + return NULL; + } ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); - if (!ei) + if (!ei) { + kfree(file_extent_tree); return NULL; + } ei->root = NULL; ei->generation = 0; @@ -8501,10 +8529,18 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); + + /* This io tree sets the valid inode. */ extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; - extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT); + + ei->file_extent_tree = file_extent_tree; + if (file_extent_tree) { + extent_io_tree_init(fs_info, ei->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT); + /* Lockdep class is set only for the file extent tree. 
*/ + lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class); + } mutex_init(&ei->log_mutex); spin_lock_init(&ei->ordered_tree_lock); ei->ordered_tree = RB_ROOT; @@ -8521,12 +8557,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) void btrfs_test_destroy_inode(struct inode *inode) { btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); + kfree(BTRFS_I(inode)->file_extent_tree); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } #endif void btrfs_free_inode(struct inode *inode) { + kfree(BTRFS_I(inode)->file_extent_tree); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } @@ -9632,7 +9670,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->block_len = ins.offset; em->orig_block_len = ins.offset; em->ram_bytes = ins.offset; - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->flags |= EXTENT_FLAG_PREALLOC; em->generation = trans->transid; ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); @@ -9785,7 +9823,9 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) page = find_get_page(inode->vfs_inode.i_mapping, index); ASSERT(page); /* Pages should be in the extent_io_tree */ - btrfs_page_set_writeback(fs_info, page, start, len); + /* This is for data, which doesn't yet support larger folio. */ + ASSERT(folio_order(page_folio(page)) == 0); + btrfs_folio_set_writeback(fs_info, page_folio(page), start, len); put_page(page); index++; } @@ -9994,7 +10034,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) return -ENOMEM; - ret = btrfs_alloc_page_array(nr_pages, pages); + ret = btrfs_alloc_page_array(nr_pages, pages, 0); if (ret) { ret = -ENOMEM; goto out; @@ -10113,12 +10153,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->len = min_t(u64, extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + (em->flags & EXTENT_FLAG_PREALLOC)) { disk_bytenr = EXTENT_MAP_HOLE; count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; - } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + } else if (extent_map_is_compressed(em)) { disk_bytenr = em->block_start; /* * Bail if the buffer isn't large enough to return the whole @@ -10133,7 +10173,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - em->orig_start; ret = btrfs_encoded_io_compression_from_extent(fs_info, - em->compress_type); + extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; @@ -10564,6 +10604,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; struct extent_map *em = NULL; + struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, @@ -10680,7 +10721,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + if (extent_map_is_compressed(em)) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; @@ -10703,13 +10744,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, goto 
out; } - em = btrfs_get_chunk_map(fs_info, logical_block_start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, logical_block_start, len); + if (IS_ERR(map)) { + ret = PTR_ERR(map); goto out; } - if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { btrfs_warn(fs_info, "swapfile must have single data profile"); ret = -EINVAL; @@ -10717,23 +10758,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, } if (device == NULL) { - device = em->map_lookup->stripes[0].dev; + device = map->stripes[0].dev; ret = btrfs_add_swapfile_pin(inode, device, false); if (ret == 1) ret = 0; else if (ret) goto out; - } else if (device != em->map_lookup->stripes[0].dev) { + } else if (device != map->stripes[0].dev) { btrfs_warn(fs_info, "swapfile must be on one device"); ret = -EINVAL; goto out; } - physical_block_start = (em->map_lookup->stripes[0].physical + - (logical_block_start - em->start)); - len = min(len, em->len - (logical_block_start - em->start)); - free_extent_map(em); - em = NULL; + physical_block_start = (map->stripes[0].physical + + (logical_block_start - map->start)); + len = min(len, map->chunk_len - (logical_block_start - map->start)); + btrfs_free_chunk_map(map); + map = NULL; bg = btrfs_lookup_block_group(fs_info, logical_block_start); if (!bg) { @@ -10786,6 +10827,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, out: if (!IS_ERR_OR_NULL(em)) free_extent_map(em); + if (!IS_ERR_OR_NULL(map)) + btrfs_free_chunk_map(map); unlock_extent(io_tree, 0, isize - 1, &cached_state); @@ -10930,7 +10973,7 @@ static const struct address_space_operations btrfs_aops = { .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a1743904202b..41b479861b3c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4533,29 +4533,29 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool if (ret < 0) goto out_acct; - file_start_write(file); - if (iov_iter_count(&iter) == 0) { ret = 0; - goto out_end_write; + goto out_iov; } pos = args.offset; ret = rw_verify_area(WRITE, file, &pos, args.len); if (ret < 0) - goto out_end_write; + goto out_iov; init_sync_kiocb(&kiocb, file); ret = kiocb_set_rw_flags(&kiocb, 0); if (ret) - goto out_end_write; + goto out_iov; kiocb.ki_pos = pos; + file_start_write(file); + ret = btrfs_do_write_iter(&kiocb, &iter, &args); if (ret > 0) fsnotify_modify(file); -out_end_write: file_end_write(file); +out_iov: kfree(iov); out_acct: if (ret > 0) diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c index 0fe0ae54ac67..fd88af17d8d9 100644 --- a/fs/btrfs/lru_cache.c +++ b/fs/btrfs/lru_cache.c @@ -9,7 +9,7 @@ * * @cache: The cache. * @max_size: Maximum size (number of entries) for the cache. - * Use 0 for unlimited size, it's the user's responsability to + * Use 0 for unlimited size, it's the user's responsibility to * trim the cache in that case. 
*/ void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d3fcfc628a4f..1131d5a29d61 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -152,7 +152,7 @@ static int copy_compressed_data_to_page(char *compressed_data, cur_page = out_pages[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_page) { - cur_page = alloc_page(GFP_NOFS); + cur_page = btrfs_alloc_compr_page(); if (!cur_page) return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; @@ -178,7 +178,7 @@ static int copy_compressed_data_to_page(char *compressed_data, cur_page = out_pages[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_page) { - cur_page = alloc_page(GFP_NOFS); + cur_page = btrfs_alloc_compr_page(); if (!cur_page) return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index b8f9c9e56c8c..cdada4865837 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -287,7 +287,7 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) * panic or BUGs, depending on mount options. */ __cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int error, const char *fmt, ...) { char *s_id = "<unknown>"; diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 4d04c1fa5899..08a9272399d2 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -194,7 +194,7 @@ const char * __attribute_const__ btrfs_decode_error(int error); __printf(5, 6) __cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int error, const char *fmt, ...); /* * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a82e1417c4d2..59850dc17b22 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -323,9 +323,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * * If there's no such bit, we need to skip to next range. */ - if (!btrfs_page_test_ordered(fs_info, page, file_offset, len)) + if (!btrfs_folio_test_ordered(fs_info, page_folio(page), + file_offset, len)) return false; - btrfs_page_clear_ordered(fs_info, page, file_offset, len); + btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len); } /* Now we're fine to update the accounting. */ diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 567a6d3d4712..127ef8bf0ffd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -97,13 +97,6 @@ struct btrfs_ordered_extent { u64 bytes_left; /* - * the end of the ordered extent which is behind it but - * didn't update disk_i_size. Please see the comment of - * btrfs_ordered_update_i_size(); - */ - u64 outstanding_isize; - - /* * If we get truncated we need to adjust the file extent we enter for * this ordered extent so that we do not expose stale data. */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e46774e8f49f..63b426cc7798 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -194,7 +194,7 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, * * Must be called with qgroup_lock held and @prealloc preallocated. 
* - * The control on the lifespan of @prealloc would be transfered to this + * The control on the lifespan of @prealloc would be transferred to this * function, thus caller should no longer touch @prealloc. */ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3e014b9370a3..792c8e17c31d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -964,7 +964,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { int ret; - ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0); if (ret < 0) return ret; /* Mapping all sectors */ @@ -979,7 +979,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) int ret; ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, - rbio->stripe_pages + data_pages); + rbio->stripe_pages + data_pages, 0); if (ret < 0) return ret; @@ -1530,7 +1530,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; - ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages); + ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0); if (ret < 0) return ret; @@ -1549,7 +1549,6 @@ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; - struct work_struct work; }; /* diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 45e6ff78316f..470213688872 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -164,7 +164,7 @@ struct raid56_bio_trace_info { u8 stripe_nr; }; -static inline int nr_data_stripes(const struct map_lookup *map) +static inline int nr_data_stripes(const struct btrfs_chunk_map *map) { return map->num_stripes - btrfs_nr_parity_stripes(map->type); } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f88b0c2ac3fe..ae90894dc7dc 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -141,9 +141,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (datal < block_size) memzero_page(page, datal, block_size - datal); - btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); - btrfs_page_clear_checked(fs_info, page, file_offset, block_size); - btrfs_page_set_dirty(fs_info, page, file_offset, block_size); + btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size); + btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size); + btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size); out_unlock: if (page) { unlock_page(page); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f5d9e5f74a52..abe594f77f99 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2895,7 +2895,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * will re-read the whole page anyway. 
*/ if (page) { - btrfs_subpage_clear_uptodate(fs_info, page, i_size, + btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size, round_up(i_size, PAGE_SIZE) - i_size); unlock_page(page); put_page(page); @@ -2951,7 +2951,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod em->len = end + 1 - start; em->block_len = em->len; em->block_start = block_start; - set_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags |= EXTENT_FLAG_PINNED; lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); @@ -3070,7 +3070,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len); goto release_page; } - btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len); + btrfs_folio_set_dirty(fs_info, page_folio(page), + clamped_start, clamped_len); /* * Set the boundary if it's inside the page. diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f62a408671cb..a01807cbd4d4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -43,7 +43,7 @@ struct scrub_ctx; /* * The following value only influences the performance. * - * This detemines how many stripes would be submitted in one go, + * This determines how many stripes would be submitted in one go, * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). */ #define SCRUB_STRIPES_PER_GROUP 8 @@ -192,7 +192,6 @@ struct scrub_ctx { int cur_stripe; atomic_t cancel_req; int readonly; - int sectors_per_bio; /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; @@ -262,7 +261,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info, atomic_set(&stripe->pending_io, 0); spin_lock_init(&stripe->write_error_lock); - ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages); + ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0); if (ret < 0) goto error; @@ -710,7 +709,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) /* Metadata, verify the full tree block. */ if (sector->is_metadata) { /* - * Check if the tree block crosses the stripe boudary. If + * Check if the tree block crosses the stripe boundary. If * crossed the boundary, we cannot verify it but only give a * warning. * @@ -884,7 +883,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, /* * Init needed infos for error reporting. * - * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio() + * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio() * thus no need for dev/physical, error reporting still needs dev and physical. */ if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { @@ -1280,7 +1279,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d * return 0 if it is a data stripe, 1 means parity stripe. */ static int get_raid56_logic_offset(u64 physical, int num, - struct map_lookup *map, u64 *offset, + struct btrfs_chunk_map *map, u64 *offset, u64 *stripe_start) { int i; @@ -1409,14 +1408,11 @@ search_forward: if (ret > 0) break; next: - path->slots[0]++; - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(extent_root, path); - if (ret) { - /* Either no more item or fatal error */ - btrfs_release_path(path); - return ret; - } + ret = btrfs_next_item(extent_root, path); + if (ret) { + /* Either no more items or a fatal error. 
*/ + btrfs_release_path(path); + return ret; } } btrfs_release_path(path); @@ -1816,7 +1812,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) if (sctx->is_dev_replace) { /* * For dev-replace, if we know there is something wrong with - * metadata, we should immedately abort. + * metadata, we should immediately abort. */ for (int i = 0; i < nr_stripes; i++) { if (stripe_has_metadata_error(&sctx->stripes[i])) { @@ -1898,7 +1894,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, u64 full_stripe_start) { DECLARE_COMPLETION_ONSTACK(io_done); @@ -2067,7 +2063,7 @@ out: */ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, u64 logical_start, u64 logical_length, struct btrfs_device *device, u64 physical, int mirror_num) @@ -2128,7 +2124,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, } /* Calculate the full stripe length for simple stripe based profiles */ -static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) +static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map) { ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); @@ -2137,7 +2133,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) } /* Get the logical bytenr for the stripe */ -static u64 simple_stripe_get_logical(struct map_lookup *map, +static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map, struct btrfs_block_group *bg, int stripe_index) { @@ -2154,7 +2150,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map, } /* Get the mirror number for the stripe */ -static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) +static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index) { ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); @@ -2166,7 +2162,7 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) static int scrub_simple_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct btrfs_device *device, int stripe_index) { @@ -2199,18 +2195,17 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct extent_map *em, + struct btrfs_chunk_map *map, struct btrfs_device *scrub_dev, int stripe_index) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct map_lookup *map = em->map_lookup; const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; int ret; int ret2; u64 physical = map->stripes[stripe_index].physical; - const u64 dev_stripe_len = btrfs_calc_stripe_length(em); + const u64 dev_stripe_len = btrfs_calc_stripe_length(map); const u64 physical_end = physical + dev_stripe_len; u64 logical; u64 logic_end; @@ -2373,17 +2368,12 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, u64 dev_extent_len) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; + struct btrfs_chunk_map *map; int i; int ret = 0; - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, bg->start, bg->length); - 
read_unlock(&map_tree->lock); - - if (!em) { + map = btrfs_find_chunk_map(fs_info, bg->start, bg->length); + if (!map) { /* * Might have been an unused block group deleted by the cleaner * kthread or relocation. @@ -2395,22 +2385,21 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, return ret; } - if (em->start != bg->start) + if (map->start != bg->start) goto out; - if (em->len < dev_extent_len) + if (map->chunk_len < dev_extent_len) goto out; - map = em->map_lookup; for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, bg, em, scrub_dev, i); + ret = scrub_stripe(sctx, bg, map, scrub_dev, i); if (ret) goto out; } } out: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 1b999c6e4193..93511d54abf8 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -64,7 +64,7 @@ * This means a slightly higher tree locking latency. */ -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) { if (fs_info->sectorsize >= PAGE_SIZE) return false; @@ -74,8 +74,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) * mapping. And if page->mapping->host is data inode, it's subpage. * As we have ruled our sectorsize >= PAGE_SIZE case already. */ - if (!page->mapping || !page->mapping->host || - is_data_inode(page->mapping->host)) + if (!mapping || !mapping->host || is_data_inode(mapping->host)) return true; /* @@ -116,7 +115,7 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector } int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page, enum btrfs_subpage_type type) + struct folio *folio, enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; @@ -124,31 +123,30 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. */ - if (page->mapping) - ASSERT(PageLocked(page)); + if (folio->mapping) + ASSERT(folio_test_locked(folio)); - /* Either not subpage, or the page already has private attached */ - if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page)) + /* Either not subpage, or the folio already has private attached. */ + if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio)) return 0; subpage = btrfs_alloc_subpage(fs_info, type); if (IS_ERR(subpage)) return PTR_ERR(subpage); - attach_page_private(page, subpage); + folio_attach_private(folio, subpage); return 0; } -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_subpage *subpage; - /* Either not subpage, or already detached */ - if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page)) + /* Either not subpage, or the folio already has private attached. */ + if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio)) return; - subpage = detach_page_private(page); + subpage = folio_detach_private(folio); ASSERT(subpage); btrfs_free_subpage(subpage); } @@ -188,77 +186,78 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage) * This is important for eb allocation, to prevent race with last eb freeing * of the same page. 
* With the eb_refs increased before the eb inserted into radix tree, - * detach_extent_buffer_page() won't detach the page private while we're still + * detach_extent_buffer_page() won't detach the folio private while we're still * allocating the extent buffer. */ -void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + ASSERT(folio_test_private(folio) && folio->mapping); + lockdep_assert_held(&folio->mapping->i_private_lock); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); atomic_inc(&subpage->eb_refs); } -void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + ASSERT(folio_test_private(folio) && folio->mapping); + lockdep_assert_held(&folio->mapping->i_private_lock); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); ASSERT(atomic_read(&subpage->eb_refs)); atomic_dec(&subpage->eb_refs); } static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { + /* For subpage support, the folio must be single page. */ + ASSERT(folio_order(folio) == 0); + /* Basic checks */ - ASSERT(PagePrivate(page) && page->private); + ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); /* * The range check only works for mapped page, we can still have * unmapped page like dummy extent buffer pages. 
*/ - if (page->mapping) - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); + if (folio->mapping) + ASSERT(folio_pos(folio) <= start && + start + len <= folio_pos(folio) + PAGE_SIZE); } void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = len >> fs_info->sectorsize_bits; - btrfs_subpage_assert(fs_info, page, start, len); + btrfs_subpage_assert(fs_info, folio, start, len); atomic_add(nbits, &subpage->readers); } void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = len >> fs_info->sectorsize_bits; bool is_data; bool last; - btrfs_subpage_assert(fs_info, page, start, len); - is_data = is_data_inode(page->mapping->host); + btrfs_subpage_assert(fs_info, folio, start, len); + is_data = is_data_inode(folio->mapping->host); ASSERT(atomic_read(&subpage->readers) >= nbits); last = atomic_sub_and_test(nbits, &subpage->readers); @@ -270,35 +269,35 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, * As we want the atomic_sub_and_test() to be always executed. */ if (is_data && last) - unlock_page(page); + folio_unlock(folio); } -static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) +static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) { u64 orig_start = *start; u32 orig_len = *len; - *start = max_t(u64, page_offset(page), orig_start); + *start = max_t(u64, folio_pos(folio), orig_start); /* * For certain call sites like btrfs_drop_pages(), we may have pages * beyond the target range. In that case, just set @len to 0, subpage * helpers can handle @len == 0 without any problem. 
*/ - if (page_offset(page) >= orig_start + orig_len) + if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, page_offset(page) + PAGE_SIZE, + *len = min_t(u64, folio_pos(folio) + PAGE_SIZE, orig_start + orig_len) - *start; } void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = (len >> fs_info->sectorsize_bits); int ret; - btrfs_subpage_assert(fs_info, page, start, len); + btrfs_subpage_assert(fs_info, folio, start, len); ASSERT(atomic_read(&subpage->readers) == 0); ret = atomic_add_return(nbits, &subpage->writers); @@ -306,12 +305,12 @@ void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, } bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = (len >> fs_info->sectorsize_bits); - btrfs_subpage_assert(fs_info, page, start, len); + btrfs_subpage_assert(fs_info, folio, start, len); /* * We have call sites passing @lock_page into @@ -328,7 +327,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, } /* - * Lock a page for delalloc page writeback. + * Lock a folio for delalloc page writeback. * * Return -EAGAIN if the page is not properly initialized. * Return 0 with the page locked, and writer counter updated. @@ -337,38 +336,40 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, * it's really the correct page, as the caller is using * filemap_get_folios_contig(), which can race with page invalidating. 
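The reader side above is plain reference counting per range: btrfs_subpage_start_reader() adds one count per sector covered, and btrfs_subpage_end_reader() subtracts the same amount and only unlocks a data inode's folio when the count reaches zero; the writer count follows the same pattern via atomic_add_return()/atomic_sub_and_test(). Below is a minimal standalone sketch of that counting scheme, using C11 atomics and hypothetical toy_* names rather than the real kernel types:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for the per-folio btrfs_subpage structure. */
struct toy_subpage {
	atomic_int readers;
};

/* One count per sector in the range, as in btrfs_subpage_start_reader(). */
static void toy_start_reader(struct toy_subpage *sp, unsigned int nr_sectors)
{
	atomic_fetch_add(&sp->readers, nr_sectors);
}

/*
 * Returns true when this call dropped the last reader, i.e. the point
 * where the real helper would call folio_unlock() for a data inode.
 */
static bool toy_end_reader(struct toy_subpage *sp, unsigned int nr_sectors)
{
	return atomic_fetch_sub(&sp->readers, nr_sectors) == (int)nr_sectors;
}

int main(void)
{
	struct toy_subpage sp = { .readers = 0 };

	toy_start_reader(&sp, 4);	/* e.g. a 16K read with 4K sectors */
	toy_start_reader(&sp, 2);	/* a second in-flight range */

	printf("last reader? %d\n", toy_end_reader(&sp, 2));	/* 0 */
	printf("last reader? %d\n", toy_end_reader(&sp, 4));	/* 1: unlock */
	return 0;
}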
*/ -int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) +int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { - lock_page(page); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + folio_lock(folio); return 0; } - lock_page(page); - if (!PagePrivate(page) || !page->private) { - unlock_page(page); + folio_lock(folio); + if (!folio_test_private(folio) || !folio_get_private(folio)) { + folio_unlock(folio); return -EAGAIN; } - btrfs_subpage_clamp_range(page, &start, &len); - btrfs_subpage_start_writer(fs_info, page, start, len); + btrfs_subpage_clamp_range(folio, &start, &len); + btrfs_subpage_start_writer(fs_info, folio, start, len); return 0; } -void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) +void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) - return unlock_page(page); - btrfs_subpage_clamp_range(page, &start, &len); - if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) - unlock_page(page); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + folio_unlock(folio); + return; + } + btrfs_subpage_clamp_range(folio, &start, &len); + if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) + folio_unlock(folio); } -#define subpage_calc_start_bit(fs_info, page, name, start, len) \ +#define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ unsigned int start_bit; \ \ - btrfs_subpage_assert(fs_info, page, start, len); \ + btrfs_subpage_assert(fs_info, folio, start, len); \ start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ start_bit += fs_info->subpage_info->name##_offset; \ start_bit; \ @@ -385,46 +386,46 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, fs_info->subpage_info->bitmap_nr_bits) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) - SetPageUptodate(page); + folio_mark_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - ClearPageUptodate(page); + folio_clear_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void 
btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&subpage->lock, flags); - set_page_dirty(page); + folio_mark_dirty(folio); } /* @@ -438,10 +439,10 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, * extra handling for tree blocks. */ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; bool last = false; @@ -455,101 +456,101 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, } void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { bool last; - last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len); + last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, start, len); if (last) - clear_page_dirty_for_io(page); + folio_clear_dirty_for_io(folio); } void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - set_page_writeback(page); + folio_start_writeback(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { - ASSERT(PageWriteback(page)); - end_page_writeback(page); + ASSERT(folio_test_writeback(folio)); + folio_end_writeback(folio); } spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = 
subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - SetPageOrdered(page); + folio_set_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) - ClearPageOrdered(page); + folio_clear_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) - SetPageChecked(page); + folio_set_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - ClearPageChecked(page); + folio_clear_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -559,10 +560,10 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, */ #define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ + struct folio *folio, u64 start, u32 len) \ { \ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \ name, start, len); \ unsigned long flags; \ bool ret; \ @@ -584,88 +585,94 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall * back to regular sectorsize branch. 
*/ -#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \ - test_page_func) \ -void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +#define IMPLEMENT_BTRFS_PAGE_OPS(name, folio_set_func, \ + folio_clear_func, folio_test_func) \ +void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ - set_page_func(page); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_set_func(folio); \ return; \ } \ - btrfs_subpage_set_##name(fs_info, page, start, len); \ + btrfs_subpage_set_##name(fs_info, folio, start, len); \ } \ -void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ - clear_page_func(page); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_clear_func(folio); \ return; \ } \ - btrfs_subpage_clear_##name(fs_info, page, start, len); \ + btrfs_subpage_clear_##name(fs_info, folio, start, len); \ } \ -bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ - return test_page_func(page); \ - return btrfs_subpage_test_##name(fs_info, page, start, len); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) \ + return folio_test_func(folio); \ + return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } \ -void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ - set_page_func(page); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_set_func(folio); \ return; \ } \ - btrfs_subpage_clamp_range(page, &start, &len); \ - btrfs_subpage_set_##name(fs_info, page, start, len); \ + btrfs_subpage_clamp_range(folio, &start, &len); \ + btrfs_subpage_set_##name(fs_info, folio, start, len); \ } \ -void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ - clear_page_func(page); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_clear_func(folio); \ return; \ } \ - btrfs_subpage_clamp_range(page, &start, &len); \ - btrfs_subpage_clear_##name(fs_info, page, start, len); \ + btrfs_subpage_clamp_range(folio, &start, &len); \ + btrfs_subpage_clear_##name(fs_info, folio, start, len); \ } \ -bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, 
page)) \ - return test_page_func(page); \ - btrfs_subpage_clamp_range(page, &start, &len); \ - return btrfs_subpage_test_##name(fs_info, page, start, len); \ -} -IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, - PageUptodate); -IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, - PageDirty); -IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, - PageWriteback); -IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, - PageOrdered); -IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked); + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) \ + return folio_test_func(folio); \ + btrfs_subpage_clamp_range(folio, &start, &len); \ + return btrfs_subpage_test_##name(fs_info, folio, start, len); \ +} +IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate, + folio_test_uptodate); +IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io, + folio_test_dirty); +IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback, + folio_test_writeback); +IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, + folio_test_ordered); +IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, + folio_test_checked); /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit * is cleared. */ -void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; - ASSERT(!PageDirty(page)); - if (!btrfs_is_subpage(fs_info, page)) + ASSERT(!folio_test_dirty(folio)); + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(PagePrivate(page) && page->private); + ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty)); } @@ -684,18 +691,20 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, * extent_write_locked_range(). * In this case, we have to call subpage helper to handle the case. */ -void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, - u64 start, u32 len) +void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; - ASSERT(PageLocked(page)); + ASSERT(folio_test_locked(folio)); /* For non-subpage case, we just unlock the page */ - if (!btrfs_is_subpage(fs_info, page)) - return unlock_page(page); + if (!btrfs_is_subpage(fs_info, folio->mapping)) { + folio_unlock(folio); + return; + } - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + subpage = folio_get_private(folio); /* * For subpage case, there are two types of locked page. With or @@ -704,12 +713,14 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, * Since we own the page lock, no one else could touch subpage::writers * and we are safe to do several atomic operations without spinlock. 
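IMPLEMENT_BTRFS_PAGE_OPS() above is a token-pasting template: one expansion per state emits the whole set/clear/test trio, each falling back to the full-folio flag helper when the filesystem is not in subpage mode. A tiny standalone sketch of the same generation technique, with hypothetical toy_* names and a plain bitmask standing in for folio state:

#include <stdbool.h>
#include <stdio.h>

/*
 * One macro expansion defines set/clear/test for a named bit, mirroring
 * how the template above stamps out btrfs_folio_{set,clear,test}_*().
 */
#define DEFINE_TOY_FLAG_OPS(name, bit)					\
static inline void toy_set_##name(unsigned long *flags)		\
{									\
	*flags |= (1UL << (bit));					\
}									\
static inline void toy_clear_##name(unsigned long *flags)		\
{									\
	*flags &= ~(1UL << (bit));					\
}									\
static inline bool toy_test_##name(const unsigned long *flags)		\
{									\
	return *flags & (1UL << (bit));					\
}

DEFINE_TOY_FLAG_OPS(uptodate, 0)
DEFINE_TOY_FLAG_OPS(dirty, 1)

int main(void)
{
	unsigned long flags = 0;

	toy_set_uptodate(&flags);
	toy_set_dirty(&flags);
	toy_clear_dirty(&flags);
	printf("uptodate=%d dirty=%d\n",
	       toy_test_uptodate(&flags), toy_test_dirty(&flags));
	return 0;
}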
*/ - if (atomic_read(&subpage->writers) == 0) + if (atomic_read(&subpage->writers) == 0) { /* No writers, locked by plain lock_page() */ - return unlock_page(page); + folio_unlock(folio); + return; + } /* Have writers, use proper subpage helper to end it */ - btrfs_page_end_writer_lock(fs_info, page, start, len); + btrfs_folio_end_writer_lock(fs_info, folio, start, len); } #define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ @@ -717,7 +728,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, subpage_info->name##_offset, subpage_info->bitmap_nr_bits) void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage; @@ -729,9 +740,9 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long checked_bitmap; unsigned long flags; - ASSERT(PagePrivate(page) && page->private); + ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(subpage_info); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); @@ -741,10 +752,10 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); - dump_page(page, "btrfs subpage dump"); + dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", - start, len, page_offset(page), + start, len, folio_pos(folio), subpage_info->bitmap_nr_bits, &uptodate_bitmap, subpage_info->bitmap_nr_bits, &error_bitmap, subpage_info->bitmap_nr_bits, &dirty_bitmap, diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 5cbf67ccbdeb..793c2b314a58 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -73,71 +73,68 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page); +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page, enum btrfs_subpage_type type); -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page); + struct folio *folio, enum btrfs_subpage_type type); +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); /* Allocate additional data where page represents more than one sector */ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); -void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page); -void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page); +void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); +void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); 
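The per-state bitmaps dumped above all live in one shared bitmap per folio, with each state (uptodate, dirty, writeback, ordered, checked) owning a region whose offset is recorded in btrfs_subpage_info; subpage_calc_start_bit() simply converts the byte offset inside the page into sectors and adds that region offset. A small standalone sketch of the arithmetic, with made-up values (64K pages, 4K sectors) standing in for the real fs_info fields:

#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical, simplified stand-in for btrfs_subpage_info: each tracked
 * state owns a contiguous region of the shared per-folio bitmap.
 */
struct toy_subpage_info {
	unsigned int bitmap_nr_bits;	/* sectors per page, e.g. 64K / 4K = 16 */
	unsigned int uptodate_offset;	/* first bit of the "uptodate" region   */
	unsigned int dirty_offset;	/* first bit of the "dirty" region      */
};

/*
 * Mirrors the idea of subpage_calc_start_bit(): offset inside the page,
 * expressed in sectors, plus the offset of the requested state's region.
 */
static unsigned int calc_start_bit(uint64_t folio_pos, uint64_t start,
				   unsigned int sectorsize_bits,
				   unsigned int state_offset)
{
	return (unsigned int)((start - folio_pos) >> sectorsize_bits) + state_offset;
}

int main(void)
{
	const unsigned int sectorsize_bits = 12;	/* 4K sectors */
	struct toy_subpage_info info = {
		.bitmap_nr_bits = 16,
		.uptodate_offset = 0,
		.dirty_offset = 16,
	};
	uint64_t folio_pos = 0x10000;		/* folio covers [64K, 128K)       */
	uint64_t start = folio_pos + 0x2000;	/* third sector inside the folio  */

	printf("uptodate bit %u, dirty bit %u (of %u per state)\n",
	       calc_start_bit(folio_pos, start, sectorsize_bits, info.uptodate_offset),
	       calc_start_bit(folio_pos, start, sectorsize_bits, info.dirty_offset),
	       info.bitmap_nr_bits);
	return 0;
}

With these values a folio carries 16 bits per state, so the dirty region starts at bit 16 and a range beginning 8K into the folio maps to bit 2 of its region.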
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); -int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); -void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); +int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); /* * Template for subpage related operations. * - * btrfs_subpage_*() are for call sites where the page has subpage attached and - * the range is ensured to be inside the page. + * btrfs_subpage_*() are for call sites where the folio has subpage attached and + * the range is ensured to be inside the folio's single page. * - * btrfs_page_*() are for call sites where the page can either be subpage - * specific or regular page. The function will handle both cases. - * But the range still needs to be inside the page. + * btrfs_folio_*() are for call sites where the page can either be subpage + * specific or regular folios. The function will handle both cases. + * But the range still needs to be inside one single page. * - * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't + * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't * need to be inside the page. Those functions will truncate the range * automatically. 
*/ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ + struct folio *folio, u64 start, u32 len); \ void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ + struct folio *folio, u64 start, u32 len); \ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(dirty); @@ -146,13 +143,12 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered); DECLARE_BTRFS_SUBPAGE_OPS(checked); bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); -void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, - struct page *page); -void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, - u64 start, u32 len); +void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio); +void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ef256b944c72..896acfda1789 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -27,6 +27,7 @@ #include <linux/crc32c.h> #include <linux/btrfs.h> #include <linux/security.h> +#include <linux/fs_parser.h> #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -64,19 +65,7 @@ #include <trace/events/btrfs.h> static const struct super_operations btrfs_super_ops; - -/* - * Types for mounting the default subvolume and a subvolume explicitly - * requested by subvol=/path. That way the callchain is straightforward and we - * don't have to play tricks with the mount options and recursive calls to - * btrfs_mount. 
- * - * The new btrfs_root_fs_type also servers as a tag for the bdev_holder. - */ static struct file_system_type btrfs_fs_type; -static struct file_system_type btrfs_root_fs_type; - -static int btrfs_remount(struct super_block *sb, int *flags, char *data); static void btrfs_put_super(struct super_block *sb) { @@ -86,8 +75,22 @@ static void btrfs_put_super(struct super_block *sb) close_ctree(fs_info); } +/* Store the mount options related information. */ +struct btrfs_fs_context { + char *subvol_name; + u64 subvol_objectid; + u64 max_inline; + u32 commit_interval; + u32 metadata_ratio; + u32 thread_pool_size; + unsigned long mount_opt; + unsigned long compress_type:4; + unsigned int compress_level; + refcount_t refs; +}; + enum { - Opt_acl, Opt_noacl, + Opt_acl, Opt_clear_cache, Opt_commit_interval, Opt_compress, @@ -97,27 +100,26 @@ enum { Opt_degraded, Opt_device, Opt_fatal_errors, - Opt_flushoncommit, Opt_noflushoncommit, + Opt_flushoncommit, Opt_max_inline, - Opt_barrier, Opt_nobarrier, - Opt_datacow, Opt_nodatacow, - Opt_datasum, Opt_nodatasum, - Opt_defrag, Opt_nodefrag, - Opt_discard, Opt_nodiscard, + Opt_barrier, + Opt_datacow, + Opt_datasum, + Opt_defrag, + Opt_discard, Opt_discard_mode, - Opt_norecovery, Opt_ratio, Opt_rescan_uuid_tree, Opt_skip_balance, - Opt_space_cache, Opt_no_space_cache, + Opt_space_cache, Opt_space_cache_version, - Opt_ssd, Opt_nossd, - Opt_ssd_spread, Opt_nossd_spread, + Opt_ssd, + Opt_ssd_spread, Opt_subvol, Opt_subvol_empty, Opt_subvolid, Opt_thread_pool, - Opt_treelog, Opt_notreelog, + Opt_treelog, Opt_user_subvol_rm_allowed, /* Rescue options */ @@ -128,14 +130,10 @@ enum { Opt_ignoredatacsums, Opt_rescue_all, - /* Deprecated options */ - Opt_recovery, - Opt_inode_cache, Opt_noinode_cache, - /* Debugging options */ - Opt_enospc_debug, Opt_noenospc_debug, + Opt_enospc_debug, #ifdef CONFIG_BTRFS_DEBUG - Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, + Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, #endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY Opt_ref_verify, @@ -143,785 +141,611 @@ enum { Opt_err, }; -static const match_table_t tokens = { - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_clear_cache, "clear_cache"}, - {Opt_commit_interval, "commit=%u"}, - {Opt_compress, "compress"}, - {Opt_compress_type, "compress=%s"}, - {Opt_compress_force, "compress-force"}, - {Opt_compress_force_type, "compress-force=%s"}, - {Opt_degraded, "degraded"}, - {Opt_device, "device=%s"}, - {Opt_fatal_errors, "fatal_errors=%s"}, - {Opt_flushoncommit, "flushoncommit"}, - {Opt_noflushoncommit, "noflushoncommit"}, - {Opt_inode_cache, "inode_cache"}, - {Opt_noinode_cache, "noinode_cache"}, - {Opt_max_inline, "max_inline=%s"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_datacow, "datacow"}, - {Opt_nodatacow, "nodatacow"}, - {Opt_datasum, "datasum"}, - {Opt_nodatasum, "nodatasum"}, - {Opt_defrag, "autodefrag"}, - {Opt_nodefrag, "noautodefrag"}, - {Opt_discard, "discard"}, - {Opt_discard_mode, "discard=%s"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_norecovery, "norecovery"}, - {Opt_ratio, "metadata_ratio=%u"}, - {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, - {Opt_skip_balance, "skip_balance"}, - {Opt_space_cache, "space_cache"}, - {Opt_no_space_cache, "nospace_cache"}, - {Opt_space_cache_version, "space_cache=%s"}, - {Opt_ssd, "ssd"}, - {Opt_nossd, "nossd"}, - {Opt_ssd_spread, "ssd_spread"}, - {Opt_nossd_spread, "nossd_spread"}, - {Opt_subvol, "subvol=%s"}, - {Opt_subvol_empty, "subvol="}, - {Opt_subvolid, "subvolid=%s"}, - 
{Opt_thread_pool, "thread_pool=%u"}, - {Opt_treelog, "treelog"}, - {Opt_notreelog, "notreelog"}, - {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, +enum { + Opt_fatal_errors_panic, + Opt_fatal_errors_bug, +}; - /* Rescue options */ - {Opt_rescue, "rescue=%s"}, +static const struct constant_table btrfs_parameter_fatal_errors[] = { + { "panic", Opt_fatal_errors_panic }, + { "bug", Opt_fatal_errors_bug }, + {} +}; + +enum { + Opt_discard_sync, + Opt_discard_async, +}; + +static const struct constant_table btrfs_parameter_discard[] = { + { "sync", Opt_discard_sync }, + { "async", Opt_discard_async }, + {} +}; + +enum { + Opt_space_cache_v1, + Opt_space_cache_v2, +}; + +static const struct constant_table btrfs_parameter_space_cache[] = { + { "v1", Opt_space_cache_v1 }, + { "v2", Opt_space_cache_v2 }, + {} +}; + +enum { + Opt_rescue_usebackuproot, + Opt_rescue_nologreplay, + Opt_rescue_ignorebadroots, + Opt_rescue_ignoredatacsums, + Opt_rescue_parameter_all, +}; + +static const struct constant_table btrfs_parameter_rescue[] = { + { "usebackuproot", Opt_rescue_usebackuproot }, + { "nologreplay", Opt_rescue_nologreplay }, + { "ignorebadroots", Opt_rescue_ignorebadroots }, + { "ibadroots", Opt_rescue_ignorebadroots }, + { "ignoredatacsums", Opt_rescue_ignoredatacsums }, + { "idatacsums", Opt_rescue_ignoredatacsums }, + { "all", Opt_rescue_parameter_all }, + {} +}; + +#ifdef CONFIG_BTRFS_DEBUG +enum { + Opt_fragment_parameter_data, + Opt_fragment_parameter_metadata, + Opt_fragment_parameter_all, +}; + +static const struct constant_table btrfs_parameter_fragment[] = { + { "data", Opt_fragment_parameter_data }, + { "metadata", Opt_fragment_parameter_metadata }, + { "all", Opt_fragment_parameter_all }, + {} +}; +#endif + +static const struct fs_parameter_spec btrfs_fs_parameters[] = { + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("autodefrag", Opt_defrag), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag("clear_cache", Opt_clear_cache), + fsparam_u32("commit", Opt_commit_interval), + fsparam_flag("compress", Opt_compress), + fsparam_string("compress", Opt_compress_type), + fsparam_flag("compress-force", Opt_compress_force), + fsparam_string("compress-force", Opt_compress_force_type), + fsparam_flag_no("datacow", Opt_datacow), + fsparam_flag_no("datasum", Opt_datasum), + fsparam_flag("degraded", Opt_degraded), + fsparam_string("device", Opt_device), + fsparam_flag_no("discard", Opt_discard), + fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard), + fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors), + fsparam_flag_no("flushoncommit", Opt_flushoncommit), + fsparam_string("max_inline", Opt_max_inline), + fsparam_u32("metadata_ratio", Opt_ratio), + fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree), + fsparam_flag("skip_balance", Opt_skip_balance), + fsparam_flag_no("space_cache", Opt_space_cache), + fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache), + fsparam_flag_no("ssd", Opt_ssd), + fsparam_flag_no("ssd_spread", Opt_ssd_spread), + fsparam_string("subvol", Opt_subvol), + fsparam_flag("subvol=", Opt_subvol_empty), + fsparam_u64("subvolid", Opt_subvolid), + fsparam_u32("thread_pool", Opt_thread_pool), + fsparam_flag_no("treelog", Opt_treelog), + fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed), + + /* Rescue options. 
*/ + fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue), /* Deprecated, with alias rescue=nologreplay */ - {Opt_nologreplay, "nologreplay"}, + __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL), /* Deprecated, with alias rescue=usebackuproot */ - {Opt_usebackuproot, "usebackuproot"}, + __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), - /* Deprecated options */ - {Opt_recovery, "recovery"}, - - /* Debugging options */ - {Opt_enospc_debug, "enospc_debug"}, - {Opt_noenospc_debug, "noenospc_debug"}, + /* Debugging options. */ + fsparam_flag_no("enospc_debug", Opt_enospc_debug), #ifdef CONFIG_BTRFS_DEBUG - {Opt_fragment_data, "fragment=data"}, - {Opt_fragment_metadata, "fragment=metadata"}, - {Opt_fragment_all, "fragment=all"}, + fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment), #endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY - {Opt_ref_verify, "ref_verify"}, + fsparam_flag("ref_verify", Opt_ref_verify), #endif - {Opt_err, NULL}, + {} }; -static const match_table_t rescue_tokens = { - {Opt_usebackuproot, "usebackuproot"}, - {Opt_nologreplay, "nologreplay"}, - {Opt_ignorebadroots, "ignorebadroots"}, - {Opt_ignorebadroots, "ibadroots"}, - {Opt_ignoredatacsums, "ignoredatacsums"}, - {Opt_ignoredatacsums, "idatacsums"}, - {Opt_rescue_all, "all"}, - {Opt_err, NULL}, -}; - -static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt, - const char *opt_name) +/* No support for restricting writes to btrfs devices yet... */ +static inline blk_mode_t btrfs_open_mode(struct fs_context *fc) { - if (fs_info->mount_opt & opt) { - btrfs_err(fs_info, "%s must be used with ro mount option", - opt_name); - return true; - } - return false; + return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES; } -static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) +static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *opts; - char *orig; - char *p; - substring_t args[MAX_OPT_ARGS]; - int ret = 0; + struct btrfs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; + opt = fs_parse(fc, btrfs_fs_parameters, param, &result); + if (opt < 0) + return opt; - while ((p = strsep(&opts, ":")) != NULL) { - int token; + switch (opt) { + case Opt_degraded: + btrfs_set_opt(ctx->mount_opt, DEGRADED); + break; + case Opt_subvol_empty: + /* + * This exists because we used to allow it on accident, so we're + * keeping it to maintain ABI. See 37becec95ac3 ("Btrfs: allow + * empty subvol= again"). 
+ */ + break; + case Opt_subvol: + kfree(ctx->subvol_name); + ctx->subvol_name = kstrdup(param->string, GFP_KERNEL); + if (!ctx->subvol_name) + return -ENOMEM; + break; + case Opt_subvolid: + ctx->subvol_objectid = result.uint_64; - if (!*p) - continue; - token = match_token(p, rescue_tokens, args); - switch (token){ - case Opt_usebackuproot: - btrfs_info(info, - "trying to use backup root at mount time"); - btrfs_set_opt(info->mount_opt, USEBACKUPROOT); - break; - case Opt_nologreplay: - btrfs_set_and_info(info, NOLOGREPLAY, - "disabling log replay at mount time"); - break; - case Opt_ignorebadroots: - btrfs_set_and_info(info, IGNOREBADROOTS, - "ignoring bad roots"); - break; - case Opt_ignoredatacsums: - btrfs_set_and_info(info, IGNOREDATACSUMS, - "ignoring data csums"); - break; - case Opt_rescue_all: - btrfs_info(info, "enabling all of the rescue options"); - btrfs_set_and_info(info, IGNOREDATACSUMS, - "ignoring data csums"); - btrfs_set_and_info(info, IGNOREBADROOTS, - "ignoring bad roots"); - btrfs_set_and_info(info, NOLOGREPLAY, - "disabling log replay at mount time"); - break; - case Opt_err: - btrfs_info(info, "unrecognized rescue option '%s'", p); - ret = -EINVAL; - goto out; - default: - break; - } + /* subvolid=0 means give me the original fs_tree. */ + if (!ctx->subvol_objectid) + ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID; + break; + case Opt_device: { + struct btrfs_device *device; + blk_mode_t mode = btrfs_open_mode(fc); + mutex_lock(&uuid_mutex); + device = btrfs_scan_one_device(param->string, mode, false); + mutex_unlock(&uuid_mutex); + if (IS_ERR(device)) + return PTR_ERR(device); + break; } -out: - kfree(orig); - return ret; -} - -/* - * Regular mount options parser. Everything that is needed only when - * reading in a new superblock is parsed here. - * XXX JDM: This needs to be cleaned up for remount. - */ -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags) -{ - substring_t args[MAX_OPT_ARGS]; - char *p, *num; - int intarg; - int ret = 0; - char *compress_type; - bool compress_force = false; - enum btrfs_compression_type saved_compress_type; - int saved_compress_level; - bool saved_compress_force; - int no_compress = 0; - const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state); - - if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) - btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); - else if (btrfs_free_space_cache_v1_active(info)) { - if (btrfs_is_zoned(info)) { - btrfs_info(info, - "zoned: clearing existing space cache"); - btrfs_set_super_cache_generation(info->super_copy, 0); + case Opt_datasum: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NODATASUM); } else { - btrfs_set_opt(info->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); } - } - - /* - * Even the options are empty, we still need to do extra check - * against new flags - */ - if (!options) - goto check; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_degraded: - btrfs_info(info, "allowing degraded mounts"); - btrfs_set_opt(info->mount_opt, DEGRADED); - break; - case Opt_subvol: - case Opt_subvol_empty: - case Opt_subvolid: - case Opt_device: - /* - * These are parsed by btrfs_parse_subvol_options or - * btrfs_parse_device_options and can be ignored here. 
- */ - break; - case Opt_nodatasum: - btrfs_set_and_info(info, NODATASUM, - "setting nodatasum"); - break; - case Opt_datasum: - if (btrfs_test_opt(info, NODATASUM)) { - if (btrfs_test_opt(info, NODATACOW)) - btrfs_info(info, - "setting datasum, datacow enabled"); - else - btrfs_info(info, "setting datasum"); - } - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - break; - case Opt_nodatacow: - if (!btrfs_test_opt(info, NODATACOW)) { - if (!btrfs_test_opt(info, COMPRESS) || - !btrfs_test_opt(info, FORCE_COMPRESS)) { - btrfs_info(info, - "setting nodatacow, compression disabled"); - } else { - btrfs_info(info, "setting nodatacow"); - } - } - btrfs_clear_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - btrfs_set_opt(info->mount_opt, NODATACOW); - btrfs_set_opt(info->mount_opt, NODATASUM); - break; - case Opt_datacow: - btrfs_clear_and_info(info, NODATACOW, - "setting datacow"); - break; - case Opt_compress_force: - case Opt_compress_force_type: - compress_force = true; - fallthrough; - case Opt_compress: - case Opt_compress_type: - saved_compress_type = btrfs_test_opt(info, - COMPRESS) ? - info->compress_type : BTRFS_COMPRESS_NONE; - saved_compress_force = - btrfs_test_opt(info, FORCE_COMPRESS); - saved_compress_level = info->compress_level; - if (token == Opt_compress || - token == Opt_compress_force || - strncmp(args[0].from, "zlib", 4) == 0) { - compress_type = "zlib"; - - info->compress_type = BTRFS_COMPRESS_ZLIB; - info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; - /* - * args[0] contains uninitialized data since - * for these tokens we don't expect any - * parameter. - */ - if (token != Opt_compress && - token != Opt_compress_force) - info->compress_level = - btrfs_compress_str2level( - BTRFS_COMPRESS_ZLIB, - args[0].from + 4); - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - no_compress = 0; - } else if (strncmp(args[0].from, "lzo", 3) == 0) { - compress_type = "lzo"; - info->compress_type = BTRFS_COMPRESS_LZO; - info->compress_level = 0; - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - btrfs_set_fs_incompat(info, COMPRESS_LZO); - no_compress = 0; - } else if (strncmp(args[0].from, "zstd", 4) == 0) { - compress_type = "zstd"; - info->compress_type = BTRFS_COMPRESS_ZSTD; - info->compress_level = - btrfs_compress_str2level( - BTRFS_COMPRESS_ZSTD, - args[0].from + 4); - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - btrfs_set_fs_incompat(info, COMPRESS_ZSTD); - no_compress = 0; - } else if (strncmp(args[0].from, "no", 2) == 0) { - compress_type = "no"; - info->compress_level = 0; - info->compress_type = 0; - btrfs_clear_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - compress_force = false; - no_compress++; - } else { - btrfs_err(info, "unrecognized compression value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - - if (compress_force) { - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); - } else { - /* - * If we remount from compress-force=xxx to - * compress=xxx, we need clear FORCE_COMPRESS - * flag, otherwise, there is no way for users - * to disable forcible compression separately. 
- */ - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - } - if (no_compress == 1) { - btrfs_info(info, "use no compression"); - } else if ((info->compress_type != saved_compress_type) || - (compress_force != saved_compress_force) || - (info->compress_level != saved_compress_level)) { - btrfs_info(info, "%s %s compression, level %d", - (compress_force) ? "force" : "use", - compress_type, info->compress_level); - } - compress_force = false; - break; - case Opt_ssd: - btrfs_set_and_info(info, SSD, - "enabling ssd optimizations"); - btrfs_clear_opt(info->mount_opt, NOSSD); - break; - case Opt_ssd_spread: - btrfs_set_and_info(info, SSD, - "enabling ssd optimizations"); - btrfs_set_and_info(info, SSD_SPREAD, - "using spread ssd allocation scheme"); - btrfs_clear_opt(info->mount_opt, NOSSD); - break; - case Opt_nossd: - btrfs_set_opt(info->mount_opt, NOSSD); - btrfs_clear_and_info(info, SSD, - "not using ssd optimizations"); - fallthrough; - case Opt_nossd_spread: - btrfs_clear_and_info(info, SSD_SPREAD, - "not using spread ssd allocation scheme"); - break; - case Opt_barrier: - btrfs_clear_and_info(info, NOBARRIER, - "turning on barriers"); - break; - case Opt_nobarrier: - btrfs_set_and_info(info, NOBARRIER, - "turning off barriers"); - break; - case Opt_thread_pool: - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, "unrecognized thread_pool value %s", - args[0].from); - goto out; - } else if (intarg == 0) { - btrfs_err(info, "invalid value 0 for thread_pool"); - ret = -EINVAL; - goto out; - } - info->thread_pool_size = intarg; - break; - case Opt_max_inline: - num = match_strdup(&args[0]); - if (num) { - info->max_inline = memparse(num, NULL); - kfree(num); - - if (info->max_inline) { - info->max_inline = min_t(u64, - info->max_inline, - info->sectorsize); - } - btrfs_info(info, "max_inline at %llu", - info->max_inline); - } else { - ret = -ENOMEM; - goto out; - } - break; - case Opt_acl: + break; + case Opt_datacow: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + btrfs_set_opt(ctx->mount_opt, NODATACOW); + btrfs_set_opt(ctx->mount_opt, NODATASUM); + } else { + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + } + break; + case Opt_compress_force: + case Opt_compress_force_type: + btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS); + fallthrough; + case Opt_compress: + case Opt_compress_type: + if (opt == Opt_compress || opt == Opt_compress_force) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "zlib", 4) == 0) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = + btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, + param->string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "lzo", 3) == 0) { + ctx->compress_type = BTRFS_COMPRESS_LZO; + ctx->compress_level = 0; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "zstd", 4) == 0) { + ctx->compress_type = BTRFS_COMPRESS_ZSTD; + ctx->compress_level = + btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, + param->string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + 
btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "no", 2) == 0) { + ctx->compress_level = 0; + ctx->compress_type = 0; + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + } else { + btrfs_err(NULL, "unrecognized compression value %s", + param->string); + return -EINVAL; + } + break; + case Opt_ssd: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NOSSD); + btrfs_clear_opt(ctx->mount_opt, SSD); + btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD); + } else { + btrfs_set_opt(ctx->mount_opt, SSD); + btrfs_clear_opt(ctx->mount_opt, NOSSD); + } + break; + case Opt_ssd_spread: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD); + } else { + btrfs_set_opt(ctx->mount_opt, SSD); + btrfs_set_opt(ctx->mount_opt, SSD_SPREAD); + btrfs_clear_opt(ctx->mount_opt, NOSSD); + } + break; + case Opt_barrier: + if (result.negated) + btrfs_set_opt(ctx->mount_opt, NOBARRIER); + else + btrfs_clear_opt(ctx->mount_opt, NOBARRIER); + break; + case Opt_thread_pool: + if (result.uint_32 == 0) { + btrfs_err(NULL, "invalid value 0 for thread_pool"); + return -EINVAL; + } + ctx->thread_pool_size = result.uint_32; + break; + case Opt_max_inline: + ctx->max_inline = memparse(param->string, NULL); + break; + case Opt_acl: + if (result.negated) { + fc->sb_flags &= ~SB_POSIXACL; + } else { #ifdef CONFIG_BTRFS_FS_POSIX_ACL - info->sb->s_flags |= SB_POSIXACL; - break; + fc->sb_flags |= SB_POSIXACL; #else - btrfs_err(info, "support for ACL not compiled in!"); - ret = -EINVAL; - goto out; + btrfs_err(NULL, "support for ACL not compiled in"); + return -EINVAL; #endif - case Opt_noacl: - info->sb->s_flags &= ~SB_POSIXACL; - break; - case Opt_notreelog: - btrfs_set_and_info(info, NOTREELOG, - "disabling tree log"); - break; - case Opt_treelog: - btrfs_clear_and_info(info, NOTREELOG, - "enabling tree log"); - break; - case Opt_norecovery: - case Opt_nologreplay: - btrfs_warn(info, + } + /* + * VFS limits the ability to toggle ACL on and off via remount, + * despite every file system allowing this. This seems to be + * an oversight since we all do, but it'll fail if we're + * remounting. So don't set the mask here, we'll check it in + * btrfs_reconfigure and do the toggling ourselves. 
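The compression branches above dispatch on the leading algorithm name and hand the remainder of the string to btrfs_compress_str2level(), which is how forms like compress=zstd:3 carry an optional level. The toy parser below sketches that split under the same 'algo' or 'algo:level' assumption; its names and the level-0 default are hypothetical, and the real code applies per-algorithm defaults such as BTRFS_ZLIB_DEFAULT_LEVEL:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Hypothetical toy version of the split done above: match the algorithm
 * by prefix, then read an optional ":level" suffix (default level 0).
 */
static int toy_parse_compress(const char *arg, char *algo, size_t algo_sz, int *level)
{
	static const char *const names[] = { "zstd", "zlib", "lzo", "no" };

	for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		size_t n = strlen(names[i]);

		if (strncmp(arg, names[i], n) == 0) {
			snprintf(algo, algo_sz, "%s", names[i]);
			*level = (arg[n] == ':') ? atoi(arg + n + 1) : 0;
			return 0;
		}
	}
	return -1;	/* unrecognized value, like the -EINVAL path above */
}

int main(void)
{
	char algo[8];
	int level;

	if (toy_parse_compress("zstd:3", algo, sizeof(algo), &level) == 0)
		printf("algo=%s level=%d\n", algo, level);	/* algo=zstd level=3 */
	if (toy_parse_compress("zlib", algo, sizeof(algo), &level) == 0)
		printf("algo=%s level=%d\n", algo, level);	/* algo=zlib level=0 */
	return 0;
}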
+ */ + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) + fc->sb_flags_mask |= SB_POSIXACL; + break; + case Opt_treelog: + if (result.negated) + btrfs_set_opt(ctx->mount_opt, NOTREELOG); + else + btrfs_clear_opt(ctx->mount_opt, NOTREELOG); + break; + case Opt_nologreplay: + btrfs_warn(NULL, "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); - btrfs_set_and_info(info, NOLOGREPLAY, - "disabling log replay at mount time"); - break; - case Opt_flushoncommit: - btrfs_set_and_info(info, FLUSHONCOMMIT, - "turning on flush-on-commit"); - break; - case Opt_noflushoncommit: - btrfs_clear_and_info(info, FLUSHONCOMMIT, - "turning off flush-on-commit"); - break; - case Opt_ratio: - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, "unrecognized metadata_ratio value %s", - args[0].from); - goto out; - } - info->metadata_ratio = intarg; - btrfs_info(info, "metadata ratio %u", - info->metadata_ratio); - break; - case Opt_discard: - case Opt_discard_mode: - if (token == Opt_discard || - strcmp(args[0].from, "sync") == 0) { - btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC); - btrfs_set_and_info(info, DISCARD_SYNC, - "turning on sync discard"); - } else if (strcmp(args[0].from, "async") == 0) { - btrfs_clear_opt(info->mount_opt, DISCARD_SYNC); - btrfs_set_and_info(info, DISCARD_ASYNC, - "turning on async discard"); - } else { - btrfs_err(info, "unrecognized discard mode value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - btrfs_clear_opt(info->mount_opt, NODISCARD); - break; - case Opt_nodiscard: - btrfs_clear_and_info(info, DISCARD_SYNC, - "turning off discard"); - btrfs_clear_and_info(info, DISCARD_ASYNC, - "turning off async discard"); - btrfs_set_opt(info->mount_opt, NODISCARD); - break; - case Opt_space_cache: - case Opt_space_cache_version: - /* - * We already set FREE_SPACE_TREE above because we have - * compat_ro(FREE_SPACE_TREE) set, and we aren't going - * to allow v1 to be set for extent tree v2, simply - * ignore this setting if we're extent tree v2. - */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) - break; - if (token == Opt_space_cache || - strcmp(args[0].from, "v1") == 0) { - btrfs_clear_opt(info->mount_opt, - FREE_SPACE_TREE); - btrfs_set_and_info(info, SPACE_CACHE, - "enabling disk space caching"); - } else if (strcmp(args[0].from, "v2") == 0) { - btrfs_clear_opt(info->mount_opt, - SPACE_CACHE); - btrfs_set_and_info(info, FREE_SPACE_TREE, - "enabling free space tree"); - } else { - btrfs_err(info, "unrecognized space_cache value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - break; - case Opt_rescan_uuid_tree: - btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); - break; - case Opt_no_space_cache: - /* - * We cannot operate without the free space tree with - * extent tree v2, ignore this option. 
- */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) - break; - if (btrfs_test_opt(info, SPACE_CACHE)) { - btrfs_clear_and_info(info, SPACE_CACHE, - "disabling disk space caching"); - } - if (btrfs_test_opt(info, FREE_SPACE_TREE)) { - btrfs_clear_and_info(info, FREE_SPACE_TREE, - "disabling free space tree"); - } - break; - case Opt_inode_cache: - case Opt_noinode_cache: - btrfs_warn(info, - "the 'inode_cache' option is deprecated and has no effect since 5.11"); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); + break; + case Opt_flushoncommit: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT); + else + btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT); + break; + case Opt_ratio: + ctx->metadata_ratio = result.uint_32; + break; + case Opt_discard: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + btrfs_set_opt(ctx->mount_opt, NODISCARD); + } else { + btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + } + break; + case Opt_discard_mode: + switch (result.uint_32) { + case Opt_discard_sync: + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC); break; - case Opt_clear_cache: - /* - * We cannot clear the free space tree with extent tree - * v2, ignore this option. - */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) - break; - btrfs_set_and_info(info, CLEAR_CACHE, - "force clearing of disk cache"); + case Opt_discard_async: + btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC); break; - case Opt_user_subvol_rm_allowed: - btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); + default: + btrfs_err(NULL, "unrecognized discard mode value %s", + param->key); + return -EINVAL; + } + btrfs_clear_opt(ctx->mount_opt, NODISCARD); + break; + case Opt_space_cache: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NOSPACECACHE); + btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); + } else { + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(ctx->mount_opt, SPACE_CACHE); + } + break; + case Opt_space_cache_version: + switch (result.uint_32) { + case Opt_space_cache_v1: + btrfs_set_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); break; - case Opt_enospc_debug: - btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + case Opt_space_cache_v2: + btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE); break; - case Opt_noenospc_debug: - btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); + default: + btrfs_err(NULL, "unrecognized space_cache value %s", + param->key); + return -EINVAL; + } + break; + case Opt_rescan_uuid_tree: + btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE); + break; + case Opt_clear_cache: + btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); + break; + case Opt_user_subvol_rm_allowed: + btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED); + break; + case Opt_enospc_debug: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG); + else + btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG); + break; + case Opt_defrag: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG); + else + btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG); + break; + case Opt_usebackuproot: + btrfs_warn(NULL, + "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead"); + btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); + + /* If we're loading the backup 
roots we can't trust the space cache. */ + btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); + break; + case Opt_skip_balance: + btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE); + break; + case Opt_fatal_errors: + switch (result.uint_32) { + case Opt_fatal_errors_panic: + btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR); break; - case Opt_defrag: - btrfs_set_and_info(info, AUTO_DEFRAG, - "enabling auto defrag"); + case Opt_fatal_errors_bug: + btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR); break; - case Opt_nodefrag: - btrfs_clear_and_info(info, AUTO_DEFRAG, - "disabling auto defrag"); + default: + btrfs_err(NULL, "unrecognized fatal_errors value %s", + param->key); + return -EINVAL; + } + break; + case Opt_commit_interval: + ctx->commit_interval = result.uint_32; + if (ctx->commit_interval == 0) + ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + break; + case Opt_rescue: + switch (result.uint_32) { + case Opt_rescue_usebackuproot: + btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); break; - case Opt_recovery: - case Opt_usebackuproot: - btrfs_warn(info, - "'%s' is deprecated, use 'rescue=usebackuproot' instead", - token == Opt_recovery ? "recovery" : - "usebackuproot"); - btrfs_info(info, - "trying to use backup root at mount time"); - btrfs_set_opt(info->mount_opt, USEBACKUPROOT); + case Opt_rescue_nologreplay: + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; - case Opt_skip_balance: - btrfs_set_opt(info->mount_opt, SKIP_BALANCE); + case Opt_rescue_ignorebadroots: + btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); break; - case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) { - btrfs_set_opt(info->mount_opt, - PANIC_ON_FATAL_ERROR); - } else if (strcmp(args[0].from, "bug") == 0) { - btrfs_clear_opt(info->mount_opt, - PANIC_ON_FATAL_ERROR); - } else { - btrfs_err(info, "unrecognized fatal_errors value %s", - args[0].from); - ret = -EINVAL; - goto out; - } + case Opt_rescue_ignoredatacsums: + btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); break; - case Opt_commit_interval: - intarg = 0; - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, "unrecognized commit_interval value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - if (intarg == 0) { - btrfs_info(info, - "using default commit interval %us", - BTRFS_DEFAULT_COMMIT_INTERVAL); - intarg = BTRFS_DEFAULT_COMMIT_INTERVAL; - } else if (intarg > 300) { - btrfs_warn(info, "excessive commit interval %d", - intarg); - } - info->commit_interval = intarg; - break; - case Opt_rescue: - ret = parse_rescue_options(info, args[0].from); - if (ret < 0) { - btrfs_err(info, "unrecognized rescue value %s", - args[0].from); - goto out; - } + case Opt_rescue_parameter_all: + btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); + btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; + default: + btrfs_info(NULL, "unrecognized rescue option '%s'", + param->key); + return -EINVAL; + } + break; #ifdef CONFIG_BTRFS_DEBUG - case Opt_fragment_all: - btrfs_info(info, "fragmenting all space"); - btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); - btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA); + case Opt_fragment: + switch (result.uint_32) { + case Opt_fragment_parameter_all: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA); + btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA); break; - case Opt_fragment_metadata: - btrfs_info(info, "fragmenting metadata"); - btrfs_set_opt(info->mount_opt, - FRAGMENT_METADATA); + case Opt_fragment_parameter_metadata: + btrfs_set_opt(ctx->mount_opt, 
FRAGMENT_METADATA); break; - case Opt_fragment_data: - btrfs_info(info, "fragmenting data"); - btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + case Opt_fragment_parameter_data: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA); break; + default: + btrfs_info(NULL, "unrecognized fragment option '%s'", + param->key); + return -EINVAL; + } + break; #endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY - case Opt_ref_verify: - btrfs_info(info, "doing ref verification"); - btrfs_set_opt(info->mount_opt, REF_VERIFY); - break; + case Opt_ref_verify: + btrfs_set_opt(ctx->mount_opt, REF_VERIFY); + break; #endif - case Opt_err: - btrfs_err(info, "unrecognized mount option '%s'", p); - ret = -EINVAL; - goto out; - default: - break; - } + default: + btrfs_err(NULL, "unrecognized mount option '%s'", param->key); + return -EINVAL; } -check: - /* We're read-only, don't have to check. */ - if (new_flags & SB_RDONLY) - goto out; - if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || - check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || - check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")) - ret = -EINVAL; -out: - if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && - !btrfs_test_opt(info, FREE_SPACE_TREE) && - !btrfs_test_opt(info, CLEAR_CACHE)) { - btrfs_err(info, "cannot disable free space tree"); - ret = -EINVAL; - } - if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && - !btrfs_test_opt(info, FREE_SPACE_TREE)) { - btrfs_err(info, "cannot disable free space tree with block-group-tree feature"); - ret = -EINVAL; - } - if (!ret) - ret = btrfs_check_mountopts_zoned(info); - if (!ret && !remounting) { - if (btrfs_test_opt(info, SPACE_CACHE)) - btrfs_info(info, "disk space caching is enabled"); - if (btrfs_test_opt(info, FREE_SPACE_TREE)) - btrfs_info(info, "using free space tree"); - } - return ret; + return 0; } /* - * Parse mount options that are required early in the mount process. - * - * All other options will be parsed on much later in the mount process and - * only when we need to allocate a new super block. + * Some options only have meaning at mount time and shouldn't persist across + * remounts, or be displayed. Clear these at the end of mount and remount code + * paths. 
*/ -static int btrfs_parse_device_options(const char *options, blk_mode_t flags) +static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) { - substring_t args[MAX_OPT_ARGS]; - char *device_name, *opts, *orig, *p; - struct btrfs_device *device = NULL; - int error = 0; + btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); + btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); + btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE); +} - lockdep_assert_held(&uuid_mutex); +static bool check_ro_option(struct btrfs_fs_info *fs_info, + unsigned long mount_opt, unsigned long opt, + const char *opt_name) +{ + if (mount_opt & opt) { + btrfs_err(fs_info, "%s must be used with ro mount option", + opt_name); + return true; + } + return false; +} - if (!options) - return 0; +bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, + unsigned long flags) +{ + bool ret = true; - /* - * strsep changes the string, duplicate it because btrfs_parse_options - * gets called later - */ - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; + if (!(flags & SB_RDONLY) && + (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))) + ret = false; - while ((p = strsep(&opts, ",")) != NULL) { - int token; + if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && + !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) && + !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) { + btrfs_err(info, "cannot disable free-space-tree"); + ret = false; + } + if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && + !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) { + btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature"); + ret = false; + } - if (!*p) - continue; + if (btrfs_check_mountopts_zoned(info, mount_opt)) + ret = false; - token = match_token(p, tokens, args); - if (token == Opt_device) { - device_name = match_strdup(&args[0]); - if (!device_name) { - error = -ENOMEM; - goto out; - } - device = btrfs_scan_one_device(device_name, flags, false); - kfree(device_name); - if (IS_ERR(device)) { - error = PTR_ERR(device); - goto out; - } - } + if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) + btrfs_info(info, "disk space caching is enabled"); + if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) + btrfs_info(info, "using free-space-tree"); } -out: - kfree(orig); - return error; + return ret; } /* - * Parse mount options that are related to subvolume id + * This is subtle, we only call this during open_ctree(). We need to pre-load + * the mount options with the on-disk settings. Before the new mount API took + * effect we would do this on mount and remount. With the new mount API we'll + * only do this on the initial mount. * - * The value is later passed to mount_subvol() + * This isn't a change in behavior, because we're using the current state of the + * file system to set the current mount options. If you mounted with special + * options to disable these features and then remounted we wouldn't revert the + * settings, because mounting without these features cleared the on-disk + * settings, so this being called on re-mount is not needed. 
*/ -static int btrfs_parse_subvol_options(const char *options, char **subvol_name, - u64 *subvol_objectid) +void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info) { - substring_t args[MAX_OPT_ARGS]; - char *opts, *orig, *p; - int error = 0; - u64 subvolid; - - if (!options) - return 0; + if (fs_info->sectorsize < PAGE_SIZE) { + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) { + btrfs_info(fs_info, + "forcing free space tree for sector size %u with page size %lu", + fs_info->sectorsize, PAGE_SIZE); + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + } + } /* - * strsep changes the string, duplicate it because - * btrfs_parse_device_options gets called later + * At this point our mount options are populated, so we only mess with + * these settings if we don't have any settings already. */ - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + return; - while ((p = strsep(&opts, ",")) != NULL) { - int token; - if (!*p) - continue; + if (btrfs_is_zoned(fs_info) && + btrfs_free_space_cache_v1_active(fs_info)) { + btrfs_info(fs_info, "zoned: clearing existing space cache"); + btrfs_set_super_cache_generation(fs_info->super_copy, 0); + return; + } - token = match_token(p, tokens, args); - switch (token) { - case Opt_subvol: - kfree(*subvol_name); - *subvol_name = match_strdup(&args[0]); - if (!*subvol_name) { - error = -ENOMEM; - goto out; - } - break; - case Opt_subvolid: - error = match_u64(&args[0], &subvolid); - if (error) - goto out; + if (btrfs_test_opt(fs_info, SPACE_CACHE)) + return; - /* we want the original fs_tree */ - if (subvolid == 0) - subvolid = BTRFS_FS_TREE_OBJECTID; + if (btrfs_test_opt(fs_info, NOSPACECACHE)) + return; - *subvol_objectid = subvolid; - break; - default: - break; - } - } + /* + * At this point we don't have explicit options set by the user, set + * them ourselves based on the state of the file system. + */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + else if (btrfs_free_space_cache_v1_active(fs_info)) + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); +} -out: - kfree(orig); - return error; +static void set_device_specific_options(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_test_opt(fs_info, NOSSD) && + !fs_info->fs_devices->rotating) + btrfs_set_opt(fs_info->mount_opt, SSD); + + /* + * For devices supporting discard turn on discard=async automatically, + * unless it's already set or disabled. This could be turned off by + * nodiscard for the same mount. + * + * The zoned mode piggy backs on the discard functionality for + * resetting a zone. There is no reason to delay the zone reset as it is + * fast enough. So, do not enable async discard for zoned mode. 
+ */ + if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC) || + btrfs_test_opt(fs_info, NODISCARD)) && + fs_info->fs_devices->discardable && + !btrfs_is_zoned(fs_info)) + btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC); } char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, @@ -1105,10 +929,6 @@ static int btrfs_fill_super(struct super_block *sb, #endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - sb->s_flags |= SB_POSIXACL; -#endif - sb->s_flags |= SB_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; err = super_setup_bdi(sb); @@ -1291,22 +1111,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) return 0; } -static int btrfs_test_super(struct super_block *s, void *data) -{ - struct btrfs_fs_info *p = data; - struct btrfs_fs_info *fs_info = btrfs_sb(s); - - return fs_info->fs_devices == p->fs_devices; -} - -static int btrfs_set_super(struct super_block *s, void *data) -{ - int err = set_anon_super(s, data); - if (!err) - s->s_fs_info = data; - return err; -} - /* * subvolumes are identified by ino 256 */ @@ -1382,200 +1186,6 @@ out: return root; } -/* - * Find a superblock for the given device / mount point. - * - * Note: This is based on mount_bdev from fs/super.c with a few additions - * for multiple device setup. Make sure to keep it in sync. - */ -static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, - int flags, const char *device_name, void *data) -{ - struct block_device *bdev = NULL; - struct super_block *s; - struct btrfs_device *device = NULL; - struct btrfs_fs_devices *fs_devices = NULL; - struct btrfs_fs_info *fs_info = NULL; - void *new_sec_opts = NULL; - blk_mode_t mode = sb_open_mode(flags); - int error = 0; - - if (data) { - error = security_sb_eat_lsm_opts(data, &new_sec_opts); - if (error) - return ERR_PTR(error); - } - - /* - * Setup a dummy root and fs_info for test/set super. This is because - * we don't actually fill this stuff out until open_ctree, but we need - * then open_ctree will properly initialize the file system specific - * settings later. btrfs_init_fs_info initializes the static elements - * of the fs_info (locks and such) to make cleanup easier if we find a - * superblock with our given fs_devices later on at sget() time. - */ - fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); - if (!fs_info) { - error = -ENOMEM; - goto error_sec_opts; - } - btrfs_init_fs_info(fs_info); - - fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); - fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); - if (!fs_info->super_copy || !fs_info->super_for_commit) { - error = -ENOMEM; - goto error_fs_info; - } - - mutex_lock(&uuid_mutex); - error = btrfs_parse_device_options(data, mode); - if (error) { - mutex_unlock(&uuid_mutex); - goto error_fs_info; - } - - /* - * With 'true' passed to btrfs_scan_one_device() (mount time) we expect - * either a valid device or an error. 
- */ - device = btrfs_scan_one_device(device_name, mode, true); - ASSERT(device != NULL); - if (IS_ERR(device)) { - mutex_unlock(&uuid_mutex); - error = PTR_ERR(device); - goto error_fs_info; - } - - fs_devices = device->fs_devices; - fs_info->fs_devices = fs_devices; - - error = btrfs_open_devices(fs_devices, mode, fs_type); - mutex_unlock(&uuid_mutex); - if (error) - goto error_fs_info; - - if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; - } - - bdev = fs_devices->latest_dev->bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC, - fs_info); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto error_close_devices; - } - - if (s->s_root) { - btrfs_close_devices(fs_devices); - btrfs_free_fs_info(fs_info); - if ((flags ^ s->s_flags) & SB_RDONLY) - error = -EBUSY; - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name, - s->s_id); - btrfs_sb(s)->bdev_holder = fs_type; - error = btrfs_fill_super(s, fs_devices, data); - } - if (!error) - error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL); - security_free_mnt_opts(&new_sec_opts); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - return dget(s->s_root); - -error_close_devices: - btrfs_close_devices(fs_devices); -error_fs_info: - btrfs_free_fs_info(fs_info); -error_sec_opts: - security_free_mnt_opts(&new_sec_opts); - return ERR_PTR(error); -} - -/* - * Mount function which is called by VFS layer. - * - * In order to allow mounting a subvolume directly, btrfs uses mount_subtree() - * which needs vfsmount* of device's root (/). This means device's root has to - * be mounted internally in any case. - * - * Operation flow: - * 1. Parse subvol id related options for later use in mount_subvol(). - * - * 2. Mount device's root (/) by calling vfs_kern_mount(). - * - * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the - * first place. In order to avoid calling btrfs_mount() again, we use - * different file_system_type which is not registered to VFS by - * register_filesystem() (btrfs_root_fs_type). As a result, - * btrfs_mount_root() is called. The return value will be used by - * mount_subtree() in mount_subvol(). - * - * 3. Call mount_subvol() to get the dentry of subvolume. Since there is - * "btrfs subvolume set-default", mount_subvol() is called always. 
- */ -static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, - const char *device_name, void *data) -{ - struct vfsmount *mnt_root; - struct dentry *root; - char *subvol_name = NULL; - u64 subvol_objectid = 0; - int error = 0; - - error = btrfs_parse_subvol_options(data, &subvol_name, - &subvol_objectid); - if (error) { - kfree(subvol_name); - return ERR_PTR(error); - } - - /* mount device's root (/) */ - mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data); - if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) { - if (flags & SB_RDONLY) { - mnt_root = vfs_kern_mount(&btrfs_root_fs_type, - flags & ~SB_RDONLY, device_name, data); - } else { - mnt_root = vfs_kern_mount(&btrfs_root_fs_type, - flags | SB_RDONLY, device_name, data); - if (IS_ERR(mnt_root)) { - root = ERR_CAST(mnt_root); - kfree(subvol_name); - goto out; - } - - down_write(&mnt_root->mnt_sb->s_umount); - error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL); - up_write(&mnt_root->mnt_sb->s_umount); - if (error < 0) { - root = ERR_PTR(error); - mntput(mnt_root); - kfree(subvol_name); - goto out; - } - } - } - if (IS_ERR(mnt_root)) { - root = ERR_CAST(mnt_root); - kfree(subvol_name); - goto out; - } - - /* mount_subvol() will free subvol_name and mnt_root */ - root = mount_subvol(subvol_name, subvol_objectid, mnt_root); - -out: - return root; -} - static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, u32 new_pool_size, u32 old_pool_size) { @@ -1638,202 +1248,274 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); } -static int btrfs_remount(struct super_block *sb, int *flags, char *data) +static int btrfs_remount_rw(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - unsigned old_flags = sb->s_flags; - unsigned long old_opts = fs_info->mount_opt; - unsigned long old_compress_type = fs_info->compress_type; - u64 old_max_inline = fs_info->max_inline; - u32 old_thread_pool_size = fs_info->thread_pool_size; - u32 old_metadata_ratio = fs_info->metadata_ratio; int ret; - sync_filesystem(sb); - set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + if (BTRFS_FS_ERROR(fs_info)) { + btrfs_err(fs_info, + "remounting read-write after error is not allowed"); + return -EINVAL; + } - if (data) { - void *new_sec_opts = NULL; + if (fs_info->fs_devices->rw_devices == 0) + return -EACCES; - ret = security_sb_eat_lsm_opts(data, &new_sec_opts); - if (!ret) - ret = security_sb_remount(sb, new_sec_opts); - security_free_mnt_opts(&new_sec_opts); - if (ret) - goto restore; + if (!btrfs_check_rw_degradable(fs_info, NULL)) { + btrfs_warn(fs_info, + "too many missing devices, writable remount is not allowed"); + return -EACCES; + } + + if (btrfs_super_log_root(fs_info->super_copy) != 0) { + btrfs_warn(fs_info, + "mount required to replay tree-log, cannot remount read-write"); + return -EINVAL; } - ret = btrfs_parse_options(fs_info, data, *flags); + /* + * NOTE: when remounting with a change that does writes, don't put it + * anywhere above this point, as we are not sure to be safe to write + * until we pass the above checks. 
+ */ + ret = btrfs_start_pre_rw_mount(fs_info); if (ret) - goto restore; + return ret; - ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY)); - if (ret < 0) - goto restore; + btrfs_clear_sb_rdonly(fs_info->sb); - btrfs_remount_begin(fs_info, old_opts, *flags); - btrfs_resize_thread_pool(fs_info, - fs_info->thread_pool_size, old_thread_pool_size); + set_bit(BTRFS_FS_OPEN, &fs_info->flags); - if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != - (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && - (!sb_rdonly(sb) || (*flags & SB_RDONLY))) { - btrfs_warn(fs_info, - "remount supports changing free space tree only from ro to rw"); - /* Make sure free space cache options match the state on disk */ - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); - btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); - } - if (btrfs_free_space_cache_v1_active(fs_info)) { - btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); - btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); - } - } + /* + * If we've gone from readonly -> read-write, we need to get our + * sync/async discard lists in the right state. + */ + btrfs_discard_resume(fs_info); - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) - goto out; + return 0; +} - if (*flags & SB_RDONLY) { - /* - * this also happens on 'umount -rf' or on shutdown, when - * the filesystem is busy. - */ - cancel_work_sync(&fs_info->async_reclaim_work); - cancel_work_sync(&fs_info->async_data_reclaim_work); +static int btrfs_remount_ro(struct btrfs_fs_info *fs_info) +{ + /* + * This also happens on 'umount -rf' or on shutdown, when the + * filesystem is busy. + */ + cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); - btrfs_discard_cleanup(fs_info); + btrfs_discard_cleanup(fs_info); - /* wait for the uuid_scan task to finish */ - down(&fs_info->uuid_tree_rescan_sem); - /* avoid complains from lockdep et al. */ - up(&fs_info->uuid_tree_rescan_sem); + /* Wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* Avoid complains from lockdep et al. */ + up(&fs_info->uuid_tree_rescan_sem); - btrfs_set_sb_rdonly(sb); + btrfs_set_sb_rdonly(fs_info->sb); - /* - * Setting SB_RDONLY will put the cleaner thread to - * sleep at the next loop if it's already active. - * If it's already asleep, we'll leave unused block - * groups on disk until we're mounted read-write again - * unless we clean them up here. - */ - btrfs_delete_unused_bgs(fs_info); + /* + * Setting SB_RDONLY will put the cleaner thread to sleep at the next + * loop if it's already active. If it's already asleep, we'll leave + * unused block groups on disk until we're mounted read-write again + * unless we clean them up here. + */ + btrfs_delete_unused_bgs(fs_info); - /* - * The cleaner task could be already running before we set the - * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). - * We must make sure that after we finish the remount, i.e. after - * we call btrfs_commit_super(), the cleaner can no longer start - * a transaction - either because it was dropping a dead root, - * running delayed iputs or deleting an unused block group (the - * cleaner picked a block group from the list of unused block - * groups before we were able to in the previous call to - * btrfs_delete_unused_bgs()). 
- */ - wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, - TASK_UNINTERRUPTIBLE); + /* + * The cleaner task could be already running before we set the flag + * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). We must make + * sure that after we finish the remount, i.e. after we call + * btrfs_commit_super(), the cleaner can no longer start a transaction + * - either because it was dropping a dead root, running delayed iputs + * or deleting an unused block group (the cleaner picked a block + * group from the list of unused block groups before we were able to + * in the previous call to btrfs_delete_unused_bgs()). + */ + wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE); - /* - * We've set the superblock to RO mode, so we might have made - * the cleaner task sleep without running all pending delayed - * iputs. Go through all the delayed iputs here, so that if an - * unmount happens without remounting RW we don't end up at - * finishing close_ctree() with a non-empty list of delayed - * iputs. - */ - btrfs_run_delayed_iputs(fs_info); + /* + * We've set the superblock to RO mode, so we might have made the + * cleaner task sleep without running all pending delayed iputs. Go + * through all the delayed iputs here, so that if an unmount happens + * without remounting RW we don't end up at finishing close_ctree() + * with a non-empty list of delayed iputs. + */ + btrfs_run_delayed_iputs(fs_info); - btrfs_dev_replace_suspend_for_unmount(fs_info); - btrfs_scrub_cancel(fs_info); - btrfs_pause_balance(fs_info); + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); - /* - * Pause the qgroup rescan worker if it is running. We don't want - * it to be still running after we are in RO mode, as after that, - * by the time we unmount, it might have left a transaction open, - * so we would leak the transaction and/or crash. - */ - btrfs_qgroup_wait_for_completion(fs_info, false); + /* + * Pause the qgroup rescan worker if it is running. We don't want it to + * be still running after we are in RO mode, as after that, by the time + * we unmount, it might have left a transaction open, so we would leak + * the transaction and/or crash. 
+ */ + btrfs_qgroup_wait_for_completion(fs_info, false); - ret = btrfs_commit_super(fs_info); - if (ret) - goto restore; - } else { - if (BTRFS_FS_ERROR(fs_info)) { - btrfs_err(fs_info, - "Remounting read-write after error is not allowed"); - ret = -EINVAL; - goto restore; - } - if (fs_info->fs_devices->rw_devices == 0) { - ret = -EACCES; - goto restore; - } + return btrfs_commit_super(fs_info); +} - if (!btrfs_check_rw_degradable(fs_info, NULL)) { - btrfs_warn(fs_info, - "too many missing devices, writable remount is not allowed"); - ret = -EACCES; - goto restore; - } +static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) +{ + fs_info->max_inline = ctx->max_inline; + fs_info->commit_interval = ctx->commit_interval; + fs_info->metadata_ratio = ctx->metadata_ratio; + fs_info->thread_pool_size = ctx->thread_pool_size; + fs_info->mount_opt = ctx->mount_opt; + fs_info->compress_type = ctx->compress_type; + fs_info->compress_level = ctx->compress_level; +} - if (btrfs_super_log_root(fs_info->super_copy) != 0) { - btrfs_warn(fs_info, - "mount required to replay tree-log, cannot remount read-write"); - ret = -EINVAL; - goto restore; - } +static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) +{ + ctx->max_inline = fs_info->max_inline; + ctx->commit_interval = fs_info->commit_interval; + ctx->metadata_ratio = fs_info->metadata_ratio; + ctx->thread_pool_size = fs_info->thread_pool_size; + ctx->mount_opt = fs_info->mount_opt; + ctx->compress_type = fs_info->compress_type; + ctx->compress_level = fs_info->compress_level; +} - /* - * NOTE: when remounting with a change that does writes, don't - * put it anywhere above this point, as we are not sure to be - * safe to write until we pass the above checks. - */ - ret = btrfs_start_pre_rw_mount(fs_info); - if (ret) - goto restore; +#define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...) \ +do { \ + if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \ + btrfs_raw_test_opt(fs_info->mount_opt, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...) 
\ +do { \ + if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \ + !btrfs_raw_test_opt(fs_info->mount_opt, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ +} while (0) + +static void btrfs_emit_options(struct btrfs_fs_info *info, + struct btrfs_fs_context *old) +{ + btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); + btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts"); + btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); + btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations"); + btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme"); + btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers"); + btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log"); + btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time"); + btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit"); + btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard"); + btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard"); + btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree"); + btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching"); + btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache"); + btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag"); + btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data"); + btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata"); + btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification"); + btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time"); + btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots"); + btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums"); + + btrfs_info_if_unset(info, old, NODATACOW, "setting datacow"); + btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations"); + btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme"); + btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers"); + btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log"); + btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching"); + btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree"); + btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag"); + btrfs_info_if_unset(info, old, COMPRESS, "use no compression"); + + /* Did the compression settings change? */ + if (btrfs_test_opt(info, COMPRESS) && + (!old || + old->compress_type != info->compress_type || + old->compress_level != info->compress_level || + (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) && + btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) { + const char *compress_type = btrfs_compress_type2str(info->compress_type); + + btrfs_info(info, "%s %s compression, level %d", + btrfs_test_opt(info, FORCE_COMPRESS) ? 
"force" : "use", + compress_type, info->compress_level); + } - btrfs_clear_sb_rdonly(sb); + if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) + btrfs_info(info, "max_inline set to %llu", info->max_inline); +} - set_bit(BTRFS_FS_OPEN, &fs_info->flags); +static int btrfs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_context old_ctx; + int ret = 0; + bool mount_reconfigure = (fc->s_fs_info != NULL); - /* - * If we've gone from readonly -> read/write, we need to get - * our sync/async discard lists in the right state. - */ - btrfs_discard_resume(fs_info); + btrfs_info_to_ctx(fs_info, &old_ctx); + + sync_filesystem(sb); + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + + if (!mount_reconfigure && + !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) + return -EINVAL; + + ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); + if (ret < 0) + return ret; + + btrfs_ctx_to_info(fs_info, ctx); + btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags); + btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, + old_ctx.thread_pool_size); + + if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != + (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) { + btrfs_warn(fs_info, + "remount supports changing free space tree only from RO to RW"); + /* Make sure free space cache options match the state on disk. */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + } + if (btrfs_free_space_cache_v1_active(fs_info)) { + btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); + } } -out: + + ret = 0; + if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY)) + ret = btrfs_remount_ro(fs_info); + else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY)) + ret = btrfs_remount_rw(fs_info); + if (ret) + goto restore; + /* - * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS, - * since the absence of the flag means it can be toggled off by remount. + * If we set the mask during the parameter parsing VFS would reject the + * remount. Here we can set the mask and the value will be updated + * appropriately. 
*/ - *flags |= SB_I_VERSION; + if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL)) + fc->sb_flags_mask |= SB_POSIXACL; + btrfs_emit_options(fs_info, &old_ctx); wake_up_process(fs_info->transaction_kthread); - btrfs_remount_cleanup(fs_info, old_opts); + btrfs_remount_cleanup(fs_info, old_ctx.mount_opt); btrfs_clear_oneshot_options(fs_info); clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); return 0; - restore: - /* We've hit an error - don't reset SB_RDONLY */ - if (sb_rdonly(sb)) - old_flags |= SB_RDONLY; - if (!(old_flags & SB_RDONLY)) - clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); - sb->s_flags = old_flags; - fs_info->mount_opt = old_opts; - fs_info->compress_type = old_compress_type; - fs_info->max_inline = old_max_inline; - btrfs_resize_thread_pool(fs_info, - old_thread_pool_size, fs_info->thread_pool_size); - fs_info->metadata_ratio = old_metadata_ratio; - btrfs_remount_cleanup(fs_info, old_opts); + btrfs_ctx_to_info(fs_info, &old_ctx); + btrfs_remount_cleanup(fs_info, old_ctx.mount_opt); clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); - return ret; } @@ -2094,6 +1776,309 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc) +{ + struct btrfs_fs_info *p = fc->s_fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + return fs_info->fs_devices == p->fs_devices; +} + +static int btrfs_get_tree_super(struct fs_context *fc) +{ + struct btrfs_fs_info *fs_info = fc->s_fs_info; + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_devices *fs_devices = NULL; + struct block_device *bdev; + struct btrfs_device *device; + struct super_block *sb; + blk_mode_t mode = btrfs_open_mode(fc); + int ret; + + btrfs_ctx_to_info(fs_info, ctx); + mutex_lock(&uuid_mutex); + + /* + * With 'true' passed to btrfs_scan_one_device() (mount time) we expect + * either a valid device or an error. + */ + device = btrfs_scan_one_device(fc->source, mode, true); + ASSERT(device != NULL); + if (IS_ERR(device)) { + mutex_unlock(&uuid_mutex); + return PTR_ERR(device); + } + + fs_devices = device->fs_devices; + fs_info->fs_devices = fs_devices; + + ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type); + mutex_unlock(&uuid_mutex); + if (ret) + return ret; + + if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) { + ret = -EACCES; + goto error; + } + + bdev = fs_devices->latest_dev->bdev; + + /* + * From now on the error handling is not straightforward. + * + * If successful, this will transfer the fs_info into the super block, + * and fc->s_fs_info will be NULL. However if there's an existing + * super, we'll still have fc->s_fs_info populated. If we error + * completely out it'll be cleaned up when we drop the fs_context, + * otherwise it's tied to the lifetime of the super_block. 
+ */ + sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto error; + } + + set_device_specific_options(fs_info); + + if (sb->s_root) { + btrfs_close_devices(fs_devices); + if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) + ret = -EBUSY; + } else { + snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); + shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); + btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; + ret = btrfs_fill_super(sb, fs_devices, NULL); + } + + if (ret) { + deactivate_locked_super(sb); + return ret; + } + + btrfs_clear_oneshot_options(fs_info); + + fc->root = dget(sb->s_root); + return 0; + +error: + btrfs_close_devices(fs_devices); + return ret; +} + +/* + * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes + * with different ro/rw options") the following works: + * + * (i) mount /dev/sda3 -o subvol=foo,ro /mnt/foo + * (ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar + * + * which looks nice and innocent but is actually pretty intricate and deserves + * a long comment. + * + * On another filesystem a subvolume mount is close to something like: + * + * (iii) # create rw superblock + initial mount + * mount -t xfs /dev/sdb /opt/ + * + * # create ro bind mount + * mount --bind -o ro /opt/foo /mnt/foo + * + * # unmount initial mount + * umount /opt + * + * Of course, there's some special subvolume sauce and there's the fact that the + * sb->s_root dentry is really swapped after mount_subtree(). But conceptually + * it's very close and will help us understand the issue. + * + * The old mount API didn't cleanly distinguish between a mount being made ro + * and a superblock being made ro. The only way to change the ro state of + * either object was by passing ms_rdonly. If a new mount was created via + * mount(2) such as: + * + * mount("/dev/sdb", "/mnt", "xfs", ms_rdonly, null); + * + * the MS_RDONLY flag being specified had two effects: + * + * (1) MNT_READONLY was raised -> the resulting mount got + * @mnt->mnt_flags |= MNT_READONLY raised. + * + * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystems + * made the superblock ro. Note, how SB_RDONLY has the same value as + * ms_rdonly and is raised whenever MS_RDONLY is passed through mount(2). + * + * Creating a subtree mount via (iii) ends up leaving a rw superblock with a + * subtree mounted ro. + * + * But consider the effect on the old mount API on btrfs subvolume mounting + * which combines the distinct step in (iii) into a single step. + * + * By issuing (i) both the mount and the superblock are turned ro. Now when (ii) + * is issued the superblock is ro and thus even if the mount created for (ii) is + * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro + * to rw for (ii) which it did using an internal remount call. + * + * IOW, subvolume mounting was inherently complicated due to the ambiguity of + * MS_RDONLY in mount(2). Note, this ambiguity has mount(8) always translate + * "ro" to MS_RDONLY. IOW, in both (i) and (ii) "ro" becomes MS_RDONLY when + * passed by mount(8) to mount(2). + * + * Enter the new mount API. The new mount API disambiguates making a mount ro + * and making a superblock ro. + * + * (3) To turn a mount ro the MOUNT_ATTR_ONLY flag can be used with either + * fsmount() or mount_setattr() this is a pure VFS level change for a + * specific mount or mount tree that is never seen by the filesystem itself. 
+ * + * (4) To turn a superblock ro the "ro" flag must be used with + * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem + * in fc->sb_flags. + * + * This disambiguation has rather positive consequences. Mounting a subvolume + * ro will not also turn the superblock ro. Only the mount for the subvolume + * will become ro. + * + * So, if the superblock creation request comes from the new mount API the + * caller must have explicitly done: + * + * fsconfig(FSCONFIG_SET_FLAG, "ro") + * fsmount/mount_setattr(MOUNT_ATTR_RDONLY) + * + * IOW, at some point the caller must have explicitly turned the whole + * superblock ro and we shouldn't just undo it like we did for the old mount + * API. In any case, it lets us avoid the hack in the new mount API. + * + * Consequently, the remounting hack must only be used for requests originating + * from the old mount API and should be marked for full deprecation so it can be + * turned off in a couple of years. + * + * The new mount API has no reason to support this hack. + */ +static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc) +{ + struct vfsmount *mnt; + int ret; + const bool ro2rw = !(fc->sb_flags & SB_RDONLY); + + /* + * We got an EBUSY because our SB_RDONLY flag didn't match the existing + * super block, so invert our setting here and retry the mount so we + * can get our vfsmount. + */ + if (ro2rw) + fc->sb_flags |= SB_RDONLY; + else + fc->sb_flags &= ~SB_RDONLY; + + mnt = fc_mount(fc); + if (IS_ERR(mnt)) + return mnt; + + if (!fc->oldapi || !ro2rw) + return mnt; + + /* We need to convert to rw, call reconfigure. */ + fc->sb_flags &= ~SB_RDONLY; + down_write(&mnt->mnt_sb->s_umount); + ret = btrfs_reconfigure(fc); + up_write(&mnt->mnt_sb->s_umount); + if (ret) { + mntput(mnt); + return ERR_PTR(ret); + } + return mnt; +} + +static int btrfs_get_tree_subvol(struct fs_context *fc) +{ + struct btrfs_fs_info *fs_info = NULL; + struct btrfs_fs_context *ctx = fc->fs_private; + struct fs_context *dup_fc; + struct dentry *dentry; + struct vfsmount *mnt; + + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * then open_ctree will properly initialize the file system specific + * settings later. btrfs_init_fs_info initializes the static elements + * of the fs_info (locks and such) to make cleanup easier if we find a + * superblock with our given fs_devices later on at sget() time. + */ + fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); + if (!fs_info) + return -ENOMEM; + + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + btrfs_free_fs_info(fs_info); + return -ENOMEM; + } + btrfs_init_fs_info(fs_info); + + dup_fc = vfs_dup_fs_context(fc); + if (IS_ERR(dup_fc)) { + btrfs_free_fs_info(fs_info); + return PTR_ERR(dup_fc); + } + + /* + * When we do the sget_fc this gets transferred to the sb, so we only + * need to set it on the dup_fc as this is what creates the super block. + */ + dup_fc->s_fs_info = fs_info; + + /* + * We'll do the security settings in our btrfs_get_tree_super() mount + * loop, they were duplicated into dup_fc, we can drop the originals + * here. 
+ */ + security_free_mnt_opts(&fc->security); + fc->security = NULL; + + mnt = fc_mount(dup_fc); + if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) + mnt = btrfs_reconfigure_for_mount(dup_fc); + put_fs_context(dup_fc); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + /* + * This free's ->subvol_name, because if it isn't set we have to + * allocate a buffer to hold the subvol_name, so we just drop our + * reference to it here. + */ + dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt); + ctx->subvol_name = NULL; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + fc->root = dentry; + return 0; +} + +static int btrfs_get_tree(struct fs_context *fc) +{ + /* + * Since we use mount_subtree to mount the default/specified subvol, we + * have to do mounts in two steps. + * + * First pass through we call btrfs_get_tree_subvol(), this is just a + * wrapper around fc_mount() to call back into here again, and this time + * we'll call btrfs_get_tree_super(). This will do the open_ctree() and + * everything to open the devices and file system. Then we return back + * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and + * from there we can do our mount_subvol() call, which will lookup + * whichever subvol we're mounting and setup this fc with the + * appropriate dentry for the subvol. + */ + if (fc->s_fs_info) + return btrfs_get_tree_super(fc); + return btrfs_get_tree_subvol(fc); +} + static void btrfs_kill_super(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -2101,22 +2086,85 @@ static void btrfs_kill_super(struct super_block *sb) btrfs_free_fs_info(fs_info); } -static struct file_system_type btrfs_fs_type = { - .owner = THIS_MODULE, - .name = "btrfs", - .mount = btrfs_mount, - .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, -}; +static void btrfs_free_fs_context(struct fs_context *fc) +{ + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_info *fs_info = fc->s_fs_info; + + if (fs_info) + btrfs_free_fs_info(fs_info); + + if (ctx && refcount_dec_and_test(&ctx->refs)) { + kfree(ctx->subvol_name); + kfree(ctx); + } +} + +static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc) +{ + struct btrfs_fs_context *ctx = src_fc->fs_private; -static struct file_system_type btrfs_root_fs_type = { - .owner = THIS_MODULE, - .name = "btrfs", - .mount = btrfs_mount_root, - .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + /* + * Give a ref to our ctx to this dup, as we want to keep it around for + * our original fc so we can have the subvolume name or objectid. + * + * We unset ->source in the original fc because the dup needs it for + * mounting, and then once we free the dup it'll free ->source, so we + * need to make sure we're only pointing to it in one fc. 
+ */ + refcount_inc(&ctx->refs); + fc->fs_private = ctx; + fc->source = src_fc->source; + src_fc->source = NULL; + return 0; +} + +static const struct fs_context_operations btrfs_fs_context_ops = { + .parse_param = btrfs_parse_param, + .reconfigure = btrfs_reconfigure, + .get_tree = btrfs_get_tree, + .dup = btrfs_dup_fs_context, + .free = btrfs_free_fs_context, }; +static int btrfs_init_fs_context(struct fs_context *fc) +{ + struct btrfs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct btrfs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + refcount_set(&ctx->refs, 1); + fc->fs_private = ctx; + fc->ops = &btrfs_fs_context_ops; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx); + } else { + ctx->thread_pool_size = + min_t(unsigned long, num_online_cpus() + 2, 8); + ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE; + ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + } + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#endif + fc->sb_flags |= SB_I_VERSION; + + return 0; +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .init_fs_context = btrfs_init_fs_context, + .parameters = btrfs_fs_parameters, + .kill_sb = btrfs_kill_super, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + }; + MODULE_ALIAS_FS("btrfs"); static int btrfs_control_open(struct inode *inode, struct file *file) @@ -2328,7 +2376,6 @@ static const struct super_operations btrfs_super_ops = { .destroy_inode = btrfs_destroy_inode, .free_inode = btrfs_free_inode, .statfs = btrfs_statfs, - .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, .unfreeze_fs = btrfs_unfreeze, }; diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index 8dbb909b364f..f18253ca280d 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -3,11 +3,12 @@ #ifndef BTRFS_SUPER_H #define BTRFS_SUPER_H -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags); +bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, + unsigned long flags); int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); +void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info); static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e6b51fb3ddc1..84c05246ffd8 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1783,6 +1783,10 @@ static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, unsigned long long limit; limit = memparse(buf, &endptr); + /* There could be trailing '\n', also catch any typos after the value. 
*/ + endptr = skip_spaces(endptr); + if (*endptr != 0) + return -EINVAL; WRITE_ONCE(device->scrub_speed_max, limit); return len; } diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ca09cf9afce8..709c6cc9706a 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -28,6 +28,7 @@ const char *test_error[] = { [TEST_ALLOC_INODE] = "cannot allocate inode", [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", + [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", }; static const struct super_operations btrfs_test_super_ops = { @@ -102,7 +103,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(NULL, &dev->alloc_state, 0); + extent_io_tree_init(fs_info, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); @@ -185,7 +186,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->buffer_lock); - btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_mapping_tree_free(fs_info); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, dev_list) { btrfs_free_dummy_device(dev); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 7a2d7ffbe30e..dc2f2ab15fa5 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -23,6 +23,7 @@ enum { TEST_ALLOC_INODE, TEST_ALLOC_BLOCK_GROUP, TEST_ALLOC_EXTENT_MAP, + TEST_ALLOC_CHUNK_MAP, }; extern const char *test_error[]; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 1cc86af97dc6..25b3349595e0 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -652,7 +652,7 @@ static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory, const char *test_name) { for (int i = 0; i < eb->len; i++) { - struct page *page = eb->pages[i >> PAGE_SHIFT]; + struct page *page = folio_page(eb->folios[i >> PAGE_SHIFT], 0); void *addr = page_address(page) + offset_in_page(i); if (memcmp(addr, memory + i, 1) != 0) { @@ -668,7 +668,7 @@ static int verify_eb_and_memory(struct extent_buffer *eb, void *memory, const char *test_name) { for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) { - void *eb_addr = page_address(eb->pages[i]); + void *eb_addr = folio_address(eb->folios[i]); if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) { dump_eb_and_memory_contents(eb, memory, test_name); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 29bdd08b241f..253cce7ffecf 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -25,7 +25,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) #ifdef CONFIG_BTRFS_DEBUG if (refcount_read(&em->refs) != 1) { test_err( -"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d", +"em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d", em->start, em->len, em->block_start, em->block_len, refcount_read(&em->refs)); @@ -73,7 +73,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_16K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 16K)"); @@ -94,7 +94,7 @@ static int 
test_case_1(struct btrfs_fs_info *fs_info, em->block_start = SZ_32K; /* avoid merging */ em->block_len = SZ_4K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [16K, 20K)"); @@ -121,9 +121,14 @@ static int test_case_1(struct btrfs_fs_info *fs_info, test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); goto out; } - if (em && - (em->start != 0 || extent_map_end(em) != SZ_16K || - em->block_start != 0 || em->block_len != SZ_16K)) { + if (!em) { + test_err("case1 [%llu %llu]: no extent map returned", + start, start + len); + ret = -ENOENT; + goto out; + } + if (em->start != 0 || extent_map_end(em) != SZ_16K || + em->block_start != 0 || em->block_len != SZ_16K) { test_err( "case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", start, start + len, ret, em->start, em->len, @@ -161,7 +166,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 1K)"); @@ -182,7 +187,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->block_start = SZ_4K; em->block_len = SZ_4K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); @@ -209,9 +214,13 @@ static int test_case_2(struct btrfs_fs_info *fs_info, test_err("case2 [0 1K]: ret %d", ret); goto out; } - if (em && - (em->start != 0 || extent_map_end(em) != SZ_1K || - em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) { + if (!em) { + test_err("case2 [0 1K]: no extent map returned"); + ret = -ENOENT; + goto out; + } + if (em->start != 0 || extent_map_end(em) != SZ_1K || + em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1) { test_err( "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", ret, em->start, em->len, em->block_start, @@ -244,7 +253,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, em->block_start = SZ_4K; em->block_len = SZ_4K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); @@ -268,19 +277,24 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); if (ret) { - test_err("case3 [0x%llx 0x%llx): ret %d", + test_err("case3 [%llu %llu): ret %d", start, start + len, ret); goto out; } + if (!em) { + test_err("case3 [%llu %llu): no extent map returned", + start, start + len); + ret = -ENOENT; + goto out; + } /* * Since bytes within em are contiguous, em->block_start is identical to * em->start. 
*/ - if (em && - (start < em->start || start + len > extent_map_end(em) || - em->start != em->block_start || em->len != em->block_len)) { + if (start < em->start || start + len > extent_map_end(em) || + em->start != em->block_start || em->len != em->block_len) { test_err( -"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", +"case3 [%llu %llu): ret %d em (start %llu len %llu block_start %llu block_len %llu)", start, start + len, ret, em->start, em->len, em->block_start, em->block_len); ret = -EINVAL; @@ -343,7 +357,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_8K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 8K)"); @@ -364,7 +378,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->block_start = SZ_16K; /* avoid merging */ em->block_len = 24 * SZ_1K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [8K, 32K)"); @@ -387,14 +401,20 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); if (ret) { - test_err("case4 [0x%llx 0x%llx): ret %d", - start, len, ret); + test_err("case4 [%llu %llu): ret %d", + start, start + len, ret); goto out; } - if (em && (start < em->start || start + len > extent_map_end(em))) { + if (!em) { + test_err("case4 [%llu %llu): no extent map returned", + start, start + len); + ret = -ENOENT; + goto out; + } + if (start < em->start || start + len > extent_map_end(em)) { test_err( -"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", - start, len, ret, em->start, em->len, em->block_start, +"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu block_start %llu block_len %llu)", + start, start + len, ret, em->start, em->len, em->block_start, em->block_len); ret = -EINVAL; } @@ -443,7 +463,8 @@ static int test_case_4(struct btrfs_fs_info *fs_info, return ret; } -static int add_compressed_extent(struct extent_map_tree *em_tree, +static int add_compressed_extent(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, u64 start, u64 len, u64 block_start) { struct extent_map *em; @@ -459,9 +480,9 @@ static int add_compressed_extent(struct extent_map_tree *em_tree, em->len = len; em->block_start = block_start; em->block_len = SZ_4K; - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); free_extent_map(em); if (ret < 0) { @@ -567,7 +588,7 @@ static int validate_range(struct extent_map_tree *em_tree, int index) * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from * merging the em's. 
*/ -static int test_case_5(void) +static int test_case_5(struct btrfs_fs_info *fs_info) { struct extent_map_tree *em_tree; struct inode *inode; @@ -585,35 +606,35 @@ static int test_case_5(void) em_tree = &BTRFS_I(inode)->extent_tree; /* [0, 12k) */ - ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0); + ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K * 3, 0); if (ret) { test_err("cannot add extent range [0, 12K)"); goto out; } /* [12k, 24k) */ - ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); + ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [24k, 36k) */ - ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); + ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [36k, 40k) */ - ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); + ret = add_compressed_extent(fs_info, em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [40k, 64k) */ - ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); + ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; @@ -665,11 +686,11 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em struct extent_map *em = NULL; int ret; - ret = add_compressed_extent(em_tree, 0, SZ_4K, 0); + ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K, 0); if (ret) goto out; - ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0); + ret = add_compressed_extent(fs_info, em_tree, SZ_4K, SZ_4K, 0); if (ret) goto out; @@ -713,7 +734,7 @@ out: * true would mess up the start/end calculations and subsequent splits would be * incorrect. 
*/ -static int test_case_7(void) +static int test_case_7(struct btrfs_fs_info *fs_info) { struct extent_map_tree *em_tree; struct extent_map *em; @@ -742,9 +763,9 @@ static int test_case_7(void) em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_4K; - set_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags |= EXTENT_FLAG_PINNED; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("couldn't add extent map"); @@ -765,7 +786,7 @@ static int test_case_7(void) em->block_start = SZ_32K; em->block_len = SZ_16K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("couldn't add extent map"); @@ -859,33 +880,21 @@ struct rmap_test_vector { static int test_rmap_block(struct btrfs_fs_info *fs_info, struct rmap_test_vector *test) { - struct extent_map *em; - struct map_lookup *map = NULL; + struct btrfs_chunk_map *map; u64 *logical = NULL; int i, out_ndaddrs, out_stripe_len; int ret; - em = alloc_extent_map(); - if (!em) { - test_std_err(TEST_ALLOC_EXTENT_MAP); - return -ENOMEM; - } - - map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL); + map = btrfs_alloc_chunk_map(test->num_stripes, GFP_KERNEL); if (!map) { - kfree(em); - test_std_err(TEST_ALLOC_EXTENT_MAP); + test_std_err(TEST_ALLOC_CHUNK_MAP); return -ENOMEM; } - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); /* Start at 4GiB logical address */ - em->start = SZ_4G; - em->len = test->data_stripe_size * test->num_data_stripes; - em->block_len = em->len; - em->orig_block_len = test->data_stripe_size; - em->map_lookup = map; - + map->start = SZ_4G; + map->chunk_len = test->data_stripe_size * test->num_data_stripes; + map->stripe_size = test->data_stripe_size; map->num_stripes = test->num_stripes; map->type = test->raid_type; @@ -901,15 +910,13 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, map->stripes[i].physical = test->data_stripe_phys_start[i]; } - write_lock(&fs_info->mapping_tree.lock); - ret = add_extent_mapping(&fs_info->mapping_tree, em, 0); - write_unlock(&fs_info->mapping_tree.lock); + ret = btrfs_add_chunk_map(fs_info, map); if (ret) { - test_err("error adding block group mapping to mapping tree"); + test_err("error adding chunk map to mapping tree"); goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1), &logical, &out_ndaddrs, &out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", @@ -938,14 +945,8 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = 0; out: - write_lock(&fs_info->mapping_tree.lock); - remove_extent_mapping(&fs_info->mapping_tree, em); - write_unlock(&fs_info->mapping_tree.lock); - /* For us */ - free_extent_map(em); + btrfs_remove_chunk_map(fs_info, map); out_free: - /* For the tree */ - free_extent_map(em); kfree(logical); return ret; } @@ -1022,13 +1023,13 @@ int btrfs_test_extent_map(void) ret = test_case_4(fs_info, em_tree); if (ret) goto out; - ret = test_case_5(); + ret = test_case_5(fs_info); if (ret) goto out; ret = test_case_6(fs_info, em_tree); if (ret) goto out; - ret = test_case_7(); + ret = test_case_7(fs_info); if (ret) goto out; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c 
index 492d69d2fa73..9957de9f7806 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -211,9 +211,9 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); } -static unsigned long prealloc_only = 0; -static unsigned long compressed_only = 0; -static unsigned long vacancy_only = 0; +static u32 prealloc_only = 0; +static u32 compressed_only = 0; +static u32 vacancy_only = 0; static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) { @@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } /* @@ -332,7 +332,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } offset = em->start + em->len; @@ -355,7 +355,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -383,7 +383,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -412,7 +412,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } offset = em->start + em->len; @@ -434,7 +434,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != orig_start) { @@ -468,7 +468,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != prealloc_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", prealloc_only, em->flags); goto out; } @@ -497,7 +497,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != prealloc_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", prealloc_only, em->flags); goto out; } @@ -527,7 +527,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != orig_start) { @@ -560,7 +560,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != prealloc_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", prealloc_only, em->flags); goto out; } @@ -595,7 +595,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } 
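The %lu to %u churn in these tests falls out of extent_map::flags becoming a plain u32 bitmask manipulated with |= and &, with the compression algorithm carried in dedicated flag bits and recovered through an accessor instead of a separate compress_type field. A small userspace model of that encoding (the bit values here are made up, not the extent_map.h definitions):

#include <stdio.h>

#define FLAG_PINNED             (1U << 0)
#define FLAG_PREALLOC           (1U << 1)
#define FLAG_COMPRESS_ZLIB      (1U << 2)
#define FLAG_COMPRESS_LZO       (1U << 3)
#define FLAG_COMPRESS_ZSTD      (1U << 4)

enum compress_type { COMPRESS_NONE, COMPRESS_ZLIB, COMPRESS_LZO, COMPRESS_ZSTD };

struct ext_map { unsigned int flags; };

/* Derive the compression type from the flag bits; no separate field needed. */
static enum compress_type map_compression(const struct ext_map *em)
{
        if (em->flags & FLAG_COMPRESS_ZLIB)
                return COMPRESS_ZLIB;
        if (em->flags & FLAG_COMPRESS_LZO)
                return COMPRESS_LZO;
        if (em->flags & FLAG_COMPRESS_ZSTD)
                return COMPRESS_ZSTD;
        return COMPRESS_NONE;
}

int main(void)
{
        struct ext_map em = { .flags = 0 };

        em.flags |= FLAG_COMPRESS_ZLIB;         /* plain |=, no set_bit() */
        printf("flags 0x%x zlib-compressed %d\n", em.flags,
               map_compression(&em) == COMPRESS_ZLIB);
        return 0;
}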
if (em->flags != compressed_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", compressed_only, em->flags); goto out; } @@ -604,9 +604,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->orig_start); goto out; } - if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, em->compress_type); + BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } offset = em->start + em->len; @@ -629,7 +629,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != compressed_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", compressed_only, em->flags); goto out; } @@ -638,9 +638,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->orig_start); goto out; } - if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, em->compress_type); + BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } disk_bytenr = em->block_start; @@ -664,7 +664,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != compressed_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", compressed_only, em->flags); goto out; } @@ -701,9 +701,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, orig_start); goto out; } - if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, em->compress_type); + BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } offset = em->start + em->len; @@ -726,7 +726,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -758,7 +758,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != vacancy_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", vacancy_only, em->flags); goto out; } @@ -786,7 +786,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -866,7 +866,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != vacancy_only) { - test_err("wrong flags, wanted %lu, have %lu", vacancy_only, + test_err("wrong flags, wanted %u, have %u", vacancy_only, em->flags); goto out; } @@ -888,7 +888,7 @@ static int 
test_hole_first(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, wanted 0 got %lu", + test_err("unexpected flags set, wanted 0 got %u", em->flags); goto out; } @@ -1095,8 +1095,8 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize) test_msg("running inode tests"); - set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); - set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); + compressed_only |= EXTENT_FLAG_COMPRESS_ZLIB; + prealloc_only |= EXTENT_FLAG_PREALLOC; ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 3c2a02a72f64..14b9fbe82da4 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -22,7 +22,7 @@ struct btrfs_tree_parent_check { /* * Expected transid, can be 0 to skip the check, but such skip - * should only be utlized for backref walk related code. + * should only be utilized for backref walk related code. */ u64 transid; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7d6729d9fd2f..331fc7429952 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2575,7 +2575,6 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans, ret = btrfs_pin_reserved_extent(trans, eb); if (ret) return ret; - btrfs_redirty_list_add(trans->transaction, eb); } else { unaccount_log_buffer(eb->fs_info, eb->start); } @@ -4520,7 +4519,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, int ret = 0; if (inode->flags & BTRFS_INODE_NODATASUM || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + (em->flags & EXTENT_FLAG_PREALLOC) || em->block_start == EXTENT_MAP_HOLE) return 0; @@ -4583,7 +4582,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, return 0; /* If we're compressed we have to save the entire range of csums. */ - if (em->compress_type) { + if (extent_map_is_compressed(em)) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { @@ -4623,18 +4622,20 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item fi = { 0 }; struct extent_buffer *leaf; struct btrfs_key key; + enum btrfs_compression_type compress_type; u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; btrfs_set_stack_file_extent_generation(&fi, trans->transid); - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + if (em->flags & EXTENT_FLAG_PREALLOC) btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); else btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); block_len = max(em->block_len, em->orig_block_len); - if (em->compress_type != BTRFS_COMPRESS_NONE) { + compress_type = extent_map_compression(em); + if (compress_type != BTRFS_COMPRESS_NONE) { btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { @@ -4646,7 +4647,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_offset(&fi, extent_offset); btrfs_set_stack_file_extent_num_bytes(&fi, em->len); btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); - btrfs_set_stack_file_extent_compression(&fi, em->compress_type); + btrfs_set_stack_file_extent_compression(&fi, compress_type); ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) @@ -4859,13 +4860,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, continue; /* We log prealloc extents beyond eof later. 
*/ - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && + if ((em->flags & EXTENT_FLAG_PREALLOC) && em->start >= i_size_read(&inode->vfs_inode)) continue; /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); - set_bit(EXTENT_FLAG_LOGGING, &em->flags); + em->flags |= EXTENT_FLAG_LOGGING; list_add_tail(&em->list, &extents); num++; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f627674b37db..4c32497311d2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -41,6 +41,17 @@ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) +struct btrfs_io_geometry { + u32 stripe_index; + u32 stripe_nr; + int mirror_num; + int num_stripes; + u64 stripe_offset; + u64 raid56_full_stripe_start; + int max_errors; + enum btrfs_map_op op; +}; + const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, @@ -1742,19 +1753,18 @@ out: static u64 find_next_chunk(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree; - struct extent_map *em; struct rb_node *n; u64 ret = 0; - em_tree = &fs_info->mapping_tree; - read_lock(&em_tree->lock); - n = rb_last(&em_tree->map.rb_root); + read_lock(&fs_info->mapping_tree_lock); + n = rb_last(&fs_info->mapping_tree.rb_root); if (n) { - em = rb_entry(n, struct extent_map, rb_node); - ret = em->start + em->len; + struct btrfs_chunk_map *map; + + map = rb_entry(n, struct btrfs_chunk_map, rb_node); + ret = map->start + map->chunk_len; } - read_unlock(&em_tree->lock); + read_unlock(&fs_info->mapping_tree_lock); return ret; } @@ -2986,6 +2996,81 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) return ret; } +struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev; + struct btrfs_chunk_map *map; + struct btrfs_chunk_map *prev_map = NULL; + + while (node) { + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + prev = node; + prev_map = map; + + if (logical < map->start) { + node = node->rb_left; + } else if (logical >= map->start + map->chunk_len) { + node = node->rb_right; + } else { + refcount_inc(&map->refs); + return map; + } + } + + if (!prev) + return NULL; + + orig_prev = prev; + while (prev && logical >= prev_map->start + prev_map->chunk_len) { + prev = rb_next(prev); + prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); + } + + if (!prev) { + prev = orig_prev; + prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); + while (prev && logical < prev_map->start) { + prev = rb_prev(prev); + prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); + } + } + + if (prev) { + u64 end = logical + length; + + /* + * Caller can pass a U64_MAX length when it wants to get any + * chunk starting at an offset of 'logical' or higher, so deal + * with underflow by resetting the end offset to U64_MAX. 
+ */ + if (end < logical) + end = U64_MAX; + + if (end > prev_map->start && + logical < prev_map->start + prev_map->chunk_len) { + refcount_inc(&prev_map->refs); + return prev_map; + } + } + + return NULL; +} + +struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct btrfs_chunk_map *map; + + read_lock(&fs_info->mapping_tree_lock); + map = btrfs_find_chunk_map_nolock(fs_info, logical, length); + read_unlock(&fs_info->mapping_tree_lock); + + return map; +} + /* * Find the mapping containing the given logical extent. * @@ -2994,38 +3079,37 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) * * Return: Chunk mapping or ERR_PTR. */ -struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) +struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { - struct extent_map_tree *em_tree; - struct extent_map *em; + struct btrfs_chunk_map *map; - em_tree = &fs_info->mapping_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, length); - read_unlock(&em_tree->lock); + map = btrfs_find_chunk_map(fs_info, logical, length); - if (!em) { + if (unlikely(!map)) { + read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); return ERR_PTR(-EINVAL); } - if (em->start > logical || em->start + em->len <= logical) { + if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { + read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", - logical, logical + length, em->start, em->start + em->len); - free_extent_map(em); + logical, logical + length, map->start, + map->start + map->chunk_len); + btrfs_free_chunk_map(map); return ERR_PTR(-EINVAL); } - /* callers are responsible for dropping em's ref. */ - return em; + /* Callers are responsible for dropping the reference. */ + return map; } static int remove_chunk_item(struct btrfs_trans_handle *trans, - struct map_lookup *map, u64 chunk_offset) + struct btrfs_chunk_map *map, u64 chunk_offset) { int i; @@ -3050,23 +3134,21 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans, int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 dev_extent_len = 0; int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - if (IS_ERR(em)) { + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(map)) { /* * This is a logic error, but we don't want to just rely on the * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. */ ASSERT(0); - return PTR_ERR(em); + return PTR_ERR(map); } - map = em->map_lookup; /* * First delete the device extent items from the devices btree. 
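The lookup that replaces lookup_extent_mapping() matches on range overlap rather than exact start: the wanted range is [logical, logical + length), the end is clamped when the addition wraps so a U64_MAX length means "any chunk at or after logical", and a chunk qualifies when the two half-open ranges intersect. A self-contained sketch of just that predicate, with invented names:

#include <stdio.h>
#include <stdint.h>

struct chunk { uint64_t start, len; };

/* Overlap test used conceptually by the lookup: clamp the end on overflow so
 * an open-ended request still compares correctly. Illustrative only. */
static int chunk_overlaps(const struct chunk *c, uint64_t logical, uint64_t length)
{
        uint64_t end = logical + length;

        if (end < logical)              /* wrapped: treat as open-ended */
                end = UINT64_MAX;
        return end > c->start && logical < c->start + c->len;
}

int main(void)
{
        struct chunk c = { .start = 1ULL << 30, .len = 1ULL << 30 };

        printf("%d %d\n",
               chunk_overlaps(&c, 0, 4096),             /* 0: ends before the chunk */
               chunk_overlaps(&c, 0, UINT64_MAX));      /* 1: open-ended request */
        return 0;
}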
@@ -3169,7 +3251,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) goto out; } - trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); + trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(fs_info, chunk_offset); @@ -3188,7 +3270,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) */ btrfs_trans_release_chunk_metadata(trans); - ret = btrfs_remove_block_group(trans, chunk_offset, em); + ret = btrfs_remove_block_group(trans, map); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3200,7 +3282,7 @@ out: trans->removing_chunk = false; } /* once for us */ - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -5347,24 +5429,131 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, } } +static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits) +{ + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + set_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); + } +} + +static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) +{ + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + __clear_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, + NULL, NULL); + } +} + +void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) +{ + write_lock(&fs_info->mapping_tree_lock); + rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); + RB_CLEAR_NODE(&map->rb_node); + chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); + write_unlock(&fs_info->mapping_tree_lock); + + /* Once for the tree reference. 
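The lifetime rule the new helpers encode is the same one the old extent-map code followed with free_extent_map(): the mapping tree holds one reference from insertion, every lookup takes its own, and btrfs_free_chunk_map() frees only when the last reference is dropped, so removal pairs the rb-tree erase with dropping the tree's reference. A minimal userspace model of that get/put discipline (not the kernel helpers):

#include <stdlib.h>

struct chunk_map { int refs; };

static struct chunk_map *map_alloc(void)
{
        struct chunk_map *m = calloc(1, sizeof(*m));

        if (m)
                m->refs = 1;            /* the caller's initial reference */
        return m;
}

static struct chunk_map *map_get(struct chunk_map *m) { m->refs++; return m; }

static void map_put(struct chunk_map *m)
{
        if (m && --m->refs == 0)
                free(m);
}

int main(void)
{
        struct chunk_map *tree_ref = map_alloc();       /* inserted: tree owns it */
        struct chunk_map *found = map_get(tree_ref);    /* lookup takes its own ref */

        map_put(found);         /* caller done: "once for us" */
        map_put(tree_ref);      /* removal: drop the tree ref, object is freed */
        return 0;
}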
*/ + btrfs_free_chunk_map(map); +} + +EXPORT_FOR_TESTS +int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + bool leftmost = true; + + write_lock(&fs_info->mapping_tree_lock); + p = &fs_info->mapping_tree.rb_root.rb_node; + while (*p) { + struct btrfs_chunk_map *entry; + + parent = *p; + entry = rb_entry(parent, struct btrfs_chunk_map, rb_node); + + if (map->start < entry->start) { + p = &(*p)->rb_left; + } else if (map->start > entry->start) { + p = &(*p)->rb_right; + leftmost = false; + } else { + write_unlock(&fs_info->mapping_tree_lock); + return -EEXIST; + } + } + rb_link_node(&map->rb_node, parent, p); + rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost); + chunk_map_device_set_bits(map, CHUNK_ALLOCATED); + chunk_map_device_clear_bits(map, CHUNK_TRIMMED); + write_unlock(&fs_info->mapping_tree_lock); + + return 0; +} + +EXPORT_FOR_TESTS +struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp) +{ + struct btrfs_chunk_map *map; + + map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp); + if (!map) + return NULL; + + refcount_set(&map->refs, 1); + RB_CLEAR_NODE(&map->rb_node); + + return map; +} + +struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp) +{ + const int size = btrfs_chunk_map_size(map->num_stripes); + struct btrfs_chunk_map *clone; + + clone = kmemdup(map, size, gfp); + if (!clone) + return NULL; + + refcount_set(&clone->refs, 1); + RB_CLEAR_NODE(&clone->rb_node); + + return clone; +} + static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { struct btrfs_fs_info *info = trans->fs_info; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; + struct btrfs_chunk_map *map; struct btrfs_block_group *block_group; - struct extent_map *em; u64 start = ctl->start; u64 type = ctl->type; int ret; int i; int j; - map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); + map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS); if (!map) return ERR_PTR(-ENOMEM); + + map->start = start; + map->chunk_len = ctl->chunk_size; + map->stripe_size = ctl->stripe_size; + map->type = type; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; + map->sub_stripes = ctl->sub_stripes; map->num_stripes = ctl->num_stripes; for (i = 0; i < ctl->ndevs; ++i) { @@ -5375,41 +5564,22 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, j * ctl->stripe_size; } } - map->io_align = BTRFS_STRIPE_LEN; - map->io_width = BTRFS_STRIPE_LEN; - map->type = type; - map->sub_stripes = ctl->sub_stripes; trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); - em = alloc_extent_map(); - if (!em) { - kfree(map); - return ERR_PTR(-ENOMEM); - } - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->map_lookup = map; - em->start = start; - em->len = ctl->chunk_size; - em->block_start = 0; - em->block_len = em->len; - em->orig_block_len = ctl->stripe_size; - - em_tree = &info->mapping_tree; - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_chunk_map(info, map); if (ret) { - write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_chunk_map(map); return ERR_PTR(ret); } - write_unlock(&em_tree->lock); block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); - if (IS_ERR(block_group)) - goto error_del_extent; + if (IS_ERR(block_group)) { + 
btrfs_remove_chunk_map(info, map); + return block_group; + } - for (i = 0; i < map->num_stripes; i++) { + for (int i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; btrfs_device_set_bytes_used(dev, @@ -5422,23 +5592,10 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, atomic64_sub(ctl->stripe_size * map->num_stripes, &info->free_chunk_space); - free_extent_map(em); check_raid56_incompat_flag(info, type); check_raid1c34_incompat_flag(info, type); return block_group; - -error_del_extent: - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* One for our allocation */ - free_extent_map(em); - /* One for the tree reference */ - free_extent_map(em); - - return block_group; } struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, @@ -5514,8 +5671,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; size_t item_size; int i; int ret; @@ -5544,14 +5700,13 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, */ lockdep_assert_held(&fs_info->chunk_mutex); - em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, bg->start, bg->length); + if (IS_ERR(map)) { + ret = PTR_ERR(map); btrfs_abort_transaction(trans, ret); return ret; } - map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); chunk = kzalloc(item_size, GFP_NOFS); @@ -5608,7 +5763,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, out: kfree(chunk); - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -5653,7 +5808,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) return 0; } -static inline int btrfs_chunk_max_errors(struct map_lookup *map) +static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map) { const int index = btrfs_bg_flags_to_raid_index(map->type); @@ -5662,17 +5817,15 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map) bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; int miss_ndevs = 0; int i; bool ret = true; - em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - if (IS_ERR(em)) + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(map)) return false; - map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { if (test_bit(BTRFS_DEV_STATE_MISSING, &map->stripes[i].dev->dev_state)) { @@ -5693,38 +5846,37 @@ bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (miss_ndevs > btrfs_chunk_max_errors(map)) ret = false; end: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } -void btrfs_mapping_tree_free(struct extent_map_tree *tree) +void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) { - struct extent_map *em; + write_lock(&fs_info->mapping_tree_lock); + while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) { + struct btrfs_chunk_map *map; + struct rb_node *node; - while (1) { - write_lock(&tree->lock); - em = lookup_extent_mapping(tree, 0, (u64)-1); - if (em) - remove_extent_mapping(tree, em); - write_unlock(&tree->lock); - if (!em) - break; - /* once for us */ - free_extent_map(em); - /* once for the tree */ - 
free_extent_map(em); + node = rb_first_cached(&fs_info->mapping_tree); + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); + RB_CLEAR_NODE(&map->rb_node); + chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); + /* Once for the tree ref. */ + btrfs_free_chunk_map(map); + cond_resched_rwlock_write(&fs_info->mapping_tree_lock); } + write_unlock(&fs_info->mapping_tree_lock); } int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; enum btrfs_raid_types index; int ret = 1; - em = btrfs_get_chunk_map(fs_info, logical, len); - if (IS_ERR(em)) + map = btrfs_get_chunk_map(fs_info, logical, len); + if (IS_ERR(map)) /* * We could return errors for these cases, but that could get * ugly and we'd probably do the same thing which is just not do @@ -5733,7 +5885,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) */ return 1; - map = em->map_lookup; index = btrfs_bg_flags_to_raid_index(map->type); /* Non-RAID56, use their ncopies from btrfs_raid_array. */ @@ -5750,53 +5901,49 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * stripe under reconstruction. */ ret = map->num_stripes; - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; unsigned long len = fs_info->sectorsize; if (!btrfs_fs_incompat(fs_info, RAID56)) return len; - em = btrfs_get_chunk_map(fs_info, logical, len); + map = btrfs_get_chunk_map(fs_info, logical, len); - if (!WARN_ON(IS_ERR(em))) { - map = em->map_lookup; + if (!WARN_ON(IS_ERR(map))) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); - free_extent_map(em); + btrfs_free_chunk_map(map); } return len; } int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; int ret = 0; if (!btrfs_fs_incompat(fs_info, RAID56)) return 0; - em = btrfs_get_chunk_map(fs_info, logical, len); + map = btrfs_get_chunk_map(fs_info, logical, len); - if(!WARN_ON(IS_ERR(em))) { - map = em->map_lookup; + if (!WARN_ON(IS_ERR(map))) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; - free_extent_map(em); + btrfs_free_chunk_map(map); } return ret; } static int find_live_mirror(struct btrfs_fs_info *fs_info, - struct map_lookup *map, int first, + struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) { int i; @@ -5903,8 +6050,7 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; @@ -5922,11 +6068,9 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, int ret; int i; - em = btrfs_get_chunk_map(fs_info, logical, length); - if (IS_ERR(em)) - return ERR_CAST(em); - - map = em->map_lookup; + map = btrfs_get_chunk_map(fs_info, logical, length); + if (IS_ERR(map)) + return ERR_CAST(map); /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { @@ -5934,8 +6078,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, goto out_free_map; } - 
offset = logical - em->start; - length = min_t(u64, em->start + em->len - logical, length); + offset = logical - map->start; + length = min_t(u64, map->start + map->chunk_len - logical, length); *length_ret = length; /* @@ -6032,10 +6176,10 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, } } - free_extent_map(em); + btrfs_free_chunk_map(map); return stripes; out_free_map: - free_extent_map(em); + btrfs_free_chunk_map(map); return ERR_PTR(ret); } @@ -6133,17 +6277,16 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->replace_nr_stripes = nr_extra_stripes; } -static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, - u64 offset, u32 *stripe_nr, u64 *stripe_offset, - u64 *full_stripe_start) +static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, + struct btrfs_io_geometry *io_geom) { /* * Stripe_nr is the stripe where this block falls. stripe_offset is * the offset of this block in its stripe. */ - *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; - *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; - ASSERT(*stripe_offset < U32_MAX); + io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; + io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; + ASSERT(io_geom->stripe_offset < U32_MAX); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = @@ -6158,18 +6301,17 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * to go rounddown(), not round_down(), as nr_data_stripes is * not ensured to be power of 2. */ - *full_stripe_start = - btrfs_stripe_nr_to_offset( - rounddown(*stripe_nr, nr_data_stripes(map))); + io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset( + rounddown(io_geom->stripe_nr, nr_data_stripes(map))); - ASSERT(*full_stripe_start + full_stripe_len > offset); - ASSERT(*full_stripe_start <= offset); + ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset); + ASSERT(io_geom->raid56_full_stripe_start <= offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. */ - if (op == BTRFS_MAP_WRITE) - return full_stripe_len - (offset - *full_stripe_start); + if (io_geom->op == BTRFS_MAP_WRITE) + return full_stripe_len - (offset - io_geom->raid56_full_stripe_start); } /* @@ -6177,26 +6319,180 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * a single disk). 
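For orientation, the io_geometry fields filled in here are simple arithmetic on the chunk-relative offset: the low bits give the offset inside the stripe, the high bits give the stripe number, and a non-RAID56 I/O is capped so it never crosses a stripe boundary. A standalone sketch of that split, assuming the usual 64K stripe length (this models the fields, it is not the kernel code):

#include <stdio.h>
#include <stdint.h>

#define STRIPE_LEN              (64 * 1024ULL)  /* assumed, like BTRFS_STRIPE_LEN */
#define STRIPE_LEN_SHIFT        16
#define STRIPE_LEN_MASK         (STRIPE_LEN - 1)

int main(void)
{
        uint64_t offset = 200 * 1024;                   /* 200K into the chunk */
        uint64_t stripe_offset = offset & STRIPE_LEN_MASK;
        uint32_t stripe_nr = offset >> STRIPE_LEN_SHIFT;
        uint64_t max_len = STRIPE_LEN - stripe_offset;  /* stop at the stripe edge */

        printf("stripe_nr %u stripe_offset %llu max_len %llu\n",
               (unsigned)stripe_nr,
               (unsigned long long)stripe_offset,
               (unsigned long long)max_len);
        return 0;
}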
*/ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) - return BTRFS_STRIPE_LEN - *stripe_offset; + return BTRFS_STRIPE_LEN - io_geom->stripe_offset; return U64_MAX; } -static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, struct btrfs_io_stripe *dst, - struct map_lookup *map, u32 stripe_index, - u64 stripe_offset, u64 stripe_nr) +static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical, + u64 *length, struct btrfs_io_stripe *dst, + struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) { - dst->dev = map->stripes[stripe_index].dev; + dst->dev = map->stripes[io_geom->stripe_index].dev; - if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type)) + if (io_geom->op == BTRFS_MAP_READ && + btrfs_need_stripe_tree_update(fs_info, map->type)) return btrfs_get_raid_extent_offset(fs_info, logical, length, - map->type, stripe_index, dst); + map->type, + io_geom->stripe_index, dst); - dst->physical = map->stripes[stripe_index].physical + - stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); + dst->physical = map->stripes[io_geom->stripe_index].physical + + io_geom->stripe_offset + + btrfs_stripe_nr_to_offset(io_geom->stripe_nr); return 0; } +static bool is_single_device_io(struct btrfs_fs_info *fs_info, + const struct btrfs_io_stripe *smap, + const struct btrfs_chunk_map *map, + int num_alloc_stripes, + enum btrfs_map_op op, int mirror_num) +{ + if (!smap) + return false; + + if (num_alloc_stripes != 1) + return false; + + if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ) + return false; + + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) + return false; + + return true; +} + +static void map_blocks_raid0(const struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; + io_geom->stripe_nr /= map->num_stripes; + if (io_geom->op == BTRFS_MAP_READ) + io_geom->mirror_num = 1; +} + +static void map_blocks_raid1(struct btrfs_fs_info *fs_info, + struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom, + bool dev_replace_is_ongoing) +{ + if (io_geom->op != BTRFS_MAP_READ) { + io_geom->num_stripes = map->num_stripes; + return; + } + + if (io_geom->mirror_num) { + io_geom->stripe_index = io_geom->mirror_num - 1; + return; + } + + io_geom->stripe_index = find_live_mirror(fs_info, map, 0, + dev_replace_is_ongoing); + io_geom->mirror_num = io_geom->stripe_index + 1; +} + +static void map_blocks_dup(const struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + if (io_geom->op != BTRFS_MAP_READ) { + io_geom->num_stripes = map->num_stripes; + return; + } + + if (io_geom->mirror_num) { + io_geom->stripe_index = io_geom->mirror_num - 1; + return; + } + + io_geom->mirror_num = 1; +} + +static void map_blocks_raid10(struct btrfs_fs_info *fs_info, + struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom, + bool dev_replace_is_ongoing) +{ + u32 factor = map->num_stripes / map->sub_stripes; + int old_stripe_index; + + io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes; + io_geom->stripe_nr /= factor; + + if (io_geom->op != BTRFS_MAP_READ) { + io_geom->num_stripes = map->sub_stripes; + return; + } + + if (io_geom->mirror_num) { + io_geom->stripe_index += io_geom->mirror_num - 1; + return; + } + + old_stripe_index = io_geom->stripe_index; + io_geom->stripe_index = find_live_mirror(fs_info, map, + io_geom->stripe_index, + dev_replace_is_ongoing); + 
io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1; +} + +static void map_blocks_raid56_write(struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom, + u64 logical, u64 *length) +{ + int data_stripes = nr_data_stripes(map); + + /* + * Needs full stripe mapping. + * + * Push stripe_nr back to the start of the full stripe For those cases + * needing a full stripe, @stripe_nr is the full stripe number. + * + * Originally we go raid56_full_stripe_start / full_stripe_len, but + * that can be expensive. Here we just divide @stripe_nr with + * @data_stripes. + */ + io_geom->stripe_nr /= data_stripes; + + /* RAID[56] write or recovery. Return all stripes */ + io_geom->num_stripes = map->num_stripes; + io_geom->max_errors = btrfs_chunk_max_errors(map); + + /* Return the length to the full stripe end. */ + *length = min(logical + *length, + io_geom->raid56_full_stripe_start + map->start + + btrfs_stripe_nr_to_offset(data_stripes)) - + logical; + io_geom->stripe_index = 0; + io_geom->stripe_offset = 0; +} + +static void map_blocks_raid56_read(struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + int data_stripes = nr_data_stripes(map); + + ASSERT(io_geom->mirror_num <= 1); + /* Just grab the data stripe directly. */ + io_geom->stripe_index = io_geom->stripe_nr % data_stripes; + io_geom->stripe_nr /= data_stripes; + + /* We distribute the parity blocks across stripes. */ + io_geom->stripe_index = + (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes; + + if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1) + io_geom->mirror_num = 1; +} + +static void map_blocks_single(const struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; + io_geom->stripe_nr /= map->num_stripes; + io_geom->mirror_num = io_geom->stripe_index + 1; +} + /* * Map one logical range to one or more physical ranges. * @@ -6237,43 +6533,37 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_io_context **bioc_ret, struct btrfs_io_stripe *smap, int *mirror_num_ret) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; + struct btrfs_io_geometry io_geom = { 0 }; u64 map_offset; - u64 stripe_offset; - u32 stripe_nr; - u32 stripe_index; - int data_stripes; int i; int ret = 0; - int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); - int num_stripes; int num_copies; - int max_errors = 0; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; u16 num_alloc_stripes; - u64 raid56_full_stripe_start = (u64)-1; u64 max_len; ASSERT(bioc_ret); + io_geom.mirror_num = (mirror_num_ret ? 
*mirror_num_ret : 0); + io_geom.num_stripes = 1; + io_geom.stripe_index = 0; + io_geom.op = op; + num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); - if (mirror_num > num_copies) + if (io_geom.mirror_num > num_copies) return -EINVAL; - em = btrfs_get_chunk_map(fs_info, logical, *length); - if (IS_ERR(em)) - return PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, logical, *length); + if (IS_ERR(map)) + return PTR_ERR(map); - map = em->map_lookup; - data_stripes = nr_data_stripes(map); - - map_offset = logical - em->start; - max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, - &stripe_offset, &raid56_full_stripe_start); - *length = min_t(u64, em->len - map_offset, max_len); + map_offset = logical - map->start; + io_geom.raid56_full_stripe_start = (u64)-1; + max_len = btrfs_max_io_len(map, map_offset, &io_geom); + *length = min_t(u64, map->chunk_len - map_offset, max_len); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); @@ -6284,107 +6574,46 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (!dev_replace_is_ongoing) up_read(&dev_replace->rwsem); - num_stripes = 1; - stripe_index = 0; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_index = stripe_nr % map->num_stripes; - stripe_nr /= map->num_stripes; - if (op == BTRFS_MAP_READ) - mirror_num = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - if (op != BTRFS_MAP_READ) { - num_stripes = map->num_stripes; - } else if (mirror_num) { - stripe_index = mirror_num - 1; - } else { - stripe_index = find_live_mirror(fs_info, map, 0, - dev_replace_is_ongoing); - mirror_num = stripe_index + 1; - } - - } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op != BTRFS_MAP_READ) { - num_stripes = map->num_stripes; - } else if (mirror_num) { - stripe_index = mirror_num - 1; - } else { - mirror_num = 1; - } - - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - u32 factor = map->num_stripes / map->sub_stripes; - - stripe_index = (stripe_nr % factor) * map->sub_stripes; - stripe_nr /= factor; - - if (op != BTRFS_MAP_READ) - num_stripes = map->sub_stripes; - else if (mirror_num) - stripe_index += mirror_num - 1; - else { - int old_stripe_index = stripe_index; - stripe_index = find_live_mirror(fs_info, map, - stripe_index, - dev_replace_is_ongoing); - mirror_num = stripe_index - old_stripe_index + 1; - } - - } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (op != BTRFS_MAP_READ || mirror_num > 1) { - /* - * Needs full stripe mapping. - * - * Push stripe_nr back to the start of the full stripe - * For those cases needing a full stripe, @stripe_nr - * is the full stripe number. - * - * Originally we go raid56_full_stripe_start / full_stripe_len, - * but that can be expensive. Here we just divide - * @stripe_nr with @data_stripes. - */ - stripe_nr /= data_stripes; - - /* RAID[56] write or recovery. Return all stripes */ - num_stripes = map->num_stripes; - max_errors = btrfs_chunk_max_errors(map); - - /* Return the length to the full stripe end */ - *length = min(logical + *length, - raid56_full_stripe_start + em->start + - btrfs_stripe_nr_to_offset(data_stripes)) - - logical; - stripe_index = 0; - stripe_offset = 0; - } else { - ASSERT(mirror_num <= 1); - /* Just grab the data stripe directly. 
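Both the branch being removed here and the new map_blocks_raid56_read() helper pick the device the same way: locate the data stripe inside the full stripe, then rotate by the full-stripe number so parity moves across devices. A toy model of that selection for a three-device RAID5 layout (names and numbers are illustrative only):

#include <stdio.h>

static int raid56_read_device(unsigned int stripe_nr, int num_stripes)
{
        int data_stripes = num_stripes - 1;     /* RAID5: one parity stripe */
        int stripe_index = stripe_nr % data_stripes;

        stripe_nr /= data_stripes;              /* full-stripe number */
        return (stripe_nr + stripe_index) % num_stripes;
}

int main(void)
{
        for (unsigned int nr = 0; nr < 6; nr++)
                printf("data stripe %u -> device slot %d\n",
                       nr, raid56_read_device(nr, 3));
        return 0;
}

Running it shows the parity slot skipping around the three devices as the full-stripe number grows, which is the rotation the comment below refers to.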
*/ - stripe_index = stripe_nr % data_stripes; - stripe_nr /= data_stripes; - - /* We distribute the parity blocks across stripes */ - stripe_index = (stripe_nr + stripe_index) % map->num_stripes; - if (op == BTRFS_MAP_READ && mirror_num < 1) - mirror_num = 1; - } - } else { + switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case BTRFS_BLOCK_GROUP_RAID0: + map_blocks_raid0(map, &io_geom); + break; + case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID1C3: + case BTRFS_BLOCK_GROUP_RAID1C4: + map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing); + break; + case BTRFS_BLOCK_GROUP_DUP: + map_blocks_dup(map, &io_geom); + break; + case BTRFS_BLOCK_GROUP_RAID10: + map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing); + break; + case BTRFS_BLOCK_GROUP_RAID5: + case BTRFS_BLOCK_GROUP_RAID6: + if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1) + map_blocks_raid56_write(map, &io_geom, logical, length); + else + map_blocks_raid56_read(map, &io_geom); + break; + default: /* * After this, stripe_nr is the number of stripes on this * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - stripe_index = stripe_nr % map->num_stripes; - stripe_nr /= map->num_stripes; - mirror_num = stripe_index + 1; + map_blocks_single(map, &io_geom); + break; } - if (stripe_index >= map->num_stripes) { + if (io_geom.stripe_index >= map->num_stripes) { btrfs_crit(fs_info, "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", - stripe_index, map->num_stripes); + io_geom.stripe_index, map->num_stripes); ret = -EINVAL; goto out; } - num_alloc_stripes = num_stripes; + num_alloc_stripes = io_geom.num_stripes; if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) /* @@ -6401,14 +6630,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * physical block information on the stack instead of allocating an * I/O context structure. */ - if (smap && num_alloc_stripes == 1 && - !(btrfs_need_stripe_tree_update(fs_info, map->type) && - op != BTRFS_MAP_READ) && - !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) { - ret = set_io_stripe(fs_info, op, logical, length, smap, map, - stripe_index, stripe_offset, stripe_nr); + if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, + io_geom.mirror_num)) { + ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom); if (mirror_num_ret) - *mirror_num_ret = mirror_num; + *mirror_num_ret = io_geom.mirror_num; *bioc_ret = NULL; goto out; } @@ -6428,7 +6654,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * It's still mostly the same as other profiles, just with extra rotation. */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && - (op != BTRFS_MAP_READ || mirror_num > 1)) { + (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) { /* * For RAID56 @stripe_nr is already the number of full stripes * before us, which is also the rotation value (needs to modulo @@ -6437,28 +6663,31 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * In this case, we just add @stripe_nr with @i, then do the * modulo, to reduce one modulo call. 
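For the RAID56 write path the same rotation is applied while filling the stripes: the full-stripe number is the rotation value, so adding it to the loop index before a single modulo picks the device slot, and each destination's physical address is that device's chunk stripe plus the in-stripe offset plus whole stripes already consumed. A toy illustration with an invented device layout:

#include <stdio.h>
#include <stdint.h>

#define STRIPE_LEN (64 * 1024ULL)       /* assumed 64K stripe */

int main(void)
{
        uint64_t dev_physical[3] = { 1 << 20, 2 << 20, 3 << 20 };
        int num_stripes = 3;
        unsigned int stripe_nr = 5;     /* full stripes before this one */
        uint64_t stripe_offset = 0;     /* full-stripe writes start at offset 0 */

        for (int i = 0; i < num_stripes; i++) {
                int idx = (i + stripe_nr) % num_stripes;        /* one modulo */
                uint64_t physical = dev_physical[idx] + stripe_offset +
                                    stripe_nr * STRIPE_LEN;

                printf("stripe %d -> device slot %d physical %llu\n",
                       i, idx, (unsigned long long)physical);
        }
        return 0;
}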
*/ - bioc->full_stripe_logical = em->start + - btrfs_stripe_nr_to_offset(stripe_nr * data_stripes); - for (int i = 0; i < num_stripes; i++) { - ret = set_io_stripe(fs_info, op, logical, length, - &bioc->stripes[i], map, - (i + stripe_nr) % num_stripes, - stripe_offset, stripe_nr); - if (ret < 0) - break; + bioc->full_stripe_logical = map->start + + btrfs_stripe_nr_to_offset(io_geom.stripe_nr * + nr_data_stripes(map)); + for (int i = 0; i < io_geom.num_stripes; i++) { + struct btrfs_io_stripe *dst = &bioc->stripes[i]; + u32 stripe_index; + + stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes; + dst->dev = map->stripes[stripe_index].dev; + dst->physical = + map->stripes[stripe_index].physical + + io_geom.stripe_offset + + btrfs_stripe_nr_to_offset(io_geom.stripe_nr); } } else { /* * For all other non-RAID56 profiles, just copy the target * stripe into the bioc. */ - for (i = 0; i < num_stripes; i++) { - ret = set_io_stripe(fs_info, op, logical, length, - &bioc->stripes[i], map, stripe_index, - stripe_offset, stripe_nr); + for (i = 0; i < io_geom.num_stripes; i++) { + ret = set_io_stripe(fs_info, logical, length, + &bioc->stripes[i], map, &io_geom); if (ret < 0) break; - stripe_index++; + io_geom.stripe_index++; } } @@ -6469,18 +6698,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } if (op != BTRFS_MAP_READ) - max_errors = btrfs_chunk_max_errors(map); + io_geom.max_errors = btrfs_chunk_max_errors(map); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) { handle_ops_on_dev_replace(op, bioc, dev_replace, logical, - &num_stripes, &max_errors); + &io_geom.num_stripes, &io_geom.max_errors); } *bioc_ret = bioc; - bioc->num_stripes = num_stripes; - bioc->max_errors = max_errors; - bioc->mirror_num = mirror_num; + bioc->num_stripes = io_geom.num_stripes; + bioc->max_errors = io_geom.max_errors; + bioc->mirror_num = io_geom.mirror_num; out: if (dev_replace_is_ongoing) { @@ -6488,7 +6717,7 @@ out: /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); } - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -6660,12 +6889,11 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, devid, uuid); } -u64 btrfs_calc_stripe_length(const struct extent_map *em) +u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map) { - const struct map_lookup *map = em->map_lookup; const int data_stripes = calc_data_stripes(map->type, map->num_stripes); - return div_u64(em->len, data_stripes); + return div_u64(map->chunk_len, data_stripes); } #if BITS_PER_LONG == 32 @@ -6734,9 +6962,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; + struct btrfs_chunk_map *map; u64 logical; u64 length; u64 devid; @@ -6770,35 +6996,22 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, return ret; } - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, logical, 1); - read_unlock(&map_tree->lock); + map = btrfs_find_chunk_map(fs_info, logical, 1); /* already mapped? 
*/ - if (em && em->start <= logical && em->start + em->len > logical) { - free_extent_map(em); + if (map && map->start <= logical && map->start + map->chunk_len > logical) { + btrfs_free_chunk_map(map); return 0; - } else if (em) { - free_extent_map(em); + } else if (map) { + btrfs_free_chunk_map(map); } - em = alloc_extent_map(); - if (!em) - return -ENOMEM; - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) { - free_extent_map(em); + map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS); + if (!map) return -ENOMEM; - } - - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->map_lookup = map; - em->start = logical; - em->len = length; - em->orig_start = 0; - em->block_start = 0; - em->block_len = em->len; + map->start = logical; + map->chunk_len = length; map->num_stripes = num_stripes; map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); @@ -6813,7 +7026,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, */ map->sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; - em->orig_block_len = btrfs_calc_stripe_length(em); + map->stripe_size = btrfs_calc_stripe_length(map); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -6829,7 +7042,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, devid, uuid); if (IS_ERR(map->stripes[i].dev)) { ret = PTR_ERR(map->stripes[i].dev); - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } } @@ -6838,15 +7051,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, &(map->stripes[i].dev->dev_state)); } - write_lock(&map_tree->lock); - ret = add_extent_mapping(map_tree, em, 0); - write_unlock(&map_tree->lock); + ret = btrfs_add_chunk_map(fs_info, map); if (ret < 0) { btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", - em->start, em->len, ret); + map->start, map->chunk_len, ret); } - free_extent_map(em); return ret; } @@ -7156,26 +7366,21 @@ out_short_read: bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev) { - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct extent_map *em; - u64 next_start = 0; + struct btrfs_chunk_map *map; + u64 next_start; bool ret = true; - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, 0, (u64)-1); - read_unlock(&map_tree->lock); + map = btrfs_find_chunk_map(fs_info, 0, U64_MAX); /* No chunk at all? 
Return false anyway */ - if (!em) { + if (!map) { ret = false; goto out; } - while (em) { - struct map_lookup *map; + while (map) { int missing = 0; int max_tolerated; int i; - map = em->map_lookup; max_tolerated = btrfs_get_num_tolerated_disk_barrier_failures( map->type); @@ -7193,18 +7398,15 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, if (!failing_dev) btrfs_warn(fs_info, "chunk %llu missing %d devices, max tolerance is %d for writable mount", - em->start, missing, max_tolerated); - free_extent_map(em); + map->start, missing, max_tolerated); + btrfs_free_chunk_map(map); ret = false; goto out; } - next_start = extent_map_end(em); - free_extent_map(em); + next_start = map->start + map->chunk_len; + btrfs_free_chunk_map(map); - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, next_start, - (u64)(-1) - next_start); - read_unlock(&map_tree->lock); + map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start); } out: return ret; @@ -7697,20 +7899,15 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 physical_offset, u64 physical_len) { struct btrfs_dev_lookup_args args = { .devid = devid }; - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; struct btrfs_device *dev; u64 stripe_len; bool found = false; int ret = 0; int i; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - - if (!em) { + map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); + if (!map) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", physical_offset, devid); @@ -7718,12 +7915,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, goto out; } - map = em->map_lookup; - stripe_len = btrfs_calc_stripe_length(em); + stripe_len = btrfs_calc_stripe_length(map); if (physical_len != stripe_len) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", - physical_offset, devid, em->start, physical_len, + physical_offset, devid, map->start, physical_len, stripe_len); ret = -EUCLEAN; goto out; @@ -7746,7 +7942,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, if (map->verified_stripes >= map->num_stripes) { btrfs_err(fs_info, "too many dev extents for chunk %llu found", - em->start); + map->start); ret = -EUCLEAN; goto out; } @@ -7792,32 +7988,30 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } out: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; struct rb_node *node; int ret = 0; - read_lock(&em_tree->lock); - for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { - em = rb_entry(node, struct extent_map, rb_node); - if (em->map_lookup->num_stripes != - em->map_lookup->verified_stripes) { + read_lock(&fs_info->mapping_tree_lock); + for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { + struct btrfs_chunk_map *map; + + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + if (map->num_stripes != map->verified_stripes) { btrfs_err(fs_info, "chunk %llu has missing dev extent, have %d expect %d", - em->start, em->map_lookup->verified_stripes, - em->map_lookup->num_stripes); + map->start, map->verified_stripes, 
map->num_stripes); ret = -EUCLEAN; goto out; } } out: - read_unlock(&em_tree->lock); + read_unlock(&fs_info->mapping_tree_lock); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 9cc374864a79..53f87f398da7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -426,7 +426,8 @@ struct btrfs_discard_stripe { struct btrfs_io_context { refcount_t refs; struct btrfs_fs_info *fs_info; - u64 map_type; /* get from map_lookup->type */ + /* Taken from struct btrfs_chunk_map::type. */ + u64 map_type; struct bio *orig_bio; atomic_t error; u16 max_errors; @@ -529,18 +530,32 @@ struct btrfs_raid_attr { extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; -struct map_lookup { +struct btrfs_chunk_map { + struct rb_node rb_node; + /* For mount time dev extent verification. */ + int verified_stripes; + refcount_t refs; + u64 start; + u64 chunk_len; + u64 stripe_size; u64 type; int io_align; int io_width; int num_stripes; int sub_stripes; - int verified_stripes; /* For mount time dev extent verification */ struct btrfs_io_stripe stripes[]; }; -#define map_lookup_size(n) (sizeof(struct map_lookup) + \ - (sizeof(struct btrfs_io_stripe) * (n))) +#define btrfs_chunk_map_size(n) (sizeof(struct btrfs_chunk_map) + \ + (sizeof(struct btrfs_io_stripe) * (n))) + +static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map) +{ + if (map && refcount_dec_and_test(&map->refs)) { + ASSERT(RB_EMPTY_NODE(&map->rb_node)); + kfree(map); + } +} struct btrfs_balance_args; struct btrfs_balance_progress; @@ -598,7 +613,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) } /* - * Do the type safe converstion from stripe_nr to offset inside the chunk. + * Do the type safe conversion from stripe_nr to offset inside the chunk. * * @stripe_nr is u32, with left shift it can overflow u32 for chunks larger * than 4G. This does the proper type cast to avoid overflow. 
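For readers following the extent_map to btrfs_chunk_map conversion above, a minimal caller-side sketch of the new lookup/release pattern. This is illustrative only and not part of the patch: the example_lookup_chunk() name is an assumption, while btrfs_find_chunk_map(), btrfs_free_chunk_map() and the struct btrfs_chunk_map fields are taken from the hunks in this file and the header changes below.

/* Sketch only; assumes the declarations added to fs/btrfs/volumes.h by this series. */
static int example_lookup_chunk(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_chunk_map *map;

	/* Returns a referenced map covering @logical, or NULL if no chunk maps it. */
	map = btrfs_find_chunk_map(fs_info, logical, 1);
	if (!map)
		return -EINVAL;

	/* ... inspect map->start, map->chunk_len, map->num_stripes, map->stripes[] ... */

	/*
	 * Drop the reference; as in the converted callers above, every
	 * btrfs_find_chunk_map() is paired with btrfs_free_chunk_map(),
	 * which frees the map once the last holder releases it.
	 */
	btrfs_free_chunk_map(map);
	return 0;
}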
@@ -624,7 +639,7 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); -void btrfs_mapping_tree_free(struct extent_map_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, @@ -680,13 +695,25 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); -u64 btrfs_calc_stripe_length(const struct extent_map *em); +u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); int btrfs_nr_parity_stripes(u64 type); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); -struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp); +int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); +#endif + +struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp); +struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); void btrfs_release_disk_super(struct btrfs_super_block *super); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3cf236fb40a4..6287763fdccc 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -382,6 +382,53 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, return btrfs_setxattr_trans(inode, name, buffer, size, flags); } +static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler, + struct dentry *unused, + struct inode *inode, + const char *name, void *buffer, + size_t size) +{ + int ret; + bool is_cap = false; + + name = xattr_full_name(handler, name); + + /* + * security.capability doesn't cache the results, so calls into us + * constantly to see if there's a capability xattr. Cache the result + * here in order to avoid wasting time doing lookups for xattrs we know + * don't exist. 
+ */ + if (strcmp(name, XATTR_NAME_CAPS) == 0) { + is_cap = true; + if (test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags)) + return -ENODATA; + } + + ret = btrfs_getxattr(inode, name, buffer, size); + if (ret == -ENODATA && is_cap) + set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + return ret; +} + +static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, + struct inode *inode, + const char *name, + const void *buffer, + size_t size, int flags) +{ + if (btrfs_root_readonly(BTRFS_I(inode)->root)) + return -EROFS; + + name = xattr_full_name(handler, name); + if (strcmp(name, XATTR_NAME_CAPS) == 0) + clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + + return btrfs_setxattr_trans(inode, name, buffer, size, flags); +} + static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, @@ -420,8 +467,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, static const struct xattr_handler btrfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = btrfs_xattr_handler_get, - .set = btrfs_xattr_handler_set, + .get = btrfs_xattr_handler_get_security, + .set = btrfs_xattr_handler_set_security, }; static const struct xattr_handler btrfs_trusted_xattr_handler = { @@ -473,6 +520,10 @@ static int btrfs_initxattrs(struct inode *inode, } strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + + if (strcmp(name, XATTR_NAME_CAPS) == 0) + clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + err = btrfs_setxattr(trans, inode, name, xattr->value, xattr->value_len, 0); kfree(name); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 6c231a116a29..36cf1f0e338e 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -121,7 +121,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -200,7 +200,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -236,7 +236,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 188378ca19c7..5bd76813b23f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -578,26 +578,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) kvfree(zones); - switch (bdev_zoned_model(bdev)) { - case BLK_ZONED_HM: + if (bdev_is_zoned(bdev)) { model = "host-managed zoned"; emulated = ""; - break; - case BLK_ZONED_HA: - model = "host-aware zoned"; - emulated = ""; - break; - case BLK_ZONED_NONE: + } else { model = "regular"; emulated = "emulated "; - break; - default: - /* Just in case */ - btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s", - bdev_zoned_model(bdev), - rcu_str_deref(device->name)); - ret = -EOPNOTSUPP; - goto out_free_zone_info; } btrfs_info_in_rcu(fs_info, @@ -609,9 +595,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device 
*device, bool populate_cache) out: kvfree(zones); -out_free_zone_info: btrfs_destroy_dev_zone_info(device); - return ret; } @@ -688,8 +672,7 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) struct btrfs_device *device; list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { - if (device->bdev && - bdev_zoned_model(device->bdev) == BLK_ZONED_HM) { + if (device->bdev && bdev_is_zoned(device->bdev)) { btrfs_err(fs_info, "zoned: mode not enabled but zoned device found: %pg", device->bdev); @@ -781,7 +764,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) * Check mount options here, because we might change fs_info->zoned * from fs_info->zone_size. */ - ret = btrfs_check_mountopts_zoned(fs_info); + ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt); if (ret) return ret; @@ -789,7 +772,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt) { if (!btrfs_is_zoned(info)) return 0; @@ -798,18 +781,21 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) * Space cache writing is not COWed. Disable that to avoid write errors * in sequential zones. */ - if (btrfs_test_opt(info, SPACE_CACHE)) { + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { btrfs_err(info, "zoned: space cache v1 is not supported"); return -EINVAL; } - if (btrfs_test_opt(info, NODATACOW)) { + if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) { btrfs_err(info, "zoned: NODATACOW not supported"); return -EINVAL; } - btrfs_clear_and_info(info, DISCARD_ASYNC, - "zoned: async discard ignored and disabled for zoned mode"); + if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) { + btrfs_info(info, + "zoned: async discard ignored and disabled for zoned mode"); + btrfs_clear_opt(*mount_opt, DISCARD_ASYNC); + } return 0; } @@ -1290,7 +1276,7 @@ struct zone_info { static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct zone_info *info, unsigned long *active, - struct map_lookup *map) + struct btrfs_chunk_map *map) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *device = map->stripes[zone_idx].dev; @@ -1393,7 +1379,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, } static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { @@ -1435,7 +1421,7 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, } static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { @@ -1483,7 +1469,7 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, } static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { @@ -1515,7 +1501,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { @@ -1552,9 +1538,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) { 
struct btrfs_fs_info *fs_info = cache->fs_info; - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 logical = cache->start; u64 length = cache->length; struct zone_info *zone_info = NULL; @@ -1575,17 +1559,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) return -EIO; } - /* Get the chunk mapping */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, length); - read_unlock(&em_tree->lock); - - if (!em) + map = btrfs_find_chunk_map(fs_info, logical, length); + if (!map) return -EINVAL; - map = em->map_lookup; - - cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS); + cache->physical_map = btrfs_clone_chunk_map(map, GFP_NOFS); if (!cache->physical_map) { ret = -ENOMEM; goto out; @@ -1687,12 +1665,11 @@ out: spin_unlock(&fs_info->zone_active_bgs_lock); } } else { - kfree(cache->physical_map); + btrfs_free_chunk_map(cache->physical_map); cache->physical_map = NULL; } bitmap_free(active); kfree(zone_info); - free_extent_map(em); return ret; } @@ -1715,22 +1692,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) cache->zone_unusable = unusable; } -void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb) -{ - if (!btrfs_is_zoned(eb->fs_info) || - btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)) - return; - - ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - - memzero_extent_buffer(eb, 0, eb->len); - set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); - set_extent_buffer_dirty(eb); - set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, - EXTENT_DIRTY, NULL); -} - bool btrfs_use_zone_append(struct btrfs_bio *bbio) { u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); @@ -2082,7 +2043,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct map_lookup *map; + struct btrfs_chunk_map *map; struct btrfs_device *device; u64 physical; const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA); @@ -2194,7 +2155,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group) static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct map_lookup *map; + struct btrfs_chunk_map *map; const bool is_metadata = (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); int ret = 0; @@ -2643,7 +2604,7 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) /* Release reservation for currently active block groups. 
*/ spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { - struct map_lookup *map = block_group->physical_map; + struct btrfs_chunk_map *map = block_group->physical_map; if (!(block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index b9cec523b778..f573bda496fb 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -45,7 +45,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); -int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt); int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, u64 *bytenr_ret); int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, @@ -59,8 +59,6 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); -void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, @@ -123,7 +121,8 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info) return -EOPNOTSUPP; } -static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, + unsigned long *mount_opt) { return 0; } @@ -180,9 +179,6 @@ static inline int btrfs_load_block_group_zone_info( static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } -static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb) { } - static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) { return false; @@ -323,8 +319,8 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i (bdev_zone_sectors(bdev) << SECTOR_SHIFT); } - /* Do not allow Host Manged zoned device */ - return bdev_zoned_model(bdev) != BLK_ZONED_HM; + /* Do not allow Host Managed zoned device. 
*/ + return !bdev_is_zoned(bdev); } static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 5511766485cd..0d66db8bc1d4 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -410,9 +410,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); - /* Allocate and map in the output buffer */ - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -457,7 +456,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -514,7 +513,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; diff --git a/fs/buffer.c b/fs/buffer.c index 967f34b70aa8..d3bcf601d3e5 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -180,11 +180,11 @@ EXPORT_SYMBOL(end_buffer_write_sync); * Various filesystems appear to want __find_get_block to be non-blocking. * But it's the page lock which protects the buffers. To get around this, * we get exclusion from try_to_free_buffers with the blockdev mapping's - * private_lock. + * i_private_lock. * - * Hack idea: for the blockdev mapping, private_lock contention + * Hack idea: for the blockdev mapping, i_private_lock contention * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take private_lock. + * succeeds, there is no need to take i_private_lock. */ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block) @@ -199,12 +199,12 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) int all_mapped = 1; static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1); - index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); + index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE; folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0); if (IS_ERR(folio)) goto out; - spin_lock(&bd_mapping->private_lock); + spin_lock(&bd_mapping->i_private_lock); head = folio_buffers(folio); if (!head) goto out_unlock; @@ -236,7 +236,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) 1 << bd_inode->i_blkbits); } out_unlock: - spin_unlock(&bd_mapping->private_lock); + spin_unlock(&bd_mapping->i_private_lock); folio_put(folio); out: return ret; @@ -372,10 +372,10 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) } /* - * Completion handler for block_write_full_page() - pages which are unlocked - * during I/O, and which have PageWriteback cleared upon I/O completion. + * Completion handler for block_write_full_folio() - folios which are unlocked + * during I/O, and which have the writeback flag cleared upon I/O completion. 
*/ -void end_buffer_async_write(struct buffer_head *bh, int uptodate) +static void end_buffer_async_write(struct buffer_head *bh, int uptodate) { unsigned long flags; struct buffer_head *first; @@ -415,7 +415,6 @@ still_busy: spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } -EXPORT_SYMBOL(end_buffer_async_write); /* * If a page's buffers are under async readin (end_buffer_async_read @@ -467,25 +466,25 @@ EXPORT_SYMBOL(mark_buffer_async_write); * * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the - * management of a list of dependent buffers at ->i_mapping->private_list. + * management of a list of dependent buffers at ->i_mapping->i_private_list. * * Locking is a little subtle: try_to_free_buffers() will remove buffers * from their controlling inode's queue when they are being freed. But * try_to_free_buffers() will be operating against the *blockdev* mapping * at the time, not against the S_ISREG file which depends on those buffers. - * So the locking for private_list is via the private_lock in the address_space + * So the locking for i_private_list is via the i_private_lock in the address_space * which backs the buffers. Which is different from the address_space * against which the buffers are listed. So for a particular address_space, - * mapping->private_lock does *not* protect mapping->private_list! In fact, - * mapping->private_list will always be protected by the backing blockdev's - * ->private_lock. + * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact, + * mapping->i_private_list will always be protected by the backing blockdev's + * ->i_private_lock. * * Which introduces a requirement: all buffers on an address_space's - * ->private_list must be from the same address_space: the blockdev's. + * ->i_private_list must be from the same address_space: the blockdev's. * - * address_spaces which do not place buffers at ->private_list via these - * utility functions are free to use private_lock and private_list for - * whatever they want. The only requirement is that list_empty(private_list) + * address_spaces which do not place buffers at ->i_private_list via these + * utility functions are free to use i_private_lock and i_private_list for + * whatever they want. The only requirement is that list_empty(i_private_list) * be true at clear_inode() time. * * FIXME: clear_inode should not call invalidate_inode_buffers(). The @@ -508,7 +507,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); */ /* - * The buffer's backing address_space's private_lock must be held + * The buffer's backing address_space's i_private_lock must be held */ static void __remove_assoc_queue(struct buffer_head *bh) { @@ -519,7 +518,7 @@ static void __remove_assoc_queue(struct buffer_head *bh) int inode_has_buffers(struct inode *inode) { - return !list_empty(&inode->i_data.private_list); + return !list_empty(&inode->i_data.i_private_list); } /* @@ -561,7 +560,7 @@ repeat: * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written * - * Starts I/O against the buffers at mapping->private_list, and waits upon + * Starts I/O against the buffers at mapping->i_private_list, and waits upon * that I/O. * * Basically, this is a convenience function for fsync(). 
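As an aside on the private_list/private_lock to i_private_list/i_private_lock rename in this file, here is a short, hypothetical sketch of how a buffer-head based filesystem typically feeds that list and drains it at fsync time. The example_* names are assumptions; mark_buffer_dirty_inode() and sync_mapping_buffers() appear in the hunks around this point, and file_write_and_wait_range()/file_inode() are existing kernel helpers.

/* Illustrative only; assumes <linux/buffer_head.h> and <linux/fs.h>. */
static void example_dirty_metadata(struct inode *inode, struct buffer_head *bh)
{
	/*
	 * Marks bh dirty and queues it on inode->i_mapping->i_private_list,
	 * taking the backing blockdev mapping's i_private_lock as described
	 * in the comments above.
	 */
	mark_buffer_dirty_inode(bh, inode);
}

static int example_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file_inode(file);
	int err;

	err = file_write_and_wait_range(file, start, end);
	if (err)
		return err;

	/* Writes out and waits on every buffer on i_mapping->i_private_list. */
	return sync_mapping_buffers(inode->i_mapping);
}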
@@ -570,13 +569,13 @@ repeat: */ int sync_mapping_buffers(struct address_space *mapping) { - struct address_space *buffer_mapping = mapping->private_data; + struct address_space *buffer_mapping = mapping->i_private_data; - if (buffer_mapping == NULL || list_empty(&mapping->private_list)) + if (buffer_mapping == NULL || list_empty(&mapping->i_private_list)) return 0; - return fsync_buffers_list(&buffer_mapping->private_lock, - &mapping->private_list); + return fsync_buffers_list(&buffer_mapping->i_private_lock, + &mapping->i_private_list); } EXPORT_SYMBOL(sync_mapping_buffers); @@ -673,17 +672,17 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) struct address_space *buffer_mapping = bh->b_folio->mapping; mark_buffer_dirty(bh); - if (!mapping->private_data) { - mapping->private_data = buffer_mapping; + if (!mapping->i_private_data) { + mapping->i_private_data = buffer_mapping; } else { - BUG_ON(mapping->private_data != buffer_mapping); + BUG_ON(mapping->i_private_data != buffer_mapping); } if (!bh->b_assoc_map) { - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); list_move_tail(&bh->b_assoc_buffers, - &mapping->private_list); + &mapping->i_private_list); bh->b_assoc_map = mapping; - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } } EXPORT_SYMBOL(mark_buffer_dirty_inode); @@ -706,7 +705,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean * page on the dirty page list. * - * We use private_lock to lock against try_to_free_buffers while using the + * We use i_private_lock to lock against try_to_free_buffers while using the * page's buffer list. Also use this to protect against clean buffers being * added to the page after it was set dirty. * @@ -718,7 +717,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) struct buffer_head *head; bool newly_dirty; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; @@ -734,7 +733,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) */ folio_memcg_lock(folio); newly_dirty = !folio_test_set_dirty(folio); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (newly_dirty) __folio_mark_dirty(folio, mapping, 1); @@ -827,7 +826,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) smp_mb(); if (buffer_dirty(bh)) { list_add(&bh->b_assoc_buffers, - &mapping->private_list); + &mapping->i_private_list); bh->b_assoc_map = mapping; } spin_unlock(lock); @@ -851,7 +850,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * probably unmounting the fs, but that doesn't mean we have already * done a sync(). Just drop the buffers from the inode list. * - * NOTE: we take the inode's blockdev's mapping's private_lock. Which + * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which * assumes that all the buffers are against the blockdev. Not true * for reiserfs. 
*/ @@ -859,13 +858,13 @@ void invalidate_inode_buffers(struct inode *inode) { if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->private_data; + struct list_head *list = &mapping->i_private_list; + struct address_space *buffer_mapping = mapping->i_private_data; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) __remove_assoc_queue(BH_ENTRY(list->next)); - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } } EXPORT_SYMBOL(invalidate_inode_buffers); @@ -882,10 +881,10 @@ int remove_inode_buffers(struct inode *inode) if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->private_data; + struct list_head *list = &mapping->i_private_list; + struct address_space *buffer_mapping = mapping->i_private_data; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) { struct buffer_head *bh = BH_ENTRY(list->next); if (buffer_dirty(bh)) { @@ -894,7 +893,7 @@ int remove_inode_buffers(struct inode *inode) } __remove_assoc_queue(bh); } - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } return ret; } @@ -995,11 +994,12 @@ static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) * Initialise the state of a blockdev folio's buffers. */ static sector_t folio_init_buffers(struct folio *folio, - struct block_device *bdev, sector_t block, int size) + struct block_device *bdev, unsigned size) { struct buffer_head *head = folio_buffers(folio); struct buffer_head *bh = head; bool uptodate = folio_test_uptodate(folio); + sector_t block = div_u64(folio_pos(folio), size); sector_t end_block = blkdev_max_block(bdev, size); do { @@ -1024,86 +1024,88 @@ static sector_t folio_init_buffers(struct folio *folio, } /* - * Create the page-cache page that contains the requested block. + * Create the page-cache folio that contains the requested block. * * This is used purely for blockdev mappings. + * + * Returns false if we have a failure which cannot be cured by retrying + * without sleeping. Returns true if we succeeded, or the caller should retry. */ -static int -grow_dev_page(struct block_device *bdev, sector_t block, - pgoff_t index, int size, int sizebits, gfp_t gfp) +static bool grow_dev_folio(struct block_device *bdev, sector_t block, + pgoff_t index, unsigned size, gfp_t gfp) { struct inode *inode = bdev->bd_inode; struct folio *folio; struct buffer_head *bh; - sector_t end_block; - int ret = 0; + sector_t end_block = 0; folio = __filemap_get_folio(inode->i_mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) - return PTR_ERR(folio); + return false; bh = folio_buffers(folio); if (bh) { if (bh->b_size == size) { - end_block = folio_init_buffers(folio, bdev, - (sector_t)index << sizebits, size); - goto done; + end_block = folio_init_buffers(folio, bdev, size); + goto unlock; + } + + /* + * Retrying may succeed; for example the folio may finish + * writeback, or buffers may be cleaned. This should not + * happen very often; maybe we have old buffers attached to + * this blockdev's page cache and we're trying to change + * the block size? 
+ */ + if (!try_to_free_buffers(folio)) { + end_block = ~0ULL; + goto unlock; } - if (!try_to_free_buffers(folio)) - goto failed; } - ret = -ENOMEM; bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT); if (!bh) - goto failed; + goto unlock; /* * Link the folio to the buffers and initialise them. Take the * lock to be atomic wrt __find_get_block(), which does not * run under the folio lock. */ - spin_lock(&inode->i_mapping->private_lock); + spin_lock(&inode->i_mapping->i_private_lock); link_dev_buffers(folio, bh); - end_block = folio_init_buffers(folio, bdev, - (sector_t)index << sizebits, size); - spin_unlock(&inode->i_mapping->private_lock); -done: - ret = (block < end_block) ? 1 : -ENXIO; -failed: + end_block = folio_init_buffers(folio, bdev, size); + spin_unlock(&inode->i_mapping->i_private_lock); +unlock: folio_unlock(folio); folio_put(folio); - return ret; + return block < end_block; } /* - * Create buffers for the specified block device block's page. If - * that page was dirty, the buffers are set dirty also. + * Create buffers for the specified block device block's folio. If + * that folio was dirty, the buffers are set dirty also. Returns false + * if we've hit a permanent error. */ -static int -grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) +static bool grow_buffers(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { - pgoff_t index; - int sizebits; - - sizebits = PAGE_SHIFT - __ffs(size); - index = block >> sizebits; + loff_t pos; /* - * Check for a block which wants to lie outside our maximum possible - * pagecache index. (this comparison is done using sector_t types). + * Check for a block which lies outside our maximum possible + * pagecache index. */ - if (unlikely(index != block >> sizebits)) { - printk(KERN_ERR "%s: requested out-of-range block %llu for " - "device %pg\n", + if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) { + printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n", __func__, (unsigned long long)block, bdev); - return -EIO; + return false; } - /* Create a page with the proper size buffers.. */ - return grow_dev_page(bdev, block, index, size, sizebits, gfp); + /* Create a folio with the proper size buffers */ + return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp); } static struct buffer_head * @@ -1124,14 +1126,12 @@ __getblk_slow(struct block_device *bdev, sector_t block, for (;;) { struct buffer_head *bh; - int ret; bh = __find_get_block(bdev, block, size); if (bh) return bh; - ret = grow_buffers(bdev, block, size, gfp); - if (ret < 0) + if (!grow_buffers(bdev, block, size, gfp)) return NULL; } } @@ -1168,7 +1168,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * and then attach the address_space's inode to its superblock's dirty * inode list. * - * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock, + * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock, * i_pages lock and mapping->host->i_lock. 
*/ void mark_buffer_dirty(struct buffer_head *bh) @@ -1246,10 +1246,10 @@ void __bforget(struct buffer_head *bh) if (bh->b_assoc_map) { struct address_space *buffer_mapping = bh->b_folio->mapping; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); list_del_init(&bh->b_assoc_buffers); bh->b_assoc_map = NULL; - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } __brelse(bh); } @@ -1638,7 +1638,7 @@ EXPORT_SYMBOL(block_invalidate_folio); /* * We attach and possibly dirty the buffers atomically wrt - * block_dirty_folio() via private_lock. try_to_free_buffers + * block_dirty_folio() via i_private_lock. try_to_free_buffers * is already excluded via the folio lock. */ struct buffer_head *create_empty_buffers(struct folio *folio, @@ -1656,7 +1656,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio, } while (bh); tail->b_this_page = head; - spin_lock(&folio->mapping->private_lock); + spin_lock(&folio->mapping->i_private_lock); if (folio_test_uptodate(folio) || folio_test_dirty(folio)) { bh = head; do { @@ -1668,7 +1668,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio, } while (bh != head); } folio_attach_private(folio, head); - spin_unlock(&folio->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return head; } @@ -1699,13 +1699,13 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct inode *bd_inode = bdev->bd_inode; struct address_space *bd_mapping = bd_inode->i_mapping; struct folio_batch fbatch; - pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); + pgoff_t index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE; pgoff_t end; int i, count; struct buffer_head *bh; struct buffer_head *head; - end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); + end = ((loff_t)(block + len - 1) << bd_inode->i_blkbits) / PAGE_SIZE; folio_batch_init(&fbatch); while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) { count = folio_batch_count(&fbatch); @@ -1715,7 +1715,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) if (!folio_buffers(folio)) continue; /* - * We use folio lock instead of bd_mapping->private_lock + * We use folio lock instead of bd_mapping->i_private_lock * to pin buffers here since we can afford to sleep and * it scales better than a global spinlock lock. */ @@ -1748,19 +1748,6 @@ unlock_page: } EXPORT_SYMBOL(clean_bdev_aliases); -/* - * Size is a power-of-two in the range 512..PAGE_SIZE, - * and the case we care about most is PAGE_SIZE. - * - * So this *could* possibly be written with those - * constraints in mind (relevant mostly if some - * architecture has a slow bit-scan instruction) - */ -static inline int block_size_bits(unsigned int blocksize) -{ - return ilog2(blocksize); -} - static struct buffer_head *folio_create_buffers(struct folio *folio, struct inode *inode, unsigned int b_state) @@ -1790,30 +1777,29 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, */ /* - * While block_write_full_page is writing back the dirty buffers under + * While block_write_full_folio is writing back the dirty buffers under * the page lock, whoever dirtied the buffers may decide to clean them * again at any time. We handle that by only looking at the buffer * state inside lock_buffer(). 
* - * If block_write_full_page() is called for regular writeback + * If block_write_full_folio() is called for regular writeback * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a * locked buffer. This only can happen if someone has written the buffer * directly, with submit_bh(). At the address_space level PageWriteback * prevents this contention from occurring. * - * If block_write_full_page() is called with wbc->sync_mode == + * If block_write_full_folio() is called with wbc->sync_mode == * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ int __block_write_full_folio(struct inode *inode, struct folio *folio, - get_block_t *get_block, struct writeback_control *wbc, - bh_end_io_t *handler) + get_block_t *get_block, struct writeback_control *wbc) { int err; sector_t block; sector_t last_block; struct buffer_head *bh, *head; - unsigned int blocksize, bbits; + size_t blocksize; int nr_underway = 0; blk_opf_t write_flags = wbc_to_write_flags(wbc); @@ -1832,10 +1818,9 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, bh = head; blocksize = bh->b_size; - bbits = block_size_bits(blocksize); - block = (sector_t)folio->index << (PAGE_SHIFT - bbits); - last_block = (i_size_read(inode) - 1) >> bbits; + block = div_u64(folio_pos(folio), blocksize); + last_block = div_u64(i_size_read(inode) - 1, blocksize); /* * Get all the dirty buffers mapped to disk addresses and @@ -1849,7 +1834,7 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, * truncate in progress. */ /* - * The buffer was zeroed by block_write_full_page() + * The buffer was zeroed by block_write_full_folio() */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); @@ -1887,7 +1872,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, continue; } if (test_clear_buffer_dirty(bh)) { - mark_buffer_async_write_endio(bh, handler); + mark_buffer_async_write_endio(bh, + end_buffer_async_write); } else { unlock_buffer(bh); } @@ -1940,7 +1926,8 @@ recover: if (buffer_mapped(bh) && buffer_dirty(bh) && !buffer_delay(bh)) { lock_buffer(bh); - mark_buffer_async_write_endio(bh, handler); + mark_buffer_async_write_endio(bh, + end_buffer_async_write); } else { /* * The buffer may have been set dirty during @@ -2014,7 +2001,7 @@ static int iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, const struct iomap *iomap) { - loff_t offset = block << inode->i_blkbits; + loff_t offset = (loff_t)block << inode->i_blkbits; bh->b_bdev = iomap->bdev; @@ -2081,27 +2068,24 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap) { - unsigned from = pos & (PAGE_SIZE - 1); - unsigned to = from + len; + size_t from = offset_in_folio(folio, pos); + size_t to = from + len; struct inode *inode = folio->mapping->host; - unsigned block_start, block_end; + size_t block_start, block_end; sector_t block; int err = 0; - unsigned blocksize, bbits; + size_t blocksize; struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; BUG_ON(!folio_test_locked(folio)); - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + BUG_ON(to > folio_size(folio)); BUG_ON(from > to); head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; - bbits = block_size_bits(blocksize); + block = div_u64(folio_pos(folio), blocksize); - block = (sector_t)folio->index << (PAGE_SHIFT - 
bbits); - - for(bh = head, block_start = 0; bh != head || !block_start; + for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { @@ -2364,7 +2348,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) struct inode *inode = folio->mapping->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; - unsigned int blocksize, bbits; + size_t blocksize; int nr, i; int fully_mapped = 1; bool page_error = false; @@ -2378,10 +2362,9 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; - bbits = block_size_bits(blocksize); - iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits); - lblock = (limit+blocksize-1) >> bbits; + iblock = div_u64(folio_pos(folio), blocksize); + lblock = div_u64(limit + blocksize - 1, blocksize); bh = head; nr = 0; i = 0; @@ -2666,8 +2649,8 @@ int block_truncate_page(struct address_space *mapping, return 0; length = blocksize - length; - iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits); - + iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits; + folio = filemap_grab_folio(mapping, index); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -2720,17 +2703,15 @@ EXPORT_SYMBOL(block_truncate_page); /* * The generic ->writepage function for buffer-backed address_spaces */ -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) +int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, + void *get_block) { - struct folio *folio = page_folio(page); struct inode * const inode = folio->mapping->host; loff_t i_size = i_size_read(inode); /* Is the folio fully inside i_size? */ if (folio_pos(folio) + folio_size(folio) <= i_size) - return __block_write_full_folio(inode, folio, get_block, wbc, - end_buffer_async_write); + return __block_write_full_folio(inode, folio, get_block, wbc); /* Is the folio fully outside i_size? (truncate in progress) */ if (folio_pos(folio) >= i_size) { @@ -2747,10 +2728,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block, */ folio_zero_segment(folio, offset_in_folio(folio, i_size), folio_size(folio)); - return __block_write_full_folio(inode, folio, get_block, wbc, - end_buffer_async_write); + return __block_write_full_folio(inode, folio, get_block, wbc); } -EXPORT_SYMBOL(block_write_full_page); sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) @@ -2883,7 +2862,7 @@ EXPORT_SYMBOL(sync_dirty_buffer); * are unused, and releases them if so. * * Exclusion against try_to_free_buffers may be obtained by either - * locking the folio or by holding its mapping's private_lock. + * locking the folio or by holding its mapping's i_private_lock. * * If the folio is dirty but all the buffers are clean then we need to * be sure to mark the folio clean as well. This is because the folio @@ -2894,7 +2873,7 @@ EXPORT_SYMBOL(sync_dirty_buffer); * The same applies to regular filesystem folios: if all the buffers are * clean then we set the folio clean and proceed. To do that, we require * total exclusion from block_dirty_folio(). That is obtained with - * private_lock. + * i_private_lock. * * try_to_free_buffers() is non-blocking. 
*/ @@ -2946,7 +2925,7 @@ bool try_to_free_buffers(struct folio *folio) goto out; } - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); ret = drop_buffers(folio, &buffers_to_free); /* @@ -2959,13 +2938,13 @@ bool try_to_free_buffers(struct folio *folio) * the folio's buffers clean. We discover that here and clean * the folio also. * - * private_lock must be held over this entire operation in order + * i_private_lock must be held over this entire operation in order * to synchronise against block_dirty_folio and prevent the * dirty bit from being lost. */ if (ret) folio_cancel_dirty(folio); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index aa4efcabb5e3..3f24905f4066 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -77,6 +77,7 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { { "tag", cachefiles_daemon_tag }, #ifdef CONFIG_CACHEFILES_ONDEMAND { "copen", cachefiles_ondemand_copen }, + { "restore", cachefiles_ondemand_restore }, #endif { "", NULL } }; @@ -355,14 +356,24 @@ static __poll_t cachefiles_daemon_poll(struct file *file, struct poll_table_struct *poll) { struct cachefiles_cache *cache = file->private_data; + XA_STATE(xas, &cache->reqs, 0); + struct cachefiles_req *req; __poll_t mask; poll_wait(file, &cache->daemon_pollwq, poll); mask = 0; if (cachefiles_in_ondemand_mode(cache)) { - if (!xa_empty(&cache->reqs)) - mask |= EPOLLIN; + if (!xa_empty(&cache->reqs)) { + rcu_read_lock(); + xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) { + if (!cachefiles_ondemand_is_reopening_read(req)) { + mask |= EPOLLIN; + break; + } + } + rcu_read_unlock(); + } } else { if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) mask |= EPOLLIN; diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c index 18de8a876b02..1715d5ca2b2d 100644 --- a/fs/cachefiles/error_inject.c +++ b/fs/cachefiles/error_inject.c @@ -19,7 +19,6 @@ static struct ctl_table cachefiles_sysctls[] = { .mode = 0644, .proc_handler = proc_douintvec, }, - {} }; int __init cachefiles_register_error_injection(void) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 40052bdb3365..35ba2117a6f6 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -31,6 +31,11 @@ struct cachefiles_object *cachefiles_alloc_object(struct fscache_cookie *cookie) if (!object) return NULL; + if (cachefiles_ondemand_init_obj_info(object, volume)) { + kmem_cache_free(cachefiles_object_jar, object); + return NULL; + } + refcount_set(&object->ref, 1); spin_lock_init(&object->lock); @@ -88,7 +93,7 @@ void cachefiles_put_object(struct cachefiles_object *object, ASSERTCMP(object->file, ==, NULL); kfree(object->d_name); - + cachefiles_ondemand_deinit_obj_info(object); cache = object->volume->cache->cache; fscache_put_cookie(object->cookie, fscache_cookie_put_object); object->cookie = NULL; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 2ad58c465208..4a87c9d714a9 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -44,6 +44,19 @@ struct cachefiles_volume { struct dentry *fanout[256]; /* Fanout subdirs */ }; +enum cachefiles_object_state { + CACHEFILES_ONDEMAND_OBJSTATE_CLOSE, /* Anonymous fd closed by daemon or initial state */ + CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */ + 
CACHEFILES_ONDEMAND_OBJSTATE_REOPENING, /* Object that was closed and is being reopened. */ +}; + +struct cachefiles_ondemand_info { + struct work_struct ondemand_work; + int ondemand_id; + enum cachefiles_object_state state; + struct cachefiles_object *object; +}; + /* * Backing file state. */ @@ -61,7 +74,7 @@ struct cachefiles_object { unsigned long flags; #define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */ #ifdef CONFIG_CACHEFILES_ONDEMAND - int ondemand_id; + struct cachefiles_ondemand_info *ondemand; #endif }; @@ -290,12 +303,42 @@ extern ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args); +extern int cachefiles_ondemand_restore(struct cachefiles_cache *cache, + char *args); + extern int cachefiles_ondemand_init_object(struct cachefiles_object *object); extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object); extern int cachefiles_ondemand_read(struct cachefiles_object *object, loff_t pos, size_t len); +extern int cachefiles_ondemand_init_obj_info(struct cachefiles_object *obj, + struct cachefiles_volume *volume); +extern void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *obj); + +#define CACHEFILES_OBJECT_STATE_FUNCS(_state, _STATE) \ +static inline bool \ +cachefiles_ondemand_object_is_##_state(const struct cachefiles_object *object) \ +{ \ + return object->ondemand->state == CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \ +} \ + \ +static inline void \ +cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \ +{ \ + object->ondemand->state = CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \ +} + +CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN); +CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE); +CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING); + +static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) +{ + return cachefiles_ondemand_object_is_reopening(req->object) && + req->msg.opcode == CACHEFILES_OP_READ; +} + #else static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, char __user *_buffer, size_t buflen) @@ -317,6 +360,20 @@ static inline int cachefiles_ondemand_read(struct cachefiles_object *object, { return -EOPNOTSUPP; } + +static inline int cachefiles_ondemand_init_obj_info(struct cachefiles_object *obj, + struct cachefiles_volume *volume) +{ + return 0; +} +static inline void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *obj) +{ +} + +static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) +{ + return false; +} #endif /* diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 009d23cd435b..5857241c5918 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -259,7 +259,8 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) _enter("%ld", ret); - kiocb_end_write(iocb); + if (ki->was_async) + kiocb_end_write(iocb); if (ret < 0) trace_cachefiles_io_error(object, inode, ret, @@ -319,8 +320,6 @@ int __cachefiles_write(struct cachefiles_object *object, ki->iocb.ki_complete = cachefiles_write_complete; atomic_long_add(ki->b_writing, &cache->b_writing); - kiocb_start_write(&ki->iocb); - get_file(ki->iocb.ki_filp); cachefiles_grab_object(object, cachefiles_obj_get_ioreq); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 7bf7a5fcc045..7ade836beb58 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -305,6 +305,8 @@ try_again: /* do the multiway lock magic */ trap = 
lock_rename(cache->graveyard, dir); + if (IS_ERR(trap)) + return PTR_ERR(trap); /* do some checks before getting the grave dentry */ if (rep->d_parent != dir || IS_DEADDIR(d_inode(rep))) { diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 0254ed39f68c..b8fbbb1961bb 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -9,21 +9,19 @@ static int cachefiles_ondemand_fd_release(struct inode *inode, { struct cachefiles_object *object = file->private_data; struct cachefiles_cache *cache = object->volume->cache; - int object_id = object->ondemand_id; + struct cachefiles_ondemand_info *info = object->ondemand; + int object_id = info->ondemand_id; struct cachefiles_req *req; XA_STATE(xas, &cache->reqs, 0); xa_lock(&cache->reqs); - object->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; + info->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; + cachefiles_ondemand_set_object_close(object); - /* - * Flush all pending READ requests since their completion depends on - * anon_fd. - */ - xas_for_each(&xas, req, ULONG_MAX) { + /* Only flush CACHEFILES_REQ_NEW marked req to avoid race with daemon_read */ + xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) { if (req->msg.object_id == object_id && - req->msg.opcode == CACHEFILES_OP_READ) { - req->error = -EIO; + req->msg.opcode == CACHEFILES_OP_CLOSE) { complete(&req->done); xas_store(&xas, NULL); } @@ -176,11 +174,37 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); trace_cachefiles_ondemand_copen(req->object, id, size); + cachefiles_ondemand_set_object_open(req->object); + wake_up_all(&cache->daemon_pollwq); + out: complete(&req->done); return ret; } +int cachefiles_ondemand_restore(struct cachefiles_cache *cache, char *args) +{ + struct cachefiles_req *req; + + XA_STATE(xas, &cache->reqs, 0); + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return -EOPNOTSUPP; + + /* + * Reset the requests to CACHEFILES_REQ_NEW state, so that the + * requests have been processed halfway before the crash of the + * user daemon could be reprocessed after the recovery. + */ + xas_lock(&xas); + xas_for_each(&xas, req, ULONG_MAX) + xas_set_mark(&xas, CACHEFILES_REQ_NEW); + xas_unlock(&xas); + + wake_up_all(&cache->daemon_pollwq); + return 0; +} + static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) { struct cachefiles_object *object; @@ -218,8 +242,7 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) load = (void *)req->msg.data; load->fd = fd; - req->msg.object_id = object_id; - object->ondemand_id = object_id; + object->ondemand->ondemand_id = object_id; cachefiles_get_unbind_pincount(cache); trace_cachefiles_ondemand_open(object, &req->msg, load); @@ -234,6 +257,43 @@ err: return ret; } +static void ondemand_object_worker(struct work_struct *work) +{ + struct cachefiles_ondemand_info *info = + container_of(work, struct cachefiles_ondemand_info, ondemand_work); + + cachefiles_ondemand_init_object(info->object); +} + +/* + * If there are any inflight or subsequent READ requests on the + * closed object, reopen it. + * Skip read requests whose related object is reopening. 
+ */ +static struct cachefiles_req *cachefiles_ondemand_select_req(struct xa_state *xas, + unsigned long xa_max) +{ + struct cachefiles_req *req; + struct cachefiles_object *object; + struct cachefiles_ondemand_info *info; + + xas_for_each_marked(xas, req, xa_max, CACHEFILES_REQ_NEW) { + if (req->msg.opcode != CACHEFILES_OP_READ) + return req; + object = req->object; + info = object->ondemand; + if (cachefiles_ondemand_object_is_close(object)) { + cachefiles_ondemand_set_object_reopening(object); + queue_work(fscache_wq, &info->ondemand_work); + continue; + } + if (cachefiles_ondemand_object_is_reopening(object)) + continue; + return req; + } + return NULL; +} + ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, char __user *_buffer, size_t buflen) { @@ -244,16 +304,16 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, int ret = 0; XA_STATE(xas, &cache->reqs, cache->req_id_next); + xa_lock(&cache->reqs); /* * Cyclically search for a request that has not ever been processed, * to prevent requests from being processed repeatedly, and make * request distribution fair. */ - xa_lock(&cache->reqs); - req = xas_find_marked(&xas, UINT_MAX, CACHEFILES_REQ_NEW); + req = cachefiles_ondemand_select_req(&xas, ULONG_MAX); if (!req && cache->req_id_next > 0) { xas_set(&xas, 0); - req = xas_find_marked(&xas, cache->req_id_next - 1, CACHEFILES_REQ_NEW); + req = cachefiles_ondemand_select_req(&xas, cache->req_id_next - 1); } if (!req) { xa_unlock(&cache->reqs); @@ -273,14 +333,18 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, xa_unlock(&cache->reqs); id = xas.xa_index; - msg->msg_id = id; if (msg->opcode == CACHEFILES_OP_OPEN) { ret = cachefiles_ondemand_get_fd(req); - if (ret) + if (ret) { + cachefiles_ondemand_set_object_close(req->object); goto error; + } } + msg->msg_id = id; + msg->object_id = req->object->ondemand->ondemand_id; + if (copy_to_user(_buffer, msg, n) != 0) { ret = -EFAULT; goto err_put_fd; @@ -313,19 +377,23 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, void *private) { struct cachefiles_cache *cache = object->volume->cache; - struct cachefiles_req *req; + struct cachefiles_req *req = NULL; XA_STATE(xas, &cache->reqs, 0); int ret; if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) return 0; - if (test_bit(CACHEFILES_DEAD, &cache->flags)) - return -EIO; + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + ret = -EIO; + goto out; + } req = kzalloc(sizeof(*req) + data_len, GFP_KERNEL); - if (!req) - return -ENOMEM; + if (!req) { + ret = -ENOMEM; + goto out; + } req->object = object; init_completion(&req->done); @@ -363,8 +431,9 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, /* coupled with the barrier in cachefiles_flush_reqs() */ smp_mb(); - if (opcode != CACHEFILES_OP_OPEN && object->ondemand_id <= 0) { - WARN_ON_ONCE(object->ondemand_id == 0); + if (opcode == CACHEFILES_OP_CLOSE && + !cachefiles_ondemand_object_is_open(object)) { + WARN_ON_ONCE(object->ondemand->ondemand_id == 0); xas_unlock(&xas); ret = -EIO; goto out; @@ -387,7 +456,15 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, wake_up_all(&cache->daemon_pollwq); wait_for_completion(&req->done); ret = req->error; + kfree(req); + return ret; out: + /* Reset the object to close state in error handling path. + * If error occurs after creating the anonymous fd, + * cachefiles_ondemand_fd_release() will set object to close. 
+ */ + if (opcode == CACHEFILES_OP_OPEN) + cachefiles_ondemand_set_object_close(object); kfree(req); return ret; } @@ -430,18 +507,10 @@ static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req, void *private) { struct cachefiles_object *object = req->object; - int object_id = object->ondemand_id; - /* - * It's possible that object id is still 0 if the cookie looking up - * phase failed before OPEN request has ever been sent. Also avoid - * sending CLOSE request for CACHEFILES_ONDEMAND_ID_CLOSED, which means - * anon_fd has already been closed. - */ - if (object_id <= 0) + if (!cachefiles_ondemand_object_is_open(object)) return -ENOENT; - req->msg.object_id = object_id; trace_cachefiles_ondemand_close(object, &req->msg); return 0; } @@ -457,16 +526,7 @@ static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req, struct cachefiles_object *object = req->object; struct cachefiles_read *load = (void *)req->msg.data; struct cachefiles_read_ctx *read_ctx = private; - int object_id = object->ondemand_id; - - /* Stop enqueuing requests when daemon has closed anon_fd. */ - if (object_id <= 0) { - WARN_ON_ONCE(object_id == 0); - pr_info_once("READ: anonymous fd closed prematurely.\n"); - return -EIO; - } - req->msg.object_id = object_id; load->off = read_ctx->off; load->len = read_ctx->len; trace_cachefiles_ondemand_read(object, &req->msg, load); @@ -485,7 +545,7 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) * creating a new tmpfile as the cache file. Reuse the previously * allocated object ID if any. */ - if (object->ondemand_id > 0) + if (cachefiles_ondemand_object_is_open(object)) return 0; volume_key_size = volume->key[0] + 1; @@ -503,6 +563,28 @@ void cachefiles_ondemand_clean_object(struct cachefiles_object *object) cachefiles_ondemand_init_close_req, NULL); } +int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, + struct cachefiles_volume *volume) +{ + if (!cachefiles_in_ondemand_mode(volume->cache)) + return 0; + + object->ondemand = kzalloc(sizeof(struct cachefiles_ondemand_info), + GFP_KERNEL); + if (!object->ondemand) + return -ENOMEM; + + object->ondemand->object = object; + INIT_WORK(&object->ondemand->ondemand_work, ondemand_object_worker); + return 0; +} + +void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *object) +{ + kfree(object->ondemand); + object->ondemand = NULL; +} + int cachefiles_ondemand_read(struct cachefiles_object *object, loff_t pos, size_t len) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 85be3bf18cdf..13af429ab030 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -907,8 +907,8 @@ static void writepages_finish(struct ceph_osd_request *req) doutc(cl, "unlocking %p\n", page); if (remove_page) - generic_error_remove_page(inode->i_mapping, - page); + generic_error_remove_folio(inode->i_mapping, + page_folio(page)); unlock_page(page); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 91709934c8b1..678596684596 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -174,7 +174,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on - * d_child when we initially get results back from the MDS, and + * d_children when we initially get results back from the MDS, and * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. 
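The comment above, and several hunks that follow (ceph's drop_negative_children(), coda_flag_children(), and the dcache core itself), reflect the switch from the old d_subdirs/d_child list pair to an hlist of children chained through d_sib. The iteration idiom is ordinary hlist walking; a stand-alone sketch with invented names (demo_node and sib rather than dentry and d_sib):

#include <linux/module.h>
#include <linux/list.h>
#include <linux/printk.h>
#include <linux/slab.h>

struct demo_node {
	int value;
	struct hlist_node sib;		/* plays the role of dentry->d_sib */
};

static HLIST_HEAD(demo_children);	/* plays the role of parent->d_children */

static int __init demo_init(void)
{
	struct demo_node *n;
	int i;

	for (i = 0; i < 3; i++) {
		n = kmalloc(sizeof(*n), GFP_KERNEL);
		if (!n)
			return -ENOMEM;
		n->value = i;
		hlist_add_head(&n->sib, &demo_children);
	}

	/* Walk the children the way the converted filesystems now do. */
	hlist_for_each_entry(n, &demo_children, sib)
		pr_info("child %d\n", n->value);
	return 0;
}

static void __exit demo_exit(void)
{
	struct demo_node *n;
	struct hlist_node *tmp;

	hlist_for_each_entry_safe(n, tmp, &demo_children, sib) {
		hlist_del(&n->sib);
		kfree(n);
	}
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");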
* diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3b5aae29e944..d380d9dad0e0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -12,6 +12,7 @@ #include <linux/falloc.h> #include <linux/iversion.h> #include <linux/ktime.h> +#include <linux/splice.h> #include "super.h" #include "mds_client.h" @@ -3010,8 +3011,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * {read,write}_iter, which will get caps again. */ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); - ret = do_splice_direct(src_file, &src_off, dst_file, - &dst_off, src_objlen, flags); + ret = splice_file_range(src_file, &src_off, dst_file, &dst_off, + src_objlen); /* Abort on short copies or on error */ if (ret < (long)src_objlen) { doutc(cl, "Failed partial copy (%zd)\n", ret); @@ -3065,8 +3066,8 @@ out_caps: */ if (len && (len < src_ci->i_layout.object_size)) { doutc(cl, "Final partial copy of %zu bytes\n", len); - bytes = do_splice_direct(src_file, &src_off, dst_file, - &dst_off, len, flags); + bytes = splice_file_range(src_file, &src_off, dst_file, + &dst_off, len); if (bytes > 0) ret += bytes; else @@ -3089,8 +3090,8 @@ static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, len, flags); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src_file, src_off, dst_file, - dst_off, len, flags); + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); return ret; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d95eb525519a..02ebfabfc8ee 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2128,7 +2128,7 @@ static bool drop_negative_children(struct dentry *dentry) goto out; spin_lock(&dentry->d_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_child) { + hlist_for_each_entry(child, &dentry->d_children, d_sib) { if (d_really_is_positive(child)) { all_negative = false; break; diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 3b8c4513118f..970f0022ec52 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -93,13 +93,13 @@ static void coda_flag_children(struct dentry *parent, int flag) struct dentry *de; spin_lock(&parent->d_lock); - list_for_each_entry(de, &parent->d_subdirs, d_child) { + hlist_for_each_entry(de, &parent->d_children, d_sib) { + struct inode *inode = d_inode_rcu(de); /* don't know what to do with negative dentries */ - if (d_inode(de) ) - coda_flag_inode(d_inode(de), flag); + if (inode) + coda_flag_inode(inode, flag); } spin_unlock(&parent->d_lock); - return; } void coda_flag_inode_children(struct inode *inode, int flag) diff --git a/fs/coda/file.c b/fs/coda/file.c index 16acc58311ea..148856a582a9 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -79,14 +79,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) if (ret) goto finish_write; - file_start_write(host_file); inode_lock(coda_inode); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode)); inode_unlock(coda_inode); - file_end_write(host_file); finish_write: venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index a247c14aaab7..9f2d5743e2c8 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -36,7 +36,6 @@ static struct ctl_table coda_table[] = { .mode = 0600, .proc_handler = proc_dointvec }, - {} }; void coda_sysctl_init(void) diff --git a/fs/coredump.c b/fs/coredump.c 
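The "{}" terminator dropped from coda_table above (and from coredump_sysctls, fs_dcache_sysctls and pty_table further down) reflects the move to size-aware sysctl registration: register_sysctl() passes ARRAY_SIZE() of the table, so an empty sentinel entry is no longer needed. A minimal module-sized sketch under stated assumptions, with a hypothetical fs/demo directory and demo_value knob:

#include <linux/module.h>
#include <linux/sysctl.h>

static int demo_value;

/* No empty terminator entry: the table size is taken from ARRAY_SIZE(). */
static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_value",
		.data		= &demo_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static struct ctl_table_header *demo_header;

static int __init demo_init(void)
{
	demo_header = register_sysctl("fs/demo", demo_table);
	return demo_header ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	unregister_sysctl_table(demo_header);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");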
index 9d235fa14ab9..f258c17c1841 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -981,7 +981,6 @@ static struct ctl_table coredump_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static int __init init_fs_coredump_sysctls(void) diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 2d0c8922f635..5aff5934baa1 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -11,7 +11,7 @@ config FS_ENCRYPTION feature is similar to ecryptfs, but it is more memory efficient since it avoids caching the encrypted and decrypted pages in the page cache. Currently Ext4, - F2FS and UBIFS make use of this feature. + F2FS, UBIFS, and CephFS make use of this feature. # Filesystems supporting encryption must select this if FS_ENCRYPTION. This # allows the algorithms to be built as modules when all the filesystems are, diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index f34a9b0b9e92..0edf0b58daa7 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -1002,9 +1002,9 @@ static int try_to_lock_encrypted_files(struct super_block *sb, * FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the * key itself. * - * To "remove the key itself", first we wipe the actual master key secret, so - * that no more inodes can be unlocked with it. Then we try to evict all cached - * inodes that had been unlocked with the key. + * To "remove the key itself", first we transition the key to the "incompletely + * removed" state, so that no more inodes can be unlocked with it. Then we try + * to evict all cached inodes that had been unlocked with the key. * * If all inodes were evicted, then we unlink the fscrypt_master_key from the * keyring. Otherwise it remains in the keyring in the "incompletely removed" @@ -1128,7 +1128,7 @@ static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ bool zero_edge = srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN; - void *saddr = 0; + void *saddr = NULL; int ret = 0; if (!zero_edge) { diff --git a/fs/dcache.c b/fs/dcache.c index c82ae731df9a..b813528fb147 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -51,8 +51,8 @@ * - d_lru * - d_count * - d_unhashed() - * - d_parent and d_subdirs - * - childrens' d_child and d_parent + * - d_parent and d_chilren + * - childrens' d_sib and d_parent * - d_u.d_alias, d_inode * * Ordering: @@ -191,7 +191,6 @@ static struct ctl_table fs_dcache_sysctls[] = { .mode = 0444, .proc_handler = proc_nr_dentry, }, - { } }; static int __init init_fs_dcache_sysctls(void) @@ -344,7 +343,7 @@ static inline void __d_set_inode_and_type(struct dentry *dentry, dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); - flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + flags &= ~DCACHE_ENTRY_TYPE; flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } @@ -353,7 +352,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); - flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + flags &= ~DCACHE_ENTRY_TYPE; WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; if (dentry->d_flags & DCACHE_LRU_LIST) @@ -428,7 +427,8 @@ static void d_lru_add(struct dentry *dentry) this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); - WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); + WARN_ON_ONCE(!list_lru_add_obj( + &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry 
*dentry) @@ -438,7 +438,8 @@ static void d_lru_del(struct dentry *dentry) this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); - WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); + WARN_ON_ONCE(!list_lru_del_obj( + &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) @@ -537,7 +538,7 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); -static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) +static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; /* @@ -545,12 +546,12 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; - if (unlikely(list_empty(&dentry->d_child))) + if (unlikely(hlist_unhashed(&dentry->d_sib))) return; - __list_del_entry(&dentry->d_child); + __hlist_del(&dentry->d_sib); /* * Cursors can move around the list of children. While we'd been - * a normal list member, it didn't matter - ->d_child.next would've + * a normal list member, it didn't matter - ->d_sib.next would've * been updated. However, from now on it won't be and for the * things like d_walk() it might end up with a nasty surprise. * Normally d_walk() doesn't care about cursors moving around - @@ -558,29 +559,27 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) * of its own, we get through it without ever unlocking the parent. * There is one exception, though - if we ascend from a child that * gets killed as soon as we unlock it, the next sibling is found - * using the value left in its ->d_child.next. And if _that_ + * using the value left in its ->d_sib.next. And if _that_ * pointed to a cursor, and cursor got moved (e.g. by lseek()) * before d_walk() regains parent->d_lock, we'll end up skipping * everything the cursor had been moved past. * - * Solution: make sure that the pointer left behind in ->d_child.next + * Solution: make sure that the pointer left behind in ->d_sib.next * points to something that won't be moving around. I.e. skip the * cursors. */ - while (dentry->d_child.next != &parent->d_subdirs) { - next = list_entry(dentry->d_child.next, struct dentry, d_child); + while (dentry->d_sib.next) { + next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib); if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) break; - dentry->d_child.next = next->d_child.next; + dentry->d_sib.next = next->d_sib.next; } } -static void __dentry_kill(struct dentry *dentry) +static struct dentry *__dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; - if (!IS_ROOT(dentry)) - parent = dentry->d_parent; /* * The dentry is now unrecoverably dead to the world. 
@@ -600,9 +599,6 @@ static void __dentry_kill(struct dentry *dentry) } /* if it was on the hash then remove it */ __d_drop(dentry); - dentry_unlist(dentry, parent); - if (parent) - spin_unlock(&parent->d_lock); if (dentry->d_inode) dentry_unlink_inode(dentry); else @@ -611,80 +607,114 @@ static void __dentry_kill(struct dentry *dentry) if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - spin_lock(&dentry->d_lock); - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - dentry->d_flags |= DCACHE_MAY_FREE; - can_free = false; + cond_resched(); + /* now that it's negative, ->d_parent is stable */ + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + spin_lock(&parent->d_lock); } + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + dentry_unlist(dentry); + if (dentry->d_flags & DCACHE_SHRINK_LIST) + can_free = false; spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); - cond_resched(); -} - -static struct dentry *__lock_parent(struct dentry *dentry) -{ - struct dentry *parent; - rcu_read_lock(); - spin_unlock(&dentry->d_lock); -again: - parent = READ_ONCE(dentry->d_parent); - spin_lock(&parent->d_lock); - /* - * We can't blindly lock dentry until we are sure - * that we won't violate the locking order. - * Any changes of dentry->d_parent must have - * been done with parent->d_lock held, so - * spin_lock() above is enough of a barrier - * for checking if it's still our child. - */ - if (unlikely(parent != dentry->d_parent)) { + if (parent && --parent->d_lockref.count) { spin_unlock(&parent->d_lock); - goto again; + return NULL; } - rcu_read_unlock(); - if (parent != dentry) - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - else - parent = NULL; return parent; } -static inline struct dentry *lock_parent(struct dentry *dentry) +/* + * Lock a dentry for feeding it to __dentry_kill(). + * Called under rcu_read_lock() and dentry->d_lock; the former + * guarantees that nothing we access will be freed under us. + * Note that dentry is *not* protected from concurrent dentry_kill(), + * d_delete(), etc. + * + * Return false if dentry is busy. Otherwise, return true and have + * that dentry's inode locked. + */ + +static bool lock_for_kill(struct dentry *dentry) { - struct dentry *parent = dentry->d_parent; - if (IS_ROOT(dentry)) - return NULL; - if (likely(spin_trylock(&parent->d_lock))) - return parent; - return __lock_parent(dentry); + struct inode *inode = dentry->d_inode; + + if (unlikely(dentry->d_lockref.count)) + return false; + + if (!inode || likely(spin_trylock(&inode->i_lock))) + return true; + + do { + spin_unlock(&dentry->d_lock); + spin_lock(&inode->i_lock); + spin_lock(&dentry->d_lock); + if (likely(inode == dentry->d_inode)) + break; + spin_unlock(&inode->i_lock); + inode = dentry->d_inode; + } while (inode); + if (likely(!dentry->d_lockref.count)) + return true; + if (inode) + spin_unlock(&inode->i_lock); + return false; } -static inline bool retain_dentry(struct dentry *dentry) +/* + * Decide if dentry is worth retaining. Usually this is called with dentry + * locked; if not locked, we are more limited and might not be able to tell + * without a lock. False in this case means "punt to locked path and recheck". + * + * In case we aren't locked, these predicates are not "stable". However, it is + * sufficient that at some point after we dropped the reference the dentry was + * hashed and the flags had the proper value. 
Other dentry users may have + * re-gotten a reference to the dentry and change that, but our work is done - + * we can leave the dentry around with a zero refcount. + */ +static inline bool retain_dentry(struct dentry *dentry, bool locked) { - WARN_ON(d_in_lookup(dentry)); + unsigned int d_flags; - /* Unreachable? Get rid of it */ + smp_rmb(); + d_flags = READ_ONCE(dentry->d_flags); + + // Unreachable? Nobody would be able to look it up, no point retaining if (unlikely(d_unhashed(dentry))) return false; - if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + // Same if it's disconnected + if (unlikely(d_flags & DCACHE_DISCONNECTED)) return false; - if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { - if (dentry->d_op->d_delete(dentry)) + // ->d_delete() might tell us not to bother, but that requires + // ->d_lock; can't decide without it + if (unlikely(d_flags & DCACHE_OP_DELETE)) { + if (!locked || dentry->d_op->d_delete(dentry)) return false; } - if (unlikely(dentry->d_flags & DCACHE_DONTCACHE)) + // Explicitly told not to bother + if (unlikely(d_flags & DCACHE_DONTCACHE)) return false; - /* retain; LRU fodder */ - dentry->d_lockref.count--; - if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) + // At this point it looks like we ought to keep it. We also might + // need to do something - put it on LRU if it wasn't there already + // and mark it referenced if it was on LRU, but not marked yet. + // Unfortunately, both actions require ->d_lock, so in lockless + // case we'd have to punt rather than doing those. + if (unlikely(!(d_flags & DCACHE_LRU_LIST))) { + if (!locked) + return false; d_lru_add(dentry); - else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) + } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) { + if (!locked) + return false; dentry->d_flags |= DCACHE_REFERENCED; + } return true; } @@ -704,60 +734,11 @@ void d_mark_dontcache(struct inode *inode) EXPORT_SYMBOL(d_mark_dontcache); /* - * Finish off a dentry we've decided to kill. - * dentry->d_lock must be held, returns with it unlocked. - * Returns dentry requiring refcount drop, or NULL if we're done. - */ -static struct dentry *dentry_kill(struct dentry *dentry) - __releases(dentry->d_lock) -{ - struct inode *inode = dentry->d_inode; - struct dentry *parent = NULL; - - if (inode && unlikely(!spin_trylock(&inode->i_lock))) - goto slow_positive; - - if (!IS_ROOT(dentry)) { - parent = dentry->d_parent; - if (unlikely(!spin_trylock(&parent->d_lock))) { - parent = __lock_parent(dentry); - if (likely(inode || !dentry->d_inode)) - goto got_locks; - /* negative that became positive */ - if (parent) - spin_unlock(&parent->d_lock); - inode = dentry->d_inode; - goto slow_positive; - } - } - __dentry_kill(dentry); - return parent; - -slow_positive: - spin_unlock(&dentry->d_lock); - spin_lock(&inode->i_lock); - spin_lock(&dentry->d_lock); - parent = lock_parent(dentry); -got_locks: - if (unlikely(dentry->d_lockref.count != 1)) { - dentry->d_lockref.count--; - } else if (likely(!retain_dentry(dentry))) { - __dentry_kill(dentry); - return parent; - } - /* we are keeping it, after all */ - if (inode) - spin_unlock(&inode->i_lock); - if (parent) - spin_unlock(&parent->d_lock); - spin_unlock(&dentry->d_lock); - return NULL; -} - -/* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. + * In that case refcount is guaranteed to be zero and we have already + * decided that it's not worth keeping around. 
* * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! @@ -765,18 +746,9 @@ got_locks: static inline bool fast_dput(struct dentry *dentry) { int ret; - unsigned int d_flags; - - /* - * If we have a d_op->d_delete() operation, we sould not - * let the dentry count go to zero, so use "put_or_lock". - */ - if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) - return lockref_put_or_lock(&dentry->d_lockref); /* - * .. otherwise, we can try to just decrement the - * lockref optimistically. + * try to decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); @@ -787,12 +759,12 @@ static inline bool fast_dput(struct dentry *dentry) */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); - if (dentry->d_lockref.count > 1) { - dentry->d_lockref.count--; + if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { spin_unlock(&dentry->d_lock); return true; } - return false; + dentry->d_lockref.count--; + goto locked; } /* @@ -802,45 +774,18 @@ static inline bool fast_dput(struct dentry *dentry) return true; /* - * Careful, careful. The reference count went down - * to zero, but we don't hold the dentry lock, so - * somebody else could get it again, and do another - * dput(), and we need to not race with that. - * - * However, there is a very special and common case - * where we don't care, because there is nothing to - * do: the dentry is still hashed, it does not have - * a 'delete' op, and it's referenced and already on - * the LRU list. - * - * NOTE! Since we aren't locked, these values are - * not "stable". However, it is sufficient that at - * some point after we dropped the reference the - * dentry was hashed and the flags had the proper - * value. Other dentry users may have re-gotten - * a reference to the dentry and change that, but - * our work is done - we can leave the dentry - * around with a zero refcount. - * - * Nevertheless, there are two cases that we should kill - * the dentry anyway. - * 1. free disconnected dentries as soon as their refcount - * reached zero. - * 2. free dentries if they should not be cached. + * Can we decide that decrement of refcount is all we needed without + * taking the lock? There's a very common case when it's all we need - + * dentry looks like it ought to be retained and there's nothing else + * to do. */ - smp_rmb(); - d_flags = READ_ONCE(dentry->d_flags); - d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | - DCACHE_DISCONNECTED | DCACHE_DONTCACHE; - - /* Nothing to do? Dropping the reference was all we needed? */ - if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) + if (retain_dentry(dentry, false)) return true; /* - * Not the fast normal case? Get the lock. We've already decremented - * the refcount, but we'll need to re-check the situation after - * getting the lock. + * Either not worth retaining or we can't tell without the lock. + * Get the lock, then. We've already decremented the refcount to 0, + * but we'll need to re-check the situation after getting the lock. */ spin_lock(&dentry->d_lock); @@ -850,17 +795,11 @@ static inline bool fast_dput(struct dentry *dentry) * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ - if (dentry->d_lockref.count) { +locked: + if (dentry->d_lockref.count || retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return true; } - - /* - * Re-get the reference we optimistically dropped. 
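fast_dput() above now tries the optimistic lockref decrement for every dentry and only takes d_lock when lockref_put_return() cannot do the update locklessly. The same two-step pattern, reduced to a stand-alone sketch on a private lockref (all names invented):

#include <linux/module.h>
#include <linux/lockref.h>
#include <linux/printk.h>

static struct lockref demo_ref;

static int __init demo_init(void)
{
	int count;

	spin_lock_init(&demo_ref.lock);
	demo_ref.count = 1;

	lockref_get(&demo_ref);			/* lockless where the arch allows it */

	count = lockref_put_return(&demo_ref);	/* optimistic decrement */
	if (count < 0) {
		/* Fast path unavailable; fall back to the spinlock. */
		spin_lock(&demo_ref.lock);
		count = --demo_ref.count;
		spin_unlock(&demo_ref.lock);
	}
	pr_info("reference count is now %d\n", count);
	return 0;
}

static void __exit demo_exit(void) { }

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");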
We hold the - * lock, and we just tested that it was zero, so we can just - * set it to 1. - */ - dentry->d_lockref.count = 1; return false; } @@ -893,39 +832,37 @@ static inline bool fast_dput(struct dentry *dentry) */ void dput(struct dentry *dentry) { - while (dentry) { - might_sleep(); - - rcu_read_lock(); - if (likely(fast_dput(dentry))) { - rcu_read_unlock(); - return; - } - - /* Slow case: now with the dentry lock held */ + if (!dentry) + return; + might_sleep(); + rcu_read_lock(); + if (likely(fast_dput(dentry))) { rcu_read_unlock(); - - if (likely(retain_dentry(dentry))) { + return; + } + while (lock_for_kill(dentry)) { + rcu_read_unlock(); + dentry = __dentry_kill(dentry); + if (!dentry) + return; + if (retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return; } - - dentry = dentry_kill(dentry); + rcu_read_lock(); } + rcu_read_unlock(); + spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(dput); -static void __dput_to_list(struct dentry *dentry, struct list_head *list) +static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - /* let the owner of the list it's on deal with it */ - --dentry->d_lockref.count; - } else { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); - if (!--dentry->d_lockref.count) - d_shrink_add(dentry, list); + d_shrink_add(dentry, list); } } @@ -937,22 +874,10 @@ void dput_to_list(struct dentry *dentry, struct list_head *list) return; } rcu_read_unlock(); - if (!retain_dentry(dentry)) - __dput_to_list(dentry, list); + to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } -/* This must be called with d_lock held */ -static inline void __dget_dlock(struct dentry *dentry) -{ - dentry->d_lockref.count++; -} - -static inline void __dget(struct dentry *dentry) -{ - lockref_get(&dentry->d_lockref); -} - struct dentry *dget_parent(struct dentry *dentry) { int gotref; @@ -1002,7 +927,7 @@ static struct dentry * __d_find_any_alias(struct inode *inode) if (hlist_empty(&inode->i_dentry)) return NULL; alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); - __dget(alias); + lockref_get(&alias->d_lockref); return alias; } @@ -1034,7 +959,7 @@ static struct dentry *__d_find_alias(struct inode *inode) hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { - __dget_dlock(alias); + dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } @@ -1101,104 +1026,53 @@ struct dentry *d_find_alias_rcu(struct inode *inode) */ void d_prune_aliases(struct inode *inode) { + LIST_HEAD(dispose); struct dentry *dentry; -restart: + spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); - if (!dentry->d_lockref.count) { - struct dentry *parent = lock_parent(dentry); - if (likely(!dentry->d_lockref.count)) { - __dentry_kill(dentry); - dput(parent); - goto restart; - } - if (parent) - spin_unlock(&parent->d_lock); - } + if (!dentry->d_lockref.count) + to_shrink_list(dentry, &dispose); spin_unlock(&dentry->d_lock); } spin_unlock(&inode->i_lock); + shrink_dentry_list(&dispose); } EXPORT_SYMBOL(d_prune_aliases); -/* - * Lock a dentry from shrink list. - * Called under rcu_read_lock() and dentry->d_lock; the former - * guarantees that nothing we access will be freed under us. - * Note that dentry is *not* protected from concurrent dentry_kill(), - * d_delete(), etc. 
- * - * Return false if dentry has been disrupted or grabbed, leaving - * the caller to kick it off-list. Otherwise, return true and have - * that dentry's inode and parent both locked. - */ -static bool shrink_lock_dentry(struct dentry *dentry) +static inline void shrink_kill(struct dentry *victim) { - struct inode *inode; - struct dentry *parent; - - if (dentry->d_lockref.count) - return false; - - inode = dentry->d_inode; - if (inode && unlikely(!spin_trylock(&inode->i_lock))) { - spin_unlock(&dentry->d_lock); - spin_lock(&inode->i_lock); - spin_lock(&dentry->d_lock); - if (unlikely(dentry->d_lockref.count)) - goto out; - /* changed inode means that somebody had grabbed it */ - if (unlikely(inode != dentry->d_inode)) - goto out; - } - - parent = dentry->d_parent; - if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock))) - return true; - - spin_unlock(&dentry->d_lock); - spin_lock(&parent->d_lock); - if (unlikely(parent != dentry->d_parent)) { - spin_unlock(&parent->d_lock); - spin_lock(&dentry->d_lock); - goto out; - } - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (likely(!dentry->d_lockref.count)) - return true; - spin_unlock(&parent->d_lock); -out: - if (inode) - spin_unlock(&inode->i_lock); - return false; + do { + rcu_read_unlock(); + victim = __dentry_kill(victim); + rcu_read_lock(); + } while (victim && lock_for_kill(victim)); + rcu_read_unlock(); + if (victim) + spin_unlock(&victim->d_lock); } void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { - struct dentry *dentry, *parent; + struct dentry *dentry; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); - if (!shrink_lock_dentry(dentry)) { - bool can_free = false; + if (!lock_for_kill(dentry)) { + bool can_free; rcu_read_unlock(); d_shrink_del(dentry); - if (dentry->d_lockref.count < 0) - can_free = dentry->d_flags & DCACHE_MAY_FREE; + can_free = dentry->d_flags & DCACHE_DENTRY_KILLED; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); continue; } - rcu_read_unlock(); d_shrink_del(dentry); - parent = dentry->d_parent; - if (parent != dentry) - __dput_to_list(parent, list); - __dentry_kill(dentry); + shrink_kill(dentry); } } @@ -1240,7 +1114,7 @@ static enum lru_status dentry_lru_isolate(struct list_head *item, * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like - * list_lru_add and list_lru_del. List movement in this file + * list_lru_add_obj and list_lru_del_obj. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. 
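The comment above refers to list_lru_add_obj()/list_lru_del_obj(), the new wrappers the dentry LRU switched to earlier in this diff; they derive the NUMA node (and, for accounted objects, the memcg) from the object itself rather than taking them as arguments. A small self-contained sketch with an invented demo_item, intended only to show the calling convention:

#include <linux/module.h>
#include <linux/list_lru.h>
#include <linux/printk.h>
#include <linux/slab.h>

struct demo_item {
	struct list_head lru;	/* plays the role of dentry->d_lru */
	int id;
};

static struct list_lru demo_lru;

static int __init demo_init(void)
{
	struct demo_item *item;
	int err;

	err = list_lru_init(&demo_lru);
	if (err)
		return err;

	item = kzalloc(sizeof(*item), GFP_KERNEL);
	if (!item) {
		list_lru_destroy(&demo_lru);
		return -ENOMEM;
	}
	INIT_LIST_HEAD(&item->lru);	/* must start off-list, as d_lru does */

	if (!list_lru_add_obj(&demo_lru, &item->lru))
		pr_warn("item was already on an LRU\n");
	pr_info("items on LRU: %lu\n", list_lru_count(&demo_lru));
	list_lru_del_obj(&demo_lru, &item->lru);

	kfree(item);
	list_lru_destroy(&demo_lru);
	return 0;
}

static void __exit demo_exit(void) { }

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");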
* @@ -1348,8 +1222,7 @@ enum d_walk_ret { static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *)) { - struct dentry *this_parent; - struct list_head *next; + struct dentry *this_parent, *dentry; unsigned seq = 0; enum d_walk_ret ret; bool retry = true; @@ -1371,13 +1244,9 @@ again: break; } repeat: - next = this_parent->d_subdirs.next; + dentry = d_first_child(this_parent); resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); - next = tmp->next; - + hlist_for_each_entry_from(dentry, d_sib) { if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) continue; @@ -1398,7 +1267,7 @@ resume: continue; } - if (!list_empty(&dentry->d_subdirs)) { + if (!hlist_empty(&dentry->d_children)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; @@ -1413,24 +1282,23 @@ resume: rcu_read_lock(); ascend: if (this_parent != parent) { - struct dentry *child = this_parent; - this_parent = child->d_parent; + dentry = this_parent; + this_parent = dentry->d_parent; - spin_unlock(&child->d_lock); + spin_unlock(&dentry->d_lock); spin_lock(&this_parent->d_lock); /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) goto rename_retry; /* go into the first sibling still alive */ - do { - next = child->d_child.next; - if (next == &this_parent->d_subdirs) - goto ascend; - child = list_entry(next, struct dentry, d_child); - } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); - rcu_read_unlock(); - goto resume; + hlist_for_each_entry_continue(dentry, d_sib) { + if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { + rcu_read_unlock(); + goto resume; + } + } + goto ascend; } if (need_seqretry(&rename_lock, seq)) goto rename_retry; @@ -1530,7 +1398,7 @@ out: * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level - * whenever the d_subdirs list is non-empty and continue + * whenever the d_children list is non-empty and continue * searching. 
* * It returns zero iff there are no unused children, @@ -1560,13 +1428,11 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; - } else { - if (dentry->d_flags & DCACHE_LRU_LIST) - d_lru_del(dentry); - if (!dentry->d_lockref.count) { - d_shrink_add(dentry, &data->dispose); - data->found++; - } + } else if (!dentry->d_lockref.count) { + to_shrink_list(dentry, &data->dispose); + data->found++; + } else if (dentry->d_lockref.count < 0) { + data->found++; } /* * We can return to the caller if we have found some (this @@ -1587,17 +1453,13 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - if (!dentry->d_lockref.count) { + if (!dentry->d_lockref.count) { + if (dentry->d_flags & DCACHE_SHRINK_LIST) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; } - } else { - if (dentry->d_flags & DCACHE_LRU_LIST) - d_lru_del(dentry); - if (!dentry->d_lockref.count) - d_shrink_add(dentry, &data->dispose); + to_shrink_list(dentry, &data->dispose); } /* * We can return to the caller if we have found some (this @@ -1635,17 +1497,12 @@ void shrink_dcache_parent(struct dentry *parent) data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { - struct dentry *parent; spin_lock(&data.victim->d_lock); - if (!shrink_lock_dentry(data.victim)) { + if (!lock_for_kill(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { - rcu_read_unlock(); - parent = data.victim->d_parent; - if (parent != data.victim) - __dput_to_list(parent, &data.dispose); - __dentry_kill(data.victim); + shrink_kill(data.victim); } } if (!list_empty(&data.dispose)) @@ -1657,7 +1514,7 @@ EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { /* it has busy descendents; complain about those instead */ - if (!list_empty(&dentry->d_subdirs)) + if (!hlist_empty(&dentry->d_children)) return D_WALK_CONTINUE; /* root with refcount 1 is fine */ @@ -1707,8 +1564,7 @@ static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { struct dentry **victim = _data; if (d_mountpoint(dentry)) { - __dget_dlock(dentry); - *victim = dentry; + *victim = dget_dlock(dentry); return D_WALK_QUIT; } return D_WALK_CONTINUE; @@ -1814,9 +1670,9 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); - INIT_LIST_HEAD(&dentry->d_subdirs); + INIT_HLIST_HEAD(&dentry->d_children); INIT_HLIST_NODE(&dentry->d_u.d_alias); - INIT_LIST_HEAD(&dentry->d_child); + INIT_HLIST_NODE(&dentry->d_sib); d_set_d_op(dentry, dentry->d_sb->s_d_op); if (dentry->d_op && dentry->d_op->d_init) { @@ -1853,9 +1709,8 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) * don't need child lock because it is not subject * to concurrency here */ - __dget_dlock(parent); - dentry->d_parent = parent; - list_add(&dentry->d_child, &parent->d_subdirs); + dentry->d_parent = dget_dlock(parent); + hlist_add_head(&dentry->d_sib, &parent->d_children); spin_unlock(&parent->d_lock); return dentry; @@ -1895,9 +1750,15 @@ struct dentry *d_alloc_cursor(struct dentry * parent) */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { + static const struct dentry_operations anon_ops = { + .d_dname = simple_dname + }; struct dentry *dentry = 
__d_alloc(sb, name); - if (likely(dentry)) + if (likely(dentry)) { dentry->d_flags |= DCACHE_NORCU; + if (!sb->s_d_op) + d_set_d_op(dentry, &anon_ops); + } return dentry; } @@ -1941,22 +1802,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) } EXPORT_SYMBOL(d_set_d_op); - -/* - * d_set_fallthru - Mark a dentry as falling through to a lower layer - * @dentry - The dentry to mark - * - * Mark a dentry as falling through to the lower layer (as set with - * d_pin_lower()). This flag may be recorded on the medium. - */ -void d_set_fallthru(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_FALLTHRU; - spin_unlock(&dentry->d_lock); -} -EXPORT_SYMBOL(d_set_fallthru); - static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_REGULAR_TYPE; @@ -2075,75 +1920,55 @@ struct dentry *d_make_root(struct inode *root_inode) } EXPORT_SYMBOL(d_make_root); -static struct dentry *__d_instantiate_anon(struct dentry *dentry, - struct inode *inode, - bool disconnected) -{ - struct dentry *res; - unsigned add_flags; - - security_d_instantiate(dentry, inode); - spin_lock(&inode->i_lock); - res = __d_find_any_alias(inode); - if (res) { - spin_unlock(&inode->i_lock); - dput(dentry); - goto out_iput; - } - - /* attach a disconnected dentry */ - add_flags = d_flags_for_inode(inode); - - if (disconnected) - add_flags |= DCACHE_DISCONNECTED; - - spin_lock(&dentry->d_lock); - __d_set_inode_and_type(dentry, inode, add_flags); - hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); - if (!disconnected) { - hlist_bl_lock(&dentry->d_sb->s_roots); - hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots); - hlist_bl_unlock(&dentry->d_sb->s_roots); - } - spin_unlock(&dentry->d_lock); - spin_unlock(&inode->i_lock); - - return dentry; - - out_iput: - iput(inode); - return res; -} - -struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode) -{ - return __d_instantiate_anon(dentry, inode, true); -} -EXPORT_SYMBOL(d_instantiate_anon); - static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) { - struct dentry *tmp; - struct dentry *res; + struct super_block *sb; + struct dentry *new, *res; if (!inode) return ERR_PTR(-ESTALE); if (IS_ERR(inode)) return ERR_CAST(inode); - res = d_find_any_alias(inode); + sb = inode->i_sb; + + res = d_find_any_alias(inode); /* existing alias? */ if (res) - goto out_iput; + goto out; - tmp = d_alloc_anon(inode->i_sb); - if (!tmp) { + new = d_alloc_anon(sb); + if (!new) { res = ERR_PTR(-ENOMEM); - goto out_iput; + goto out; } - return __d_instantiate_anon(tmp, inode, disconnected); + security_d_instantiate(new, inode); + spin_lock(&inode->i_lock); + res = __d_find_any_alias(inode); /* recheck under lock */ + if (likely(!res)) { /* still no alias, attach a disconnected dentry */ + unsigned add_flags = d_flags_for_inode(inode); + + if (disconnected) + add_flags |= DCACHE_DISCONNECTED; -out_iput: + spin_lock(&new->d_lock); + __d_set_inode_and_type(new, inode, add_flags); + hlist_add_head(&new->d_u.d_alias, &inode->i_dentry); + if (!disconnected) { + hlist_bl_lock(&sb->s_roots); + hlist_bl_add_head(&new->d_hash, &sb->s_roots); + hlist_bl_unlock(&sb->s_roots); + } + spin_unlock(&new->d_lock); + spin_unlock(&inode->i_lock); + inode = NULL; /* consumed by new->d_inode */ + res = new; + } else { + spin_unlock(&inode->i_lock); + dput(new); + } + + out: iput(inode); return res; } @@ -2727,7 +2552,7 @@ retry: /* we can't take ->d_lock here; it's OK, though. 
*/ new->d_flags |= DCACHE_PAR_LOOKUP; new->d_wait = wq; - hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b); + hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: @@ -2851,7 +2676,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) spin_unlock(&alias->d_lock); alias = NULL; } else { - __dget_dlock(alias); + dget_dlock(alias); __d_rehash(alias); spin_unlock(&alias->d_lock); } @@ -2993,11 +2818,15 @@ static void __d_move(struct dentry *dentry, struct dentry *target, } else { target->d_parent = old_parent; swap_names(dentry, target); - list_move(&target->d_child, &target->d_parent->d_subdirs); + if (!hlist_unhashed(&target->d_sib)) + __hlist_del(&target->d_sib); + hlist_add_head(&target->d_sib, &target->d_parent->d_children); __d_rehash(target); fsnotify_update_flags(target); } - list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); + if (!hlist_unhashed(&dentry->d_sib)) + __hlist_del(&dentry->d_sib); + hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children); __d_rehash(dentry); fsnotify_update_flags(dentry); fscrypt_handle_d_move(dentry); @@ -3080,8 +2909,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ -static int __d_unalias(struct inode *inode, - struct dentry *dentry, struct dentry *alias) +static int __d_unalias(struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL; struct rw_semaphore *m2 = NULL; @@ -3162,7 +2990,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) inode->i_sb->s_id); } else if (!IS_ROOT(new)) { struct dentry *old_parent = dget(new->d_parent); - int err = __d_unalias(inode, dentry, new); + int err = __d_unalias(dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); @@ -3233,10 +3061,7 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; - if (!(dentry->d_flags & DCACHE_GENOCIDE)) { - dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_lockref.count--; - } + dentry->d_lockref.count--; } return D_WALK_CONTINUE; } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 6d7c1a49581f..c6f4a9a98b85 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1100,17 +1100,35 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf, return r; } +static ssize_t write_file_blob(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct debugfs_blob_wrapper *blob = file->private_data; + struct dentry *dentry = F_DENTRY(file); + ssize_t r; + + r = debugfs_file_get(dentry); + if (unlikely(r)) + return r; + r = simple_write_to_buffer(blob->data, blob->size, ppos, user_buf, + count); + + debugfs_file_put(dentry); + return r; +} + static const struct file_operations fops_blob = { .read = read_file_blob, + .write = write_file_blob, .open = simple_open, .llseek = default_llseek, }; /** - * debugfs_create_blob - create a debugfs file that is used to read a binary blob + * debugfs_create_blob - create a debugfs file that is used to read and write + * a binary blob * @name: a pointer to a string containing the name of the file to create. - * @mode: the read permission that the file should have (other permissions are - * masked out) + * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. 
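With the write_file_blob() handler added above, a debugfs blob can be populated from user space as well as read. A minimal consumer might look like the sketch below; the module name, directory, and buffer are invented for illustration, and mode 0644 is what the relaxed mask now permits.

#include <linux/module.h>
#include <linux/debugfs.h>

static char demo_buf[64] = "hello";

static struct debugfs_blob_wrapper demo_blob = {
	.data = demo_buf,
	.size = sizeof(demo_buf),
};

static struct dentry *demo_dir;

static int __init demo_init(void)
{
	demo_dir = debugfs_create_dir("demo_blob", NULL);
	/* World-readable, root-writable; writes land in demo_buf. */
	debugfs_create_blob("state", 0644, demo_dir, &demo_blob);
	return 0;
}

static void __exit demo_exit(void)
{
	debugfs_remove(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

With the module loaded, something like "echo -n ready > /sys/kernel/debug/demo_blob/state" updates the buffer and a subsequent read returns the new contents.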
If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. @@ -1119,7 +1137,7 @@ static const struct file_operations fops_blob = { * * This function creates a file in debugfs with the given name that exports * @blob->data as a binary blob. If the @mode variable is so set it can be - * read from. Writing is not supported. + * read from and written to. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is @@ -1134,7 +1152,7 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { - return debugfs_create_file_unsafe(name, mode & 0444, parent, blob, &fops_blob); + return debugfs_create_file_unsafe(name, mode & 0644, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c830261aa883..b20e565b9c5e 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -69,7 +69,6 @@ static struct ctl_table pty_table[] = { .data = &pty_count, .proc_handler = proc_dointvec, }, - {} }; struct pts_mount_opts { diff --git a/fs/direct-io.c b/fs/direct-io.c index 20533266ade6..60456263a338 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1114,7 +1114,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, loff_t offset = iocb->ki_pos; const loff_t end = offset + count; struct dio *dio; - struct dio_submit sdio = { 0, }; + struct dio_submit sdio = { NULL, }; struct buffer_head map_bh = { 0, }; struct blk_plug plug; unsigned long align = offset | iov_iter_alignment(iter); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 42f332f46359..4fa11d9ddbb6 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -443,14 +443,14 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr) break; case 3: if (ri->header) { - seq_puts(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n"); + seq_puts(seq, "rsb ptr nodeid first_lkid flags !root_list_empty !recover_list_empty recover_locks_count len\n"); ri->header = 0; } print_format3(ri->rsb, seq); break; case 4: if (ri->header) { - seq_puts(seq, "version 4 rsb 2\n"); + seq_puts(seq, "rsb ptr nodeid master_nodeid dir_nodeid our_nodeid toss_time flags len str|hex name\n"); ri->header = 0; } print_format4(ri->rsb, seq); @@ -748,7 +748,7 @@ static int table_open4(struct inode *inode, struct file *file) struct seq_file *seq; int ret; - ret = seq_open(file, &format5_seq_ops); + ret = seq_open(file, &format4_seq_ops); if (ret) return ret; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 67f8dd8a05ef..6296c62c10fa 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1817,8 +1817,8 @@ static int dlm_tcp_bind(struct socket *sock) memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr)); make_sockaddr(&src_addr, 0, &addr_len); - result = sock->ops->bind(sock, (struct sockaddr *)&src_addr, - addr_len); + result = kernel_bind(sock, (struct sockaddr *)&src_addr, + addr_len); if (result < 0) { /* This *may* not indicate a critical error */ log_print("could not bind for connect: %d", result); @@ -1830,7 +1830,7 @@ static int dlm_tcp_bind(struct socket *sock) static int dlm_tcp_connect(struct connection *con, struct socket *sock, struct sockaddr *addr, int addr_len) { - return sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); + return kernel_connect(sock, addr, addr_len, O_NONBLOCK); } static int dlm_tcp_listen_validate(void) @@ -1862,8 +1862,8 @@ static int 
dlm_tcp_listen_bind(struct socket *sock) /* Bind to our port */ make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); - return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0], - addr_len); + return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0], + addr_len); } static const struct dlm_proto_ops dlm_tcp_ops = { @@ -1888,12 +1888,12 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock, int ret; /* - * Make sock->ops->connect() function return in specified time, + * Make kernel_connect() function return in specified time, * since O_NONBLOCK argument in connect() function does not work here, * then, we should restore the default value of this attribute. */ sock_set_sndtimeo(sock->sk, 5); - ret = sock->ops->connect(sock, addr, addr_len, 0); + ret = kernel_connect(sock, addr, addr_len, 0); sock_set_sndtimeo(sock->sk, 0); return ret; } diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index e6b4c1a21446..d814c5121367 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -140,11 +140,12 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.optype = DLM_PLOCK_OP_LOCK; op->info.pid = fl->fl_pid; op->info.ex = (fl->fl_type == F_WRLCK); - op->info.wait = IS_SETLKW(cmd); + op->info.wait = !!(fl->fl_flags & FL_SLEEP); op->info.fsid = ls->ls_global_id; op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; + op->info.owner = (__u64)(long)fl->fl_owner; /* async handling */ if (fl->fl_lmops && fl->fl_lmops->lm_grant) { op_data = kzalloc(sizeof(*op_data), GFP_NOFS); @@ -154,9 +155,6 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, goto out; } - /* fl_owner is lockd which doesn't distinguish - processes on the nfs client */ - op->info.owner = (__u64) fl->fl_pid; op_data->callback = fl->fl_lmops->lm_grant; locks_init_lock(&op_data->flc); locks_copy_lock(&op_data->flc, fl); @@ -168,8 +166,6 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); rv = FILE_LOCK_DEFERRED; goto out; - } else { - op->info.owner = (__u64)(long) fl->fl_owner; } send_op(op); @@ -326,10 +322,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->lm_grant) - op->info.owner = (__u64) fl->fl_pid; - else - op->info.owner = (__u64)(long) fl->fl_owner; + op->info.owner = (__u64)(long)fl->fl_owner; if (fl->fl_flags & FL_CLOSE) { op->info.flags |= DLM_PLOCK_FL_CLOSE; @@ -389,7 +382,7 @@ int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file, info.number = number; info.start = fl->fl_start; info.end = fl->fl_end; - info.owner = (__u64)fl->fl_pid; + info.owner = (__u64)(long)fl->fl_owner; rv = do_lock_cancel(&info); switch (rv) { @@ -450,10 +443,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, op->info.number = number; op->info.start = fl->fl_start; op->info.end = fl->fl_end; - if (fl->fl_lmops && fl->fl_lmops->lm_grant) - op->info.owner = (__u64) fl->fl_pid; - else - op->info.owner = (__u64)(long) fl->fl_owner; + op->info.owner = (__u64)(long)fl->fl_owner; send_op(op); wait_event(recv_wq, (op->done != 0)); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index b0e8774c435a..5ed1e4cf6c0b 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -78,6 +78,14 @@ static struct inode *__ecryptfs_get_inode(struct inode *lower_inode, if (lower_inode->i_sb 
!= ecryptfs_superblock_to_lower(sb)) return ERR_PTR(-EXDEV); + + /* Reject dealing with casefold directories. */ + if (IS_CASEFOLDED(lower_inode)) { + pr_err_ratelimited("%s: Can't handle casefolded directory.\n", + __func__); + return ERR_PTR(-EREMOTE); + } + if (!igrab(lower_inode)) return ERR_PTR(-ESTALE); inode = iget5_locked(sb, (unsigned long)lower_inode, @@ -599,6 +607,8 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, target_inode = d_inode(new_dentry); trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + if (IS_ERR(trap)) + return PTR_ERR(trap); dget(lower_new_dentry); rc = -EINVAL; if (lower_old_dentry->d_parent != lower_old_dir_dentry) diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 91290fe4a70b..586446e02ef7 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -77,6 +77,7 @@ bool efivarfs_valid_name(const char *str, int len) static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { + struct efivarfs_fs_info *info = dir->i_sb->s_fs_info; struct inode *inode = NULL; struct efivar_entry *var; int namelen, i = 0, err = 0; @@ -118,7 +119,7 @@ static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, inode->i_private = var; kmemleak_ignore(var); - err = efivar_entry_add(var, &efivarfs_list); + err = efivar_entry_add(var, &info->efivarfs_list); if (err) goto out; diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index c66647f5c0bd..169252e6dc46 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -16,6 +16,9 @@ struct efivarfs_mount_opts { struct efivarfs_fs_info { struct efivarfs_mount_opts mount_opts; + struct list_head efivarfs_list; + struct super_block *sb; + struct notifier_block nb; }; struct efi_variable { @@ -33,7 +36,8 @@ struct efivar_entry { struct kobject kobj; }; -int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), +int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, + struct list_head *), void *data, bool duplicates, struct list_head *head); int efivar_entry_add(struct efivar_entry *entry, struct list_head *head); @@ -64,6 +68,4 @@ extern struct inode *efivarfs_get_inode(struct super_block *sb, const struct inode *dir, int mode, dev_t dev, bool is_removable); -extern struct list_head efivarfs_list; - #endif /* EFIVAR_FS_INTERNAL_H */ diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 77240953a92e..6038dd39367a 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -15,10 +15,29 @@ #include <linux/slab.h> #include <linux/magic.h> #include <linux/statfs.h> +#include <linux/notifier.h> +#include <linux/printk.h> #include "internal.h" -LIST_HEAD(efivarfs_list); +static int efivarfs_ops_notifier(struct notifier_block *nb, unsigned long event, + void *data) +{ + struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info, nb); + + switch (event) { + case EFIVAR_OPS_RDONLY: + sfi->sb->s_flags |= SB_RDONLY; + break; + case EFIVAR_OPS_RDWR: + sfi->sb->s_flags &= ~SB_RDONLY; + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} static void efivarfs_evict_inode(struct inode *inode) { @@ -166,7 +185,8 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) } static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, - unsigned long name_size, void *data) + unsigned long name_size, void *data, + struct list_head *list) { struct super_block *sb = (struct super_block *)data; struct efivar_entry 
*entry; @@ -221,7 +241,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, } __efivar_entry_get(entry, NULL, &size, NULL); - __efivar_entry_add(entry, &efivarfs_list); + __efivar_entry_add(entry, list); /* copied by the above to local storage in the dentry. */ kfree(name); @@ -291,13 +311,11 @@ static int efivarfs_parse_param(struct fs_context *fc, struct fs_parameter *para static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct efivarfs_fs_info *sfi = sb->s_fs_info; struct inode *inode = NULL; struct dentry *root; int err; - if (!efivar_is_available()) - return -EOPNOTSUPP; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -319,11 +337,16 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) if (!root) return -ENOMEM; - INIT_LIST_HEAD(&efivarfs_list); + sfi->sb = sb; + sfi->nb.notifier_call = efivarfs_ops_notifier; + err = blocking_notifier_chain_register(&efivar_ops_nh, &sfi->nb); + if (err) + return err; - err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list); + err = efivar_init(efivarfs_callback, (void *)sb, true, + &sfi->efivarfs_list); if (err) - efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL); + efivar_entry_iter(efivarfs_destroy, &sfi->efivarfs_list, NULL); return err; } @@ -333,19 +356,35 @@ static int efivarfs_get_tree(struct fs_context *fc) return get_tree_single(fc, efivarfs_fill_super); } +static int efivarfs_reconfigure(struct fs_context *fc) +{ + if (!efivar_supports_writes() && !(fc->sb_flags & SB_RDONLY)) { + pr_err("Firmware does not support SetVariableRT. Can not remount with rw\n"); + return -EINVAL; + } + + return 0; +} + static const struct fs_context_operations efivarfs_context_ops = { .get_tree = efivarfs_get_tree, .parse_param = efivarfs_parse_param, + .reconfigure = efivarfs_reconfigure, }; static int efivarfs_init_fs_context(struct fs_context *fc) { struct efivarfs_fs_info *sfi; + if (!efivar_is_available()) + return -EOPNOTSUPP; + sfi = kzalloc(sizeof(*sfi), GFP_KERNEL); if (!sfi) return -ENOMEM; + INIT_LIST_HEAD(&sfi->efivarfs_list); + sfi->mount_opts.uid = GLOBAL_ROOT_UID; sfi->mount_opts.gid = GLOBAL_ROOT_GID; @@ -356,13 +395,14 @@ static int efivarfs_init_fs_context(struct fs_context *fc) static void efivarfs_kill_sb(struct super_block *sb) { - kill_litter_super(sb); + struct efivarfs_fs_info *sfi = sb->s_fs_info; - if (!efivar_is_available()) - return; + blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb); + kill_litter_super(sb); /* Remove all entries and destroy */ - efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL); + efivar_entry_iter(efivarfs_destroy, &sfi->efivarfs_list, NULL); + kfree(sfi); } static struct file_system_type efivarfs_type = { diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index 9e4f47808bd5..114ff0fd4e55 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -369,7 +369,8 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, * * Returns 0 on success, or a kernel error code on failure. 
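efivarfs now hangs a per-superblock notifier_block on the efivar_ops_nh chain so the firmware layer can flip the mount between read-only and read-write (EFIVAR_OPS_RDONLY/RDWR in the hunk above). The blocking-notifier pattern itself, reduced to a self-contained sketch with an invented chain and event number:

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static BLOCKING_NOTIFIER_HEAD(demo_chain);

enum { DEMO_GO_RDONLY = 1 };	/* invented; efivars uses EFIVAR_OPS_RDONLY/RDWR */

static int demo_event(struct notifier_block *nb, unsigned long event, void *data)
{
	if (event != DEMO_GO_RDONLY)
		return NOTIFY_DONE;
	pr_info("subscriber: switching to read-only\n");
	return NOTIFY_OK;
}

static struct notifier_block demo_nb = {
	.notifier_call = demo_event,
};

static int __init demo_init(void)
{
	int err;

	err = blocking_notifier_chain_register(&demo_chain, &demo_nb);
	if (err)
		return err;
	/* Publisher side: efivarfs sets or clears SB_RDONLY on this event. */
	blocking_notifier_call_chain(&demo_chain, DEMO_GO_RDONLY, NULL);
	return 0;
}

static void __exit demo_exit(void)
{
	blocking_notifier_chain_unregister(&demo_chain, &demo_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");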
*/ -int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), +int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *, + struct list_head *), void *data, bool duplicates, struct list_head *head) { unsigned long variable_name_size = 1024; @@ -420,7 +421,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), status = EFI_NOT_FOUND; } else { err = func(variable_name, vendor_guid, - variable_name_size, data); + variable_name_size, data, head); if (err) status = EFI_NOT_FOUND; } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 021be5feb1bc..1d65b9f60a39 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -121,11 +121,11 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, } static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, - void *inpage, unsigned int *inputmargin, int *maptype, - bool may_inplace) + void *inpage, void *out, unsigned int *inputmargin, + int *maptype, bool may_inplace) { struct z_erofs_decompress_req *rq = ctx->rq; - unsigned int omargin, total, i, j; + unsigned int omargin, total, i; struct page **in; void *src, *tmp; @@ -135,12 +135,13 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; - for (i = 0; i < ctx->inpages; ++i) { - DBG_BUGON(rq->in[i] == NULL); - for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j) - if (rq->out[j] == rq->in[i]) - goto docopy; - } + for (i = 0; i < ctx->inpages; ++i) + if (rq->out[ctx->outpages - ctx->inpages + i] != + rq->in[i]) + goto docopy; + kunmap_local(inpage); + *maptype = 3; + return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT); } if (ctx->inpages <= 1) { @@ -148,7 +149,6 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, return inpage; } kunmap_local(inpage); - might_sleep(); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) return ERR_PTR(-ENOMEM); @@ -204,12 +204,12 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, } static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, - u8 *out) + u8 *dst) { struct z_erofs_decompress_req *rq = ctx->rq; bool support_0padding = false, may_inplace = false; unsigned int inputmargin; - u8 *headpage, *src; + u8 *out, *headpage, *src; int ret, maptype; DBG_BUGON(*rq->in == NULL); @@ -230,11 +230,12 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } inputmargin = rq->pageofs_in; - src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin, + src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); + out = dst + rq->pageofs_out; /* legacy format could compress extra data in a pcluster. 
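The in-place case added above (maptype 3) hands back a pointer into the output buffer instead of copying, and the plain-copy path rewritten just below relies on memmove() whenever an output page aliases an input page. A toy module showing just those page primitives (kmap_local_page(), memcpy_to_page(), memmove() on overlapping ranges); offsets and strings are arbitrary:

#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/printk.h>
#include <linux/string.h>

static int __init demo_init(void)
{
	struct page *page;
	char *kaddr;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return -ENOMEM;

	/* Stage data at an offset, as if it were packed input in the page. */
	memcpy_to_page(page, 4, "inplace", 8);

	kaddr = kmap_local_page(page);
	/*
	 * Source and destination ranges overlap, so memmove() rather than
	 * memcpy() is required; the decompressor follows the same rule when
	 * rq->out[] aliases rq->in[].
	 */
	memmove(kaddr, kaddr + 4, 8);
	pr_info("page now starts with \"%s\"\n", kaddr);
	kunmap_local(kaddr);
	flush_dcache_page(page);

	__free_page(page);
	return 0;
}

static void __exit demo_exit(void) { }

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");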
*/ if (rq->partial_decoding || !support_0padding) ret = LZ4_decompress_safe_partial(src + inputmargin, out, @@ -246,15 +247,9 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, if (ret != rq->outputsize) { erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", ret, rq->inputsize, inputmargin, rq->outputsize); - - print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, - 16, 1, src + inputmargin, rq->inputsize, true); - print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, - 16, 1, out, rq->outputsize, true); - if (ret >= 0) memset(out + ret, 0, rq->outputsize - ret); - ret = -EIO; + ret = -EFSCORRUPTED; } else { ret = 0; } @@ -265,7 +260,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { erofs_put_pcpubuf(src); - } else { + } else if (maptype != 3) { DBG_BUGON(1); return -EFAULT; } @@ -308,7 +303,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, } dstmap_out: - ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); + ret = z_erofs_lz4_decompress_mem(&ctx, dst); if (!dst_maptype) kunmap_local(dst); else if (dst_maptype == 2) @@ -319,43 +314,58 @@ dstmap_out: static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, struct page **pagepool) { - const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - const unsigned int outpages = + const unsigned int nrpages_in = + PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT; + const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int righthalf = min_t(unsigned int, rq->outputsize, - PAGE_SIZE - rq->pageofs_out); - const unsigned int lefthalf = rq->outputsize - righthalf; - const unsigned int interlaced_offset = - rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - u8 *src; - - if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (rq->out[0] == *rq->in) { - DBG_BUGON(rq->pageofs_out); - return 0; + const unsigned int bs = rq->sb->s_blocksize; + unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; + u8 *kin; + + DBG_BUGON(rq->outputsize > rq->inputsize); + if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) { + cur = bs - (rq->pageofs_out & (bs - 1)); + pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK; + cur = min(cur, rq->outputsize); + if (cur && rq->out[0]) { + kin = kmap_local_page(rq->in[nrpages_in - 1]); + if (rq->out[0] == rq->in[nrpages_in - 1]) { + memmove(kin + rq->pageofs_out, kin + pi, cur); + flush_dcache_page(rq->out[0]); + } else { + memcpy_to_page(rq->out[0], rq->pageofs_out, + kin + pi, cur); + } + kunmap_local(kin); + } + rq->outputsize -= cur; } - src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) - memcpy_to_page(rq->out[0], rq->pageofs_out, - src + interlaced_offset, righthalf); - - if (outpages > inpages) { - DBG_BUGON(!rq->out[outpages - 1]); - if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - memcpy_to_page(rq->out[outpages - 1], 0, src + - (interlaced_offset ? 
0 : righthalf), - lefthalf); - } else if (!interlaced_offset) { - memmove(src, src + righthalf, lefthalf); - flush_dcache_page(rq->in[inpages - 1]); - } + for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) { + insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize); + rq->outputsize -= insz; + if (!rq->in[ni]) + continue; + kin = kmap_local_page(rq->in[ni]); + pi = 0; + do { + no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT; + po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK; + DBG_BUGON(no >= nrpages_out); + cnt = min(insz - pi, PAGE_SIZE - po); + if (rq->out[no] == rq->in[ni]) { + memmove(kin + po, + kin + rq->pageofs_in + pi, cnt); + flush_dcache_page(rq->out[no]); + } else if (rq->out[no]) { + memcpy_to_page(rq->out[no], po, + kin + rq->pageofs_in + pi, cnt); + } + pi += cnt; + } while (pi < insz); + kunmap_local(kin); } - kunmap_local(src); + DBG_BUGON(ni > nrpages_in); return 0; } diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c index daf3c1bdeab8..4a64a9c91dd3 100644 --- a/fs/erofs/decompressor_deflate.c +++ b/fs/erofs/decompressor_deflate.c @@ -70,7 +70,7 @@ int __init z_erofs_deflate_init(void) return 0; out_failed: - pr_err("failed to allocate zlib workspace\n"); + erofs_err(NULL, "failed to allocate zlib workspace"); z_erofs_deflate_exit(); return -ENOMEM; } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 14a79d3226ab..3d616dea55dc 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -259,8 +259,10 @@ static int erofs_fill_inode(struct inode *inode) if (erofs_inode_is_data_compressed(vi->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP - if (!erofs_is_fscache_mode(inode->i_sb) && - inode->i_sb->s_blocksize_bits == PAGE_SHIFT) { + if (!erofs_is_fscache_mode(inode->i_sb)) { + DO_ONCE_LITE_IF(inode->i_sb->s_blocksize != PAGE_SIZE, + erofs_info, inode->i_sb, + "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!"); inode->i_mapping->a_ops = &z_erofs_aops; err = 0; goto out_unlock; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 3789d6224513..5f60f163bd56 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -27,7 +27,10 @@ void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf); + if (sb) + pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf); + else + pr_err("%s: %pV", func, &vaf); va_end(args); } @@ -41,7 +44,10 @@ void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...) 
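The rewritten z_erofs_transform_plain() above copies the payload a chunk at a time, recomputing the destination page index, the in-page offset and the chunk length at each step. A runnable userspace model of that per-page split, with made-up offsets (the kernel loop also splits on the input side and uses memmove() when source and destination share a page):

	#include <stdio.h>

	#define PAGE_SHIFT 12u
	#define PAGE_SIZE  (1u << PAGE_SHIFT)

	int main(void)
	{
		unsigned int pageofs_out = 3000, len = 7000, pos = 0;	/* hypothetical */

		while (pos < len) {
			unsigned int no  = (pageofs_out + pos) >> PAGE_SHIFT;
			unsigned int po  = (pageofs_out + pos) & (PAGE_SIZE - 1);
			unsigned int cnt = len - pos < PAGE_SIZE - po ?
					   len - pos : PAGE_SIZE - po;

			printf("out page %u: offset %u, copy %u bytes\n", no, po, cnt);
			pos += cnt;
		}
		return 0;
	}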
vaf.fmt = fmt; vaf.va = &args; - pr_info("(device %s): %pV", sb->s_id, &vaf); + if (sb) + pr_info("(device %s): %pV", sb->s_id, &vaf); + else + pr_info("%pV", &vaf); va_end(args); } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index a7e6847f6f8f..692c0c39be63 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -56,6 +56,9 @@ struct z_erofs_pcluster { /* L: total number of bvecs */ unsigned int vcnt; + /* I: pcluster size (compressed size) in bytes */ + unsigned int pclustersize; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; @@ -70,14 +73,6 @@ struct z_erofs_pcluster { struct rcu_head rcu; }; - union { - /* I: physical cluster size in pages */ - unsigned short pclusterpages; - - /* I: tailpacking inline compressed size */ - unsigned short tailpacking_size; - }; - /* I: compression algorithm format */ unsigned char algorithmformat; @@ -115,9 +110,7 @@ static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) { - if (z_erofs_is_inline_pcluster(pcl)) - return 1; - return pcl->pclusterpages; + return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; } /* @@ -298,12 +291,12 @@ static int z_erofs_create_pcluster_pool(void) return 0; } -static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) { - int i; + unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct z_erofs_pcluster_slab *pcs = pcluster_pool; - for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { - struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { struct z_erofs_pcluster *pcl; if (nrpages > pcs->maxpages) @@ -312,7 +305,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); if (!pcl) return ERR_PTR(-ENOMEM); - pcl->pclusterpages = nrpages; + pcl->pclustersize = size; return pcl; } return ERR_PTR(-EINVAL); @@ -559,6 +552,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* @@ -569,13 +563,14 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; + if (i_blocksize(fe->inode) != PAGE_SIZE) + return; if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page; + for (i = 0; i < pclusterpages; ++i) { + struct page *page, *newpage; void *t; /* mark pages just found for debugging */ - struct page *newpage = NULL; /* the compressed page was loaded before */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) @@ -585,6 +580,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) if (page) { t = (void *)((unsigned long)page | 1); + newpage = NULL; } else { /* I/O is needed, no possible to decompress directly */ standalone = false; @@ -592,9 +588,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) continue; /* - * try to use cached I/O if page allocation - * succeeds or fallback to in-place I/O instead - * to avoid any direct reclaim. 
+ * Try cached I/O if allocation succeeds or fallback to + * in-place I/O instead to avoid any direct reclaim. */ newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) @@ -626,6 +621,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); int i; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); @@ -633,7 +629,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * refcount of workgroup is now freezed as 0, * therefore no need to worry about available decompression users. */ - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { struct page *page = pcl->compressed_bvecs[i].page; if (!page) @@ -657,6 +653,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { struct z_erofs_pcluster *pcl = folio_get_private(folio); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool ret; int i; @@ -669,7 +666,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) goto out; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); ret = true; @@ -778,20 +775,20 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; int err; if (!(map->m_flags & EROFS_MAP_ENCODED) || - (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { + (!ztailpacking && !erofs_blknr(sb, map->m_pa))) { DBG_BUGON(1); return -EFSCORRUPTED; } /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 : - map->m_plen >> PAGE_SHIFT); + pcl = z_erofs_alloc_pcluster(map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); @@ -815,10 +812,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ - pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); - pcl->tailpacking_size = map->m_plen; } else { - pcl->obj.index = map->m_pa >> PAGE_SHIFT; + pcl->obj.index = erofs_blknr(sb, map->m_pa); grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { @@ -893,6 +888,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) } get_page(map->buf.page); WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* file-backed inplace I/O pages are traversed in reverse order */ @@ -973,12 +969,12 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); + const unsigned int bs = i_blocksize(inode); bool tight = true, exclusive; unsigned int cur, end, len, split; int err = 0; z_erofs_onlinepage_init(page); - split = 0; end = PAGE_SIZE; repeat: @@ -1027,7 +1023,7 @@ repeat: * for inplace I/O or bvpage (should be processed in a strict order.) 
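Since the pcluster now records its compressed size in bytes (pclustersize) instead of a page count, helpers such as z_erofs_pclusterpages() derive the page count on demand. A runnable sketch of that conversion, assuming 4 KiB pages and a made-up 5120-byte pcluster:

	#include <stdio.h>

	#define PAGE_SHIFT    12u
	#define PAGE_SIZE     (1u << PAGE_SHIFT)
	#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

	int main(void)
	{
		unsigned int pclustersize = 5120;	/* hypothetical compressed size */

		printf("%u bytes -> %u page(s)\n",
		       pclustersize, PAGE_ALIGN(pclustersize) >> PAGE_SHIFT);
		return 0;	/* prints "5120 bytes -> 2 page(s)" */
	}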
*/ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); - exclusive = (!cur && ((split <= 1) || tight)); + exclusive = (!cur && ((split <= 1) || (tight && bs == PAGE_SIZE))); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); @@ -1206,34 +1202,27 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; struct page *page = bvec->page; - /* compressed pages ought to be present before decompressing */ + /* compressed data ought to be valid before decompressing */ if (!page) { - DBG_BUGON(1); + err = -EIO; continue; } be->compressed_pages[i] = page; - if (z_erofs_is_inline_pcluster(pcl)) { + if (z_erofs_is_inline_pcluster(pcl) || + erofs_page_is_managed(EROFS_SB(be->sb), page)) { if (!PageUptodate(page)) err = -EIO; continue; } DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (!z_erofs_is_shortlived_page(page)) { - if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { - if (!PageUptodate(page)) - err = -EIO; - continue; - } - z_erofs_do_decompressed_bvec(be, bvec); - *overlapped = true; - } + if (z_erofs_is_shortlived_page(page)) + continue; + z_erofs_do_decompressed_bvec(be, bvec); + *overlapped = true; } - - if (err) - return err; - return 0; + return err; } static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, @@ -1242,10 +1231,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - const struct z_erofs_decompressor *decompressor = + const struct z_erofs_decompressor *decomp = &erofs_decompressors[pcl->algorithmformat]; - unsigned int i, inputsize; - int err2; + int i, err2; struct page *page; bool overlapped; @@ -1279,21 +1267,14 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, err2 = z_erofs_parse_in_bvecs(be, &overlapped); if (err2) err = err2; - if (err) - goto out; - - if (z_erofs_is_inline_pcluster(pcl)) - inputsize = pcl->tailpacking_size; - else - inputsize = pclusterpages * PAGE_SIZE; - - err = decompressor->decompress(&(struct z_erofs_decompress_req) { + if (!err) + err = decomp->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, - .inputsize = inputsize, + .inputsize = pcl->pclustersize, .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, @@ -1301,7 +1282,6 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .fillgaps = pcl->multibases, }, be->pagepool); -out: /* must handle all compressed pages before actual file pages */ if (z_erofs_is_inline_pcluster(pcl)) { page = pcl->compressed_bvecs[0].page; @@ -1309,12 +1289,11 @@ out: put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = pcl->compressed_bvecs[i].page; + /* consider shortlived pages added when decompressing */ + page = be->compressed_pages[i]; - if (erofs_page_is_managed(sbi, page)) + if (!page || erofs_page_is_managed(sbi, page)) continue; - - /* recycle all individual short-lived pages */ (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } @@ -1436,86 +1415,85 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, z_erofs_decompressqueue_work(&io->u.work); } -static struct page *pickup_page_for_submission(struct 
z_erofs_pcluster *pcl, - unsigned int nr, - struct page **pagepool, - struct address_space *mc) +static void z_erofs_fill_bio_vec(struct bio_vec *bvec, + struct z_erofs_decompress_frontend *f, + struct z_erofs_pcluster *pcl, + unsigned int nr, + struct address_space *mc) { - const pgoff_t index = pcl->obj.index; gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - + struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr; struct address_space *mapping; - struct page *oldpage, *page; - int justfound; + struct page *page, *oldpage; + int justfound, bs = i_blocksize(f->inode); + /* Except for inplace pages, the entire page can be used for I/Os */ + bvec->bv_offset = 0; + bvec->bv_len = PAGE_SIZE; repeat: - page = READ_ONCE(pcl->compressed_bvecs[nr].page); - oldpage = page; - - if (!page) + oldpage = READ_ONCE(zbv->page); + if (!oldpage) goto out_allocpage; - justfound = (unsigned long)page & 1UL; - page = (struct page *)((unsigned long)page & ~1UL); + justfound = (unsigned long)oldpage & 1UL; + page = (struct page *)((unsigned long)oldpage & ~1UL); + bvec->bv_page = page; + DBG_BUGON(z_erofs_is_shortlived_page(page)); /* - * preallocated cached pages, which is used to avoid direct reclaim - * otherwise, it will go inplace I/O path instead. + * Handle preallocated cached pages. We tried to allocate such pages + * without triggering direct reclaim. If allocation failed, inplace + * file-backed pages will be used instead. */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); + WRITE_ONCE(zbv->page, page); tocache = true; goto out_tocache; } - mapping = READ_ONCE(page->mapping); + mapping = READ_ONCE(page->mapping); /* - * file-backed online pages in plcuster are all locked steady, - * therefore it is impossible for `mapping' to be NULL. + * File-backed pages for inplace I/Os are all locked steady, + * therefore it is impossible for `mapping` to be NULL. */ - if (mapping && mapping != mc) - /* ought to be unmanaged pages */ - goto out; - - /* directly return for shortlived page as well */ - if (z_erofs_is_shortlived_page(page)) - goto out; + if (mapping && mapping != mc) { + if (zbv->offset < 0) + bvec->bv_offset = round_up(-zbv->offset, bs); + bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset; + return; + } lock_page(page); - /* only true if page reclaim goes wrong, should never happen */ DBG_BUGON(justfound && PagePrivate(page)); - /* the page is still in manage cache */ + /* the cached page is still in managed cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - + WRITE_ONCE(zbv->page, page); + /* + * The cached page is still available but without a valid + * `->private` pcluster hint. Let's reconnect them. + */ if (!PagePrivate(page)) { - /* - * impossible to be !PagePrivate(page) for - * the current restriction as well if - * the page is already in compressed_bvecs[]. - */ DBG_BUGON(!justfound); - - justfound = 0; - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); + /* compressed_bvecs[] already takes a ref */ + attach_page_private(page, pcl); + put_page(page); } - /* no need to submit io if it is already up-to-date */ + /* no need to submit if it is already up-to-date */ if (PageUptodate(page)) { unlock_page(page); - page = NULL; + bvec->bv_page = NULL; } - goto out; + return; } /* - * the managed page has been truncated, it's unsafe to - * reuse this one, let's allocate a new cache-managed page. 
+ * It has been truncated, so it's unsafe to reuse this one. Let's + * allocate a new page for compressed data. */ DBG_BUGON(page->mapping); DBG_BUGON(!justfound); @@ -1524,25 +1502,23 @@ repeat: unlock_page(page); put_page(page); out_allocpage: - page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, - oldpage, page)) { - erofs_pagepool_add(pagepool, page); + page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); + if (oldpage != cmpxchg(&zbv->page, oldpage, page)) { + erofs_pagepool_add(&f->pagepool, page); cond_resched(); goto repeat; } + bvec->bv_page = page; out_tocache: - if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { - /* turn into temporary page if fails (1 ref) */ + if (!tocache || bs != PAGE_SIZE || + add_to_page_cache_lru(page, mc, pcl->obj.index + nr, gfp)) { + /* turn into a temporary shortlived page (1 ref) */ set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); - goto out; + return; } attach_page_private(page, pcl); - /* drop a refcount added by allocpage (then we have 2 refs here) */ + /* drop a refcount added by allocpage (then 2 refs in total here) */ put_page(page); - -out: /* the only exit (for tracing and debugging) */ - return page; } static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, @@ -1597,7 +1573,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } -static void z_erofs_decompressqueue_endio(struct bio *bio) +static void z_erofs_submissionqueue_endio(struct bio *bio) { struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; @@ -1609,7 +1585,6 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); @@ -1632,17 +1607,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ - pgoff_t last_index; + erofs_off_t last_pa; struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; unsigned long pflags; int memstall = 0; - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. - */ + /* No need to read from device for pclusters in the bypass queue. 
*/ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); @@ -1655,7 +1627,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, do { struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; - pgoff_t cur, end; + erofs_off_t cur, end; + struct bio_vec bvec; unsigned int i = 0; bool bypass = true; @@ -1674,18 +1647,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, }; (void)erofs_map_dev(sb, &mdev); - cur = erofs_blknr(sb, mdev.m_pa); - end = cur + pcl->pclusterpages; - + cur = mdev.m_pa; + end = cur + pcl->pclustersize; do { - struct page *page; - - page = pickup_page_for_submission(pcl, i++, - &f->pagepool, mc); - if (!page) + z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); + if (!bvec.bv_page) continue; - if (bio && (cur != last_index + 1 || + if (bio && (cur != last_pa || last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); @@ -1696,7 +1665,8 @@ submit_bio_retry: bio = NULL; } - if (unlikely(PageWorkingset(page)) && !memstall) { + if (unlikely(PageWorkingset(bvec.bv_page)) && + !memstall) { psi_memstall_enter(&pflags); memstall = 1; } @@ -1704,23 +1674,24 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); - bio->bi_end_io = z_erofs_decompressqueue_endio; - - last_bdev = mdev.m_bdev; - bio->bi_iter.bi_sector = (sector_t)cur << - (sb->s_blocksize_bits - 9); + bio->bi_end_io = z_erofs_submissionqueue_endio; + bio->bi_iter.bi_sector = cur >> 9; bio->bi_private = q[JQ_SUBMIT]; if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; + last_bdev = mdev.m_bdev; } - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + if (cur + bvec.bv_len > end) + bvec.bv_len = end - cur; + if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, + bvec.bv_offset)) goto submit_bio_retry; - last_index = cur; + last_pa = cur + bvec.bv_len; bypass = false; - } while (++cur < end); + } while ((cur += bvec.bv_len) < end); if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 7b55111fd533..9753875e41cb 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -82,29 +82,26 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, } static unsigned int decode_compactedbits(unsigned int lobits, - unsigned int lomask, u8 *in, unsigned int pos, u8 *type) { const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); - const unsigned int lo = v & lomask; + const unsigned int lo = v & ((1 << lobits) - 1); *type = (v >> lobits) & 3; return lo; } -static int get_compacted_la_distance(unsigned int lclusterbits, +static int get_compacted_la_distance(unsigned int lobits, unsigned int encodebits, unsigned int vcnt, u8 *in, int i) { - const unsigned int lomask = (1 << lclusterbits) - 1; unsigned int lo, d1 = 0; u8 type; DBG_BUGON(i >= vcnt); do { - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) return d1; @@ -123,15 +120,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; - const unsigned int lomask = (1 << lclusterbits) - 1; - unsigned int vcnt, base, lo, encodebits, nblk, eofs; + unsigned int vcnt, base, lo, lobits, encodebits, nblk, eofs; int i; u8 *in, type; bool big_pcluster; if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; 
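decode_compactedbits() above now derives the low-bit mask from lobits rather than taking a precomputed lomask. A runnable userspace model of the bitfield split it performs, using memcpy() on a little-endian host in place of get_unaligned_le32() and made-up sample bytes:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	static unsigned int decode(const uint8_t *in, unsigned int lobits,
				   unsigned int pos, uint8_t *type)
	{
		uint32_t v;

		memcpy(&v, in + pos / 8, sizeof(v));	/* get_unaligned_le32() stand-in */
		v >>= pos & 7;
		*type = (v >> lobits) & 3;		/* 2-bit lcluster type */
		return v & ((1u << lobits) - 1);	/* low "lobits" bits */
	}

	int main(void)
	{
		const uint8_t in[8] = { 0x34, 0x12, 0, 0, 0, 0, 0, 0 };
		uint8_t type;
		unsigned int lo = decode(in, 12, 0, &type);

		printf("lo=0x%x type=%u\n", lo, type);	/* lo=0x234 type=1 */
		return 0;
	}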
- else if (1 << amortizedshift == 2 && lclusterbits == 12) + else if (1 << amortizedshift == 2 && lclusterbits <= 12) vcnt = 16; else return -EOPNOTSUPP; @@ -140,6 +136,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; eofs = erofs_blkoff(m->inode->i_sb, pos); base = round_down(eofs, vcnt << amortizedshift); @@ -147,15 +144,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, i = (eofs - base) >> amortizedshift; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); m->type = type; if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; /* figure out lookahead_distance: delta[1] if needed */ if (lookahead) - m->delta[1] = get_compacted_la_distance(lclusterbits, + m->delta[1] = get_compacted_la_distance(lobits, encodebits, vcnt, in, i); if (lo & Z_EROFS_LI_D0_CBLKCNT) { if (!big_pcluster) { @@ -174,8 +170,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, * of which lo saves delta[1] rather than delta[0]. * Hence, get delta[0] by the previous lcluster indirectly. */ - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * (i - 1), &type); + lo = decode_compactedbits(lobits, in, + encodebits * (i - 1), &type); if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) lo = 0; else if (lo & Z_EROFS_LI_D0_CBLKCNT) @@ -190,8 +186,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, nblk = 1; while (i > 0) { --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, + encodebits * i, &type); if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) i -= lo; @@ -202,8 +198,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, nblk = 0; while (i > 0) { --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, + encodebits * i, &type); if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { if (lo & Z_EROFS_LI_D0_CBLKCNT) { --i; diff --git a/fs/eventfd.c b/fs/eventfd.c index 33a918f9566c..ad8186d47ba7 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -43,7 +43,17 @@ struct eventfd_ctx { int id; }; -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) +/** + * eventfd_signal_mask - Increment the event counter + * @ctx: [in] Pointer to the eventfd context. + * @mask: [in] poll mask + * + * This function is supposed to be called by the kernel in paths that do not + * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX + * value, and we signal this as overflow condition by returning a EPOLLERR + * to poll(2). + */ +void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { unsigned long flags; @@ -56,45 +66,23 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) * safe context. 
*/ if (WARN_ON_ONCE(current->in_eventfd)) - return 0; + return; spin_lock_irqsave(&ctx->wqh.lock, flags); current->in_eventfd = 1; - if (ULLONG_MAX - ctx->count < n) - n = ULLONG_MAX - ctx->count; - ctx->count += n; + if (ctx->count < ULLONG_MAX) + ctx->count++; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask); current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); - - return n; -} - -/** - * eventfd_signal - Adds @n to the eventfd counter. - * @ctx: [in] Pointer to the eventfd context. - * @n: [in] Value of the counter to be added to the eventfd internal counter. - * The value cannot be negative. - * - * This function is supposed to be called by the kernel in paths that do not - * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX - * value, and we signal this as overflow condition by returning a EPOLLERR - * to poll(2). - * - * Returns the amount by which the counter was incremented. This will be less - * than @n if the counter has overflowed. - */ -__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) -{ - return eventfd_signal_mask(ctx, n, 0); } -EXPORT_SYMBOL_GPL(eventfd_signal); +EXPORT_SYMBOL_GPL(eventfd_signal_mask); static void eventfd_free_ctx(struct eventfd_ctx *ctx) { if (ctx->id >= 0) - ida_simple_remove(&eventfd_ida, ctx->id); + ida_free(&eventfd_ida, ctx->id); kfree(ctx); } @@ -407,7 +395,7 @@ static int do_eventfd(unsigned int count, int flags) init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; - ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL); + ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL); flags &= EFD_SHARED_FCNTL_FLAGS; flags |= O_RDWR; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2877cc01cff1..3534d36a1474 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -322,7 +322,6 @@ static struct ctl_table epoll_table[] = { .extra1 = &long_zero, .extra2 = &long_max, }, - { } }; static void __init epoll_sysctls_init(void) diff --git a/fs/exec.c b/fs/exec.c index 4aa19b24f281..73e4045df271 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -66,6 +66,7 @@ #include <linux/coredump.h> #include <linux/time_namespace.h> #include <linux/user_events.h> +#include <linux/rseq.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -1578,11 +1579,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... 
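The eventfd hunk above also moves from the deprecated ida_simple_get()/ida_simple_remove() pair to ida_alloc()/ida_free(). A brief kernel-context sketch of the modern pairing; example_ida and the wrapper functions are hypothetical:

	#include <linux/idr.h>

	static DEFINE_IDA(example_ida);

	static int example_get_id(void)
	{
		/* lowest free ID >= 0, or a negative errno on failure */
		return ida_alloc(&example_ida, GFP_KERNEL);
	}

	static void example_put_id(int id)
	{
		ida_free(&example_ida, id);	/* always pair with ida_alloc() */
	}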
*/ - t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); - while_each_thread(p, t) { + for_other_threads(p, t) { if (t->fs == p->fs) n_fs++; } @@ -2165,7 +2165,6 @@ static struct ctl_table fs_exec_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { } }; static int __init init_fs_exec_sysctls(void) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 464faf6c217e..5a4272b2c6b0 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -969,7 +969,7 @@ const struct address_space_operations ext2_aops = { .writepages = ext2_writepages, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; static const struct address_space_operations ext2_dax_aops = { diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 65f702b1da5b..8346ab9534c1 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -325,6 +325,7 @@ static int ext2_rename (struct mnt_idmap * idmap, struct ext2_dir_entry_2 * dir_de = NULL; struct folio * old_folio; struct ext2_dir_entry_2 * old_de; + bool old_is_dir = S_ISDIR(old_inode->i_mode); int err; if (flags & ~RENAME_NOREPLACE) @@ -342,7 +343,7 @@ static int ext2_rename (struct mnt_idmap * idmap, if (IS_ERR(old_de)) return PTR_ERR(old_de); - if (S_ISDIR(old_inode->i_mode)) { + if (old_is_dir && old_dir != new_dir) { err = -EIO; dir_de = ext2_dotdot(old_inode, &dir_folio); if (!dir_de) @@ -354,7 +355,7 @@ static int ext2_rename (struct mnt_idmap * idmap, struct ext2_dir_entry_2 *new_de; err = -ENOTEMPTY; - if (dir_de && !ext2_empty_dir (new_inode)) + if (old_is_dir && !ext2_empty_dir(new_inode)) goto out_dir; new_de = ext2_find_entry(new_dir, &new_dentry->d_name, @@ -368,14 +369,14 @@ static int ext2_rename (struct mnt_idmap * idmap, if (err) goto out_dir; inode_set_ctime_current(new_inode); - if (dir_de) + if (old_is_dir) drop_nlink(new_inode); inode_dec_link_count(new_inode); } else { err = ext2_add_link(new_dentry, old_inode); if (err) goto out_dir; - if (dir_de) + if (old_is_dir) inode_inc_link_count(new_dir); } @@ -387,7 +388,7 @@ static int ext2_rename (struct mnt_idmap * idmap, mark_inode_dirty(old_inode); err = ext2_delete_entry(old_de, old_folio); - if (!err && dir_de) { + if (!err && old_is_dir) { if (old_dir != new_dir) err = ext2_set_link(old_inode, dir_de, dir_folio, new_dir, false); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d1a2e6624401..5d8055161acd 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -235,8 +235,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); - ext4_check_bdev_write_error(sb); - if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) { @@ -244,7 +242,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, handle, err); return err; } - } + } else + ext4_check_bdev_write_error(sb); if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d5efe076d3d3..01299b55a567 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4523,7 +4523,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, * Round up offset. This is not fallocate, we need to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the - * range. + * range. 
Here, start and partial_begin are inclusive, end and + * partial_end are exclusive. */ start = round_up(offset, 1 << blkbits); end = round_down((offset + len), 1 << blkbits); @@ -4609,7 +4610,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, * disk in case of crash before zeroing trans is committed. */ if (ext4_should_journal_data(inode)) { - ret = filemap_write_and_wait_range(mapping, start, end); + ret = filemap_write_and_wait_range(mapping, start, + end - 1); if (ret) { filemap_invalidate_unlock(mapping); goto out_mutex; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 9a84a5f9fef4..d5bd1e3a5d36 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -502,9 +502,8 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) BUG_ON(len > PAGE_SIZE); kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); - flush_dcache_folio(folio); + kaddr = folio_zero_tail(folio, len, kaddr + len); kunmap_local(kaddr); - folio_zero_segment(folio, len, folio_size(folio)); folio_mark_uptodate(folio); brelse(iloc.bh); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 61277f7f8722..5af1b0b8680e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1261,7 +1261,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode, * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * - * ext4 never places buffers on inode->i_mapping->private_list. metadata + * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ static int ext4_write_end(struct file *file, @@ -2947,7 +2947,7 @@ static int ext4_da_should_update_i_disksize(struct folio *folio, static int ext4_da_do_write_end(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page) + struct folio *folio) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -2958,12 +2958,13 @@ static int ext4_da_do_write_end(struct address_space *mapping, * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ - copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL); + copied = block_write_end(NULL, mapping, pos, len, copied, + &folio->page, NULL); new_i_size = pos + copied; /* - * It's important to update i_size while still holding page lock, - * because page writeout could otherwise come in and zero beyond + * It's important to update i_size while still holding folio lock, + * because folio writeout could otherwise come in and zero beyond * i_size. 
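ext4_zero_range() above converts only the block-aligned interior of the request to unwritten extents and zeroes the unaligned edges by hand; start is inclusive and end exclusive, as the new comment states. A runnable sketch of that rounding with a made-up offset/length and 4 KiB blocks:

	#include <stdio.h>

	#define ROUND_UP(x, a)   (((x) + (a) - 1) & ~((a) - 1))
	#define ROUND_DOWN(x, a) ((x) & ~((a) - 1))

	int main(void)
	{
		unsigned int blkbits = 12;			/* 4 KiB blocks */
		unsigned long long offset = 1000, len = 9000;	/* hypothetical request */
		unsigned long long start = ROUND_UP(offset, 1ULL << blkbits);
		unsigned long long end = ROUND_DOWN(offset + len, 1ULL << blkbits);

		/* interior blocks [start, end) become unwritten; the edges are zeroed */
		printf("aligned bytes [%llu, %llu), blocks [%llu, %llu)\n",
		       start, end, start >> blkbits, end >> blkbits);
		return 0;
	}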
* * Since we are holding inode lock, we are sure i_disksize <= @@ -2981,14 +2982,14 @@ static int ext4_da_do_write_end(struct address_space *mapping, i_size_write(inode, new_i_size); end = (new_i_size - 1) & (PAGE_SIZE - 1); - if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) { + if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; } } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); @@ -3027,10 +3028,10 @@ static int ext4_da_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - if (unlikely(copied < len) && !PageUptodate(page)) + if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; - return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page); + return ext4_da_do_write_end(mapping, pos, len, copied, folio); } /* @@ -3213,7 +3214,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) } /* Any metadata buffers to write? */ - if (!list_empty(&inode->i_mapping->private_list)) + if (!list_empty(&inode->i_mapping->i_private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } @@ -3564,7 +3565,7 @@ static const struct address_space_operations ext4_aops = { .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3581,7 +3582,7 @@ static const struct address_space_operations ext4_journalled_aops = { .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3598,7 +3599,7 @@ static const struct address_space_operations ext4_da_aops = { .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3630,6 +3631,12 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } +/* + * Here we can't skip an unwritten buffer even though it usually reads zero + * because it might have data in pagecache (eg, if called from ext4_zero_range, + * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a + * racing writeback can come later and flush the stale pagecache to disk. 
+ */ static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 4f931f80cb34..aa6be510eb8f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -819,11 +819,11 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: - ret = freeze_bdev(sb->s_bdev); + ret = bdev_freeze(sb->s_bdev); if (ret) return ret; set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); - thaw_bdev(sb->s_bdev); + bdev_thaw(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d72b5e3c92ec..f44f668e407f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1456,9 +1456,8 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, return 0; } - block++; - pnum = block / blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, gfp); + /* blocks_per_page == 1, hence we need another page for the buddy */ + page = find_or_create_page(inode->i_mapping, block + 1, gfp); if (!page) return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); @@ -1958,8 +1957,7 @@ done: static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { - int next = block; - int max, order; + int max, order, next; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); @@ -1977,16 +1975,12 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, /* find actual order */ order = mb_find_order_for_block(e4b, block); - block = block >> order; - ex->fe_len = 1 << order; - ex->fe_start = block << order; + ex->fe_len = (1 << order) - (block & ((1 << order) - 1)); + ex->fe_start = block; ex->fe_group = e4b->bd_group; - /* calc difference from given start */ - next = next - ex->fe_start; - ex->fe_len -= next; - ex->fe_start += next; + block = block >> order; while (needed > ex->fe_len && mb_find_buddy(e4b, order, &max)) { @@ -2895,14 +2889,19 @@ repeat: ac->ac_groups_scanned++; if (cr == CR_POWER2_ALIGNED) ext4_mb_simple_scan_group(ac, &e4b); - else if ((cr == CR_GOAL_LEN_FAST || - cr == CR_BEST_AVAIL_LEN) && - sbi->s_stripe && - !(ac->ac_g_ex.fe_len % - EXT4_B2C(sbi, sbi->s_stripe))) - ext4_mb_scan_aligned(ac, &e4b); - else - ext4_mb_complex_scan_group(ac, &e4b); + else { + bool is_stripe_aligned = sbi->s_stripe && + !(ac->ac_g_ex.fe_len % + EXT4_B2C(sbi, sbi->s_stripe)); + + if ((cr == CR_GOAL_LEN_FAST || + cr == CR_BEST_AVAIL_LEN) && + is_stripe_aligned) + ext4_mb_scan_aligned(ac, &e4b); + + if (ac->ac_status == AC_STATUS_CONTINUE) + ext4_mb_complex_scan_group(ac, &e4b); + } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -6735,11 +6734,16 @@ __acquires(bitlock) static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, ext4_group_t grp) { - if (grp < ext4_get_groups_count(sb)) - return EXT4_CLUSTERS_PER_GROUP(sb) - 1; - return (ext4_blocks_count(EXT4_SB(sb)->s_es) - - ext4_group_first_block_no(sb, grp) - 1) >> - EXT4_CLUSTER_BITS(sb); + unsigned long nr_clusters_in_group; + + if (grp < (ext4_get_groups_count(sb) - 1)) + nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb); + else + nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, grp)) + >> EXT4_CLUSTER_BITS(sb); + + return nr_clusters_in_group - 1; } static bool ext4_trim_interrupted(void) @@ -6753,13 +6757,15 @@ static int ext4_try_to_trim_range(struct super_block *sb, 
__acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { - ext4_grpblk_t next, count, free_count; + ext4_grpblk_t next, count, free_count, last, origin_start; bool set_trimmed = false; void *bitmap; + last = ext4_last_grp_cluster(sb, e4b->bd_group); bitmap = e4b->bd_bitmap; - if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group)) + if (start == 0 && max >= last) set_trimmed = true; + origin_start = start; start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; @@ -6768,7 +6774,10 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) start = mb_find_next_zero_bit(bitmap, max + 1, start); if (start > max) break; - next = mb_find_next_bit(bitmap, max + 1, start); + + next = mb_find_next_bit(bitmap, last + 1, start); + if (origin_start == 0 && next >= last) + set_trimmed = true; if ((next - start) >= minblocks) { int ret = ext4_trim_extent(sb, start, next - start, e4b); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d252935f9c8a..05b647e6bc19 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2388,8 +2388,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, sb = dir->i_sb; blocksize = sb->s_blocksize; - if (!dentry->d_name.len) - return -EINVAL; if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; @@ -3591,10 +3589,14 @@ struct ext4_renament { int dir_inlined; }; -static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) +static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross) { int retval; + ent->is_dir = true; + if (!is_cross) + return 0; + ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, &retval, &ent->parent_de, &ent->dir_inlined); @@ -3612,6 +3614,9 @@ static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, { int retval; + if (!ent->dir_bh) + return 0; + ent->parent_de->inode = cpu_to_le32(dir_ino); BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); if (!ent->dir_inlined) { @@ -3900,7 +3905,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } - retval = ext4_rename_dir_prepare(handle, &old); + retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } @@ -3964,7 +3969,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, } inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir)); ext4_update_dx_flag(old.dir); - if (old.dir_bh) { + if (old.is_dir) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) goto end_rename; @@ -3987,7 +3992,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (unlikely(retval)) goto end_rename; - if (S_ISDIR(old.inode->i_mode)) { + if (old.is_dir) { /* * We disable fast commits here that's because the * replay code is not yet capable of changing dot dot @@ -4114,14 +4119,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_handle_sync(handle); if (S_ISDIR(old.inode->i_mode)) { - old.is_dir = true; - retval = ext4_rename_dir_prepare(handle, &old); + retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } if (S_ISDIR(new.inode->i_mode)) { - new.is_dir = true; - retval = ext4_rename_dir_prepare(handle, &new); + retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir); if (retval) goto end_rename; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 
dfdd7e5cf038..312bc6813357 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -444,7 +444,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, folio_clear_error(folio); /* - * Comments copied from block_write_full_page: + * Comments copied from block_write_full_folio: * * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 4fe061edefdd..4d4a5a32e310 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -218,35 +218,53 @@ struct ext4_new_flex_group_data { in the flex group */ __u16 *bg_flags; /* block group flags of groups in @groups */ + ext4_group_t resize_bg; /* number of allocated + new_group_data */ ext4_group_t count; /* number of groups in @groups */ }; /* + * Avoiding memory allocation failures due to too many groups added each time. + */ +#define MAX_RESIZE_BG 16384 + +/* * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of * @flexbg_size. * * Returns NULL on failure otherwise address of the allocated structure. */ -static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size, + ext4_group_t o_group, ext4_group_t n_group) { + ext4_group_t last_group; struct ext4_new_flex_group_data *flex_gd; flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); if (flex_gd == NULL) goto out3; - if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data)) - goto out2; - flex_gd->count = flexbg_size; + if (unlikely(flexbg_size > MAX_RESIZE_BG)) + flex_gd->resize_bg = MAX_RESIZE_BG; + else + flex_gd->resize_bg = flexbg_size; + + /* Avoid allocating large 'groups' array if not needed */ + last_group = o_group | (flex_gd->resize_bg - 1); + if (n_group <= last_group) + flex_gd->resize_bg = 1 << fls(n_group - o_group + 1); + else if (n_group - last_group < flex_gd->resize_bg) + flex_gd->resize_bg = 1 << max(fls(last_group - o_group + 1), + fls(n_group - last_group)); - flex_gd->groups = kmalloc_array(flexbg_size, + flex_gd->groups = kmalloc_array(flex_gd->resize_bg, sizeof(struct ext4_new_group_data), GFP_NOFS); if (flex_gd->groups == NULL) goto out2; - flex_gd->bg_flags = kmalloc_array(flexbg_size, sizeof(__u16), + flex_gd->bg_flags = kmalloc_array(flex_gd->resize_bg, sizeof(__u16), GFP_NOFS); if (flex_gd->bg_flags == NULL) goto out1; @@ -283,7 +301,7 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) */ static int ext4_alloc_group_tables(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, - int flexbg_size) + unsigned int flexbg_size) { struct ext4_new_group_data *group_data = flex_gd->groups; ext4_fsblk_t start_blk; @@ -384,12 +402,12 @@ next_group: group = group_data[0].group; printk(KERN_DEBUG "EXT4-fs: adding a flex group with " - "%d groups, flexbg size is %d:\n", flex_gd->count, + "%u groups, flexbg size is %u:\n", flex_gd->count, flexbg_size); for (i = 0; i < flex_gd->count; i++) { ext4_debug( - "adding %s group %u: %u blocks (%d free, %d mdata blocks)\n", + "adding %s group %u: %u blocks (%u free, %u mdata blocks)\n", ext4_bg_has_super(sb, group + i) ? 
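alloc_flex_gd() above now caps the scratch groups array at MAX_RESIZE_BG and, when the resize only adds a handful of groups, shrinks it further with fls(). A runnable userspace model of the first of those sizing cases; fls_() stands in for the kernel's fls() and the group numbers are invented:

	#include <stdio.h>

	/* 1-based index of the most significant set bit; 0 for an input of 0. */
	static unsigned int fls_(unsigned int x)
	{
		unsigned int r = 0;

		while (x) {
			r++;
			x >>= 1;
		}
		return r;
	}

	int main(void)
	{
		unsigned int resize_bg = 16;			/* hypothetical flexbg size */
		unsigned int o_group = 100, n_group = 103;	/* current/target last group */
		unsigned int last_group = o_group | (resize_bg - 1);

		if (n_group <= last_group)	/* few groups added: shrink the array */
			resize_bg = 1u << fls_(n_group - o_group + 1);
		printf("allocate room for %u group descriptors\n", resize_bg);
		return 0;	/* prints 8 for these sample numbers */
	}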
"normal" : "no-super", group + i, group_data[i].blocks_count, @@ -1605,8 +1623,7 @@ exit: static int ext4_setup_next_flex_gd(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, - ext4_fsblk_t n_blocks_count, - unsigned long flexbg_size) + ext4_fsblk_t n_blocks_count) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; @@ -1630,7 +1647,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, BUG_ON(last); ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); - last_group = group | (flexbg_size - 1); + last_group = group | (flex_gd->resize_bg - 1); if (last_group > n_group) last_group = n_group; @@ -1990,8 +2007,9 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ext4_fsblk_t o_blocks_count; ext4_fsblk_t n_blocks_count_retry = 0; unsigned long last_update_time = 0; - int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; + int err = 0; int meta_bg; + unsigned int flexbg_size = ext4_flex_bg_size(sbi); /* See if the device is actually as big as what was requested */ bh = ext4_sb_bread(sb, n_blocks_count - 1, 0); @@ -2123,7 +2141,7 @@ retry: if (err) goto out; - flex_gd = alloc_flex_gd(flexbg_size); + flex_gd = alloc_flex_gd(flexbg_size, o_group, n_group); if (flex_gd == NULL) { err = -ENOMEM; goto out; @@ -2132,8 +2150,7 @@ retry: /* Add flex groups. Note that a regular group is a * flex group with 1 group. */ - while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, - flexbg_size)) { + while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count)) { if (time_is_before_jiffies(last_update_time + HZ * 10)) { if (last_update_time) ext4_msg(sb, KERN_INFO, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c5fcf377ab1f..dcba0f85dfe2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2793,15 +2793,6 @@ static int ext4_check_opt_consistency(struct fs_context *fc, return -EINVAL; } - if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) { - int blocksize = - BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - if (blocksize < PAGE_SIZE) - ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an " - "experimental mount option 'dioread_nolock' " - "for blocksize < PAGE_SIZE"); - } - err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; @@ -4410,7 +4401,7 @@ static void ext4_set_def_opts(struct super_block *sb, ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); - if (sb->s_blocksize == PAGE_SIZE) + if (sb->s_blocksize <= PAGE_SIZE) set_opt(sb, DIOREAD_NOLOCK); } @@ -5864,11 +5855,9 @@ static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, struct ext4_super_block *es; int errno; - /* see get_tree_bdev why this is needed and safe */ - up_write(&sb->s_umount); - bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, - sb, &fs_holder_ops); - down_write(&sb->s_umount); + bdev_handle = bdev_open_by_dev(j_dev, + BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, + sb, &fs_holder_ops); if (IS_ERR(bdev_handle)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 36e5dab6baae..531517dac079 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1036,8 +1036,10 @@ static void set_cluster_dirty(struct compress_ctx *cc) int i; for (i = 0; i < cc->cluster_size; i++) - if (cc->rpages[i]) + if (cc->rpages[i]) { set_page_dirty(cc->rpages[i]); + set_page_private_gcing(cc->rpages[i]); + } } static int prepare_compress_overwrite(struct 
compress_ctx *cc, @@ -1369,8 +1371,6 @@ unlock_continue: add_compr_block_stat(inode, cc->valid_nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); - if (cc->cluster_idx == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); f2fs_put_dnode(&dn); if (quota_inode) @@ -1944,7 +1944,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) continue; } - generic_error_remove_page(mapping, &folio->page); + generic_error_remove_folio(mapping, folio); folio_unlock(folio); } folio_batch_release(&fbatch); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4e42b5f24deb..26e317696b33 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -995,7 +995,7 @@ static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr) } blkaddr -= FDEV(devi).start_blk; } - return bdev_zoned_model(FDEV(devi).bdev) == BLK_ZONED_HM && + return bdev_is_zoned(FDEV(devi).bdev) && f2fs_blkz_is_seq(sbi, devi, blkaddr) && (blkaddr % sbi->blocks_per_blkz == sbi->blocks_per_blkz - 1); } @@ -1179,18 +1179,12 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, return 0; } -static void __set_data_blkaddr(struct dnode_of_data *dn) +static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { - struct f2fs_node *rn = F2FS_NODE(dn->node_page); - __le32 *addr_array; - int base = 0; + __le32 *addr = get_dnode_addr(dn->inode, dn->node_page); - if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) - base = get_extra_isize(dn->inode); - - /* Get physical address of data block */ - addr_array = blkaddr_in_node(rn); - addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + dn->data_blkaddr = blkaddr; + addr[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -1199,18 +1193,17 @@ static void __set_data_blkaddr(struct dnode_of_data *dn) * ->node_page * update block addresses in the node page */ -void f2fs_set_data_blkaddr(struct dnode_of_data *dn) +void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); - __set_data_blkaddr(dn); + __set_data_blkaddr(dn, blkaddr); if (set_page_dirty(dn->node_page)) dn->node_changed = true; } void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { - dn->data_blkaddr = blkaddr; - f2fs_set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn, blkaddr); f2fs_update_read_extent_cache(dn); } @@ -1237,8 +1230,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) block_t blkaddr = f2fs_data_blkaddr(dn); if (blkaddr == NULL_ADDR) { - dn->data_blkaddr = NEW_ADDR; - __set_data_blkaddr(dn); + __set_data_blkaddr(dn, NEW_ADDR); count--; } } @@ -1492,11 +1484,9 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { - invalidate_mapping_pages(META_MAPPING(sbi), - old_blkaddr, old_blkaddr); - f2fs_invalidate_compress_page(sbi, old_blkaddr); - } + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + f2fs_invalidate_internal_cache(sbi, old_blkaddr); + f2fs_update_data_blkaddr(dn, dn->data_blkaddr); return 0; } @@ -1992,7 +1982,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - inode_lock(inode); + inode_lock_shared(inode); maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; if (start > maxbytes) { @@ -2112,7 +2102,7 @@ out: if (ret == 1) ret = 0; - inode_unlock(inode); + 
inode_unlock_shared(inode); return ret; } @@ -2566,9 +2556,6 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio) page = fio->compressed_page ? fio->compressed_page : fio->page; - /* wait for GCed page writeback via META_MAPPING */ - f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); - if (fscrypt_inode_uses_inline_crypto(inode)) return 0; @@ -2755,6 +2742,10 @@ got_it: goto out_writepage; } + /* wait for GCed page writeback via META_MAPPING */ + if (fio->post_read) + f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); + /* * If current allocation needs SSR, * it had better in-place writes for updated data. @@ -2810,8 +2801,6 @@ got_it: f2fs_outplace_write_data(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); out_writepage: f2fs_put_dnode(&dn); out: @@ -2894,9 +2883,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, zero_user_segment(page, offset, PAGE_SIZE); write: - if (f2fs_is_drop_cache(inode)) - goto out; - /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || quota_inode) { /* diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9043cedfa12b..65294e3b0bef 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -374,6 +374,12 @@ enum { MAX_DPOLICY, }; +enum { + DPOLICY_IO_AWARE_DISABLE, /* force to not be aware of IO */ + DPOLICY_IO_AWARE_ENABLE, /* force to be aware of IO */ + DPOLICY_IO_AWARE_MAX, +}; + struct discard_policy { int type; /* type of discard */ unsigned int min_interval; /* used for candidates exist */ @@ -406,6 +412,7 @@ struct discard_cmd_control { unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ + unsigned int discard_io_aware; /* io_aware policy */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ @@ -774,8 +781,6 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ - FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ - FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_SKIP_WRITES, /* should skip data page writeback */ @@ -3272,22 +3277,13 @@ static inline bool f2fs_is_cow_file(struct inode *inode) return is_inode_flag_set(inode, FI_COW_FILE); } -static inline bool f2fs_is_first_block_written(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); -} - -static inline bool f2fs_is_drop_cache(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_DROP_CACHE); -} - +static inline __le32 *get_dnode_addr(struct inode *inode, + struct page *node_page); static inline void *inline_data_addr(struct inode *inode, struct page *page) { - struct f2fs_inode *ri = F2FS_INODE(page); - int extra_size = get_extra_isize(inode); + __le32 *addr = get_dnode_addr(inode, page); - return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); + return (void *)(addr + DEF_INLINE_RESERVED_SIZE); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -3432,6 +3428,17 @@ static inline int get_inline_xattr_addrs(struct inode *inode) return F2FS_I(inode)->i_inline_xattr_size; } +static inline 
__le32 *get_dnode_addr(struct inode *inode, + struct page *node_page) +{ + int base = 0; + + if (IS_INODE(node_page) && f2fs_has_extra_attr(inode)) + base = get_extra_isize(inode); + + return blkaddr_in_node(F2FS_NODE(node_page)) + base; +} + #define f2fs_get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -3815,7 +3822,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, sector_t *sector); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); -void f2fs_set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); @@ -4606,6 +4613,13 @@ static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); } +static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr); + f2fs_invalidate_compress_page(sbi, blkaddr); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e50363583f01..b58ab1157b7e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -42,11 +42,11 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) vm_fault_t ret; ret = filemap_fault(vmf); - if (!ret) + if (ret & VM_FAULT_LOCKED) f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_MAPPED_READ_IO, F2FS_BLKSIZE); - trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret); + trace_f2fs_filemap_fault(inode, vmf->pgoff, vmf->vma->vm_flags, ret); return ret; } @@ -59,26 +59,29 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) struct dnode_of_data dn; bool need_alloc = true; int err = 0; + vm_fault_t ret; if (unlikely(IS_IMMUTABLE(inode))) return VM_FAULT_SIGBUS; - if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) - return VM_FAULT_SIGBUS; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + err = -EIO; + goto out; + } if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; - goto err; + goto out; } if (!f2fs_is_checkpoint_ready(sbi)) { err = -ENOSPC; - goto err; + goto out; } err = f2fs_convert_inline_inode(inode); if (err) - goto err; + goto out; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { @@ -86,7 +89,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (ret < 0) { err = ret; - goto err; + goto out; } else if (ret) { need_alloc = false; } @@ -153,13 +156,15 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE); f2fs_update_time(sbi, REQ_TIME); - trace_f2fs_vm_page_mkwrite(page, DATA); out_sem: filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); -err: - return vmf_fs_error(err); +out: + ret = vmf_fs_error(err); + + trace_f2fs_vm_page_mkwrite(inode, page->index, vmf->vma->vm_flags, ret); + return ret; } static const struct vm_operations_struct f2fs_file_vm_ops = { @@ -418,7 +423,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - inode_lock(inode); + inode_lock_shared(inode); isize = i_size_read(inode); if (offset >= isize) @@ -483,10 +488,10 @@ 
static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - inode_unlock(inode); + inode_unlock_shared(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - inode_unlock(inode); + inode_unlock_shared(inode); return -ENXIO; } @@ -557,20 +562,14 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct f2fs_node *raw_node; int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; - int base = 0; bool compressed_cluster = false; int cluster_index = 0, valid_blocks = 0; int cluster_size = F2FS_I(dn->inode)->i_cluster_size; bool released = !atomic_read(&F2FS_I(dn->inode)->i_compr_blocks); - if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) - base = get_extra_isize(dn->inode); - - raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + base + ofs; + addr = get_dnode_addr(dn->inode, dn->node_page) + ofs; /* Assumption: truncation starts with cluster */ for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { @@ -588,8 +587,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) if (blkaddr == NULL_ADDR) continue; - dn->data_blkaddr = NULL_ADDR; - f2fs_set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn, NULL_ADDR); if (__is_valid_data_blkaddr(blkaddr)) { if (!f2fs_is_valid_blkaddr(sbi, blkaddr, @@ -599,9 +597,6 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) valid_blocks++; } - if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) - clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); - f2fs_invalidate_blocks(sbi, blkaddr); if (!released || blkaddr != COMPRESS_ADDR) @@ -1317,6 +1312,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, } memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE); set_page_dirty(pdst); + set_page_private_gcing(pdst); f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); @@ -1487,8 +1483,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, } f2fs_invalidate_blocks(sbi, dn->data_blkaddr); - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn, NEW_ADDR); } f2fs_update_read_extent_cache_range(dn, start, 0, index - start); @@ -2239,11 +2234,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) switch (in) { case F2FS_GOING_DOWN_FULLSYNC: - ret = freeze_bdev(sb->s_bdev); + ret = bdev_freeze(sb->s_bdev); if (ret) goto out; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - thaw_bdev(sb->s_bdev); + bdev_thaw(sb->s_bdev); break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ @@ -2818,6 +2813,11 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, goto out; } + if (f2fs_compressed_file(src) || f2fs_compressed_file(dst)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + ret = -EINVAL; if (pos_in + len > src->i_size || pos_in + len < pos_in) goto out_unlock; @@ -3463,8 +3463,7 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (blkaddr != NEW_ADDR) continue; - dn->data_blkaddr = NULL_ADDR; - f2fs_set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn, NULL_ADDR); } f2fs_i_compr_blocks_update(dn->inode, compr_blocks, false); @@ -3630,8 +3629,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) continue; } - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr(dn); + 
f2fs_set_data_blkaddr(dn, NEW_ADDR); } reserved = cluster_size - compr_blocks; @@ -4054,6 +4052,7 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) f2fs_bug_on(F2FS_I_SB(inode), !page); set_page_dirty(page); + set_page_private_gcing(page); f2fs_put_page(page, 1); f2fs_put_page(page, 0); } @@ -4568,7 +4567,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, if (map.m_len > map.m_lblk) map.m_len -= map.m_lblk; else - map.m_len = 0; + return 0; + map.m_may_create = true; if (dio) { map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f550cdeaa663..a079eebfb080 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -46,8 +46,8 @@ static int gc_thread_func(void *data) do { bool sync_mode, foreground = false; - wait_event_interruptible_timeout(*wq, - kthread_should_stop() || freezing(current) || + wait_event_freezable_timeout(*wq, + kthread_should_stop() || waitqueue_active(fggc_wq) || gc_th->gc_wake, msecs_to_jiffies(wait_ms)); @@ -59,7 +59,7 @@ static int gc_thread_func(void *data) if (gc_th->gc_wake) gc_th->gc_wake = false; - if (try_to_freeze() || f2fs_readonly(sbi->sb)) { + if (f2fs_readonly(sbi->sb)) { stat_other_skip_bggc_count(sbi); continue; } @@ -1380,9 +1380,8 @@ static int move_data_block(struct inode *inode, block_t bidx, memcpy(page_address(fio.encrypted_page), page_address(mpage), PAGE_SIZE); f2fs_put_page(mpage, 1); - invalidate_mapping_pages(META_MAPPING(fio.sbi), - fio.old_blkaddr, fio.old_blkaddr); - f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr); + + f2fs_invalidate_internal_cache(fio.sbi, fio.old_blkaddr); set_page_dirty(fio.encrypted_page); if (clear_page_dirty_for_io(fio.encrypted_page)) @@ -1405,8 +1404,6 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); put_page_out: f2fs_put_page(fio.encrypted_page, 1); recover_block: @@ -1868,6 +1865,9 @@ retry: seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, gc_control->should_migrate_blocks); + if (seg_freed < 0) + goto stop; + total_freed += seg_freed; if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 560bfcad1af2..c26effdce9aa 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -61,49 +61,31 @@ void f2fs_set_inode_flags(struct inode *inode) S_ENCRYPTED|S_VERITY|S_CASEFOLD); } -static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +static void __get_inode_rdev(struct inode *inode, struct page *node_page) { - int extra_size = get_extra_isize(inode); + __le32 *addr = get_dnode_addr(inode, node_page); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[extra_size]) - inode->i_rdev = old_decode_dev( - le32_to_cpu(ri->i_addr[extra_size])); + if (addr[0]) + inode->i_rdev = old_decode_dev(le32_to_cpu(addr[0])); else - inode->i_rdev = new_decode_dev( - le32_to_cpu(ri->i_addr[extra_size + 1])); + inode->i_rdev = new_decode_dev(le32_to_cpu(addr[1])); } } -static int __written_first_block(struct f2fs_sb_info *sbi, - struct f2fs_inode *ri) +static void __set_inode_rdev(struct inode *inode, struct page *node_page) { - block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); - - if (!__is_valid_data_blkaddr(addr)) - return 1; - if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) { - 
f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); - return -EFSCORRUPTED; - } - return 0; -} - -static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) -{ - int extra_size = get_extra_isize(inode); + __le32 *addr = get_dnode_addr(inode, node_page); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[extra_size] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[extra_size + 1] = 0; + addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); + addr[1] = 0; } else { - ri->i_addr[extra_size] = 0; - ri->i_addr[extra_size + 1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[extra_size + 2] = 0; + addr[0] = 0; + addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); + addr[2] = 0; } } } @@ -398,7 +380,6 @@ static int do_read_inode(struct inode *inode) struct page *node_page; struct f2fs_inode *ri; projid_t i_projid; - int err; /* Check if ino is within scope */ if (f2fs_check_nid_range(sbi, inode->i_ino)) @@ -478,17 +459,7 @@ static int do_read_inode(struct inode *inode) } /* get rdev by using inline_info */ - __get_inode_rdev(inode, ri); - - if (S_ISREG(inode->i_mode)) { - err = __written_first_block(sbi, ri); - if (err < 0) { - f2fs_put_page(node_page, 1); - return err; - } - if (!err) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); - } + __get_inode_rdev(inode, node_page); if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; @@ -600,7 +571,7 @@ make_now: #ifdef CONFIG_F2FS_FS_COMPRESSION inode->i_mapping->a_ops = &f2fs_compress_aops; /* - * generic_error_remove_page only truncates pages of regular + * generic_error_remove_folio only truncates pages of regular * inode */ inode->i_mode |= S_IFREG; @@ -761,7 +732,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) } } - __set_inode_rdev(inode, ri); + __set_inode_rdev(inode, node_page); /* deleted inode */ if (inode->i_nlink == 0) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d0053b0284d8..b3bb815fc6aa 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -459,7 +459,6 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct qstr dot = QSTR_INIT(".", 1); - struct qstr dotdot = QSTR_INIT("..", 2); struct f2fs_dir_entry *de; struct page *page; int err = 0; @@ -497,13 +496,13 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) goto out; } - de = f2fs_find_entry(dir, &dotdot, &page); + de = f2fs_find_entry(dir, &dotdot_name, &page); if (de) f2fs_put_page(page, 0); else if (IS_ERR(page)) err = PTR_ERR(page); else - err = f2fs_do_add_link(dir, &dotdot, NULL, pino, S_IFDIR); + err = f2fs_do_add_link(dir, &dotdot_name, NULL, pino, S_IFDIR); out: if (!err) clear_inode_flag(dir, FI_INLINE_DOTS); @@ -963,6 +962,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; + bool old_is_dir = S_ISDIR(old_inode->i_mode); int err; if (unlikely(f2fs_cp_error(sbi))) @@ -1017,7 +1017,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out; } - if (S_ISDIR(old_inode->i_mode)) { + if (old_is_dir && old_dir != new_dir) { old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); if (!old_dir_entry) { if (IS_ERR(old_dir_page)) @@ -1029,7 +1029,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new_inode) { err = -ENOTEMPTY; - if (old_dir_entry && 
!f2fs_empty_dir(new_inode)) + if (old_is_dir && !f2fs_empty_dir(new_inode)) goto out_dir; err = -ENOENT; @@ -1054,7 +1054,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, inode_set_ctime_current(new_inode); f2fs_down_write(&F2FS_I(new_inode)->i_sem); - if (old_dir_entry) + if (old_is_dir) f2fs_i_links_write(new_inode, false); f2fs_i_links_write(new_inode, false); f2fs_up_write(&F2FS_I(new_inode)->i_sem); @@ -1074,12 +1074,12 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out_dir; } - if (old_dir_entry) + if (old_is_dir) f2fs_i_links_write(new_dir, true); } f2fs_down_write(&F2FS_I(old_inode)->i_sem); - if (!old_dir_entry || whiteout) + if (!old_is_dir || whiteout) file_lost_pino(old_inode); else /* adjust dir's i_pino to pass fsck check */ @@ -1105,8 +1105,8 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, iput(whiteout); } - if (old_dir_entry) { - if (old_dir != new_dir && !whiteout) + if (old_is_dir) { + if (old_dir_entry) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); else @@ -1316,21 +1316,27 @@ static int f2fs_rename2(struct mnt_idmap *idmap, if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; + trace_f2fs_rename_start(old_dir, old_dentry, new_dir, new_dentry, + flags); + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (err) return err; - if (flags & RENAME_EXCHANGE) { - return f2fs_cross_rename(old_dir, old_dentry, - new_dir, new_dentry); - } + if (flags & RENAME_EXCHANGE) + err = f2fs_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + else /* * VFS has already handled the new dentry existence case, * here, we just deal with "RENAME_NOREPLACE" as regular rename. */ - return f2fs_rename(idmap, old_dir, old_dentry, + err = f2fs_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); + + trace_f2fs_rename_end(old_dentry, new_dentry, flags, err); + return err; } static const char *f2fs_encrypted_get_link(struct dentry *dentry, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 6c7f6a649d27..9b546fd21010 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2751,11 +2751,11 @@ recover_xnid: f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ - if (page) + if (page) { memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); - - set_page_dirty(xpage); + set_page_dirty(xpage); + } f2fs_put_page(xpage, 1); return 0; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b56d0f1078a7..d0f24ccbd1ac 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -712,7 +712,16 @@ retry_dn: */ if (dest == NEW_ADDR) { f2fs_truncate_data_blocks_range(&dn, 1); - f2fs_reserve_new_block(&dn); + do { + err = f2fs_reserve_new_block(&dn); + if (err == -ENOSPC) { + f2fs_bug_on(sbi, 1); + break; + } + } while (err && + IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)); + if (err) + goto err; continue; } @@ -720,12 +729,14 @@ retry_dn: if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { - err = f2fs_reserve_new_block(&dn); - while (err && - IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) + do { err = f2fs_reserve_new_block(&dn); - /* We should not get -ENOSPC */ - f2fs_bug_on(sbi, err); + if (err == -ENOSPC) { + f2fs_bug_on(sbi, 1); + break; + } + } while (err && + IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)); if (err) goto err; } @@ -906,6 +917,8 @@ skip: if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) && f2fs_sb_has_blkzoned(sbi)) { err = f2fs_fix_curseg_write_pointer(sbi); 
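/*
 * [Editorial sketch -- not part of the patch series above.]
 * The fs/f2fs/recovery.c hunks above rework the calls to
 * f2fs_reserve_new_block() so that failures are retried only when
 * CONFIG_F2FS_FAULT_INJECTION can inject them, while a genuine -ENOSPC
 * during roll-forward recovery is flagged with f2fs_bug_on() and aborts
 * instead of looping.  The resulting control flow, shown in isolation:
 *
 *	do {
 *		err = f2fs_reserve_new_block(&dn);
 *		if (err == -ENOSPC) {
 *			f2fs_bug_on(sbi, 1);
 *			break;
 *		}
 *	} while (err && IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
 *	if (err)
 *		goto err;
 */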
+ if (!err) + err = f2fs_check_write_pointer(sbi); ret = err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 727d016318f9..4c8836ded90f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1172,7 +1172,10 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->min_interval = dcc->min_discard_issue_time; dpolicy->mid_interval = dcc->mid_discard_issue_time; dpolicy->max_interval = dcc->max_discard_issue_time; - dpolicy->io_aware = true; + if (dcc->discard_io_aware == DPOLICY_IO_AWARE_ENABLE) + dpolicy->io_aware = true; + else if (dcc->discard_io_aware == DPOLICY_IO_AWARE_DISABLE) + dpolicy->io_aware = false; dpolicy->sync = false; dpolicy->ordered = true; if (utilization(sbi) > dcc->discard_urgent_util) { @@ -1380,7 +1383,8 @@ static void __insert_discard_cmd(struct f2fs_sb_info *sbi, p = &(*p)->rb_right; leftmost = false; } else { - f2fs_bug_on(sbi, 1); + /* Let's skip to add, if exists */ + return; } } @@ -1883,9 +1887,8 @@ static int issue_discard_thread(void *data) set_freezable(); do { - wait_event_interruptible_timeout(*q, - kthread_should_stop() || freezing(current) || - dcc->discard_wake, + wait_event_freezable_timeout(*q, + kthread_should_stop() || dcc->discard_wake, msecs_to_jiffies(wait_ms)); if (sbi->gc_mode == GC_URGENT_HIGH || @@ -1903,8 +1906,6 @@ static int issue_discard_thread(void *data) if (atomic_read(&dcc->queued_discard)) __wait_all_discard_cmd(sbi, NULL); - if (try_to_freeze()) - continue; if (f2fs_readonly(sbi->sb)) continue; if (kthread_should_stop()) @@ -2274,6 +2275,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->discard_io_aware_gran = MAX_PLIST_NUM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; + dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) dcc->discard_granularity = sbi->blocks_per_seg; else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) @@ -2495,8 +2497,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) if (addr == NEW_ADDR || addr == COMPRESS_ADDR) return; - invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); - f2fs_invalidate_compress_page(sbi, addr); + f2fs_invalidate_internal_cache(sbi, addr); /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); @@ -3557,11 +3558,8 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio); - if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) { - invalidate_mapping_pages(META_MAPPING(fio->sbi), - fio->old_blkaddr, fio->old_blkaddr); - f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr); - } + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) + f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr); /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); @@ -3757,9 +3755,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, update_sit_entry(sbi, new_blkaddr, 1); } if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { - invalidate_mapping_pages(META_MAPPING(sbi), - old_blkaddr, old_blkaddr); - f2fs_invalidate_compress_page(sbi, old_blkaddr); + f2fs_invalidate_internal_cache(sbi, old_blkaddr); if (!from_gc) update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); @@ -4865,99 +4861,56 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, struct 
f2fs_dev_info *fdev, struct blk_zone *zone) { - unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno; - block_t zone_block, wp_block, last_valid_block; + unsigned int zone_segno; + block_t zone_block, valid_block_cnt; unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; - int i, s, b, ret; - struct seg_entry *se; + int ret; if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; - wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block); - wp_segno = GET_SEGNO(sbi, wp_block); - wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); zone_segno = GET_SEGNO(sbi, zone_block); - zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno); - - if (zone_segno >= MAIN_SEGS(sbi)) - return 0; /* * Skip check of zones cursegs point to, since * fix_curseg_write_pointer() checks them. */ - for (i = 0; i < NO_CHECK_TYPE; i++) - if (zone_secno == GET_SEC_FROM_SEG(sbi, - CURSEG_I(sbi, i)->segno)) - return 0; + if (zone_segno >= MAIN_SEGS(sbi) || + IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) + return 0; /* - * Get last valid block of the zone. + * Get # of valid block of the zone. */ - last_valid_block = zone_block - 1; - for (s = sbi->segs_per_sec - 1; s >= 0; s--) { - segno = zone_segno + s; - se = get_seg_entry(sbi, segno); - for (b = sbi->blocks_per_seg - 1; b >= 0; b--) - if (f2fs_test_bit(b, se->cur_valid_map)) { - last_valid_block = START_BLOCK(sbi, segno) + b; - break; - } - if (last_valid_block >= zone_block) - break; - } + valid_block_cnt = get_valid_blocks(sbi, zone_segno, true); - /* - * When safely unmounted in the previous mount, we can trust write - * pointers. Otherwise, finish zones. - */ - if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - /* - * The write pointer matches with the valid blocks or - * already points to the end of the zone. - */ - if ((last_valid_block + 1 == wp_block) || - (zone->wp == zone->start + zone->len)) - return 0; - } + if ((!valid_block_cnt && zone->cond == BLK_ZONE_COND_EMPTY) || + (valid_block_cnt && zone->cond == BLK_ZONE_COND_FULL)) + return 0; - if (last_valid_block + 1 == zone_block) { - if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - /* - * If there is no valid block in the zone and if write - * pointer is not at zone start, reset the write - * pointer. - */ - f2fs_notice(sbi, - "Zone without valid block has non-zero write " - "pointer. Reset the write pointer: wp[0x%x,0x%x]", - wp_segno, wp_blkoff); - } + if (!valid_block_cnt) { + f2fs_notice(sbi, "Zone without valid block has non-zero write " + "pointer. Reset the write pointer: cond[0x%x]", + zone->cond); ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, zone->len >> log_sectors_per_block); if (ret) f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", fdev->path, ret); - return ret; } - if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - /* - * If there are valid blocks and the write pointer doesn't match - * with them, we need to report the inconsistency and fill - * the zone till the end to close the zone. This inconsistency - * does not cause write error because the zone will not be - * selected for write operation until it get discarded. 
- */ - f2fs_notice(sbi, "Valid blocks are not aligned with write " - "pointer: valid block[0x%x,0x%x] wp[0x%x,0x%x]", - GET_SEGNO(sbi, last_valid_block), - GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), - wp_segno, wp_blkoff); - } + /* + * If there are valid blocks and the write pointer doesn't match + * with them, we need to report the inconsistency and fill + * the zone till the end to close the zone. This inconsistency + * does not cause write error because the zone will not be + * selected for write operation until it get discarded. + */ + f2fs_notice(sbi, "Valid blocks are not aligned with write " + "pointer: valid block[0x%x,0x%x] cond[0x%x]", + zone_segno, valid_block_cnt, zone->cond); ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, zone->start, zone->len, GFP_NOFS); @@ -5048,15 +5001,18 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " "curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff); - } else { - f2fs_notice(sbi, "Not successfully unmounted in the previous " - "mount"); } - f2fs_notice(sbi, "Assign new section to curseg[%d]: " - "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); + /* Allocate a new section if it's not new. */ + if (cs->next_blkoff) { + unsigned int old_segno = cs->segno, old_blkoff = cs->next_blkoff; - f2fs_allocate_new_section(sbi, type, true); + f2fs_allocate_new_section(sbi, type, true); + f2fs_notice(sbi, "Assign new section to curseg[%d]: " + "[0x%x,0x%x] -> [0x%x,0x%x]", + type, old_segno, old_blkoff, + cs->segno, cs->next_blkoff); + } /* check consistency of the zone curseg pointed to */ if (check_zone_write_pointer(sbi, zbd, &zone)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 033af907c3b1..d00d21a8b53a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1422,11 +1422,6 @@ default_check: } } - if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "LFS is not compatible with checkpoint=disable"); - return -EINVAL; - } - if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "LFS is not compatible with ATGC"); return -EINVAL; @@ -1717,12 +1712,10 @@ static void f2fs_put_super(struct super_block *sb) kvfree(sbi->ckpt); - sb->s_fs_info = NULL; if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); - destroy_device_list(sbi); f2fs_destroy_page_array_cache(sbi); f2fs_destroy_xattr_caches(sbi); mempool_destroy(sbi->write_io_dummy); @@ -1738,7 +1731,6 @@ static void f2fs_put_super(struct super_block *sb) #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif - kfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) @@ -3364,6 +3356,14 @@ loff_t max_file_blocks(struct inode *inode) leaf_count *= NIDS_PER_BLOCK; result += leaf_count; + /* + * For compatibility with FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{64,32} with + * a 4K crypto data unit, we must restrict the max filesize to what can + * fit within U32_MAX + 1 data units. 
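/*
 * [Editorial note -- worked example, not part of the patch.]
 * With the default 4 KiB f2fs block size (F2FS_BLKSIZE_BITS == 12), the
 * max_file_blocks() clamp added just below evaluates to:
 *
 *	(((loff_t)U32_MAX + 1) * 4096) >> 12  ==  1ULL << 32  blocks
 *
 * i.e. at most 2^32 crypto data units of 4 KiB each (16 TiB), so every
 * data-unit index fits in 32 bits, as the IV_INO_LBLK_{64,32} fscrypt
 * policies require.
 */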
+ */ + + result = min(result, (((loff_t)U32_MAX + 1) * 4096) >> F2FS_BLKSIZE_BITS); + return result; } @@ -4282,24 +4282,21 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) sbi->aligned_blksize = false; #ifdef CONFIG_BLK_DEV_ZONED - if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && - !f2fs_sb_has_blkzoned(sbi)) { - f2fs_err(sbi, "Zoned block device feature not enabled"); - return -EINVAL; - } - if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (bdev_is_zoned(FDEV(i).bdev)) { + if (!f2fs_sb_has_blkzoned(sbi)) { + f2fs_err(sbi, "Zoned block device feature not enabled"); + return -EINVAL; + } if (init_blkz_info(sbi, i)) { f2fs_err(sbi, "Failed to initialize F2FS blkzone information"); return -EINVAL; } if (max_devices == 1) break; - f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: Host-managed)", i, FDEV(i).path, FDEV(i).total_segments, - FDEV(i).start_blk, FDEV(i).end_blk, - bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? - "Host-aware" : "Host-managed"); + FDEV(i).start_blk, FDEV(i).end_blk); continue; } #endif @@ -4746,7 +4743,7 @@ try_onemore: #ifdef CONFIG_QUOTA f2fs_recover_quota_end(sbi, quota_enabled); #endif - +reset_checkpoint: /* * If the f2fs is not readonly and fsync data recovery succeeds, * check zoned block devices' write pointer consistency. @@ -4757,7 +4754,6 @@ try_onemore: goto free_meta; } -reset_checkpoint: f2fs_init_inmem_curseg(sbi); /* f2fs_recover_fsync_data() cleared this already */ @@ -4902,9 +4898,9 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, static void kill_f2fs_super(struct super_block *sb) { - if (sb->s_root) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(sb); + if (sb->s_root) { set_sbi_flag(sbi, SBI_IS_CLOSE); f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); @@ -4931,6 +4927,12 @@ static void kill_f2fs_super(struct super_block *sb) sb->s_flags &= ~SB_RDONLY; } kill_block_super(sb); + /* Release block devices last, after fscrypt_destroy_keyring(). 
*/ + if (sbi) { + destroy_device_list(sbi); + kfree(sbi); + sb->s_fs_info = NULL; + } } static struct file_system_type f2fs_fs_type = { diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 417fae96890f..a7ec55c7bb20 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -143,6 +143,33 @@ static ssize_t pending_discard_show(struct f2fs_attr *a, &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } +static ssize_t issued_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->issued_discard)); +} + +static ssize_t queued_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->queued_discard)); +} + +static ssize_t undiscard_blks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sysfs_emit(buf, "%u\n", + SM_I(sbi)->dcc_info->undiscard_blks); +} + static ssize_t gc_mode_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -516,6 +543,13 @@ out: return count; } + if (!strcmp(a->attr.name, "discard_io_aware")) { + if (t >= DPOLICY_IO_AWARE_MAX) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; @@ -734,6 +768,13 @@ out: return count; } + if (!strcmp(a->attr.name, "dir_level")) { + if (t > MAX_DIR_HASH_DEPTH) + return -EINVAL; + sbi->dir_level = t; + return count; + } + *ui = (unsigned int)t; return count; @@ -926,6 +967,7 @@ DCC_INFO_GENERAL_RW_ATTR(discard_io_aware_gran); DCC_INFO_GENERAL_RW_ATTR(discard_urgent_util); DCC_INFO_GENERAL_RW_ATTR(discard_granularity); DCC_INFO_GENERAL_RW_ATTR(max_ordered_discard); +DCC_INFO_GENERAL_RW_ATTR(discard_io_aware); /* NM_INFO ATTR */ NM_INFO_RW_ATTR(max_roll_forward_node_blocks, max_rf_node_blocks); @@ -1074,6 +1116,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_urgent_util), ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), + ATTR_LIST(discard_io_aware), ATTR_LIST(pending_discard), ATTR_LIST(gc_mode), ATTR_LIST(ipu_policy), @@ -1197,9 +1240,16 @@ ATTRIBUTE_GROUPS(f2fs_feat); F2FS_GENERAL_RO_ATTR(sb_status); F2FS_GENERAL_RO_ATTR(cp_status); +F2FS_GENERAL_RO_ATTR(issued_discard); +F2FS_GENERAL_RO_ATTR(queued_discard); +F2FS_GENERAL_RO_ATTR(undiscard_blks); + static struct attribute *f2fs_stat_attrs[] = { ATTR_LIST(sb_status), ATTR_LIST(cp_status), + ATTR_LIST(issued_discard), + ATTR_LIST(queued_discard), + ATTR_LIST(undiscard_blks), NULL, }; ATTRIBUTE_GROUPS(f2fs_stat); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 47e88b4d4e7d..f290fe9327c4 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -660,11 +660,14 @@ retry: here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { if (!F2FS_I(inode)->i_xattr_nid) { + error = f2fs_recover_xattr_data(inode, NULL); f2fs_notice(F2FS_I_SB(inode), - "recover xattr in inode (%lu)", inode->i_ino); - f2fs_recover_xattr_data(inode, NULL); - kfree(base_addr); - goto retry; + "recover xattr in inode (%lu), error(%d)", + inode->i_ino, error); + if (!error) { + kfree(base_addr); + goto retry; + } } f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr", inode->i_ino); @@ -754,6 +757,12 @@ retry: memcpy(pval, value, size); last->e_value_size = 
cpu_to_le16(size); new_hsize += newsize; + /* + * Explicitly add the null terminator. The unused xattr space + * is supposed to always be zeroed, which would make this + * unnecessary, but don't depend on that. + */ + *(u32 *)((u8 *)last + newsize) = 0; } error = write_all_xattrs(inode, new_hsize, base_addr, ipage); diff --git a/fs/file.c b/fs/file.c index 5fb0b146e79e..3b683b9101d8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -629,19 +629,23 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); /** - * pick_file - return file associatd with fd + * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * + * Doesn't take a separate reference count. + * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ -static struct file *pick_file(struct files_struct *files, unsigned fd) +struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; + lockdep_assert_held(&files->file_lock); + if (fd >= fdt->max_fds) return NULL; @@ -660,7 +664,7 @@ int close_fd(unsigned fd) struct file *file; spin_lock(&files->file_lock); - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; @@ -707,7 +711,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); @@ -795,26 +799,21 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) return 0; } -/* - * See close_fd_get_file() below, this variant assumes current->files->file_lock - * is held. - */ -struct file *__close_fd_get_file(unsigned int fd) -{ - return pick_file(current->files, fd); -} - -/* - * variant of close_fd that gets a ref on the file for later fput. - * The caller must ensure that filp_close() called on the file. +/** + * file_close_fd - return file associated with fd + * @fd: file descriptor to retrieve file for + * + * Doesn't take a separate reference count. + * + * Returns: The file associated with @fd (NULL if @fd is not open) */ -struct file *close_fd_get_file(unsigned int fd) +struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; @@ -959,31 +958,45 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; + unsigned long nospec_mask; - if (unlikely(fd >= fdt->max_fds)) - return NULL; - - fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); + /* Mask is a 0 for invalid fd's, ~0 for valid ones */ + nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); /* - * Ok, we have a file pointer. However, because we do - * this all locklessly under RCU, we may be racing with - * that file being closed. - * - * Such a race can take two forms: - * - * (a) the file ref already went down to zero and the - * file hasn't been reused yet or the file count - * isn't zero but the file has already been reused. + * fdentry points to the 'fd' offset, or fdt->fd[0]. 
+ * Loading from fdt->fd[0] is always safe, because the + * array always exists. */ - file = __get_file_rcu(fdentry); + fdentry = fdt->fd + (fd & nospec_mask); + + /* Do the load, then mask any invalid result */ + file = rcu_dereference_raw(*fdentry); + file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; - if (unlikely(IS_ERR(file))) + /* + * Ok, we have a file pointer that was valid at + * some point, but it might have become stale since. + * + * We need to confirm it by incrementing the refcount + * and then check the lookup again. + * + * atomic_long_inc_not_zero() gives us a full memory + * barrier. We only really need an 'acquire' one to + * protect the loads below, but we don't have that. + */ + if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) continue; /* + * Such a race can take two forms: + * + * (a) the file ref already went down to zero and the + * file hasn't been reused yet or the file count + * isn't zero but the file has already been reused. + * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes @@ -991,7 +1004,8 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * If so, we need to put our ref and try again. */ - if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) { + if (unlikely(file != rcu_dereference_raw(*fdentry)) || + unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } @@ -1128,13 +1142,13 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ - if (atomic_read_acquire(&files->count) == 1) { + if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; return (unsigned long)file; } else { - file = __fget(fd, mask); + file = __fget_files(files, fd, mask); if (!file) return 0; return FDPUT_FPUT | (unsigned long)file; @@ -1282,7 +1296,7 @@ out_unlock: } /** - * __receive_fd() - Install received file into file descriptor table + * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry @@ -1296,7 +1310,7 @@ out_unlock: * * Returns newly install fd or -ve on error. 
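/*
 * [Editorial sketch -- summary of the lockless lookup pattern in the
 *  fs/file.c hunk above, not part of the patch.]
 * The reworked __fget_files_rcu() drops the explicit max_fds bounds
 * check and __get_file_rcu() in favour of an index mask plus an
 * optimistic reference with revalidation:
 *
 *	mask = array_index_mask_nospec(fd, fdt->max_fds);   // 0 or ~0UL
 *	file = rcu_dereference_raw(*(fdt->fd + (fd & mask)));
 *	file = (void *)(mask & (unsigned long)file);        // NULL if fd invalid
 *	if (file && atomic_long_inc_not_zero(&file->f_count)) {
 *		// re-check *fdentry and files->fdt; on any change,
 *		// fput() the reference and retry the lookup
 *	}
 */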
*/ -int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; @@ -1321,6 +1335,7 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) __receive_sock(file); return new_fd; } +EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { @@ -1336,12 +1351,6 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) return new_fd; } -int receive_fd(struct file *file, unsigned int o_flags) -{ - return __receive_fd(file, NULL, o_flags); -} -EXPORT_SYMBOL_GPL(receive_fd); - static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; diff --git a/fs/file_table.c b/fs/file_table.c index de4a2915bfd4..b991f90571b4 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -75,18 +75,6 @@ static inline void file_free(struct file *f) } } -void release_empty_file(struct file *f) -{ - WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED)); - if (atomic_long_dec_and_test(&f->f_count)) { - security_file_free(f); - put_cred(f->f_cred); - if (likely(!(f->f_mode & FMODE_NOACCOUNT))) - percpu_counter_dec(&nr_files); - kmem_cache_free(filp_cachep, f); - } -} - /* * Return the total number of open files in the system */ @@ -142,7 +130,6 @@ static struct ctl_table fs_stat_sysctls[] = { .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, - { } }; static int __init init_fs_stat_sysctls(void) @@ -329,9 +316,6 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, const char *name, int flags, const struct file_operations *fops) { - static const struct dentry_operations anon_ops = { - .d_dname = simple_dname - }; struct qstr this = QSTR_INIT(name, strlen(name)); struct path path; struct file *file; @@ -339,8 +323,6 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); if (!path.dentry) return ERR_PTR(-ENOMEM); - if (!mnt->mnt_sb->s_d_op) - d_set_d_op(path.dentry, &anon_ops); path.mnt = mntget(mnt); d_instantiate(path.dentry, inode); file = alloc_file(&path, flags, fops); @@ -419,7 +401,7 @@ static void delayed_fput(struct work_struct *unused) static void ____fput(struct callback_head *work) { - __fput(container_of(work, struct file, f_rcuhead)); + __fput(container_of(work, struct file, f_task_work)); } /* @@ -445,9 +427,13 @@ void fput(struct file *file) if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; + if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { + file_free(file); + return; + } if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { - init_task_work(&file->f_rcuhead, ____fput); - if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME)) + init_task_work(&file->f_task_work, ____fput); + if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; /* * After this task has run exit_task_work(), diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c index de2a5bccb930..26d367e3668d 100644 --- a/fs/freevxfs/vxfs_bmap.c +++ b/fs/freevxfs/vxfs_bmap.c @@ -29,7 +29,7 @@ vxfs_typdump(struct vxfs_typed *typ) /** * vxfs_bmap_ext4 - do bmap for ext4 extents * @ip: pointer to the inode we do bmap for - * @iblock: logical block. + * @bn: logical block. 
* * Description: * vxfs_bmap_ext4 performs the bmap operation for inodes with @@ -97,7 +97,7 @@ fail_buf: * vxfs_bmap_indir reads a &struct vxfs_typed at @indir * and performs the type-defined action. * - * Return Value: + * Returns: * The physical block number on success, else Zero. * * Note: @@ -179,7 +179,7 @@ out: * Description: * Performs the bmap operation for typed extents. * - * Return Value: + * Returns: * The physical block number on success, else Zero. */ static daddr_t @@ -243,7 +243,7 @@ vxfs_bmap_typed(struct inode *ip, long iblock) * vxfs_bmap1 perfoms a logical to physical block mapping * for vxfs-internal purposes. * - * Return Value: + * Returns: * The physical block number on success, else Zero. */ daddr_t diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index 9b49ec36e667..ed51fcd34757 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -15,7 +15,7 @@ /** * vxfs_immed_read_folio - read part of an immed inode into pagecache - * @file: file context (unused) + * @fp: file context (unused) * @folio: folio to fill in. * * Description: diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index f04ba2ed1e1a..1b0bca8b4cc6 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -177,8 +177,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags) /** * vxfs_readdir - read a directory * @fp: the directory to read - * @retp: return buffer - * @filler: filldir callback + * @ctx: dir_context for filldir/readdir * * Description: * vxfs_readdir fills @retp with directory entries from @fp diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a660f1f21540..148a71b8b4d0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,6 +19,7 @@ #include <linux/uio.h> #include <linux/fs.h> #include <linux/filelock.h> +#include <linux/splice.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -3195,8 +3196,8 @@ static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off, len, flags); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src_file, src_off, dst_file, - dst_off, len, flags); + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); return ret; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9611bfceda4b..974aca9c8ea8 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -82,11 +82,11 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, } /** - * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_page + * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_folio * @folio: The folio to write * @wbc: The writeback control * - * This is the same as calling block_write_full_page, but it also + * This is the same as calling block_write_full_folio, but it also * writes pages outside of i_size */ static int gfs2_write_jdata_folio(struct folio *folio, @@ -108,7 +108,7 @@ static int gfs2_write_jdata_folio(struct folio *folio, folio_size(folio)); return __block_write_full_folio(inode, folio, gfs2_get_block_noalloc, - wbc, end_buffer_async_write); + wbc); } /** @@ -403,18 +403,18 @@ static int gfs2_jdata_writepages(struct address_space *mapping, } /** - * stuffed_readpage - Fill in a Linux folio with stuffed file data + * stuffed_read_folio - Fill in a Linux folio with stuffed file data * @ip: the inode * @folio: the folio * * Returns: errno */ -static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio) +static int 
stuffed_read_folio(struct gfs2_inode *ip, struct folio *folio) { - struct buffer_head *dibh; - size_t i_size = i_size_read(&ip->i_inode); - void *data; - int error; + struct buffer_head *dibh = NULL; + size_t dsize = i_size_read(&ip->i_inode); + void *from = NULL; + int error = 0; /* * Due to the order of unstuffing files and ->fault(), we can be @@ -422,22 +422,20 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio) * so we need to supply one here. It doesn't happen often. */ if (unlikely(folio->index)) { - folio_zero_range(folio, 0, folio_size(folio)); - folio_mark_uptodate(folio); - return 0; + dsize = 0; + } else { + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + from = dibh->b_data + sizeof(struct gfs2_dinode); } - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - - data = dibh->b_data + sizeof(struct gfs2_dinode); - memcpy_to_folio(folio, 0, data, i_size); - folio_zero_range(folio, i_size, folio_size(folio) - i_size); + folio_fill_tail(folio, 0, from, dsize); brelse(dibh); - folio_mark_uptodate(folio); +out: + folio_end_read(folio, error == 0); - return 0; + return error; } /** @@ -456,13 +454,12 @@ static int gfs2_read_folio(struct file *file, struct folio *folio) (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) { error = iomap_read_folio(folio, &gfs2_iomap_ops); } else if (gfs2_is_stuffed(ip)) { - error = stuffed_readpage(ip, folio); - folio_unlock(folio); + error = stuffed_read_folio(ip, folio); } else { error = mpage_read_folio(folio, gfs2_block_map); } - if (unlikely(gfs2_withdrawn(sdp))) + if (gfs2_withdrawing_or_withdrawn(sdp)) return -EIO; return error; @@ -748,7 +745,7 @@ static const struct address_space_operations gfs2_aops = { .bmap = gfs2_bmap, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; static const struct address_space_operations gfs2_jdata_aops = { @@ -761,7 +758,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .invalidate_folio = gfs2_invalidate_folio, .release_folio = gfs2_release_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; void gfs2_set_aops(struct inode *inode) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 2e215e8c3c88..177f1f41f225 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -32,21 +32,25 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) { - struct dentry *parent; + struct dentry *parent = NULL; struct gfs2_sbd *sdp; struct gfs2_inode *dip; - struct inode *inode; + struct inode *dinode, *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; int error, valid = 0; int had_lock = 0; - if (flags & LOOKUP_RCU) - return -ECHILD; - - parent = dget_parent(dentry); - sdp = GFS2_SB(d_inode(parent)); - dip = GFS2_I(d_inode(parent)); + if (flags & LOOKUP_RCU) { + dinode = d_inode_rcu(READ_ONCE(dentry->d_parent)); + if (!dinode) + return -ECHILD; + } else { + parent = dget_parent(dentry); + dinode = d_inode(parent); + } + sdp = GFS2_SB(dinode); + dip = GFS2_I(dinode); inode = d_inode(dentry); if (inode) { @@ -62,7 +66,8 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); if (!had_lock) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + error = 
gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, + flags & LOOKUP_RCU ? GL_NOBLOCK : 0, &d_gh); if (error) goto out; } diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index cf40895233f5..d418d8b5367f 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -138,8 +138,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, return ERR_PTR(-ESTALE); inode = gfs2_lookup_by_inum(sdp, inum->no_addr, inum->no_formal_ino, GFS2_BLKST_DINODE); - if (IS_ERR(inode)) - return ERR_CAST(inode); return d_obtain_alias(inode); } @@ -192,5 +190,6 @@ const struct export_operations gfs2_export_ops = { .fh_to_parent = gfs2_fh_to_parent, .get_name = gfs2_get_name, .get_parent = gfs2_get_parent, + .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4b66efc1a82a..992ca4effb50 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1442,7 +1442,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (unlikely(gfs2_withdrawn(sdp))) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { if (fl->fl_type == F_UNLCK) locks_lock_file_wait(file, fl); return -EIO; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d6bf1f8c25dc..34540f9d011c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -156,7 +156,7 @@ static bool glock_blocked_by_withdraw(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - if (likely(!gfs2_withdrawn(sdp))) + if (!gfs2_withdrawing_or_withdrawn(sdp)) return false; if (gl->gl_ops->go_flags & GLOF_NONDISK) return false; @@ -278,7 +278,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); if (mapping) { truncate_inode_pages_final(mapping); - if (!gfs2_withdrawn(sdp)) + if (!gfs2_withdrawing_or_withdrawn(sdp)) GLOCK_BUG_ON(gl, !mapping_empty(mapping)); } trace_gfs2_glock_put(gl); @@ -517,6 +517,23 @@ static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) } /** + * find_last_waiter - find the last gh that's waiting for the glock + * @gl: the glock + * + * This also is a fast way of finding out if there are any waiters. + */ + +static inline struct gfs2_holder *find_last_waiter(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + if (list_empty(&gl->gl_holders)) + return NULL; + gh = list_last_entry(&gl->gl_holders, struct gfs2_holder, gh_list); + return test_bit(HIF_HOLDER, &gh->gh_iflags) ? NULL : gh; +} + +/** * state_change - record that the glock is now in a different state * @gl: the glock * @new_state: the new state @@ -757,7 +774,7 @@ skip_inval: * gfs2_gl_hash_clear calls clear_glock) and recovery is complete * then it's okay to tell dlm to unlock it. 
*/ - if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp))) + if (unlikely(sdp->sd_log_error) && !gfs2_withdrawing_or_withdrawn(sdp)) gfs2_withdraw_delayed(sdp); if (glock_blocked_by_withdraw(gl) && (target != LM_ST_UNLOCKED || @@ -794,7 +811,7 @@ skip_inval: gfs2_glock_queue_work(gl, 0); } else if (ret) { fs_err(sdp, "lm_lock ret %d\n", ret); - GLOCK_BUG_ON(gl, !gfs2_withdrawn(sdp)); + GLOCK_BUG_ON(gl, !gfs2_withdrawing_or_withdrawn(sdp)); } } else { /* lock_nolock */ finish_xmote(gl, target); @@ -1213,7 +1230,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->host = s->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; } @@ -1555,11 +1572,30 @@ trap_recursive: int gfs2_glock_nq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - int error = 0; + int error; if (glock_blocked_by_withdraw(gl) && !(gh->gh_flags & LM_FLAG_NOEXP)) return -EIO; + if (gh->gh_flags & GL_NOBLOCK) { + struct gfs2_holder *current_gh; + + error = -ECHILD; + spin_lock(&gl->gl_lockref.lock); + if (find_last_waiter(gl)) + goto unlock; + current_gh = find_first_holder(gl); + if (!may_grant(gl, current_gh, gh)) + goto unlock; + set_bit(HIF_HOLDER, &gh->gh_iflags); + list_add_tail(&gh->gh_list, &gl->gl_holders); + trace_gfs2_promote(gh); + error = 0; +unlock: + spin_unlock(&gl->gl_lockref.lock); + return error; + } + if (test_bit(GLF_LRU, &gl->gl_flags)) gfs2_glock_remove_from_lru(gl); @@ -1575,6 +1611,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) run_queue(gl, 1); spin_unlock(&gl->gl_lockref.lock); + error = 0; if (!(gh->gh_flags & GL_ASYNC)) error = gfs2_glock_wait(gh); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 61197598abfd..0114f3e0ebe0 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -84,6 +84,7 @@ enum { #define GL_SKIP 0x0100 #define GL_NOPID 0x0200 #define GL_NOCACHE 0x0400 +#define GL_NOBLOCK 0x0800 /* * lm_async_cb return flags diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index b41c78bd2cc0..45653cbc8a87 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -174,7 +174,7 @@ static int gfs2_rgrp_metasync(struct gfs2_glock *gl) filemap_fdatawrite_range(metamapping, start, end); error = filemap_fdatawait_range(metamapping, start, end); - WARN_ON_ONCE(error && !gfs2_withdrawn(sdp)); + WARN_ON_ONCE(error && !gfs2_withdrawing_or_withdrawn(sdp)); mapping_set_error(metamapping, error); if (error) gfs2_io_error(sdp); @@ -494,7 +494,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip) /** * inode_go_instantiate - read in an inode if necessary - * @gh: The glock holder + * @gl: The glock * * Returns: errno */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 1b95db2c3aac..6bfc9383b7b8 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1882,10 +1882,10 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, WARN_ON_ONCE(!may_not_block); return -ECHILD; } - if (gfs2_glock_is_locked_by_me(gl) == NULL) { - if (may_not_block) - return -ECHILD; - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + int noblock = may_not_block ? 
GL_NOBLOCK : 0; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_ANY | noblock, &i_gh); if (error) return error; } diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 59ab18c79889..d1ac5d0679ea 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1122,7 +1122,7 @@ static void gdlm_recover_prep(void *arg) struct gfs2_sbd *sdp = arg; struct lm_lockstruct *ls = &sdp->sd_lockstruct; - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_err(sdp, "recover_prep ignored due to withdraw.\n"); return; } @@ -1148,7 +1148,7 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) struct lm_lockstruct *ls = &sdp->sd_lockstruct; int jid = slot->slot - 1; - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_err(sdp, "recover_slot jid %d ignored due to withdraw.\n", jid); return; @@ -1177,7 +1177,7 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, struct gfs2_sbd *sdp = arg; struct lm_lockstruct *ls = &sdp->sd_lockstruct; - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_err(sdp, "recover_done ignored due to withdraw.\n"); return; } @@ -1208,7 +1208,7 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_err(sdp, "recovery_result jid %d ignored due to withdraw.\n", jid); return; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index e5271ae87d1c..8cddf955ebc0 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -126,7 +126,7 @@ __acquires(&sdp->sd_ail_lock) } } - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { gfs2_remove_from_ail(bd); continue; } @@ -352,14 +352,15 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, * @sdp: The superblock * @max_revokes: If non-zero, add revokes where appropriate * - * Tries to empty the ail1 lists, starting with the oldest first + * Tries to empty the ail1 lists, starting with the oldest first. + * Returns %true if the ail1 list is now empty. 
*/ -static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) +static bool gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) { struct gfs2_trans *tr, *s; int oldest_tr = 1; - int ret; + bool empty; spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { @@ -369,15 +370,10 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) oldest_tr = 0; } gfs2_log_update_flush_tail(sdp); - ret = list_empty(&sdp->sd_ail1_list); + empty = list_empty(&sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); - if (test_bit(SDF_WITHDRAWING, &sdp->sd_flags)) { - gfs2_lm(sdp, "fatal: I/O error(s)\n"); - gfs2_withdraw(sdp); - } - - return ret; + return empty; } static void gfs2_ail1_wait(struct gfs2_sbd *sdp) @@ -814,6 +810,9 @@ void gfs2_flush_revokes(struct gfs2_sbd *sdp) gfs2_log_lock(sdp); gfs2_ail1_empty(sdp, max_revokes); gfs2_log_unlock(sdp); + + if (gfs2_withdrawing(sdp)) + gfs2_withdraw(sdp); } /** @@ -841,7 +840,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, struct super_block *sb = sdp->sd_vfs; u64 dblock; - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) return; page = mempool_alloc(gfs2_page_pool, GFP_NOIO); @@ -914,8 +913,9 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, static void log_write_header(struct gfs2_sbd *sdp, u32 flags) { blk_opf_t op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; + struct super_block *sb = sdp->sd_vfs; - gfs2_assert_withdraw(sdp, !test_bit(SDF_FROZEN, &sdp->sd_flags)); + gfs2_assert_withdraw(sdp, sb->s_writers.frozen != SB_FREEZE_COMPLETE); if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); @@ -974,8 +974,9 @@ void gfs2_ail_drain(struct gfs2_sbd *sdp) static void empty_ail1_list(struct gfs2_sbd *sdp) { unsigned long start = jiffies; + bool empty = false; - for (;;) { + while (!empty) { if (time_after(jiffies, start + (HZ * 600))) { fs_err(sdp, "Error: In %s for 10 minutes! t=%d\n", __func__, current->journal_info ? 
1 : 0); @@ -984,9 +985,14 @@ static void empty_ail1_list(struct gfs2_sbd *sdp) } gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); - if (gfs2_ail1_empty(sdp, 0)) - return; + empty = gfs2_ail1_empty(sdp, 0); + + if (gfs2_withdrawing_or_withdrawn(sdp)) + break; } + + if (gfs2_withdrawing(sdp)) + gfs2_withdraw(sdp); } /** @@ -1047,7 +1053,8 @@ repeat: * Do this check while holding the log_flush_lock to prevent new * buffers from being added to the ail via gfs2_pin() */ - if (gfs2_withdrawn(sdp) || !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + if (gfs2_withdrawing_or_withdrawn(sdp) || + !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) goto out; /* Log might have been flushed while we waited for the flush lock */ @@ -1096,13 +1103,13 @@ repeat: goto out_withdraw; gfs2_ordered_write(sdp); - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) goto out_withdraw; lops_before_commit(sdp, tr); - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) goto out_withdraw; gfs2_log_submit_bio(&sdp->sd_jdesc->jd_log_bio, REQ_OP_WRITE); - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) goto out_withdraw; if (sdp->sd_log_head != sdp->sd_log_flush_head) { @@ -1110,7 +1117,7 @@ repeat: } else if (sdp->sd_log_tail != sdp->sd_log_flush_tail && !sdp->sd_log_idle) { log_write_header(sdp, flags); } - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) goto out_withdraw; lops_after_commit(sdp, tr); @@ -1128,7 +1135,7 @@ repeat: if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) { if (!sdp->sd_log_idle) { empty_ail1_list(sdp); - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) goto out_withdraw; log_write_header(sdp, flags); } @@ -1297,8 +1304,9 @@ int gfs2_logd(void *data) struct gfs2_sbd *sdp = data; unsigned long t = 1; + set_freezable(); while (!kthread_should_stop()) { - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) break; /* Check for errors writing to the journal */ @@ -1330,18 +1338,19 @@ int gfs2_logd(void *data) t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; - try_to_freeze(); - - t = wait_event_interruptible_timeout(sdp->sd_logd_waitq, + t = wait_event_freezable_timeout(sdp->sd_logd_waitq, test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) || gfs2_ail_flush_reqd(sdp) || gfs2_jrnl_flush_reqd(sdp) || sdp->sd_log_error || - gfs2_withdrawn(sdp) || + gfs2_withdrawing_or_withdrawn(sdp) || kthread_should_stop(), t); } + if (gfs2_withdrawing(sdp)) + gfs2_withdraw(sdp); + return 0; } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 483f69807062..314ec2a70167 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -391,22 +391,15 @@ static void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) * Simply unlock the pages in the bio. The main thread will wait on them and * process them in order as necessary. 
*/ - static void gfs2_end_log_read(struct bio *bio) { - struct page *page; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + int error = blk_status_to_errno(bio->bi_status); + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - page = bvec->bv_page; - if (bio->bi_status) { - int err = blk_status_to_errno(bio->bi_status); - - SetPageError(page); - mapping_set_error(page->mapping, err); - } - unlock_page(page); + bio_for_each_folio_all(fi, bio) { + /* We're abusing wb_err to get the error to gfs2_find_jhead */ + filemap_set_wb_err(fi.folio->mapping, error); + folio_end_read(fi.folio, !error); } bio_put(bio); @@ -475,7 +468,7 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, folio = filemap_get_folio(jd->jd_inode->i_mapping, index); folio_wait_locked(folio); - if (folio_test_error(folio)) + if (!folio_test_uptodate(folio)) *done = true; if (!*done) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 25ceb0805df2..f814054c8cd0 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -252,7 +252,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head *bh, *bhs[2]; int num = 0; - if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp) && + !gfs2_withdraw_in_prog(sdp)) { *bhp = NULL; return -EIO; } @@ -310,7 +311,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) { - if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp) && + !gfs2_withdraw_in_prog(sdp)) return -EIO; wait_on_buffer(bh); @@ -321,7 +323,8 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) gfs2_io_error_bh_wd(sdp, bh); return -EIO; } - if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp) && + !gfs2_withdraw_in_prog(sdp)) return -EIO; return 0; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b108c5d26839..1281e60be639 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -117,7 +117,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mapping->host = sb->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); @@ -1073,7 +1073,7 @@ hostdata_error: void gfs2_lm_unmount(struct gfs2_sbd *sdp) { const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; - if (likely(!gfs2_withdrawn(sdp)) && lm->lm_unmount) + if (!gfs2_withdrawing_or_withdrawn(sdp) && lm->lm_unmount) lm->lm_unmount(sdp); } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 95dae7838b4e..aa9cf0102848 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -128,7 +128,7 @@ static void gfs2_qd_dispose(struct gfs2_quota_data *qd) hlist_bl_del_rcu(&qd->qd_hlist); spin_unlock_bucket(qd->qd_hash); - if (!gfs2_withdrawn(sdp)) { + if (!gfs2_withdrawing_or_withdrawn(sdp)) { gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_ref); gfs2_assert_warn(sdp, !qd->qd_bh_count); @@ -271,7 +271,7 @@ static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, if (qd->qd_sbd != sdp) continue; if (lockref_get_not_dead(&qd->qd_lockref)) { - list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); return qd; } } @@ -344,7 +344,7 @@ static void qd_put(struct gfs2_quota_data *qd) } 
qd->qd_lockref.count = 0; - list_lru_add(&gfs2_qd_lru, &qd->qd_lru); + list_lru_add_obj(&gfs2_qd_lru, &qd->qd_lru); spin_unlock(&qd->qd_lockref.lock); } @@ -1505,7 +1505,8 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) LIST_HEAD(dispose); int count; - BUG_ON(test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); + BUG_ON(!test_bit(SDF_NORECOVERY, &sdp->sd_flags) && + test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); spin_lock(&qd_lock); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { @@ -1517,7 +1518,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) lockref_mark_dead(&qd->qd_lockref); spin_unlock(&qd->qd_lockref.lock); - list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); list_add(&qd->qd_lru, &dispose); } spin_unlock(&qd_lock); @@ -1539,7 +1540,7 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { if (error == 0 || error == -EROFS) return; - if (!gfs2_withdrawn(sdp)) { + if (!gfs2_withdrawing_or_withdrawn(sdp)) { if (!cmpxchg(&sdp->sd_log_error, 0, error)) fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); wake_up(&sdp->sd_logd_waitq); @@ -1582,8 +1583,9 @@ int gfs2_quotad(void *data) unsigned long quotad_timeo = 0; unsigned long t = 0; + set_freezable(); while (!kthread_should_stop()) { - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) break; /* Update the master statfs file */ @@ -1601,13 +1603,11 @@ int gfs2_quotad(void *data) quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, &quotad_timeo, &tune->gt_quota_quantum); - try_to_freeze(); - t = min(quotad_timeo, statfs_timeo); - t = wait_event_interruptible_timeout(sdp->sd_quota_wait, + t = wait_event_freezable_timeout(sdp->sd_quota_wait, sdp->sd_statfs_force_sync || - gfs2_withdrawn(sdp) || + gfs2_withdrawing_or_withdrawn(sdp) || kthread_should_stop(), t); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 5aae02669a40..f4fe7039f725 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -411,7 +411,7 @@ void gfs2_recover_func(struct work_struct *work) int error = 0; int jlocked = 0; - if (gfs2_withdrawn(sdp)) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_err(sdp, "jid=%u: Recovery not attempted due to withdraw.\n", jd->jd_jid); goto fail; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c2060203b98a..26d6c1eea559 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -159,13 +159,13 @@ static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone) } /** - * gfs2_bit_search + * gfs2_bit_search - search bitmap for a state * @ptr: Pointer to bitmap data * @mask: Mask to use (normally 0x55555.... but adjusted for search start) * @state: The state we are searching for * - * We xor the bitmap data with a patter which is the bitwise opposite - * of what we are looking for, this gives rise to a pattern of ones + * We xor the bitmap data with a pattern which is the bitwise opposite + * of what we are looking for. This gives rise to a pattern of ones * wherever there is a match. Since we have two bits per entry, we * take this pattern, shift it down by one place and then and it with * the original. All the even bit positions (0,2,4, etc) then represent @@ -1188,7 +1188,7 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd) /** * gfs2_rgrp_go_instantiate - Read in a RG's header and bitmaps - * @gh: the glock holder representing the rgrpd to read in + * @gl: the glock representing the rgrpd to read in * * Read in all of a Resource Group's header and bitmap blocks. * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps. 
@@ -1967,7 +1967,7 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) } /** - * gfs2_rgrp_used_recently + * gfs2_rgrp_used_recently - test if an rgrp has been used recently * @rs: The block reservation with the rgrp to test * @msecs: The time limit in milliseconds * @@ -2306,7 +2306,7 @@ void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, (unsigned long long)rgd->rd_addr, rgd->rd_flags, rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, rgd->rd_requested, rgd->rd_reserved, rgd->rd_extfail_pt); - if (rgd->rd_sbd->sd_args.ar_rgrplvb) { + if (rgd->rd_sbd->sd_args.ar_rgrplvb && rgd->rd_rgl) { struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; gfs2_print_dbg(seq, "%s L: f:%02x b:%u i:%u\n", fs_id_buf, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index d21c04a22d73..e5f79466340d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -134,7 +134,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) int error; j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) return -EIO; error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); @@ -153,7 +153,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) gfs2_log_pointers_init(sdp, head.lh_blkno); error = gfs2_quota_init(sdp); - if (!error && gfs2_withdrawn(sdp)) + if (!error && gfs2_withdrawing_or_withdrawn(sdp)) error = -EIO; if (!error) set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); @@ -499,7 +499,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) return; } - if (unlikely(gfs2_withdrawn(sdp))) + if (gfs2_withdrawing_or_withdrawn(sdp)) return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); @@ -605,7 +605,7 @@ restart: if (!sb_rdonly(sb)) gfs2_make_fs_ro(sdp); else { - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) gfs2_destroy_threads(sdp); gfs2_quota_cleanup(sdp); @@ -673,28 +673,6 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) return sdp->sd_log_error; } -static int gfs2_freeze_locally(struct gfs2_sbd *sdp) -{ - struct super_block *sb = sdp->sd_vfs; - int error; - - error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); - if (error) - return error; - - if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | - GFS2_LFC_FREEZE_GO_SYNC); - if (gfs2_withdrawn(sdp)) { - error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); - if (error) - return error; - return -EIO; - } - } - return 0; -} - static int gfs2_do_thaw(struct gfs2_sbd *sdp) { struct super_block *sb = sdp->sd_vfs; @@ -724,7 +702,7 @@ void gfs2_freeze_func(struct work_struct *work) if (test_bit(SDF_FROZEN, &sdp->sd_flags)) goto freeze_failed; - error = gfs2_freeze_locally(sdp); + error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); if (error) goto freeze_failed; @@ -759,12 +737,13 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who) if (!mutex_trylock(&sdp->sd_freeze_mutex)) return -EBUSY; - error = -EBUSY; - if (test_bit(SDF_FROZEN, &sdp->sd_flags)) - goto out; + if (test_bit(SDF_FROZEN, &sdp->sd_flags)) { + mutex_unlock(&sdp->sd_freeze_mutex); + return -EBUSY; + } for (;;) { - error = gfs2_freeze_locally(sdp); + error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); if (error) { fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n", error); @@ -772,8 +751,11 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who) } error = gfs2_lock_fs_check_clean(sdp); - if (!error) - break; /* success */ + if (!error) { + 
set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags); + set_bit(SDF_FROZEN, &sdp->sd_flags); + break; + } error = gfs2_do_thaw(sdp); if (error) @@ -793,14 +775,23 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who) } out: - if (!error) { - set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags); - set_bit(SDF_FROZEN, &sdp->sd_flags); - } mutex_unlock(&sdp->sd_freeze_mutex); return error; } +static int gfs2_freeze_fs(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | + GFS2_LFC_FREEZE_GO_SYNC); + if (gfs2_withdrawing_or_withdrawn(sdp)) + return -EIO; + } + return 0; +} + /** * gfs2_thaw_super - reallow writes to the filesystem * @sb: the VFS structure for the filesystem @@ -814,10 +805,12 @@ static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who) if (!mutex_trylock(&sdp->sd_freeze_mutex)) return -EBUSY; - error = -EINVAL; - if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags)) - goto out; + if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags)) { + mutex_unlock(&sdp->sd_freeze_mutex); + return -EINVAL; + } + atomic_inc(&sb->s_active); gfs2_freeze_unlock(&sdp->sd_freeze_gh); error = gfs2_do_thaw(sdp); @@ -826,8 +819,8 @@ static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who) clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags); clear_bit(SDF_FROZEN, &sdp->sd_flags); } -out: mutex_unlock(&sdp->sd_freeze_mutex); + deactivate_super(sb); return error; } @@ -1065,16 +1058,6 @@ static int gfs2_drop_inode(struct inode *inode) return generic_drop_inode(inode); } -static int is_ancestor(const struct dentry *d1, const struct dentry *d2) -{ - do { - if (d1 == d2) - return 1; - d1 = d1->d_parent; - } while (!IS_ROOT(d1)); - return 0; -} - /** * gfs2_show_options - Show mount options for /proc/mounts * @s: seq_file structure @@ -1096,7 +1079,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) statfs_slow = sdp->sd_tune.gt_statfs_slow; spin_unlock(&sdp->sd_tune.gt_spin); - if (is_ancestor(root, sdp->sd_master_dir)) + if (is_subdir(root, sdp->sd_master_dir)) seq_puts(s, ",meta"); if (args->ar_lockproto[0]) seq_show_option(s, "lockproto", args->ar_lockproto); @@ -1607,6 +1590,7 @@ const struct super_operations gfs2_super_ops = { .put_super = gfs2_put_super, .sync_fs = gfs2_sync_fs, .freeze_super = gfs2_freeze_super, + .freeze_fs = gfs2_freeze_fs, .thaw_super = gfs2_thaw_super, .statfs = gfs2_statfs, .drop_inode = gfs2_drop_inode, diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 60a0206890c5..250f340cb44d 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -193,7 +193,7 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) { - unsigned int b = gfs2_withdrawn(sdp); + unsigned int b = gfs2_withdrawing_or_withdrawn(sdp); return snprintf(buf, PAGE_SIZE, "%u\n", b); } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 7e835be7032d..192213c7359a 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -268,7 +268,7 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) (unsigned long long)bd->bd_bh->b_blocknr); BUG(); } - if (unlikely(gfs2_withdrawn(sdp))) { + if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_info(sdp, "GFS2:adding buf while withdrawn! 
0x%llx\n", (unsigned long long)bd->bd_bh->b_blocknr); goto out_unlock; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index da29fafb6272..f52141ce9485 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -372,7 +372,7 @@ void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, const char *function, char *file, unsigned int line, bool delayed) { - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) return; fs_err(sdp, @@ -548,7 +548,7 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *function, char *file, unsigned int line, bool withdraw) { - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawing_or_withdrawn(sdp)) return; fs_err(sdp, "fatal: I/O error\n" diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 11c9d59b6889..ba071998461f 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -198,13 +198,14 @@ static inline void gfs2_withdraw_delayed(struct gfs2_sbd *sdp) } /** - * gfs2_withdrawn - test whether the file system is withdrawing or withdrawn + * gfs2_withdrawing_or_withdrawn - test whether the file system is withdrawing + * or withdrawn * @sdp: the superblock */ -static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp) +static inline bool gfs2_withdrawing_or_withdrawn(struct gfs2_sbd *sdp) { - return test_bit(SDF_WITHDRAWN, &sdp->sd_flags) || - test_bit(SDF_WITHDRAWING, &sdp->sd_flags); + return unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags) || + test_bit(SDF_WITHDRAWING, &sdp->sd_flags)); } /** @@ -213,13 +214,13 @@ static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp) */ static inline bool gfs2_withdrawing(struct gfs2_sbd *sdp) { - return test_bit(SDF_WITHDRAWING, &sdp->sd_flags) && - !test_bit(SDF_WITHDRAWN, &sdp->sd_flags); + return unlikely(test_bit(SDF_WITHDRAWING, &sdp->sd_flags) && + !test_bit(SDF_WITHDRAWN, &sdp->sd_flags)); } static inline bool gfs2_withdraw_in_prog(struct gfs2_sbd *sdp) { - return test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags); + return unlikely(test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags)); } #define gfs2_tune_get(sdp, field) \ diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index a7bc4690a780..8c34798a0715 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -29,11 +29,6 @@ static const struct inode_operations hfs_file_inode_operations; #define HFS_VALID_MODE_BITS (S_IFREG | S_IFDIR | S_IRWXUGO) -static int hfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, hfs_get_block, wbc); -} - static int hfs_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, hfs_get_block); @@ -162,9 +157,10 @@ const struct address_space_operations hfs_btree_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfs_read_folio, - .writepage = hfs_writepage, + .writepages = hfs_writepages, .write_begin = hfs_write_begin, .write_end = generic_write_end, + .migrate_folio = buffer_migrate_folio, .bmap = hfs_bmap, .release_folio = hfs_release_folio, }; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 702a0663b1d8..3d326926c195 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -28,11 +28,6 @@ static int hfsplus_read_folio(struct file *file, struct folio *folio) return block_read_full_folio(folio, hfsplus_get_block); } -static int hfsplus_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, hfsplus_get_block, wbc); -} - static void hfsplus_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; @@ -159,9 
+154,10 @@ const struct address_space_operations hfsplus_btree_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfsplus_read_folio, - .writepage = hfsplus_writepage, + .writepages = hfsplus_writepages, .write_begin = hfsplus_write_begin, .write_end = generic_write_end, + .migrate_folio = buffer_migrate_folio, .bmap = hfsplus_bmap, .release_folio = hfsplus_release_folio, }; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 0b791adf02e5..b0cb70400996 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -30,8 +30,7 @@ struct hfsplus_wd { * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O * @data: output pointer for location of requested data - * @op: direction of I/O - * @op_flags: request op flags + * @opf: request op flags * * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads @@ -43,6 +42,8 @@ struct hfsplus_wd { * that starts at the rounded-down address. As long as the data was * read using hfsplus_submit_bio() and the same buffer is used things * will work correctly. + * + * Returns: %0 on success else -errno code */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, void **data, blk_opf_t opf) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index ea87f24c6c3f..a73d27c4dd58 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -637,12 +637,8 @@ static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, inode = hostfs_iget(ino->i_sb, name); __putname(name); - if (IS_ERR(inode)) { - if (PTR_ERR(inode) == -ENOENT) - inode = NULL; - else - return ERR_CAST(inode); - } + if (inode == ERR_PTR(-ENOENT)) + inode = NULL; return d_splice_alias(inode, dentry); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f757d4f7ad98..ea5b8e57d904 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -686,7 +686,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) * at inode creation time. If this is a device special inode, * i_mapping may not point to the original address space. 
*/ - resv_map = (struct resv_map *)(&inode->i_data)->private_data; + resv_map = (struct resv_map *)(&inode->i_data)->i_private_data; /* Only regular and link inodes have associated reserve maps */ if (resv_map) resv_map_release(&resv_map->refs); @@ -1000,7 +1000,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; simple_inode_init_ts(inode); - inode->i_mapping->private_data = resv_map; + inode->i_mapping->i_private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: @@ -1129,8 +1129,8 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, #define hugetlbfs_migrate_folio NULL #endif -static int hugetlbfs_error_remove_page(struct address_space *mapping, - struct page *page) +static int hugetlbfs_error_remove_folio(struct address_space *mapping, + struct folio *folio) { return 0; } @@ -1277,7 +1277,7 @@ static const struct address_space_operations hugetlbfs_aops = { .write_end = hugetlbfs_write_end, .dirty_folio = noop_dirty_folio, .migrate_folio = hugetlbfs_migrate_folio, - .error_remove_page = hugetlbfs_error_remove_page, + .error_remove_folio = hugetlbfs_error_remove_folio, }; diff --git a/fs/inode.c b/fs/inode.c index f238d987dec9..91048c4c9c9e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -129,7 +129,6 @@ static struct ctl_table inodes_sysctls[] = { .mode = 0444, .proc_handler = proc_nr_inodes, }, - { } }; static int __init init_fs_inode_sysctls(void) @@ -209,7 +208,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&mapping->nr_thps, 0); #endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; init_rwsem(&mapping->invalidate_lock); lockdep_set_class_and_name(&mapping->invalidate_lock, @@ -398,8 +397,8 @@ static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); + INIT_LIST_HEAD(&mapping->i_private_list); + spin_lock_init(&mapping->i_private_lock); mapping->i_mmap = RB_ROOT_CACHED; } @@ -464,7 +463,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate) if (!mapping_shrinkable(&inode->i_data)) return; - if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) + if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); else if (rotate) inode->i_state |= I_REFERENCED; @@ -482,7 +481,7 @@ void inode_add_lru(struct inode *inode) static void inode_lru_list_del(struct inode *inode) { - if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) + if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } @@ -620,7 +619,7 @@ void clear_inode(struct inode *inode) * nor even WARN_ON(!mapping_empty). */ xa_unlock_irq(&inode->i_data.i_pages); - BUG_ON(!list_empty(&inode->i_data.private_list)); + BUG_ON(!list_empty(&inode->i_data.i_private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); @@ -1090,48 +1089,6 @@ void discard_new_inode(struct inode *inode) EXPORT_SYMBOL(discard_new_inode); /** - * lock_two_inodes - lock two inodes (may be regular files but also dirs) - * - * Lock any non-NULL argument. 
The caller must make sure that if he is passing - * in two directories, one is not ancestor of the other. Zero, one or two - * objects may be locked by this function. - * - * @inode1: first inode to lock - * @inode2: second inode to lock - * @subclass1: inode lock subclass for the first lock obtained - * @subclass2: inode lock subclass for the second lock obtained - */ -void lock_two_inodes(struct inode *inode1, struct inode *inode2, - unsigned subclass1, unsigned subclass2) -{ - if (!inode1 || !inode2) { - /* - * Make sure @subclass1 will be used for the acquired lock. - * This is not strictly necessary (no current caller cares) but - * let's keep things consistent. - */ - if (!inode1) - swap(inode1, inode2); - goto lock; - } - - /* - * If one object is directory and the other is not, we must make sure - * to lock directory first as the other object may be its child. - */ - if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) { - if (inode1 > inode2) - swap(inode1, inode2); - } else if (!S_ISDIR(inode1->i_mode)) - swap(inode1, inode2); -lock: - if (inode1) - inode_lock_nested(inode1, subclass1); - if (inode2 && inode2 != inode1) - inode_lock_nested(inode2, subclass2); -} - -/** * lock_two_nondirectories - take two i_mutexes on non-directory objects * * Lock any non-NULL argument. Passed objects must not be directories. @@ -1146,7 +1103,12 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); if (inode2) WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); - lock_two_inodes(inode1, inode2, I_MUTEX_NORMAL, I_MUTEX_NONDIR2); + if (inode1 > inode2) + swap(inode1, inode2); + if (inode1) + inode_lock(inode1); + if (inode2 && inode2 != inode1) + inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); @@ -1836,37 +1798,37 @@ EXPORT_SYMBOL(bmap); * earlier than or equal to either the ctime or mtime, * or if at least a day has passed since the last atime update. */ -static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, +static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { struct timespec64 atime, mtime, ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) - return 1; + return true; /* * Is mtime younger than or equal to atime? If yes, update atime: */ atime = inode_get_atime(inode); mtime = inode_get_mtime(inode); if (timespec64_compare(&mtime, &atime) >= 0) - return 1; + return true; /* * Is ctime younger than or equal to atime? If yes, update atime: */ ctime = inode_get_ctime(inode); if (timespec64_compare(&ctime, &atime) >= 0) - return 1; + return true; /* * Is the previous atime value older than a day? If yes, * update atime: */ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60) - return 1; + return true; /* * Good, we can skip the atime update: */ - return 0; + return false; } /** @@ -2404,7 +2366,7 @@ EXPORT_SYMBOL(inode_init_owner); * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. 
*/ bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode) diff --git a/fs/internal.h b/fs/internal.h index 58e43341aebf..b67406435fc0 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -83,6 +83,8 @@ int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page); int path_umount(struct path *path, int flags); +int show_path(struct seq_file *m, struct dentry *root); + /* * fs_struct.c */ @@ -94,7 +96,6 @@ extern void chroot_fs_refs(const struct path *, const struct path *); struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); -void release_empty_file(struct file *f); static inline void file_put_write_access(struct file *file) { @@ -180,7 +181,7 @@ extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); -extern struct file *__close_fd_get_file(unsigned int fd); +struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); @@ -196,8 +197,6 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); -void lock_two_inodes(struct inode *inode1, struct inode *inode2, - unsigned subclass1, unsigned subclass2); /* * fs-writeback.c @@ -215,6 +214,11 @@ extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *) extern char *simple_dname(struct dentry *, char *, int); extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); +extern void shrink_dcache_for_umount(struct super_block *); +extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); +extern struct dentry *__d_lookup_rcu(const struct dentry *parent, + const struct qstr *name, unsigned *seq); +extern void d_genocide(struct dentry *); /* * pipe.c @@ -243,10 +247,10 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags, /* * fs/splice.c: */ -long splice_file_to_pipe(struct file *in, - struct pipe_inode_info *opipe, - loff_t *offset, - size_t len, unsigned int flags); +ssize_t splice_file_to_pipe(struct file *in, + struct pipe_inode_info *opipe, + loff_t *offset, + size_t len, unsigned int flags); /* * fs/xattr.c: diff --git a/fs/ioctl.c b/fs/ioctl.c index f5fd99d6b0d4..76cf22ac97d7 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -920,8 +920,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, if (!f.file) return -EBADF; - /* RED-PEN how should LSM module know it's handling 32bit? 
*/ - error = security_file_ioctl(f.file, cmd, arg); + error = security_file_ioctl_compat(f.file, cmd, arg); if (error) goto out; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f72df2babe56..093c4515b22a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -305,28 +305,18 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, { const struct iomap *iomap = iomap_iter_srcmap(iter); size_t size = i_size_read(iter->inode) - iomap->offset; - size_t poff = offset_in_page(iomap->offset); size_t offset = offset_in_folio(folio, iomap->offset); - void *addr; if (folio_test_uptodate(folio)) return 0; - if (WARN_ON_ONCE(size > PAGE_SIZE - poff)) - return -EIO; - if (WARN_ON_ONCE(size > PAGE_SIZE - - offset_in_page(iomap->inline_data))) - return -EIO; if (WARN_ON_ONCE(size > iomap->length)) return -EIO; if (offset > 0) ifs_alloc(iter->inode, folio, iter->flags); - addr = kmap_local_folio(folio, offset); - memcpy(addr, iomap->inline_data, size); - memset(addr + size, 0, PAGE_SIZE - poff - size); - kunmap_local(addr); - iomap_set_range_uptodate(folio, offset, PAGE_SIZE - poff); + folio_fill_tail(folio, offset, iomap->inline_data, size); + iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset); return 0; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 118699fff2f9..1c97e64c4784 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -556,7 +556,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; - struct buffer_head *bh = jh2bh(jh); JBUFFER_TRACE(jh, "entry"); @@ -569,16 +568,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) JBUFFER_TRACE(jh, "removing from transaction"); - /* - * If we have failed to write the buffer out to disk, the filesystem - * may become inconsistent. We cannot abort the journal here since - * we hold j_list_lock and we have to be careful about races with - * jbd2_journal_destroy(). So mark the writeback IO error in the - * journal here and we abort the journal later from a better context. - */ - if (buffer_write_io_error(bh)) - set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags); - __buffer_unlink(jh); jh->b_cp_transaction = NULL; percpu_counter_dec(&journal->j_checkpoint_jh_count); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 206cb53ef2b0..b6c114c11b97 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1534,6 +1534,7 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; journal->j_total_len = len; + jbd2_init_fs_dev_write_error(journal); err = journal_load_superblock(journal); if (err) @@ -1861,7 +1862,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, if (is_journal_aborted(journal)) return -EIO; - if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) { + if (jbd2_check_fs_dev_write_error(journal)) { jbd2_journal_abort(journal, -EIO); return -EIO; } @@ -2159,12 +2160,12 @@ int jbd2_journal_destroy(journal_t *journal) /* * OK, all checkpoint transactions have been checked, now check the - * write out io error flag and abort the journal if some buffer failed - * to write back to the original location, otherwise the filesystem - * may become inconsistent. + * writeback errseq of fs dev and abort the journal if some buffer + * failed to write back to the original location, otherwise the + * filesystem may become inconsistent. 
*/ if (!is_journal_aborted(journal) && - test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) + jbd2_check_fs_dev_write_error(journal)) jbd2_journal_abort(journal, -EIO); if (journal->j_sb_buffer) { diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 01f744cb97a4..1f7664984d6e 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -289,8 +289,6 @@ int jbd2_journal_recover(journal_t *journal) journal_superblock_t * sb; struct recovery_info info; - errseq_t wb_err; - struct address_space *mapping; memset(&info, 0, sizeof(info)); sb = journal->j_superblock; @@ -308,9 +306,6 @@ int jbd2_journal_recover(journal_t *journal) return 0; } - wb_err = 0; - mapping = journal->j_fs_dev->bd_inode->i_mapping; - errseq_check_and_advance(&mapping->wb_err, &wb_err); err = do_one_pass(journal, &info, PASS_SCAN); if (!err) err = do_one_pass(journal, &info, PASS_REVOKE); @@ -334,7 +329,7 @@ int jbd2_journal_recover(journal_t *journal) err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; - err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err); + err2 = jbd2_check_fs_dev_write_error(journal); if (!err) err = err2; /* Make sure all replayed data is on permanent storage */ diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5f08b5fd105a..cb0b8d6fc0c6 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1231,11 +1231,25 @@ out: int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { struct journal_head *jh; + journal_t *journal; int rc; if (is_handle_aborted(handle)) return -EROFS; + journal = handle->h_transaction->t_journal; + if (jbd2_check_fs_dev_write_error(journal)) { + /* + * If the fs dev has writeback errors, it may have failed + * to async write out metadata buffers in the background. + * In this case, we could read old data from disk and write + * it out again, which may lead to on-disk filesystem + * inconsistency. Aborting journal can avoid it happen. 
+ */ + jbd2_journal_abort(journal, -EIO); + return -EIO; + } + if (jbd2_write_access_granted(handle, bh, false)) return 0; diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c index 9d26b1b9fc01..0925caab23c4 100644 --- a/fs/jffs2/debug.c +++ b/fs/jffs2/debug.c @@ -157,7 +157,7 @@ __jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c, kfree(buf); } -void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c) +static void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c) { struct jffs2_eraseblock *jeb; uint32_t free = 0, dirty = 0, used = 0, wasted = 0, diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 11c77757ead9..8eec84c651bf 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -63,10 +63,10 @@ */ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, int nblocks); -static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval); -static int dbBackSplit(dmtree_t * tp, int leafno); -static int dbJoin(dmtree_t * tp, int leafno, int newval); -static void dbAdjTree(dmtree_t * tp, int leafno, int newval); +static void dbSplit(dmtree_t *tp, int leafno, int splitsz, int newval, bool is_ctl); +static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl); +static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl); +static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl); static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level); static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results); @@ -2103,7 +2103,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, * system. */ if (dp->tree.stree[word] == NOFREE) - dbBackSplit((dmtree_t *) & dp->tree, word); + dbBackSplit((dmtree_t *)&dp->tree, word, false); dbAllocBits(bmp, dp, blkno, nblocks); } @@ -2189,7 +2189,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * the binary system of the leaves if need be. */ dbSplit(tp, word, BUDMIN, - dbMaxBud((u8 *) & dp->wmap[word])); + dbMaxBud((u8 *)&dp->wmap[word]), false); word += 1; } else { @@ -2229,7 +2229,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * system of the leaves to reflect the current * allocation (size). */ - dbSplit(tp, word, size, NOFREE); + dbSplit(tp, word, size, NOFREE, false); /* get the number of dmap words handled */ nw = BUDSIZE(size, BUDMIN); @@ -2336,7 +2336,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, /* update the leaf for this dmap word. */ rc = dbJoin(tp, word, - dbMaxBud((u8 *) & dp->wmap[word])); + dbMaxBud((u8 *)&dp->wmap[word]), false); if (rc) return rc; @@ -2369,7 +2369,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, /* update the leaf. */ - rc = dbJoin(tp, word, size); + rc = dbJoin(tp, word, size, false); if (rc) return rc; @@ -2521,16 +2521,16 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) * that it is at the front of a binary buddy system. 
*/ if (oldval == NOFREE) { - rc = dbBackSplit((dmtree_t *) dcp, leafno); + rc = dbBackSplit((dmtree_t *)dcp, leafno, true); if (rc) { release_metapage(mp); return rc; } oldval = dcp->stree[ti]; } - dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval); + dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval, true); } else { - rc = dbJoin((dmtree_t *) dcp, leafno, newval); + rc = dbJoin((dmtree_t *) dcp, leafno, newval, true); if (rc) { release_metapage(mp); return rc; @@ -2561,7 +2561,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) */ if (alloc) { dbJoin((dmtree_t *) dcp, leafno, - oldval); + oldval, true); } else { /* the dbJoin() above might have * caused a larger binary buddy system @@ -2571,9 +2571,9 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) */ if (dcp->stree[ti] == NOFREE) dbBackSplit((dmtree_t *) - dcp, leafno); + dcp, leafno, true); dbSplit((dmtree_t *) dcp, leafno, - dcp->budmin, oldval); + dcp->budmin, oldval, true); } /* release the buffer and return the error. @@ -2621,7 +2621,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ -static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) +static void dbSplit(dmtree_t *tp, int leafno, int splitsz, int newval, bool is_ctl) { int budsz; int cursz; @@ -2643,7 +2643,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) while (cursz >= splitsz) { /* update the buddy's leaf with its new value. */ - dbAdjTree(tp, leafno ^ budsz, cursz); + dbAdjTree(tp, leafno ^ budsz, cursz, is_ctl); /* on to the next size and buddy. */ @@ -2655,7 +2655,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) /* adjust the dmap tree to reflect the specified leaf's new * value. */ - dbAdjTree(tp, leafno, newval); + dbAdjTree(tp, leafno, newval, is_ctl); } @@ -2686,7 +2686,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ -static int dbBackSplit(dmtree_t * tp, int leafno) +static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl) { int budsz, bud, w, bsz, size; int cursz; @@ -2737,7 +2737,7 @@ static int dbBackSplit(dmtree_t * tp, int leafno) * system in two. */ cursz = leaf[bud] - 1; - dbSplit(tp, bud, cursz, cursz); + dbSplit(tp, bud, cursz, cursz, is_ctl); break; } } @@ -2763,9 +2763,11 @@ static int dbBackSplit(dmtree_t * tp, int leafno) * leafno - the number of the leaf to be updated. * newval - the new value for the leaf. * - * RETURN VALUES: none + * RETURN VALUES: + * 0 - success + * -EIO - i/o error */ -static int dbJoin(dmtree_t * tp, int leafno, int newval) +static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) { int budsz, buddy; s8 *leaf; @@ -2790,6 +2792,10 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval) * get the buddy size (number of words covered) of * the new value. */ + + if ((newval - tp->dmt_budmin) > BUDMIN) + return -EIO; + budsz = BUDSIZE(newval, tp->dmt_budmin); /* try to join. @@ -2820,12 +2826,12 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval) if (leafno < buddy) { /* leafno is the left buddy. */ - dbAdjTree(tp, buddy, NOFREE); + dbAdjTree(tp, buddy, NOFREE, is_ctl); } else { /* buddy is the left buddy and becomes * leafno. 
*/ - dbAdjTree(tp, leafno, NOFREE); + dbAdjTree(tp, leafno, NOFREE, is_ctl); leafno = buddy; } @@ -2838,7 +2844,7 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval) /* update the leaf value. */ - dbAdjTree(tp, leafno, newval); + dbAdjTree(tp, leafno, newval, is_ctl); return 0; } @@ -2859,15 +2865,20 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval) * * RETURN VALUES: none */ -static void dbAdjTree(dmtree_t * tp, int leafno, int newval) +static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl) { int lp, pp, k; - int max; + int max, size; + + size = is_ctl ? CTLTREESIZE : TREESIZE; /* pick up the index of the leaf for this leafno. */ lp = leafno + le32_to_cpu(tp->dmt_leafidx); + if (WARN_ON_ONCE(lp >= size || lp < 0)) + return; + /* is the current value the same as the old value ? if so, * there is nothing to do. */ diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 92b7c533407c..031d8f570f58 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -633,6 +633,11 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { index = base + (lim >> 1); + if (stbl[index] < 0) { + rc = -EIO; + goto out; + } + if (p->header.flag & BT_LEAF) { /* uppercase leaf name to compare */ cmp = @@ -1970,7 +1975,7 @@ static int dtSplitRoot(tid_t tid, do { f = &rp->slot[fsi]; fsi = f->next; - } while (fsi != -1); + } while (fsi >= 0); f->next = n; } diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index a037ee59e398..2ec35889ad24 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -2179,6 +2179,9 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) /* get the ag and iag numbers for this iag. */ agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); + if (agno >= MAXAG || agno < 0) + return -EIO; + iagno = le32_to_cpu(iagp->iagnum); /* check if this is the last free extent within the diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 415eb65a36ff..9b5c6a20b30c 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -172,15 +172,15 @@ int jfs_mount(struct super_block *sb) } jfs_info("jfs_mount: ipimap:0x%p", ipimap); - /* map further access of per fileset inodes by the fileset inode */ - sbi->ipimap = ipimap; - /* initialize fileset inode allocation map */ if ((rc = diMount(ipimap))) { jfs_err("jfs_mount: diMount failed w/rc = %d", rc); goto err_ipimap; } + /* map further access of per fileset inodes by the fileset inode */ + sbi->ipimap = ipimap; + return rc; /* diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index dccc8b3f1045..be17e3c43582 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2702,6 +2702,7 @@ int jfs_lazycommit(void *arg) unsigned long flags; struct jfs_sb_info *sbi; + set_freezable(); do { LAZY_LOCK(flags); jfs_commit_thread_waking = 0; /* OK to wake another thread */ @@ -2884,6 +2885,7 @@ int jfs_sync(void *arg) struct jfs_inode_info *jfs_ip; tid_t tid; + set_freezable(); do { /* * write each inode on the anonymous inode list diff --git a/fs/libfs.c b/fs/libfs.c index c2aa6fd4795c..eec6031b0155 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -104,15 +104,16 @@ EXPORT_SYMBOL(dcache_dir_close); * If no such element exists, NULL is returned. 
*/ static struct dentry *scan_positives(struct dentry *cursor, - struct list_head *p, + struct hlist_node **p, loff_t count, struct dentry *last) { struct dentry *dentry = cursor->d_parent, *found = NULL; spin_lock(&dentry->d_lock); - while ((p = p->next) != &dentry->d_subdirs) { - struct dentry *d = list_entry(p, struct dentry, d_child); + while (*p) { + struct dentry *d = hlist_entry(*p, struct dentry, d_sib); + p = &d->d_sib.next; // we must at least skip cursors, to avoid livelocks if (d->d_flags & DCACHE_DENTRY_CURSOR) continue; @@ -126,8 +127,10 @@ static struct dentry *scan_positives(struct dentry *cursor, count = 1; } if (need_resched()) { - list_move(&cursor->d_child, p); - p = &cursor->d_child; + if (!hlist_unhashed(&cursor->d_sib)) + __hlist_del(&cursor->d_sib); + hlist_add_behind(&cursor->d_sib, &d->d_sib); + p = &cursor->d_sib.next; spin_unlock(&dentry->d_lock); cond_resched(); spin_lock(&dentry->d_lock); @@ -159,13 +162,12 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) inode_lock_shared(dentry->d_inode); if (offset > 2) - to = scan_positives(cursor, &dentry->d_subdirs, + to = scan_positives(cursor, &dentry->d_children.first, offset - 2, NULL); spin_lock(&dentry->d_lock); + hlist_del_init(&cursor->d_sib); if (to) - list_move(&cursor->d_child, &to->d_child); - else - list_del_init(&cursor->d_child); + hlist_add_behind(&cursor->d_sib, &to->d_sib); spin_unlock(&dentry->d_lock); dput(to); @@ -187,19 +189,16 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; - struct list_head *anchor = &dentry->d_subdirs; struct dentry *next = NULL; - struct list_head *p; + struct hlist_node **p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) - p = anchor; - else if (!list_empty(&cursor->d_child)) - p = &cursor->d_child; + p = &dentry->d_children.first; else - return 0; + p = &cursor->d_sib.next; while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, @@ -207,13 +206,12 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) fs_umode_to_dtype(d_inode(next)->i_mode))) break; ctx->pos++; - p = &next->d_child; + p = &next->d_sib.next; } spin_lock(&dentry->d_lock); + hlist_del_init(&cursor->d_sib); if (next) - list_move_tail(&cursor->d_child, &next->d_child); - else - list_del_init(&cursor->d_child); + hlist_add_before(&cursor->d_sib, &next->d_sib); spin_unlock(&dentry->d_lock); dput(next); @@ -500,12 +498,11 @@ const struct file_operations simple_offset_dir_operations = { static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { - struct dentry *child = NULL; - struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs; + struct dentry *child = NULL, *d; spin_lock(&parent->d_lock); - while ((p = p->next) != &parent->d_subdirs) { - struct dentry *d = container_of(p, struct dentry, d_child); + d = prev ? 
d_next_sibling(prev) : d_first_child(parent); + hlist_for_each_entry_from(d, d_sib) { if (simple_positive(d)) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) @@ -666,7 +663,7 @@ int simple_empty(struct dentry *dentry) int ret = 0; spin_lock(&dentry->d_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_child) { + hlist_for_each_entry(child, &dentry->d_children, d_sib) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { spin_unlock(&child->d_lock); @@ -920,7 +917,6 @@ int simple_fill_super(struct super_block *s, unsigned long magic, const struct tree_descr *files) { struct inode *inode; - struct dentry *root; struct dentry *dentry; int i; @@ -943,8 +939,8 @@ int simple_fill_super(struct super_block *s, unsigned long magic, inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); - root = d_make_root(inode); - if (!root) + s->s_root = d_make_root(inode); + if (!s->s_root) return -ENOMEM; for (i = 0; !files->name || files->name[0]; i++, files++) { if (!files->name) @@ -956,13 +952,13 @@ int simple_fill_super(struct super_block *s, unsigned long magic, "with an index of 1!\n", __func__, s->s_type->name); - dentry = d_alloc_name(root, files->name); + dentry = d_alloc_name(s->s_root, files->name); if (!dentry) - goto out; + return -ENOMEM; inode = new_inode(s); if (!inode) { dput(dentry); - goto out; + return -ENOMEM; } inode->i_mode = S_IFREG | files->mode; simple_inode_init_ts(inode); @@ -970,13 +966,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, inode->i_ino = i; d_add(dentry, inode); } - s->s_root = root; return 0; -out: - d_genocide(root); - shrink_dcache_parent(root); - dput(root); - return -ENOMEM; } EXPORT_SYMBOL(simple_fill_super); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 81be07c1d3d1..ce5862482097 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -345,10 +345,10 @@ static int lockd_get(void) serv->sv_maxconn = nlm_max_connections; error = svc_set_num_threads(serv, NULL, 1); - /* The thread now holds the only reference */ - svc_put(serv); - if (error < 0) + if (error < 0) { + svc_destroy(&serv); return error; + } nlmsvc_serv = serv; register_inetaddr_notifier(&lockd_inetaddr_notifier); @@ -372,11 +372,9 @@ static void lockd_put(void) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif - svc_get(nlmsvc_serv); svc_set_num_threads(nlmsvc_serv, NULL, 0); - svc_put(nlmsvc_serv); timer_delete_sync(&nlmsvc_retry); - nlmsvc_serv = NULL; + svc_destroy(&nlmsvc_serv); dprintk("lockd_down: service destroyed\n"); } @@ -475,7 +473,6 @@ static struct ctl_table nlm_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; #endif /* CONFIG_SYSCTL */ diff --git a/fs/locks.c b/fs/locks.c index 46d88b9e222c..cc7c117ee192 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -111,7 +111,6 @@ static struct ctl_table locks_sysctls[] = { .proc_handler = proc_dointvec, }, #endif /* CONFIG_MMU */ - {} }; static int __init init_fs_locks_sysctls(void) diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 62c313fc9a49..a224cf222570 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -26,12 +26,6 @@ const struct file_operations minix_dir_operations = { .fsync = generic_file_fsync, }; -static inline void dir_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. 
@@ -70,13 +64,14 @@ static int minix_handle_dirsync(struct inode *dir) return err; } -static struct page * dir_get_page(struct inode *dir, unsigned long n) +static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p) { struct address_space *mapping = dir->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) - kmap(page); - return page; + if (IS_ERR(page)) + return ERR_CAST(page); + *p = page; + return kmap_local_page(page); } static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) @@ -104,11 +99,11 @@ static int minix_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { char *p, *kaddr, *limit; - struct page *page = dir_get_page(inode, n); + struct page *page; - if (IS_ERR(page)) + kaddr = dir_get_page(inode, n, &page); + if (IS_ERR(kaddr)) continue; - kaddr = (char *)page_address(page); p = kaddr+offset; limit = kaddr + minix_last_byte(inode, n) - chunk_size; for ( ; p <= limit; p = minix_next_entry(p, sbi)) { @@ -127,13 +122,13 @@ static int minix_readdir(struct file *file, struct dir_context *ctx) unsigned l = strnlen(name, sbi->s_namelen); if (!dir_emit(ctx, name, l, inumber, DT_UNKNOWN)) { - dir_put_page(page); + unmap_and_put_page(page, p); return 0; } } ctx->pos += chunk_size; } - dir_put_page(page); + unmap_and_put_page(page, kaddr); } return 0; } @@ -173,11 +168,10 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) for (n = 0; n < npages; n++) { char *kaddr, *limit; - page = dir_get_page(dir, n); - if (IS_ERR(page)) + kaddr = dir_get_page(dir, n, &page); + if (IS_ERR(kaddr)) continue; - kaddr = (char*)page_address(page); limit = kaddr + minix_last_byte(dir, n) - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { if (sbi->s_version == MINIX_V3) { @@ -194,7 +188,7 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) if (namecompare(namelen, sbi->s_namelen, name, namx)) goto found; } - dir_put_page(page); + unmap_and_put_page(page, kaddr); } return NULL; @@ -229,12 +223,10 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) for (n = 0; n <= npages; n++) { char *limit, *dir_end; - page = dir_get_page(dir, n); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto out; + kaddr = dir_get_page(dir, n, &page); + if (IS_ERR(kaddr)) + return PTR_ERR(kaddr); lock_page(page); - kaddr = (char*)page_address(page); dir_end = kaddr + minix_last_byte(dir, n); limit = kaddr + PAGE_SIZE - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { @@ -262,13 +254,13 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) goto out_unlock; } unlock_page(page); - dir_put_page(page); + unmap_and_put_page(page, kaddr); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + p - (char *)page_address(page); + pos = page_offset(page) + offset_in_page(p); err = minix_prepare_chunk(page, pos, sbi->s_dirsize); if (err) goto out_unlock; @@ -285,8 +277,7 @@ got_it: mark_inode_dirty(dir); err = minix_handle_dirsync(dir); out_put: - dir_put_page(page); -out: + unmap_and_put_page(page, kaddr); return err; out_unlock: unlock_page(page); @@ -296,8 +287,7 @@ out_unlock: int minix_delete_entry(struct minix_dir_entry *de, struct page *page) { struct inode *inode = page->mapping->host; - char *kaddr = page_address(page); - loff_t pos = page_offset(page) + (char*)de - kaddr; + loff_t pos = page_offset(page) + offset_in_page(de); struct minix_sb_info *sbi = minix_sb(inode->i_sb); unsigned len 
= sbi->s_dirsize; int err; @@ -333,7 +323,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir) goto fail; } - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); memset(kaddr, 0, PAGE_SIZE); if (sbi->s_version == MINIX_V3) { @@ -353,7 +343,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir) de->inode = dir->i_ino; strcpy(de->name, ".."); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); err = minix_handle_dirsync(inode); @@ -370,17 +360,16 @@ int minix_empty_dir(struct inode * inode) struct page *page = NULL; unsigned long i, npages = dir_pages(inode); struct minix_sb_info *sbi = minix_sb(inode->i_sb); - char *name; + char *name, *kaddr; __u32 inumber; for (i = 0; i < npages; i++) { - char *p, *kaddr, *limit; + char *p, *limit; - page = dir_get_page(inode, i); - if (IS_ERR(page)) + kaddr = dir_get_page(inode, i, &page); + if (IS_ERR(kaddr)) continue; - kaddr = (char *)page_address(page); limit = kaddr + minix_last_byte(inode, i) - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { if (sbi->s_version == MINIX_V3) { @@ -406,12 +395,12 @@ int minix_empty_dir(struct inode * inode) goto not_empty; } } - dir_put_page(page); + unmap_and_put_page(page, kaddr); } return 1; not_empty: - dir_put_page(page); + unmap_and_put_page(page, kaddr); return 0; } @@ -421,8 +410,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page, { struct inode *dir = page->mapping->host; struct minix_sb_info *sbi = minix_sb(dir->i_sb); - loff_t pos = page_offset(page) + - (char *)de-(char*)page_address(page); + loff_t pos = page_offset(page) + offset_in_page(de); int err; lock_page(page); @@ -443,15 +431,12 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page, struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) { - struct page *page = dir_get_page(dir, 0); struct minix_sb_info *sbi = minix_sb(dir->i_sb); - struct minix_dir_entry *de = NULL; + struct minix_dir_entry *de = dir_get_page(dir, 0, p); - if (!IS_ERR(page)) { - de = minix_next_entry(page_address(page), sbi); - *p = page; - } - return de; + if (!IS_ERR(de)) + return minix_next_entry(de, sbi); + return NULL; } ino_t minix_inode_by_name(struct dentry *dentry) @@ -469,7 +454,7 @@ ino_t minix_inode_by_name(struct dentry *dentry) res = ((minix3_dirent *) de)->inode; else res = de->inode; - dir_put_page(page); + unmap_and_put_page(page, de); } return res; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index f8af6c3ae336..73f37f298087 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/highuid.h> +#include <linux/mpage.h> #include <linux/vfs.h> #include <linux/writeback.h> @@ -397,9 +398,10 @@ static int minix_get_block(struct inode *inode, sector_t block, return V2_minix_get_block(inode, block, bh_result, create); } -static int minix_writepage(struct page *page, struct writeback_control *wbc) +static int minix_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page, minix_get_block, wbc); + return mpage_writepages(mapping, wbc, minix_get_block); } static int minix_read_folio(struct file *file, struct folio *folio) @@ -444,9 +446,10 @@ static const struct address_space_operations minix_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = minix_read_folio, - .writepage = minix_writepage, + .writepages = minix_writepages, 
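/* With ->writepage removed, writeback for minix goes through ->writepages (mpage_writepages), and the migrate_folio hook added below keeps these buffer-head backed folios migratable. */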
.write_begin = minix_write_begin, .write_end = generic_write_end, + .migrate_folio = buffer_migrate_folio, .bmap = minix_bmap, .direct_IO = noop_direct_IO }; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 114084d5636a..d6031acc34f0 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -149,8 +149,7 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry) if (!de) return -ENOENT; err = minix_delete_entry(de, page); - kunmap(page); - put_page(page); + unmap_and_put_page(page, de); if (err) return err; @@ -242,13 +241,10 @@ static int minix_rename(struct mnt_idmap *idmap, inode_dec_link_count(old_dir); } out_dir: - if (dir_de) { - kunmap(dir_page); - put_page(dir_page); - } + if (dir_de) + unmap_and_put_page(dir_page, dir_de); out_old: - kunmap(old_page); - put_page(old_page); + unmap_and_put_page(old_page, old_de); out: return err; } diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 57d1dedf3f8f..64c5205e2b5e 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -9,8 +9,16 @@ #include "internal.h" +/* + * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t, + * never from raw values. These are just internal helpers. + */ +#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val } +#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val } + struct mnt_idmap { - struct user_namespace *owner; + struct uid_gid_map uid_map; + struct uid_gid_map gid_map; refcount_t count; }; @@ -20,25 +28,11 @@ struct mnt_idmap { * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. */ struct mnt_idmap nop_mnt_idmap = { - .owner = &init_user_ns, .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(nop_mnt_idmap); /** - * check_fsmapping - check whether an mount idmapping is allowed - * @idmap: idmap of the relevent mount - * @sb: super block of the filesystem - * - * Return: true if @idmap is allowed, false if not. - */ -bool check_fsmapping(const struct mnt_idmap *idmap, - const struct super_block *sb) -{ - return idmap->owner != sb->s_user_ns; -} - -/** * initial_idmapping - check whether this is the initial mapping * @ns: idmapping to check * @@ -53,26 +47,6 @@ static inline bool initial_idmapping(const struct user_namespace *ns) } /** - * no_idmapping - check whether we can skip remapping a kuid/gid - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * - * This function can be used to check whether a remapping between two - * idmappings is required. - * An idmapped mount is a mount that has an idmapping attached to it that - * is different from the filsystem's idmapping and the initial idmapping. - * If the initial mapping is used or the idmapping of the mount and the - * filesystem are identical no remapping is required. - * - * Return: true if remapping can be skipped, false if not. - */ -static inline bool no_idmapping(const struct user_namespace *mnt_userns, - const struct user_namespace *fs_userns) -{ - return initial_idmapping(mnt_userns) || mnt_userns == fs_userns; -} - -/** * make_vfsuid - map a filesystem kuid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping @@ -81,8 +55,8 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns, * Take a @kuid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kuid to be reported to userspace. * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kuid unchanged. 
+ * If initial_idmapping() determines that this is not an idmapped mount + * we can simply return @kuid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kuid won't change when calling * from_kuid() so we can simply retrieve the value via __kuid_val() @@ -94,13 +68,12 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns, */ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, - struct user_namespace *fs_userns, - kuid_t kuid) + struct user_namespace *fs_userns, + kuid_t kuid) { uid_t uid; - struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); if (initial_idmapping(fs_userns)) uid = __kuid_val(kuid); @@ -108,7 +81,7 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, uid = from_kuid(fs_userns, kuid); if (uid == (uid_t)-1) return INVALID_VFSUID; - return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); + return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid)); } EXPORT_SYMBOL_GPL(make_vfsuid); @@ -121,8 +94,8 @@ EXPORT_SYMBOL_GPL(make_vfsuid); * Take a @kgid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kgid to be reported to userspace. * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kgid unchanged. + * If initial_idmapping() determines that this is not an idmapped mount + * we can simply return @kgid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kgid won't change when calling * from_kgid() so we can simply retrieve the value via __kgid_val() @@ -136,9 +109,8 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid) { gid_t gid; - struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); if (initial_idmapping(fs_userns)) gid = __kgid_val(kgid); @@ -146,7 +118,7 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, gid = from_kgid(fs_userns, kgid); if (gid == (gid_t)-1) return INVALID_VFSGID; - return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); + return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid)); } EXPORT_SYMBOL_GPL(make_vfsgid); @@ -165,11 +137,10 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { uid_t uid; - struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return AS_KUIDT(vfsuid); - uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); + uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid)); if (uid == (uid_t)-1) return INVALID_UID; if (initial_idmapping(fs_userns)) @@ -193,11 +164,10 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { gid_t gid; - struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return AS_KGIDT(vfsgid); - gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); + gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid)); if (gid == (gid_t)-1) return INVALID_GID; if (initial_idmapping(fs_userns)) @@ -228,16 +198,91 @@ int vfsgid_in_group_p(vfsgid_t vfsgid) #endif EXPORT_SYMBOL_GPL(vfsgid_in_group_p); +static int copy_mnt_idmap(struct uid_gid_map *map_from, + struct uid_gid_map *map_to) +{ + struct uid_gid_extent *forward, *reverse; + u32 nr_extents = READ_ONCE(map_from->nr_extents); + /* 
Pairs with smp_wmb() when writing the idmapping. */ + smp_rmb(); + + /* + * Don't blindly copy @map_to into @map_from if nr_extents is + * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we + * read @nr_extents someone could have written an idmapping and + * then we might end up with inconsistent data. So just don't do + * anything at all. + */ + if (nr_extents == 0) + return 0; + + /* + * Here we know that nr_extents is greater than zero which means + * a map has been written. Since idmappings can't be changed + * once they have been written we know that we can safely copy + * from @map_to into @map_from. + */ + + if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { + *map_to = *map_from; + return 0; + } + + forward = kmemdup(map_from->forward, + nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); + if (!forward) + return -ENOMEM; + + reverse = kmemdup(map_from->reverse, + nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); + if (!reverse) { + kfree(forward); + return -ENOMEM; + } + + /* + * The idmapping isn't exposed anywhere so we don't need to care + * about ordering between extent pointers and @nr_extents + * initialization. + */ + map_to->forward = forward; + map_to->reverse = reverse; + map_to->nr_extents = nr_extents; + return 0; +} + +static void free_mnt_idmap(struct mnt_idmap *idmap) +{ + if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(idmap->uid_map.forward); + kfree(idmap->uid_map.reverse); + } + if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(idmap->gid_map.forward); + kfree(idmap->gid_map.reverse); + } + kfree(idmap); +} + struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) { struct mnt_idmap *idmap; + int ret; idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); if (!idmap) return ERR_PTR(-ENOMEM); - idmap->owner = get_user_ns(mnt_userns); refcount_set(&idmap->count, 1); + ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map); + if (!ret) + ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map); + if (ret) { + free_mnt_idmap(idmap); + idmap = ERR_PTR(ret); + } return idmap; } @@ -267,9 +312,7 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get); */ void mnt_idmap_put(struct mnt_idmap *idmap) { - if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { - put_user_ns(idmap->owner); - kfree(idmap); - } + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) + free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); diff --git a/fs/mount.h b/fs/mount.h index 130c07c2f8d2..4a42fc68f4cc 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -8,19 +8,13 @@ struct mnt_namespace { struct ns_common ns; struct mount * root; - /* - * Traversal and modification of .list is protected by either - * - taking namespace_sem for write, OR - * - taking namespace_sem for read AND taking .ns_lock. - */ - struct list_head list; - spinlock_t ns_lock; + struct rb_root mounts; /* Protected by namespace_sem */ struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; u64 event; - unsigned int mounts; /* # of mounts in the namespace */ + unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; } __randomize_layout; @@ -55,7 +49,10 @@ struct mount { struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */ const char *mnt_devname; /* Name of device e.g. 
/dev/dsk/hda1 */ - struct list_head mnt_list; + union { + struct rb_node mnt_node; /* Under ns->mounts */ + struct list_head mnt_list; + }; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list;/* list of slave mounts */ @@ -72,7 +69,8 @@ struct mount { struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; #endif - int mnt_id; /* mount identifier */ + int mnt_id; /* mount identifier, reused */ + u64 mnt_id_unique; /* mount ID unique until reboot */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; @@ -127,7 +125,6 @@ struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); - struct mount cursor; }; extern const struct seq_operations mounts_op; @@ -146,4 +143,12 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) return ns->seq == 0; } +static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) +{ + WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB)); + mnt->mnt.mnt_flags &= ~MNT_ONRB; + rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts); + list_add_tail(&mnt->mnt_list, dt_list); +} + extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); diff --git a/fs/mpage.c b/fs/mpage.c index ffb064ed9d04..738882e0766d 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -166,7 +166,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; - sector_t blocks[MAX_BUF_PER_PAGE]; + sector_t first_block; unsigned page_block; unsigned first_hole = blocks_per_page; struct block_device *bdev = NULL; @@ -205,6 +205,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) unsigned map_offset = block_in_file - args->first_logical_block; unsigned last = nblocks - map_offset; + first_block = map_bh->b_blocknr + map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { clear_buffer_mapped(map_bh); @@ -212,8 +213,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) } if (page_block == blocks_per_page) break; - blocks[page_block] = map_bh->b_blocknr + map_offset + - relative_block; page_block++; block_in_file++; } @@ -259,7 +258,9 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ - if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) + if (!page_block) + first_block = map_bh->b_blocknr; + else if (first_block + page_block != map_bh->b_blocknr) goto confused; nblocks = map_bh->b_size >> blkbits; for (relative_block = 0; ; relative_block++) { @@ -268,7 +269,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) break; } else if (page_block == blocks_per_page) break; - blocks[page_block] = map_bh->b_blocknr+relative_block; page_block++; block_in_file++; } @@ -289,7 +289,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) /* * This folio will go to BIO. Do we need to send this BIO off first? 
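With the blocks[] array removed, the run of blocks queued for this folio is described by first_block plus the running page_block count, so the check below compares against first_block instead of blocks[0].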
*/ - if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) + if (args->bio && (args->last_block_in_bio != first_block - 1)) args->bio = mpage_bio_submit_read(args->bio); alloc_new: @@ -298,7 +298,7 @@ alloc_new: gfp); if (args->bio == NULL) goto confused; - args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); + args->bio->bi_iter.bi_sector = first_block << (blkbits - 9); } length = first_hole << blkbits; @@ -313,7 +313,7 @@ alloc_new: (first_hole != blocks_per_page)) args->bio = mpage_bio_submit_read(args->bio); else - args->last_block_in_bio = blocks[blocks_per_page - 1]; + args->last_block_in_bio = first_block + blocks_per_page - 1; out: return args->bio; @@ -430,13 +430,13 @@ struct mpage_data { * We have our BIO, so we can now mark the buffers clean. Make * sure to only clean buffers which we know we'll be writing. */ -static void clean_buffers(struct page *page, unsigned first_unmapped) +static void clean_buffers(struct folio *folio, unsigned first_unmapped) { unsigned buffer_counter = 0; - struct buffer_head *bh, *head; - if (!page_has_buffers(page)) + struct buffer_head *bh, *head = folio_buffers(folio); + + if (!head) return; - head = page_buffers(page); bh = head; do { @@ -451,18 +451,8 @@ static void clean_buffers(struct page *page, unsigned first_unmapped) * read_folio would fail to serialize with the bh and it would read from * disk before we reach the platter. */ - if (buffer_heads_over_limit && PageUptodate(page)) - try_to_free_buffers(page_folio(page)); -} - -/* - * For situations where we want to clean all buffers attached to a page. - * We don't need to calculate how many buffers are attached to the page, - * we just need to specify a number larger than the maximum number of buffers. - */ -void clean_page_buffers(struct page *page) -{ - clean_buffers(page, ~0U); + if (buffer_heads_over_limit && folio_test_uptodate(folio)) + try_to_free_buffers(folio); } static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, @@ -476,7 +466,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, const unsigned blocks_per_page = PAGE_SIZE >> blkbits; sector_t last_block; sector_t block_in_file; - sector_t blocks[MAX_BUF_PER_PAGE]; + sector_t first_block; unsigned page_block; unsigned first_unmapped = blocks_per_page; struct block_device *bdev = NULL; @@ -514,10 +504,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, if (!buffer_dirty(bh) || !buffer_uptodate(bh)) goto confused; if (page_block) { - if (bh->b_blocknr != blocks[page_block-1] + 1) + if (bh->b_blocknr != first_block + page_block) goto confused; + } else { + first_block = bh->b_blocknr; } - blocks[page_block++] = bh->b_blocknr; + page_block++; boundary = buffer_boundary(bh); if (boundary) { boundary_block = bh->b_blocknr; @@ -566,10 +558,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, boundary_bdev = map_bh.b_bdev; } if (page_block) { - if (map_bh.b_blocknr != blocks[page_block-1] + 1) + if (map_bh.b_blocknr != first_block + page_block) goto confused; + } else { + first_block = map_bh.b_blocknr; } - blocks[page_block++] = map_bh.b_blocknr; + page_block++; boundary = buffer_boundary(&map_bh); bdev = map_bh.b_bdev; if (block_in_file == last_block) @@ -601,7 +595,7 @@ page_is_mapped: /* * This page will go to BIO. Do we need to send this BIO off first? 
*/ - if (bio && mpd->last_block_in_bio != blocks[0] - 1) + if (bio && mpd->last_block_in_bio != first_block - 1) bio = mpage_bio_submit_write(bio); alloc_new: @@ -609,7 +603,7 @@ alloc_new: bio = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS); - bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); + bio->bi_iter.bi_sector = first_block << (blkbits - 9); wbc_init_bio(wbc, bio); } @@ -625,7 +619,7 @@ alloc_new: goto alloc_new; } - clean_buffers(&folio->page, first_unmapped); + clean_buffers(folio, first_unmapped); BUG_ON(folio_test_writeback(folio)); folio_start_writeback(folio); @@ -637,7 +631,7 @@ alloc_new: boundary_block, 1 << blkbits); } } else { - mpd->last_block_in_bio = blocks[blocks_per_page - 1]; + mpd->last_block_in_bio = first_block + blocks_per_page - 1; } goto out; @@ -648,7 +642,7 @@ confused: /* * The caller has a ref on the inode, so *mapping is stable */ - ret = block_write_full_page(&folio->page, mpd->get_block, wbc); + ret = block_write_full_folio(folio, wbc, mpd->get_block); mapping_set_error(mapping, ret); out: mpd->bio = bio; diff --git a/fs/namei.c b/fs/namei.c index 71c13b2990b4..5c318d657503 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL(putname); * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) @@ -334,7 +334,7 @@ static int check_acl(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) @@ -395,7 +395,7 @@ static int acl_permission_check(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) @@ -1071,7 +1071,6 @@ static struct ctl_table namei_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { } }; static int __init init_fs_namei_sysctls(void) @@ -2467,7 +2466,7 @@ static int handle_lookup_down(struct nameidata *nd) return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } -/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +/* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); @@ -2522,7 +2521,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, return retval; } -/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +/* Returns 0 and nd will be valid on success; Returns error, otherwise. 
*/ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { @@ -3014,27 +3013,37 @@ static inline int may_create(struct mnt_idmap *idmap, return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } +// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) { - struct dentry *p; + struct dentry *p = p1, *q = p2, *r; - p = d_ancestor(p2, p1); - if (p) { + while ((r = p->d_parent) != p2 && r != p) + p = r; + if (r == p2) { + // p is a child of p2 and an ancestor of p1 or p1 itself inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); - inode_lock_nested(p1->d_inode, I_MUTEX_CHILD); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2); return p; } - - p = d_ancestor(p1, p2); - if (p) { + // p is the root of connected component that contains p1 + // p2 does not occur on the path from p to p1 + while ((r = q->d_parent) != p1 && r != p && r != q) + q = r; + if (r == p1) { + // q is a child of p1 and an ancestor of p2 or p2 itself inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); - inode_lock_nested(p2->d_inode, I_MUTEX_CHILD); - return p; + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); + return q; + } else if (likely(r == p)) { + // both p2 and p1 are descendents of p + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); + return NULL; + } else { // no common ancestor at the time we'd been called + mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); + return ERR_PTR(-EXDEV); } - - lock_two_inodes(p1->d_inode, p2->d_inode, - I_MUTEX_PARENT, I_MUTEX_PARENT2); - return NULL; } /* @@ -3158,7 +3167,7 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) @@ -3646,7 +3655,7 @@ static int do_open(struct nameidata *nd, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ static int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, @@ -3785,10 +3794,7 @@ static struct file *path_openat(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - if (unlikely(file->f_mode & FMODE_OPENED)) - fput(file); - else - release_empty_file(file); + fput(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; @@ -3954,7 +3960,7 @@ EXPORT_SYMBOL(user_path_create); * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) @@ -4080,7 +4086,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d * the vfsmount must be passed through @idmap. 
This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) @@ -4161,7 +4167,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) @@ -4290,7 +4296,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) @@ -4443,7 +4449,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) @@ -4535,7 +4541,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, @@ -4716,11 +4722,12 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. - * That's where 4.4 screws up. Current fix: serialization on + * That's where 4.4BSD screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. - * c) we have to lock _four_ objects - parents and victim (if it exists), - * and source. + * c) we may have to lock up to _four_ objects - parents and victim (if it exists), + * and source (if it's a non-directory or a subdirectory that moves to + * different parent). * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. 
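(With this patch the child inodes are locked selectively: vfs_rename() below takes them based on lock_old_subdir/lock_new_subdir rather than unconditionally calling lock_two_inodes(), and lock_two_directories() now walks the d_parent chains itself, failing with -EXDEV if the two parents no longer have a common ancestor.)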
We rely on the fact that tree topology may change @@ -4752,6 +4759,7 @@ int vfs_rename(struct renamedata *rd) bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; struct name_snapshot old_name; + bool lock_old_subdir, lock_new_subdir; if (source == target) return 0; @@ -4805,15 +4813,32 @@ int vfs_rename(struct renamedata *rd) take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); /* - * Lock all moved children. Moved directories may need to change parent - * pointer so they need the lock to prevent against concurrent - * directory changes moving parent pointer. For regular files we've - * historically always done this. The lockdep locking subclasses are - * somewhat arbitrary but RENAME_EXCHANGE in particular can swap - * regular files and directories so it's difficult to tell which - * subclasses to use. + * Lock children. + * The source subdirectory needs to be locked on cross-directory + * rename or cross-directory exchange since its parent changes. + * The target subdirectory needs to be locked on cross-directory + * exchange due to parent change and on any rename due to becoming + * a victim. + * Non-directories need locking in all cases (for NFS reasons); + * they get locked after any subdirectories (in inode address order). + * + * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE. + * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex. */ - lock_two_inodes(source, target, I_MUTEX_NORMAL, I_MUTEX_NONDIR2); + lock_old_subdir = new_dir != old_dir; + lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE); + if (is_dir) { + if (lock_old_subdir) + inode_lock_nested(source, I_MUTEX_CHILD); + if (target && (!new_is_dir || lock_new_subdir)) + inode_lock(target); + } else if (new_is_dir) { + if (lock_new_subdir) + inode_lock_nested(target, I_MUTEX_CHILD); + inode_lock(source); + } else { + lock_two_nondirectories(source, target); + } error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) @@ -4861,8 +4886,9 @@ int vfs_rename(struct renamedata *rd) d_exchange(old_dentry, new_dentry); } out: - inode_unlock(source); - if (target) + if (!is_dir || lock_old_subdir) + inode_unlock(source); + if (target && (!new_is_dir || lock_new_subdir)) inode_unlock(target); dput(new_dentry); if (!error) { @@ -4933,6 +4959,10 @@ retry: retry_deleg: trap = lock_rename(new_path.dentry, old_path.dentry); + if (IS_ERR(trap)) { + error = PTR_ERR(trap); + goto exit_lock_rename; + } old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, lookup_flags); @@ -5000,6 +5030,7 @@ exit4: dput(old_dentry); exit3: unlock_rename(new_path.dentry, old_path.dentry); +exit_lock_rename: if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/namespace.c b/fs/namespace.c index fbf0e596fcd3..ef1fd6829814 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -32,6 +32,7 @@ #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> +#include <linux/nospec.h> #include "pnode.h" #include "internal.h" @@ -68,6 +69,9 @@ static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); +/* Don't allow confusion with old 32bit mount ID */ +static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32); + static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; static struct kmem_cache *mnt_cache __ro_after_init; @@ -131,6 +135,7 @@ static int mnt_alloc_id(struct mount *mnt) if (res < 0) return res; mnt->mnt_id = 
res; + mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr); return 0; } @@ -730,21 +735,6 @@ struct vfsmount *lookup_mnt(const struct path *path) return m; } -static inline void lock_ns_list(struct mnt_namespace *ns) -{ - spin_lock(&ns->ns_lock); -} - -static inline void unlock_ns_list(struct mnt_namespace *ns) -{ - spin_unlock(&ns->ns_lock); -} - -static inline bool mnt_is_cursor(struct mount *mnt) -{ - return mnt->mnt.mnt_flags & MNT_CURSOR; -} - /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. @@ -763,19 +753,15 @@ static inline bool mnt_is_cursor(struct mount *mnt) bool __is_local_mountpoint(struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; - struct mount *mnt; + struct mount *mnt, *n; bool is_covered = false; down_read(&namespace_sem); - lock_ns_list(ns); - list_for_each_entry(mnt, &ns->list, mnt_list) { - if (mnt_is_cursor(mnt)) - continue; + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { is_covered = (mnt->mnt_mountpoint == dentry); if (is_covered) break; } - unlock_ns_list(ns); up_read(&namespace_sem); return is_covered; @@ -1022,6 +1008,30 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m mnt_add_count(old_parent, -1); } +static inline struct mount *node_to_mount(struct rb_node *node) +{ + return node ? rb_entry(node, struct mount, mnt_node) : NULL; +} + +static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) +{ + struct rb_node **link = &ns->mounts.rb_node; + struct rb_node *parent = NULL; + + WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB); + mnt->mnt_ns = ns; + while (*link) { + parent = *link; + if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + rb_link_node(&mnt->mnt_node, parent, link); + rb_insert_color(&mnt->mnt_node, &ns->mounts); + mnt->mnt.mnt_flags |= MNT_ONRB; +} + /* * vfsmount lock must be held for write */ @@ -1035,12 +1045,13 @@ static void commit_tree(struct mount *mnt) BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); - list_for_each_entry(m, &head, mnt_list) - m->mnt_ns = n; + while (!list_empty(&head)) { + m = list_first_entry(&head, typeof(*m), mnt_list); + list_del(&m->mnt_list); - list_splice(&head, n->list.prev); - - n->mounts += n->pending_mounts; + mnt_add_to_ns(n, m); + } + n->nr_mounts += n->pending_mounts; n->pending_mounts = 0; __attach_mnt(mnt, parent); @@ -1188,7 +1199,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags; - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1413,65 +1424,57 @@ struct vfsmount *mnt_clone_internal(const struct path *path) return &p->mnt; } -#ifdef CONFIG_PROC_FS -static struct mount *mnt_list_next(struct mnt_namespace *ns, - struct list_head *p) +/* + * Returns the mount which either has the specified mnt_id, or has the next + * smallest id afer the specified one. 
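Both the /proc mounts iterator (m_start() below) and listmount() use this to resume a scan at an arbitrary mount ID.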
+ */ +static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id) { - struct mount *mnt, *ret = NULL; + struct rb_node *node = ns->mounts.rb_node; + struct mount *ret = NULL; - lock_ns_list(ns); - list_for_each_continue(p, &ns->list) { - mnt = list_entry(p, typeof(*mnt), mnt_list); - if (!mnt_is_cursor(mnt)) { - ret = mnt; - break; + while (node) { + struct mount *m = node_to_mount(node); + + if (mnt_id <= m->mnt_id_unique) { + ret = node_to_mount(node); + if (mnt_id == m->mnt_id_unique) + break; + node = node->rb_left; + } else { + node = node->rb_right; } } - unlock_ns_list(ns); - return ret; } +#ifdef CONFIG_PROC_FS + /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; - struct list_head *prev; down_read(&namespace_sem); - if (!*pos) { - prev = &p->ns->list; - } else { - prev = &p->cursor.mnt_list; - - /* Read after we'd reached the end? */ - if (list_empty(prev)) - return NULL; - } - return mnt_list_next(p->ns, prev); + return mnt_find_id_at(p->ns, *pos); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct proc_mounts *p = m->private; - struct mount *mnt = v; + struct mount *next = NULL, *mnt = v; + struct rb_node *node = rb_next(&mnt->mnt_node); ++*pos; - return mnt_list_next(p->ns, &mnt->mnt_list); + if (node) { + next = node_to_mount(node); + *pos = next->mnt_id_unique; + } + return next; } static void m_stop(struct seq_file *m, void *v) { - struct proc_mounts *p = m->private; - struct mount *mnt = v; - - lock_ns_list(p->ns); - if (mnt) - list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); - else - list_del_init(&p->cursor.mnt_list); - unlock_ns_list(p->ns); up_read(&namespace_sem); } @@ -1489,14 +1492,6 @@ const struct seq_operations mounts_op = { .show = m_show, }; -void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) -{ - down_read(&namespace_sem); - lock_ns_list(ns); - list_del(&cursor->mnt_list); - unlock_ns_list(ns); - up_read(&namespace_sem); -} #endif /* CONFIG_PROC_FS */ /** @@ -1638,7 +1633,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; - list_move(&p->mnt_list, &tmp_list); + if (p->mnt.mnt_flags & MNT_ONRB) + move_from_ns(p, &tmp_list); + else + list_move(&p->mnt_list, &tmp_list); } /* Hide the mounts from mnt_mounts */ @@ -1658,7 +1656,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) list_del_init(&p->mnt_list); ns = p->mnt_ns; if (ns) { - ns->mounts--; + ns->nr_mounts--; __touch_mnt_namespace(ns); } p->mnt_ns = NULL; @@ -1784,14 +1782,16 @@ static int do_umount(struct mount *mnt, int flags) event++; if (flags & MNT_DETACH) { - if (!list_empty(&mnt->mnt_list)) + if (mnt->mnt.mnt_flags & MNT_ONRB || + !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { - if (!list_empty(&mnt->mnt_list)) + if (mnt->mnt.mnt_flags & MNT_ONRB || + !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } @@ -2209,9 +2209,9 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) unsigned int mounts = 0; struct mount *p; - if (ns->mounts >= max) + if (ns->nr_mounts >= max) return -ENOSPC; - max -= ns->mounts; + max -= ns->nr_mounts; if (ns->pending_mounts >= max) return -ENOSPC; max -= ns->pending_mounts; @@ -2355,8 
+2355,12 @@ static int attach_recursive_mnt(struct mount *source_mnt, touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { + LIST_HEAD(head); + /* move from anon - the caller will destroy */ - list_del_init(&source_mnt->mnt_ns->list); + for (p = source_mnt; p; p = next_mnt(p, source_mnt)) + move_from_ns(p, &head); + list_del_init(&head); } if (beneath) mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); @@ -2667,11 +2671,10 @@ static struct file *open_detached_copy(struct path *path, bool recursive) lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { - p->mnt_ns = ns; - ns->mounts++; + mnt_add_to_ns(ns, p); + ns->nr_mounts++; } ns->root = mnt; - list_add_tail(&ns->list, &mnt->mnt_list); mntget(&mnt->mnt); unlock_mount_hash(); namespace_unlock(); @@ -2875,7 +2878,12 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, if (IS_ERR(fc)) return PTR_ERR(fc); + /* + * Indicate to the filesystem that the remount request is coming + * from the legacy mount system call. + */ fc->oldapi = true; + err = parse_monolithic_mount_data(fc, data); if (!err) { down_write(&sb->s_umount); @@ -3026,6 +3034,7 @@ static inline bool path_overmounted(const struct path *path) * can_move_mount_beneath - check that we can mount beneath the top mount * @from: mount to mount beneath * @to: mount under which to mount + * @mp: mountpoint of @to * * - Make sure that @to->dentry is actually the root of a mount under * which we can mount another mount. @@ -3324,6 +3333,12 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, if (IS_ERR(fc)) return PTR_ERR(fc); + /* + * Indicate to the filesystem that the mount request is coming + * from the legacy mount system call. + */ + fc->oldapi = true; + if (subtype) err = vfs_parse_fs_string(fc, "subtype", subtype, strlen(subtype)); @@ -3734,9 +3749,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!anon) new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); - INIT_LIST_HEAD(&new_ns->list); + new_ns->mounts = RB_ROOT; init_waitqueue_head(&new_ns->poll); - spin_lock_init(&new_ns->ns_lock); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; @@ -3783,7 +3797,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, unlock_mount_hash(); } new_ns->root = new; - list_add_tail(&new_ns->list, &new->mnt_list); /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts @@ -3793,8 +3806,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, p = old; q = new; while (p) { - q->mnt_ns = new_ns; - new_ns->mounts++; + mnt_add_to_ns(new_ns, q); + new_ns->nr_mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); @@ -3836,10 +3849,9 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name) mntput(m); return ERR_CAST(ns); } - mnt->mnt_ns = ns; ns->root = mnt; - ns->mounts++; - list_add(&mnt->mnt_list, &ns->list); + ns->nr_mounts++; + mnt_add_to_ns(ns, mnt); err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); @@ -4017,10 +4029,9 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, goto err_path; } mnt = real_mount(newmount.mnt); - mnt->mnt_ns = ns; ns->root = mnt; - ns->mounts = 1; - list_add(&mnt->mnt_list, &ns->list); + ns->nr_mounts = 1; + mnt_add_to_ns(ns, mnt); mntget(newmount.mnt); /* Attach to an apparent O_PATH fd with a note that we need to unmount @@ 
-4288,7 +4299,7 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ - if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb)) + if (kattr->mnt_userns == m->mnt_sb->s_user_ns) return -EINVAL; /* @@ -4676,6 +4687,438 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, return err; } +int show_path(struct seq_file *m, struct dentry *root) +{ + if (root->d_sb->s_op->show_path) + return root->d_sb->s_op->show_path(m, root); + + seq_dentry(m, root, " \t\n\\"); + return 0; +} + +static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns) +{ + struct mount *mnt = mnt_find_id_at(ns, id); + + if (!mnt || mnt->mnt_id_unique != id) + return NULL; + + return &mnt->mnt; +} + +struct kstatmount { + struct statmount __user *buf; + size_t bufsize; + struct vfsmount *mnt; + u64 mask; + struct path root; + struct statmount sm; + struct seq_file seq; +}; + +static u64 mnt_to_attr_flags(struct vfsmount *mnt) +{ + unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags); + u64 attr_flags = 0; + + if (mnt_flags & MNT_READONLY) + attr_flags |= MOUNT_ATTR_RDONLY; + if (mnt_flags & MNT_NOSUID) + attr_flags |= MOUNT_ATTR_NOSUID; + if (mnt_flags & MNT_NODEV) + attr_flags |= MOUNT_ATTR_NODEV; + if (mnt_flags & MNT_NOEXEC) + attr_flags |= MOUNT_ATTR_NOEXEC; + if (mnt_flags & MNT_NODIRATIME) + attr_flags |= MOUNT_ATTR_NODIRATIME; + if (mnt_flags & MNT_NOSYMFOLLOW) + attr_flags |= MOUNT_ATTR_NOSYMFOLLOW; + + if (mnt_flags & MNT_NOATIME) + attr_flags |= MOUNT_ATTR_NOATIME; + else if (mnt_flags & MNT_RELATIME) + attr_flags |= MOUNT_ATTR_RELATIME; + else + attr_flags |= MOUNT_ATTR_STRICTATIME; + + if (is_idmapped_mnt(mnt)) + attr_flags |= MOUNT_ATTR_IDMAP; + + return attr_flags; +} + +static u64 mnt_to_propagation_flags(struct mount *m) +{ + u64 propagation = 0; + + if (IS_MNT_SHARED(m)) + propagation |= MS_SHARED; + if (IS_MNT_SLAVE(m)) + propagation |= MS_SLAVE; + if (IS_MNT_UNBINDABLE(m)) + propagation |= MS_UNBINDABLE; + if (!propagation) + propagation |= MS_PRIVATE; + + return propagation; +} + +static void statmount_sb_basic(struct kstatmount *s) +{ + struct super_block *sb = s->mnt->mnt_sb; + + s->sm.mask |= STATMOUNT_SB_BASIC; + s->sm.sb_dev_major = MAJOR(sb->s_dev); + s->sm.sb_dev_minor = MINOR(sb->s_dev); + s->sm.sb_magic = sb->s_magic; + s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME); +} + +static void statmount_mnt_basic(struct kstatmount *s) +{ + struct mount *m = real_mount(s->mnt); + + s->sm.mask |= STATMOUNT_MNT_BASIC; + s->sm.mnt_id = m->mnt_id_unique; + s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique; + s->sm.mnt_id_old = m->mnt_id; + s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id; + s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt); + s->sm.mnt_propagation = mnt_to_propagation_flags(m); + s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0; + s->sm.mnt_master = IS_MNT_SLAVE(m) ? 
m->mnt_master->mnt_group_id : 0; +} + +static void statmount_propagate_from(struct kstatmount *s) +{ + struct mount *m = real_mount(s->mnt); + + s->sm.mask |= STATMOUNT_PROPAGATE_FROM; + if (IS_MNT_SLAVE(m)) + s->sm.propagate_from = get_dominating_id(m, ¤t->fs->root); +} + +static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq) +{ + int ret; + size_t start = seq->count; + + ret = show_path(seq, s->mnt->mnt_root); + if (ret) + return ret; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + /* + * Unescape the result. It would be better if supplied string was not + * escaped in the first place, but that's a pretty invasive change. + */ + seq->buf[seq->count] = '\0'; + seq->count = start; + seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); + return 0; +} + +static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + int err; + + err = seq_path_root(seq, &mnt_path, &s->root, ""); + return err == SEQ_SKIP ? 0 : err; +} + +static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + + seq_puts(seq, sb->s_type->name); + return 0; +} + +static int statmount_string(struct kstatmount *s, u64 flag) +{ + int ret; + size_t kbufsize; + struct seq_file *seq = &s->seq; + struct statmount *sm = &s->sm; + + switch (flag) { + case STATMOUNT_FS_TYPE: + sm->fs_type = seq->count; + ret = statmount_fs_type(s, seq); + break; + case STATMOUNT_MNT_ROOT: + sm->mnt_root = seq->count; + ret = statmount_mnt_root(s, seq); + break; + case STATMOUNT_MNT_POINT: + sm->mnt_point = seq->count; + ret = statmount_mnt_point(s, seq); + break; + default: + WARN_ON_ONCE(true); + return -EINVAL; + } + + if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) + return -EOVERFLOW; + if (kbufsize >= s->bufsize) + return -EOVERFLOW; + + /* signal a retry */ + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (ret) + return ret; + + seq->buf[seq->count++] = '\0'; + sm->mask |= flag; + return 0; +} + +static int copy_statmount_to_user(struct kstatmount *s) +{ + struct statmount *sm = &s->sm; + struct seq_file *seq = &s->seq; + char __user *str = ((char __user *)s->buf) + sizeof(*sm); + size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm)); + + if (seq->count && copy_to_user(str, seq->buf, seq->count)) + return -EFAULT; + + /* Return the number of bytes copied to the buffer */ + sm->size = copysize + seq->count; + if (copy_to_user(s->buf, sm, copysize)) + return -EFAULT; + + return 0; +} + +static int do_statmount(struct kstatmount *s) +{ + struct mount *m = real_mount(s->mnt); + int err; + + /* + * Don't trigger audit denials. We just want to determine what + * mounts to show users. 
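If the requested strings do not fit into the scratch buffer, the string helpers above return -EAGAIN and statmount() retries the whole call with a doubled buffer; see retry_statmount() below.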
+ */ + if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && + !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + err = security_sb_statfs(s->mnt->mnt_root); + if (err) + return err; + + if (s->mask & STATMOUNT_SB_BASIC) + statmount_sb_basic(s); + + if (s->mask & STATMOUNT_MNT_BASIC) + statmount_mnt_basic(s); + + if (s->mask & STATMOUNT_PROPAGATE_FROM) + statmount_propagate_from(s); + + if (s->mask & STATMOUNT_FS_TYPE) + err = statmount_string(s, STATMOUNT_FS_TYPE); + + if (!err && s->mask & STATMOUNT_MNT_ROOT) + err = statmount_string(s, STATMOUNT_MNT_ROOT); + + if (!err && s->mask & STATMOUNT_MNT_POINT) + err = statmount_string(s, STATMOUNT_MNT_POINT); + + if (err) + return err; + + return 0; +} + +static inline bool retry_statmount(const long ret, size_t *seq_size) +{ + if (likely(ret != -EAGAIN)) + return false; + if (unlikely(check_mul_overflow(*seq_size, 2, seq_size))) + return false; + if (unlikely(*seq_size > MAX_RW_COUNT)) + return false; + return true; +} + +static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, + struct statmount __user *buf, size_t bufsize, + size_t seq_size) +{ + if (!access_ok(buf, bufsize)) + return -EFAULT; + + memset(ks, 0, sizeof(*ks)); + ks->mask = kreq->param; + ks->buf = buf; + ks->bufsize = bufsize; + ks->seq.size = seq_size; + ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT); + if (!ks->seq.buf) + return -ENOMEM; + return 0; +} + +static int copy_mnt_id_req(const struct mnt_id_req __user *req, + struct mnt_id_req *kreq) +{ + int ret; + size_t usize; + + BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0); + + ret = get_user(usize, &req->size); + if (ret) + return -EFAULT; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < MNT_ID_REQ_SIZE_VER0)) + return -EINVAL; + memset(kreq, 0, sizeof(*kreq)); + ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); + if (ret) + return ret; + if (kreq->spare != 0) + return -EINVAL; + return 0; +} + +SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, + struct statmount __user *, buf, size_t, bufsize, + unsigned int, flags) +{ + struct vfsmount *mnt; + struct mnt_id_req kreq; + struct kstatmount ks; + /* We currently support retrieval of 3 strings. */ + size_t seq_size = 3 * PATH_MAX; + int ret; + + if (flags) + return -EINVAL; + + ret = copy_mnt_id_req(req, &kreq); + if (ret) + return ret; + +retry: + ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size); + if (ret) + return ret; + + down_read(&namespace_sem); + mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns); + if (!mnt) { + up_read(&namespace_sem); + kvfree(ks.seq.buf); + return -ENOENT; + } + + ks.mnt = mnt; + get_fs_root(current->fs, &ks.root); + ret = do_statmount(&ks); + path_put(&ks.root); + up_read(&namespace_sem); + + if (!ret) + ret = copy_statmount_to_user(&ks); + kvfree(ks.seq.buf); + if (retry_statmount(ret, &seq_size)) + goto retry; + return ret; +} + +static struct mount *listmnt_next(struct mount *curr) +{ + return node_to_mount(rb_next(&curr->mnt_node)); +} + +static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id, + u64 __user *buf, size_t bufsize, + const struct path *root) +{ + struct mount *r; + ssize_t ctr; + int err; + + /* + * Don't trigger audit denials. We just want to determine what + * mounts to show users. 
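Userspace pages through large namespaces by passing the last mount ID it saw in mnt_id_req.param; listmount() then resumes the scan from mnt_find_id_at(ns, last_mnt_id + 1).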
+ */ + if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root) && + !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + err = security_sb_statfs(orig->dentry); + if (err) + return err; + + for (ctr = 0, r = first; r && ctr < bufsize; r = listmnt_next(r)) { + if (r->mnt_id_unique == mnt_id) + continue; + if (!is_path_reachable(r, r->mnt.mnt_root, orig)) + continue; + ctr = array_index_nospec(ctr, bufsize); + if (put_user(r->mnt_id_unique, buf + ctr)) + return -EFAULT; + if (check_add_overflow(ctr, 1, &ctr)) + return -ERANGE; + } + return ctr; +} + +SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, + u64 __user *, buf, size_t, bufsize, unsigned int, flags) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mnt_id_req kreq; + struct mount *first; + struct path root, orig; + u64 mnt_id, last_mnt_id; + ssize_t ret; + + if (flags) + return -EINVAL; + + ret = copy_mnt_id_req(req, &kreq); + if (ret) + return ret; + mnt_id = kreq.mnt_id; + last_mnt_id = kreq.param; + + down_read(&namespace_sem); + get_fs_root(current->fs, &root); + if (mnt_id == LSMT_ROOT) { + orig = root; + } else { + ret = -ENOENT; + orig.mnt = lookup_mnt_in_ns(mnt_id, ns); + if (!orig.mnt) + goto err; + orig.dentry = orig.mnt->mnt_root; + } + if (!last_mnt_id) + first = node_to_mount(rb_first(&ns->mounts)); + else + first = mnt_find_id_at(ns, last_mnt_id + 1); + + ret = do_listmount(first, &orig, mnt_id, buf, bufsize, &root); +err: + path_put(&root); + up_read(&namespace_sem); + return ret; +} + + static void __init init_mount_tree(void) { struct vfsmount *mnt; @@ -4691,10 +5134,9 @@ static void __init init_mount_tree(void) if (IS_ERR(ns)) panic("Can't allocate initial namespace"); m = real_mount(mnt); - m->mnt_ns = ns; ns->root = m; - ns->mounts = 1; - list_add(&m->mnt_list, &ns->list); + ns->nr_mounts = 1; + mnt_add_to_ns(ns, m); init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); @@ -4821,18 +5263,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns, int *new_mnt_flags) { int new_flags = *new_mnt_flags; - struct mount *mnt; + struct mount *mnt, *n; bool visible = false; down_read(&namespace_sem); - lock_ns_list(ns); - list_for_each_entry(mnt, &ns->list, mnt_list) { + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { struct mount *child; int mnt_flags; - if (mnt_is_cursor(mnt)) - continue; - if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; @@ -4880,7 +5318,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns, next: ; } found: - unlock_ns_list(ns); up_read(&namespace_sem); return visible; } @@ -5010,7 +5447,6 @@ static struct ctl_table fs_namespace_sysctls[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, - { } }; static int __init init_fs_namespace_sysctls(void) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 943aeea1eb16..6be13e0ec170 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -580,6 +580,8 @@ retry: nfs4_delete_deviceid(node->ld, node->nfs_client, id); goto retry; } + + nfs4_put_deviceid_node(node); return ERR_PTR(-ENODEV); } @@ -893,10 +895,9 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } if (pgio->pg_dreq == NULL) - wb_size = pnfs_num_cont_bytes(pgio->pg_inode, - req->wb_index); + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, req->wb_index); else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq, req_offset(req)); pnfs_generic_pg_init_write(pgio, 
req, wb_size); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index f318a05a80e1..c97ebc42ec0f 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -351,6 +351,9 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, d->map = bl_map_simple; d->pr_key = v->scsi.pr_key; + if (d->len == 0) + return -ENODEV; + pr_info("pNFS: using block device %s (reservation key 0x%llx)\n", d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key); diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index 6c977288cc28..d8d50a88de04 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -75,7 +75,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, msg->len = sizeof(*bl_msg) + b->simple.len; msg->data = kzalloc(msg->len, gfp_mask); if (!msg->data) - goto out_free_data; + goto out_unlock; bl_msg = msg->data; bl_msg->type = BL_DEVICE_MOUNT; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 4ffa1f469e90..760d27dd7225 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -187,7 +187,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) * Check whether we're already up and running. */ if (cb_info->serv) - return svc_get(cb_info->serv); + return cb_info->serv; /* * Sanity check: if there's no task, @@ -245,9 +245,10 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) cb_info->users++; err_net: - if (!cb_info->users) - cb_info->serv = NULL; - svc_put(serv); + if (!cb_info->users) { + svc_set_num_threads(cb_info->serv, NULL, 0); + svc_destroy(&cb_info->serv); + } err_create: mutex_unlock(&nfs_callback_mutex); return ret; @@ -271,11 +272,9 @@ void nfs_callback_down(int minorversion, struct net *net) nfs_callback_down_net(minorversion, serv, net); cb_info->users--; if (cb_info->users == 0) { - svc_get(serv); svc_set_num_threads(serv, NULL, 0); - svc_put(serv); dprintk("nfs_callback_down: service destroyed\n"); - cb_info->serv = NULL; + svc_destroy(&cb_info->serv); } mutex_unlock(&nfs_callback_mutex); } diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index ccd4f245cae2..650758ee0d5f 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -19,32 +19,14 @@ enum nfs4_callback_procnum { CB_COMPOUND = 1, }; -enum nfs4_callback_opnum { - OP_CB_GETATTR = 3, - OP_CB_RECALL = 4, -/* Callback operations new to NFSv4.1 */ - OP_CB_LAYOUTRECALL = 5, - OP_CB_NOTIFY = 6, - OP_CB_PUSH_DELEG = 7, - OP_CB_RECALL_ANY = 8, - OP_CB_RECALLABLE_OBJ_AVAIL = 9, - OP_CB_RECALL_SLOT = 10, - OP_CB_SEQUENCE = 11, - OP_CB_WANTS_CANCELLED = 12, - OP_CB_NOTIFY_LOCK = 13, - OP_CB_NOTIFY_DEVICEID = 14, -/* Callback operations new to NFSv4.2 */ - OP_CB_OFFLOAD = 15, - OP_CB_ILLEGAL = 10044, -}; - struct nfs4_slot; struct cb_process_state { - __be32 drc_status; struct nfs_client *clp; struct nfs4_slot *slot; - u32 minorversion; struct net *net; + u32 minorversion; + __be32 drc_status; + unsigned int referring_calls; }; struct cb_compound_hdr_arg { diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 96a4923080ae..76cea34477ae 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -207,7 +207,8 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp, * Enforce RFC5661 section 12.5.5.2.1. 
(Layout Recall and Return Sequencing) */ static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, - const nfs4_stateid *new) + const nfs4_stateid *new, + struct cb_process_state *cps) { u32 oldseq, newseq; @@ -221,28 +222,29 @@ static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, newseq = be32_to_cpu(new->seqid); /* Are we already in a layout recall situation? */ - if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && - lo->plh_return_seq != 0) { - if (newseq < lo->plh_return_seq) - return NFS4ERR_OLD_STATEID; - if (newseq > lo->plh_return_seq) - return NFS4ERR_DELAY; - goto out; - } + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return NFS4ERR_DELAY; - /* Check that the stateid matches what we think it should be. */ + /* + * Check that the stateid matches what we think it should be. + * Note that if the server sent us a list of referring calls, + * and we know that those have completed, then we trust the + * stateid argument is correct. + */ oldseq = be32_to_cpu(lo->plh_stateid.seqid); - if (newseq > oldseq + 1) + if (newseq > oldseq + 1 && !cps->referring_calls) return NFS4ERR_DELAY; + /* Crazy server! */ if (newseq <= oldseq) return NFS4ERR_OLD_STATEID; -out: + return NFS_OK; } static u32 initiate_file_draining(struct nfs_client *clp, - struct cb_layoutrecallargs *args) + struct cb_layoutrecallargs *args, + struct cb_process_state *cps) { struct inode *ino; struct pnfs_layout_hdr *lo; @@ -266,7 +268,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto out; } pnfs_get_layout_hdr(lo); - rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); + rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid, cps); if (rv != NFS_OK) goto unlock; @@ -326,10 +328,11 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, } static u32 do_callback_layoutrecall(struct nfs_client *clp, - struct cb_layoutrecallargs *args) + struct cb_layoutrecallargs *args, + struct cb_process_state *cps) { if (args->cbl_recall_type == RETURN_FILE) - return initiate_file_draining(clp, args); + return initiate_file_draining(clp, args, cps); return initiate_bulk_draining(clp, args); } @@ -340,11 +343,12 @@ __be32 nfs4_callback_layoutrecall(void *argp, void *resp, u32 res = NFS4ERR_OP_NOT_IN_SESSION; if (cps->clp) - res = do_callback_layoutrecall(cps->clp, args); + res = do_callback_layoutrecall(cps->clp, args, cps); return cpu_to_be32(res); } -static void pnfs_recall_all_layouts(struct nfs_client *clp) +static void pnfs_recall_all_layouts(struct nfs_client *clp, + struct cb_process_state *cps) { struct cb_layoutrecallargs args; @@ -352,7 +356,7 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp) memset(&args, 0, sizeof(args)); args.cbl_recall_type = RETURN_ALL; /* FIXME we ignore errors, what should we do? */ - do_callback_layoutrecall(clp, &args); + do_callback_layoutrecall(clp, &args, cps); } __be32 nfs4_callback_devicenotify(void *argp, void *resp, @@ -450,6 +454,7 @@ static int referring_call_exists(struct nfs_client *clp, __acquires(lock) { int status = 0; + int found = 0; int i, j; struct nfs4_session *session; struct nfs4_slot_table *tbl; @@ -478,11 +483,12 @@ static int referring_call_exists(struct nfs_client *clp, spin_lock(lock); if (status) goto out; + found++; } } out: - return status; + return status < 0 ? 
status : found; } __be32 nfs4_callback_sequence(void *argp, void *resp, @@ -493,6 +499,7 @@ __be32 nfs4_callback_sequence(void *argp, void *resp, struct nfs4_slot_table *tbl; struct nfs4_slot *slot; struct nfs_client *clp; + int ret; int i; __be32 status = htonl(NFS4ERR_BADSESSION); @@ -552,11 +559,13 @@ __be32 nfs4_callback_sequence(void *argp, void *resp, * related callback was received before the response to the original * call. */ - if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists, - &tbl->slot_tbl_lock) < 0) { + ret = referring_call_exists(clp, args->csa_nrclists, args->csa_rclists, + &tbl->slot_tbl_lock); + if (ret < 0) { status = htonl(NFS4ERR_DELAY); goto out_unlock; } + cps->referring_calls = ret; /* * RFC5661 20.9.3 @@ -617,7 +626,7 @@ __be32 nfs4_callback_recallany(void *argp, void *resp, nfs_expire_unused_delegation_types(cps->clp, flags); if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT)) - pnfs_recall_all_layouts(cps->clp); + pnfs_recall_all_layouts(cps->clp, cps); if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) { set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 321af81c456e..9369488f2ed4 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -967,6 +967,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) nops--; } + if (svc_is_backchannel(rqstp) && cps.clp) { + rqstp->bc_to_initval = cps.clp->cl_rpcclient->cl_timeout->to_initval; + rqstp->bc_to_retries = cps.clp->cl_rpcclient->cl_timeout->to_retries; + } + *hdr_res.status = status; *hdr_res.nops = htonl(nops); nfs4_cb_free_slot(&cps); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 13dffe4201e6..c8ecbe999059 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2194,6 +2194,8 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, { struct inode *inode; + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); + if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) goto full_reval; if (d_mountpoint(dentry)) @@ -2963,7 +2965,7 @@ static u64 nfs_access_login_time(const struct task_struct *task, rcu_read_lock(); for (;;) { parent = rcu_dereference(task->real_parent); - pcred = rcu_dereference(parent->cred); + pcred = __task_cred(parent); if (parent == task || cred_fscmp(pcred, cred) != 0) break; task = parent; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f6c74f424691..c03926a1cc73 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -205,9 +205,10 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq) kref_put(&dreq->kref, nfs_direct_req_free); } -ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq) +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset) { - return dreq->bytes_left; + loff_t start = offset - dreq->io_start; + return dreq->max_count - start; } EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); @@ -368,7 +369,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, bytes -= req_len; requested_bytes += req_len; pos += req_len; - dreq->bytes_left -= req_len; } nfs_direct_release_pages(pagevec, npages); kvfree(pagevec); @@ -440,7 +440,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, goto out; dreq->inode = inode; - dreq->bytes_left = dreq->max_count = count; + dreq->max_count = count; dreq->io_start = iocb->ki_pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); @@ -873,7 +873,6 @@ static ssize_t 
nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, bytes -= req_len; requested_bytes += req_len; pos += req_len; - dreq->bytes_left -= req_len; if (defer) { nfs_mark_request_commit(req, NULL, &cinfo, 0); @@ -980,7 +979,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, goto out; dreq->inode = inode; - dreq->bytes_left = dreq->max_count = count; + dreq->max_count = count; dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3f9768810427..8577ccf621f5 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -558,7 +558,6 @@ const struct address_space_operations nfs_file_aops = { .read_folio = nfs_read_folio, .readahead = nfs_readahead, .dirty_folio = filemap_dirty_folio, - .writepage = nfs_writepage, .writepages = nfs_writepages, .write_begin = nfs_write_begin, .write_end = nfs_write_end, @@ -567,7 +566,7 @@ const struct address_space_operations nfs_file_aops = { .migrate_folio = nfs_migrate_folio, .launder_folio = nfs_launder_folio, .is_dirty_writeback = nfs_check_dirty_writeback, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, .swap_rw = nfs_swap_rw, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9c9cf764f600..e3722ce6722e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -655,7 +655,7 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); -extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset); /* nfs4proc.c */ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, @@ -936,7 +936,6 @@ struct nfs_direct_req { loff_t io_start; /* Start offset for I/O */ ssize_t count, /* bytes actually processed */ max_count, /* max expected count */ - bytes_left, /* bytes left to be sent */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 2ad66a8922f4..49aaf28a6950 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -132,7 +132,7 @@ nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry) lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; - return list_lru_add(lru, &entry->lru); + return list_lru_add_obj(lru, &entry->lru); } static bool @@ -143,7 +143,7 @@ nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry) lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? 
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; - return list_lru_del(lru, &entry->lru); + return list_lru_del_obj(lru, &entry->lru); } /* @@ -349,7 +349,7 @@ nfs4_xattr_cache_unlink(struct inode *inode) oldcache = nfsi->xattr_cache; if (oldcache != NULL) { - list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru); + list_lru_del_obj(&nfs4_xattr_cache_lru, &oldcache->lru); oldcache->inode = NULL; } nfsi->xattr_cache = NULL; @@ -474,7 +474,7 @@ nfs4_xattr_get_cache(struct inode *inode, int add) kref_get(&cache->ref); nfsi->xattr_cache = cache; cache->inode = inode; - list_lru_add(&nfs4_xattr_cache_lru, &cache->lru); + list_lru_add_obj(&nfs4_xattr_cache_lru, &cache->lru); } spin_unlock(&inode->i_lock); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 02788c3c85e5..e238abc78a13 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -10,6 +10,7 @@ #include <linux/mount.h> #include <linux/nfs_fs.h> #include <linux/nfs_ssc.h> +#include <linux/splice.h> #include "delegation.h" #include "internal.h" #include "iostat.h" @@ -195,8 +196,8 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count, flags); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(file_in, pos_in, file_out, - pos_out, count, flags); + ret = splice_copy_file_range(file_in, pos_in, file_out, + pos_out, count); return ret; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8a943fffaad5..23819a756508 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -170,6 +170,7 @@ static int nfs4_map_errors(int err) case -NFS4ERR_RESOURCE: case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_RETURNCONFLICT: return -EREMOTEIO; case -NFS4ERR_WRONGSEC: case -NFS4ERR_WRONG_CRED: @@ -558,6 +559,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_GRACE: case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_RETURNCONFLICT: exception->delay = 1; return 0; @@ -9691,6 +9693,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, status = -EBUSY; break; case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_RETURNCONFLICT: status = -ERECALLCONFLICT; break; case -NFS4ERR_DELEG_REVOKED: diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index e776200e9a11..886a7c4c60b3 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -34,7 +34,6 @@ static struct ctl_table nfs4_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; int nfs4_register_sysctl(void) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index deec76cf5afe..69406e60f391 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1602,7 +1602,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) { uint32_t attrs[3] = { - FATTR4_WORD0_RDATTR_ERROR, + FATTR4_WORD0_TYPE + | FATTR4_WORD0_RDATTR_ERROR, FATTR4_WORD1_MOUNTED_ON_FILEID, }; uint32_t dircount = readdir->count; @@ -1612,12 +1613,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg unsigned int i; if (readdir->plus) { - attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| - FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID; - attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER| - FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| - 
FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| - FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + attrs[0] |= FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEHANDLE + | FATTR4_WORD0_FILEID; + attrs[1] |= FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY; attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; } /* Use mounted_on_fileid only if the server supports it */ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 4e90ca531176..afedb449b54f 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -400,6 +400,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) + __field(u64, fileid) __string(name, dentry->d_name.name) ), @@ -407,16 +408,18 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; + __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); __assign_str(name, dentry->d_name.name); ), TP_printk( - "flags=0x%lx (%s) name=%02x:%02x:%llu/%s", + "flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu", __entry->flags, show_fs_lookup_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, - __get_str(name) + __get_str(name), + __entry->fileid ) ); @@ -444,6 +447,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done, __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) + __field(u64, fileid) __string(name, dentry->d_name.name) ), @@ -452,17 +456,19 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done, __entry->dir = NFS_FILEID(dir); __entry->error = error < 0 ? -error : 0; __entry->flags = flags; + __entry->fileid = d_is_negative(dentry) ? 
0 : NFS_FILEID(d_inode(dentry)); __assign_str(name, dentry->d_name.name); ), TP_printk( - "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s", + "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu", -__entry->error, show_nfs_status(__entry->error), __entry->flags, show_fs_lookup_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, - __get_str(name) + __get_str(name), + __entry->fileid ) ); @@ -893,7 +899,7 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done, DEFINE_NFS_RENAME_EVENT(nfs_rename_enter); DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit); -DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename); +DEFINE_NFS_RENAME_EVENT_DONE(nfs_async_rename_done); TRACE_EVENT(nfs_sillyrename_unlink, TP_PROTO( @@ -1539,7 +1545,6 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, __field(u32, fhandle) __field(loff_t, offset) __field(ssize_t, count) - __field(ssize_t, bytes_left) __field(ssize_t, error) __field(int, flags) ), @@ -1554,19 +1559,18 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, __entry->fhandle = nfs_fhandle_hash(fh); __entry->offset = dreq->io_start; __entry->count = dreq->count; - __entry->bytes_left = dreq->bytes_left; __entry->error = dreq->error; __entry->flags = dreq->flags; ), TP_printk( "error=%zd fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zd bytes_left=%zd flags=%s", + "offset=%lld count=%zd flags=%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, - __entry->count, __entry->bytes_left, + __entry->count, nfs_show_direct_req_flags(__entry->flags) ) ); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 21a365357629..0c0fed1ecd0b 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2733,7 +2733,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); else - rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq, + req_offset(req)); pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index f39e2089bc4c..e645be1a3381 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -29,7 +29,6 @@ static struct ctl_table nfs_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; int nfs_register_sysctl(void) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 150a953a8be9..0110299643a2 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -267,7 +267,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct inode *new_dir = data->new_dir; struct dentry *old_dentry = data->old_dentry; - trace_nfs_sillyrename_rename(old_dir, old_dentry, + trace_nfs_async_rename_done(old_dir, old_dentry, new_dir, data->new_dentry, task->tk_status); if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { rpc_restart_call_prepare(task); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b664caea8b4e..bb79d3a886ae 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -192,13 +192,13 @@ static struct nfs_page *nfs_folio_find_private_request(struct folio *folio) if (!folio_test_private(folio)) return NULL; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); req = nfs_folio_private_request(folio); if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return req; } @@ -680,17 +680,6 @@ 
static int nfs_writepage_locked(struct folio *folio, return err; } -int nfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - int ret; - - ret = nfs_writepage_locked(folio, wbc); - if (ret != AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); - return ret; -} - static int nfs_writepages_callback(struct folio *folio, struct writeback_control *wbc, void *data) { @@ -769,13 +758,13 @@ static void nfs_inode_add_request(struct nfs_page *req) * Swap-space should not get truncated. Hence no need to plug the race * with invalidate/truncate. */ - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(!folio_test_swapcache(folio))) { set_bit(PG_MAPPED, &req->wb_flags); folio_set_private(folio); folio->private = req; } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an * extra reference so sub groups can follow suit. @@ -796,13 +785,13 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct folio *folio = nfs_page_to_folio(req->wb_head); struct address_space *mapping = folio_file_mapping(folio); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(folio && !folio_test_swapcache(folio))) { folio->private = NULL; folio_clear_private(folio); clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 43b88eaf0673..272ab8d5c4d7 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -158,3 +158,19 @@ config NFSD_V4_SECURITY_LABEL If you do not wish to enable fine-grained security labels SELinux or Smack policies on NFSv4 files, say N. + +config NFSD_LEGACY_CLIENT_TRACKING + bool "Support legacy NFSv4 client tracking methods (DEPRECATED)" + depends on NFSD_V4 + default n + help + The NFSv4 server needs to store a small amount of information on + stable storage in order to handle state recovery after reboot. Most + modern deployments upcall to a userland daemon for this (nfsdcld), + but older NFS servers may store information directly in a + recoverydir, or spawn a process directly using a usermodehelper + upcall. + + These legacy client tracking methods have proven to be problematic + and will be removed in the future. Say Y here if you need support + for them in the interim.
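The fs/namespace.c hunks earlier in this diff introduce the statmount(2) and listmount(2) system calls. The following is a minimal, illustrative userspace sketch (not part of the patch itself): listmount() enumerates the 64-bit unique mount IDs reachable from the caller's root, and statmount() is then asked for the STATMOUNT_FS_TYPE and STATMOUNT_MNT_POINT strings of each mount. It assumes uapi headers from a kernel that carries this change (struct mnt_id_req, struct statmount, MNT_ID_REQ_SIZE_VER0, the STATMOUNT_* mask bits and LSMT_ROOT from <linux/mount.h>) and a <sys/syscall.h> that defines __NR_statmount and __NR_listmount; on older toolchains the syscall numbers would have to be supplied by hand.

/*
 * Hedged userspace sketch of the statmount(2)/listmount(2) interface
 * added above. Header and syscall-number availability are assumptions.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

static long sys_listmount(struct mnt_id_req *req, uint64_t *ids,
                          size_t nr_ids, unsigned int flags)
{
        return syscall(__NR_listmount, req, ids, nr_ids, flags);
}

static long sys_statmount(struct mnt_id_req *req, struct statmount *buf,
                          size_t bufsize, unsigned int flags)
{
        return syscall(__NR_statmount, req, buf, bufsize, flags);
}

int main(void)
{
        struct mnt_id_req req = {
                .size = MNT_ID_REQ_SIZE_VER0,
                .mnt_id = LSMT_ROOT,    /* mounts reachable from current root */
                .param = 0,             /* resume after this mount ID; 0 = start */
        };
        uint64_t ids[64];
        size_t bufsize = 4096;
        struct statmount *sm = malloc(bufsize);
        long n, i;

        if (!sm)
                return 1;

        n = sys_listmount(&req, ids, 64, 0);
        if (n < 0)
                return 1;

        for (i = 0; i < n; i++) {
                /* strings are appended after the fixed-size struct */
                const char *str = (const char *)sm + sizeof(*sm);

                memset(&req, 0, sizeof(req));
                req.size = MNT_ID_REQ_SIZE_VER0;
                req.mnt_id = ids[i];                    /* 64-bit unique mount ID */
                req.param = STATMOUNT_FS_TYPE |         /* for statmount, param is the mask */
                            STATMOUNT_MNT_POINT;
                if (sys_statmount(&req, sm, bufsize, 0) < 0)
                        continue;       /* e.g. EOVERFLOW if the buffer is too small */

                printf("%llu: type=%s mountpoint=%s\n",
                       (unsigned long long)ids[i],
                       (sm->mask & STATMOUNT_FS_TYPE) ? str + sm->fs_type : "?",
                       (sm->mask & STATMOUNT_MNT_POINT) ? str + sm->mnt_point : "?");
        }
        free(sm);
        return 0;
}

Note that the kernel side retries internally with a doubled seq buffer on -EAGAIN; from userspace the only sizing concern is the destination buffer, for which statmount() returns -EOVERFLOW when bufsize cannot hold the fixed struct plus the requested strings.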
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ef063f93fde9..9cb7f0c33df5 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -322,7 +322,7 @@ nfsd_file_check_writeback(struct nfsd_file *nf) static bool nfsd_file_lru_add(struct nfsd_file *nf) { set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) { + if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) { trace_nfsd_file_lru_add(nf); return true; } @@ -331,7 +331,7 @@ static bool nfsd_file_lru_add(struct nfsd_file *nf) static bool nfsd_file_lru_remove(struct nfsd_file *nf) { - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) { + if (list_lru_del_obj(&nfsd_file_lru, &nf->nf_lru)) { trace_nfsd_file_lru_del(nf); return true; } @@ -717,7 +717,7 @@ nfsd_file_cache_init(void) return ret; ret = -ENOMEM; - nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0); + nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", WQ_UNBOUND, 0); if (!nfsd_filecache_wq) goto out; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ab303a8b77d5..74b4360779a1 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -123,14 +123,9 @@ struct nfsd_net { u32 clientid_counter; u32 clverifier_counter; - struct svc_serv *nfsd_serv; - /* When a listening socket is added to nfsd, keep_active is set - * and this justifies a reference on nfsd_serv. This stops - * nfsd_serv from being freed. When the number of threads is - * set, keep_active is cleared and the reference is dropped. So - * when the last thread exits, the service will be destroyed. - */ - int keep_active; + struct svc_info nfsd_info; +#define nfsd_serv nfsd_info.serv + /* * clientid and stateid data for construction of net unique COPY diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 4039ffcf90ba..926c29879c6a 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -31,6 +31,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <linux/nfs4.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/svc_xprt.h> @@ -87,31 +88,6 @@ static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap, WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0); } -/* - * nfs_cb_opnum4 - * - * enum nfs_cb_opnum4 { - * OP_CB_GETATTR = 3, - * ... 
- * }; - */ -enum nfs_cb_opnum4 { - OP_CB_GETATTR = 3, - OP_CB_RECALL = 4, - OP_CB_LAYOUTRECALL = 5, - OP_CB_NOTIFY = 6, - OP_CB_PUSH_DELEG = 7, - OP_CB_RECALL_ANY = 8, - OP_CB_RECALLABLE_OBJ_AVAIL = 9, - OP_CB_RECALL_SLOT = 10, - OP_CB_SEQUENCE = 11, - OP_CB_WANTS_CANCELLED = 12, - OP_CB_NOTIFY_LOCK = 13, - OP_CB_NOTIFY_DEVICEID = 14, - OP_CB_OFFLOAD = 15, - OP_CB_ILLEGAL = 10044 -}; - static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) { __be32 *p; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6f2d4aa4970d..14712fa08f76 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -970,8 +970,11 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * To ensure proper ordering, we therefore turn off zero copy if * the client wants us to do more in this compound: */ - if (!nfsd4_last_compound_op(rqstp)) - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + if (!nfsd4_last_compound_op(rqstp)) { + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + argp->splice_ok = false; + } /* check stateid */ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 3509e73abe1f..2c060e0b1604 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -66,6 +66,7 @@ struct nfsd4_client_tracking_ops { static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops; static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2; +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; @@ -720,6 +721,7 @@ static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { .version = 1, .msglen = 0, }; +#endif /* CONFIG_NFSD_LEGACY_CLIENT_TRACKING */ /* Globals */ #define NFSD_PIPE_DIR "nfsd" @@ -731,8 +733,10 @@ struct cld_net { spinlock_t cn_lock; struct list_head cn_list; unsigned int cn_xid; - bool cn_has_legacy; struct crypto_shash *cn_tfm; +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING + bool cn_has_legacy; +#endif }; struct cld_upcall { @@ -793,7 +797,6 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, uint8_t cmd, princhashlen; struct xdr_netobj name, princhash = { .len = 0, .data = NULL }; uint16_t namelen; - struct cld_net *cn = nn->cld_net; if (get_user(cmd, &cmsg->cm_cmd)) { dprintk("%s: error when copying cmd from userspace", __func__); @@ -833,11 +836,15 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, return PTR_ERR(name.data); name.len = namelen; } +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) { + struct cld_net *cn = nn->cld_net; + name.len = name.len - 5; memmove(name.data, name.data + 5, name.len); cn->cn_has_legacy = true; } +#endif if (!nfs4_client_to_reclaim(name, princhash, nn)) { kfree(name.data); kfree(princhash.data); @@ -1010,7 +1017,9 @@ __nfsd4_init_cld_pipe(struct net *net) } cn->cn_pipe->dentry = dentry; +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING cn->cn_has_legacy = false; +#endif nn->cld_net = cn; return 0; @@ -1282,10 +1291,6 @@ nfsd4_cld_check(struct nfs4_client *clp) { struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - struct cld_net *cn = nn->cld_net; - int status; - char dname[HEXDIR_LEN]; - struct xdr_netobj name; /* did we already find that this client is stable? 
*/ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) @@ -1296,7 +1301,12 @@ nfsd4_cld_check(struct nfs4_client *clp) if (crp) goto found; - if (cn->cn_has_legacy) { +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING + if (nn->cld_net->cn_has_legacy) { + int status; + char dname[HEXDIR_LEN]; + struct xdr_netobj name; + status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return -ENOENT; @@ -1314,6 +1324,7 @@ nfsd4_cld_check(struct nfs4_client *clp) goto found; } +#endif return -ENOENT; found: crp->cr_clp = clp; @@ -1327,8 +1338,6 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; int status; - char dname[HEXDIR_LEN]; - struct xdr_netobj name; struct crypto_shash *tfm = cn->cn_tfm; struct xdr_netobj cksum; char *principal = NULL; @@ -1342,7 +1351,11 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) if (crp) goto found; +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING if (cn->cn_has_legacy) { + struct xdr_netobj name; + char dname[HEXDIR_LEN]; + status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return -ENOENT; @@ -1360,6 +1373,7 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) goto found; } +#endif return -ENOENT; found: if (crp->cr_princhash.len) { @@ -1663,6 +1677,7 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = { .msglen = sizeof(struct cld_msg_v2), }; +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING /* upcall via usermodehelper */ static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack"; module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog), @@ -2007,28 +2022,10 @@ static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { .msglen = 0, }; -int -nfsd4_client_tracking_init(struct net *net) +static inline int check_for_legacy_methods(int status, struct net *net) { - int status; - struct path path; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - /* just run the init if it the method is already decided */ - if (nn->client_tracking_ops) - goto do_init; - - /* First, try to use nfsdcld */ - nn->client_tracking_ops = &nfsd4_cld_tracking_ops; - status = nn->client_tracking_ops->init(net); - if (!status) - return status; - if (status != -ETIMEDOUT) { - nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0; - status = nn->client_tracking_ops->init(net); - if (!status) - return status; - } + struct path path; /* * Next, try the UMH upcall. 
@@ -2045,14 +2042,46 @@ nfsd4_client_tracking_init(struct net *net) nn->client_tracking_ops = &nfsd4_legacy_tracking_ops; status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); if (!status) { - status = d_is_dir(path.dentry); + status = !d_is_dir(path.dentry); path_put(&path); - if (!status) { - status = -EINVAL; - goto out; - } + if (status) + return -ENOTDIR; + status = nn->client_tracking_ops->init(net); + } + return status; +} +#else +static inline int check_for_legacy_methods(int status, struct net *net) +{ + return status; +} +#endif /* CONFIG_NFSD_LEGACY_CLIENT_TRACKING */ + +int +nfsd4_client_tracking_init(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int status; + + /* just run the init if the method is already decided */ + if (nn->client_tracking_ops) + goto do_init; + + /* First, try to use nfsdcld */ + nn->client_tracking_ops = &nfsd4_cld_tracking_ops; + status = nn->client_tracking_ops->init(net); + if (!status) + return status; + if (status != -ETIMEDOUT) { + nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0; + status = nn->client_tracking_ops->init(net); + if (!status) + return status; } + status = check_for_legacy_methods(status, net); + if (status) + goto out; do_init: status = nn->client_tracking_ops->init(net); out: diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3edbfa0233e6..2fa54cfd4882 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6575,7 +6575,7 @@ unlock: spin_unlock(&nn->s2s_cp_lock); if (!state) return nfserr_bad_stateid; - if (!clp && state) + if (!clp) *cps = state; return 0; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b499fe9caa32..c719c475a068 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2524,8 +2524,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) svc_reserve(argp->rqstp, max_reply + readbytes); argp->rqstp->rq_cachetype = cachethis ?
RC_REPLBUFF : RC_NOCACHE; + argp->splice_ok = nfsd_read_splice_ok(argp->rqstp); if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) - clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); + argp->splice_ok = false; return true; } @@ -4375,12 +4376,13 @@ static __be32 nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { + struct nfsd4_compoundargs *argp = resp->rqstp->rq_argp; struct nfsd4_read *read = &u->read; - bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); - unsigned long maxcount; struct xdr_stream *xdr = resp->xdr; - struct file *file; int starting_len = xdr->buf->len; + bool splice_ok = argp->splice_ok; + unsigned long maxcount; + struct file *file; __be32 *p; if (nfserr) @@ -5201,9 +5203,10 @@ static __be32 nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *read) { - bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags); + struct nfsd4_compoundargs *argp = resp->rqstp->rq_argp; struct file *file = read->rd_nf->nf_file; struct xdr_stream *xdr = resp->xdr; + bool splice_ok = argp->splice_ok; unsigned long maxcount; __be32 nfserr, *p; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index d3273a396659..5c1a4a0aa605 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -364,8 +364,6 @@ nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) if (freed > sc->nr_to_scan) break; } - - trace_nfsd_drc_gc(nn, freed); return freed; } @@ -508,7 +506,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, __wsum csum; struct nfsd_drc_bucket *b; int type = rqstp->rq_cachetype; - unsigned long freed; LIST_HEAD(dispose); int rtn = RC_DOIT; @@ -538,8 +535,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, nfsd_prune_bucket_locked(nn, b, 3, &dispose); spin_unlock(&b->cache_lock); - freed = nfsd_cacherep_dispose(&dispose); - trace_nfsd_drc_gc(nn, freed); + nfsd_cacherep_dispose(&dispose); nfsd_stats_rc_misses_inc(); atomic_inc(&nn->num_drc_entries); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 87fed75808ff..f206ca32e7f5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -48,10 +48,6 @@ enum { NFSD_MaxBlkSize, NFSD_MaxConnections, NFSD_Filecache, - /* - * The below MUST come last. 
Otherwise we leave a hole in nfsd_files[] - * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops - */ #ifdef CONFIG_NFSD_V4 NFSD_Leasetime, NFSD_Gracetime, @@ -76,7 +72,9 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_V4 static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_gracetime(struct file *file, char *buf, size_t size); +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); +#endif static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size); #endif @@ -93,7 +91,9 @@ static ssize_t (*const write_op[])(struct file *, char *, size_t) = { #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = write_leasetime, [NFSD_Gracetime] = write_gracetime, +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING [NFSD_RecoveryDir] = write_recoverydir, +#endif [NFSD_V4EndGrace] = write_v4_end_grace, #endif }; @@ -179,7 +179,7 @@ static const struct file_operations pool_stats_operations = { .open = nfsd_pool_stats_open, .read = seq_read, .llseek = seq_lseek, - .release = nfsd_pool_stats_release, + .release = seq_release, }; DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats); @@ -707,12 +707,9 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred serv = nn->nfsd_serv; err = svc_addsock(serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); - if (err < 0 && !serv->sv_nrthreads && !nn->keep_active) - nfsd_last_thread(net); - else if (err >= 0 && !serv->sv_nrthreads && !xchg(&nn->keep_active, 1)) - svc_get(serv); + if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) + nfsd_destroy_serv(net); - svc_put(serv); return err; } @@ -750,10 +747,6 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr if (err < 0 && err != -EAFNOSUPPORT) goto out_close; - if (!serv->sv_nrthreads && !xchg(&nn->keep_active, 1)) - svc_get(serv); - - svc_put(serv); return 0; out_close: xprt = svc_find_xprt(serv, transport, net, PF_INET, port); @@ -762,10 +755,9 @@ out_close: svc_xprt_put(xprt); } out_err: - if (!serv->sv_nrthreads && !nn->keep_active) - nfsd_last_thread(net); + if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) + nfsd_destroy_serv(net); - svc_put(serv); return err; } @@ -1021,6 +1013,7 @@ static ssize_t write_gracetime(struct file *file, char *buf, size_t size) return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn); } +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, struct nfsd_net *nn) { @@ -1081,6 +1074,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) mutex_unlock(&nfsd_mutex); return rv; } +#endif /* * write_v4_end_grace - release grace period for nfsd's v4.x lock manager @@ -1244,63 +1238,34 @@ static inline void _nfsd_symlink(struct dentry *parent, const char *name, #endif -static void clear_ncl(struct inode *inode) +static void clear_ncl(struct dentry *dentry) { + struct inode *inode = d_inode(dentry); struct nfsdfs_client *ncl = inode->i_private; + spin_lock(&inode->i_lock); inode->i_private = NULL; + spin_unlock(&inode->i_lock); kref_put(&ncl->cl_ref, ncl->cl_release); } -static struct nfsdfs_client *__get_nfsdfs_client(struct inode *inode) -{ - struct nfsdfs_client *nc = inode->i_private; - - if (nc) - kref_get(&nc->cl_ref); - return nc; -} - struct nfsdfs_client *get_nfsdfs_client(struct inode *inode) { struct nfsdfs_client *nc; - inode_lock_shared(inode); - nc = 
__get_nfsdfs_client(inode); - inode_unlock_shared(inode); + spin_lock(&inode->i_lock); + nc = inode->i_private; + if (nc) + kref_get(&nc->cl_ref); + spin_unlock(&inode->i_lock); return nc; } -/* from __rpc_unlink */ -static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry) -{ - int ret; - - clear_ncl(d_inode(dentry)); - dget(dentry); - ret = simple_unlink(dir, dentry); - d_drop(dentry); - fsnotify_unlink(dir, dentry); - dput(dentry); - WARN_ON_ONCE(ret); -} - -static void nfsdfs_remove_files(struct dentry *root) -{ - struct dentry *dentry, *tmp; - - list_for_each_entry_safe(dentry, tmp, &root->d_subdirs, d_child) { - if (!simple_positive(dentry)) { - WARN_ON_ONCE(1); /* I think this can't happen? */ - continue; - } - nfsdfs_remove_file(d_inode(root), dentry); - } -} /* XXX: cut'n'paste from simple_fill_super; figure out if we could share * code instead. */ static int nfsdfs_create_files(struct dentry *root, const struct tree_descr *files, + struct nfsdfs_client *ncl, struct dentry **fdentries) { struct inode *dir = d_inode(root); @@ -1319,8 +1284,9 @@ static int nfsdfs_create_files(struct dentry *root, dput(dentry); goto out; } + kref_get(&ncl->cl_ref); inode->i_fop = files->ops; - inode->i_private = __get_nfsdfs_client(dir); + inode->i_private = ncl; d_add(dentry, inode); fsnotify_create(dir, dentry); if (fdentries) @@ -1329,7 +1295,6 @@ static int nfsdfs_create_files(struct dentry *root, inode_unlock(dir); return 0; out: - nfsdfs_remove_files(root); inode_unlock(dir); return -ENOMEM; } @@ -1349,7 +1314,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name); if (IS_ERR(dentry)) /* XXX: tossing errors? */ return NULL; - ret = nfsdfs_create_files(dentry, files, fdentries); + ret = nfsdfs_create_files(dentry, files, ncl, fdentries); if (ret) { nfsd_client_rmdir(dentry); return NULL; @@ -1360,20 +1325,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, /* Taken from __rpc_rmdir: */ void nfsd_client_rmdir(struct dentry *dentry) { - struct inode *dir = d_inode(dentry->d_parent); - struct inode *inode = d_inode(dentry); - int ret; - - inode_lock(dir); - nfsdfs_remove_files(dentry); - clear_ncl(inode); - dget(dentry); - ret = simple_rmdir(dir, dentry); - WARN_ON_ONCE(ret); - d_drop(dentry); - fsnotify_rmdir(dir, dentry); - dput(dentry); - inode_unlock(dir); + simple_recursive_removal(dentry, clear_ncl); } static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 9ed0e08d16c2..304e9728b929 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -148,7 +148,7 @@ int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change); int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change); void nfsd_reset_versions(struct nfsd_net *nn); int nfsd_create_serv(struct net *net); -void nfsd_last_thread(struct net *net); +void nfsd_destroy_serv(struct net *net); extern int nfsd_max_blksize; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 7a2bc8e82a63..a667802e08e7 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -59,15 +59,6 @@ static __be32 nfsd_init_request(struct svc_rqst *, * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks. * - * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a - * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless - * nn->keep_active is set). 
That number of nfsd threads must - * exist and each must be listed in ->sp_all_threads in some entry of - * ->sv_pools[]. - * - * Each active thread holds a counted reference on nn->nfsd_serv, as does - * the nn->keep_active flag and various transient calls to svc_get(). - * * Finally, the nfsd_mutex also protects some of the global variables that are * accessed when nfsd starts and that are settable via the write_* routines in * nfsctl.c. In particular: @@ -359,13 +350,12 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn) */ void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) { - int seq = 0; + unsigned int seq; do { - read_seqbegin_or_lock(&nn->writeverf_lock, &seq); + seq = read_seqbegin(&nn->writeverf_lock); memcpy(verf, nn->writeverf, sizeof(nn->writeverf)); - } while (need_seqretry(&nn->writeverf_lock, seq)); - done_seqretry(&nn->writeverf_lock, seq); + } while (read_seqretry(&nn->writeverf_lock, seq)); } static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn) @@ -542,7 +532,11 @@ static struct notifier_block nfsd_inet6addr_notifier = { /* Only used under nfsd_mutex, so this atomic may be overkill: */ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); -void nfsd_last_thread(struct net *net) +/** + * nfsd_destroy_serv - tear down NFSD's svc_serv for a namespace + * @net: network namespace the NFS service is associated with + */ +void nfsd_destroy_serv(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv = nn->nfsd_serv; @@ -564,7 +558,7 @@ void nfsd_last_thread(struct net *net) /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are - * started, then nfsd_last_thread will be run before any of this + * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. 
*/ svc_rpcb_cleanup(serv, net); @@ -573,6 +567,7 @@ void nfsd_last_thread(struct net *net) nfsd_shutdown_net(net); nfsd_export_flush(net); + svc_destroy(&serv); } void nfsd_reset_versions(struct nfsd_net *nn) @@ -647,11 +642,9 @@ void nfsd_shutdown_threads(struct net *net) return; } - svc_get(serv); /* Kill outstanding nfsd threads */ svc_set_num_threads(serv, NULL, 0); - nfsd_last_thread(net); - svc_put(serv); + nfsd_destroy_serv(net); mutex_unlock(&nfsd_mutex); } @@ -667,10 +660,9 @@ int nfsd_create_serv(struct net *net) struct svc_serv *serv; WARN_ON(!mutex_is_locked(&nfsd_mutex)); - if (nn->nfsd_serv) { - svc_get(nn->nfsd_serv); + if (nn->nfsd_serv) return 0; - } + if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); @@ -681,10 +673,11 @@ int nfsd_create_serv(struct net *net) serv->sv_maxconn = nn->max_connections; error = svc_bind(serv, net); if (error < 0) { - svc_put(serv); + svc_destroy(&serv); return error; } spin_lock(&nfsd_notifier_lock); + nn->nfsd_info.mutex = &nfsd_mutex; nn->nfsd_serv = serv; spin_unlock(&nfsd_notifier_lock); @@ -764,7 +757,6 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) nthreads[0] = 1; /* apply the new numbers */ - svc_get(nn->nfsd_serv); for (i = 0; i < n; i++) { err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], @@ -772,7 +764,6 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) if (err) break; } - svc_put(nn->nfsd_serv); return err; } @@ -814,13 +805,8 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) goto out_put; error = serv->sv_nrthreads; out_put: - /* Threads now hold service active */ - if (xchg(&nn->keep_active, 0)) - svc_put(serv); - if (serv->sv_nrthreads == 0) - nfsd_last_thread(net); - svc_put(serv); + nfsd_destroy_serv(net); out: mutex_unlock(&nfsd_mutex); return error; @@ -1083,28 +1069,7 @@ bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr) int nfsd_pool_stats_open(struct inode *inode, struct file *file) { - int ret; struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id); - mutex_lock(&nfsd_mutex); - if (nn->nfsd_serv == NULL) { - mutex_unlock(&nfsd_mutex); - return -ENODEV; - } - svc_get(nn->nfsd_serv); - ret = svc_pool_stats_open(nn->nfsd_serv, file); - mutex_unlock(&nfsd_mutex); - return ret; -} - -int nfsd_pool_stats_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct svc_serv *serv = seq->private; - int ret = seq_release(inode, file); - - mutex_lock(&nfsd_mutex); - svc_put(serv); - mutex_unlock(&nfsd_mutex); - return ret; + return svc_pool_stats_open(&nn->nfsd_info, file); } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index fbc0ccb40424..d1e8cf079b0f 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1262,28 +1262,6 @@ TRACE_EVENT(nfsd_drc_mismatch, __entry->ingress) ); -TRACE_EVENT_CONDITION(nfsd_drc_gc, - TP_PROTO( - const struct nfsd_net *nn, - unsigned long freed - ), - TP_ARGS(nn, freed), - TP_CONDITION(freed > 0), - TP_STRUCT__entry( - __field(unsigned long long, boot_time) - __field(unsigned long, freed) - __field(int, total) - ), - TP_fast_assign( - __entry->boot_time = nn->boot_time; - __entry->freed = freed; - __entry->total = atomic_read(&nn->num_drc_entries); - ), - TP_printk("boot_time=%16llx total=%d freed=%lu", - __entry->boot_time, __entry->total, __entry->freed - ) -); - TRACE_EVENT(nfsd_cb_args, TP_PROTO( const struct nfs4_client *clp, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 
e01e4e2acbd9..b7c7a9273ea0 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1039,7 +1039,10 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); - host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); + host_err = rw_verify_area(READ, file, &offset, *count); + if (!host_err) + host_err = splice_direct_to_actor(file, &sd, + nfsd_direct_splice_actor); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1176,9 +1179,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); - file_start_write(file); host_err = vfs_iter_write(file, &iter, &pos, flags); - file_end_write(file); if (host_err < 0) { commit_reset_write_verifier(nn, rqstp, host_err); goto out_nfserr; @@ -1210,6 +1211,30 @@ out_nfserr: } /** + * nfsd_read_splice_ok - check if spliced reading is supported + * @rqstp: RPC transaction context + * + * Return values: + * %true: nfsd_splice_read() may be used + * %false: nfsd_splice_read() must not be used + * + * NFS READ normally uses splice to send data in-place. However the + * data in cache can change after the reply's MIC is computed but + * before the RPC reply is sent. To prevent the client from + * rejecting the server-computed MIC in this somewhat rare case, do + * not use splice with the GSS integrity and privacy services. + */ +bool nfsd_read_splice_ok(struct svc_rqst *rqstp) +{ + switch (svc_auth_flavor(rqstp)) { + case RPC_AUTH_GSS_KRB5I: + case RPC_AUTH_GSS_KRB5P: + return false; + } + return true; +} + +/** * nfsd_read - Read data from a file * @rqstp: RPC transaction context * @fhp: file handle of file to be read @@ -1238,7 +1263,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, return err; file = nf->nf_file; - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) + if (file->f_op->splice_read && nfsd_read_splice_ok(rqstp)) err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); else err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof); @@ -1806,6 +1831,10 @@ retry: } trap = lock_rename(tdentry, fdentry); + if (IS_ERR(trap)) { + err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev; + goto out; + } err = fh_fill_pre_attrs(ffhp); if (err != nfs_ok) goto out_unlock; @@ -2102,9 +2131,23 @@ static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp, return cdp->err; } -/* - * Read entries from a directory. - * The NFSv3/4 verifier we ignore for now. +/** + * nfsd_readdir - Read entries from a directory + * @rqstp: RPC transaction context + * @fhp: NFS file handle of directory to be read + * @offsetp: OUT: seek offset of final entry that was read + * @cdp: OUT: an eof error value + * @func: entry filler actor + * + * This implementation ignores the NFSv3/4 verifier cookie. + * + * NB: normal system calls hold file->f_pos_lock when calling + * ->iterate_shared and ->llseek, but nfsd_readdir() does not. + * Because the struct file acquired here is not visible to other + * threads, its internal state does not need mutex protection. + * + * Returns nfs_ok on success, otherwise an nfsstat code is + * returned.
*/ __be32 nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index e3c29596f4df..702fbc4483bf 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -114,6 +114,7 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count, unsigned int base, u32 *eof); +bool nfsd_read_splice_ok(struct svc_rqst *rqstp); __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long *count, u32 *eof); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 80e859dc84d8..415516c1b27e 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -840,6 +840,7 @@ struct nfsd4_compoundargs { u32 minorversion; u32 client_opcnt; u32 opcnt; + bool splice_ok; struct nfsd4_op *ops; struct nfsd4_op iops[8]; }; diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 5710833ac1cc..0131d83b912d 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -64,8 +64,8 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) set_buffer_mapped(bh); set_buffer_uptodate(bh); - unlock_page(bh->b_page); - put_page(bh->b_page); + folio_unlock(bh->b_folio); + folio_put(bh->b_folio); return bh; } @@ -75,7 +75,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, { struct buffer_head *bh; struct inode *inode = btnc->host; - struct page *page; + struct folio *folio; int err; bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); @@ -83,7 +83,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, return -ENOMEM; err = -EEXIST; /* internal code */ - page = bh->b_page; + folio = bh->b_folio; if (buffer_uptodate(bh) || buffer_dirty(bh)) goto found; @@ -130,8 +130,8 @@ found: *pbh = bh; out_locked: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } @@ -145,19 +145,19 @@ out_locked: void nilfs_btnode_delete(struct buffer_head *bh) { struct address_space *mapping; - struct page *page = bh->b_page; - pgoff_t index = page_index(page); + struct folio *folio = bh->b_folio; + pgoff_t index = folio->index; int still_dirty; - get_page(page); - lock_page(page); - wait_on_page_writeback(page); + folio_get(folio); + folio_lock(folio); + folio_wait_writeback(folio); nilfs_forget_buffer(bh); - still_dirty = PageDirty(page); - mapping = page->mapping; - unlock_page(page); - put_page(page); + still_dirty = folio_test_dirty(folio); + mapping = folio->mapping; + folio_unlock(folio); + folio_put(folio); if (!still_dirty && mapping) invalidate_inode_pages2_range(mapping, index, index); @@ -185,23 +185,23 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, ctxt->newbh = NULL; if (inode->i_blkbits == PAGE_SHIFT) { - struct page *opage = obh->b_page; - lock_page(opage); + struct folio *ofolio = obh->b_folio; + folio_lock(ofolio); retry: /* BUG_ON(oldkey != obh->b_folio->index); */ - if (unlikely(oldkey != opage->index)) - NILFS_PAGE_BUG(opage, + if (unlikely(oldkey != ofolio->index)) + NILFS_FOLIO_BUG(ofolio, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); xa_lock_irq(&btnc->i_pages); - err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS); + err = __xa_insert(&btnc->i_pages, newkey, ofolio, GFP_NOFS); xa_unlock_irq(&btnc->i_pages); /* - * Note: page->index will not change to newkey until + * Note: folio->index will not change to newkey until * nilfs_btnode_commit_change_key() will be called. 
- * To protect the page in intermediate state, the page lock + * To protect the folio in intermediate state, the folio lock * is held. */ if (!err) @@ -213,7 +213,7 @@ retry: if (!err) goto retry; /* fallback to copy mode */ - unlock_page(opage); + folio_unlock(ofolio); } nbh = nilfs_btnode_create_block(btnc, newkey); @@ -225,7 +225,7 @@ retry: return 0; failed_unlock: - unlock_page(obh->b_page); + folio_unlock(obh->b_folio); return err; } @@ -238,15 +238,15 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, { struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh; __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; - struct page *opage; + struct folio *ofolio; if (oldkey == newkey) return; if (nbh == NULL) { /* blocksize == pagesize */ - opage = obh->b_page; - if (unlikely(oldkey != opage->index)) - NILFS_PAGE_BUG(opage, + ofolio = obh->b_folio; + if (unlikely(oldkey != ofolio->index)) + NILFS_FOLIO_BUG(ofolio, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); @@ -257,8 +257,8 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, __xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY); xa_unlock_irq(&btnc->i_pages); - opage->index = obh->b_blocknr = newkey; - unlock_page(opage); + ofolio->index = obh->b_blocknr = newkey; + folio_unlock(ofolio); } else { nilfs_copy_buffer(nbh, obh); mark_buffer_dirty(nbh); @@ -284,7 +284,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, if (nbh == NULL) { /* blocksize == pagesize */ xa_erase_irq(&btnc->i_pages, newkey); - unlock_page(ctxt->bh->b_page); + folio_unlock(ctxt->bh->b_folio); } else { /* * When canceling a buffer that a prepare operation has diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 9ebefb3acb0e..39136637f715 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -552,11 +552,29 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, } /** - * nilfs_cpfile_get_cpinfo - - * @cpfile: - * @cno: - * @ci: - * @nci: + * nilfs_cpfile_get_cpinfo - get information on checkpoints + * @cpfile: checkpoint file inode + * @cnop: place to pass a starting checkpoint number and receive a + * checkpoint number to continue the search + * @mode: mode of checkpoints that the caller wants to retrieve + * @buf: buffer for storing checkpoints' information + * @cisz: byte size of one checkpoint info item in array + * @nci: number of checkpoint info items to retrieve + * + * nilfs_cpfile_get_cpinfo() searches for checkpoints in @mode state + * starting from the checkpoint number stored in @cnop, and stores + * information about found checkpoints in @buf. + * The buffer pointed to by @buf must be large enough to store information + * for @nci checkpoints. If at least one checkpoint information is + * successfully retrieved, @cnop is updated to point to the checkpoint + * number to continue searching. + * + * Return: Count of checkpoint info items stored in the output buffer on + * success, or the following negative error code on failure. + * * %-EINVAL - Invalid checkpoint mode. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). + * * %-ENOENT - Invalid checkpoint number specified. 
*/ ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index de2073c47651..bc846b904b68 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -64,12 +64,6 @@ static inline unsigned int nilfs_chunk_size(struct inode *inode) return inode->i_sb->s_blocksize; } -static inline void nilfs_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. @@ -84,48 +78,46 @@ static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static int nilfs_prepare_chunk(struct page *page, unsigned int from, +static int nilfs_prepare_chunk(struct folio *folio, unsigned int from, unsigned int to) { - loff_t pos = page_offset(page) + from; + loff_t pos = folio_pos(folio) + from; - return __block_write_begin(page, pos, to - from, nilfs_get_block); + return __block_write_begin(&folio->page, pos, to - from, nilfs_get_block); } -static void nilfs_commit_chunk(struct page *page, - struct address_space *mapping, - unsigned int from, unsigned int to) +static void nilfs_commit_chunk(struct folio *folio, + struct address_space *mapping, size_t from, size_t to) { struct inode *dir = mapping->host; - loff_t pos = page_offset(page) + from; - unsigned int len = to - from; - unsigned int nr_dirty, copied; + loff_t pos = folio_pos(folio) + from; + size_t copied, len = to - from; + unsigned int nr_dirty; int err; - nr_dirty = nilfs_page_count_clean_buffers(page, from, to); - copied = block_write_end(NULL, mapping, pos, len, len, page, NULL); + nr_dirty = nilfs_page_count_clean_buffers(&folio->page, from, to); + copied = block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); if (IS_DIRSYNC(dir)) nilfs_set_transaction_flag(NILFS_TI_SYNC); err = nilfs_set_file_dirty(dir, nr_dirty); WARN_ON(err); /* do not happen */ - unlock_page(page); + folio_unlock(folio); } -static bool nilfs_check_page(struct page *page) +static bool nilfs_check_folio(struct folio *folio, char *kaddr) { - struct inode *dir = page->mapping->host; + struct inode *dir = folio->mapping->host; struct super_block *sb = dir->i_sb; unsigned int chunk_size = nilfs_chunk_size(dir); - char *kaddr = page_address(page); - unsigned int offs, rec_len; - unsigned int limit = PAGE_SIZE; + size_t offs, rec_len; + size_t limit = folio_size(folio); struct nilfs_dir_entry *p; char *error; - if ((dir->i_size >> PAGE_SHIFT) == page->index) { - limit = dir->i_size & ~PAGE_MASK; + if (dir->i_size < folio_pos(folio) + limit) { + limit = dir->i_size - folio_pos(folio); if (limit & (chunk_size - 1)) goto Ebadsize; if (!limit) @@ -147,7 +139,7 @@ static bool nilfs_check_page(struct page *page) if (offs != limit) goto Eend; out: - SetPageChecked(page); + folio_set_checked(folio); return true; /* Too bad, we had an error */ @@ -170,8 +162,8 @@ Espan: error = "directory entry across blocks"; bad_entry: nilfs_error(sb, - "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index << PAGE_SHIFT) + offs, + "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d", + dir->i_ino, error, (folio->index << PAGE_SHIFT) + offs, (unsigned long)le64_to_cpu(p->inode), rec_len, p->name_len); goto fail; @@ -179,29 +171,34 @@ Eend: p = (struct nilfs_dir_entry *)(kaddr + offs); nilfs_error(sb, "entry in directory #%lu spans the page 
boundary offset=%lu, inode=%lu", - dir->i_ino, (page->index << PAGE_SHIFT) + offs, + dir->i_ino, (folio->index << PAGE_SHIFT) + offs, (unsigned long)le64_to_cpu(p->inode)); fail: - SetPageError(page); + folio_set_error(folio); return false; } -static struct page *nilfs_get_page(struct inode *dir, unsigned long n) +static void *nilfs_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) { struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); + struct folio *folio = read_mapping_folio(mapping, n, NULL); + void *kaddr; - if (!IS_ERR(page)) { - kmap(page); - if (unlikely(!PageChecked(page))) { - if (!nilfs_check_page(page)) - goto fail; - } + if (IS_ERR(folio)) + return folio; + + kaddr = kmap_local_folio(folio, 0); + if (unlikely(!folio_test_checked(folio))) { + if (!nilfs_check_folio(folio, kaddr)) + goto fail; } - return page; + + *foliop = folio; + return kaddr; fail: - nilfs_put_page(page); + folio_release_kmap(folio, kaddr); return ERR_PTR(-EIO); } @@ -275,21 +272,21 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; struct nilfs_dir_entry *de; - struct page *page = nilfs_get_page(inode, n); + struct folio *folio; - if (IS_ERR(page)) { + kaddr = nilfs_get_folio(inode, n, &folio); + if (IS_ERR(kaddr)) { nilfs_error(sb, "bad page in #%lu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; return -EIO; } - kaddr = page_address(page); de = (struct nilfs_dir_entry *)(kaddr + offset); limit = kaddr + nilfs_last_byte(inode, n) - NILFS_DIR_REC_LEN(1); for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { if (de->rec_len == 0) { nilfs_error(sb, "zero-length directory entry"); - nilfs_put_page(page); + folio_release_kmap(folio, kaddr); return -EIO; } if (de->inode) { @@ -302,72 +299,67 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode), t)) { - nilfs_put_page(page); + folio_release_kmap(folio, kaddr); return 0; } } ctx->pos += nilfs_rec_len_from_disk(de->rec_len); } - nilfs_put_page(page); + folio_release_kmap(folio, kaddr); } return 0; } /* - * nilfs_find_entry() + * nilfs_find_entry() + * + * Finds an entry in the specified directory with the wanted name. It + * returns the folio in which the entry was found, and the entry itself. + * The folio is mapped and unlocked. When the caller is finished with + * the entry, it should call folio_release_kmap(). * - * finds an entry in the specified directory with the wanted name. It - * returns the page in which the entry was found, and the entry itself - * (as a parameter - res_dir). Page is returned mapped and unlocked. - * Entry is guaranteed to be valid. + * On failure, returns NULL and the caller should ignore foliop. 
*/ -struct nilfs_dir_entry * -nilfs_find_entry(struct inode *dir, const struct qstr *qstr, - struct page **res_page) +struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, + const struct qstr *qstr, struct folio **foliop) { const unsigned char *name = qstr->name; int namelen = qstr->len; unsigned int reclen = NILFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); - struct page *page = NULL; struct nilfs_inode_info *ei = NILFS_I(dir); struct nilfs_dir_entry *de; if (npages == 0) goto out; - /* OFFSET_CACHE */ - *res_page = NULL; - start = ei->i_dir_start_lookup; if (start >= npages) start = 0; n = start; do { - char *kaddr; + char *kaddr = nilfs_get_folio(dir, n, foliop); - page = nilfs_get_page(dir, n); - if (!IS_ERR(page)) { - kaddr = page_address(page); + if (!IS_ERR(kaddr)) { de = (struct nilfs_dir_entry *)kaddr; kaddr += nilfs_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { if (de->rec_len == 0) { nilfs_error(dir->i_sb, "zero-length directory entry"); - nilfs_put_page(page); + folio_release_kmap(*foliop, kaddr); goto out; } if (nilfs_match(namelen, name, de)) goto found; de = nilfs_next_entry(de); } - nilfs_put_page(page); + folio_release_kmap(*foliop, kaddr); } if (++n >= npages) n = 0; - /* next page is past the blocks we've got */ + /* next folio is past the blocks we've got */ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) { nilfs_error(dir->i_sb, "dir %lu size %lld exceeds block count %llu", @@ -380,55 +372,47 @@ out: return NULL; found: - *res_page = page; ei->i_dir_start_lookup = n; return de; } -struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p) +struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct folio **foliop) { - struct page *page = nilfs_get_page(dir, 0); - struct nilfs_dir_entry *de = NULL; + struct nilfs_dir_entry *de = nilfs_get_folio(dir, 0, foliop); - if (!IS_ERR(page)) { - de = nilfs_next_entry( - (struct nilfs_dir_entry *)page_address(page)); - *p = page; - } - return de; + if (IS_ERR(de)) + return NULL; + return nilfs_next_entry(de); } ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; struct nilfs_dir_entry *de; - struct page *page; + struct folio *folio; - de = nilfs_find_entry(dir, qstr, &page); + de = nilfs_find_entry(dir, qstr, &folio); if (de) { res = le64_to_cpu(de->inode); - kunmap(page); - put_page(page); + folio_release_kmap(folio, de); } return res; } -/* Releases the page */ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, - struct page *page, struct inode *inode) + struct folio *folio, struct inode *inode) { - unsigned int from = (char *)de - (char *)page_address(page); - unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len); - struct address_space *mapping = page->mapping; + size_t from = offset_in_folio(folio, de); + size_t to = from + nilfs_rec_len_from_disk(de->rec_len); + struct address_space *mapping = folio->mapping; int err; - lock_page(page); - err = nilfs_prepare_chunk(page, from, to); + folio_lock(folio); + err = nilfs_prepare_chunk(folio, from, to); BUG_ON(err); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); - nilfs_commit_chunk(page, mapping, from, to); - nilfs_put_page(page); + nilfs_commit_chunk(folio, mapping, from, to); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } @@ -443,31 +427,28 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) unsigned int chunk_size = nilfs_chunk_size(dir); unsigned int reclen = NILFS_DIR_REC_LEN(namelen); 
unsigned short rec_len, name_len; - struct page *page = NULL; + struct folio *folio = NULL; struct nilfs_dir_entry *de; unsigned long npages = dir_pages(dir); unsigned long n; - char *kaddr; - unsigned int from, to; + size_t from, to; int err; /* * We take care of directory expansion in the same loop. - * This code plays outside i_size, so it locks the page + * This code plays outside i_size, so it locks the folio * to protect that region. */ for (n = 0; n <= npages; n++) { + char *kaddr = nilfs_get_folio(dir, n, &folio); char *dir_end; - page = nilfs_get_page(dir, n); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto out; - lock_page(page); - kaddr = page_address(page); + if (IS_ERR(kaddr)) + return PTR_ERR(kaddr); + folio_lock(folio); dir_end = kaddr + nilfs_last_byte(dir, n); de = (struct nilfs_dir_entry *)kaddr; - kaddr += PAGE_SIZE - reclen; + kaddr += folio_size(folio) - reclen; while ((char *)de <= kaddr) { if ((char *)de == dir_end) { /* We hit i_size */ @@ -494,16 +475,16 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) goto got_it; de = (struct nilfs_dir_entry *)((char *)de + rec_len); } - unlock_page(page); - nilfs_put_page(page); + folio_unlock(folio); + folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; got_it: - from = (char *)de - (char *)page_address(page); + from = offset_in_folio(folio, de); to = from + rec_len; - err = nilfs_prepare_chunk(page, from, to); + err = nilfs_prepare_chunk(folio, from, to); if (err) goto out_unlock; if (de->inode) { @@ -518,29 +499,28 @@ got_it: memcpy(de->name, name, namelen); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); - nilfs_commit_chunk(page, page->mapping, from, to); + nilfs_commit_chunk(folio, folio->mapping, from, to); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); nilfs_mark_inode_dirty(dir); /* OFFSET_CACHE */ out_put: - nilfs_put_page(page); -out: + folio_release_kmap(folio, de); return err; out_unlock: - unlock_page(page); + folio_unlock(folio); goto out_put; } /* * nilfs_delete_entry deletes a directory entry by merging it with the - * previous entry. Page is up-to-date. Releases the page. + * previous entry. Folio is up-to-date. 
*/ -int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) +int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; - char *kaddr = page_address(page); - unsigned int from, to; + char *kaddr = (char *)((unsigned long)dir & ~(folio_size(folio) - 1)); + size_t from, to; struct nilfs_dir_entry *de, *pde = NULL; int err; @@ -559,17 +539,16 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) de = nilfs_next_entry(de); } if (pde) - from = (char *)pde - (char *)page_address(page); - lock_page(page); - err = nilfs_prepare_chunk(page, from, to); + from = (char *)pde - kaddr; + folio_lock(folio); + err = nilfs_prepare_chunk(folio, from, to); BUG_ON(err); if (pde) pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; - nilfs_commit_chunk(page, mapping, from, to); + nilfs_commit_chunk(folio, mapping, from, to); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); out: - nilfs_put_page(page); return err; } @@ -579,21 +558,21 @@ out: int nilfs_make_empty(struct inode *inode, struct inode *parent) { struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); + struct folio *folio = filemap_grab_folio(mapping, 0); unsigned int chunk_size = nilfs_chunk_size(inode); struct nilfs_dir_entry *de; int err; void *kaddr; - if (!page) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); - err = nilfs_prepare_chunk(page, 0, chunk_size); + err = nilfs_prepare_chunk(folio, 0, chunk_size); if (unlikely(err)) { - unlock_page(page); + folio_unlock(folio); goto fail; } - kaddr = kmap_atomic(page); + kaddr = kmap_local_folio(folio, 0); memset(kaddr, 0, chunk_size); de = (struct nilfs_dir_entry *)kaddr; de->name_len = 1; @@ -608,10 +587,10 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) de->inode = cpu_to_le64(parent->i_ino); memcpy(de->name, "..\0", 4); nilfs_set_de_type(de, inode); - kunmap_atomic(kaddr); - nilfs_commit_chunk(page, mapping, 0, chunk_size); + kunmap_local(kaddr); + nilfs_commit_chunk(folio, mapping, 0, chunk_size); fail: - put_page(page); + folio_put(folio); return err; } @@ -620,18 +599,17 @@ fail: */ int nilfs_empty_dir(struct inode *inode) { - struct page *page = NULL; + struct folio *folio = NULL; + char *kaddr; unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { - char *kaddr; struct nilfs_dir_entry *de; - page = nilfs_get_page(inode, i); - if (IS_ERR(page)) + kaddr = nilfs_get_folio(inode, i, &folio); + if (IS_ERR(kaddr)) continue; - kaddr = page_address(page); de = (struct nilfs_dir_entry *)kaddr; kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); @@ -657,12 +635,12 @@ int nilfs_empty_dir(struct inode *inode) } de = nilfs_next_entry(de); } - nilfs_put_page(page); + folio_release_kmap(folio, kaddr); } return 1; not_empty: - nilfs_put_page(page); + folio_release_kmap(folio, kaddr); return 0; } diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 740ce26d1e76..bec33b89a075 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -45,34 +45,36 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vma->vm_file); struct nilfs_transaction_info ti; + struct 
buffer_head *bh, *head; int ret = 0; if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) return VM_FAULT_SIGBUS; /* -ENOSPC */ sb_start_pagefault(inode->i_sb); - lock_page(page); - if (page->mapping != inode->i_mapping || - page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != inode->i_mapping || + folio_pos(folio) >= i_size_read(inode) || + !folio_test_uptodate(folio)) { + folio_unlock(folio); ret = -EFAULT; /* make the VM retry the fault */ goto out; } /* - * check to see if the page is mapped already (no holes) + * check to see if the folio is mapped already (no holes) */ - if (PageMappedToDisk(page)) + if (folio_test_mappedtodisk(folio)) goto mapped; - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; + head = folio_buffers(folio); + if (head) { int fully_mapped = 1; - bh = head = page_buffers(page); + bh = head; do { if (!buffer_mapped(bh)) { fully_mapped = 0; @@ -81,11 +83,11 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) } while (bh = bh->b_this_page, bh != head); if (fully_mapped) { - SetPageMappedToDisk(page); + folio_set_mappedtodisk(folio); goto mapped; } } - unlock_page(page); + folio_unlock(folio); /* * fill hole blocks @@ -105,7 +107,7 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) nilfs_transaction_commit(inode->i_sb); mapped: - wait_for_stable_page(page); + folio_wait_stable(folio); out: sb_end_pagefault(inode->i_sb); return vmf_fs_error(ret); diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 8beb2730929d..bf9a11d58817 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -98,8 +98,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, *out_bh = bh; failed: - unlock_page(bh->b_page); - put_page(bh->b_page); + folio_unlock(bh->b_folio); + folio_put(bh->b_folio); if (unlikely(err)) brelse(bh); return err; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index f861f3a0bf5c..9c334c722fc1 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -175,7 +175,8 @@ static int nilfs_writepages(struct address_space *mapping, static int nilfs_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; int err; if (sb_rdonly(inode->i_sb)) { @@ -185,13 +186,13 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) * have dirty pages that try to be flushed in background. * So, here we simply discard this dirty page. 
*/ - nilfs_clear_dirty_page(page, false); - unlock_page(page); + nilfs_clear_folio_dirty(folio, false); + folio_unlock(folio); return -EROFS; } - redirty_page_for_writepage(wbc, page); - unlock_page(page); + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); if (wbc->sync_mode == WB_SYNC_ALL) { err = nilfs_construct_segment(inode->i_sb); @@ -214,7 +215,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping, /* * The page may not be locked, eg if called from try_to_unmap_one() */ - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; @@ -230,7 +231,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping, } else if (ret) { nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (nr_dirty) nilfs_set_file_dirty(inode, nr_dirty); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 40ffade49f38..cfb6aca5ec38 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -872,16 +872,14 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, nsegs = argv[4].v_nmembs; if (argv[4].v_size != argsz[4]) goto out; - if (nsegs > UINT_MAX / sizeof(__u64)) - goto out; /* * argv[4] points to segment numbers this ioctl cleans. We - * use kmalloc() for its buffer because memory used for the - * segment numbers is enough small. + * use kmalloc() for its buffer because the memory used for the + * segment numbers is small enough. */ - kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base, - nsegs * sizeof(__u64)); + kbufs[4] = memdup_array_user((void __user *)(unsigned long)argv[4].v_base, + nsegs, sizeof(__u64)); if (IS_ERR(kbufs[4])) { ret = PTR_ERR(kbufs[4]); goto out; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index c97c77a39668..e45c01a559c0 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -97,8 +97,8 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, } failed_bh: - unlock_page(bh->b_page); - put_page(bh->b_page); + folio_unlock(bh->b_folio); + folio_put(bh->b_folio); brelse(bh); failed_unlock: @@ -158,8 +158,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, blk_opf_t opf, *out_bh = bh; failed_bh: - unlock_page(bh->b_page); - put_page(bh->b_page); + folio_unlock(bh->b_folio); + folio_put(bh->b_folio); brelse(bh); failed: return ret; @@ -399,7 +399,8 @@ int nilfs_mdt_fetch_dirty(struct inode *inode) static int nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; struct super_block *sb; int err = 0; @@ -407,16 +408,16 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) /* * It means that filesystem was remounted in read-only * mode because of error or metadata corruption. But we - * have dirty pages that try to be flushed in background. - * So, here we simply discard this dirty page. + * have dirty folios that try to be flushed in background. + * So, here we simply discard this dirty folio. 
*/ - nilfs_clear_dirty_page(page, false); - unlock_page(page); + nilfs_clear_folio_dirty(folio, false); + folio_unlock(folio); return -EROFS; } - redirty_page_for_writepage(wbc, page); - unlock_page(page); + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); if (!inode) return 0; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 2a4e7f4a8102..c950139db6ef 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -260,11 +260,11 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode; struct nilfs_dir_entry *de; - struct page *page; + struct folio *folio; int err; err = -ENOENT; - de = nilfs_find_entry(dir, &dentry->d_name, &page); + de = nilfs_find_entry(dir, &dentry->d_name, &folio); if (!de) goto out; @@ -279,7 +279,8 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } - err = nilfs_delete_entry(de, page); + err = nilfs_delete_entry(de, folio); + folio_release_kmap(folio, de); if (err) goto out; @@ -347,9 +348,9 @@ static int nilfs_rename(struct mnt_idmap *idmap, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct page *dir_page = NULL; + struct folio *dir_folio = NULL; struct nilfs_dir_entry *dir_de = NULL; - struct page *old_page; + struct folio *old_folio; struct nilfs_dir_entry *old_de; struct nilfs_transaction_info ti; int err; @@ -362,19 +363,19 @@ static int nilfs_rename(struct mnt_idmap *idmap, return err; err = -ENOENT; - old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page); + old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_folio); if (!old_de) goto out; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = nilfs_dotdot(old_inode, &dir_page); + dir_de = nilfs_dotdot(old_inode, &dir_folio); if (!dir_de) goto out_old; } if (new_inode) { - struct page *new_page; + struct folio *new_folio; struct nilfs_dir_entry *new_de; err = -ENOTEMPTY; @@ -382,10 +383,11 @@ static int nilfs_rename(struct mnt_idmap *idmap, goto out_dir; err = -ENOENT; - new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); + new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_folio); if (!new_de) goto out_dir; - nilfs_set_link(new_dir, new_de, new_page, old_inode); + nilfs_set_link(new_dir, new_de, new_folio, old_inode); + folio_release_kmap(new_folio, new_de); nilfs_mark_inode_dirty(new_dir); inode_set_ctime_current(new_inode); if (dir_de) @@ -408,12 +410,15 @@ static int nilfs_rename(struct mnt_idmap *idmap, */ inode_set_ctime_current(old_inode); - nilfs_delete_entry(old_de, old_page); + nilfs_delete_entry(old_de, old_folio); if (dir_de) { - nilfs_set_link(old_inode, dir_de, dir_page, new_dir); + nilfs_set_link(old_inode, dir_de, dir_folio, new_dir); + folio_release_kmap(dir_folio, dir_de); drop_nlink(old_dir); } + folio_release_kmap(old_folio, old_de); + nilfs_mark_inode_dirty(old_dir); nilfs_mark_inode_dirty(old_inode); @@ -421,13 +426,10 @@ static int nilfs_rename(struct mnt_idmap *idmap, return err; out_dir: - if (dir_de) { - kunmap(dir_page); - put_page(dir_page); - } + if (dir_de) + folio_release_kmap(dir_folio, dir_de); out_old: - kunmap(old_page); - put_page(old_page); + folio_release_kmap(old_folio, old_de); out: nilfs_transaction_abort(old_dir->i_sb); return err; @@ -439,7 +441,6 @@ out: static struct dentry *nilfs_get_parent(struct dentry *child) { unsigned long ino; - struct inode *inode; struct nilfs_root *root; ino = nilfs_inode_by_name(d_inode(child), 
&dotdot_name); @@ -448,11 +449,7 @@ static struct dentry *nilfs_get_parent(struct dentry *child) root = NILFS_I(d_inode(child))->i_root; - inode = nilfs_iget(child->d_sb, root, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - return d_obtain_alias(inode); + return d_obtain_alias(nilfs_iget(child->d_sb, root, ino)); } static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno, diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 8046490cd7fe..98cffaf0ac12 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -226,16 +226,16 @@ static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags) } /* dir.c */ -extern int nilfs_add_link(struct dentry *, struct inode *); -extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); -extern int nilfs_make_empty(struct inode *, struct inode *); -extern struct nilfs_dir_entry * -nilfs_find_entry(struct inode *, const struct qstr *, struct page **); -extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); -extern int nilfs_empty_dir(struct inode *); -extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); -extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, - struct page *, struct inode *); +int nilfs_add_link(struct dentry *, struct inode *); +ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); +int nilfs_make_empty(struct inode *, struct inode *); +struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *, + struct folio **); +int nilfs_delete_entry(struct nilfs_dir_entry *, struct folio *); +int nilfs_empty_dir(struct inode *); +struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct folio **); +void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, + struct folio *, struct inode *); /* file.c */ extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 06b04758f289..5c2eba1987bd 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -73,7 +73,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, */ void nilfs_forget_buffer(struct buffer_head *bh) { - struct page *page = bh->b_page; + struct folio *folio = bh->b_folio; const unsigned long clear_bits = (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | @@ -81,12 +81,12 @@ void nilfs_forget_buffer(struct buffer_head *bh) lock_buffer(bh); set_mask_bits(&bh->b_state, clear_bits, 0); - if (nilfs_page_buffers_clean(page)) - __nilfs_clear_page_dirty(page); + if (nilfs_folio_buffers_clean(folio)) + __nilfs_clear_folio_dirty(folio); bh->b_blocknr = -1; - ClearPageUptodate(page); - ClearPageMappedToDisk(page); + folio_clear_uptodate(folio); + folio_clear_mappedtodisk(folio); unlock_buffer(bh); brelse(bh); } @@ -131,48 +131,49 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) } /** - * nilfs_page_buffers_clean - check if a page has dirty buffers or not. - * @page: page to be checked + * nilfs_folio_buffers_clean - Check if a folio has dirty buffers or not. + * @folio: Folio to be checked. * - * nilfs_page_buffers_clean() returns zero if the page has dirty buffers. - * Otherwise, it returns non-zero value. + * nilfs_folio_buffers_clean() returns false if the folio has dirty buffers. + * Otherwise, it returns true. 
*/ -int nilfs_page_buffers_clean(struct page *page) +bool nilfs_folio_buffers_clean(struct folio *folio) { struct buffer_head *bh, *head; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { if (buffer_dirty(bh)) - return 0; + return false; bh = bh->b_this_page; } while (bh != head); - return 1; + return true; } -void nilfs_page_bug(struct page *page) +void nilfs_folio_bug(struct folio *folio) { + struct buffer_head *bh, *head; struct address_space *m; unsigned long ino; - if (unlikely(!page)) { - printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n"); + if (unlikely(!folio)) { + printk(KERN_CRIT "NILFS_FOLIO_BUG(NULL)\n"); return; } - m = page->mapping; + m = folio->mapping; ino = m ? m->host->i_ino : 0; - printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " + printk(KERN_CRIT "NILFS_FOLIO_BUG(%p): cnt=%d index#=%llu flags=0x%lx " "mapping=%p ino=%lu\n", - page, page_ref_count(page), - (unsigned long long)page->index, page->flags, m, ino); + folio, folio_ref_count(folio), + (unsigned long long)folio->index, folio->flags, m, ino); - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; + head = folio_buffers(folio); + if (head) { int i = 0; - bh = head = page_buffers(page); + bh = head; do { printk(KERN_CRIT " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n", @@ -258,7 +259,7 @@ repeat: folio_lock(folio); if (unlikely(!folio_test_dirty(folio))) - NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state"); + NILFS_FOLIO_BUG(folio, "inconsistent dirty state"); dfolio = filemap_grab_folio(dmap, folio->index); if (unlikely(IS_ERR(dfolio))) { @@ -268,7 +269,7 @@ repeat: break; } if (unlikely(!folio_buffers(folio))) - NILFS_PAGE_BUG(&folio->page, + NILFS_FOLIO_BUG(folio, "found empty page in dat page cache"); nilfs_copy_folio(dfolio, folio, true); @@ -379,7 +380,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) * was acquired. Skip processing in that case. 
*/ if (likely(folio->mapping == mapping)) - nilfs_clear_dirty_page(&folio->page, silent); + nilfs_clear_folio_dirty(folio, silent); folio_unlock(folio); } @@ -389,32 +390,33 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) } /** - * nilfs_clear_dirty_page - discard dirty page - * @page: dirty page that will be discarded + * nilfs_clear_folio_dirty - discard dirty folio + * @folio: dirty folio that will be discarded * @silent: suppress [true] or print [false] warning messages */ -void nilfs_clear_dirty_page(struct page *page, bool silent) +void nilfs_clear_folio_dirty(struct folio *folio, bool silent) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; + struct buffer_head *bh, *head; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); if (!silent) nilfs_warn(sb, "discard dirty page: offset=%lld, ino=%lu", - page_offset(page), inode->i_ino); + folio_pos(folio), inode->i_ino); - ClearPageUptodate(page); - ClearPageMappedToDisk(page); + folio_clear_uptodate(folio); + folio_clear_mappedtodisk(folio); - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; + head = folio_buffers(folio); + if (head) { const unsigned long clear_bits = (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected)); - bh = head = page_buffers(page); + bh = head; do { lock_buffer(bh); if (!silent) @@ -427,7 +429,7 @@ void nilfs_clear_dirty_page(struct page *page, bool silent) } while (bh = bh->b_this_page, bh != head); } - __nilfs_clear_page_dirty(page); + __nilfs_clear_folio_dirty(folio); } unsigned int nilfs_page_count_clean_buffers(struct page *page, @@ -457,22 +459,23 @@ unsigned int nilfs_page_count_clean_buffers(struct page *page, * 2) Some B-tree operations like insertion or deletion may dispose buffers * in dirty state, and this needs to cancel the dirty state of their pages. 
*/ -int __nilfs_clear_page_dirty(struct page *page) +void __nilfs_clear_folio_dirty(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; if (mapping) { xa_lock_irq(&mapping->i_pages); - if (test_bit(PG_dirty, &page->flags)) { - __xa_clear_mark(&mapping->i_pages, page_index(page), + if (folio_test_dirty(folio)) { + __xa_clear_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_DIRTY); xa_unlock_irq(&mapping->i_pages); - return clear_page_dirty_for_io(page); + folio_clear_dirty_for_io(folio); + return; } xa_unlock_irq(&mapping->i_pages); - return 0; + return; } - return TestClearPageDirty(page); + folio_clear_dirty(folio); } /** diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index d249ea1cefff..7e1a2c455a10 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -30,18 +30,18 @@ BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */ -int __nilfs_clear_page_dirty(struct page *); +void __nilfs_clear_folio_dirty(struct folio *); struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, unsigned long, unsigned long); void nilfs_forget_buffer(struct buffer_head *); void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); -int nilfs_page_buffers_clean(struct page *); -void nilfs_page_bug(struct page *); +bool nilfs_folio_buffers_clean(struct folio *); +void nilfs_folio_bug(struct folio *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); -void nilfs_clear_dirty_page(struct page *, bool); +void nilfs_clear_folio_dirty(struct folio *, bool); void nilfs_clear_dirty_pages(struct address_space *, bool); unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int, unsigned int); @@ -49,7 +49,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff); -#define NILFS_PAGE_BUG(page, m, a...) \ - do { nilfs_page_bug(page); BUG(); } while (0) +#define NILFS_FOLIO_BUG(folio, m, a...) \ + do { nilfs_folio_bug(folio); BUG(); } while (0) #endif /* _NILFS_PAGE_H */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 55e31cc903d1..2590a0860eab 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1665,39 +1665,39 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) return 0; } -static void nilfs_begin_page_io(struct page *page) +static void nilfs_begin_folio_io(struct folio *folio) { - if (!page || PageWriteback(page)) + if (!folio || folio_test_writeback(folio)) /* * For split b-tree node pages, this function may be called * twice. We ignore the 2nd or later calls by this check. 
*/ return; - lock_page(page); - clear_page_dirty_for_io(page); - set_page_writeback(page); - unlock_page(page); + folio_lock(folio); + folio_clear_dirty_for_io(folio); + folio_start_writeback(folio); + folio_unlock(folio); } static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf; - struct page *bd_page = NULL, *fs_page = NULL; + struct folio *bd_folio = NULL, *fs_folio = NULL; list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { struct buffer_head *bh; list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { - if (bh->b_page != bd_page) { - if (bd_page) { - lock_page(bd_page); - clear_page_dirty_for_io(bd_page); - set_page_writeback(bd_page); - unlock_page(bd_page); + if (bh->b_folio != bd_folio) { + if (bd_folio) { + folio_lock(bd_folio); + folio_clear_dirty_for_io(bd_folio); + folio_start_writeback(bd_folio); + folio_unlock(bd_folio); } - bd_page = bh->b_page; + bd_folio = bh->b_folio; } } @@ -1705,28 +1705,28 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) b_assoc_buffers) { set_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { - if (bh->b_page != bd_page) { - lock_page(bd_page); - clear_page_dirty_for_io(bd_page); - set_page_writeback(bd_page); - unlock_page(bd_page); - bd_page = bh->b_page; + if (bh->b_folio != bd_folio) { + folio_lock(bd_folio); + folio_clear_dirty_for_io(bd_folio); + folio_start_writeback(bd_folio); + folio_unlock(bd_folio); + bd_folio = bh->b_folio; } break; } - if (bh->b_page != fs_page) { - nilfs_begin_page_io(fs_page); - fs_page = bh->b_page; + if (bh->b_folio != fs_folio) { + nilfs_begin_folio_io(fs_folio); + fs_folio = bh->b_folio; } } } - if (bd_page) { - lock_page(bd_page); - clear_page_dirty_for_io(bd_page); - set_page_writeback(bd_page); - unlock_page(bd_page); + if (bd_folio) { + folio_lock(bd_folio); + folio_clear_dirty_for_io(bd_folio); + folio_start_writeback(bd_folio); + folio_unlock(bd_folio); } - nilfs_begin_page_io(fs_page); + nilfs_begin_folio_io(fs_folio); } static int nilfs_segctor_write(struct nilfs_sc_info *sci, @@ -1739,17 +1739,18 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci, return ret; } -static void nilfs_end_page_io(struct page *page, int err) +static void nilfs_end_folio_io(struct folio *folio, int err) { - if (!page) + if (!folio) return; - if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) { + if (buffer_nilfs_node(folio_buffers(folio)) && + !folio_test_writeback(folio)) { /* * For b-tree node pages, this function may be called twice * or more because they might be split in a segment. */ - if (PageDirty(page)) { + if (folio_test_dirty(folio)) { /* * For pages holding split b-tree node buffers, dirty * flag on the buffers may be cleared discretely. @@ -1757,30 +1758,30 @@ static void nilfs_end_page_io(struct page *page, int err) * remaining buffers, and it must be cancelled if * all the buffers get cleaned later. 
*/ - lock_page(page); - if (nilfs_page_buffers_clean(page)) - __nilfs_clear_page_dirty(page); - unlock_page(page); + folio_lock(folio); + if (nilfs_folio_buffers_clean(folio)) + __nilfs_clear_folio_dirty(folio); + folio_unlock(folio); } return; } if (!err) { - if (!nilfs_page_buffers_clean(page)) - __set_page_dirty_nobuffers(page); - ClearPageError(page); + if (!nilfs_folio_buffers_clean(folio)) + filemap_dirty_folio(folio->mapping, folio); + folio_clear_error(folio); } else { - __set_page_dirty_nobuffers(page); - SetPageError(page); + filemap_dirty_folio(folio->mapping, folio); + folio_set_error(folio); } - end_page_writeback(page); + folio_end_writeback(folio); } static void nilfs_abort_logs(struct list_head *logs, int err) { struct nilfs_segment_buffer *segbuf; - struct page *bd_page = NULL, *fs_page = NULL; + struct folio *bd_folio = NULL, *fs_folio = NULL; struct buffer_head *bh; if (list_empty(logs)) @@ -1790,10 +1791,10 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { clear_buffer_uptodate(bh); - if (bh->b_page != bd_page) { - if (bd_page) - end_page_writeback(bd_page); - bd_page = bh->b_page; + if (bh->b_folio != bd_folio) { + if (bd_folio) + folio_end_writeback(bd_folio); + bd_folio = bh->b_folio; } } @@ -1802,22 +1803,22 @@ static void nilfs_abort_logs(struct list_head *logs, int err) clear_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { clear_buffer_uptodate(bh); - if (bh->b_page != bd_page) { - end_page_writeback(bd_page); - bd_page = bh->b_page; + if (bh->b_folio != bd_folio) { + folio_end_writeback(bd_folio); + bd_folio = bh->b_folio; } break; } - if (bh->b_page != fs_page) { - nilfs_end_page_io(fs_page, err); - fs_page = bh->b_page; + if (bh->b_folio != fs_folio) { + nilfs_end_folio_io(fs_folio, err); + fs_folio = bh->b_folio; } } } - if (bd_page) - end_page_writeback(bd_page); + if (bd_folio) + folio_end_writeback(bd_folio); - nilfs_end_page_io(fs_page, err); + nilfs_end_folio_io(fs_folio, err); } static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, @@ -1859,7 +1860,7 @@ static void nilfs_set_next_segment(struct the_nilfs *nilfs, static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf; - struct page *bd_page = NULL, *fs_page = NULL; + struct folio *bd_folio = NULL, *fs_folio = NULL; struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int update_sr = false; @@ -1870,21 +1871,21 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) b_assoc_buffers) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); - if (bh->b_page != bd_page) { - if (bd_page) - end_page_writeback(bd_page); - bd_page = bh->b_page; + if (bh->b_folio != bd_folio) { + if (bd_folio) + folio_end_writeback(bd_folio); + bd_folio = bh->b_folio; } } /* - * We assume that the buffers which belong to the same page + * We assume that the buffers which belong to the same folio * continue over the buffer list. - * Under this assumption, the last BHs of pages is - * identifiable by the discontinuity of bh->b_page - * (page != fs_page). + * Under this assumption, the last BHs of folios is + * identifiable by the discontinuity of bh->b_folio + * (folio != fs_folio). * * For B-tree node blocks, however, this assumption is not - * guaranteed. The cleanup code of B-tree node pages needs + * guaranteed. The cleanup code of B-tree node folios needs * special care. 
*/ list_for_each_entry(bh, &segbuf->sb_payload_buffers, @@ -1897,16 +1898,16 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh == segbuf->sb_super_root) { - if (bh->b_page != bd_page) { - end_page_writeback(bd_page); - bd_page = bh->b_page; + if (bh->b_folio != bd_folio) { + folio_end_writeback(bd_folio); + bd_folio = bh->b_folio; } update_sr = true; break; } - if (bh->b_page != fs_page) { - nilfs_end_page_io(fs_page, 0); - fs_page = bh->b_page; + if (bh->b_folio != fs_folio) { + nilfs_end_folio_io(fs_folio, 0); + fs_folio = bh->b_folio; } } @@ -1920,13 +1921,13 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) } } /* - * Since pages may continue over multiple segment buffers, - * end of the last page must be checked outside of the loop. + * Since folios may continue over multiple segment buffers, + * end of the last folio must be checked outside of the loop. */ - if (bd_page) - end_page_writeback(bd_page); + if (bd_folio) + folio_end_writeback(bd_folio); - nilfs_end_page_io(fs_page, 0); + nilfs_end_folio_io(fs_folio, 0); nilfs_drop_collected_inodes(&sci->sc_dirty_files); @@ -2587,6 +2588,7 @@ static int nilfs_segctor_thread(void *arg) "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds", sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); + set_freezable(); spin_lock(&sci->sc_state_lock); loop: for (;;) { diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 58ca7c936393..0a8119456c21 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -471,10 +471,15 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, kunmap_atomic(kaddr); return; } - WARN_ON(nilfs_segment_usage_error(su)); - WARN_ON(!nilfs_segment_usage_dirty(su)); + if (unlikely(nilfs_segment_usage_error(su))) + nilfs_warn(sufile->i_sb, "free segment %llu marked in error", + (unsigned long long)segnum); sudirty = nilfs_segment_usage_dirty(su); + if (unlikely(!sudirty)) + nilfs_warn(sufile->i_sb, "free unallocated segment %llu", + (unsigned long long)segnum); + nilfs_segment_usage_set_clean(su); kunmap_atomic(kaddr); mark_buffer_dirty(su_bh); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index a5d1fa4e7552..df8674173b22 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1314,15 +1314,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, return ERR_CAST(s); if (!s->s_root) { - /* - * We drop s_umount here because we need to open the bdev and - * bdev->open_mutex ranks above s_umount (blkdev_put() -> - * __invalidate_device()). It is safe because we have active sb - * reference and SB_BORN is not set yet. - */ - up_write(&s->s_umount); err = setup_bdev_super(s, flags, NULL); - down_write(&s->s_umount); if (!err) err = nilfs_fill_super(s, data, flags & SB_SILENT ? 
1 : 0); diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 1cb9ad7e884e..3464fa7e8538 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -29,7 +29,6 @@ static struct ctl_table dnotify_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - {} }; static void __init dnotify_sysctl_init(void) { diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 9dac7f6e72d2..1e4def21811e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -29,12 +29,6 @@ static unsigned int fanotify_hash_path(const struct path *path) hash_ptr(path->mnt, FANOTIFY_EVENT_HASH_BITS); } -static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, - __kernel_fsid_t *fsid2) -{ - return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; -} - static unsigned int fanotify_hash_fsid(__kernel_fsid_t *fsid) { return hash_32(fsid->val[0], FANOTIFY_EVENT_HASH_BITS) ^ @@ -838,9 +832,8 @@ out: } /* - * Get cached fsid of the filesystem containing the object from any connector. - * All connectors are supposed to have the same fsid, but we do not verify that - * here. + * Get cached fsid of the filesystem containing the object from any mark. + * All marks are supposed to have the same fsid, but we do not verify that here. */ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) { @@ -849,18 +842,11 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) __kernel_fsid_t fsid = {}; fsnotify_foreach_iter_mark_type(iter_info, mark, type) { - struct fsnotify_mark_connector *conn; - - conn = READ_ONCE(mark->connector); - /* Mark is just getting destroyed or created? */ - if (!conn) + if (!(mark->flags & FSNOTIFY_MARK_FLAG_HAS_FSID)) continue; - if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) - continue; - /* Pairs with smp_wmb() in fsnotify_add_mark_list() */ - smp_rmb(); - fsid = conn->fsid; - if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) + fsid = FANOTIFY_MARK(mark)->fsid; + if (!(mark->flags & FSNOTIFY_MARK_FLAG_WEAK_FSID) && + WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) continue; return fsid; } @@ -942,12 +928,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, return 0; } - if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) { + if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) fsid = fanotify_get_fsid(iter_info); - /* Racing with mark destruction or creation? 
*/ - if (!fsid.val[0] && !fsid.val[1]) - return 0; - } event = fanotify_alloc_event(group, mask, data, data_type, dir, file_name, &fsid, match_mask); @@ -1068,7 +1050,7 @@ static void fanotify_freeing_mark(struct fsnotify_mark *mark, static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) { - kmem_cache_free(fanotify_mark_cache, fsn_mark); + kmem_cache_free(fanotify_mark_cache, FANOTIFY_MARK(fsn_mark)); } const struct fsnotify_ops fanotify_fsnotify_ops = { diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 6936671e148d..e5ab33cae6a7 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -489,6 +489,22 @@ static inline unsigned int fanotify_event_hash_bucket( return event->hash & FANOTIFY_HTABLE_MASK; } +struct fanotify_mark { + struct fsnotify_mark fsn_mark; + __kernel_fsid_t fsid; +}; + +static inline struct fanotify_mark *FANOTIFY_MARK(struct fsnotify_mark *mark) +{ + return container_of(mark, struct fanotify_mark, fsn_mark); +} + +static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, + __kernel_fsid_t *fsid2) +{ + return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; +} + static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) { unsigned int mflags = 0; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4d765c72496f..fbdc63cc10d9 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -23,7 +23,7 @@ #include <asm/ioctls.h> -#include "../../mount.h" +#include "../fsnotify.h" #include "../fdinfo.h" #include "fanotify.h" @@ -86,7 +86,6 @@ static struct ctl_table fanotify_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO }, - { } }; static void __init fanotify_sysctls_init(void) @@ -1192,13 +1191,71 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, return recalc; } +struct fan_fsid { + struct super_block *sb; + __kernel_fsid_t id; + bool weak; +}; + +static int fanotify_set_mark_fsid(struct fsnotify_group *group, + struct fsnotify_mark *mark, + struct fan_fsid *fsid) +{ + struct fsnotify_mark_connector *conn; + struct fsnotify_mark *old; + struct super_block *old_sb = NULL; + + FANOTIFY_MARK(mark)->fsid = fsid->id; + mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID; + if (fsid->weak) + mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID; + + /* First mark added will determine if group is single or multi fsid */ + if (list_empty(&group->marks_list)) + return 0; + + /* Find sb of an existing mark */ + list_for_each_entry(old, &group->marks_list, g_list) { + conn = READ_ONCE(old->connector); + if (!conn) + continue; + old_sb = fsnotify_connector_sb(conn); + if (old_sb) + break; + } + + /* Only detached marks left? 
*/ + if (!old_sb) + return 0; + + /* Do not allow mixing of marks with weak and strong fsid */ + if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID) + return -EXDEV; + + /* Allow mixing of marks with strong fsid from different fs */ + if (!fsid->weak) + return 0; + + /* Do not allow mixing marks with weak fsid from different fs */ + if (old_sb != fsid->sb) + return -EXDEV; + + /* Do not allow mixing marks from different btrfs sub-volumes */ + if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid, + &FANOTIFY_MARK(mark)->fsid)) + return -EXDEV; + + return 0; +} + static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, unsigned int fan_flags, - __kernel_fsid_t *fsid) + struct fan_fsid *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; + struct fanotify_mark *fan_mark; struct fsnotify_mark *mark; int ret; @@ -1211,24 +1268,34 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) return ERR_PTR(-ENOSPC); - mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!mark) { + fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!fan_mark) { ret = -ENOMEM; goto out_dec_ucounts; } + mark = &fan_mark->fsn_mark; fsnotify_init_mark(mark, group); if (fan_flags & FAN_MARK_EVICTABLE) mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF; - ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid); - if (ret) { - fsnotify_put_mark(mark); - goto out_dec_ucounts; + /* Cache fsid of filesystem containing the marked object */ + if (fsid) { + ret = fanotify_set_mark_fsid(group, mark, fsid); + if (ret) + goto out_put_mark; + } else { + fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0; } + ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0); + if (ret) + goto out_put_mark; + return mark; +out_put_mark: + fsnotify_put_mark(mark); out_dec_ucounts: if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); @@ -1279,7 +1346,7 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, __u32 mask, unsigned int fan_flags, - __kernel_fsid_t *fsid) + struct fan_fsid *fsid) { struct fsnotify_mark *fsn_mark; bool recalc; @@ -1327,7 +1394,7 @@ out: static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) + unsigned int flags, struct fan_fsid *fsid) { return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid); @@ -1335,7 +1402,7 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, static int fanotify_add_sb_mark(struct fsnotify_group *group, struct super_block *sb, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) + unsigned int flags, struct fan_fsid *fsid) { return fanotify_add_mark(group, &sb->s_fsnotify_marks, FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid); @@ -1343,7 +1410,7 @@ static int fanotify_add_sb_mark(struct fsnotify_group *group, static int fanotify_add_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, - unsigned int flags, __kernel_fsid_t *fsid) + unsigned int flags, struct fan_fsid *fsid) { pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); @@ -1554,20 +1621,25 @@ out_destroy_group: return fd; } -static int fanotify_test_fsid(struct dentry *dentry, 
__kernel_fsid_t *fsid) +static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags, + struct fan_fsid *fsid) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; __kernel_fsid_t root_fsid; int err; /* * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse). */ - err = vfs_get_fsid(dentry, fsid); + err = vfs_get_fsid(dentry, &fsid->id); if (err) return err; - if (!fsid->val[0] && !fsid->val[1]) - return -ENODEV; + fsid->sb = dentry->d_sb; + if (!fsid->id.val[0] && !fsid->id.val[1]) { + err = -ENODEV; + goto weak; + } /* * Make sure dentry is not of a filesystem subvolume (e.g. btrfs) @@ -1577,11 +1649,18 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) if (err) return err; - if (root_fsid.val[0] != fsid->val[0] || - root_fsid.val[1] != fsid->val[1]) - return -EXDEV; + if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) { + err = -EXDEV; + goto weak; + } + fsid->weak = false; return 0; + +weak: + /* Allow weak fsid when marking inodes */ + fsid->weak = true; + return (mark_type == FAN_MARK_INODE) ? 0 : err; } /* Check if filesystem can encode a unique fid */ @@ -1665,7 +1744,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, struct fsnotify_group *group; struct fd f; struct path path; - __kernel_fsid_t __fsid, *fsid = NULL; + struct fan_fsid __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; @@ -1817,7 +1896,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } if (fid_mode) { - ret = fanotify_test_fsid(path.dentry, &__fsid); + ret = fanotify_test_fsid(path.dentry, flags, &__fsid); if (ret) goto path_put_and_out; @@ -1935,7 +2014,7 @@ static int __init fanotify_user_setup(void) BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); - fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, + fanotify_mark_cache = KMEM_CACHE(fanotify_mark, SLAB_PANIC|SLAB_ACCOUNT); fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event, SLAB_PANIC); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 7974e91ffe13..8bfd690e9f10 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -124,7 +124,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); - list_for_each_entry(child, &alias->d_subdirs, d_child) { + hlist_for_each_entry(child, &alias->d_children, d_sib) { if (!child->d_inode) continue; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index a3809ae92170..85d8fdd55329 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -85,7 +85,6 @@ static struct ctl_table inotify_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO }, - { } }; static void __init inotify_sysctls_init(void) diff --git a/fs/notify/mark.c b/fs/notify/mark.c index c74ef947447d..d6944ff86ffa 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -537,8 +537,7 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, - unsigned int obj_type, - __kernel_fsid_t *fsid) + unsigned int obj_type) { struct fsnotify_mark_connector *conn; @@ -550,14 +549,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t 
*connp, conn->flags = 0; conn->type = obj_type; conn->obj = connp; - /* Cache fsid of filesystem containing the object */ - if (fsid) { - conn->fsid = *fsid; - conn->flags = FSNOTIFY_CONN_FLAG_HAS_FSID; - } else { - conn->fsid.val[0] = conn->fsid.val[1] = 0; - conn->flags = 0; - } + conn->flags = 0; fsnotify_get_sb_connectors(conn); /* @@ -608,8 +600,7 @@ out: */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid) + unsigned int obj_type, int add_flags) { struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; @@ -619,41 +610,15 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; - /* Backend is expected to check for zero fsid (e.g. tmpfs) */ - if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1])) - return -ENODEV; - restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); - err = fsnotify_attach_connector_to_object(connp, obj_type, - fsid); + err = fsnotify_attach_connector_to_object(connp, obj_type); if (err) return err; goto restart; - } else if (fsid && !(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) { - conn->fsid = *fsid; - /* Pairs with smp_rmb() in fanotify_get_fsid() */ - smp_wmb(); - conn->flags |= FSNOTIFY_CONN_FLAG_HAS_FSID; - } else if (fsid && (conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID) && - (fsid->val[0] != conn->fsid.val[0] || - fsid->val[1] != conn->fsid.val[1])) { - /* - * Backend is expected to check for non uniform fsid - * (e.g. btrfs), but maybe we missed something? - * Only allow setting conn->fsid once to non zero fsid. - * inotify and non-fid fanotify groups do not set nor test - * conn->fsid. - */ - pr_warn_ratelimited("%s: fsid mismatch on object of type %u: " - "%x.%x != %x.%x\n", __func__, conn->type, - fsid->val[0], fsid->val[1], - conn->fsid.val[0], conn->fsid.val[1]); - err = -EXDEV; - goto out_err; } /* is mark the first mark? */ @@ -703,7 +668,7 @@ out_err: */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid) + int add_flags) { struct fsnotify_group *group = mark->group; int ret = 0; @@ -723,7 +688,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid); + ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags); if (ret) goto err; @@ -742,14 +707,13 @@ err: } int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, int add_flags, - __kernel_fsid_t *fsid) + unsigned int obj_type, int add_flags) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); - ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid); + ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags); fsnotify_group_unlock(group); return ret; } diff --git a/fs/nsfs.c b/fs/nsfs.c index 9a4b228d42fa..34e1e3e36733 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -90,12 +90,9 @@ slow: inode->i_fop = &ns_file_operations; inode->i_private = ns; - dentry = d_alloc_anon(mnt->mnt_sb); - if (!dentry) { - iput(inode); + dentry = d_make_root(inode); /* not the normal use, but... 
*/ + if (!dentry) return -ENOMEM; - } - d_instantiate(dentry, inode); dentry->d_fsdata = (void *)ns->ops; d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); if (d) { diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 71e31e789b29..2d01517a2d59 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1304,7 +1304,7 @@ done: * page cleaned. The VM has already locked the page and marked it clean. * * For non-resident attributes, ntfs_writepage() writes the @page by calling - * the ntfs version of the generic block_write_full_page() function, + * the ntfs version of the generic block_write_full_folio() function, * ntfs_write_block(), which in turn if necessary creates and writes the * buffers associated with the page asynchronously. * @@ -1314,7 +1314,7 @@ done: * vfs inode dirty code path for the inode the mft record belongs to or via the * vm page dirty code path for the page the mft record is in. * - * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_page(). + * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio(). * * Return 0 on success and -errno on error. */ @@ -1644,7 +1644,7 @@ const struct address_space_operations ntfs_normal_aops = { .bmap = ntfs_bmap, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; /* @@ -1658,7 +1658,7 @@ const struct address_space_operations ntfs_compressed_aops = { #endif /* NTFS_RW */ .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; /* @@ -1673,7 +1673,7 @@ const struct address_space_operations ntfs_mst_aops = { #endif /* NTFS_RW */ .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; #ifdef NTFS_RW @@ -1690,7 +1690,7 @@ const struct address_space_operations ntfs_mst_aops = { * * If the page does not have buffers, we create them and set them uptodate. * The page may not be locked which is why we need to handle the buffers under - * the mapping->private_lock. Once the buffers are marked dirty we no longer + * the mapping->i_private_lock. Once the buffers are marked dirty we no longer * need the lock since try_to_free_buffers() does not free dirty buffers. 
*/ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { @@ -1702,11 +1702,11 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { BUG_ON(!PageUptodate(page)); end = ofs + ni->itype.index.block_size; bh_size = VFS_I(ni)->i_sb->s_blocksize; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (unlikely(!page_has_buffers(page))) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); bh = head = alloc_page_buffers(page, bh_size, true); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(!page_has_buffers(page))) { struct buffer_head *tail; @@ -1730,7 +1730,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { break; set_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); filemap_dirty_folio(mapping, page_folio(page)); if (unlikely(buffers_to_free)) { do { diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 4596c90e7b7c..629723a8d712 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1462,7 +1462,8 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp) /** * ntfs_dir_fsync - sync a directory to disk * @filp: directory to be synced - * @dentry: dentry describing the directory to sync + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) * @datasync: if non-zero only flush user data and not metadata * * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c index 174fe536a1c0..4e980170d86a 100644 --- a/fs/ntfs/sysctl.c +++ b/fs/ntfs/sysctl.c @@ -28,7 +28,6 @@ static struct ctl_table ntfs_sysctls[] = { .mode = 0644, /* Mode, proc handler. */ .proc_handler = proc_dointvec }, - {} }; /* Storage for the sysctls header. */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 91b32b2377ac..ea9127ba3208 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6934,7 +6934,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, * nonzero data on subsequent file extends. * * We need to call this before i_size is updated on the inode because - * otherwise block_write_full_page() will skip writeout of pages past + * otherwise block_write_full_folio() will skip writeout of pages past * i_size. */ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ba790219d528..b82185075de7 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -389,21 +389,18 @@ out_unlock: /* Note: Because we don't support holes, our allocation has * already happened (allocation writes zeros to the file data) * so we don't have to worry about ordered writes in - * ocfs2_writepage. + * ocfs2_writepages. * - * ->writepage is called during the process of invalidating the page cache + * ->writepages is called during the process of invalidating the page cache * during blocked lock processing. It can't block on any cluster locks * to during block mapping. It's relying on the fact that the block * mapping can't have disappeared under the dirty pages that it is * being asked to write back. 
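The pattern here is the same writepage-to-writepages conversion applied elsewhere in this diff: the per-page entry point goes away and writeback is funnelled through mpage_writepages() with the filesystem's get_block callback, as the ocfs2 hunk just below does. A minimal sketch of that shape for a hypothetical simple block-based filesystem (all myfs_* names are placeholders, and the trivial 1:1 block mapping is purely illustrative):

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/writeback.h>

/* Toy block-mapping callback: identity mapping, for illustration only. */
static int myfs_get_block(struct inode *inode, sector_t block,
			  struct buffer_head *bh_result, int create)
{
	map_bh(bh_result, inode->i_sb, block);
	return 0;
}

static int myfs_read_folio(struct file *file, struct folio *folio)
{
	return block_read_full_folio(folio, myfs_get_block);
}

/* One batched ->writepages pass replaces the old per-page ->writepage. */
static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	return mpage_writepages(mapping, wbc, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
	.dirty_folio		= block_dirty_folio,
	.invalidate_folio	= block_invalidate_folio,
	.read_folio		= myfs_read_folio,
	.writepages		= myfs_writepages,
	.migrate_folio		= buffer_migrate_folio,
};

With no ->writepage left, dirty folios reach the filesystem only through ->writepages, which lets the mpage code batch contiguous pages into larger bios.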
*/ -static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) +static int ocfs2_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - trace_ocfs2_writepage( - (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno, - page->index); - - return block_write_full_page(page, ocfs2_get_block, wbc); + return mpage_writepages(mapping, wbc, ocfs2_get_block); } /* Taken from ext3. We don't necessarily need the full blown @@ -2471,7 +2468,7 @@ const struct address_space_operations ocfs2_aops = { .dirty_folio = block_dirty_folio, .read_folio = ocfs2_read_folio, .readahead = ocfs2_readahead, - .writepage = ocfs2_writepage, + .writepages = ocfs2_writepages, .write_begin = ocfs2_write_begin, .write_end = ocfs2_write_end, .bmap = ocfs2_bmap, @@ -2480,5 +2477,5 @@ const struct address_space_operations ocfs2_aops = { .release_folio = ocfs2_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 04fc8344063a..a9b8688aaf30 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -124,17 +124,10 @@ static int ocfs2_match_dentry(struct dentry *dentry, if (!dentry->d_fsdata) return 0; - if (!dentry->d_parent) - return 0; - if (skip_unhashed && d_unhashed(dentry)) return 0; parent = d_inode(dentry->d_parent); - /* Negative parent dentry? */ - if (!parent) - return 0; - /* Name is in a different directory. */ if (OCFS2_I(parent)->ip_blkno != parent_blkno) return 0; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index a14c8fee6ee5..d620d4c53c6f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1593,9 +1593,6 @@ int __ocfs2_add_entry(handle_t *handle, struct buffer_head *insert_bh = lookup->dl_leaf_bh; char *data_start = insert_bh->b_data; - if (!namelen) - return -EINVAL; - if (ocfs2_dir_indexed(dir)) { struct buffer_head *bh; @@ -4245,12 +4242,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, trace_ocfs2_prepare_dir_for_insert( (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen); - if (!namelen) { - ret = -EINVAL; - mlog_errno(ret); - goto out; - } - /* * Do this up front to reduce confusion. * diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index eaa8c80ace3c..b8b6a191b5cb 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -280,4 +280,5 @@ const struct export_operations ocfs2_export_ops = { .fh_to_dentry = ocfs2_fh_to_dentry, .fh_to_parent = ocfs2_fh_to_parent, .get_parent = ocfs2_get_parent, + .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 94e2a1244442..8b6d15010703 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -818,7 +818,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* * fs-writeback will release the dirty pages without page lock * whose offset are over inode size, the release happens at - * block_write_full_page(). + * block_write_full_folio(). 
*/ i_size_write(inode, abs_to); inode->i_blocks = ocfs2_inode_sector_count(inode); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 814733ba2f4b..9221a33f917b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1336,7 +1336,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap, goto bail; } - if (S_ISDIR(old_inode->i_mode)) { + if (S_ISDIR(old_inode->i_mode) && new_dir != old_dir) { u64 old_inode_parent; update_dot_dot = 1; @@ -1353,8 +1353,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap, goto bail; } - if (!new_inode && new_dir != old_dir && - new_dir->i_nlink >= ocfs2_link_max(osb)) { + if (!new_inode && new_dir->i_nlink >= ocfs2_link_max(osb)) { status = -EMLINK; goto bail; } @@ -1601,6 +1600,9 @@ static int ocfs2_rename(struct mnt_idmap *idmap, mlog_errno(status); goto bail; } + } + + if (S_ISDIR(old_inode->i_mode)) { drop_nlink(old_dir); if (new_inode) { drop_nlink(new_inode); diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index ac4fd1d5b128..9898c11bdfa1 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1157,8 +1157,6 @@ DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end); DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage); -DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage); - DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap); TRACE_EVENT(ocfs2_try_to_write_inline_data, diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index a8d5ca98fa57..20aa37b67cfb 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -658,7 +658,6 @@ static struct ctl_table ocfs2_nm_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, - { } }; static struct ctl_table_header *ocfs2_table_header; diff --git a/fs/open.c b/fs/open.c index 3494a9cd8046..a84d21e55c39 100644 --- a/fs/open.c +++ b/fs/open.c @@ -304,6 +304,10 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; + ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); + if (ret) + return ret; + if (S_ISFIFO(inode->i_mode)) return -ESPIPE; @@ -442,7 +446,8 @@ static const struct cred *access_override_creds(void) * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous - * cred accesses will keep things non-RCY. + * cred accesses will keep things non-racy to avoid RCU + * freeing. */ override_cred->non_rcu = 1; @@ -1177,44 +1182,6 @@ struct file *kernel_file_open(const struct path *path, int flags, } EXPORT_SYMBOL_GPL(kernel_file_open); -/** - * backing_file_open - open a backing file for kernel internal use - * @user_path: path that the user reuqested to open - * @flags: open flags - * @real_path: path of the backing file - * @cred: credentials for open - * - * Open a backing file for a stackable filesystem (e.g., overlayfs). - * @user_path may be on the stackable filesystem and @real_path on the - * underlying filesystem. In this case, we want to be able to return the - * @user_path of the stackable filesystem. This is done by embedding the - * returned file into a container structure that also stores the stacked - * file's path, which can be retrieved using backing_file_user_path(). 
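The helper documented in the kerneldoc above is moving rather than disappearing: with overlayfs now selecting the FS_STACK infrastructure (see the Kconfig hunk further down), the expectation is that it lives on in the shared backing-file code declared in <linux/backing-file.h>. A hedged sketch of a caller, based only on the signature and semantics visible in the removed hunk (the stackfs_* names and the path-lookup helper are invented):

#include <linux/fs.h>
#include <linux/path.h>
#include <linux/cred.h>
#include <linux/err.h>
#include <linux/backing-file.h>

/* Placeholder: resolve the path of the real file on the lower filesystem. */
static void stackfs_real_path(const struct file *file, struct path *realpath);

static int stackfs_open(struct inode *inode, struct file *file)
{
	struct path realpath;
	struct file *realfile;

	stackfs_real_path(file, &realpath);

	/* A real stacking fs would typically pass its mounter's credentials. */
	realfile = backing_file_open(&file->f_path, file->f_flags,
				     &realpath, current_cred());
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);

	/* The stacked (user) path stays retrievable via backing_file_user_path(). */
	file->private_data = realfile;
	return 0;
}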
- */ -struct file *backing_file_open(const struct path *user_path, int flags, - const struct path *real_path, - const struct cred *cred) -{ - struct file *f; - int error; - - f = alloc_empty_backing_file(flags, cred); - if (IS_ERR(f)) - return f; - - path_get(user_path); - *backing_file_user_path(f) = *user_path; - f->f_path = *real_path; - error = do_dentry_open(f, d_inode(real_path->dentry), NULL); - if (error) { - fput(f); - f = ERR_PTR(error); - } - - return f; -} -EXPORT_SYMBOL_GPL(backing_file_open); - #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) @@ -1574,7 +1541,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) int retval; struct file *file; - file = close_fd_get_file(fd); + file = file_close_fd(fd); if (!file) return -EBADF; diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 9cacce5d55c1..6d1fbeca9d81 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -58,10 +58,10 @@ struct orangefs_dir { * first part of the part list. */ -static int do_readdir(struct orangefs_inode_s *oi, - struct orangefs_dir *od, struct dentry *dentry, +static int do_readdir(struct orangefs_dir *od, struct inode *inode, struct orangefs_kernel_op_s *op) { + struct orangefs_inode_s *oi = ORANGEFS_I(inode); struct orangefs_readdir_response_s *resp; int bufi, r; @@ -87,7 +87,7 @@ again: op->upcall.req.readdir.buf_index = bufi; r = service_operation(op, "orangefs_readdir", - get_interruptible_flag(dentry->d_inode)); + get_interruptible_flag(inode)); orangefs_readdir_index_put(bufi); @@ -158,8 +158,7 @@ static int parse_readdir(struct orangefs_dir *od, return 0; } -static int orangefs_dir_more(struct orangefs_inode_s *oi, - struct orangefs_dir *od, struct dentry *dentry) +static int orangefs_dir_more(struct orangefs_dir *od, struct inode *inode) { struct orangefs_kernel_op_s *op; int r; @@ -169,7 +168,7 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi, od->error = -ENOMEM; return -ENOMEM; } - r = do_readdir(oi, od, dentry, op); + r = do_readdir(od, inode, op); if (r) { od->error = r; goto out; @@ -238,9 +237,7 @@ next: return 1; } -static int orangefs_dir_fill(struct orangefs_inode_s *oi, - struct orangefs_dir *od, struct dentry *dentry, - struct dir_context *ctx) +static int orangefs_dir_fill(struct orangefs_dir *od, struct dir_context *ctx) { struct orangefs_dir_part *part; size_t count; @@ -304,15 +301,10 @@ static loff_t orangefs_dir_llseek(struct file *file, loff_t offset, static int orangefs_dir_iterate(struct file *file, struct dir_context *ctx) { - struct orangefs_inode_s *oi; - struct orangefs_dir *od; - struct dentry *dentry; + struct orangefs_dir *od = file->private_data; + struct inode *inode = file_inode(file); int r; - dentry = file->f_path.dentry; - oi = ORANGEFS_I(dentry->d_inode); - od = file->private_data; - if (od->error) return od->error; @@ -342,7 +334,7 @@ static int orangefs_dir_iterate(struct file *file, */ while (od->token != ORANGEFS_ITERATE_END && ctx->pos > od->end) { - r = orangefs_dir_more(oi, od, dentry); + r = orangefs_dir_more(od, inode); if (r) return r; } @@ -351,17 +343,17 @@ static int orangefs_dir_iterate(struct file *file, /* Then try to fill if there's any left in the buffer. */ if (ctx->pos < od->end) { - r = orangefs_dir_fill(oi, od, dentry, ctx); + r = orangefs_dir_fill(od, ctx); if (r) return r; } /* Finally get some more and try to fill. 
*/ if (od->token != ORANGEFS_ITERATE_END) { - r = orangefs_dir_more(oi, od, dentry); + r = orangefs_dir_more(od, inode); if (r) return r; - r = orangefs_dir_fill(oi, od, dentry, ctx); + r = orangefs_dir_fill(od, ctx); } return r; diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index fec5020c3495..2ac67e04a6fb 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config OVERLAY_FS tristate "Overlay filesystem support" + select FS_STACK select EXPORTFS help An overlay filesystem combines two filesystems - an 'upper' filesystem diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 8bea66c97316..b8e25ca51016 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -230,6 +230,19 @@ static int ovl_copy_fileattr(struct inode *inode, const struct path *old, return ovl_real_fileattr_set(new, &newfa); } +static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen) +{ + loff_t tmp; + + if (WARN_ON_ONCE(pos != pos2)) + return -EIO; + if (WARN_ON_ONCE(pos < 0 || len < 0 || totlen < 0)) + return -EIO; + if (WARN_ON_ONCE(check_add_overflow(pos, len, &tmp))) + return -EIO; + return 0; +} + static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, struct file *new_file, loff_t len) { @@ -244,13 +257,20 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, int error = 0; ovl_path_lowerdata(dentry, &datapath); - if (WARN_ON(datapath.dentry == NULL)) + if (WARN_ON_ONCE(datapath.dentry == NULL) || + WARN_ON_ONCE(len < 0)) return -EIO; old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY); if (IS_ERR(old_file)) return PTR_ERR(old_file); + error = rw_verify_area(READ, old_file, &old_pos, len); + if (!error) + error = rw_verify_area(WRITE, new_file, &new_pos, len); + if (error) + goto out_fput; + /* Try to use clone_file_range to clone up within the same fs */ ovl_start_write(dentry); cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); @@ -265,7 +285,7 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, while (len) { size_t this_len = OVL_COPY_UP_CHUNK_SIZE; - long bytes; + ssize_t bytes; if (len < this_len) this_len = len; @@ -309,11 +329,13 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, } } - ovl_start_write(dentry); + error = ovl_verify_area(old_pos, new_pos, this_len, len); + if (error) + break; + bytes = do_splice_direct(old_file, &old_pos, new_file, &new_pos, this_len, SPLICE_F_MOVE); - ovl_end_write(dentry); if (bytes <= 0) { error = bytes; break; @@ -722,7 +744,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) struct inode *inode; struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); struct path path = { .mnt = ovl_upper_mnt(ofs) }; - struct dentry *temp, *upper; + struct dentry *temp, *upper, *trap; struct ovl_cu_creds cc; int err; struct ovl_cattr cattr = { @@ -759,11 +781,13 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) * temp wasn't moved before copy up completion or cleanup. */ ovl_start_write(c->dentry); - if (lock_rename(c->workdir, c->destdir) != NULL || - temp->d_parent != c->workdir) { + trap = lock_rename(c->workdir, c->destdir); + if (trap || temp->d_parent != c->workdir) { /* temp or workdir moved underneath us? 
abort without cleanup */ dput(temp); err = -EIO; + if (IS_ERR(trap)) + goto out; goto unlock; } else if (err) { goto cleanup; @@ -804,6 +828,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) ovl_set_flag(OVL_WHITEOUTS, inode); unlock: unlock_rename(c->workdir, c->destdir); +out: ovl_end_write(c->dentry); return err; @@ -931,6 +956,13 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) goto out_free_fh; } else { /* + * c->dentry->d_name is stabilzed by ovl_copy_up_start(), + * because if we got here, it means that c->dentry has no upper + * alias and changing ->d_name means going through ovl_rename() + * that will call ovl_copy_up() on source and target dentry. + */ + c->destname = c->dentry->d_name; + /* * Mark parent "impure" because it may now contain non-pure * upper */ @@ -1110,7 +1142,6 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, if (parent) { ovl_path_upper(parent, &parentpath); ctx.destdir = parentpath.dentry; - ctx.destname = dentry->d_name; err = vfs_getattr(&parentpath, &ctx.pstat, STATX_ATIME | STATX_MTIME, diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index aab3f5d93556..0f8b4a719237 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1180,6 +1180,10 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, } trap = lock_rename(new_upperdir, old_upperdir); + if (IS_ERR(trap)) { + err = PTR_ERR(trap); + goto out_revert_creds; + } olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir, old->d_name.len); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 7e16bbcad95e..063409069f56 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -289,7 +289,6 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, { struct dentry *lower = lowerpath ? lowerpath->dentry : NULL; struct dentry *upper = upper_alias ?: index; - struct dentry *dentry; struct inode *inode = NULL; struct ovl_entry *oe; struct ovl_inode_params oip = { @@ -320,27 +319,7 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, if (upper) ovl_set_flag(OVL_UPPERDATA, inode); - dentry = d_find_any_alias(inode); - if (dentry) - goto out_iput; - - dentry = d_alloc_anon(inode->i_sb); - if (unlikely(!dentry)) - goto nomem; - - if (upper_alias) - ovl_dentry_set_upper_alias(dentry); - - ovl_dentry_init_reval(dentry, upper, OVL_I_E(inode)); - - return d_instantiate_anon(dentry, inode); - -nomem: - dput(dentry); - dentry = ERR_PTR(-ENOMEM); -out_iput: - iput(inode); - return dentry; + return d_obtain_alias(inode); } /* Get the upper or lower dentry in stack whose on layer @idx */ @@ -460,7 +439,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, * For decoded lower dir file handle, lookup index by origin to check * if lower dir was copied up and and/or removed. 
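The lock_rename() callers converted in this diff — ovl_copy_up_workdir() and ovl_rename() above, ovl_workdir_ok() and ovl_lock_rename_workdir() further down — now have to handle an ERR_PTR() return in addition to the classic non-NULL "trap" dentry. A sketch of the calling convention they follow (generic names, error handling trimmed):

#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/err.h>
#include <linux/errno.h>

static int example_lock_dir_pair(struct dentry *workdir, struct dentry *upperdir)
{
	struct dentry *trap;

	trap = lock_rename(workdir, upperdir);
	if (IS_ERR(trap))	/* e.g. the two directories are not on the same fs */
		return PTR_ERR(trap);
	if (trap) {		/* one directory is an ancestor of the other */
		unlock_rename(workdir, upperdir);
		return -EINVAL;
	}

	/* ... both parents are locked; do the cross-directory work ... */

	unlock_rename(workdir, upperdir);
	return 0;
}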
*/ - if (!this && layer->idx && ofs->indexdir && !WARN_ON(!d_is_dir(real))) { + if (!this && layer->idx && ovl_indexdir(sb) && !WARN_ON(!d_is_dir(real))) { index = ovl_lookup_index(ofs, NULL, real, false); if (IS_ERR(index)) return index; @@ -733,7 +712,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, } /* Then lookup indexed upper/whiteout by origin fh */ - if (ofs->indexdir) { + if (ovl_indexdir(sb)) { index = ovl_get_index_fh(ofs, fh); err = PTR_ERR(index); if (IS_ERR(index)) { diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 131621daeb13..05536964d37f 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -9,25 +9,11 @@ #include <linux/xattr.h> #include <linux/uio.h> #include <linux/uaccess.h> -#include <linux/splice.h> #include <linux/security.h> -#include <linux/mm.h> #include <linux/fs.h> +#include <linux/backing-file.h> #include "overlayfs.h" -#include "../internal.h" /* for sb_init_dio_done_wq */ - -struct ovl_aio_req { - struct kiocb iocb; - refcount_t ref; - struct kiocb *orig_iocb; - /* used for aio completion */ - struct work_struct work; - long res; -}; - -static struct kmem_cache *ovl_aio_request_cachep; - static char ovl_whatisit(struct inode *inode, struct inode *realinode) { if (realinode != ovl_inode_upper(inode)) @@ -274,83 +260,16 @@ static void ovl_file_accessed(struct file *file) touch_atime(&file->f_path); } -#define OVL_IOCB_MASK \ - (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) - -static rwf_t iocb_to_rw_flags(int flags) -{ - return (__force rwf_t)(flags & OVL_IOCB_MASK); -} - -static inline void ovl_aio_put(struct ovl_aio_req *aio_req) -{ - if (refcount_dec_and_test(&aio_req->ref)) { - fput(aio_req->iocb.ki_filp); - kmem_cache_free(ovl_aio_request_cachep, aio_req); - } -} - -static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) -{ - struct kiocb *iocb = &aio_req->iocb; - struct kiocb *orig_iocb = aio_req->orig_iocb; - - if (iocb->ki_flags & IOCB_WRITE) { - kiocb_end_write(iocb); - ovl_file_modified(orig_iocb->ki_filp); - } - - orig_iocb->ki_pos = iocb->ki_pos; - ovl_aio_put(aio_req); -} - -static void ovl_aio_rw_complete(struct kiocb *iocb, long res) -{ - struct ovl_aio_req *aio_req = container_of(iocb, - struct ovl_aio_req, iocb); - struct kiocb *orig_iocb = aio_req->orig_iocb; - - ovl_aio_cleanup_handler(aio_req); - orig_iocb->ki_complete(orig_iocb, res); -} - -static void ovl_aio_complete_work(struct work_struct *work) -{ - struct ovl_aio_req *aio_req = container_of(work, - struct ovl_aio_req, work); - - ovl_aio_rw_complete(&aio_req->iocb, aio_req->res); -} - -static void ovl_aio_queue_completion(struct kiocb *iocb, long res) -{ - struct ovl_aio_req *aio_req = container_of(iocb, - struct ovl_aio_req, iocb); - struct kiocb *orig_iocb = aio_req->orig_iocb; - - /* - * Punt to a work queue to serialize updates of mtime/size. 
- */ - aio_req->res = res; - INIT_WORK(&aio_req->work, ovl_aio_complete_work); - queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq, - &aio_req->work); -} - -static int ovl_init_aio_done_wq(struct super_block *sb) -{ - if (sb->s_dio_done_wq) - return 0; - - return sb_init_dio_done_wq(sb); -} - static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct fd real; - const struct cred *old_cred; ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(file)->i_sb), + .user_file = file, + .accessed = ovl_file_accessed, + }; if (!iov_iter_count(iter)) return 0; @@ -359,37 +278,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) return ret; - ret = -EINVAL; - if (iocb->ki_flags & IOCB_DIRECT && - !(real.file->f_mode & FMODE_CAN_ODIRECT)) - goto out_fdput; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - if (is_sync_kiocb(iocb)) { - rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags); - - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf); - } else { - struct ovl_aio_req *aio_req; - - ret = -ENOMEM; - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); - if (!aio_req) - goto out; - - aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); - aio_req->iocb.ki_complete = ovl_aio_rw_complete; - refcount_set(&aio_req->ref, 2); - ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); - ovl_aio_put(aio_req); - if (ret != -EIOCBQUEUED) - ovl_aio_cleanup_handler(aio_req); - } -out: - revert_creds(old_cred); - ovl_file_accessed(file); -out_fdput: + ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags, + &ctx); fdput(real); return ret; @@ -400,9 +290,13 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct fd real; - const struct cred *old_cred; ssize_t ret; int ifl = iocb->ki_flags; + struct backing_file_ctx ctx = { + .cred = ovl_creds(inode->i_sb), + .user_file = file, + .end_write = ovl_file_modified, + }; if (!iov_iter_count(iter)) return 0; @@ -410,19 +304,11 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = file_remove_privs(file); - if (ret) - goto out_unlock; ret = ovl_real_fdget(file, &real); if (ret) goto out_unlock; - ret = -EINVAL; - if (iocb->ki_flags & IOCB_DIRECT && - !(real.file->f_mode & FMODE_CAN_ODIRECT)) - goto out_fdput; - if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); @@ -431,42 +317,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) * this property in case it is set by the issuer. 
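In the ovl_read_iter()/ovl_write_iter() hunks here, the credential switch, the FMODE_CAN_ODIRECT check and the AIO completion plumbing all move behind the new backing-file helpers, with the per-call policy expressed in a struct backing_file_ctx. A minimal sketch of how another stacking filesystem might drive the same interface, using only the fields and signatures visible in this diff (the stackfs_* hooks are placeholders):

#include <linux/fs.h>
#include <linux/uio.h>
#include <linux/backing-file.h>

/* Placeholder hooks a stacking filesystem would provide. */
static void stackfs_accessed(struct file *file);
static void stackfs_modified(struct file *file);
static const struct cred *stackfs_creator_creds(struct super_block *sb);

static ssize_t stackfs_read_iter(struct kiocb *iocb, struct iov_iter *iter,
				 struct file *realfile)
{
	struct backing_file_ctx ctx = {
		.cred		= stackfs_creator_creds(file_inode(iocb->ki_filp)->i_sb),
		.user_file	= iocb->ki_filp,
		.accessed	= stackfs_accessed,	/* e.g. touch atime on the stacked file */
	};

	return backing_file_read_iter(realfile, iter, iocb, iocb->ki_flags, &ctx);
}

static ssize_t stackfs_write_iter(struct kiocb *iocb, struct iov_iter *iter,
				  struct file *realfile)
{
	struct backing_file_ctx ctx = {
		.cred		= stackfs_creator_creds(file_inode(iocb->ki_filp)->i_sb),
		.user_file	= iocb->ki_filp,
		.end_write	= stackfs_modified,	/* e.g. copy size/times back to the stacked inode */
	};

	return backing_file_write_iter(realfile, iter, iocb, iocb->ki_flags, &ctx);
}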
*/ ifl &= ~IOCB_DIO_CALLER_COMP; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - if (is_sync_kiocb(iocb)) { - rwf_t rwf = iocb_to_rw_flags(ifl); - - file_start_write(real.file); - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf); - file_end_write(real.file); - /* Update size */ - ovl_file_modified(file); - } else { - struct ovl_aio_req *aio_req; - - ret = ovl_init_aio_done_wq(inode->i_sb); - if (ret) - goto out; - - ret = -ENOMEM; - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); - if (!aio_req) - goto out; - - aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); - aio_req->iocb.ki_flags = ifl; - aio_req->iocb.ki_complete = ovl_aio_queue_completion; - refcount_set(&aio_req->ref, 2); - kiocb_start_write(&aio_req->iocb); - ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); - ovl_aio_put(aio_req); - if (ret != -EIOCBQUEUED) - ovl_aio_cleanup_handler(aio_req); - } -out: - revert_creds(old_cred); -out_fdput: + ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx); fdput(real); out_unlock: @@ -479,20 +330,21 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - const struct cred *old_cred; struct fd real; ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(in)->i_sb), + .user_file = in, + .accessed = ovl_file_accessed, + }; ret = ovl_real_fdget(in, &real); if (ret) return ret; - old_cred = ovl_override_creds(file_inode(in)->i_sb); - ret = vfs_splice_read(real.file, ppos, pipe, len, flags); - revert_creds(old_cred); - ovl_file_accessed(in); - + ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx); fdput(real); + return ret; } @@ -508,30 +360,23 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct fd real; - const struct cred *old_cred; struct inode *inode = file_inode(out); ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(inode->i_sb), + .user_file = out, + .end_write = ovl_file_modified, + }; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = file_remove_privs(out); - if (ret) - goto out_unlock; ret = ovl_real_fdget(out, &real); if (ret) goto out_unlock; - old_cred = ovl_override_creds(inode->i_sb); - file_start_write(real.file); - - ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); - - file_end_write(real.file); - /* Update size */ - ovl_file_modified(out); - revert_creds(old_cred); + ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx); fdput(real); out_unlock: @@ -569,23 +414,13 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) static int ovl_mmap(struct file *file, struct vm_area_struct *vma) { struct file *realfile = file->private_data; - const struct cred *old_cred; - int ret; - - if (!realfile->f_op->mmap) - return -ENODEV; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(file)->i_sb), + .user_file = file, + .accessed = ovl_file_accessed, + }; - if (WARN_ON(file != vma->vm_file)) - return -EIO; - - vma_set_file(vma, realfile); - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = call_mmap(vma->vm_file, vma); - revert_creds(old_cred); - ovl_file_accessed(file); - - return ret; + return backing_file_mmap(realfile, vma, &ctx); } static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) @@ -778,19 +613,3 @@ const struct file_operations 
ovl_file_operations = { .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, }; - -int __init ovl_aio_request_cache_init(void) -{ - ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req", - sizeof(struct ovl_aio_req), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ovl_aio_request_cachep) - return -ENOMEM; - - return 0; -} - -void ovl_aio_request_cache_destroy(void) -{ - kmem_cache_destroy(ovl_aio_request_cachep); -} diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 03bc8d5dfa31..984ffdaeed6c 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -754,7 +754,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) if (err) return ERR_PTR(err); - index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len); + index = lookup_positive_unlocked(name.name, ofs->workdir, name.len); kfree(name.name); if (IS_ERR(index)) { if (PTR_ERR(index) == -ENOENT) @@ -787,7 +787,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, return ERR_PTR(err); index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name, - ofs->indexdir, name.len); + ofs->workdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); if (err == -ENOENT) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 05c3dd597fa8..5ba11eb43767 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -425,6 +425,12 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); + +static inline const struct cred *ovl_creds(struct super_block *sb) +{ + return OVL_FS(sb)->creator_cred; +} + int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); @@ -837,8 +843,6 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, /* file.c */ extern const struct file_operations ovl_file_operations; -int __init ovl_aio_request_cache_init(void); -void ovl_aio_request_cache_destroy(void); int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index d82d2a043da2..5fa9c58af65f 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -63,10 +63,8 @@ struct ovl_fs { struct ovl_sb *fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; - /* workdir is the 'work' directory under workbasedir */ + /* workdir is the 'work' or 'index' directory under workbasedir */ struct dentry *workdir; - /* index directory listing overlay inodes by origin file handle */ - struct dentry *indexdir; long namelen; /* pathnames of lower and upper dirs, for show_options */ struct ovl_config config; @@ -81,7 +79,6 @@ struct ovl_fs { /* Traps in ovl inode cache */ struct inode *workbasedir_trap; struct inode *workdir_trap; - struct inode *indexdir_trap; /* -1: disabled, 0: same fs, 1..32: number of unused ino bits */ int xino_mode; /* For allocation of non-persistent inode numbers */ diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 3fe2dde1598f..112b4b12f825 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -743,10 +743,8 @@ void ovl_free_fs(struct ovl_fs *ofs) unsigned i; iput(ofs->workbasedir_trap); - 
iput(ofs->indexdir_trap); iput(ofs->workdir_trap); dput(ofs->whiteout); - dput(ofs->indexdir); dput(ofs->workdir); if (ofs->workdir_locked) ovl_inuse_unlock(ofs->workbasedir); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index a490fc47c3e7..e71156baa7bc 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -1169,7 +1169,7 @@ int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, int ovl_indexdir_cleanup(struct ovl_fs *ofs) { int err; - struct dentry *indexdir = ofs->indexdir; + struct dentry *indexdir = ofs->workdir; struct dentry *index = NULL; struct inode *dir = indexdir->d_inode; struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a0967bb25003..4ab66e3d4cff 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -439,8 +439,10 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) bool ok = false; if (workdir != upperdir) { - ok = (lock_rename(workdir, upperdir) == NULL); - unlock_rename(workdir, upperdir); + struct dentry *trap = lock_rename(workdir, upperdir); + if (!IS_ERR(trap)) + unlock_rename(workdir, upperdir); + ok = (trap == NULL); } return ok; } @@ -853,10 +855,8 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, if (IS_ERR(indexdir)) { err = PTR_ERR(indexdir); } else if (indexdir) { - ofs->indexdir = indexdir; - ofs->workdir = dget(indexdir); - - err = ovl_setup_trap(sb, ofs->indexdir, &ofs->indexdir_trap, + ofs->workdir = indexdir; + err = ovl_setup_trap(sb, indexdir, &ofs->workdir_trap, "indexdir"); if (err) goto out; @@ -869,16 +869,15 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, * ".overlay.upper" to indicate that index may have * directory entries. */ - if (ovl_check_origin_xattr(ofs, ofs->indexdir)) { - err = ovl_verify_origin_xattr(ofs, ofs->indexdir, + if (ovl_check_origin_xattr(ofs, indexdir)) { + err = ovl_verify_origin_xattr(ofs, indexdir, OVL_XATTR_ORIGIN, upperpath->dentry, true, false); if (err) pr_err("failed to verify index dir 'origin' xattr\n"); } - err = ovl_verify_upper(ofs, ofs->indexdir, upperpath->dentry, - true); + err = ovl_verify_upper(ofs, indexdir, upperpath->dentry, true); if (err) pr_err("failed to verify index dir 'upper' xattr\n"); @@ -886,7 +885,7 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, if (!err) err = ovl_indexdir_cleanup(ofs); } - if (err || !ofs->indexdir) + if (err || !indexdir) pr_warn("try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); out: @@ -1406,7 +1405,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) goto out_free_oe; /* Force r/o mount with no index dir */ - if (!ofs->indexdir) + if (!ofs->workdir) sb->s_flags |= SB_RDONLY; } @@ -1415,7 +1414,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) goto out_free_oe; /* Show index=off in /proc/mounts for forced r/o mount */ - if (!ofs->indexdir) { + if (!ofs->workdir) { ofs->config.index = false; if (ovl_upper_mnt(ofs) && ofs->config.nfs_export) { pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n"); @@ -1454,6 +1453,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) * lead to unexpected results. 
*/ sb->s_iflags |= SB_I_NOUMASK; + sb->s_iflags |= SB_I_EVM_UNSUPPORTED; err = -ENOMEM; root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe); @@ -1501,14 +1501,10 @@ static int __init ovl_init(void) if (ovl_inode_cachep == NULL) return -ENOMEM; - err = ovl_aio_request_cache_init(); - if (!err) { - err = register_filesystem(&ovl_fs_type); - if (!err) - return 0; + err = register_filesystem(&ovl_fs_type); + if (!err) + return 0; - ovl_aio_request_cache_destroy(); - } kmem_cache_destroy(ovl_inode_cachep); return err; @@ -1524,7 +1520,6 @@ static void __exit ovl_exit(void) */ rcu_barrier(); kmem_cache_destroy(ovl_inode_cachep); - ovl_aio_request_cache_destroy(); } module_init(ovl_init); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index c3f020ca13a8..0217094c23ea 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -91,7 +91,7 @@ struct dentry *ovl_indexdir(struct super_block *sb) { struct ovl_fs *ofs = OVL_FS(sb); - return ofs->indexdir; + return ofs->config.index ? ofs->workdir : NULL; } /* Index all files on copy up. For now only enabled for NFS export */ @@ -1198,12 +1198,17 @@ void ovl_nlink_end(struct dentry *dentry) int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) { + struct dentry *trap; + /* Workdir should not be the same as upperdir */ if (workdir == upperdir) goto err; /* Workdir should not be subdir of upperdir and vice versa */ - if (lock_rename(workdir, upperdir) != NULL) + trap = lock_rename(workdir, upperdir); + if (IS_ERR(trap)) + goto err; + if (trap) goto err_unlock; return 0; diff --git a/fs/pipe.c b/fs/pipe.c index 804a7d789452..f1adbfe743d4 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -446,6 +446,18 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) bool was_empty = false; bool wake_next_writer = false; + /* + * Reject writing to watch queue pipes before the point where we lock + * the pipe. + * Otherwise, lockdep would be unhappy if the caller already has another + * pipe locked. + * If we had to support locking a normal pipe and a notification pipe at + * the same time, we could set up lockdep annotations for that, but + * since we don't actually need that, it's simpler to just bail here. + */ + if (pipe_has_watch_queue(pipe)) + return -EXDEV; + /* Null write succeeds. */ if (unlikely(total_len == 0)) return 0; @@ -458,11 +470,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - if (pipe_has_watch_queue(pipe)) { - ret = -EXDEV; - goto out; - } - /* * If it wasn't empty we try to merge new data into * the last buffer. 
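Several sysctl tables in this diff — inotify_table, ntfs_sysctls and ocfs2_nm_table earlier, fs_pipe_sysctls just below — drop their empty terminator entry, since the registration interface now carries an explicit table size. A hedged sketch of the sentinel-free style with invented names:

#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/errno.h>

static int example_value;

/* No trailing { } sentinel: the array length is passed at registration time. */
static struct ctl_table example_sysctls[] = {
	{
		.procname	= "example_value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	/* register_sysctl() expands to register_sysctl_sz(..., ARRAY_SIZE(table)). */
	example_header = register_sysctl("fs/example", example_sysctls);
	return example_header ? 0 : -ENOMEM;
}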
@@ -1317,6 +1324,11 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) pipe->tail = tail; pipe->head = head; + if (!pipe_has_watch_queue(pipe)) { + pipe->max_usage = nr_slots; + pipe->nr_accounted = nr_slots; + } + spin_unlock_irq(&pipe->rd_wait.lock); /* This might have made more room for writers */ @@ -1368,8 +1380,6 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) if (ret < 0) goto out_revert_acct; - pipe->max_usage = nr_slots; - pipe->nr_accounted = nr_slots; return pipe->max_usage * PAGE_SIZE; out_revert_acct: @@ -1497,7 +1507,6 @@ static struct ctl_table fs_pipe_sysctls[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { } }; #endif diff --git a/fs/pnode.c b/fs/pnode.c index e4d0340393d5..a799e0315cc9 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -468,7 +468,7 @@ static void umount_one(struct mount *mnt, struct list_head *to_umount) mnt->mnt.mnt_flags |= MNT_UMOUNT; list_del_init(&mnt->mnt_child); list_del_init(&mnt->mnt_umounting); - list_move_tail(&mnt->mnt_list, to_umount); + move_from_ns(mnt, to_umount); } /* diff --git a/fs/posix_acl.c b/fs/posix_acl.c index a05fe94970ce..e1af20893ebe 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -600,7 +600,7 @@ EXPORT_SYMBOL(__posix_acl_chmod); * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. */ int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, @@ -700,7 +700,7 @@ EXPORT_SYMBOL_GPL(posix_acl_create); * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. * * Called from set_acl inode operations. */ diff --git a/fs/proc/base.c b/fs/proc/base.c index dd31e3b6bf77..98a031ac2648 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -97,6 +97,7 @@ #include <linux/resctrl.h> #include <linux/cn_proc.h> #include <linux/ksm.h> +#include <uapi/linux/lsm.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" @@ -146,10 +147,10 @@ struct pid_entry { NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ { .proc_show = show } ) -#define ATTR(LSM, NAME, MODE) \ +#define ATTR(LSMID, NAME, MODE) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_pid_attr_operations, \ - { .lsm = LSM }) + { .lsmid = LSMID }) /* * Count the number of hardlinks for the pid_entry table, excluding the . 
@@ -2726,7 +2727,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (!task) return -ESRCH; - length = security_getprocattr(task, PROC_I(inode)->op.lsm, + length = security_getprocattr(task, PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, &p); put_task_struct(task); @@ -2784,7 +2785,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (rv < 0) goto out_free; - rv = security_setprocattr(PROC_I(inode)->op.lsm, + rv = security_setprocattr(PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, page, count); mutex_unlock(¤t->signal->cred_guard_mutex); @@ -2833,27 +2834,27 @@ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ #ifdef CONFIG_SECURITY_SMACK static const struct pid_entry smack_attr_dir_stuff[] = { - ATTR("smack", "current", 0666), + ATTR(LSM_ID_SMACK, "current", 0666), }; LSM_DIR_OPS(smack); #endif #ifdef CONFIG_SECURITY_APPARMOR static const struct pid_entry apparmor_attr_dir_stuff[] = { - ATTR("apparmor", "current", 0666), - ATTR("apparmor", "prev", 0444), - ATTR("apparmor", "exec", 0666), + ATTR(LSM_ID_APPARMOR, "current", 0666), + ATTR(LSM_ID_APPARMOR, "prev", 0444), + ATTR(LSM_ID_APPARMOR, "exec", 0666), }; LSM_DIR_OPS(apparmor); #endif static const struct pid_entry attr_dir_stuff[] = { - ATTR(NULL, "current", 0666), - ATTR(NULL, "prev", 0444), - ATTR(NULL, "exec", 0666), - ATTR(NULL, "fscreate", 0666), - ATTR(NULL, "keycreate", 0666), - ATTR(NULL, "sockcreate", 0666), + ATTR(LSM_ID_UNDEF, "current", 0666), + ATTR(LSM_ID_UNDEF, "prev", 0444), + ATTR(LSM_ID_UNDEF, "exec", 0666), + ATTR(LSM_ID_UNDEF, "fscreate", 0666), + ATTR(LSM_ID_UNDEF, "keycreate", 0666), + ATTR(LSM_ID_UNDEF, "sockcreate", 0666), #ifdef CONFIG_SECURITY_SMACK DIR("smack", 0555, proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9a8f32f21ff5..a71ac5379584 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -92,7 +92,7 @@ union proc_op { int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); - const char *lsm; + int lsmid; }; struct proc_inode { diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 8064ea76f80b..37cde0efee57 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -44,7 +44,7 @@ static struct ctl_table sysctl_mount_point[] = { */ struct ctl_table_header *register_sysctl_mount_point(const char *path) { - return register_sysctl_sz(path, sysctl_mount_point, 0); + return register_sysctl(path, sysctl_mount_point); } EXPORT_SYMBOL(register_sysctl_mount_point); @@ -71,7 +71,6 @@ static struct ctl_table root_table[] = { .procname = "", .mode = S_IFDIR|S_IRUGO|S_IXUGO, }, - { } }; static struct ctl_table_root sysctl_table_root = { .default_set.dir.header = { @@ -233,7 +232,8 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) return -EROFS; /* Am I creating a permanently empty directory? */ - if (sysctl_is_perm_empty_ctl_table(header->ctl_table)) { + if (header->ctl_table_size > 0 && + sysctl_is_perm_empty_ctl_table(header->ctl_table)) { if (!RB_EMPTY_ROOT(&dir->root)) return -EINVAL; sysctl_set_perm_empty_ctl_header(dir_h); @@ -534,13 +534,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; } - inode = proc_sys_make_inode(dir->i_sb, h ? 
h : head, p); - if (IS_ERR(inode)) { - err = ERR_CAST(inode); - goto out; - } - d_set_d_op(dentry, &proc_sys_dentry_operations); + inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); err = d_splice_alias(inode, dentry); out: @@ -698,13 +693,8 @@ static bool proc_sys_fill_cache(struct file *file, return false; if (d_in_lookup(child)) { struct dentry *res; - inode = proc_sys_make_inode(dir->d_sb, head, table); - if (IS_ERR(inode)) { - d_lookup_done(child); - dput(child); - return false; - } d_set_d_op(child, &proc_sys_dentry_operations); + inode = proc_sys_make_inode(dir->d_sb, head, table); res = d_splice_alias(inode, child); d_lookup_done(child); if (unlikely(res)) { @@ -1213,6 +1203,10 @@ static bool get_links(struct ctl_dir *dir, struct ctl_table_header *tmp_head; struct ctl_table *entry, *link; + if (header->ctl_table_size == 0 || + sysctl_is_perm_empty_ctl_table(header->ctl_table)) + return true; + /* Are there links available for every entry in table? */ list_for_each_table_entry(entry, header) { const char *procname = entry->procname; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 435b61054b5b..62b16f42d5d2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -273,7 +273,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) const char *name = NULL; if (file) { - struct inode *inode = file_inode(vma->vm_file); + const struct inode *inode = file_user_inode(vma->vm_file); + dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; @@ -865,7 +866,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", - hugepage_vma_check(vma, vma->vm_flags, true, false, true)); + !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false, + true, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); @@ -1761,7 +1763,7 @@ static int pagemap_release(struct inode *inode, struct file *file) #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ - PAGE_IS_HUGE) + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { @@ -1793,6 +1795,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; + if (pte_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t swp; @@ -1806,6 +1810,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, !PageAnon(pfn_swap_entry_to_page(swp))) categories |= PAGE_IS_FILE; } + if (pte_swp_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } return categories; @@ -1853,12 +1859,16 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, if (is_zero_pfn(pmd_pfn(pmd))) categories |= PAGE_IS_PFNZERO; + if (pmd_soft_dirty(pmd)) + categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pmd(pmd)) { swp_entry_t swp; categories |= PAGE_IS_SWAPPED; if (!pmd_swp_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; + if (pmd_swp_soft_dirty(pmd)) + categories |= PAGE_IS_SOFT_DIRTY; if (p->masks_of_interest & PAGE_IS_FILE) { swp = pmd_to_swp_entry(pmd); @@ -1905,10 +1915,14 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) categories |= PAGE_IS_FILE; if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; + if (pte_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } else if 
(is_swap_pte(pte)) { categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; + if (pte_swp_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } return categories; @@ -2007,6 +2021,9 @@ static int pagemap_scan_test_walk(unsigned long start, unsigned long end, if (wp_allowed) vma_category |= PAGE_IS_WPALLOWED; + if (vma->vm_flags & VM_SOFTDIRTY) + vma_category |= PAGE_IS_SOFT_DIRTY; + if (!pagemap_scan_is_interesting_vma(vma_category, p)) return 1; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 250eb5bf7b52..0a808951b7d3 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -142,13 +142,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, MAJOR(sb->s_dev), MINOR(sb->s_dev)); - if (sb->s_op->show_path) { - err = sb->s_op->show_path(m, mnt->mnt_root); - if (err) - goto out; - } else { - seq_dentry(m, mnt->mnt_root, " \t\n\\"); - } + err = show_path(m, mnt->mnt_root); + if (err) + goto out; seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ @@ -283,8 +279,6 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->ns = ns; p->root = root; p->show = show; - INIT_LIST_HEAD(&p->cursor.mnt_list); - p->cursor.mnt.mnt_flags = MNT_CURSOR; return 0; @@ -301,7 +295,6 @@ static int mounts_release(struct inode *inode, struct file *file) struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; path_put(&p->root); - mnt_cursor_del(p->ns, &p->cursor); put_mnt_ns(p->ns); return seq_release_private(inode, file); } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d41c20d1b5e8..d0d9bfdad30c 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -23,6 +23,7 @@ #include <linux/pstore.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/cleanup.h> #include "internal.h" @@ -34,6 +35,8 @@ static LIST_HEAD(records_list); static DEFINE_MUTEX(pstore_sb_lock); static struct super_block *pstore_sb; +DEFINE_FREE(pstore_iput, struct inode *, if (_T) iput(_T)) + struct pstore_private { struct list_head list; struct dentry *dentry; @@ -60,11 +63,12 @@ static void free_pstore_private(struct pstore_private *private) } kfree(private); } +DEFINE_FREE(pstore_private, struct pstore_private *, free_pstore_private(_T)); static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) { struct pstore_private *ps = s->private; - struct pstore_ftrace_seq_data *data; + struct pstore_ftrace_seq_data *data __free(kfree) = NULL; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) @@ -72,13 +76,10 @@ static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) data->off = ps->total_size % REC_SIZE; data->off += *pos * REC_SIZE; - if (data->off + REC_SIZE > ps->total_size) { - kfree(data); + if (data->off + REC_SIZE > ps->total_size) return NULL; - } - - return data; + return_ptr(data); } static void pstore_ftrace_seq_stop(struct seq_file *s, void *v) @@ -182,25 +183,21 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = d_inode(dentry)->i_private; struct pstore_record *record = p->record; - int rc = 0; if (!record->psi->erase) return -EPERM; /* Make sure we can't race while removing this file. 
*/ - mutex_lock(&records_list_lock); - if (!list_empty(&p->list)) - list_del_init(&p->list); - else - rc = -ENOENT; - p->dentry = NULL; - mutex_unlock(&records_list_lock); - if (rc) - return rc; - - mutex_lock(&record->psi->read_mutex); - record->psi->erase(record); - mutex_unlock(&record->psi->read_mutex); + scoped_guard(mutex, &records_list_lock) { + if (!list_empty(&p->list)) + list_del_init(&p->list); + else + return -ENOENT; + p->dentry = NULL; + } + + scoped_guard(mutex, &record->psi->read_mutex) + record->psi->erase(record); return simple_unlink(dir, dentry); } @@ -292,19 +289,16 @@ static struct dentry *psinfo_lock_root(void) { struct dentry *root; - mutex_lock(&pstore_sb_lock); + guard(mutex)(&pstore_sb_lock); /* * Having no backend is fine -- no records appear. * Not being mounted is fine -- nothing to do. */ - if (!psinfo || !pstore_sb) { - mutex_unlock(&pstore_sb_lock); + if (!psinfo || !pstore_sb) return NULL; - } root = pstore_sb->s_root; inode_lock(d_inode(root)); - mutex_unlock(&pstore_sb_lock); return root; } @@ -319,19 +313,19 @@ int pstore_put_backend_records(struct pstore_info *psi) if (!root) return 0; - mutex_lock(&records_list_lock); - list_for_each_entry_safe(pos, tmp, &records_list, list) { - if (pos->record->psi == psi) { - list_del_init(&pos->list); - rc = simple_unlink(d_inode(root), pos->dentry); - if (WARN_ON(rc)) - break; - d_drop(pos->dentry); - dput(pos->dentry); - pos->dentry = NULL; + scoped_guard(mutex, &records_list_lock) { + list_for_each_entry_safe(pos, tmp, &records_list, list) { + if (pos->record->psi == psi) { + list_del_init(&pos->list); + rc = simple_unlink(d_inode(root), pos->dentry); + if (WARN_ON(rc)) + break; + d_drop(pos->dentry); + dput(pos->dentry); + pos->dentry = NULL; + } } } - mutex_unlock(&records_list_lock); inode_unlock(d_inode(root)); @@ -346,29 +340,27 @@ int pstore_put_backend_records(struct pstore_info *psi) int pstore_mkfile(struct dentry *root, struct pstore_record *record) { struct dentry *dentry; - struct inode *inode; - int rc = 0; + struct inode *inode __free(pstore_iput) = NULL; char name[PSTORE_NAMELEN]; - struct pstore_private *private, *pos; + struct pstore_private *private __free(pstore_private) = NULL, *pos; size_t size = record->size + record->ecc_notice_size; if (WARN_ON(!inode_is_locked(d_inode(root)))) return -EINVAL; - rc = -EEXIST; + guard(mutex)(&records_list_lock); + /* Skip records that are already present in the filesystem. 
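The pstore rework in these hunks leans on the scope-based cleanup helpers from <linux/cleanup.h>: guard()/scoped_guard() release a lock when the enclosing scope ends, while __free(), return_ptr() and no_free_ptr() tie an allocation's lifetime to its pointer variable. A small sketch of the idiom with invented example_* names:

#include <linux/cleanup.h>
#include <linux/mutex.h>
#include <linux/slab.h>

static DEFINE_MUTEX(example_lock);

struct example_rec {
	int id;
};

/* Returns a new record or NULL; no explicit unlock or kfree on error paths. */
static struct example_rec *example_alloc(int id)
{
	struct example_rec *rec __free(kfree) = kzalloc(sizeof(*rec), GFP_KERNEL);

	if (!rec)
		return NULL;

	guard(mutex)(&example_lock);	/* dropped automatically on every return */

	if (id < 0)
		return NULL;		/* rec is kfree()d automatically here */

	rec->id = id;
	return_ptr(rec);		/* suppresses the auto-kfree, hands rec to the caller */
}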
*/ - mutex_lock(&records_list_lock); list_for_each_entry(pos, &records_list, list) { if (pos->record->type == record->type && pos->record->id == record->id && pos->record->psi == record->psi) - goto fail; + return -EEXIST; } - rc = -ENOMEM; inode = pstore_get_inode(root->d_sb); if (!inode) - goto fail; + return -ENOMEM; inode->i_mode = S_IFREG | 0444; inode->i_fop = &pstore_file_operations; scnprintf(name, sizeof(name), "%s-%s-%llu%s", @@ -378,11 +370,11 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) private = kzalloc(sizeof(*private), GFP_KERNEL); if (!private) - goto fail_inode; + return -ENOMEM; dentry = d_alloc_name(root, name); if (!dentry) - goto fail_private; + return -ENOMEM; private->dentry = dentry; private->record = record; @@ -393,20 +385,11 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) inode_set_mtime_to_ts(inode, inode_set_ctime_to_ts(inode, record->time)); - d_add(dentry, inode); + d_add(dentry, no_free_ptr(inode)); - list_add(&private->list, &records_list); - mutex_unlock(&records_list_lock); + list_add(&(no_free_ptr(private))->list, &records_list); return 0; - -fail_private: - free_pstore_private(private); -fail_inode: - iput(inode); -fail: - mutex_unlock(&records_list_lock); - return rc; } /* @@ -451,9 +434,8 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) return -ENOMEM; - mutex_lock(&pstore_sb_lock); - pstore_sb = sb; - mutex_unlock(&pstore_sb_lock); + scoped_guard(mutex, &pstore_sb_lock) + pstore_sb = sb; pstore_get_records(0); @@ -468,17 +450,14 @@ static struct dentry *pstore_mount(struct file_system_type *fs_type, static void pstore_kill_sb(struct super_block *sb) { - mutex_lock(&pstore_sb_lock); + guard(mutex)(&pstore_sb_lock); WARN_ON(pstore_sb && pstore_sb != sb); kill_litter_super(sb); pstore_sb = NULL; - mutex_lock(&records_list_lock); + guard(mutex)(&records_list_lock); INIT_LIST_HEAD(&records_list); - mutex_unlock(&records_list_lock); - - mutex_unlock(&pstore_sb_lock); } static struct file_system_type pstore_fs_type = { diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index d36702c7ab3c..88b34fdbf759 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -529,6 +529,7 @@ static int ramoops_init_przs(const char *name, } zone_sz = mem_sz / *cnt; + zone_sz = ALIGN_DOWN(zone_sz, 2); if (!zone_sz) { dev_err(dev, "%s zone size == 0\n", name); goto fail; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 650e437b55e6..f1848cdd6d34 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -190,7 +190,7 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, { int numerr; struct persistent_ram_buffer *buffer = prz->buffer; - int ecc_blocks; + size_t ecc_blocks; size_t ecc_total; if (!ecc_info || !ecc_info->ecc_size) diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 66645a5a35f3..42a529e26bd6 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -15,43 +15,6 @@ #include <linux/buffer_head.h> #include "qnx4.h" -/* - * A qnx4 directory entry is an inode entry or link info - * depending on the status field in the last byte. The - * first byte is where the name start either way, and a - * zero means it's empty. - * - * Also, due to a bug in gcc, we don't want to use the - * real (differently sized) name arrays in the inode and - * link entries, but always the 'de_name[]' one in the - * fake struct entry. 
- * - * See - * - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99578#c6 - * - * for details, but basically gcc will take the size of the - * 'name' array from one of the used union entries randomly. - * - * This use of 'de_name[]' (48 bytes) avoids the false positive - * warnings that would happen if gcc decides to use 'inode.di_name' - * (16 bytes) even when the pointer and size were to come from - * 'link.dl_name' (48 bytes). - * - * In all cases the actual name pointer itself is the same, it's - * only the gcc internal 'what is the size of this field' logic - * that can get confused. - */ -union qnx4_directory_entry { - struct { - const char de_name[48]; - u8 de_pad[15]; - u8 de_status; - }; - struct qnx4_inode_entry inode; - struct qnx4_link_info link; -}; - static int qnx4_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); @@ -74,26 +37,25 @@ static int qnx4_readdir(struct file *file, struct dir_context *ctx) ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) { union qnx4_directory_entry *de; + const char *fname; offset = ix * QNX4_DIR_ENTRY_SIZE; de = (union qnx4_directory_entry *) (bh->b_data + offset); - if (!de->de_name[0]) - continue; - if (!(de->de_status & (QNX4_FILE_USED|QNX4_FILE_LINK))) + fname = get_entry_fname(de, &size); + if (!fname) continue; + if (!(de->de_status & QNX4_FILE_LINK)) { - size = sizeof(de->inode.di_fname); ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; } else { - size = sizeof(de->link.dl_fname); ino = ( le32_to_cpu(de->link.dl_inode_blk) - 1 ) * QNX4_INODES_PER_BLOCK + de->link.dl_inode_ndx; } - size = strnlen(de->de_name, size); - QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, name)); - if (!dir_emit(ctx, de->de_name, size, ino, DT_UNKNOWN)) { + + QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, fname)); + if (!dir_emit(ctx, fname, size, ino, DT_UNKNOWN)) { brelse(bh); return 0; } diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index 8d72221735d7..bb8db6550ca5 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -26,31 +26,24 @@ static int qnx4_match(int len, const char *name, struct buffer_head *bh, unsigned long *offset) { - struct qnx4_inode_entry *de; - int namelen, thislen; + union qnx4_directory_entry *de; + const char *fname; + int fnamelen; if (bh == NULL) { printk(KERN_WARNING "qnx4: matching unassigned buffer !\n"); return 0; } - de = (struct qnx4_inode_entry *) (bh->b_data + *offset); + de = (union qnx4_directory_entry *) (bh->b_data + *offset); *offset += QNX4_DIR_ENTRY_SIZE; - if ((de->di_status & QNX4_FILE_LINK) != 0) { - namelen = QNX4_NAME_MAX; - } else { - namelen = QNX4_SHORT_NAME_MAX; - } - thislen = strlen( de->di_fname ); - if ( thislen > namelen ) - thislen = namelen; - if (len != thislen) { + + fname = get_entry_fname(de, &fnamelen); + if (!fname || len != fnamelen) return 0; - } - if (strncmp(name, de->di_fname, len) == 0) { - if ((de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)) != 0) { - return 1; - } - } + + if (strncmp(name, fname, len) == 0) + return 1; + return 0; } diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h index 6283705466a4..5c2b1fb6b952 100644 --- a/fs/qnx4/qnx4.h +++ b/fs/qnx4/qnx4.h @@ -44,3 +44,63 @@ static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode) { return &qnx4_i(inode)->raw; } + +/* + * A qnx4 directory entry is an inode entry or link info + * depending on the status field in the last byte. 
The + * first byte is where the name start either way, and a + * zero means it's empty. + * + * Also, due to a bug in gcc, we don't want to use the + * real (differently sized) name arrays in the inode and + * link entries, but always the 'de_name[]' one in the + * fake struct entry. + * + * See + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99578#c6 + * + * for details, but basically gcc will take the size of the + * 'name' array from one of the used union entries randomly. + * + * This use of 'de_name[]' (48 bytes) avoids the false positive + * warnings that would happen if gcc decides to use 'inode.di_name' + * (16 bytes) even when the pointer and size were to come from + * 'link.dl_name' (48 bytes). + * + * In all cases the actual name pointer itself is the same, it's + * only the gcc internal 'what is the size of this field' logic + * that can get confused. + */ +union qnx4_directory_entry { + struct { + const char de_name[48]; + u8 de_pad[15]; + u8 de_status; + }; + struct qnx4_inode_entry inode; + struct qnx4_link_info link; +}; + +static inline const char *get_entry_fname(union qnx4_directory_entry *de, + int *size) +{ + /* Make sure the status byte is in the same place for all structs. */ + BUILD_BUG_ON(offsetof(struct qnx4_inode_entry, di_status) != + offsetof(struct qnx4_link_info, dl_status)); + BUILD_BUG_ON(offsetof(struct qnx4_inode_entry, di_status) != + offsetof(union qnx4_directory_entry, de_status)); + + if (!de->de_name[0]) + return NULL; + if (!(de->de_status & (QNX4_FILE_USED|QNX4_FILE_LINK))) + return NULL; + if (!(de->de_status & QNX4_FILE_LINK)) + *size = sizeof(de->inode.di_fname); + else + *size = sizeof(de->link.dl_fname); + + *size = strnlen(de->de_name, *size); + + return de->de_name; +} diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 58b5de081b57..1f0c754416b6 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1787,7 +1787,7 @@ EXPORT_SYMBOL(dquot_alloc_inode); /* * Convert in-memory reserved quotas to real consumed quotas */ -int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) +void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { struct dquot **dquots; int cnt, index; @@ -1797,7 +1797,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); - return 0; + return; } dquots = i_dquot(inode); @@ -1822,7 +1822,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); - return 0; + return; } EXPORT_SYMBOL(dquot_claim_space_nodirty); @@ -2969,7 +2969,6 @@ static struct ctl_table fs_dqstats_table[] = { .proc_handler = proc_dointvec, }, #endif - { }, }; static int __init dquot_init(void) diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index efb1b4c1a0a4..7a6d980e614d 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) /* make various checks */ order = get_order(newsize); - if (unlikely(order > MAX_ORDER)) + if (unlikely(order > MAX_PAGE_ORDER)) return -EFBIG; ret = inode_newsize_ok(inode, newsize); diff --git a/fs/read_write.c b/fs/read_write.c index 4771701c896b..d4c036e82b6c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -354,6 +354,9 @@ out_putf: int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { + int mask = read_write == READ ? 
MAY_READ : MAY_WRITE; + int ret; + if (unlikely((ssize_t) count < 0)) return -EINVAL; @@ -371,8 +374,11 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t } } - return security_file_permission(file, - read_write == READ ? MAY_READ : MAY_WRITE); + ret = security_file_permission(file, mask); + if (ret) + return ret; + + return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); @@ -773,12 +779,14 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, return ret; } -static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, - loff_t *pos, rwf_t flags) +ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; + if (!file->f_op->read_iter) + return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) @@ -787,22 +795,20 @@ static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, tot_len = iov_iter_count(iter); if (!tot_len) goto out; - ret = rw_verify_area(READ, file, pos, tot_len); + ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; - if (file->f_op->read_iter) - ret = do_iter_readv_writev(file, iter, pos, READ, flags); - else - ret = do_loop_readv_writev(file, iter, pos, READ, flags); + ret = call_read_iter(file, iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } +EXPORT_SYMBOL(vfs_iocb_iter_read); -ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, - struct iov_iter *iter) +ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, + rwf_t flags) { size_t tot_len; ssize_t ret = 0; @@ -817,33 +823,30 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, tot_len = iov_iter_count(iter); if (!tot_len) goto out; - ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); + ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; - ret = call_read_iter(file, iocb, iter); + ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } -EXPORT_SYMBOL(vfs_iocb_iter_read); - -ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, - rwf_t flags) -{ - if (!file->f_op->read_iter) - return -EINVAL; - return do_iter_read(file, iter, ppos, flags); -} EXPORT_SYMBOL(vfs_iter_read); -static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, - loff_t *pos, rwf_t flags) +/* + * Caller is responsible for calling kiocb_end_write() on completion + * if async iocb was queued. 
+ */ +ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; + if (!file->f_op->write_iter) + return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) @@ -852,88 +855,127 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, tot_len = iov_iter_count(iter); if (!tot_len) return 0; - ret = rw_verify_area(WRITE, file, pos, tot_len); + ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; - if (file->f_op->write_iter) - ret = do_iter_readv_writev(file, iter, pos, WRITE, flags); - else - ret = do_loop_readv_writev(file, iter, pos, WRITE, flags); + kiocb_start_write(iocb); + ret = call_write_iter(file, iocb, iter); + if (ret != -EIOCBQUEUED) + kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); + return ret; } +EXPORT_SYMBOL(vfs_iocb_iter_write); -ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, - struct iov_iter *iter) +ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, + rwf_t flags) { size_t tot_len; - ssize_t ret = 0; + ssize_t ret; - if (!file->f_op->write_iter) - return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; + if (!file->f_op->write_iter) + return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; - ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); + + ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; - ret = call_write_iter(file, iocb, iter); + file_start_write(file); + ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); + file_end_write(file); return ret; } -EXPORT_SYMBOL(vfs_iocb_iter_write); - -ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, - rwf_t flags) -{ - if (!file->f_op->write_iter) - return -EINVAL; - return do_iter_write(file, iter, ppos, flags); -} EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - ssize_t ret; + size_t tot_len; + ssize_t ret = 0; - ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); - if (ret >= 0) { - ret = do_iter_read(file, &iter, pos, flags); - kfree(iov); - } + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + + ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, + &iter); + if (ret < 0) + return ret; + + tot_len = iov_iter_count(&iter); + if (!tot_len) + goto out; + ret = rw_verify_area(READ, file, pos, tot_len); + if (ret < 0) + goto out; + + if (file->f_op->read_iter) + ret = do_iter_readv_writev(file, &iter, pos, READ, flags); + else + ret = do_loop_readv_writev(file, &iter, pos, READ, flags); +out: + if (ret >= 0) + fsnotify_access(file); + kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - ssize_t ret; + size_t tot_len; + ssize_t ret = 0; - ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); - if (ret >= 0) { - 
file_start_write(file); - ret = do_iter_write(file, &iter, pos, flags); - file_end_write(file); - kfree(iov); - } + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + + ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, + &iter); + if (ret < 0) + return ret; + + tot_len = iov_iter_count(&iter); + if (!tot_len) + goto out; + + ret = rw_verify_area(WRITE, file, pos, tot_len); + if (ret < 0) + goto out; + + file_start_write(file); + if (file->f_op->write_iter) + ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); + else + ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); + if (ret > 0) + fsnotify_modify(file); + file_end_write(file); +out: + kfree(iov); return ret; } @@ -1178,7 +1220,7 @@ COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, - size_t count, loff_t max) + size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; @@ -1250,10 +1292,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; - file_start_write(out.file); retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); - file_end_write(out.file); } else { if (out.file->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; @@ -1362,38 +1402,6 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, } #endif -/** - * generic_copy_file_range - copy data between two files - * @file_in: file structure to read from - * @pos_in: file offset to read from - * @file_out: file structure to write data to - * @pos_out: file offset to write data to - * @len: amount of data to copy - * @flags: copy flags - * - * This is a generic filesystem helper to copy data from one file to another. - * It has no constraints on the source or destination file owners - the files - * can belong to different superblocks and different filesystem types. Short - * copies are allowed. - * - * This should be called from the @file_out filesystem, as per the - * ->copy_file_range() method. - * - * Returns the number of bytes copied or a negative error indicating the - * failure. - */ - -ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags) -{ - lockdep_assert(sb_write_started(file_inode(file_out)->i_sb)); - - return do_splice_direct(file_in, &pos_in, file_out, &pos_out, - len > MAX_RW_COUNT ? 
MAX_RW_COUNT : len, 0); -} -EXPORT_SYMBOL(generic_copy_file_range); - /* * Performs necessary checks before doing a file copy * @@ -1478,6 +1486,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; + bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; @@ -1509,19 +1518,24 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); - goto done; - } - - if (!splice && file_in->f_op->remap_file_range && - file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) { + } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); - if (ret > 0) - goto done; + /* fallback to splice */ + if (ret <= 0) + splice = true; + } else if (samesb) { + /* Fallback to splice for same sb copy for backward compat */ + splice = true; } + file_end_write(file_out); + + if (!splice) + goto done; + /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in @@ -1533,11 +1547,16 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * - * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE. + * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE + * for server-side-copy between any two sb. + * + * In any case, we call do_splice_direct() and not splice_file_range(), + * without file_start_write() held, to avoid possible deadlocks related + * to splicing from input file, while file_start_write() is held on + * the output file on a different sb. */ - ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, - flags); - + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + min_t(size_t, len, MAX_RW_COUNT), 0); done: if (ret > 0) { fsnotify_access(file_in); @@ -1549,8 +1568,6 @@ done: inc_syscr(current); inc_syscw(current); - file_end_write(file_out); - return ret; } EXPORT_SYMBOL(vfs_copy_file_range); diff --git a/fs/readdir.c b/fs/readdir.c index c8c46e294431..278bc0254732 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -96,6 +96,10 @@ int iterate_dir(struct file *file, struct dir_context *ctx) if (res) goto out; + res = fsnotify_file_perm(file, MAY_READ); + if (res) + goto out; + res = down_read_killable(&inode->i_rwsem); if (res) goto out; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 994d6e6995ab..7e7b531fcc49 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -451,13 +451,6 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - /* cannot allow items to be added into a busy deleted directory */ - if (!namelen) - return -EINVAL; - - if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) - return -ENAMETOOLONG; - /* each entry has unique key. 
compose it */ make_cpu_key(&entry_key, dir, get_third_component(dir->i_sb, name, namelen), @@ -1324,8 +1317,8 @@ static int reiserfs_rename(struct mnt_idmap *idmap, struct inode *old_inode, *new_dentry_inode; struct reiserfs_transaction_handle th; int jbegin_count; - umode_t old_inode_mode; unsigned long savelink = 1; + bool update_dir_parent = false; if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -1375,8 +1368,7 @@ static int reiserfs_rename(struct mnt_idmap *idmap, return -ENOENT; } - old_inode_mode = old_inode->i_mode; - if (S_ISDIR(old_inode_mode)) { + if (S_ISDIR(old_inode->i_mode)) { /* * make sure that directory being renamed has correct ".." * and that its new parent directory has not too many links @@ -1389,24 +1381,28 @@ static int reiserfs_rename(struct mnt_idmap *idmap, } } - /* - * directory is renamed, its parent directory will be changed, - * so find ".." entry - */ - dot_dot_de.de_gen_number_bit_string = NULL; - retval = - reiserfs_find_entry(old_inode, "..", 2, &dot_dot_entry_path, + if (old_dir != new_dir) { + /* + * directory is renamed, its parent directory will be + * changed, so find ".." entry + */ + dot_dot_de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(old_inode, "..", 2, + &dot_dot_entry_path, &dot_dot_de); - pathrelse(&dot_dot_entry_path); - if (retval != NAME_FOUND) { - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; - } + pathrelse(&dot_dot_entry_path); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } - /* inode number of .. must equal old_dir->i_ino */ - if (dot_dot_de.de_objectid != old_dir->i_ino) { - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; + /* inode number of .. must equal old_dir->i_ino */ + if (dot_dot_de.de_objectid != old_dir->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + update_dir_parent = true; } } @@ -1486,7 +1482,7 @@ static int reiserfs_rename(struct mnt_idmap *idmap, reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1); - if (S_ISDIR(old_inode->i_mode)) { + if (update_dir_parent) { if ((retval = search_by_entry_key(new_dir->i_sb, &dot_dot_de.de_entry_key, @@ -1534,14 +1530,14 @@ static int reiserfs_rename(struct mnt_idmap *idmap, new_de.de_bh); reiserfs_restore_prepared_buffer(old_inode->i_sb, old_de.de_bh); - if (S_ISDIR(old_inode_mode)) + if (update_dir_parent) reiserfs_restore_prepared_buffer(old_inode-> i_sb, dot_dot_de. de_bh); continue; } - if (S_ISDIR(old_inode_mode)) { + if (update_dir_parent) { if (item_moved(&dot_dot_ih, &dot_dot_entry_path) || !entry_points_to_object("..", 2, &dot_dot_de, old_dir)) { @@ -1559,7 +1555,7 @@ static int reiserfs_rename(struct mnt_idmap *idmap, } } - RFALSE(S_ISDIR(old_inode_mode) && + RFALSE(update_dir_parent && !buffer_journal_prepared(dot_dot_de.de_bh), ""); break; @@ -1592,11 +1588,12 @@ static int reiserfs_rename(struct mnt_idmap *idmap, savelink = new_dentry_inode->i_nlink; } - if (S_ISDIR(old_inode_mode)) { + if (update_dir_parent) { /* adjust ".." of renamed directory */ set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir)); journal_mark_dirty(&th, dot_dot_de.de_bh); - + } + if (S_ISDIR(old_inode->i_mode)) { /* * there (in new_dir) was no directory, so it got new link * (".." 
of renamed directory) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 2138ee7d271d..5faf702f8d15 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1407,7 +1407,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, INITIALIZE_PATH(path); int item_len = 0; int tb_init = 0; - struct cpu_key cpu_key; + struct cpu_key cpu_key = {}; int retval; int quota_cut_bytes = 0; diff --git a/fs/remap_range.c b/fs/remap_range.c index 87ae4f0dc3aa..f8c1120b8311 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -102,7 +102,9 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, static int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { + int mask = write ? MAY_WRITE : MAY_READ; loff_t tmp; + int ret; if (unlikely(pos < 0 || len < 0)) return -EINVAL; @@ -110,7 +112,11 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len, if (unlikely(check_add_overflow(pos, len, &tmp))) return -EINVAL; - return security_file_permission(file, write ? MAY_WRITE : MAY_READ); + ret = security_file_permission(file, mask); + if (ret) + return ret; + + return fsnotify_file_area_perm(file, mask, &pos, len); } /* @@ -385,14 +391,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; - ret = remap_verify_area(file_in, pos_in, len, false); - if (ret) - return ret; - - ret = remap_verify_area(file_out, pos_out, len, true); - if (ret) - return ret; - ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, len, remap_flags); if (ret < 0) @@ -410,6 +408,14 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, { loff_t ret; + ret = remap_verify_area(file_in, pos_in, len, false); + if (ret) + return ret; + + ret = remap_verify_area(file_out, pos_out, len, true); + if (ret) + return ret; + file_start_write(file_out); ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, remap_flags); @@ -420,7 +426,7 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, EXPORT_SYMBOL(vfs_clone_file_range); /* Check whether we are allowed to dedupe the destination file */ -static bool allow_file_dedupe(struct file *file) +static bool may_dedupe_file(struct file *file) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); @@ -445,24 +451,29 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)); - ret = mnt_want_write_file(dst_file); - if (ret) - return ret; - /* * This is redundant if called from vfs_dedupe_file_range(), but other * callers need it and it's not performance sesitive... */ ret = remap_verify_area(src_file, src_pos, len, false); if (ret) - goto out_drop_write; + return ret; ret = remap_verify_area(dst_file, dst_pos, len, true); if (ret) - goto out_drop_write; + return ret; + + /* + * This needs to be called after remap_verify_area() because of + * sb_start_write() and before may_dedupe_file() because the mount's + * MAY_WRITE need to be checked with mnt_get_write_access_file() held. 
+ */ + ret = mnt_want_write_file(dst_file); + if (ret) + return ret; ret = -EPERM; - if (!allow_file_dedupe(dst_file)) + if (!may_dedupe_file(dst_file)) goto out_drop_write; ret = -EXDEV; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 2131638f26d0..99b0ade833aa 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -25,6 +25,7 @@ #include <linux/freezer.h> #include <linux/namei.h> #include <linux/random.h> +#include <linux/splice.h> #include <linux/uuid.h> #include <linux/xattr.h> #include <uapi/linux/magic.h> @@ -1506,8 +1507,8 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off, free_xid(xid); if (rc == -EOPNOTSUPP || rc == -EXDEV) - rc = generic_copy_file_range(src_file, off, dst_file, - destoff, len, flags); + rc = splice_copy_file_range(src_file, off, dst_file, + destoff, len); return rc; } diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 3adea10aa9da..685f7d1139c6 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -152,6 +152,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 46 -#define CIFS_VERSION "2.46" +#define SMB3_PRODUCT_BUILD 47 +#define CIFS_VERSION "2.47" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 5e32c79f03a7..879d5ef8a66e 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -192,6 +192,11 @@ struct cifs_open_info_data { bool symlink; }; struct { + /* ioctl response buffer */ + struct { + int buftype; + struct kvec iov; + } io; __u32 tag; union { struct reparse_data_buffer *buf; @@ -205,13 +210,17 @@ struct cifs_open_info_data { }; }; -#define cifs_open_data_reparse(d) \ - ((d)->reparse_point || \ - (le32_to_cpu((d)->fi.Attributes) & ATTR_REPARSE)) - -static inline void cifs_free_open_info(struct cifs_open_info_data *data) +static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data) { - kfree(data->symlink_target); + struct smb2_file_all_info *fi = &data->fi; + u32 attrs = le32_to_cpu(fi->Attributes); + bool ret; + + ret = data->reparse_point || (attrs & ATTR_REPARSE); + if (ret) + attrs |= ATTR_REPARSE; + fi->Attributes = cpu_to_le32(attrs); + return ret; } /* @@ -390,12 +399,17 @@ struct smb_version_operations { int (*rename_pending_delete)(const char *, struct dentry *, const unsigned int); /* send rename request */ - int (*rename)(const unsigned int, struct cifs_tcon *, const char *, - const char *, struct cifs_sb_info *); + int (*rename)(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); /* send create hardlink request */ - int (*create_hardlink)(const unsigned int, struct cifs_tcon *, - const char *, const char *, - struct cifs_sb_info *); + int (*create_hardlink)(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); /* query symlink target */ int (*query_symlink)(const unsigned int xid, struct cifs_tcon *tcon, @@ -560,6 +574,12 @@ struct smb_version_operations { int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb, struct kvec *rsp_iov, struct cifs_open_info_data *data); + int (*create_reparse_symlink)(const unsigned int xid, + struct inode *inode, + struct dentry *dentry, + struct cifs_tcon *tcon, + const char *full_path, + 
const char *symname); }; struct smb_version_values { @@ -1545,6 +1565,7 @@ struct cifsInodeInfo { spinlock_t deferred_lock; /* protection on deferred list */ bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */ char *symlink_target; + __u32 reparse_tag; }; static inline struct cifsInodeInfo * @@ -2238,8 +2259,8 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable, struct smb2_compound_vars { struct cifs_open_parms oparms; - struct kvec rsp_iov[3]; - struct smb_rqst rqst[3]; + struct kvec rsp_iov[MAX_COMPOUND]; + struct smb_rqst rqst[MAX_COMPOUND]; struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; struct kvec qi_iov; struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 46feaa0880bd..a841bf4967fa 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -211,8 +211,12 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct cifs_open_info_data *data); -extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path, - struct super_block *sb, unsigned int xid); + +extern int smb311_posix_get_inode_info(struct inode **inode, + const char *full_path, + struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid); extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, unsigned int xid); @@ -435,16 +439,19 @@ extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon, int remap_special_chars); extern int CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); -extern int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb); +int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); extern int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *tcon, int netfid, const char *target_name, const struct nls_table *nls_codepage, int remap_special_chars); -extern int CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb); +int CIFSCreateHardLink(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); extern int CIFSUnixCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, @@ -649,7 +656,7 @@ cifs_chan_is_iface_active(struct cifs_ses *ses, struct TCP_Server_Info *server); void cifs_disable_secondary_channels(struct cifs_ses *ses); -int +void cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server); int SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount); @@ -760,4 +767,11 @@ static inline void release_mid(struct mid_q_entry *mid) kref_put(&mid->refcount, __release_mid); } +static inline void cifs_free_open_info(struct cifs_open_info_data *data) +{ + kfree(data->symlink_target); + free_rsp_buf(data->reparse.io.buftype, data->reparse.io.iov.iov_base); + memset(data, 0, sizeof(*data)); +} + #endif /* _CIFSPROTO_H */ diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 
9ee348e6d106..01e89070df5a 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -2149,10 +2149,10 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) return rc; } -int -CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb) +int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) { int rc = 0; RENAME_REQ *pSMB = NULL; @@ -2530,10 +2530,11 @@ createHardLinkRetry: return rc; } -int -CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb) +int CIFSCreateHardLink(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) { int rc = 0; NT_RENAME_REQ *pSMB = NULL; @@ -2699,11 +2700,12 @@ int cifs_query_reparse_point(const unsigned int xid, u32 *tag, struct kvec *rsp, int *rsp_buftype) { + struct reparse_data_buffer *buf; struct cifs_open_parms oparms; TRANSACT_IOCTL_REQ *io_req = NULL; TRANSACT_IOCTL_RSP *io_rsp = NULL; struct cifs_fid fid; - __u32 data_offset, data_count; + __u32 data_offset, data_count, len; __u8 *start, *end; int io_rsp_len; int oplock = 0; @@ -2773,7 +2775,16 @@ int cifs_query_reparse_point(const unsigned int xid, goto error; } - *tag = le32_to_cpu(((struct reparse_data_buffer *)start)->ReparseTag); + data_count = le16_to_cpu(io_rsp->ByteCount); + buf = (struct reparse_data_buffer *)start; + len = sizeof(*buf); + if (data_count < len || + data_count < le16_to_cpu(buf->ReparseDataLength) + len) { + rc = -EIO; + goto error; + } + + *tag = le32_to_cpu(buf->ReparseTag); rsp->iov_base = io_rsp; rsp->iov_len = io_rsp_len; *rsp_buftype = CIFS_LARGE_BUFFER; diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index dc9b95ca71e6..3052a208c6ca 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -483,6 +483,7 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_ static int reconnect_dfs_server(struct TCP_Server_Info *server) { struct dfs_cache_tgt_iterator *target_hint = NULL; + DFS_CACHE_TGT_LIST(tl); int num_targets = 0; int rc = 0; @@ -745,6 +746,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, { struct msghdr smb_msg = {}; struct kvec iov = {.iov_base = buf, .iov_len = to_read}; + iov_iter_kvec(&smb_msg.msg_iter, ITER_DEST, &iov, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); @@ -1400,11 +1402,13 @@ cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs) case AF_INET: { struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr; struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs; + return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr); } case AF_INET6: { struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs; + return (ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr) && saddr6->sin6_scope_id == vaddr6->sin6_scope_id); } @@ -2599,8 +2603,8 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) rc = -EOPNOTSUPP; goto out_fail; } else { - cifs_dbg(VFS, "Check vers= mount option. SMB3.11 " - "disabled but required for POSIX extensions\n"); + cifs_dbg(VFS, + "Check vers= mount option. 
SMB3.11 disabled but required for POSIX extensions\n"); rc = -EOPNOTSUPP; goto out_fail; } @@ -2743,7 +2747,6 @@ cifs_put_tlink(struct tcon_link *tlink) if (!IS_ERR(tlink_tcon(tlink))) cifs_put_tcon(tlink_tcon(tlink)); kfree(tlink); - return; } static int @@ -2884,6 +2887,7 @@ static inline void cifs_reclassify_socket4(struct socket *sock) { struct sock *sk = sock->sk; + BUG_ON(!sock_allow_reclassification(sk)); sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS", &cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]); @@ -2893,6 +2897,7 @@ static inline void cifs_reclassify_socket6(struct socket *sock) { struct sock *sk = sock->sk; + BUG_ON(!sock_allow_reclassification(sk)); sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS", &cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]); @@ -2927,15 +2932,18 @@ static int bind_socket(struct TCP_Server_Info *server) { int rc = 0; + if (server->srcaddr.ss_family != AF_UNSPEC) { /* Bind to the specified local IP address */ struct socket *socket = server->ssocket; + rc = kernel_bind(socket, (struct sockaddr *) &server->srcaddr, sizeof(server->srcaddr)); if (rc < 0) { struct sockaddr_in *saddr4; struct sockaddr_in6 *saddr6; + saddr4 = (struct sockaddr_in *)&server->srcaddr; saddr6 = (struct sockaddr_in6 *)&server->srcaddr; if (saddr6->sin6_family == AF_INET6) @@ -3165,6 +3173,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, if (!CIFSSMBQFSUnixInfo(xid, tcon)) { __u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + cifs_dbg(FYI, "unix caps which server supports %lld\n", cap); /* * check for reconnect case in which we do not @@ -3668,7 +3677,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, smb_buffer_response = smb_buffer; header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, - NULL /*no tid */ , 4 /*wct */ ); + NULL /*no tid */, 4 /*wct */); smb_buffer->Mid = get_next_mid(ses->server); smb_buffer->Uid = ses->Suid; @@ -3687,12 +3696,12 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, if (ses->server->sign) smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; - if (ses->capabilities & CAP_STATUS32) { + if (ses->capabilities & CAP_STATUS32) smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS; - } - if (ses->capabilities & CAP_DFS) { + + if (ses->capabilities & CAP_DFS) smb_buffer->Flags2 |= SMBFLG2_DFS; - } + if (ses->capabilities & CAP_UNICODE) { smb_buffer->Flags2 |= SMBFLG2_UNICODE; length = diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index 580a27a3a7e6..89333d9bce36 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -680,9 +680,10 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, full_path, d_inode(direntry)); again: - if (pTcon->posix_extensions) - rc = smb311_posix_get_inode_info(&newInode, full_path, parent_dir_inode->i_sb, xid); - else if (pTcon->unix_ext) { + if (pTcon->posix_extensions) { + rc = smb311_posix_get_inode_info(&newInode, full_path, NULL, + parent_dir_inode->i_sb, xid); + } else if (pTcon->unix_ext) { rc = cifs_get_inode_info_unix(&newInode, full_path, parent_dir_inode->i_sb, xid); } else { diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 32a8525415d9..1b4262aff8fa 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -1020,14 +1020,16 @@ reopen_success: if (!is_interrupt_error(rc)) mapping_set_error(inode->i_mapping, rc); - if (tcon->posix_extensions) - rc = smb311_posix_get_inode_info(&inode, full_path, inode->i_sb, xid); - else if (tcon->unix_ext) + if (tcon->posix_extensions) { + rc = 
smb311_posix_get_inode_info(&inode, full_path, + NULL, inode->i_sb, xid); + } else if (tcon->unix_ext) { rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, xid); - else + } else { rc = cifs_get_inode_info(&inode, full_path, NULL, inode->i_sb, xid, NULL); + } } /* * Else we are writing out data to server already and could deadlock if @@ -2706,8 +2708,7 @@ static void cifs_extend_writeback(struct address_space *mapping, */ if (!folio_clear_dirty_for_io(folio)) WARN_ON(1); - if (folio_start_writeback(folio)) - WARN_ON(1); + folio_start_writeback(folio); *_count -= folio_nr_pages(folio); folio_unlock(folio); @@ -2742,8 +2743,7 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping, int rc; /* The folio should be locked, dirty and not undergoing writeback. */ - if (folio_start_writeback(folio)) - WARN_ON(1); + folio_start_writeback(folio); count -= folio_nr_pages(folio); len = folio_size(folio); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 09c5c0f5c96e..9f37c1758f73 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -182,6 +182,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_mode = fattr->cf_mode; cifs_i->cifsAttrs = fattr->cf_cifsattrs; + cifs_i->reparse_tag = fattr->cf_cifstag; if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) cifs_i->time = 0; @@ -209,7 +210,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9; } - if (S_ISLNK(fattr->cf_mode)) { + if (S_ISLNK(fattr->cf_mode) && fattr->cf_symlink_target) { kfree(cifs_i->symlink_target); cifs_i->symlink_target = fattr->cf_symlink_target; fattr->cf_symlink_target = NULL; @@ -691,29 +692,36 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj; } + /* + * The srv fs device id is overridden on network mount so setting + * @fattr->cf_rdev isn't needed here. + */ fattr->cf_eof = le64_to_cpu(info->EndOfFile); fattr->cf_bytes = le64_to_cpu(info->AllocationSize); fattr->cf_createtime = le64_to_cpu(info->CreationTime); - fattr->cf_nlink = le32_to_cpu(info->HardLinks); fattr->cf_mode = (umode_t) le32_to_cpu(info->Mode); - /* The srv fs device id is overridden on network mount so setting rdev isn't needed here */ - /* fattr->cf_rdev = le32_to_cpu(info->DeviceId); */ - if (data->symlink) { - fattr->cf_mode |= S_IFLNK; - fattr->cf_dtype = DT_LNK; - fattr->cf_symlink_target = data->symlink_target; - data->symlink_target = NULL; - } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + if (cifs_open_data_reparse(data) && + cifs_reparse_point_to_fattr(cifs_sb, fattr, data)) + goto out_reparse; + + fattr->cf_mode &= ~S_IFMT; + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode |= S_IFDIR; fattr->cf_dtype = DT_DIR; } else { /* file */ fattr->cf_mode |= S_IFREG; fattr->cf_dtype = DT_REG; } - /* else if reparse point ... 
TODO: add support for FIFO and blk dev; special file types */ +out_reparse: + if (S_ISLNK(fattr->cf_mode)) { + if (likely(data->symlink_target)) + fattr->cf_eof = strnlen(data->symlink_target, PATH_MAX); + fattr->cf_symlink_target = data->symlink_target; + data->symlink_target = NULL; + } sid_to_id(cifs_sb, owner, fattr, SIDOWNER); sid_to_id(cifs_sb, group, fattr, SIDGROUP); @@ -738,25 +746,25 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, if (tag == IO_REPARSE_TAG_NFS && buf) { switch (le64_to_cpu(buf->InodeType)) { case NFS_SPECFILE_CHR: - fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFCHR; fattr->cf_dtype = DT_CHR; fattr->cf_rdev = nfs_mkdev(buf); break; case NFS_SPECFILE_BLK: - fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFBLK; fattr->cf_dtype = DT_BLK; fattr->cf_rdev = nfs_mkdev(buf); break; case NFS_SPECFILE_FIFO: - fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFIFO; fattr->cf_dtype = DT_FIFO; break; case NFS_SPECFILE_SOCK: - fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFSOCK; fattr->cf_dtype = DT_SOCK; break; case NFS_SPECFILE_LNK: - fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; break; default: @@ -768,29 +776,29 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, switch (tag) { case IO_REPARSE_TAG_LX_SYMLINK: - fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; break; case IO_REPARSE_TAG_LX_FIFO: - fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFIFO; fattr->cf_dtype = DT_FIFO; break; case IO_REPARSE_TAG_AF_UNIX: - fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFSOCK; fattr->cf_dtype = DT_SOCK; break; case IO_REPARSE_TAG_LX_CHR: - fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFCHR; fattr->cf_dtype = DT_CHR; break; case IO_REPARSE_TAG_LX_BLK: - fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFBLK; fattr->cf_dtype = DT_BLK; break; case 0: /* SMB1 symlink */ case IO_REPARSE_TAG_SYMLINK: case IO_REPARSE_TAG_NFS: - fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; break; default: @@ -830,6 +838,7 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, fattr->cf_createtime = le64_to_cpu(info->CreationTime); fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); + fattr->cf_mode = cifs_sb->ctx->file_mode; if (cifs_open_data_reparse(data) && cifs_reparse_point_to_fattr(cifs_sb, fattr, data)) goto out_reparse; @@ -1061,7 +1070,9 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, - struct cifs_fattr *fattr) + struct cifs_fattr *fattr, + struct cifs_sid *owner, + struct cifs_sid *group) { struct TCP_Server_Info *server = tcon->ses->server; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -1076,6 +1087,9 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, &rsp_iov, &rsp_buftype); if (!rc) iov = &rsp_iov; + } else if (data->reparse.io.buftype != CIFS_NO_BUFFER && + data->reparse.io.iov.iov_base) { + iov = &data->reparse.io.iov; } rc = -EOPNOTSUPP; @@ -1092,17 +1106,22 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, rc = 0; goto out; default: - if (data->symlink_target) { + /* Check for cached reparse point data */ + if 
(data->symlink_target || data->reparse.buf) { rc = 0; - } else if (server->ops->parse_reparse_point) { + } else if (iov && server->ops->parse_reparse_point) { rc = server->ops->parse_reparse_point(cifs_sb, iov, data); } break; } - cifs_open_info_to_fattr(fattr, data, sb); + if (tcon->posix_extensions) + smb311_posix_info_to_fattr(fattr, data, owner, group, sb); + else + cifs_open_info_to_fattr(fattr, data, sb); out: + fattr->cf_cifstag = data->reparse.tag; free_rsp_buf(rsp_buftype, rsp_iov.iov_base); return rc; } @@ -1152,7 +1171,8 @@ static int cifs_get_fattr(struct cifs_open_info_data *data, */ if (cifs_open_data_reparse(data)) { rc = reparse_info_to_fattr(data, sb, xid, tcon, - full_path, fattr); + full_path, fattr, + NULL, NULL); } else { cifs_open_info_to_fattr(fattr, data, sb); } @@ -1290,18 +1310,19 @@ out: return rc; } -static int smb311_posix_get_fattr(struct cifs_fattr *fattr, +static int smb311_posix_get_fattr(struct cifs_open_info_data *data, + struct cifs_fattr *fattr, const char *full_path, struct super_block *sb, const unsigned int xid) { - struct cifs_open_info_data data = {}; + struct cifs_open_info_data tmp_data = {}; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon; struct tcon_link *tlink; struct cifs_sid owner, group; int tmprc; - int rc; + int rc = 0; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1309,12 +1330,14 @@ static int smb311_posix_get_fattr(struct cifs_fattr *fattr, tcon = tlink_tcon(tlink); /* - * 1. Fetch file metadata + * 1. Fetch file metadata if not provided (data) */ - - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, - full_path, &data, - &owner, &group); + if (!data) { + rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, + full_path, &tmp_data, + &owner, &group); + data = &tmp_data; + } /* * 2. 
Convert it to internal cifs metadata (fattr) @@ -1322,7 +1345,14 @@ static int smb311_posix_get_fattr(struct cifs_fattr *fattr, switch (rc) { case 0: - smb311_posix_info_to_fattr(fattr, &data, &owner, &group, sb); + if (cifs_open_data_reparse(data)) { + rc = reparse_info_to_fattr(data, sb, xid, tcon, + full_path, fattr, + &owner, &group); + } else { + smb311_posix_info_to_fattr(fattr, data, + &owner, &group, sb); + } break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -1353,12 +1383,15 @@ static int smb311_posix_get_fattr(struct cifs_fattr *fattr, out: cifs_put_tlink(tlink); - cifs_free_open_info(&data); + cifs_free_open_info(data); return rc; } -int smb311_posix_get_inode_info(struct inode **inode, const char *full_path, - struct super_block *sb, const unsigned int xid) +int smb311_posix_get_inode_info(struct inode **inode, + const char *full_path, + struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid) { struct cifs_fattr fattr = {}; int rc; @@ -1368,7 +1401,7 @@ int smb311_posix_get_inode_info(struct inode **inode, const char *full_path, return 0; } - rc = smb311_posix_get_fattr(&fattr, full_path, sb, xid); + rc = smb311_posix_get_fattr(data, &fattr, full_path, sb, xid); if (rc) goto out; @@ -1516,7 +1549,7 @@ struct inode *cifs_root_iget(struct super_block *sb) convert_delimiter(path, CIFS_DIR_SEP(cifs_sb)); if (tcon->posix_extensions) - rc = smb311_posix_get_fattr(&fattr, path, sb, xid); + rc = smb311_posix_get_fattr(NULL, &fattr, path, sb, xid); else rc = cifs_get_fattr(NULL, sb, xid, NULL, &fattr, &inode, path); @@ -1889,16 +1922,18 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, int rc = 0; struct inode *inode = NULL; - if (tcon->posix_extensions) - rc = smb311_posix_get_inode_info(&inode, full_path, parent->i_sb, xid); + if (tcon->posix_extensions) { + rc = smb311_posix_get_inode_info(&inode, full_path, + NULL, parent->i_sb, xid); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - else if (tcon->unix_ext) + } else if (tcon->unix_ext) { rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb, xid); #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - else + } else { rc = cifs_get_inode_info(&inode, full_path, NULL, parent->i_sb, xid, NULL); + } if (rc) return rc; @@ -2219,7 +2254,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, return -ENOSYS; /* try path-based rename first */ - rc = server->ops->rename(xid, tcon, from_path, to_path, cifs_sb); + rc = server->ops->rename(xid, tcon, from_dentry, + from_path, to_path, cifs_sb); /* * Don't bother with rename by filehandle unless file is busy and @@ -2579,13 +2615,15 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) dentry, cifs_get_time(dentry), jiffies); again: - if (cifs_sb_master_tcon(CIFS_SB(sb))->posix_extensions) - rc = smb311_posix_get_inode_info(&inode, full_path, sb, xid); - else if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) + if (cifs_sb_master_tcon(CIFS_SB(sb))->posix_extensions) { + rc = smb311_posix_get_inode_info(&inode, full_path, + NULL, sb, xid); + } else if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) { rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); - else + } else { rc = cifs_get_inode_info(&inode, full_path, NULL, sb, xid, NULL); + } if (rc == -EAGAIN && count++ < 10) goto again; out: diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index a1da50e66fbb..d86da949a919 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -510,8 +510,8 @@ cifs_hardlink(struct dentry *old_file, 
struct inode *inode, rc = -ENOSYS; goto cifs_hl_exit; } - rc = server->ops->create_hardlink(xid, tcon, from_name, to_name, - cifs_sb); + rc = server->ops->create_hardlink(xid, tcon, old_file, + from_name, to_name, cifs_sb); if ((rc == -EIO) || (rc == -EINVAL)) rc = -EOPNOTSUPP; } @@ -569,6 +569,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, int rc = -EOPNOTSUPP; unsigned int xid; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct TCP_Server_Info *server; struct tcon_link *tlink; struct cifs_tcon *pTcon; const char *full_path; @@ -590,6 +591,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, goto symlink_exit; } pTcon = tlink_tcon(tlink); + server = cifs_pick_channel(pTcon->ses); full_path = build_path_from_dentry(direntry, page); if (IS_ERR(full_path)) { @@ -601,27 +603,32 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, cifs_dbg(FYI, "symname is %s\n", symname); /* BB what if DFS and this volume is on different share? BB */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - else if (pTcon->unix_ext) + } else if (pTcon->unix_ext) { rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, cifs_sb->local_nls, cifs_remap(cifs_sb)); #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - /* else - rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName, - cifs_sb_target->local_nls); */ + } else if (server->ops->create_reparse_symlink) { + rc = server->ops->create_reparse_symlink(xid, inode, direntry, + pTcon, full_path, + symname); + goto symlink_exit; + } if (rc == 0) { - if (pTcon->posix_extensions) - rc = smb311_posix_get_inode_info(&newinode, full_path, inode->i_sb, xid); - else if (pTcon->unix_ext) + if (pTcon->posix_extensions) { + rc = smb311_posix_get_inode_info(&newinode, full_path, + NULL, inode->i_sb, xid); + } else if (pTcon->unix_ext) { rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, xid); - else + } else { rc = cifs_get_inode_info(&newinode, full_path, NULL, inode->i_sb, xid, NULL); + } if (rc != 0) { cifs_dbg(FYI, "Create symlink ok, getinodeinfo fail rc = %d\n", diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index d30ea2005eb3..056cae1ddcce 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -56,6 +56,23 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) #endif /* DEBUG2 */ /* + * Match a reparse point inode if reparse tag and ctime haven't changed. + * + * Windows Server updates ctime of reparse points when their data have changed. + * The server doesn't allow changing reparse tags from existing reparse points, + * though it's worth checking. + */ +static inline bool reparse_inode_match(struct inode *inode, + struct cifs_fattr *fattr) +{ + struct timespec64 ctime = inode_get_ctime(inode); + + return (CIFS_I(inode)->cifsAttrs & ATTR_REPARSE) && + CIFS_I(inode)->reparse_tag == fattr->cf_cifstag && + timespec64_equal(&ctime, &fattr->cf_ctime); +} + +/* * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT * * Find the dentry that matches "name". If there isn't one, create one. 
If it's @@ -71,6 +88,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct super_block *sb = parent->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + int rc; cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); @@ -82,9 +100,11 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, * We'll end up doing an on the wire call either way and * this spares us an invalidation. */ - if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) - return; retry: + if ((fattr->cf_cifsattrs & ATTR_REPARSE) || + (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)) + return; + dentry = d_alloc_parallel(parent, name, &wq); } if (IS_ERR(dentry)) @@ -104,12 +124,34 @@ retry: if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) fattr->cf_uniqueid = CIFS_I(inode)->uniqueid; - /* update inode in place - * if both i_ino and i_mode didn't change */ - if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid && - cifs_fattr_to_inode(inode, fattr) == 0) { - dput(dentry); - return; + /* + * Update inode in place if both i_ino and i_mode didn't + * change. + */ + if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { + /* + * Query dir responses don't provide enough + * information about reparse points other than + * their reparse tags. Save an invalidation by + * not clobbering the existing mode, size and + * symlink target (if any) when reparse tag and + * ctime haven't changed. + */ + rc = 0; + if (fattr->cf_cifsattrs & ATTR_REPARSE) { + if (likely(reparse_inode_match(inode, fattr))) { + fattr->cf_mode = inode->i_mode; + fattr->cf_eof = CIFS_I(inode)->server_eof; + fattr->cf_symlink_target = NULL; + } else { + CIFS_I(inode)->time = 0; + rc = -ESTALE; + } + } + if (!rc && !cifs_fattr_to_inode(inode, fattr)) { + dput(dentry); + return; + } } } d_invalidate(dentry); @@ -127,29 +169,6 @@ retry: dput(dentry); } -static bool reparse_file_needs_reval(const struct cifs_fattr *fattr) -{ - if (!(fattr->cf_cifsattrs & ATTR_REPARSE)) - return false; - /* - * The DFS tags should be only intepreted by server side as per - * MS-FSCC 2.1.2.1, but let's include them anyway. - * - * Besides, if cf_cifstag is unset (0), then we still need it to be - * revalidated to know exactly what reparse point it is. - */ - switch (fattr->cf_cifstag) { - case IO_REPARSE_TAG_DFS: - case IO_REPARSE_TAG_DFSR: - case IO_REPARSE_TAG_SYMLINK: - case IO_REPARSE_TAG_NFS: - case IO_REPARSE_TAG_MOUNT_POINT: - case 0: - return true; - } - return false; -} - static void cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) { @@ -181,14 +200,6 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) } out_reparse: - /* - * We need to revalidate it further to make a decision about whether it - * is a symbolic link, DFS referral or a reparse point with a direct - * access like junctions, deduplicated files, NFS symlinks. 
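
A minimal standalone sketch of the cache-reuse rule that the new reparse_inode_match()/cifs_prime_dcache logic above applies during readdir, separate from the patch itself; the types below (cached_inode, dir_entry_attrs) are illustrative stand-ins, not kernel structures:

/* Illustration only: reuse cached reparse data when both the reparse tag
 * and the ctime reported by the directory listing match the cached inode. */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define ATTR_REPARSE 0x0400                    /* FILE_ATTRIBUTE_REPARSE_POINT */

struct cached_inode {                          /* stand-in for cifsInodeInfo */
        uint32_t dos_attrs;
        uint32_t reparse_tag;
        struct timespec ctime;
};

struct dir_entry_attrs {                       /* stand-in for cifs_fattr */
        uint32_t reparse_tag;
        struct timespec ctime;
};

static bool timespec_equal_(const struct timespec *a, const struct timespec *b)
{
        return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
}

static bool reparse_cache_usable(const struct cached_inode *ci,
                                 const struct dir_entry_attrs *fattr)
{
        /* Same three conditions as reparse_inode_match(): the cached entry
         * is a reparse point, the tag is unchanged, and ctime is unchanged;
         * otherwise the caller marks the inode stale and revalidates. */
        return (ci->dos_attrs & ATTR_REPARSE) &&
               ci->reparse_tag == fattr->reparse_tag &&
               timespec_equal_(&ci->ctime, &fattr->ctime);
}
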
- */ - if (reparse_file_needs_reval(fattr)) - fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; - /* non-unix readdir doesn't provide nlink */ fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK; @@ -269,9 +280,6 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info, fattr->cf_dtype = DT_REG; } - if (reparse_file_needs_reval(fattr)) - fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; - sid_to_id(cifs_sb, &parsed.owner, fattr, SIDOWNER); sid_to_id(cifs_sb, &parsed.group, fattr, SIDGROUP); } @@ -331,38 +339,6 @@ cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, cifs_fill_common_info(fattr, cifs_sb); } -/* BB eventually need to add the following helper function to - resolve NT_STATUS_STOPPED_ON_SYMLINK return code when - we try to do FindFirst on (NTFS) directory symlinks */ -/* -int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, - unsigned int xid) -{ - __u16 fid; - int len; - int oplock = 0; - int rc; - struct cifs_tcon *ptcon = cifs_sb_tcon(cifs_sb); - char *tmpbuffer; - - rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, - OPEN_REPARSE_POINT, &fid, &oplock, NULL, - cifs_sb->local_nls, - cifs_remap(cifs_sb); - if (!rc) { - tmpbuffer = kmalloc(maxpath); - rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path, - tmpbuffer, - maxpath -1, - fid, - cifs_sb->local_nls); - if (CIFSSMBClose(xid, ptcon, fid)) { - cifs_dbg(FYI, "Error closing temporary reparsepoint open\n"); - } - } -} - */ - static int _initiate_cifs_search(const unsigned int xid, struct file *file, const char *full_path) @@ -431,13 +407,10 @@ ffirst_retry: &cifsFile->fid, search_flags, &cifsFile->srch_inf); - if (rc == 0) + if (rc == 0) { cifsFile->invalidHandle = false; - /* BB add following call to handle readdir on new NTFS symlink errors - else if STATUS_STOPPED_ON_SYMLINK - call get_symlink_reparse_path and retry with new path */ - else if ((rc == -EOPNOTSUPP) && - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + } else if ((rc == -EOPNOTSUPP) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; goto ffirst_retry; } diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 2d3b332a79a1..cde81042bebd 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -356,10 +356,9 @@ done: /* * update the iface for the channel if necessary. - * will return 0 when iface is updated, 1 if removed, 2 otherwise * Must be called with chan_lock held. 
*/ -int +void cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) { unsigned int chan_index; @@ -368,20 +367,19 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) struct cifs_server_iface *old_iface = NULL; struct cifs_server_iface *last_iface = NULL; struct sockaddr_storage ss; - int rc = 0; spin_lock(&ses->chan_lock); chan_index = cifs_ses_get_chan_index(ses, server); if (chan_index == CIFS_INVAL_CHAN_INDEX) { spin_unlock(&ses->chan_lock); - return 0; + return; } if (ses->chans[chan_index].iface) { old_iface = ses->chans[chan_index].iface; if (old_iface->is_active) { spin_unlock(&ses->chan_lock); - return 1; + return; } } spin_unlock(&ses->chan_lock); @@ -394,7 +392,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) if (!ses->iface_count) { spin_unlock(&ses->iface_lock); cifs_dbg(VFS, "server %s does not advertise interfaces\n", ses->server->hostname); - return 0; + return; } last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface, @@ -434,16 +432,21 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) } if (list_entry_is_head(iface, &ses->iface_list, iface_head)) { - rc = 1; iface = NULL; cifs_dbg(FYI, "unable to find a suitable iface\n"); } if (!iface) { - cifs_dbg(FYI, "unable to get the interface matching: %pIS\n", - &ss); + if (!chan_index) + cifs_dbg(FYI, "unable to get the interface matching: %pIS\n", + &ss); + else { + cifs_dbg(FYI, "unable to find another interface to replace: %pIS\n", + &old_iface->sockaddr); + } + spin_unlock(&ses->iface_lock); - return 0; + return; } /* now drop the ref to the current iface */ @@ -459,34 +462,24 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) iface->weight_fulfilled++; kref_put(&old_iface->refcount, release_iface); - } else if (old_iface) { - /* if a new candidate is not found, keep things as is */ - cifs_dbg(FYI, "could not replace iface: %pIS\n", - &old_iface->sockaddr); } else if (!chan_index) { /* special case: update interface for primary channel */ - if (iface) { - cifs_dbg(FYI, "referencing primary channel iface: %pIS\n", - &iface->sockaddr); - iface->num_channels++; - iface->weight_fulfilled++; - } + cifs_dbg(FYI, "referencing primary channel iface: %pIS\n", + &iface->sockaddr); + iface->num_channels++; + iface->weight_fulfilled++; } spin_unlock(&ses->iface_lock); - if (iface) { - spin_lock(&ses->chan_lock); - chan_index = cifs_ses_get_chan_index(ses, server); - if (chan_index == CIFS_INVAL_CHAN_INDEX) { - spin_unlock(&ses->chan_lock); - return 0; - } - - ses->chans[chan_index].iface = iface; + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) { spin_unlock(&ses->chan_lock); + return; } - return rc; + ses->chans[chan_index].iface = iface; + spin_unlock(&ses->chan_lock); } /* diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h index 82e916ad167c..a0c156996fc5 100644 --- a/fs/smb/client/smb2glob.h +++ b/fs/smb/client/smb2glob.h @@ -23,17 +23,21 @@ * Identifiers for functions that use the open, operation, close pattern * in smb2inode.c:smb2_compound_op() */ -#define SMB2_OP_SET_DELETE 1 -#define SMB2_OP_SET_INFO 2 -#define SMB2_OP_QUERY_INFO 3 -#define SMB2_OP_QUERY_DIR 4 -#define SMB2_OP_MKDIR 5 -#define SMB2_OP_RENAME 6 -#define SMB2_OP_DELETE 7 -#define SMB2_OP_HARDLINK 8 -#define SMB2_OP_SET_EOF 9 -#define SMB2_OP_RMDIR 10 -#define SMB2_OP_POSIX_QUERY_INFO 11 +enum smb2_compound_ops { + 
SMB2_OP_SET_DELETE = 1, + SMB2_OP_SET_INFO, + SMB2_OP_QUERY_INFO, + SMB2_OP_QUERY_DIR, + SMB2_OP_MKDIR, + SMB2_OP_RENAME, + SMB2_OP_DELETE, + SMB2_OP_HARDLINK, + SMB2_OP_SET_EOF, + SMB2_OP_RMDIR, + SMB2_OP_POSIX_QUERY_INFO, + SMB2_OP_SET_REPARSE, + SMB2_OP_GET_REPARSE +}; /* Used when constructing chained read requests. */ #define CHAINED_REQUEST 1 diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index c94940af5d4b..5053a5550abe 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -26,13 +26,34 @@ #include "cached_dir.h" #include "smb2status.h" -static void -free_set_inf_compound(struct smb_rqst *rqst) +static struct reparse_data_buffer *reparse_buf_ptr(struct kvec *iov) { - if (rqst[1].rq_iov) - SMB2_set_info_free(&rqst[1]); - if (rqst[2].rq_iov) - SMB2_close_free(&rqst[2]); + struct reparse_data_buffer *buf; + struct smb2_ioctl_rsp *io = iov->iov_base; + u32 off, count, len; + + count = le32_to_cpu(io->OutputCount); + off = le32_to_cpu(io->OutputOffset); + if (check_add_overflow(off, count, &len) || len > iov->iov_len) + return ERR_PTR(-EIO); + + buf = (struct reparse_data_buffer *)((u8 *)io + off); + len = sizeof(*buf); + if (count < len || count < le16_to_cpu(buf->ReparseDataLength) + len) + return ERR_PTR(-EIO); + return buf; +} + +static inline __u32 file_create_options(struct dentry *dentry) +{ + struct cifsInodeInfo *ci; + + if (dentry) { + ci = CIFS_I(d_inode(dentry)); + if (ci->cifsAttrs & ATTR_REPARSE) + return OPEN_REPARSE_POINT; + } + return 0; } /* @@ -45,13 +66,16 @@ free_set_inf_compound(struct smb_rqst *rqst) */ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - __u32 desired_access, __u32 create_disposition, __u32 create_options, - umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile, + __u32 desired_access, __u32 create_disposition, + __u32 create_options, umode_t mode, struct kvec *in_iov, + int *cmds, int num_cmds, struct cifsFileInfo *cfile, __u8 **extbuf, size_t *extbuflen, struct kvec *out_iov, int *out_buftype) { + + struct reparse_data_buffer *rbuf; struct smb2_compound_vars *vars = NULL; - struct kvec *rsp_iov; + struct kvec *rsp_iov, *iov; struct smb_rqst *rqst; int rc; __le16 *utf16_path = NULL; @@ -59,8 +83,8 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server; - int num_rqst = 0; - int resp_buftype[3]; + int num_rqst = 0, i; + int resp_buftype[MAX_COMPOUND]; struct smb2_query_info_rsp *qi_rsp = NULL; struct cifs_open_info_data *idata; int flags = 0; @@ -80,7 +104,8 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; + for (i = 0; i < ARRAY_SIZE(resp_buftype); i++) + resp_buftype[i] = CIFS_NO_BUFFER; /* We already have a handle so we can skip the open */ if (cfile) @@ -118,242 +143,277 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, num_rqst++; rc = 0; - /* Operation */ - switch (command) { - case SMB2_OP_QUERY_INFO: - rqst[num_rqst].rq_iov = &vars->qi_iov; - rqst[num_rqst].rq_nvec = 1; - - if (cfile) - rc = SMB2_query_info_init(tcon, server, - &rqst[num_rqst], - cfile->fid.persistent_fid, - cfile->fid.volatile_fid, - FILE_ALL_INFORMATION, - SMB2_O_INFO_FILE, 0, - sizeof(struct smb2_file_all_info) + - PATH_MAX * 2, 0, NULL); - 
else { - rc = SMB2_query_info_init(tcon, server, - &rqst[num_rqst], - COMPOUND_FID, - COMPOUND_FID, - FILE_ALL_INFORMATION, - SMB2_O_INFO_FILE, 0, - sizeof(struct smb2_file_all_info) + - PATH_MAX * 2, 0, NULL); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); + for (i = 0; i < num_cmds; i++) { + /* Operation */ + switch (cmds[i]) { + case SMB2_OP_QUERY_INFO: + rqst[num_rqst].rq_iov = &vars->qi_iov; + rqst[num_rqst].rq_nvec = 1; + + if (cfile) { + rc = SMB2_query_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FILE_ALL_INFORMATION, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + + PATH_MAX * 2, 0, NULL); + } else { + rc = SMB2_query_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + FILE_ALL_INFORMATION, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + + PATH_MAX * 2, 0, NULL); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } } - } - if (rc) - goto finished; - num_rqst++; - trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid, - full_path); - break; - case SMB2_OP_POSIX_QUERY_INFO: - rqst[num_rqst].rq_iov = &vars->qi_iov; - rqst[num_rqst].rq_nvec = 1; - - if (cfile) - rc = SMB2_query_info_init(tcon, server, - &rqst[num_rqst], - cfile->fid.persistent_fid, - cfile->fid.volatile_fid, - SMB_FIND_FILE_POSIX_INFO, - SMB2_O_INFO_FILE, 0, + if (rc) + goto finished; + num_rqst++; + trace_smb3_query_info_compound_enter(xid, ses->Suid, + tcon->tid, full_path); + break; + case SMB2_OP_POSIX_QUERY_INFO: + rqst[num_rqst].rq_iov = &vars->qi_iov; + rqst[num_rqst].rq_nvec = 1; + + if (cfile) { /* TBD: fix following to allow for longer SIDs */ - sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) + - (sizeof(struct cifs_sid) * 2), 0, NULL); - else { - rc = SMB2_query_info_init(tcon, server, - &rqst[num_rqst], - COMPOUND_FID, - COMPOUND_FID, - SMB_FIND_FILE_POSIX_INFO, - SMB2_O_INFO_FILE, 0, - sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) + - (sizeof(struct cifs_sid) * 2), 0, NULL); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); + rc = SMB2_query_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + SMB_FIND_FILE_POSIX_INFO, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb311_posix_qinfo *) + + (PATH_MAX * 2) + + (sizeof(struct cifs_sid) * 2), 0, NULL); + } else { + rc = SMB2_query_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + SMB_FIND_FILE_POSIX_INFO, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb311_posix_qinfo *) + + (PATH_MAX * 2) + + (sizeof(struct cifs_sid) * 2), 0, NULL); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } } - } - if (rc) - goto finished; - num_rqst++; - trace_smb3_posix_query_info_compound_enter(xid, ses->Suid, tcon->tid, full_path); - break; - case SMB2_OP_DELETE: - trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path); - break; - case SMB2_OP_MKDIR: - /* - * Directories are created through parameters in the - * SMB2_open() call. 
- */ - trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path); - break; - case SMB2_OP_RMDIR: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; - rqst[num_rqst].rq_nvec = 1; - - size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ - data[0] = &delete_pending[0]; - - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_DISPOSITION_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (rc) - goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); - trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path); - break; - case SMB2_OP_SET_EOF: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; - rqst[num_rqst].rq_nvec = 1; + if (rc) + goto finished; + num_rqst++; + trace_smb3_posix_query_info_compound_enter(xid, ses->Suid, + tcon->tid, full_path); + break; + case SMB2_OP_DELETE: + trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_MKDIR: + /* + * Directories are created through parameters in the + * SMB2_open() call. + */ + trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_RMDIR: + rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_nvec = 1; - size[0] = 8; /* sizeof __le64 */ - data[0] = ptr; + size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ + data[0] = &delete_pending[0]; - if (cfile) { rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], - cfile->fid.persistent_fid, - cfile->fid.volatile_fid, - current->tgid, - FILE_END_OF_FILE_INFORMATION, - SMB2_O_INFO_FILE, 0, - data, size); - } else { - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], - COMPOUND_FID, - COMPOUND_FID, - current->tgid, - FILE_END_OF_FILE_INFORMATION, - SMB2_O_INFO_FILE, 0, - data, size); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); + &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + if (rc) + goto finished; + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_SET_EOF: + rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_nvec = 1; + + size[0] = in_iov[i].iov_len; + data[0] = in_iov[i].iov_base; + + if (cfile) { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } else { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } } - } - if (rc) - goto finished; - num_rqst++; - trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); - break; - case SMB2_OP_SET_INFO: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; - rqst[num_rqst].rq_nvec = 1; - - - size[0] = sizeof(FILE_BASIC_INFO); - data[0] = ptr; - - if (cfile) - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], - cfile->fid.persistent_fid, - cfile->fid.volatile_fid, current->tgid, - FILE_BASIC_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - else { - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], - COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_BASIC_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (!rc) { - 
smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); + if (rc) + goto finished; + num_rqst++; + trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_SET_INFO: + rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_nvec = 1; + + size[0] = in_iov[i].iov_len; + data[0] = in_iov[i].iov_base; + + if (cfile) { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, current->tgid, + FILE_BASIC_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + } else { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_BASIC_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } } - } - if (rc) - goto finished; - num_rqst++; - trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid, - full_path); - break; - case SMB2_OP_RENAME: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; - rqst[num_rqst].rq_nvec = 2; + if (rc) + goto finished; + num_rqst++; + trace_smb3_set_info_compound_enter(xid, ses->Suid, + tcon->tid, full_path); + break; + case SMB2_OP_RENAME: + rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_nvec = 2; - len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX)); + len = in_iov[i].iov_len; - vars->rename_info.ReplaceIfExists = 1; - vars->rename_info.RootDirectory = 0; - vars->rename_info.FileNameLength = cpu_to_le32(len); + vars->rename_info.ReplaceIfExists = 1; + vars->rename_info.RootDirectory = 0; + vars->rename_info.FileNameLength = cpu_to_le32(len); - size[0] = sizeof(struct smb2_file_rename_info); - data[0] = &vars->rename_info; + size[0] = sizeof(struct smb2_file_rename_info); + data[0] = &vars->rename_info; - size[1] = len + 2 /* null */; - data[1] = (__le16 *)ptr; + size[1] = len + 2 /* null */; + data[1] = in_iov[i].iov_base; - if (cfile) - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], - cfile->fid.persistent_fid, - cfile->fid.volatile_fid, - current->tgid, FILE_RENAME_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - else { - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], - COMPOUND_FID, COMPOUND_FID, - current->tgid, FILE_RENAME_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); + if (cfile) { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, FILE_RENAME_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + } else { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, COMPOUND_FID, + current->tgid, FILE_RENAME_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } } - } - if (rc) - goto finished; - num_rqst++; - trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path); - break; - case SMB2_OP_HARDLINK: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; - rqst[num_rqst].rq_nvec = 2; + if (rc) + goto finished; + num_rqst++; + trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_HARDLINK: + rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_nvec = 2; - len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX)); + len = in_iov[i].iov_len; - vars->link_info.ReplaceIfExists = 0; - vars->link_info.RootDirectory = 0; - vars->link_info.FileNameLength = cpu_to_le32(len); 
+ vars->link_info.ReplaceIfExists = 0; + vars->link_info.RootDirectory = 0; + vars->link_info.FileNameLength = cpu_to_le32(len); - size[0] = sizeof(struct smb2_file_link_info); - data[0] = &vars->link_info; + size[0] = sizeof(struct smb2_file_link_info); + data[0] = &vars->link_info; - size[1] = len + 2 /* null */; - data[1] = (__le16 *)ptr; + size[1] = len + 2 /* null */; + data[1] = in_iov[i].iov_base; - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_LINK_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (rc) - goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); - trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path); - break; - default: - cifs_dbg(VFS, "Invalid command\n"); - rc = -EINVAL; + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_LINK_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + if (rc) + goto finished; + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_SET_REPARSE: + rqst[num_rqst].rq_iov = vars->io_iov; + rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov); + + rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst], + COMPOUND_FID, COMPOUND_FID, + FSCTL_SET_REPARSE_POINT, + in_iov[i].iov_base, + in_iov[i].iov_len, 0); + if (rc) + goto finished; + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + trace_smb3_set_reparse_compound_enter(xid, ses->Suid, + tcon->tid, full_path); + break; + case SMB2_OP_GET_REPARSE: + rqst[num_rqst].rq_iov = vars->io_iov; + rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov); + + rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst], + COMPOUND_FID, COMPOUND_FID, + FSCTL_GET_REPARSE_POINT, + NULL, 0, CIFSMaxBufSize); + if (rc) + goto finished; + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + trace_smb3_get_reparse_compound_enter(xid, ses->Suid, + tcon->tid, full_path); + break; + default: + cifs_dbg(VFS, "Invalid command\n"); + rc = -EINVAL; + } } if (rc) goto finished; @@ -385,145 +445,176 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst, resp_buftype, rsp_iov); - finished: - SMB2_open_free(&rqst[0]); +finished: + num_rqst = 0; + SMB2_open_free(&rqst[num_rqst++]); if (rc == -EREMCHG) { pr_warn_once("server share %s deleted\n", tcon->tree_name); tcon->need_reconnect = true; } - switch (command) { - case SMB2_OP_QUERY_INFO: - idata = ptr; - if (rc == 0 && cfile && cfile->symlink_target) { - idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); - if (!idata->symlink_target) - rc = -ENOMEM; - } - if (rc == 0) { - qi_rsp = (struct smb2_query_info_rsp *) - rsp_iov[1].iov_base; - rc = smb2_validate_and_copy_iov( - le16_to_cpu(qi_rsp->OutputBufferOffset), - le32_to_cpu(qi_rsp->OutputBufferLength), - &rsp_iov[1], sizeof(idata->fi), (char *)&idata->fi); - } - if (rqst[1].rq_iov) - SMB2_query_info_free(&rqst[1]); - if (rqst[2].rq_iov) - SMB2_close_free(&rqst[2]); - if (rc) - trace_smb3_query_info_compound_err(xid, ses->Suid, - tcon->tid, rc); - else - trace_smb3_query_info_compound_done(xid, ses->Suid, - tcon->tid); - break; - case SMB2_OP_POSIX_QUERY_INFO: - idata = ptr; - if (rc == 0 && cfile && cfile->symlink_target) { - idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); - if (!idata->symlink_target) - rc = 
-ENOMEM; - } - if (rc == 0) { - qi_rsp = (struct smb2_query_info_rsp *) - rsp_iov[1].iov_base; - rc = smb2_validate_and_copy_iov( - le16_to_cpu(qi_rsp->OutputBufferOffset), - le32_to_cpu(qi_rsp->OutputBufferLength), - &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */, - (char *)&idata->posix_fi); - } - if (rc == 0) { - unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength); - - if (length > sizeof(idata->posix_fi)) { - char *base = (char *)rsp_iov[1].iov_base + - le16_to_cpu(qi_rsp->OutputBufferOffset) + - sizeof(idata->posix_fi); - *extbuflen = length - sizeof(idata->posix_fi); - *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL); - if (!*extbuf) + for (i = 0; i < num_cmds; i++) { + switch (cmds[i]) { + case SMB2_OP_QUERY_INFO: + idata = in_iov[i].iov_base; + if (rc == 0 && cfile && cfile->symlink_target) { + idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!idata->symlink_target) rc = -ENOMEM; + } + if (rc == 0) { + qi_rsp = (struct smb2_query_info_rsp *) + rsp_iov[i + 1].iov_base; + rc = smb2_validate_and_copy_iov( + le16_to_cpu(qi_rsp->OutputBufferOffset), + le32_to_cpu(qi_rsp->OutputBufferLength), + &rsp_iov[i + 1], sizeof(idata->fi), (char *)&idata->fi); + } + SMB2_query_info_free(&rqst[num_rqst++]); + if (rc) + trace_smb3_query_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_query_info_compound_done(xid, ses->Suid, + tcon->tid); + break; + case SMB2_OP_POSIX_QUERY_INFO: + idata = in_iov[i].iov_base; + if (rc == 0 && cfile && cfile->symlink_target) { + idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!idata->symlink_target) + rc = -ENOMEM; + } + if (rc == 0) { + qi_rsp = (struct smb2_query_info_rsp *) + rsp_iov[i + 1].iov_base; + rc = smb2_validate_and_copy_iov( + le16_to_cpu(qi_rsp->OutputBufferOffset), + le32_to_cpu(qi_rsp->OutputBufferLength), + &rsp_iov[i + 1], sizeof(idata->posix_fi) /* add SIDs */, + (char *)&idata->posix_fi); + } + if (rc == 0) { + unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength); + + if (length > sizeof(idata->posix_fi)) { + char *base = (char *)rsp_iov[i + 1].iov_base + + le16_to_cpu(qi_rsp->OutputBufferOffset) + + sizeof(idata->posix_fi); + *extbuflen = length - sizeof(idata->posix_fi); + *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL); + if (!*extbuf) + rc = -ENOMEM; + } else { + rc = -EINVAL; + } + } + SMB2_query_info_free(&rqst[num_rqst++]); + if (rc) + trace_smb3_posix_query_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_posix_query_info_compound_done(xid, ses->Suid, + tcon->tid); + break; + case SMB2_OP_DELETE: + if (rc) + trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_delete_done(xid, ses->Suid, tcon->tid); + break; + case SMB2_OP_MKDIR: + if (rc) + trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid); + break; + case SMB2_OP_HARDLINK: + if (rc) + trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid); + SMB2_set_info_free(&rqst[num_rqst++]); + break; + case SMB2_OP_RENAME: + if (rc) + trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_rename_done(xid, ses->Suid, tcon->tid); + SMB2_set_info_free(&rqst[num_rqst++]); + break; + case SMB2_OP_RMDIR: + if (rc) + trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid); + SMB2_set_info_free(&rqst[num_rqst++]); + break; + case SMB2_OP_SET_EOF: + if (rc) + 
trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid); + SMB2_set_info_free(&rqst[num_rqst++]); + break; + case SMB2_OP_SET_INFO: + if (rc) + trace_smb3_set_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_set_info_compound_done(xid, ses->Suid, + tcon->tid); + SMB2_set_info_free(&rqst[num_rqst++]); + break; + case SMB2_OP_SET_REPARSE: + if (rc) { + trace_smb3_set_reparse_compound_err(xid, ses->Suid, + tcon->tid, rc); + } else { + trace_smb3_set_reparse_compound_done(xid, ses->Suid, + tcon->tid); + } + SMB2_ioctl_free(&rqst[num_rqst++]); + break; + case SMB2_OP_GET_REPARSE: + if (!rc) { + iov = &rsp_iov[i + 1]; + idata = in_iov[i].iov_base; + idata->reparse.io.iov = *iov; + idata->reparse.io.buftype = resp_buftype[i + 1]; + rbuf = reparse_buf_ptr(iov); + if (IS_ERR(rbuf)) { + rc = PTR_ERR(rbuf); + trace_smb3_set_reparse_compound_err(xid, ses->Suid, + tcon->tid, rc); + } else { + idata->reparse.tag = le32_to_cpu(rbuf->ReparseTag); + trace_smb3_set_reparse_compound_done(xid, ses->Suid, + tcon->tid); + } + memset(iov, 0, sizeof(*iov)); + resp_buftype[i + 1] = CIFS_NO_BUFFER; } else { - rc = -EINVAL; + trace_smb3_set_reparse_compound_err(xid, ses->Suid, + tcon->tid, rc); } + SMB2_ioctl_free(&rqst[num_rqst++]); + break; } - if (rqst[1].rq_iov) - SMB2_query_info_free(&rqst[1]); - if (rqst[2].rq_iov) - SMB2_close_free(&rqst[2]); - if (rc) - trace_smb3_posix_query_info_compound_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_posix_query_info_compound_done(xid, ses->Suid, tcon->tid); - break; - case SMB2_OP_DELETE: - if (rc) - trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_delete_done(xid, ses->Suid, tcon->tid); - if (rqst[1].rq_iov) - SMB2_close_free(&rqst[1]); - break; - case SMB2_OP_MKDIR: - if (rc) - trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid); - if (rqst[1].rq_iov) - SMB2_close_free(&rqst[1]); - break; - case SMB2_OP_HARDLINK: - if (rc) - trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid); - free_set_inf_compound(rqst); - break; - case SMB2_OP_RENAME: - if (rc) - trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_rename_done(xid, ses->Suid, tcon->tid); - free_set_inf_compound(rqst); - break; - case SMB2_OP_RMDIR: - if (rc) - trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid); - free_set_inf_compound(rqst); - break; - case SMB2_OP_SET_EOF: - if (rc) - trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid); - free_set_inf_compound(rqst); - break; - case SMB2_OP_SET_INFO: - if (rc) - trace_smb3_set_info_compound_err(xid, ses->Suid, - tcon->tid, rc); - else - trace_smb3_set_info_compound_done(xid, ses->Suid, - tcon->tid); - free_set_inf_compound(rqst); - break; } + SMB2_close_free(&rqst[num_rqst]); if (cfile) cifsFileInfo_put(cfile); + num_cmds += 2; if (out_iov && out_buftype) { - memcpy(out_iov, rsp_iov, 3 * sizeof(*out_iov)); - memcpy(out_buftype, resp_buftype, 3 * sizeof(*out_buftype)); + memcpy(out_iov, rsp_iov, num_cmds * sizeof(*out_iov)); + memcpy(out_buftype, resp_buftype, + num_cmds * sizeof(*out_buftype)); } else { - free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); - free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + for (i = 0; i < 
num_cmds; i++) + free_rsp_buf(resp_buftype[i], rsp_iov[i].iov_base); } kfree(vars); return rc; @@ -569,9 +660,11 @@ int smb2_query_path_info(const unsigned int xid, struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; struct smb2_hdr *hdr; - struct kvec out_iov[3] = {}; + struct kvec in_iov[2], out_iov[3] = {}; int out_buftype[3] = {}; + int cmds[2] = { SMB2_OP_QUERY_INFO, }; bool islink; + int i, num_cmds; int rc, rc2; data->adjust_tz = false; @@ -593,9 +686,15 @@ int smb2_query_path_info(const unsigned int xid, return rc; } + in_iov[0].iov_base = data; + in_iov[0].iov_len = sizeof(*data); + in_iov[1] = in_iov[0]; + cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile, + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, + in_iov, cmds, 1, cfile, NULL, NULL, out_iov, out_buftype); hdr = out_iov[0].iov_base; /* @@ -612,14 +711,19 @@ int smb2_query_path_info(const unsigned int xid, if (rc || !data->reparse_point) goto out; + if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) { + /* symlink already parsed in create response */ + num_cmds = 1; + } else { + cmds[1] = SMB2_OP_GET_REPARSE; + num_cmds = 2; + } create_options |= OPEN_REPARSE_POINT; - /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, data, - SMB2_OP_QUERY_INFO, cfile, NULL, NULL, - NULL, NULL); + create_options, ACL_NO_MODE, in_iov, cmds, + num_cmds, cfile, NULL, NULL, NULL, NULL); break; case -EREMOTE: break; @@ -637,9 +741,8 @@ int smb2_query_path_info(const unsigned int xid, } out: - free_rsp_buf(out_buftype[0], out_iov[0].iov_base); - free_rsp_buf(out_buftype[1], out_iov[1].iov_base); - free_rsp_buf(out_buftype[2], out_iov[2].iov_base); + for (i = 0; i < ARRAY_SIZE(out_buftype); i++) + free_rsp_buf(out_buftype[i], out_iov[i].iov_base); return rc; } @@ -654,12 +757,14 @@ int smb311_posix_query_path_info(const unsigned int xid, int rc; __u32 create_options = 0; struct cifsFileInfo *cfile; - struct kvec out_iov[3] = {}; + struct kvec in_iov[2], out_iov[3] = {}; int out_buftype[3] = {}; __u8 *sidsbuf = NULL; __u8 *sidsbuf_end = NULL; size_t sidsbuflen = 0; size_t owner_len, group_len; + int cmds[2] = { SMB2_OP_POSIX_QUERY_INFO, }; + int i, num_cmds; data->adjust_tz = false; data->reparse_point = false; @@ -670,11 +775,15 @@ int smb311_posix_query_path_info(const unsigned int xid, * when we already have an open file handle for this. For now this is fast enough * (always using the compounded version). */ + in_iov[0].iov_base = data; + in_iov[0].iov_len = sizeof(*data); + in_iov[1] = in_iov[0]; cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, - &sidsbuf, &sidsbuflen, out_iov, out_buftype); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, in_iov, cmds, 1, + cfile, &sidsbuf, &sidsbuflen, out_iov, out_buftype); /* * If first iov is unset, then SMB session was dropped or we've got a * cached open file (@cfile). 
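
The call above shows the reworked calling convention: smb2_compound_op() now takes parallel arrays — a struct kvec array of input payloads and an int array of opcodes, plus num_cmds — so a path-info query and a follow-up FSCTL_GET_REPARSE_POINT can ride in one compounded round trip. A minimal userspace sketch of that convention, separate from the kernel code and using stand-in names:

#include <stddef.h>
#include <stdio.h>

struct kvec { void *iov_base; size_t iov_len; };

enum compound_op { OP_QUERY_INFO = 1, OP_GET_REPARSE };

/* Request i is described by cmds[i] plus its payload/result in in_iov[i]. */
static int compound_op(const struct kvec *in_iov, const int *cmds, int num_cmds)
{
        for (int i = 0; i < num_cmds; i++) {
                switch (cmds[i]) {
                case OP_QUERY_INFO:
                        printf("query info into %zu-byte buffer\n",
                               in_iov[i].iov_len);
                        break;
                case OP_GET_REPARSE:
                        printf("FSCTL_GET_REPARSE_POINT ioctl\n");
                        break;
                default:
                        return -1;     /* unknown opcode */
                }
        }
        return 0;
}

int main(void)
{
        char result[256];
        struct kvec in_iov[2] = {
                { result, sizeof(result) },
                { result, sizeof(result) },
        };
        int cmds[2] = { OP_QUERY_INFO, OP_GET_REPARSE };

        /* Mirrors the pattern above: query the path, and when it turns out
         * to be a non-symlink reparse point, fetch the reparse buffer in the
         * same compounded request instead of a second round trip. */
        return compound_op(in_iov, cmds, 2);
}
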
@@ -690,13 +799,19 @@ int smb311_posix_query_path_info(const unsigned int xid, if (rc || !data->reparse_point) goto out; + if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) { + /* symlink already parsed in create response */ + num_cmds = 1; + } else { + cmds[1] = SMB2_OP_GET_REPARSE; + num_cmds = 2; + } create_options |= OPEN_REPARSE_POINT; - /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, - FILE_OPEN, create_options, ACL_NO_MODE, data, - SMB2_OP_POSIX_QUERY_INFO, cfile, - &sidsbuf, &sidsbuflen, NULL, NULL); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, in_iov, cmds, + num_cmds, cfile, &sidsbuf, &sidsbuflen, NULL, NULL); break; } @@ -721,9 +836,8 @@ out: } kfree(sidsbuf); - free_rsp_buf(out_buftype[0], out_iov[0].iov_base); - free_rsp_buf(out_buftype[1], out_iov[1].iov_base); - free_rsp_buf(out_buftype[2], out_iov[2].iov_base); + for (i = 0; i < ARRAY_SIZE(out_buftype); i++) + free_rsp_buf(out_buftype[i], out_iov[i].iov_base); return rc; } @@ -734,7 +848,8 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, { return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, + CREATE_NOT_FILE, mode, NULL, + &(int){SMB2_OP_MKDIR}, 1, NULL, NULL, NULL, NULL, NULL); } @@ -743,21 +858,24 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, const unsigned int xid) { - FILE_BASIC_INFO data; + FILE_BASIC_INFO data = {}; struct cifsInodeInfo *cifs_i; struct cifsFileInfo *cfile; + struct kvec in_iov; u32 dosattrs; int tmprc; - memset(&data, 0, sizeof(data)); + in_iov.iov_base = &data; + in_iov.iov_len = sizeof(data); cifs_i = CIFS_I(inode); dosattrs = cifs_i->cifsAttrs | ATTR_READONLY; data.Attributes = cpu_to_le32(dosattrs); cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile); tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, ACL_NO_MODE, - &data, SMB2_OP_SET_INFO, cfile, NULL, NULL, NULL, NULL); + CREATE_NOT_FILE, ACL_NO_MODE, &in_iov, + &(int){SMB2_OP_SET_INFO}, 1, + cfile, NULL, NULL, NULL, NULL); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -767,9 +885,10 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { drop_cached_dir_by_name(xid, tcon, name, cifs_sb); - return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_NOT_FILE, ACL_NO_MODE, - NULL, SMB2_OP_RMDIR, NULL, NULL, NULL, NULL, NULL); + return smb2_compound_op(xid, tcon, cifs_sb, name, + DELETE, FILE_OPEN, CREATE_NOT_FILE, + ACL_NO_MODE, NULL, &(int){SMB2_OP_RMDIR}, 1, + NULL, NULL, NULL, NULL, NULL); } int @@ -778,15 +897,17 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL, NULL, NULL); + ACL_NO_MODE, NULL, &(int){SMB2_OP_DELETE}, 1, + NULL, NULL, NULL, NULL, NULL); } -static int -smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb, __u32 access, int command, - struct cifsFileInfo *cfile) +static int smb2_set_path_attr(const unsigned int xid, 
struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb, + __u32 create_options, __u32 access, + int command, struct cifsFileInfo *cfile) { + struct kvec in_iov; __le16 *smb2_to_name = NULL; int rc; @@ -795,36 +916,43 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, rc = -ENOMEM; goto smb2_rename_path; } + in_iov.iov_base = smb2_to_name; + in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX); rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, - command, cfile, NULL, NULL, NULL, NULL); + FILE_OPEN, create_options, ACL_NO_MODE, &in_iov, + &command, 1, cfile, NULL, NULL, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); return rc; } -int -smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb) +int smb2_rename_path(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) { struct cifsFileInfo *cfile; + __u32 co = file_create_options(source_dentry); drop_cached_dir_by_name(xid, tcon, from_name, cifs_sb); cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile); - return smb2_set_path_attr(xid, tcon, from_name, to_name, - cifs_sb, DELETE, SMB2_OP_RENAME, cfile); + return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, + co, DELETE, SMB2_OP_RENAME, cfile); } -int -smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb) +int smb2_create_hardlink(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) { - return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, - FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK, - NULL); + __u32 co = file_create_options(source_dentry); + + return smb2_set_path_attr(xid, tcon, from_name, to_name, + cifs_sb, co, FILE_READ_ATTRIBUTES, + SMB2_OP_HARDLINK, NULL); } int @@ -832,13 +960,18 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, __u64 size, struct cifs_sb_info *cifs_sb, bool set_alloc) { - __le64 eof = cpu_to_le64(size); struct cifsFileInfo *cfile; + struct kvec in_iov; + __le64 eof = cpu_to_le64(size); + in_iov.iov_base = &eof; + in_iov.iov_len = sizeof(eof); cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); return smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, - &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL, NULL, NULL); + FILE_WRITE_DATA, FILE_OPEN, + 0, ACL_NO_MODE, &in_iov, + &(int){SMB2_OP_SET_EOF}, 1, + cfile, NULL, NULL, NULL, NULL); } int @@ -849,6 +982,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path, struct tcon_link *tlink; struct cifs_tcon *tcon; struct cifsFileInfo *cfile; + struct kvec in_iov = { .iov_base = buf, .iov_len = sizeof(*buf), }; int rc; if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) && @@ -864,8 +998,91 @@ smb2_set_file_info(struct inode *inode, const char *full_path, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, - 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile, + 0, ACL_NO_MODE, &in_iov, + &(int){SMB2_OP_SET_INFO}, 1, cfile, NULL, NULL, NULL, NULL); cifs_put_tlink(tlink); return 
rc; } + +struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid, + struct cifs_tcon *tcon, + const char *full_path, + struct kvec *iov) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifsFileInfo *cfile; + struct inode *new = NULL; + struct kvec in_iov[2]; + int cmds[2]; + int da, co, cd; + int rc; + + da = SYNCHRONIZE | DELETE | + FILE_READ_ATTRIBUTES | + FILE_WRITE_ATTRIBUTES; + co = CREATE_NOT_DIR | OPEN_REPARSE_POINT; + cd = FILE_CREATE; + cmds[0] = SMB2_OP_SET_REPARSE; + in_iov[0] = *iov; + in_iov[1].iov_base = data; + in_iov[1].iov_len = sizeof(*data); + + if (tcon->posix_extensions) { + cmds[1] = SMB2_OP_POSIX_QUERY_INFO; + cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + da, cd, co, ACL_NO_MODE, in_iov, + cmds, 2, cfile, NULL, NULL, NULL, NULL); + if (!rc) { + rc = smb311_posix_get_inode_info(&new, full_path, + data, sb, xid); + } + } else { + cmds[1] = SMB2_OP_QUERY_INFO; + cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + da, cd, co, ACL_NO_MODE, in_iov, + cmds, 2, cfile, NULL, NULL, NULL, NULL); + if (!rc) { + rc = cifs_get_inode_info(&new, full_path, + data, sb, xid, NULL); + } + } + return rc ? ERR_PTR(rc) : new; +} + +int smb2_query_reparse_point(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + u32 *tag, struct kvec *rsp, + int *rsp_buftype) +{ + struct cifs_open_info_data data = {}; + struct cifsFileInfo *cfile; + struct kvec in_iov = { .iov_base = &data, .iov_len = sizeof(data), }; + int rc; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + + cifs_get_readable_path(tcon, full_path, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov, + &(int){SMB2_OP_GET_REPARSE}, 1, cfile, + NULL, NULL, NULL, NULL); + if (rc) + goto out; + + *tag = data.reparse.tag; + *rsp = data.reparse.io.iov; + *rsp_buftype = data.reparse.io.buftype; + memset(&data.reparse.io.iov, 0, sizeof(data.reparse.io.iov)); + data.reparse.io.buftype = CIFS_NO_BUFFER; +out: + cifs_free_open_info(&data); + return rc; +} diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 14bc745de199..01a5bd7e6a30 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1935,7 +1935,6 @@ static int smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile, __u64 size, bool set_alloc) { - __le64 eof = cpu_to_le64(size); struct inode *inode; /* @@ -1952,7 +1951,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, } return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, size); } static int @@ -2948,18 +2947,6 @@ int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, bool unicode, struct cifs_open_info_data *data) { - if (plen < sizeof(*buf)) { - cifs_dbg(VFS, "%s: reparse buffer is too small. 
Must be at least 8 bytes but was %d\n", - __func__, plen); - return -EIO; - } - - if (plen < le16_to_cpu(buf->ReparseDataLength) + sizeof(*buf)) { - cifs_dbg(VFS, "%s: invalid reparse buf length: %d\n", - __func__, plen); - return -EIO; - } - data->reparse.buf = buf; /* See MS-FSCC 2.1.2 */ @@ -2997,145 +2984,6 @@ static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, return parse_reparse_point(buf, plen, cifs_sb, true, data); } -static int smb2_query_reparse_point(const unsigned int xid, - struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - const char *full_path, - u32 *tag, struct kvec *rsp, - int *rsp_buftype) -{ - struct smb2_compound_vars *vars; - int rc; - __le16 *utf16_path = NULL; - __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; - struct cifs_open_parms oparms; - struct cifs_fid fid; - struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - int flags = CIFS_CP_CREATE_CLOSE_OP; - struct smb_rqst *rqst; - int resp_buftype[3]; - struct kvec *rsp_iov; - struct smb2_ioctl_rsp *ioctl_rsp; - struct reparse_data_buffer *reparse_buf; - u32 off, count, len; - - cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); - - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - - utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); - if (!utf16_path) - return -ENOMEM; - - resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - vars = kzalloc(sizeof(*vars), GFP_KERNEL); - if (!vars) { - rc = -ENOMEM; - goto out_free_path; - } - rqst = vars->rqst; - rsp_iov = vars->rsp_iov; - - /* - * setup smb2open - TODO add optimization to call cifs_get_readable_path - * to see if there is a handle already open that we can use - */ - rqst[0].rq_iov = vars->open_iov; - rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .path = full_path, - .desired_access = FILE_READ_ATTRIBUTES, - .disposition = FILE_OPEN, - .create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT), - .fid = &fid, - }; - - rc = SMB2_open_init(tcon, server, - &rqst[0], &oplock, &oparms, utf16_path); - if (rc) - goto query_rp_exit; - smb2_set_next_command(tcon, &rqst[0]); - - - /* IOCTL */ - rqst[1].rq_iov = vars->io_iov; - rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; - - rc = SMB2_ioctl_init(tcon, server, - &rqst[1], COMPOUND_FID, - COMPOUND_FID, FSCTL_GET_REPARSE_POINT, NULL, 0, - CIFSMaxBufSize - - MAX_SMB2_CREATE_RESPONSE_SIZE - - MAX_SMB2_CLOSE_RESPONSE_SIZE); - if (rc) - goto query_rp_exit; - - smb2_set_next_command(tcon, &rqst[1]); - smb2_set_related(&rqst[1]); - - /* Close */ - rqst[2].rq_iov = &vars->close_iov; - rqst[2].rq_nvec = 1; - - rc = SMB2_close_init(tcon, server, - &rqst[2], COMPOUND_FID, COMPOUND_FID, false); - if (rc) - goto query_rp_exit; - - smb2_set_related(&rqst[2]); - - rc = compound_send_recv(xid, tcon->ses, server, - flags, 3, rqst, - resp_buftype, rsp_iov); - - ioctl_rsp = rsp_iov[1].iov_base; - - /* - * Open was successful and we got an ioctl response. 
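
The length checks being dropped from parse_reparse_point() and from the deleted smb2_query_reparse_point() here are not lost — they move into the reparse_buf_ptr() helper added in smb2inode.c earlier in this diff. A minimal standalone sketch of that overflow-safe validation, using __builtin_add_overflow in place of the kernel's check_add_overflow() and a simplified header struct (endianness and alignment handling omitted):

#include <stddef.h>
#include <stdint.h>

struct reparse_hdr {                   /* simplified reparse_data_buffer header */
        uint32_t reparse_tag;
        uint16_t reparse_data_length;
        uint16_t reserved;
};

static const struct reparse_hdr *
validate_reparse_reply(const uint8_t *rsp, size_t rsp_len,
                       uint32_t out_off, uint32_t out_count)
{
        const struct reparse_hdr *hdr;
        uint32_t end;

        /* OutputOffset + OutputCount must not overflow and must stay
         * inside the received ioctl response. */
        if (__builtin_add_overflow(out_off, out_count, &end) || end > rsp_len)
                return NULL;

        /* The payload must hold at least the fixed header, and the declared
         * ReparseDataLength must also fit within OutputCount. */
        if (out_count < sizeof(*hdr))
                return NULL;
        hdr = (const struct reparse_hdr *)(rsp + out_off);
        if (out_count < sizeof(*hdr) + hdr->reparse_data_length)
                return NULL;
        return hdr;                    /* safe to read tag and data length */
}
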
- */ - if (rc == 0) { - /* See MS-FSCC 2.3.23 */ - off = le32_to_cpu(ioctl_rsp->OutputOffset); - count = le32_to_cpu(ioctl_rsp->OutputCount); - if (check_add_overflow(off, count, &len) || - len > rsp_iov[1].iov_len) { - cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n", - __func__, off, count); - rc = -EIO; - goto query_rp_exit; - } - - reparse_buf = (void *)((u8 *)ioctl_rsp + off); - len = sizeof(*reparse_buf); - if (count < len || - count < le16_to_cpu(reparse_buf->ReparseDataLength) + len) { - cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n", - __func__, off, count); - rc = -EIO; - goto query_rp_exit; - } - *tag = le32_to_cpu(reparse_buf->ReparseTag); - *rsp = rsp_iov[1]; - *rsp_buftype = resp_buftype[1]; - resp_buftype[1] = CIFS_NO_BUFFER; - } - - query_rp_exit: - SMB2_open_free(&rqst[0]); - SMB2_ioctl_free(&rqst[1]); - SMB2_close_free(&rqst[2]); - free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); - free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); - kfree(vars); -out_free_path: - kfree(utf16_path); - return rc; -} - static struct cifs_ntsd * get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb, const struct cifs_fid *cifsfid, u32 *pacllen, u32 info) @@ -3336,7 +3184,6 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, unsigned long long new_size; long rc; unsigned int xid; - __le64 eof; xid = get_xid(); @@ -3366,9 +3213,8 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, */ new_size = offset + len; if (keep_size == false && (unsigned long long)i_size_read(inode) < new_size) { - eof = cpu_to_le64(new_size); rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, new_size); if (rc >= 0) { truncate_setsize(inode, new_size); fscache_resize_cookie(cifs_inode_cookie(inode), new_size); @@ -3561,7 +3407,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, struct cifsFileInfo *cfile = file->private_data; long rc = -EOPNOTSUPP; unsigned int xid; - __le64 eof; + loff_t new_eof; xid = get_xid(); @@ -3590,14 +3436,14 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, if (cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) smb2_set_sparse(xid, tcon, cfile, inode, false); - eof = cpu_to_le64(off + len); + new_eof = off + len; rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, new_eof); if (rc == 0) { - cifsi->server_eof = off + len; - cifs_setsize(inode, off + len); + cifsi->server_eof = new_eof; + cifs_setsize(inode, new_eof); cifs_truncate_page(inode->i_mapping, inode->i_size); - truncate_setsize(inode, off + len); + truncate_setsize(inode, new_eof); } goto out; } @@ -3688,8 +3534,7 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; struct cifsInodeInfo *cifsi = CIFS_I(inode); - __le64 eof; - loff_t old_eof; + loff_t old_eof, new_eof; xid = get_xid(); @@ -3714,9 +3559,9 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, if (rc < 0) goto out_2; - eof = cpu_to_le64(old_eof - len); + new_eof = old_eof - len; rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, new_eof); if (rc < 0) goto out_2; @@ -3740,8 +3585,7 @@ static long 
smb3_insert_range(struct file *file, struct cifs_tcon *tcon, unsigned int xid; struct cifsFileInfo *cfile = file->private_data; struct inode *inode = file_inode(file); - __le64 eof; - __u64 count, old_eof; + __u64 count, old_eof, new_eof; xid = get_xid(); @@ -3754,20 +3598,20 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, } count = old_eof - off; - eof = cpu_to_le64(old_eof + len); + new_eof = old_eof + len; filemap_invalidate_lock(inode->i_mapping); - rc = filemap_write_and_wait_range(inode->i_mapping, off, old_eof + len - 1); + rc = filemap_write_and_wait_range(inode->i_mapping, off, new_eof - 1); if (rc < 0) goto out_2; truncate_pagecache_range(inode, off, old_eof); rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, new_eof); if (rc < 0) goto out_2; - truncate_setsize(inode, old_eof + len); + truncate_setsize(inode, new_eof); fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode)); rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); @@ -5171,11 +5015,154 @@ int cifs_sfu_make_node(unsigned int xid, struct inode *inode, return rc; } +static inline u64 mode_nfs_type(mode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFBLK: return NFS_SPECFILE_BLK; + case S_IFCHR: return NFS_SPECFILE_CHR; + case S_IFIFO: return NFS_SPECFILE_FIFO; + case S_IFSOCK: return NFS_SPECFILE_SOCK; + } + return 0; +} + +static int nfs_set_reparse_buf(struct reparse_posix_data *buf, + mode_t mode, dev_t dev, + struct kvec *iov) +{ + u64 type; + u16 len, dlen; + + len = sizeof(*buf); + + switch ((type = mode_nfs_type(mode))) { + case NFS_SPECFILE_BLK: + case NFS_SPECFILE_CHR: + dlen = sizeof(__le64); + break; + case NFS_SPECFILE_FIFO: + case NFS_SPECFILE_SOCK: + dlen = 0; + break; + default: + return -EOPNOTSUPP; + } + + buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_NFS); + buf->Reserved = 0; + buf->InodeType = cpu_to_le64(type); + buf->ReparseDataLength = cpu_to_le16(len + dlen - + sizeof(struct reparse_data_buffer)); + *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) | + MINOR(dev)); + iov->iov_base = buf; + iov->iov_len = len + dlen; + return 0; +} + +static int nfs_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_open_info_data data; + struct reparse_posix_data *p; + struct inode *new; + struct kvec iov; + __u8 buf[sizeof(*p) + sizeof(__le64)]; + int rc; + + p = (struct reparse_posix_data *)buf; + rc = nfs_set_reparse_buf(p, mode, dev, &iov); + if (rc) + return rc; + + data = (struct cifs_open_info_data) { + .reparse_point = true, + .reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, }, + }; + + new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + tcon, full_path, &iov); + if (!IS_ERR(new)) + d_instantiate(dentry, new); + else + rc = PTR_ERR(new); + cifs_free_open_info(&data); + return rc; +} + +static int smb2_create_reparse_symlink(const unsigned int xid, + struct inode *inode, + struct dentry *dentry, + struct cifs_tcon *tcon, + const char *full_path, + const char *symname) +{ + struct reparse_symlink_data_buffer *buf = NULL; + struct cifs_open_info_data data; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct inode *new; + struct kvec iov; + __le16 *path; + char *sym; + u16 len, plen; + int rc = 0; + + sym = kstrdup(symname, GFP_KERNEL); + if (!sym) + return -ENOMEM; + + data = (struct cifs_open_info_data) { + 
.reparse_point = true, + .reparse = { .tag = IO_REPARSE_TAG_SYMLINK, }, + .symlink_target = sym, + }; + + path = cifs_convert_path_to_utf16(symname, cifs_sb); + if (!path) { + rc = -ENOMEM; + goto out; + } + + plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX); + len = sizeof(*buf) + plen * 2; + buf = kzalloc(len, GFP_KERNEL); + if (!buf) { + rc = -ENOMEM; + goto out; + } + + buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK); + buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer)); + buf->SubstituteNameOffset = cpu_to_le16(plen); + buf->SubstituteNameLength = cpu_to_le16(plen); + memcpy(&buf->PathBuffer[plen], path, plen); + buf->PrintNameOffset = 0; + buf->PrintNameLength = cpu_to_le16(plen); + memcpy(buf->PathBuffer, path, plen); + buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0); + + iov.iov_base = buf; + iov.iov_len = len; + new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + tcon, full_path, &iov); + if (!IS_ERR(new)) + d_instantiate(dentry, new); + else + rc = PTR_ERR(new); +out: + kfree(path); + cifs_free_open_info(&data); + kfree(buf); + return rc; +} + static int smb2_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + int rc; /* * Check if mounted with mount parm 'sfu' mount parm. @@ -5183,15 +5170,14 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, * supports block and char device (no socket & fifo), * and was used by default in earlier versions of Windows */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - return -EPERM; - /* - * TODO: Add ability to create instead via reparse point. Windows (e.g. - * their current NFS server) uses this approach to expose special files - * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions - */ - return cifs_sfu_make_node(xid, inode, dentry, tcon, - full_path, mode, dev); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + rc = cifs_sfu_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); + } else { + rc = nfs_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); + } + return rc; } #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY @@ -5247,6 +5233,7 @@ struct smb_version_operations smb20_operations = { .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, + .create_reparse_symlink = smb2_create_reparse_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -5349,6 +5336,7 @@ struct smb_version_operations smb21_operations = { .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, + .create_reparse_symlink = smb2_create_reparse_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -5454,6 +5442,7 @@ struct smb_version_operations smb30_operations = { .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, + .create_reparse_symlink = smb2_create_reparse_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -5568,6 +5557,7 @@ struct smb_version_operations smb311_operations = { .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, + .create_reparse_symlink = 
smb2_create_reparse_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 4f971c1061f0..bd25c34dc398 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -5347,18 +5347,18 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 pid, __le64 *eof) + u64 volatile_fid, u32 pid, loff_t new_eof) { struct smb2_file_eof_info info; void *data; unsigned int size; - info.EndOfFile = *eof; + info.EndOfFile = cpu_to_le64(new_eof); data = &info; size = sizeof(struct smb2_file_eof_info); - trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, le64_to_cpu(*eof)); + trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, new_eof); return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE, diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 0e371f7e2854..343ada691e76 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -56,6 +56,18 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); +struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid, + struct cifs_tcon *tcon, + const char *full_path, + struct kvec *iov); +int smb2_query_reparse_point(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + u32 *tag, struct kvec *rsp, + int *rsp_buftype); int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, @@ -80,12 +92,16 @@ extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); -extern int smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb); -extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb); +int smb2_rename_path(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); +int smb2_create_hardlink(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); extern int smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_written); @@ -205,7 +221,7 @@ extern int SMB2_query_directory_init(unsigned int xid, struct cifs_tcon *tcon, extern void SMB2_query_directory_free(struct smb_rqst *rqst); extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 pid, - __le64 *eof); + loff_t new_eof); extern int SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, struct smb_rqst *rqst, @@ -289,4 +305,5 @@ int smb311_posix_query_path_info(const unsigned int xid, int posix_info_parse(const void *beg, 
const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); + #endif /* _SMB2PROTO_H */ diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 94df9eec3d8d..d74e829de51c 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -2136,7 +2136,7 @@ static int allocate_mr_list(struct smbd_connection *info) for (i = 0; i < info->responder_resources * 2; i++) { smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); if (!smbdirect_mr) - goto out; + goto cleanup_entries; smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type, info->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { @@ -2162,7 +2162,7 @@ static int allocate_mr_list(struct smbd_connection *info) out: kfree(smbdirect_mr); - +cleanup_entries: list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) { list_del(&smbdirect_mr->list); ib_dereg_mr(smbdirect_mr->mr); diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index de199ec9f726..522fa387fcfd 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -370,11 +370,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); - DECLARE_EVENT_CLASS(smb3_inf_compound_done_class, TP_PROTO(unsigned int xid, __u32 tid, @@ -408,6 +409,8 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); @@ -451,6 +454,8 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 229a6527870d..09b20039636e 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -208,10 +208,12 @@ out: /** * ksmbd_auth_ntlmv2() - NTLMv2 authentication handler - * @sess: session of connection + * @conn: connection + * @sess: session of connection * @ntlmv2: NTLMv2 challenge response * @blen: NTLMv2 blob length * @domain_name: domain name + * @cryptkey: session crypto key * * Return: 0 on success, error number on error */ @@ -294,7 +296,8 @@ out: * ksmbd_decode_ntlmssp_auth_blob() - helper function to construct * authenticate blob * @authblob: authenticate blob source pointer - * @usr: user details + * @blob_len: length of the @authblob message + * @conn: connection * @sess: session of connection * * Return: 0 on success, error number on error @@ -376,8 +379,8 @@ int 
ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, * ksmbd_decode_ntlmssp_neg_blob() - helper function to construct * negotiate blob * @negblob: negotiate blob source pointer - * @rsp: response header pointer to be updated - * @sess: session of connection + * @blob_len: length of the @authblob message + * @conn: connection * */ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, @@ -403,8 +406,7 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, * ksmbd_build_ntlmssp_challenge_blob() - helper function to construct * challenge blob * @chgblob: challenge blob source pointer to initialize - * @rsp: response header pointer to be updated - * @sess: session of connection + * @conn: connection * */ unsigned int diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index b6fa1e285c40..d311c2ee10bd 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -284,6 +284,7 @@ int ksmbd_conn_handler_loop(void *p) goto out; conn->last_active = jiffies; + set_freezable(); while (ksmbd_conn_alive(conn)) { if (try_to_freeze()) continue; diff --git a/fs/smb/server/mgmt/ksmbd_ida.c b/fs/smb/server/mgmt/ksmbd_ida.c index 54194d959a5e..a18e27e9e0cd 100644 --- a/fs/smb/server/mgmt/ksmbd_ida.c +++ b/fs/smb/server/mgmt/ksmbd_ida.c @@ -5,42 +5,33 @@ #include "ksmbd_ida.h" -static inline int __acquire_id(struct ida *ida, int from, int to) -{ - return ida_simple_get(ida, from, to, GFP_KERNEL); -} - int ksmbd_acquire_smb2_tid(struct ida *ida) { - int id; - - id = __acquire_id(ida, 1, 0xFFFFFFFF); - - return id; + return ida_alloc_range(ida, 1, 0xFFFFFFFE, GFP_KERNEL); } int ksmbd_acquire_smb2_uid(struct ida *ida) { int id; - id = __acquire_id(ida, 1, 0); + id = ida_alloc_min(ida, 1, GFP_KERNEL); if (id == 0xFFFE) - id = __acquire_id(ida, 1, 0); + id = ida_alloc_min(ida, 1, GFP_KERNEL); return id; } int ksmbd_acquire_async_msg_id(struct ida *ida) { - return __acquire_id(ida, 1, 0); + return ida_alloc_min(ida, 1, GFP_KERNEL); } int ksmbd_acquire_id(struct ida *ida) { - return __acquire_id(ida, 0, 0); + return ida_alloc(ida, GFP_KERNEL); } void ksmbd_release_id(struct ida *ida, int id) { - ida_simple_remove(ida, id); + ida_free(ida, id); } diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 562b180459a1..001926d3b348 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -105,7 +105,7 @@ static int alloc_lease(struct oplock_info *opinfo, struct lease_ctx_info *lctx) lease->is_dir = lctx->is_dir; memcpy(lease->parent_lease_key, lctx->parent_lease_key, SMB2_LEASE_KEY_SIZE); lease->version = lctx->version; - lease->epoch = le16_to_cpu(lctx->epoch); + lease->epoch = le16_to_cpu(lctx->epoch) + 1; INIT_LIST_HEAD(&opinfo->lease_entry); opinfo->o_lease = lease; @@ -546,6 +546,7 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci, atomic_read(&ci->sop_count)) == 1) { if (lease->state != SMB2_LEASE_NONE_LE && lease->state == (lctx->req_state & lease->state)) { + lease->epoch++; lease->state |= lctx->req_state; if (lctx->req_state & SMB2_LEASE_WRITE_CACHING_LE) @@ -556,13 +557,17 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci, atomic_read(&ci->sop_count)) > 1) { if (lctx->req_state == (SMB2_LEASE_READ_CACHING_LE | - SMB2_LEASE_HANDLE_CACHING_LE)) + SMB2_LEASE_HANDLE_CACHING_LE)) { + lease->epoch++; lease->state = lctx->req_state; + } } if (lctx->req_state && lease->state == - SMB2_LEASE_NONE_LE) + SMB2_LEASE_NONE_LE) { + lease->epoch++; lease_none_upgrade(opinfo, 
lctx->req_state); + } } read_lock(&ci->m_lock); } @@ -1035,7 +1040,8 @@ static void copy_lease(struct oplock_info *op1, struct oplock_info *op2) SMB2_LEASE_KEY_SIZE); lease2->duration = lease1->duration; lease2->flags = lease1->flags; - lease2->epoch = lease1->epoch++; + lease2->epoch = lease1->epoch; + lease2->version = lease1->version; } static int add_lease_global_list(struct oplock_info *opinfo) @@ -1447,7 +1453,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease) memcpy(buf->lcontext.LeaseKey, lease->lease_key, SMB2_LEASE_KEY_SIZE); buf->lcontext.LeaseFlags = lease->flags; - buf->lcontext.Epoch = cpu_to_le16(++lease->epoch); + buf->lcontext.Epoch = cpu_to_le16(lease->epoch); buf->lcontext.LeaseState = lease->state; memcpy(buf->lcontext.ParentLeaseKey, lease->parent_lease_key, SMB2_LEASE_KEY_SIZE); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 652ab429bf2e..3143819935dc 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2311,11 +2311,12 @@ out: * @eabuf: set info command buffer * @buf_len: set info command buffer length * @path: dentry path for get ea + * @get_write: get write access to a mount * * Return: 0 on success, otherwise error */ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, - const struct path *path) + const struct path *path, bool get_write) { struct mnt_idmap *idmap = mnt_idmap(path->mnt); char *attr_name = NULL, *value; @@ -2971,7 +2972,7 @@ int smb2_open(struct ksmbd_work *work) &may_flags); if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { - if (open_flags & O_CREAT) { + if (open_flags & (O_CREAT | O_TRUNC)) { ksmbd_debug(SMB, "User does not have write permission\n"); rc = -EACCES; @@ -3003,7 +3004,7 @@ int smb2_open(struct ksmbd_work *work) rc = smb2_set_ea(&ea_buf->ea, le32_to_cpu(ea_buf->ccontext.DataLength), - &path); + &path, false); if (rc == -EOPNOTSUPP) rc = 0; else if (rc) @@ -5568,6 +5569,7 @@ static int smb2_rename(struct ksmbd_work *work, if (!file_info->ReplaceIfExists) flags = RENAME_NOREPLACE; + smb_break_all_levII_oplock(work, fp, 0); rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags); out: kfree(new_name); @@ -5943,12 +5945,6 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, } case FILE_RENAME_INFORMATION: { - if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { - ksmbd_debug(SMB, - "User does not have write permission\n"); - return -EACCES; - } - if (buf_len < sizeof(struct smb2_file_rename_info)) return -EINVAL; @@ -5968,12 +5964,6 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, } case FILE_DISPOSITION_INFORMATION: { - if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { - ksmbd_debug(SMB, - "User does not have write permission\n"); - return -EACCES; - } - if (buf_len < sizeof(struct smb2_file_disposition_info)) return -EINVAL; @@ -5992,7 +5982,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, return -EINVAL; return smb2_set_ea((struct smb2_ea_info *)req->Buffer, - buf_len, &fp->filp->f_path); + buf_len, &fp->filp->f_path, true); } case FILE_POSITION_INFORMATION: { @@ -6035,7 +6025,7 @@ int smb2_set_info(struct ksmbd_work *work) { struct smb2_set_info_req *req; struct smb2_set_info_rsp *rsp; - struct ksmbd_file *fp; + struct ksmbd_file *fp = NULL; int rc = 0; unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; @@ -6055,6 +6045,13 @@ int smb2_set_info(struct ksmbd_work *work) rsp = smb2_get_msg(work->response_buf); } + if 
(!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { + ksmbd_debug(SMB, "User does not have write permission\n"); + pr_err("User does not have write permission\n"); + rc = -EACCES; + goto err_out; + } + if (!has_file_id(id)) { id = req->VolatileFileId; pid = req->PersistentFileId; diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 6691ae68af0c..7c98bf699772 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -158,8 +158,12 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work) */ bool ksmbd_smb_request(struct ksmbd_conn *conn) { - __le32 *proto = (__le32 *)smb2_get_msg(conn->request_buf); + __le32 *proto; + if (conn->request_buf[0] != 0) + return false; + + proto = (__le32 *)smb2_get_msg(conn->request_buf); if (*proto == SMB2_COMPRESSION_TRANSFORM_ID) { pr_err_ratelimited("smb2 compression not support yet"); return false; diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index 1164365533f0..1c9775f1efa5 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -401,10 +401,6 @@ static void parse_dacl(struct mnt_idmap *idmap, if (num_aces > ULONG_MAX / sizeof(struct smb_ace *)) return; - ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL); - if (!ppace) - return; - ret = init_acl_state(&acl_state, num_aces); if (ret) return; @@ -414,6 +410,13 @@ static void parse_dacl(struct mnt_idmap *idmap, return; } + ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL); + if (!ppace) { + free_acl_state(&default_acl_state); + free_acl_state(&acl_state); + return; + } + /* * reset rwx permissions for user/group/other. * Also, if num_aces is 0 i.e. DACL has no ACEs, diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 4277750a6da1..a6961bfe3e13 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -49,6 +49,10 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, /** * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable + * @parent: parent dentry + * @child: child dentry + * + * Returns: %0 on success, %-ENOENT if the parent dentry is not stable */ int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child) { @@ -360,7 +364,7 @@ out: /** * ksmbd_vfs_read() - vfs helper for smb file read * @work: smb work - * @fid: file id of open file + * @fp: ksmbd file pointer * @count: read byte count * @pos: file pos * @rbuf: read data buffer @@ -474,7 +478,7 @@ out: /** * ksmbd_vfs_write() - vfs helper for smb file write * @work: work - * @fid: file id of open file + * @fp: ksmbd file pointer * @buf: buf containing data for writing * @count: read byte count * @pos: file pos @@ -545,10 +549,8 @@ out: /** * ksmbd_vfs_getattr() - vfs helper for smb getattr - * @work: work - * @fid: file id of open file - * @attrs: inode attributes - * + * @path: path of dentry + * @stat: pointer to returned kernel stat structure * Return: 0 on success, otherwise error */ int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat) @@ -565,6 +567,7 @@ int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat) * ksmbd_vfs_fsync() - vfs helper for smb fsync * @work: work * @fid: file id of open file + * @p_id: persistent file id * * Return: 0 on success, otherwise error */ @@ -587,7 +590,8 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id) /** * ksmbd_vfs_remove_file() - vfs helper for smb rmdir or unlink - * @name: directory or file name that is relative to share + * @work: work + * @path: path of dentry * * Return: 0 on success, otherwise error 
*/ @@ -623,6 +627,7 @@ out_err: /** * ksmbd_vfs_link() - vfs helper for creating smb hardlink + * @work: work * @oldname: source file name * @newname: hardlink name that is relative to share * @@ -715,6 +720,10 @@ retry: goto out2; trap = lock_rename_child(old_child, new_path.dentry); + if (IS_ERR(trap)) { + err = PTR_ERR(trap); + goto out_drop_write; + } old_parent = dget(old_child->d_parent); if (d_unhashed(old_child)) { @@ -777,6 +786,7 @@ out4: out3: dput(old_parent); unlock_rename(old_parent, new_path.dentry); +out_drop_write: mnt_drop_write(old_path->mnt); out2: path_put(&new_path); @@ -795,7 +805,7 @@ revert_fsids: /** * ksmbd_vfs_truncate() - vfs helper for smb file truncate * @work: work - * @fid: file id of old file + * @fp: ksmbd file pointer * @size: truncate to given size * * Return: 0 on success, otherwise error @@ -838,7 +848,6 @@ int ksmbd_vfs_truncate(struct ksmbd_work *work, * ksmbd_vfs_listxattr() - vfs helper for smb list extended attributes * @dentry: dentry of file for listing xattrs * @list: destination buffer - * @size: destination buffer length * * Return: xattr list length on success, otherwise error */ @@ -947,7 +956,7 @@ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, /** * ksmbd_vfs_set_fadvise() - convert smb IO caching options to linux options * @filp: file pointer for IO - * @options: smb IO options + * @option: smb IO options */ void ksmbd_vfs_set_fadvise(struct file *filp, __le32 option) { @@ -1159,6 +1168,7 @@ static bool __caseless_lookup(struct dir_context *ctx, const char *name, * @dir: path info * @name: filename to lookup * @namelen: filename length + * @um: &struct unicode_map to use * * Return: 0 on success, otherwise error */ @@ -1189,6 +1199,7 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, /** * ksmbd_vfs_kern_path_locked() - lookup a file and get path info + * @work: work * @name: file path that is relative to share * @flags: lookup flags * @parent_path: if lookup succeed, return parent_path info @@ -1636,6 +1647,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, * ksmbd_vfs_init_kstat() - convert unix stat information to smb stat format * @p: destination buffer * @ksmbd_kstat: ksmbd kstat wrapper + * + * Returns: pointer to the converted &struct file_directory_info */ void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat) { diff --git a/fs/splice.c b/fs/splice.c index d983d375ff11..218e24b1ac40 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -201,7 +201,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, unsigned int tail = pipe->tail; unsigned int head = pipe->head; unsigned int mask = pipe->ring_size - 1; - int ret = 0, page_nr = 0; + ssize_t ret = 0; + int page_nr = 0; if (!spd_pages) return 0; @@ -673,10 +674,13 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; int nbufs = pipe->max_usage; - struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), - GFP_KERNEL); + struct bio_vec *array; ssize_t ret; + if (!out->f_op->write_iter) + return -EINVAL; + + array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (unlikely(!array)) return -ENOMEM; @@ -684,6 +688,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, splice_from_pipe_begin(&sd); while (sd.total_len) { + struct kiocb kiocb; struct iov_iter from; unsigned int head, tail, mask; size_t left; @@ -733,7 +738,10 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, } iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); - 
ret = vfs_iter_write(out, &from, &sd.pos, 0); + init_sync_kiocb(&kiocb, out); + kiocb.ki_pos = sd.pos; + ret = call_write_iter(out, &kiocb, &from); + sd.pos = kiocb.ki_pos; if (ret <= 0) break; @@ -925,8 +933,8 @@ static int warn_unsupported(struct file *file, const char *op) /* * Attempt to initiate a splice from pipe to file. */ -static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) +static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { if (unlikely(!out->f_op->splice_write)) return warn_unsupported(out, "write"); @@ -944,27 +952,15 @@ static void do_splice_eof(struct splice_desc *sd) sd->splice_eof(sd); } -/** - * vfs_splice_read - Read data from a file and splice it into a pipe - * @in: File to splice from - * @ppos: Input file offset - * @pipe: Pipe to splice to - * @len: Number of bytes to splice - * @flags: Splice modifier flags (SPLICE_F_*) - * - * Splice the requested amount of data from the input file to the pipe. This - * is synchronous as the caller must hold the pipe lock across the entire - * operation. - * - * If successful, it returns the amount of data spliced, 0 if it hit the EOF or - * a hole and a negative error code otherwise. +/* + * Callers already called rw_verify_area() on the entire range. + * No need to call it for sub ranges. */ -long vfs_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +static ssize_t do_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { unsigned int p_space; - int ret; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; @@ -975,10 +971,6 @@ long vfs_splice_read(struct file *in, loff_t *ppos, p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); len = min_t(size_t, len, p_space << PAGE_SHIFT); - ret = rw_verify_area(READ, in, ppos, len); - if (unlikely(ret < 0)) - return ret; - if (unlikely(len > MAX_RW_COUNT)) len = MAX_RW_COUNT; @@ -992,6 +984,34 @@ long vfs_splice_read(struct file *in, loff_t *ppos, return copy_splice_read(in, ppos, pipe, len, flags); return in->f_op->splice_read(in, ppos, pipe, len, flags); } + +/** + * vfs_splice_read - Read data from a file and splice it into a pipe + * @in: File to splice from + * @ppos: Input file offset + * @pipe: Pipe to splice to + * @len: Number of bytes to splice + * @flags: Splice modifier flags (SPLICE_F_*) + * + * Splice the requested amount of data from the input file to the pipe. This + * is synchronous as the caller must hold the pipe lock across the entire + * operation. + * + * If successful, it returns the amount of data spliced, 0 if it hit the EOF or + * a hole and a negative error code otherwise. 
+ */ +ssize_t vfs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t ret; + + ret = rw_verify_area(READ, in, ppos, len); + if (unlikely(ret < 0)) + return ret; + + return do_splice_read(in, ppos, pipe, len, flags); +} EXPORT_SYMBOL_GPL(vfs_splice_read); /** @@ -1011,7 +1031,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, splice_direct_actor *actor) { struct pipe_inode_info *pipe; - long ret, bytes; + ssize_t ret, bytes; size_t len; int i, flags, more; @@ -1066,7 +1086,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, size_t read_len; loff_t pos = sd->pos, prev_pos = pos; - ret = vfs_splice_read(in, &pos, pipe, len, flags); + ret = do_splice_read(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) goto read_failure; @@ -1138,9 +1158,20 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { struct file *file = sd->u.file; + long ret; + + file_start_write(file); + ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); + file_end_write(file); + return ret; +} + +static int splice_file_range_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + struct file *file = sd->u.file; - return do_splice_from(pipe, file, sd->opos, sd->total_len, - sd->flags); + return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } static void direct_file_splice_eof(struct splice_desc *sd) @@ -1151,24 +1182,10 @@ static void direct_file_splice_eof(struct splice_desc *sd) file->f_op->splice_eof(file); } -/** - * do_splice_direct - splices data directly between two files - * @in: file to splice from - * @ppos: input file offset - * @out: file to splice to - * @opos: output file offset - * @len: number of bytes to splice - * @flags: splice modifier flags - * - * Description: - * For use by do_sendfile(). splice can easily emulate sendfile, but - * doing it in the application would incur an extra system call - * (splice in + splice out, as compared to just sendfile()). So this helper - * can splice directly through a process-private pipe. - * - */ -long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags) +static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos, + struct file *out, loff_t *opos, + size_t len, unsigned int flags, + splice_direct_actor *actor) { struct splice_desc sd = { .len = len, @@ -1179,7 +1196,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, .splice_eof = direct_file_splice_eof, .opos = opos, }; - long ret; + ssize_t ret; if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; @@ -1187,18 +1204,63 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; - ret = rw_verify_area(WRITE, out, opos, len); - if (unlikely(ret < 0)) - return ret; - - ret = splice_direct_to_actor(in, &sd, direct_splice_actor); + ret = splice_direct_to_actor(in, &sd, actor); if (ret > 0) *ppos = sd.pos; return ret; } +/** + * do_splice_direct - splices data directly between two files + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @opos: output file offset + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * For use by do_sendfile(). 
splice can easily emulate sendfile, but + * doing it in the application would incur an extra system call + * (splice in + splice out, as compared to just sendfile()). So this helper + * can splice directly through a process-private pipe. + * + * Callers already called rw_verify_area() on the entire range. + */ +ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags) +{ + return do_splice_direct_actor(in, ppos, out, opos, len, flags, + direct_splice_actor); +} EXPORT_SYMBOL(do_splice_direct); +/** + * splice_file_range - splices data between two files for copy_file_range() + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @opos: output file offset + * @len: number of bytes to splice + * + * Description: + * For use by ->copy_file_range() methods. + * Like do_splice_direct(), but vfs_copy_file_range() already holds + * start_file_write() on @out file. + * + * Callers already called rw_verify_area() on the entire range. + */ +ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len) +{ + lockdep_assert(file_write_started(out)); + + return do_splice_direct_actor(in, ppos, out, opos, + min_t(size_t, len, MAX_RW_COUNT), + 0, splice_file_range_actor); +} +EXPORT_SYMBOL(splice_file_range); + static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) { for (;;) { @@ -1220,17 +1282,17 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); -long splice_file_to_pipe(struct file *in, - struct pipe_inode_info *opipe, - loff_t *offset, - size_t len, unsigned int flags) +ssize_t splice_file_to_pipe(struct file *in, + struct pipe_inode_info *opipe, + loff_t *offset, + size_t len, unsigned int flags) { - long ret; + ssize_t ret; pipe_lock(opipe); ret = wait_for_space(opipe, flags); if (!ret) - ret = vfs_splice_read(in, offset, opipe, len, flags); + ret = do_splice_read(in, offset, opipe, len, flags); pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); @@ -1240,13 +1302,13 @@ long splice_file_to_pipe(struct file *in, /* * Determine where to splice to/from. 
*/ -long do_splice(struct file *in, loff_t *off_in, struct file *out, - loff_t *off_out, size_t len, unsigned int flags) +ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out, + loff_t *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset; - long ret; + ssize_t ret; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) @@ -1307,6 +1369,10 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, offset = in->f_pos; } + ret = rw_verify_area(READ, in, &offset, len); + if (unlikely(ret < 0)) + return ret; + if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; @@ -1333,14 +1399,14 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, return ret; } -static long __do_splice(struct file *in, loff_t __user *off_in, - struct file *out, loff_t __user *off_out, - size_t len, unsigned int flags) +static ssize_t __do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset, *__off_in = NULL, *__off_out = NULL; - long ret; + ssize_t ret; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); @@ -1379,16 +1445,16 @@ static long __do_splice(struct file *in, loff_t __user *off_in, return ret; } -static int iter_to_pipe(struct iov_iter *from, - struct pipe_inode_info *pipe, - unsigned flags) +static ssize_t iter_to_pipe(struct iov_iter *from, + struct pipe_inode_info *pipe, + unsigned int flags) { struct pipe_buffer buf = { .ops = &user_page_pipe_buf_ops, .flags = flags }; size_t total = 0; - int ret = 0; + ssize_t ret = 0; while (iov_iter_count(from)) { struct page *pages[16]; @@ -1437,8 +1503,8 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ -static long vmsplice_to_user(struct file *file, struct iov_iter *iter, - unsigned int flags) +static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, + unsigned int flags) { struct pipe_inode_info *pipe = get_pipe_info(file, true); struct splice_desc sd = { @@ -1446,7 +1512,7 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, .flags = flags, .u.data = iter }; - long ret = 0; + ssize_t ret = 0; if (!pipe) return -EBADF; @@ -1470,11 +1536,11 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. */ -static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, - unsigned int flags) +static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter, + unsigned int flags) { struct pipe_inode_info *pipe; - long ret = 0; + ssize_t ret = 0; unsigned buf_flag = 0; if (flags & SPLICE_F_GIFT) @@ -1570,7 +1636,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, size_t, len, unsigned int, flags) { struct fd in, out; - long error; + ssize_t error; if (unlikely(!len)) return 0; @@ -1584,7 +1650,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, out = fdget(fd_out); if (out.file) { error = __do_splice(in.file, off_in, out.file, off_out, - len, flags); + len, flags); fdput(out); } fdput(in); @@ -1807,15 +1873,15 @@ retry: /* * Link contents of ipipe to opipe. 
*/ -static int link_pipe(struct pipe_inode_info *ipipe, - struct pipe_inode_info *opipe, - size_t len, unsigned int flags) +static ssize_t link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; unsigned int i_mask, o_mask; - int ret = 0; + ssize_t ret = 0; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1898,11 +1964,12 @@ static int link_pipe(struct pipe_inode_info *ipipe, * The 'flags' used are the SPLICE_F_* variants, currently the only * applicable one is SPLICE_F_NONBLOCK. */ -long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) +ssize_t do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) { struct pipe_inode_info *ipipe = get_pipe_info(in, true); struct pipe_inode_info *opipe = get_pipe_info(out, true); - int ret = -EINVAL; + ssize_t ret = -EINVAL; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) @@ -1939,7 +2006,7 @@ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { struct fd in, out; - int error; + ssize_t error; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 8ba8c4c50770..e8df6430444b 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -544,7 +544,8 @@ static void squashfs_readahead(struct readahead_control *ractl) struct squashfs_page_actor *actor; unsigned int nr_pages = 0; struct page **pages; - int i, file_end = i_size_read(inode) >> msblk->block_log; + int i; + loff_t file_end = i_size_read(inode) >> msblk->block_log; unsigned int max_pages = 1UL << shift; readahead_expand(ractl, start, (len | mask) + 1); diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index f1ccad519e28..763a3f7a75f6 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -26,10 +26,10 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, struct inode *inode = target_page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; + loff_t file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; - int start_index = target_page->index & ~mask; - int end_index = start_index | mask; + loff_t start_index = target_page->index & ~mask; + loff_t end_index = start_index | mask; int i, n, pages, bytes, res = -ENOMEM; struct page **page; struct squashfs_page_actor *actor; diff --git a/fs/stat.c b/fs/stat.c index f721d26ec3f7..77cdc69eb422 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -41,7 +41,7 @@ * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before filling in the * uid and gid filds. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. 
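The fs/squashfs/file.c and file_direct.c hunks above widen file_end and the surrounding page-index arithmetic from int to loff_t. A minimal standalone sketch of the truncation this guards against (not part of this diff; the file size and block_log value below are hypothetical):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t i_size = 1LL << 49;	/* hypothetical (sparse) 512 TiB file */
	int block_log = 17;		/* 128 KiB squashfs blocks */

	/* Before: the shifted value no longer fits in an int; the conversion
	 * is implementation-defined and on common ABIs wraps to 0. */
	int truncated_end = (int)(i_size >> block_log);
	/* After: loff_t is 64 bits wide, so the real block count survives. */
	int64_t file_end = i_size >> block_log;

	printf("int file_end:    %d\n", truncated_end);
	printf("loff_t file_end: %lld\n", (long long)file_end);
	return 0;
}

On a typical build with 32-bit int this prints 0 for the int variant and 4294967296 for the 64-bit one, i.e. every readahead/readpage boundary check based on the int value would be wrong for such a file.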
*/ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, struct inode *inode, struct kstat *stat) @@ -247,8 +247,13 @@ retry: error = vfs_getattr(&path, stat, request_mask, flags); - stat->mnt_id = real_mount(path.mnt)->mnt_id; - stat->result_mask |= STATX_MNT_ID; + if (request_mask & STATX_MNT_ID_UNIQUE) { + stat->mnt_id = real_mount(path.mnt)->mnt_id_unique; + stat->result_mask |= STATX_MNT_ID_UNIQUE; + } else { + stat->mnt_id = real_mount(path.mnt)->mnt_id; + stat->result_mask |= STATX_MNT_ID; + } if (path.mnt->mnt_root == path.dentry) stat->attributes |= STATX_ATTR_MOUNT_ROOT; diff --git a/fs/super.c b/fs/super.c index 076392396e72..d35e85295489 100644 --- a/fs/super.c +++ b/fs/super.c @@ -81,16 +81,13 @@ static inline void super_unlock_shared(struct super_block *sb) super_unlock(sb, false); } -static inline bool wait_born(struct super_block *sb) +static bool super_flags(const struct super_block *sb, unsigned int flags) { - unsigned int flags; - /* * Pairs with smp_store_release() in super_wake() and ensures - * that we see SB_BORN or SB_DYING after we're woken. + * that we see @flags after we're woken. */ - flags = smp_load_acquire(&sb->s_flags); - return flags & (SB_BORN | SB_DYING); + return smp_load_acquire(&sb->s_flags) & flags; } /** @@ -105,15 +102,21 @@ static inline bool wait_born(struct super_block *sb) * * The caller must have acquired a temporary reference on @sb->s_count. * - * Return: This returns true if SB_BORN was set, false if SB_DYING was - * set. The function acquires s_umount and returns with it held. + * Return: The function returns true if SB_BORN was set and with + * s_umount held. The function returns false if SB_DYING was + * set and without s_umount held. */ static __must_check bool super_lock(struct super_block *sb, bool excl) { - lockdep_assert_not_held(&sb->s_umount); -relock: + /* wait until the superblock is ready or dying */ + wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING)); + + /* Don't pointlessly acquire s_umount. */ + if (super_flags(sb, SB_DYING)) + return false; + __super_lock(sb, excl); /* @@ -121,32 +124,22 @@ relock: * @sb->s_root is NULL and @sb->s_active is 0. No one needs to * grab a reference to this. Tell them so. */ - if (sb->s_flags & SB_DYING) + if (sb->s_flags & SB_DYING) { + super_unlock(sb, excl); return false; + } - /* Has called ->get_tree() successfully. */ - if (sb->s_flags & SB_BORN) - return true; - - super_unlock(sb, excl); - - /* wait until the superblock is ready or dying */ - wait_var_event(&sb->s_flags, wait_born(sb)); - - /* - * Neither SB_BORN nor SB_DYING are ever unset so we never loop. - * Just reacquire @sb->s_umount for the caller. 
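The fs/stat.c hunk above teaches the statx() path to report the new 64-bit unique mount ID when userspace asks for STATX_MNT_ID_UNIQUE, and to fall back to the legacy STATX_MNT_ID otherwise. A minimal userspace sketch of a caller (hypothetical test program, not part of this diff; the fallback #define is only needed with pre-6.8 UAPI headers and should be checked against <linux/stat.h>):

#define _GNU_SOURCE
#include <fcntl.h>		/* AT_FDCWD */
#include <stdio.h>
#include <sys/stat.h>		/* statx(), struct statx (glibc >= 2.28) */

#ifndef STATX_MNT_ID_UNIQUE
#define STATX_MNT_ID_UNIQUE 0x00004000U	/* assumed v6.8 value, verify locally */
#endif

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	struct statx stx;

	if (statx(AT_FDCWD, path, 0, STATX_MNT_ID_UNIQUE, &stx) != 0) {
		perror("statx");
		return 1;
	}

	if (stx.stx_mask & STATX_MNT_ID_UNIQUE)
		printf("unique mount id: %llu\n",
		       (unsigned long long)stx.stx_mnt_id);
	else if (stx.stx_mask & STATX_MNT_ID)
		printf("legacy (reusable) mount id: %llu\n",
		       (unsigned long long)stx.stx_mnt_id);
	return 0;
}

Older kernels simply ignore the unknown mask bit, so the same binary degrades to the legacy mount ID, which is what the else branch reports.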
- */ - goto relock; + WARN_ON_ONCE(!(sb->s_flags & SB_BORN)); + return true; } -/* wait and acquire read-side of @sb->s_umount */ +/* wait and try to acquire read-side of @sb->s_umount */ static inline bool super_lock_shared(struct super_block *sb) { return super_lock(sb, false); } -/* wait and acquire write-side of @sb->s_umount */ +/* wait and try to acquire write-side of @sb->s_umount */ static inline bool super_lock_excl(struct super_block *sb) { return super_lock(sb, true); @@ -323,7 +316,7 @@ static void destroy_unused_super(struct super_block *s) static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { - struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); + struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL); static const struct super_operations default_op; int i; @@ -521,48 +514,7 @@ void deactivate_super(struct super_block *s) EXPORT_SYMBOL(deactivate_super); /** - * grab_super - acquire an active reference - * @s: reference we are trying to make active - * - * Tries to acquire an active reference. grab_super() is used when we - * had just found a superblock in super_blocks or fs_type->fs_supers - * and want to turn it into a full-blown active reference. grab_super() - * is called with sb_lock held and drops it. Returns 1 in case of - * success, 0 if we had failed (superblock contents was already dead or - * dying when grab_super() had been called). Note that this is only - * called for superblocks not in rundown mode (== ones still on ->fs_supers - * of their type), so increment of ->s_count is OK here. - */ -static int grab_super(struct super_block *s) __releases(sb_lock) -{ - bool born; - - s->s_count++; - spin_unlock(&sb_lock); - born = super_lock_excl(s); - if (born && atomic_inc_not_zero(&s->s_active)) { - put_super(s); - return 1; - } - super_unlock_excl(s); - put_super(s); - return 0; -} - -static inline bool wait_dead(struct super_block *sb) -{ - unsigned int flags; - - /* - * Pairs with memory barrier in super_wake() and ensures - * that we see SB_DEAD after we're woken. - */ - flags = smp_load_acquire(&sb->s_flags); - return flags & SB_DEAD; -} - -/** - * grab_super_dead - acquire an active reference to a superblock + * grab_super - acquire an active reference to a superblock * @sb: superblock to acquire * * Acquire a temporary reference on a superblock and try to trade it for @@ -573,17 +525,21 @@ static inline bool wait_dead(struct super_block *sb) * Return: This returns true if an active reference could be acquired, * false if not. */ -static bool grab_super_dead(struct super_block *sb) +static bool grab_super(struct super_block *sb) { + bool locked; sb->s_count++; - if (grab_super(sb)) { - put_super(sb); - lockdep_assert_held(&sb->s_umount); - return true; + spin_unlock(&sb_lock); + locked = super_lock_excl(sb); + if (locked) { + if (atomic_inc_not_zero(&sb->s_active)) { + put_super(sb); + return true; + } + super_unlock_excl(sb); } - wait_var_event(&sb->s_flags, wait_dead(sb)); - lockdep_assert_not_held(&sb->s_umount); + wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD)); put_super(sb); return false; } @@ -681,12 +637,6 @@ void generic_shutdown_super(struct super_block *sb) fsnotify_sb_delete(sb); security_sb_delete(sb); - /* - * Now that all potentially-encrypted inodes have been evicted, - * the fscrypt keyring can be destroyed. 
- */ - fscrypt_destroy_keyring(sb); - if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); sb->s_dio_done_wq = NULL; @@ -695,6 +645,12 @@ void generic_shutdown_super(struct super_block *sb) if (sop->put_super) sop->put_super(sb); + /* + * Now that all potentially-encrypted inodes have been evicted, + * the fscrypt keyring can be destroyed. + */ + fscrypt_destroy_keyring(sb); + if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { @@ -834,7 +790,7 @@ share_extant_sb: warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } - if (!grab_super_dead(old)) + if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; @@ -878,7 +834,7 @@ retry: destroy_unused_super(s); return ERR_PTR(-EBUSY); } - if (!grab_super_dead(old)) + if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; @@ -930,8 +886,7 @@ static void __iterate_supers(void (*f)(struct super_block *)) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - /* Pairs with memory marrier in super_wake(). */ - if (smp_load_acquire(&sb->s_flags) & SB_DYING) + if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); @@ -961,15 +916,17 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - bool born; + bool locked; sb->s_count++; spin_unlock(&sb_lock); - born = super_lock_shared(sb); - if (born && sb->s_root) - f(sb, arg); - super_unlock_shared(sb); + locked = super_lock_shared(sb); + if (locked) { + if (sb->s_root) + f(sb, arg); + super_unlock_shared(sb); + } spin_lock(&sb_lock); if (p) @@ -997,15 +954,17 @@ void iterate_supers_type(struct file_system_type *type, spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { - bool born; + bool locked; sb->s_count++; spin_unlock(&sb_lock); - born = super_lock_shared(sb); - if (born && sb->s_root) - f(sb, arg); - super_unlock_shared(sb); + locked = super_lock_shared(sb); + if (locked) { + if (sb->s_root) + f(sb, arg); + super_unlock_shared(sb); + } spin_lock(&sb_lock); if (p) @@ -1019,34 +978,6 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); -/** - * get_active_super - get an active reference to the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. Returns the superblock with an active - * reference or %NULL if none was found. - */ -struct super_block *get_active_super(struct block_device *bdev) -{ - struct super_block *sb; - - if (!bdev) - return NULL; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_bdev == bdev) { - if (!grab_super(sb)) - return NULL; - super_unlock_excl(sb); - return sb; - } - } - spin_unlock(&sb_lock); - return NULL; -} - struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; @@ -1054,15 +985,17 @@ struct super_block *user_get_super(dev_t dev, bool excl) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_dev == dev) { - bool born; + bool locked; sb->s_count++; spin_unlock(&sb_lock); /* still alive? 
*/ - born = super_lock(sb, excl); - if (born && sb->s_root) - return sb; - super_unlock(sb, excl); + locked = super_lock(sb, excl); + if (locked) { + if (sb->s_root) + return sb; + super_unlock(sb, excl); + } /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); @@ -1173,9 +1106,9 @@ cancel_readonly: static void do_emergency_remount_callback(struct super_block *sb) { - bool born = super_lock_excl(sb); + bool locked = super_lock_excl(sb); - if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { + if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, @@ -1186,7 +1119,8 @@ static void do_emergency_remount_callback(struct super_block *sb) put_fs_context(fc); } } - super_unlock_excl(sb); + if (locked) + super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) @@ -1209,16 +1143,17 @@ void emergency_remount(void) static void do_thaw_all_callback(struct super_block *sb) { - bool born = super_lock_excl(sb); + bool locked = super_lock_excl(sb); - if (born && sb->s_root) { + if (locked && sb->s_root) { if (IS_ENABLED(CONFIG_BLOCK)) - while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) + while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); - } else { - super_unlock_excl(sb); + return; } + if (locked) + super_unlock_excl(sb); } static void do_thaw_all(struct work_struct *work) @@ -1428,11 +1363,11 @@ EXPORT_SYMBOL(sget_dev); * * The function must be called with bdev->bd_holder_lock and releases it. */ -static struct super_block *bdev_super_lock_shared(struct block_device *bdev) +static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl) __releases(&bdev->bd_holder_lock) { struct super_block *sb = bdev->bd_holder; - bool born; + bool locked; lockdep_assert_held(&bdev->bd_holder_lock); lockdep_assert_not_held(&sb->s_umount); @@ -1442,19 +1377,25 @@ static struct super_block *bdev_super_lock_shared(struct block_device *bdev) spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); + mutex_unlock(&bdev->bd_holder_lock); - born = super_lock_shared(sb); - if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { - super_unlock_shared(sb); - put_super(sb); - return NULL; - } + locked = super_lock(sb, excl); + /* - * The superblock is active and we hold s_umount, we can drop our - * temporary reference now. - */ + * If the superblock wasn't already SB_DYING then we hold + * s_umount and can safely drop our temporary reference. 
+ */ put_super(sb); + + if (!locked) + return NULL; + + if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) { + super_unlock(sb, excl); + return NULL; + } + return sb; } @@ -1462,7 +1403,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb; - sb = bdev_super_lock_shared(bdev); + sb = bdev_super_lock(bdev, false); if (!sb) return; @@ -1480,16 +1421,110 @@ static void fs_bdev_sync(struct block_device *bdev) { struct super_block *sb; - sb = bdev_super_lock_shared(bdev); + sb = bdev_super_lock(bdev, false); if (!sb) return; + sync_filesystem(sb); super_unlock_shared(sb); } +static struct super_block *get_bdev_super(struct block_device *bdev) +{ + bool active = false; + struct super_block *sb; + + sb = bdev_super_lock(bdev, true); + if (sb) { + active = atomic_inc_not_zero(&sb->s_active); + super_unlock_excl(sb); + } + if (!active) + return NULL; + return sb; +} + +/** + * fs_bdev_freeze - freeze owning filesystem of block device + * @bdev: block device + * + * Freeze the filesystem that owns this block device if it is still + * active. + * + * A filesystem that owns multiple block devices may be frozen from each + * block device and won't be unfrozen until all block devices are + * unfrozen. Each block device can only freeze the filesystem once as we + * nest freezes for block devices in the block layer. + * + * Return: If the freeze was successful zero is returned. If the freeze + * failed a negative error code is returned. + */ +static int fs_bdev_freeze(struct block_device *bdev) +{ + struct super_block *sb; + int error = 0; + + lockdep_assert_held(&bdev->bd_fsfreeze_mutex); + + sb = get_bdev_super(bdev); + if (!sb) + return -EINVAL; + + if (sb->s_op->freeze_super) + error = sb->s_op->freeze_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); + else + error = freeze_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); + if (!error) + error = sync_blockdev(bdev); + deactivate_super(sb); + return error; +} + +/** + * fs_bdev_thaw - thaw owning filesystem of block device + * @bdev: block device + * + * Thaw the filesystem that owns this block device. + * + * A filesystem that owns multiple block devices may be frozen from each + * block device and won't be unfrozen until all block devices are + * unfrozen. Each block device can only freeze the filesystem once as we + * nest freezes for block devices in the block layer. + * + * Return: If the thaw was successful zero is returned. If the thaw + * failed a negative error code is returned. If this function + * returns zero it doesn't mean that the filesystem is unfrozen + * as it may have been frozen multiple times (kernel may hold a + * freeze or might be frozen from other block devices). 
+ */ +static int fs_bdev_thaw(struct block_device *bdev) +{ + struct super_block *sb; + int error; + + lockdep_assert_held(&bdev->bd_fsfreeze_mutex); + + sb = get_bdev_super(bdev); + if (WARN_ON_ONCE(!sb)) + return -EINVAL; + + if (sb->s_op->thaw_super) + error = sb->s_op->thaw_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); + else + error = thaw_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); + deactivate_super(sb); + return error; +} + const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, + .freeze = fs_bdev_freeze, + .thaw = fs_bdev_thaw, }; EXPORT_SYMBOL_GPL(fs_holder_ops); @@ -1519,15 +1554,10 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, } /* - * Until SB_BORN flag is set, there can be no active superblock - * references and thus no filesystem freezing. get_active_super() will - * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed. - * - * It is enough to check bdev was not frozen before we set s_bdev. + * It is enough to check bdev was not frozen before we set + * s_bdev as freezing will wait until SB_BORN is set. */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); + if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); bdev_release(bdev_handle); @@ -1540,7 +1570,6 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); - mutex_unlock(&bdev->bd_fsfreeze_mutex); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, @@ -1585,15 +1614,7 @@ int get_tree_bdev(struct fs_context *fc, return -EBUSY; } } else { - /* - * We drop s_umount here because we need to open the bdev and - * bdev->open_mutex ranks above s_umount (blkdev_put() -> - * bdev_mark_dead()). It is safe because we have active sb - * reference and SB_BORN is not set yet. - */ - super_unlock_excl(s); error = setup_bdev_super(s, fc->sb_flags, fc); - __super_lock_excl(s); if (!error) error = fill_super(s, fc); if (error) { @@ -1637,15 +1658,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, return ERR_PTR(-EBUSY); } } else { - /* - * We drop s_umount here because we need to open the bdev and - * bdev->open_mutex ranks above s_umount (blkdev_put() -> - * bdev_mark_dead()). It is safe because we have active sb - * reference and SB_BORN is not set yet. - */ - super_unlock_excl(s); error = setup_bdev_super(s, flags, NULL); - __super_lock_excl(s); if (!error) error = fill_super(s, data, flags & SB_SILENT ? 
1 : 0); if (error) { @@ -1914,6 +1927,47 @@ static int wait_for_partially_frozen(struct super_block *sb) return ret; } +#define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE) +#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST) + +static inline int freeze_inc(struct super_block *sb, enum freeze_holder who) +{ + WARN_ON_ONCE((who & ~FREEZE_FLAGS)); + WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); + + if (who & FREEZE_HOLDER_KERNEL) + ++sb->s_writers.freeze_kcount; + if (who & FREEZE_HOLDER_USERSPACE) + ++sb->s_writers.freeze_ucount; + return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; +} + +static inline int freeze_dec(struct super_block *sb, enum freeze_holder who) +{ + WARN_ON_ONCE((who & ~FREEZE_FLAGS)); + WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); + + if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount) + --sb->s_writers.freeze_kcount; + if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount) + --sb->s_writers.freeze_ucount; + return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; +} + +static inline bool may_freeze(struct super_block *sb, enum freeze_holder who) +{ + WARN_ON_ONCE((who & ~FREEZE_FLAGS)); + WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); + + if (who & FREEZE_HOLDER_KERNEL) + return (who & FREEZE_MAY_NEST) || + sb->s_writers.freeze_kcount == 0; + if (who & FREEZE_HOLDER_USERSPACE) + return (who & FREEZE_MAY_NEST) || + sb->s_writers.freeze_ucount == 0; + return false; +} + /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock @@ -1926,6 +1980,7 @@ static int wait_for_partially_frozen(struct super_block *sb) * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. + * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed. * * The @who argument distinguishes between the kernel and userspace trying to * freeze the filesystem. Although there cannot be multiple kernel freezes or @@ -1933,6 +1988,13 @@ static int wait_for_partially_frozen(struct super_block *sb) * userspace can both hold a filesystem frozen. The filesystem remains frozen * until there are no kernel or userspace freezes in effect. * + * A filesystem may hold multiple devices and thus a filesystems may be + * frozen through the block layer via multiple block devices. In this + * case the request is marked as being allowed to nest by passing + * FREEZE_MAY_NEST. The filesystem remains frozen until all block + * devices are unfrozen. If multiple freezes are attempted without + * FREEZE_MAY_NEST -EBUSY will be returned. + * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. @@ -1957,31 +2019,29 @@ static int wait_for_partially_frozen(struct super_block *sb) * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. + * + * Return: If the freeze was successful zero is returned. If the freeze + * failed a negative error code is returned. 
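The FREEZE_HOLDER_USERSPACE side of freeze_super() and thaw_super() described above is what the FIFREEZE and FITHAW ioctls exercise; userspace holders are not allowed to nest, so a second FIFREEZE on an already frozen filesystem fails with EBUSY. A minimal userspace sketch, with the mount point path as a placeholder:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FIFREEZE, FITHAW */

int main(int argc, char **argv)
{
	const char *mnt = argc > 1 ? argv[1] : "/mnt/scratch";	/* placeholder */
	int fd = open(mnt, O_RDONLY | O_DIRECTORY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FIFREEZE, 0) < 0)		/* userspace-holder freeze */
		perror("FIFREEZE");
	/* ... take a snapshot, copy the block device, etc. ... */
	if (ioctl(fd, FITHAW, 0) < 0)		/* userspace-holder thaw */
		perror("FITHAW");
	close(fd);
	return 0;
}

Running it requires CAP_SYS_ADMIN, and while the filesystem is frozen every new writer blocks in sb_start_write() until the thaw.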
*/ int freeze_super(struct super_block *sb, enum freeze_holder who) { int ret; + if (!super_lock_excl(sb)) { + WARN_ON_ONCE("Dying superblock while freezing!"); + return -EINVAL; + } atomic_inc(&sb->s_active); - if (!super_lock_excl(sb)) - WARN(1, "Dying superblock while freezing!"); retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { - if (sb->s_writers.freeze_holders & who) { - deactivate_locked_super(sb); - return -EBUSY; - } - - WARN_ON(sb->s_writers.freeze_holders == 0); - - /* - * Someone else already holds this type of freeze; share the - * freeze and assign the active ref to the freeze. - */ - sb->s_writers.freeze_holders |= who; - super_unlock_excl(sb); - return 0; + if (may_freeze(sb, who)) + ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1); + else + ret = -EBUSY; + /* All freezers share a single active reference. */ + deactivate_locked_super(sb); + return ret; } if (sb->s_writers.frozen != SB_UNFROZEN) { @@ -1994,14 +2054,9 @@ retry: goto retry; } - if (!(sb->s_flags & SB_BORN)) { - super_unlock_excl(sb); - return 0; /* sic - it's "nothing to do" */ - } - if (sb_rdonly(sb)) { /* Nothing to do really... */ - sb->s_writers.freeze_holders |= who; + WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); @@ -2012,8 +2067,7 @@ retry: /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); - if (!super_lock_excl(sb)) - WARN(1, "Dying superblock while freezing!"); + __super_lock_excl(sb); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; @@ -2049,7 +2103,7 @@ retry: * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ - sb->s_writers.freeze_holders |= who; + WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); @@ -2066,34 +2120,22 @@ EXPORT_SYMBOL(freeze_super); */ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) { - int error; + int error = -EINVAL; - if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { - if (!(sb->s_writers.freeze_holders & who)) { - super_unlock_excl(sb); - return -EINVAL; - } + if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) + goto out_unlock; - /* - * Freeze is shared with someone else. Release our hold and - * drop the active ref that freeze_super assigned to the - * freezer. - */ - if (sb->s_writers.freeze_holders & ~who) { - sb->s_writers.freeze_holders &= ~who; - deactivate_locked_super(sb); - return 0; - } - } else { - super_unlock_excl(sb); - return -EINVAL; - } + /* + * All freezers share a single active reference. + * So just unlock in case there are any left. 
+ */ + if (freeze_dec(sb, who)) + goto out_unlock; if (sb_rdonly(sb)) { - sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); - goto out; + goto out_deactivate; } lockdep_sb_freeze_acquire(sb); @@ -2101,20 +2143,23 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { - printk(KERN_ERR "VFS:Filesystem thaw failed\n"); + pr_err("VFS: Filesystem thaw failed\n"); + freeze_inc(sb, who); lockdep_sb_freeze_release(sb); - super_unlock_excl(sb); - return error; + goto out_unlock; } } - sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); -out: +out_deactivate: deactivate_locked_super(sb); return 0; + +out_unlock: + super_unlock_excl(sb); + return error; } /** @@ -2128,11 +2173,18 @@ out: * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. + * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed + * + * A filesystem may hold multiple devices and thus a filesystems may + * have been frozen through the block layer via multiple block devices. + * The filesystem remains frozen until all block devices are unfrozen. */ int thaw_super(struct super_block *sb, enum freeze_holder who) { - if (!super_lock_excl(sb)) - WARN(1, "Dying superblock while thawing!"); + if (!super_lock_excl(sb)) { + WARN_ON_ONCE("Dying superblock while thawing!"); + return -EINVAL; + } return thaw_super_locked(sb, who); } EXPORT_SYMBOL(thaw_super); diff --git a/fs/sysctls.c b/fs/sysctls.c index 76a0aee8c229..8dbde9a802fa 100644 --- a/fs/sysctls.c +++ b/fs/sysctls.c @@ -26,7 +26,6 @@ static struct ctl_table fs_shared_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_MAXOLDUID, }, - { } }; static int __init init_fs_sysctls(void) diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 725981474e5f..410ab2a44d2f 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -8,6 +8,7 @@ #include <linux/buffer_head.h> #include <linux/mount.h> +#include <linux/mpage.h> #include <linux/string.h> #include "sysv.h" @@ -456,9 +457,10 @@ int sysv_getattr(struct mnt_idmap *idmap, const struct path *path, return 0; } -static int sysv_writepage(struct page *page, struct writeback_control *wbc) +static int sysv_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page,get_block,wbc); + return mpage_writepages(mapping, wbc, get_block); } static int sysv_read_folio(struct file *file, struct folio *folio) @@ -503,8 +505,9 @@ const struct address_space_operations sysv_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = sysv_read_folio, - .writepage = sysv_writepage, + .writepages = sysv_writepages, .write_begin = sysv_write_begin, .write_end = generic_write_end, + .migrate_folio = buffer_migrate_folio, .bmap = sysv_bmap }; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index bc86ffdb103b..ad20e6af938d 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -199,21 +199,17 @@ static void change_gid(struct dentry *dentry, kgid_t gid) */ static void set_gid(struct dentry *parent, kgid_t gid) { - struct dentry *this_parent; - struct list_head *next; + struct dentry *this_parent, *dentry; this_parent = parent; spin_lock(&this_parent->d_lock); change_gid(this_parent, gid); repeat: - next = 
this_parent->d_subdirs.next; + dentry = d_first_child(this_parent); resume: - while (next != &this_parent->d_subdirs) { + hlist_for_each_entry_from(dentry, d_sib) { struct tracefs_inode *ti; - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); - next = tmp->next; /* Note, getdents() can add a cursor dentry with no inode */ if (!dentry->d_inode) @@ -228,7 +224,7 @@ resume: if (ti && (ti->flags & TRACEFS_EVENT_INODE)) eventfs_update_gid(dentry, gid); - if (!list_empty(&dentry->d_subdirs)) { + if (!hlist_empty(&dentry->d_children)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; @@ -243,21 +239,20 @@ resume: rcu_read_lock(); ascend: if (this_parent != parent) { - struct dentry *child = this_parent; - this_parent = child->d_parent; + dentry = this_parent; + this_parent = dentry->d_parent; - spin_unlock(&child->d_lock); + spin_unlock(&dentry->d_lock); spin_lock(&this_parent->d_lock); /* go into the first sibling still alive */ - do { - next = child->d_child.next; - if (next == &this_parent->d_subdirs) - goto ascend; - child = list_entry(next, struct dentry, d_child); - } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); - rcu_read_unlock(); - goto resume; + hlist_for_each_entry_continue(dentry, d_sib) { + if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { + rcu_read_unlock(); + goto resume; + } + } + goto ascend; } rcu_read_unlock(); spin_unlock(&this_parent->d_lock); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 3508ac484da3..1bb6ed948927 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -125,8 +125,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, udf_fiiter_release(&iter); inode = udf_iget(dir->i_sb, &loc); - if (IS_ERR(inode)) - return ERR_CAST(inode); } return d_splice_alias(inode, dentry); @@ -230,8 +228,6 @@ static int udf_fiiter_add_entry(struct inode *dir, struct dentry *dentry, char name[UDF_NAME_LEN_CS0]; if (dentry) { - if (!dentry->d_name.len) - return -EINVAL; namelen = udf_put_filename(dir->i_sb, dentry->d_name.name, dentry->d_name.len, name, UDF_NAME_LEN_CS0); @@ -766,7 +762,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); struct udf_fileident_iter oiter, niter, diriter; - bool has_diriter = false; + bool has_diriter = false, is_dir = false; int retval; struct kernel_lb_addr tloc; @@ -789,6 +785,9 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!empty_dir(new_inode)) goto out_oiter; } + is_dir = true; + } + if (is_dir && old_dir != new_dir) { retval = udf_fiiter_find_entry(old_inode, &dotdot_name, &diriter); if (retval == -ENOENT) { @@ -878,7 +877,9 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, udf_dir_entry_len(&diriter.fi)); udf_fiiter_write_fi(&diriter, NULL); udf_fiiter_release(&diriter); + } + if (is_dir) { inode_dec_link_count(old_dir); if (new_inode) inode_dec_link_count(new_inode); @@ -899,7 +900,6 @@ out_oiter: static struct dentry *udf_get_parent(struct dentry *child) { struct kernel_lb_addr tloc; - struct inode *inode = NULL; struct udf_fileident_iter iter; int err; @@ -909,11 +909,7 @@ static struct dentry *udf_get_parent(struct dentry *child) tloc = lelb_to_cpu(iter.fi.icb.extLocation); udf_fiiter_release(&iter); - inode = udf_iget(child->d_sb, &tloc); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - return d_obtain_alias(inode); + return 
d_obtain_alias(udf_iget(child->d_sb, &tloc)); } diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index ebce93b08281..a7bb2e63cdde 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -35,6 +35,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <linux/buffer_head.h> +#include <linux/mpage.h> #include <linux/writeback.h> #include <linux/iversion.h> @@ -390,7 +391,7 @@ out: /** * ufs_getfrag_block() - `get_block_t' function, interface between UFS and - * read_folio, writepage and so on + * read_folio, writepages and so on */ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) @@ -467,9 +468,10 @@ done: return 0; } -static int ufs_writepage(struct page *page, struct writeback_control *wbc) +static int ufs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page,ufs_getfrag_block,wbc); + return mpage_writepages(mapping, wbc, ufs_getfrag_block); } static int ufs_read_folio(struct file *file, struct folio *folio) @@ -528,9 +530,10 @@ const struct address_space_operations ufs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = ufs_read_folio, - .writepage = ufs_writepage, + .writepages = ufs_writepages, .write_begin = ufs_write_begin, .write_end = ufs_write_end, + .migrate_folio = buffer_migrate_folio, .bmap = ufs_bmap }; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e8af40b05549..4fcefe5ef7cb 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -45,7 +45,6 @@ static struct ctl_table vm_userfaultfd_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; #endif @@ -2005,6 +2004,75 @@ static inline unsigned int uffd_ctx_features(__u64 user_features) return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; } +static int userfaultfd_move(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_move uffdio_move; + struct uffdio_move __user *user_uffdio_move; + struct userfaultfd_wake_range range; + struct mm_struct *mm = ctx->mm; + + user_uffdio_move = (struct uffdio_move __user *) arg; + + if (atomic_read(&ctx->mmap_changing)) + return -EAGAIN; + + if (copy_from_user(&uffdio_move, user_uffdio_move, + /* don't copy "move" last field */ + sizeof(uffdio_move)-sizeof(__s64))) + return -EFAULT; + + /* Do not allow cross-mm moves. */ + if (mm != current->mm) + return -EINVAL; + + ret = validate_range(mm, uffdio_move.dst, uffdio_move.len); + if (ret) + return ret; + + ret = validate_range(mm, uffdio_move.src, uffdio_move.len); + if (ret) + return ret; + + if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES| + UFFDIO_MOVE_MODE_DONTWAKE)) + return -EINVAL; + + if (mmget_not_zero(mm)) { + mmap_read_lock(mm); + + /* Re-check after taking mmap_lock */ + if (likely(!atomic_read(&ctx->mmap_changing))) + ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, + uffdio_move.len, uffdio_move.mode); + else + ret = -EINVAL; + + mmap_read_unlock(mm); + mmput(mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_move->move))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + VM_WARN_ON(!ret); + range.len = ret; + if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) { + range.start = uffdio_move.dst; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_move.len ? 
0 : -EAGAIN; + +out: + return ret; +} + /* * userland asks for a certain API version and we return which bits * and ioctl commands are implemented in this kernel for such API @@ -2097,6 +2165,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_ZEROPAGE: ret = userfaultfd_zeropage(ctx, arg); break; + case UFFDIO_MOVE: + ret = userfaultfd_move(ctx, arg); + break; case UFFDIO_WRITEPROTECT: ret = userfaultfd_writeprotect(ctx, arg); break; diff --git a/fs/vboxsf/vboxsf_wrappers.c b/fs/vboxsf/vboxsf_wrappers.c index bfc78a097dae..5e8d4359e171 100644 --- a/fs/vboxsf/vboxsf_wrappers.c +++ b/fs/vboxsf/vboxsf_wrappers.c @@ -114,7 +114,7 @@ int vboxsf_unmap_folder(u32 root) * vboxsf_create - Create a new file or folder * @root: Root of the shared folder in which to create the file * @parsed_path: The path of the file or folder relative to the shared folder - * @param: create_parms Parameters for file/folder creation. + * @create_parms: Parameters for file/folder creation. * * Create a new file or folder or open an existing one in a shared folder. * Note this function always returns 0 / success unless an exceptional condition diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index d071a6e32581..a6a6b2749241 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -100,6 +100,16 @@ fsverity_msg(const struct inode *inode, const char *level, #define fsverity_err(inode, fmt, ...) \ fsverity_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) +/* measure.c */ + +#ifdef CONFIG_BPF_SYSCALL +void __init fsverity_init_bpf(void); +#else +static inline void fsverity_init_bpf(void) +{ +} +#endif + /* open.c */ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, diff --git a/fs/verity/init.c b/fs/verity/init.c index a29f062f6047..cb2c9aac61ed 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -24,7 +24,6 @@ static struct ctl_table fsverity_sysctl_table[] = { .extra2 = SYSCTL_ONE, }, #endif - { } }; static void __init fsverity_init_sysctl(void) @@ -69,6 +68,7 @@ static int __init fsverity_init(void) fsverity_init_workqueue(); fsverity_init_sysctl(); fsverity_init_signature(); + fsverity_init_bpf(); return 0; } late_initcall(fsverity_init) diff --git a/fs/verity/measure.c b/fs/verity/measure.c index eec5956141da..bf7a5f4cccaf 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -7,6 +7,8 @@ #include "fsverity_private.h" +#include <linux/bpf.h> +#include <linux/btf.h> #include <linux/uaccess.h> /** @@ -100,3 +102,85 @@ int fsverity_get_digest(struct inode *inode, return hash_alg->digest_size; } EXPORT_SYMBOL_GPL(fsverity_get_digest); + +#ifdef CONFIG_BPF_SYSCALL + +/* bpf kfuncs */ +__bpf_kfunc_start_defs(); + +/** + * bpf_get_fsverity_digest: read fsverity digest of file + * @file: file to get digest from + * @digest_ptr: (out) dynptr for struct fsverity_digest + * + * Read fsverity_digest of *file* into *digest_ptr*. + * + * Return: 0 on success, a negative value on error. 
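A sketch of how a BPF LSM program might call the kfunc documented above, in the style of the BPF selftests. The __ksym prototype, the SHA-256 digest size, and the buffer layout are assumptions made for illustration, and the usual libbpf build setup (vmlinux.h, clang targeting BPF) is taken as given:

/* SPDX-License-Identifier: GPL-2.0 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* The kfunc described above, resolved against the running kernel's BTF. */
extern int bpf_get_fsverity_digest(struct file *file,
				   struct bpf_dynptr *digest_ptr) __ksym;

#define SHA256_DIGEST_SIZE 32

/* struct fsverity_digest header (4 bytes) plus room for a SHA-256 digest. */
char raw_digest[4 + SHA256_DIGEST_SIZE] __attribute__((aligned(8)));
__u32 got_digest;

SEC("lsm.s/file_open")
int BPF_PROG(measure_on_open, struct file *file)
{
	struct bpf_dynptr digest_ptr;

	/* Wrap the global buffer in a dynptr and ask for the file digest. */
	bpf_dynptr_from_mem(raw_digest, sizeof(raw_digest), 0, &digest_ptr);
	if (bpf_get_fsverity_digest(file, &digest_ptr) < 0)
		return 0;	/* not a verity file, or the buffer is too small */

	got_digest = 1;		/* a real policy would compare raw_digest here */
	return 0;
}

The program type matters: as the filter further down enforces, the kfunc may only be called from LSM programs.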
+ */ +__bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr_kern *digest_ptr) +{ + const struct inode *inode = file_inode(file); + u32 dynptr_sz = __bpf_dynptr_size(digest_ptr); + struct fsverity_digest *arg; + const struct fsverity_info *vi; + const struct fsverity_hash_alg *hash_alg; + int out_digest_sz; + + if (dynptr_sz < sizeof(struct fsverity_digest)) + return -EINVAL; + + arg = __bpf_dynptr_data_rw(digest_ptr, dynptr_sz); + if (!arg) + return -EINVAL; + + if (!IS_ALIGNED((uintptr_t)arg, __alignof__(*arg))) + return -EINVAL; + + vi = fsverity_get_info(inode); + if (!vi) + return -ENODATA; /* not a verity file */ + + hash_alg = vi->tree_params.hash_alg; + + arg->digest_algorithm = hash_alg - fsverity_hash_algs; + arg->digest_size = hash_alg->digest_size; + + out_digest_sz = dynptr_sz - sizeof(struct fsverity_digest); + + /* copy digest */ + memcpy(arg->digest, vi->file_digest, min_t(int, hash_alg->digest_size, out_digest_sz)); + + /* fill the extra buffer with zeros */ + if (out_digest_sz > hash_alg->digest_size) + memset(arg->digest + arg->digest_size, 0, out_digest_sz - hash_alg->digest_size); + + return 0; +} + +__bpf_kfunc_end_defs(); + +BTF_SET8_START(fsverity_set_ids) +BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS) +BTF_SET8_END(fsverity_set_ids) + +static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (!btf_id_set8_contains(&fsverity_set_ids, kfunc_id)) + return 0; + + /* Only allow to attach from LSM hooks, to avoid recursion */ + return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0; +} + +static const struct btf_kfunc_id_set bpf_fsverity_set = { + .owner = THIS_MODULE, + .set = &fsverity_set_ids, + .filter = bpf_get_fsverity_digest_filter, +}; + +void __init fsverity_init_bpf(void) +{ + register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fsverity_set); +} + +#endif /* CONFIG_BPF_SYSCALL */ diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7762c01a85cf..fbe3cdc79036 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -145,6 +145,7 @@ ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) xfs-y += $(addprefix scrub/, \ trace.o \ + agb_bitmap.o \ agheader.o \ alloc.o \ attr.o \ @@ -175,14 +176,32 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtsummary.o \ ) -xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ + dqiterate.o \ + quota.o \ + ) # online repair ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + alloc_repair.o \ + bmap_repair.o \ + cow_repair.o \ + ialloc_repair.o \ + inode_repair.o \ + newbt.o \ reap.o \ + refcount_repair.o \ repair.o \ ) + +xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ + rtbitmap_repair.o \ + ) + +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ + quota_repair.o \ + ) endif endif diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index f9f4d694640d..39d9525270b7 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -332,6 +332,31 @@ xfs_agino_range( return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); } +/* + * Free perag within the specified AG range, it is only used to free unused + * perags under the error handling path. 
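For comparison, userspace reads the digest that the kfunc above returns through the FS_IOC_MEASURE_VERITY ioctl. A minimal sketch; the path is a placeholder, the file must already have fsverity enabled, and the 64-byte cap is an assumption sized for SHA-512:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fsverity.h>

#define MAX_DIGEST_SIZE 64	/* assumption: large enough for SHA-512 */

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "verity-file";	/* placeholder */
	struct fsverity_digest *d;
	int fd, i;

	d = malloc(sizeof(*d) + MAX_DIGEST_SIZE);
	if (!d)
		return 1;
	d->digest_size = MAX_DIGEST_SIZE;	/* capacity in, actual size out */

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FS_IOC_MEASURE_VERITY, d) != 0) {
		perror("FS_IOC_MEASURE_VERITY");
		return 1;
	}
	printf("algorithm %u, %u-byte digest: ", d->digest_algorithm, d->digest_size);
	for (i = 0; i < d->digest_size; i++)
		printf("%02x", d->digest[i]);
	printf("\n");
	close(fd);
	return 0;
}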
+ */ +void +xfs_free_unused_perag_range( + struct xfs_mount *mp, + xfs_agnumber_t agstart, + xfs_agnumber_t agend) +{ + struct xfs_perag *pag; + xfs_agnumber_t index; + + for (index = agstart; index < agend; index++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, index); + spin_unlock(&mp->m_perag_lock); + if (!pag) + break; + xfs_buf_hash_destroy(pag); + xfs_defer_drain_free(&pag->pag_intents_drain); + kmem_free(pag); + } +} + int xfs_initialize_perag( struct xfs_mount *mp, @@ -424,19 +449,14 @@ xfs_initialize_perag( out_remove_pag: xfs_defer_drain_free(&pag->pag_intents_drain); + spin_lock(&mp->m_perag_lock); radix_tree_delete(&mp->m_perag_tree, index); + spin_unlock(&mp->m_perag_lock); out_free_pag: kmem_free(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ - for (index = first_initialised; index < agcount; index++) { - pag = radix_tree_delete(&mp->m_perag_tree, index); - if (!pag) - break; - xfs_buf_hash_destroy(pag); - xfs_defer_drain_free(&pag->pag_intents_drain); - kmem_free(pag); - } + xfs_free_unused_perag_range(mp, first_initialised, agcount); return error; } @@ -984,7 +1004,7 @@ xfs_ag_shrink_space( if (err2 != -ENOSPC) goto resv_err; - err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, + err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, XFS_AG_RESV_NONE, true); if (err2) goto resv_err; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 2e0aef87d633..4b343c4fac28 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -80,6 +80,16 @@ struct xfs_perag { */ uint16_t pag_checked; uint16_t pag_sick; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Alternate btree heights so that online repair won't trip the write + * verifiers while rebuilding the AG btrees. + */ + uint8_t pagf_repair_levels[XFS_BTNUM_AGF]; + uint8_t pagf_repair_refcount_level; +#endif + spinlock_t pag_state_lock; spinlock_t pagb_lock; /* lock for pagb_tree */ @@ -133,6 +143,8 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) __XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) +void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart, + xfs_agnumber_t agend); int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 7fd1fea95552..da1057bd0e60 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -411,6 +411,8 @@ xfs_ag_resv_free_extent( fallthrough; case XFS_AG_RESV_NONE: xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); + fallthrough; + case XFS_AG_RESV_IGNORE: return; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 100ab5931b31..3bd0a33fee0a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -246,11 +246,9 @@ xfs_alloc_btrec_to_irec( /* Simple checks for free space records. 
*/ xfs_failaddr_t xfs_alloc_check_irec( - struct xfs_btree_cur *cur, - const struct xfs_alloc_rec_incore *irec) + struct xfs_perag *pag, + const struct xfs_alloc_rec_incore *irec) { - struct xfs_perag *pag = cur->bc_ag.pag; - if (irec->ar_blockcount == 0) return __this_address; @@ -299,7 +297,7 @@ xfs_alloc_get_rec( return error; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur, &irec); + fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); @@ -2514,7 +2512,7 @@ xfs_defer_agfl_block( trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); + xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type); return 0; } @@ -2522,14 +2520,15 @@ xfs_defer_agfl_block( * Add the extent to the list of extents to be free at transaction end. * The list is maintained sorted (by block number). */ -int -__xfs_free_extent_later( +static int +xfs_defer_extent_free( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, - bool skip_discard) + bool skip_discard, + struct xfs_defer_pending **dfpp) { struct xfs_extent_free_item *xefi; struct xfs_mount *mp = tp->t_mountp; @@ -2577,10 +2576,105 @@ __xfs_free_extent_later( XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type); return 0; } +int +xfs_free_extent_later( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_filblks_t len, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type, + bool skip_discard) +{ + struct xfs_defer_pending *dontcare = NULL; + + return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard, + &dontcare); +} + +/* + * Set up automatic freeing of unwritten space in the filesystem. + * + * This function attached a paused deferred extent free item to the + * transaction. Pausing means that the EFI will be logged in the next + * transaction commit, but the pending EFI will not be finished until the + * pending item is unpaused. + * + * If the system goes down after the EFI has been persisted to the log but + * before the pending item is unpaused, log recovery will find the EFI, fail to + * find the EFD, and free the space. + * + * If the pending item is unpaused, the next transaction commit will log an EFD + * without freeing the space. + * + * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the + * @args structure are set to the relevant values. + */ +int +xfs_alloc_schedule_autoreap( + const struct xfs_alloc_arg *args, + bool skip_discard, + struct xfs_alloc_autoreap *aarp) +{ + int error; + + error = xfs_defer_extent_free(args->tp, args->fsbno, args->len, + &args->oinfo, args->resv, skip_discard, &aarp->dfp); + if (error) + return error; + + xfs_defer_item_pause(args->tp, aarp->dfp); + return 0; +} + +/* + * Cancel automatic freeing of unwritten space in the filesystem. + * + * Earlier, we created a paused deferred extent free item and attached it to + * this transaction so that we could automatically roll back a new space + * allocation if the system went down. Now we want to cancel the paused work + * item by marking the EFI stale so we don't actually free the space, unpausing + * the pending item and logging an EFD. 
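The schedule/cancel/commit trio introduced here pairs up in a way that is easy to get backwards: as the comments spell out, cancel is the success path (the space ended up referenced on disk, so the EFI must not free it), while commit, which follows below, is for the case where none of the space was used and the EFI should reclaim it. A hedged sketch of the intended calling pattern; the caller and the mapping helper are invented names, only the autoreap calls and their ordering come from the code here:

/*
 * Illustrative only: xfs_hypothetical_alloc_and_map() and
 * xfs_hypothetical_map() are invented, not kernel functions.
 */
static int xfs_hypothetical_map(struct xfs_alloc_arg *args);

static int
xfs_hypothetical_alloc_and_map(
	struct xfs_alloc_arg		*args)
{
	struct xfs_alloc_autoreap	aarp = { };
	int				error;

	/* ... allocate space so that args->fsbno and args->len are set ... */

	/* Log a paused EFI so that a crash from here on frees the space. */
	error = xfs_alloc_schedule_autoreap(args, false, &aarp);
	if (error)
		return error;

	/* Roll the transaction and map the new space into the inode. */
	error = xfs_hypothetical_map(args);
	if (error) {
		/* Nothing references the space; let the EFI free it. */
		xfs_alloc_commit_autoreap(args->tp, &aarp);
		return error;
	}

	/* The space is now in use on disk; the EFI must not free it. */
	xfs_alloc_cancel_autoreap(args->tp, &aarp);
	return 0;
}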
+ * + * The caller generally should have already mapped the space into the ondisk + * filesystem. If the reserved space was partially used, the caller must call + * xfs_free_extent_later to create a new EFI to free the unused space. + */ +void +xfs_alloc_cancel_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + struct xfs_defer_pending *dfp = aarp->dfp; + struct xfs_extent_free_item *xefi; + + if (!dfp) + return; + + list_for_each_entry(xefi, &dfp->dfp_work, xefi_list) + xefi->xefi_flags |= XFS_EFI_CANCELLED; + + xfs_defer_item_unpause(tp, dfp); +} + +/* + * Commit automatic freeing of unwritten space in the filesystem. + * + * This unpauses an earlier _schedule_autoreap and commits to freeing the + * allocated space. Call this if none of the reserved space was used. + */ +void +xfs_alloc_commit_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + if (aarp->dfp) + xfs_defer_item_unpause(tp, aarp->dfp); +} + #ifdef DEBUG /* * Check if an AGF has a free extent record whose length is equal to @@ -3848,7 +3942,7 @@ xfs_alloc_query_range_helper( xfs_failaddr_t fa; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur, &irec); + fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6bb8d295c321..0b956f8b9d5a 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -185,7 +185,7 @@ xfs_alloc_get_rec( union xfs_btree_rec; void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_alloc_rec_incore *irec); -xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_alloc_check_irec(struct xfs_perag *pag, const struct xfs_alloc_rec_incore *irec); int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, @@ -231,7 +231,7 @@ xfs_buf_to_agfl_bno( return bp->b_addr; } -int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, +int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, bool skip_discard); @@ -255,18 +255,18 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ +#define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */ -static inline int -xfs_free_extent_later( - struct xfs_trans *tp, - xfs_fsblock_t bno, - xfs_filblks_t len, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) -{ - return __xfs_free_extent_later(tp, bno, len, oinfo, type, false); -} +struct xfs_alloc_autoreap { + struct xfs_defer_pending *dfp; +}; +int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args, + bool skip_discard, struct xfs_alloc_autoreap *aarp); +void xfs_alloc_cancel_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); +void xfs_alloc_commit_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); extern struct kmem_cache *xfs_extfree_item_cache; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index c65228efed4a..a7032bf0cd37 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -323,7 +323,18 @@ xfs_allocbt_verify( if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) btnum = XFS_BTNUM_CNTi; if (pag && xfs_perag_initialised_agf(pag)) { 
- if (level >= pag->pagf_levels[btnum]) + unsigned int maxlevel = pag->pagf_levels[btnum]; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the free space btrees, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_levels[btnum]); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_alloc_maxlevels) return __this_address; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e28d93d232de..9976a00a73f9 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -862,8 +862,11 @@ xfs_attr_lookup( if (!xfs_inode_hasattr(dp)) return -ENOATTR; - if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_sf_findname(args, NULL, NULL); + if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) { + if (xfs_attr_sf_findname(args)) + return -EEXIST; + return -ENOATTR; + } if (xfs_attr_is_leaf(dp)) { error = xfs_attr_leaf_hasname(args, &bp); @@ -880,11 +883,10 @@ xfs_attr_lookup( return error; } -static int -xfs_attr_intent_init( +static void +xfs_attr_defer_add( struct xfs_da_args *args, - unsigned int op_flags, /* op flag (set or remove) */ - struct xfs_attr_intent **attr) /* new xfs_attr_intent */ + unsigned int op_flags) { struct xfs_attr_intent *new; @@ -893,66 +895,22 @@ xfs_attr_intent_init( new->xattri_op_flags = op_flags; new->xattri_da_args = args; - *attr = new; - return 0; -} - -/* Sets an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_add( - struct xfs_da_args *args) -{ - struct xfs_attr_intent *new; - int error = 0; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); - if (error) - return error; + switch (op_flags) { + case XFS_ATTRI_OP_FLAGS_SET: + new->xattri_dela_state = xfs_attr_init_add_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REPLACE: + new->xattri_dela_state = xfs_attr_init_replace_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REMOVE: + new->xattri_dela_state = xfs_attr_init_remove_state(args); + break; + default: + ASSERT(0); + } - new->xattri_dela_state = xfs_attr_init_add_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type); trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); - - return 0; -} - -/* Sets an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_replace( - struct xfs_da_args *args) -{ - struct xfs_attr_intent *new; - int error = 0; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_replace_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); - - return 0; -} - -/* Removes an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_remove( - struct xfs_da_args *args) -{ - - struct xfs_attr_intent *new; - int error; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_remove_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); - - return 0; } /* @@ -1038,16 +996,16 @@ xfs_attr_set( error = xfs_attr_lookup(args); switch (error) { case -EEXIST: - /* if no value, we are performing a remove operation 
*/ if (!args->value) { - error = xfs_attr_defer_remove(args); + /* if no value, we are performing a remove operation */ + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE); break; } + /* Pure create fails if the attr already exists */ if (args->attr_flags & XATTR_CREATE) goto out_trans_cancel; - - error = xfs_attr_defer_replace(args); + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE); break; case -ENOATTR: /* Can't remove what isn't there. */ @@ -1057,14 +1015,11 @@ xfs_attr_set( /* Pure replace fails if no existing attr to replace. */ if (args->attr_flags & XATTR_REPLACE) goto out_trans_cancel; - - error = xfs_attr_defer_add(args); + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET); break; default: goto out_trans_cancel; } - if (error) - goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the @@ -1097,10 +1052,9 @@ out_trans_cancel: static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) { - struct xfs_attr_shortform *sf; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; - return be16_to_cpu(sf->hdr.totsize); + return be16_to_cpu(sf->totsize); } /* @@ -1112,19 +1066,13 @@ xfs_attr_shortform_addname( struct xfs_da_args *args) { int newsize, forkoff; - int error; trace_xfs_attr_sf_addname(args); - error = xfs_attr_shortform_lookup(args); - switch (error) { - case -ENOATTR: - if (args->op_flags & XFS_DA_OP_REPLACE) - return error; - break; - case -EEXIST: - if (!(args->op_flags & XFS_DA_OP_REPLACE)) - return error; + if (xfs_attr_sf_findname(args)) { + int error; + + ASSERT(args->op_flags & XFS_DA_OP_REPLACE); error = xfs_attr_sf_removename(args); if (error) @@ -1137,11 +1085,8 @@ xfs_attr_shortform_addname( * around. */ args->op_flags &= ~XFS_DA_OP_REPLACE; - break; - case 0: - break; - default: - return error; + } else { + ASSERT(!(args->op_flags & XFS_DA_OP_REPLACE)); } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2580ae47209a..6374bf107242 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -690,56 +690,32 @@ xfs_attr_shortform_create( ASSERT(ifp->if_bytes == 0); if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) ifp->if_format = XFS_DINODE_FMT_LOCAL; - xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); - hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data; + + hdr = xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); memset(hdr, 0, sizeof(*hdr)); hdr->totsize = cpu_to_be16(sizeof(*hdr)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } /* - * Return -EEXIST if attr is found, or -ENOATTR if not - * args: args containing attribute name and namelen - * sfep: If not null, pointer will be set to the last attr entry found on - -EEXIST. On -ENOATTR pointer is left at the last entry in the list - * basep: If not null, pointer is set to the byte offset of the entry in the - * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of - * the last entry in the list + * Return the entry if the attr in args is found, or NULL if not. 
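The lookup result above feeds the -EEXIST/-ENOATTR switch in xfs_attr_set(): a NULL value means remove, XATTR_CREATE must not find an existing attr, and XATTR_REPLACE must. A quick userspace exercise of those three cases (the file path is a placeholder):

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "testfile";	/* placeholder */
	const char *name = "user.demo";
	char buf[64];
	ssize_t len;

	/* Pure create: fails with EEXIST if the attr already exists. */
	if (setxattr(path, name, "one", 3, XATTR_CREATE) < 0)
		perror("XATTR_CREATE");

	/* Pure replace: fails with ENODATA if the attr is missing. */
	if (setxattr(path, name, "two", 3, XATTR_REPLACE) < 0)
		perror("XATTR_REPLACE");

	len = getxattr(path, name, buf, sizeof(buf));
	if (len >= 0)
		printf("%s = %.*s\n", name, (int)len, buf);

	/* Remove: the "no value" branch in the switch above. */
	if (removexattr(path, name) < 0)
		perror("removexattr");
	return 0;
}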
*/ -int +struct xfs_attr_sf_entry * xfs_attr_sf_findname( - struct xfs_da_args *args, - struct xfs_attr_sf_entry **sfep, - unsigned int *basep) + struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - unsigned int base = sizeof(struct xfs_attr_sf_hdr); - int size = 0; - int end; - int i; + struct xfs_attr_sf_hdr *sf = args->dp->i_af.if_data; + struct xfs_attr_sf_entry *sfe; - sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; - sfe = &sf->list[0]; - end = sf->hdr.count; - for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), - base += size, i++) { - size = xfs_attr_sf_entsize(sfe); - if (!xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - continue; - break; + for (sfe = xfs_attr_sf_firstentry(sf); + sfe < xfs_attr_sf_endptr(sf); + sfe = xfs_attr_sf_nextentry(sfe)) { + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return sfe; } - if (sfep != NULL) - *sfep = sfe; - - if (basep != NULL) - *basep = base; - - if (i == end) - return -ENOATTR; - return -EEXIST; + return NULL; } /* @@ -751,38 +727,31 @@ xfs_attr_shortform_add( struct xfs_da_args *args, int forkoff) { - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_ifork *ifp = &dp->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; - int offset, size; - struct xfs_mount *mp; - struct xfs_inode *dp; - struct xfs_ifork *ifp; + int size; trace_xfs_attr_sf_add(args); - dp = args->dp; - mp = dp->i_mount; dp->i_forkoff = forkoff; - ifp = &dp->i_af; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) - ASSERT(0); + ASSERT(!xfs_attr_sf_findname(args)); - offset = (char *)sfe - (char *)sf; size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset); + sf = xfs_idata_realloc(dp, size, XFS_ATTR_FORK); + sfe = xfs_attr_sf_endptr(sf); sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; sfe->flags = args->attr_filter; memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); - sf->hdr.count++; - be16_add_cpu(&sf->hdr.totsize, size); + sf->count++; + be16_add_cpu(&sf->totsize, size); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); xfs_sbversion_add_attr2(mp, args->trans); @@ -811,48 +780,43 @@ int xfs_attr_sf_removename( struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; - int size = 0, end, totsize; - unsigned int base; - struct xfs_mount *mp; - struct xfs_inode *dp; - int error; + uint16_t totsize = be16_to_cpu(sf->totsize); + void *next, *end; + int size = 0; trace_xfs_attr_sf_remove(args); - dp = args->dp; - mp = dp->i_mount; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; - - error = xfs_attr_sf_findname(args, &sfe, &base); - - /* - * If we are recovering an operation, finding nothing to - * remove is not an error - it just means there was nothing - * to clean up. 
- */ - if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY)) - return 0; - if (error != -EEXIST) - return error; - size = xfs_attr_sf_entsize(sfe); + sfe = xfs_attr_sf_findname(args); + if (!sfe) { + /* + * If we are recovering an operation, finding nothing to remove + * is not an error, it just means there was nothing to clean up. + */ + if (args->op_flags & XFS_DA_OP_RECOVERY) + return 0; + return -ENOATTR; + } /* * Fix up the attribute fork data, covering the hole */ - end = base + size; - totsize = be16_to_cpu(sf->hdr.totsize); - if (end != totsize) - memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end); - sf->hdr.count--; - be16_add_cpu(&sf->hdr.totsize, -size); + size = xfs_attr_sf_entsize(sfe); + next = xfs_attr_sf_nextentry(sfe); + end = xfs_attr_sf_endptr(sf); + if (next < end) + memmove(sfe, next, end - next); + sf->count--; + totsize -= size; + sf->totsize = cpu_to_be16(totsize); /* * Fix up the start offset of the attribute fork */ - totsize -= size; - if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && + if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { xfs_attr_fork_remove(dp, args->trans); @@ -860,7 +824,7 @@ xfs_attr_sf_removename( xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_forkoff); - ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || + ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE); @@ -874,33 +838,6 @@ xfs_attr_sf_removename( } /* - * Look up a name in a shortform attribute list structure. - */ -/*ARGSUSED*/ -int -xfs_attr_shortform_lookup(xfs_da_args_t *args) -{ - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - int i; - struct xfs_ifork *ifp; - - trace_xfs_attr_sf_lookup(args); - - ifp = &args->dp->i_af; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; - sfe = xfs_attr_sf_nextentry(sfe), i++) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - return -EEXIST; - } - return -ENOATTR; -} - -/* * Retrieve the attribute value and length. * * If args->valuelen is zero, only the length needs to be returned. Unlike a @@ -909,23 +846,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) */ int xfs_attr_shortform_getvalue( - struct xfs_da_args *args) + struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - int i; + struct xfs_attr_sf_entry *sfe; ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; - sfe = xfs_attr_sf_nextentry(sfe), i++) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - return xfs_attr_copy_value(args, - &sfe->nameval[args->namelen], sfe->valuelen); - } - return -ENOATTR; + + trace_xfs_attr_sf_lookup(args); + + sfe = xfs_attr_sf_findname(args); + if (!sfe) + return -ENOATTR; + return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], + sfe->valuelen); } /* Convert from using the shortform to the leaf format. 
*/ @@ -933,26 +866,23 @@ int xfs_attr_shortform_to_leaf( struct xfs_da_args *args) { - struct xfs_inode *dp; - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_ifork *ifp = &dp->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; + int size = be16_to_cpu(sf->totsize); struct xfs_da_args nargs; char *tmpbuffer; - int error, i, size; + int error, i; xfs_dablk_t blkno; struct xfs_buf *bp; - struct xfs_ifork *ifp; trace_xfs_attr_sf_to_leaf(args); - dp = args->dp; - ifp = &dp->i_af; - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); - memcpy(tmpbuffer, ifp->if_u1.if_data, size); - sf = (struct xfs_attr_shortform *)tmpbuffer; + memcpy(tmpbuffer, ifp->if_data, size); + sf = (struct xfs_attr_sf_hdr *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); @@ -975,8 +905,8 @@ xfs_attr_shortform_to_leaf( nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; i++) { + sfe = xfs_attr_sf_firstentry(sf); + for (i = 0; i < sf->count; i++) { nargs.name = sfe->nameval; nargs.namelen = sfe->namelen; nargs.value = &sfe->nameval[nargs.namelen]; @@ -1040,23 +970,16 @@ xfs_attr_shortform_allfit( return xfs_attr_shortform_bytesfit(dp, bytes); } -/* Verify the consistency of an inline attribute fork. */ +/* Verify the consistency of a raw inline attribute fork. */ xfs_failaddr_t xfs_attr_shortform_verify( - struct xfs_inode *ip) + struct xfs_attr_sf_hdr *sfp, + size_t size) { - struct xfs_attr_shortform *sfp; - struct xfs_attr_sf_entry *sfep; + struct xfs_attr_sf_entry *sfep = xfs_attr_sf_firstentry(sfp); struct xfs_attr_sf_entry *next_sfep; char *endp; - struct xfs_ifork *ifp; int i; - int64_t size; - - ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); - ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); - sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - size = ifp->if_bytes; /* * Give up if the attribute is way too short. @@ -1067,8 +990,7 @@ xfs_attr_shortform_verify( endp = (char *)sfp + size; /* Check all reported entries */ - sfep = &sfp->list[0]; - for (i = 0; i < sfp->hdr.count; i++) { + for (i = 0; i < sfp->count; i++) { /* * struct xfs_attr_sf_entry has a variable length. * Check the fixed-offset parts of the structure are @@ -1244,14 +1166,10 @@ xfs_attr3_leaf_to_node( if (error) goto out; - /* copy leaf to new buffer, update identifiers */ - xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); - bp2->b_ops = bp1->b_ops; - memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); - if (xfs_has_crc(mp)) { - struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; - hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2)); - } + /* + * Copy leaf to new buffer and log it. 
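The shortform rework above, together with the firstentry/nextentry/endptr helpers added to xfs_attr_sf.h further down, walks one byte-packed array of variable-length records bounded by the totsize field in the header. A standalone model of that layout and walk; the hdr and ent types are invented stand-ins, not the on-disk XFS structures:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Invented stand-ins for xfs_attr_sf_hdr / xfs_attr_sf_entry. */
struct hdr { uint16_t totsize; uint8_t count; uint8_t pad; };
struct ent { uint8_t namelen; uint8_t valuelen; char nameval[]; };

static struct ent *first_entry(struct hdr *h) { return (struct ent *)(h + 1); }
static struct ent *next_entry(struct ent *e)
{
	return (struct ent *)((char *)e + sizeof(*e) + e->namelen + e->valuelen);
}
static void *end_ptr(struct hdr *h) { return (char *)h + h->totsize; }

int main(void)
{
	union { struct hdr h; char bytes[64]; } u = { { 0 } };
	struct hdr *h = &u.h;
	struct ent *e = first_entry(h);

	/* Pack two entries: "ab" = "1" and "cde" = "23". */
	h->count = 2;
	e->namelen = 2; e->valuelen = 1; memcpy(e->nameval, "ab1", 3);
	e = next_entry(e);
	e->namelen = 3; e->valuelen = 2; memcpy(e->nameval, "cde23", 5);
	e = next_entry(e);
	h->totsize = (char *)e - (char *)h;

	/* The lookup and remove walks bound themselves by totsize. */
	for (e = first_entry(h); (void *)e < end_ptr(h); e = next_entry(e))
		printf("%.*s = %.*s\n", e->namelen, e->nameval,
		       e->valuelen, e->nameval + e->namelen);
	return 0;
}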
+ */ + xfs_da_buf_copy(bp2, bp1, args->geo->blksize); xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); /* diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 368f4d9fa1d5..9b9948639c0f 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -47,16 +47,14 @@ struct xfs_attr3_icleaf_hdr { */ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); -int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_sf_removename(struct xfs_da_args *args); -int xfs_attr_sf_findname(struct xfs_da_args *args, - struct xfs_attr_sf_entry **sfep, - unsigned int *basep); +struct xfs_attr_sf_entry *xfs_attr_sf_findname(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); -xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_attr_sf_hdr *sfp, + size_t size); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index 37578b369d9b..bc4422223024 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -7,14 +7,6 @@ #define __XFS_ATTR_SF_H__ /* - * Attribute storage when stored inside the inode. - * - * Small attribute lists are packed as tightly as possible so as - * to fit into the literal area of the inode. - */ -typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; - -/* * We generate this then sort it, attr_list() must return things in hash-order. */ typedef struct xfs_attr_sf_sort { @@ -41,11 +33,25 @@ static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep) return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen); } -/* next entry in struct */ +/* first entry in the SF attr fork */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_firstentry(struct xfs_attr_sf_hdr *hdr) +{ + return (struct xfs_attr_sf_entry *)(hdr + 1); +} + +/* next entry after sfep */ static inline struct xfs_attr_sf_entry * xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep) { return (void *)sfep + xfs_attr_sf_entsize(sfep); } +/* pointer to the space after the last entry, e.g. 
for adding a new one */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_endptr(struct xfs_attr_sf_hdr *sf) +{ + return (void *)sf + be16_to_cpu(sf->totsize); +} + #endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index be62acffad6c..98aaca933bdd 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -575,7 +575,7 @@ xfs_bmap_btree_to_extents( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; @@ -747,7 +747,7 @@ xfs_bmap_local_to_extents_empty( ASSERT(ifp->if_nextents == 0); xfs_bmap_forkoff_reset(ip, whichfork); - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; ifp->if_format = XFS_DINODE_FMT_EXTENTS; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -832,7 +832,7 @@ xfs_bmap_local_to_extents( xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags |= XFS_ILOG_CORE; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; rec.br_startoff = 0; @@ -3044,7 +3044,8 @@ xfs_bmap_extsize_align( #define XFS_ALLOC_GAP_UNITS 4 -void +/* returns true if ap->blkno was modified */ +bool xfs_bmap_adjacent( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { @@ -3079,13 +3080,14 @@ xfs_bmap_adjacent( if (adjust && ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) ap->blkno += adjust; + return true; } /* * If not at eof, then compare the two neighbor blocks. * Figure out whether either one gives us a good starting point, * and pick the better one. */ - else if (!ap->eof) { + if (!ap->eof) { xfs_fsblock_t gotbno; /* right side block number */ xfs_fsblock_t gotdiff=0; /* right side difference */ xfs_fsblock_t prevbno; /* left side block number */ @@ -3165,14 +3167,21 @@ xfs_bmap_adjacent( * If both valid, pick the better one, else the only good * one, else ap->blkno is already set (to 0 or the inode block). */ - if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) + if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) { ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno; - else if (prevbno != NULLFSBLOCK) + return true; + } + if (prevbno != NULLFSBLOCK) { ap->blkno = prevbno; - else if (gotbno != NULLFSBLOCK) + return true; + } + if (gotbno != NULLFSBLOCK) { ap->blkno = gotbno; + return true; + } } #undef ISVALID + return false; } int @@ -3263,11 +3272,14 @@ xfs_bmap_btalloc_select_lengths( } /* Update all inode and quota accounting for the allocation we just did. */ -static void -xfs_bmap_btalloc_accounting( - struct xfs_bmalloca *ap, - struct xfs_alloc_arg *args) +void +xfs_bmap_alloc_account( + struct xfs_bmalloca *ap) { + bool isrt = XFS_IS_REALTIME_INODE(ap->ip) && + (ap->flags & XFS_BMAPI_ATTRFORK); + uint fld; + if (ap->flags & XFS_BMAPI_COWFORK) { /* * COW fork blocks are in-core only and thus are treated as @@ -3279,7 +3291,7 @@ xfs_bmap_btalloc_accounting( * yet. */ if (ap->wasdel) { - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); return; } @@ -3291,22 +3303,25 @@ xfs_bmap_btalloc_accounting( * This essentially transfers the transaction quota reservation * to that of a delalloc extent. */ - ap->ip->i_delayed_blks += args->len; - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS, - -(long)args->len); + ap->ip->i_delayed_blks += ap->length; + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, isrt ? 
+ XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS, + -(long)ap->length); return; } /* data/attr fork only */ - ap->ip->i_nblocks += args->len; + ap->ip->i_nblocks += ap->length; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) { - ap->ip->i_delayed_blks -= args->len; - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); + ap->ip->i_delayed_blks -= ap->length; + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); + fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT; + } else { + fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; } - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT, - args->len); + + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, fld, ap->length); } static int @@ -3380,7 +3395,7 @@ xfs_bmap_process_allocated_extent( ap->offset = orig_offset; else if (ap->offset + ap->length < orig_offset + orig_length) ap->offset = orig_offset + orig_length - ap->length; - xfs_bmap_btalloc_accounting(ap, args); + xfs_bmap_alloc_account(ap); } #ifdef DEBUG @@ -5010,7 +5025,6 @@ xfs_bmap_del_extent_real( xfs_fileoff_t del_endoff; /* first offset past del */ int do_fx; /* free extent at end of routine */ int error; /* error return value */ - int flags = 0;/* inode logging flags */ struct xfs_bmbt_irec got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ @@ -5023,6 +5037,8 @@ xfs_bmap_del_extent_real( uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; + *logflagsp = 0; + mp = ip->i_mount; XFS_STATS_INC(mp, xs_del_exlist); @@ -5035,7 +5051,6 @@ xfs_bmap_del_extent_real( ASSERT(got_endoff >= del_endoff); ASSERT(!isnullstartblock(got.br_startblock)); qfield = 0; - error = 0; /* * If it's the case where the directory code is running with no block @@ -5051,13 +5066,13 @@ xfs_bmap_del_extent_real( del->br_startoff > got.br_startoff && del_endoff < got_endoff) return -ENOSPC; - flags = XFS_ILOG_CORE; + *logflagsp = XFS_ILOG_CORE; if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { if (!(bflags & XFS_BMAPI_REMAP)) { error = xfs_rtfree_blocks(tp, del->br_startblock, del->br_blockcount); if (error) - goto done; + return error; } do_fx = 0; @@ -5072,11 +5087,9 @@ xfs_bmap_del_extent_real( if (cur) { error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } + return error; + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } if (got.br_startoff == del->br_startoff) @@ -5093,17 +5106,15 @@ xfs_bmap_del_extent_real( xfs_iext_prev(ifp, icur); ifp->if_nextents--; - flags |= XFS_ILOG_CORE; + *logflagsp |= XFS_ILOG_CORE; if (!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_btree_delete(cur, &i))) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } + return error; + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; break; case BMAP_LEFT_FILLING: /* @@ -5114,12 +5125,12 @@ xfs_bmap_del_extent_real( got.br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, &got); if (!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; break; case BMAP_RIGHT_FILLING: /* @@ -5128,12 +5139,12 @@ xfs_bmap_del_extent_real( got.br_blockcount -= del->br_blockcount; xfs_iext_update_extent(ip, state, icur, &got); if 
(!cur) { - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); break; } error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; break; case 0: /* @@ -5150,18 +5161,18 @@ xfs_bmap_del_extent_real( new.br_state = got.br_state; new.br_startblock = del_endblock; - flags |= XFS_ILOG_CORE; + *logflagsp |= XFS_ILOG_CORE; if (cur) { error = xfs_bmbt_update(cur, &got); if (error) - goto done; + return error; error = xfs_btree_increment(cur, 0, &i); if (error) - goto done; + return error; cur->bc_rec.b = new; error = xfs_btree_insert(cur, &i); if (error && error != -ENOSPC) - goto done; + return error; /* * If get no-space back from btree insert, it tried a * split, and we have a zero block reservation. Fix up @@ -5174,33 +5185,28 @@ xfs_bmap_del_extent_real( */ error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } + return error; + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; /* * Update the btree record back * to the original value. */ error = xfs_bmbt_update(cur, &old); if (error) - goto done; + return error; /* * Reset the extent record back * to the original value. */ xfs_iext_update_extent(ip, state, icur, &old); - flags = 0; - error = -ENOSPC; - goto done; - } - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; + *logflagsp = 0; + return -ENOSPC; } + if (XFS_IS_CORRUPT(mp, i != 1)) + return -EFSCORRUPTED; } else - flags |= xfs_ilog_fext(whichfork); + *logflagsp |= xfs_ilog_fext(whichfork); ifp->if_nextents++; xfs_iext_next(ifp, icur); @@ -5218,13 +5224,13 @@ xfs_bmap_del_extent_real( if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); } else { - error = __xfs_free_extent_later(tp, del->br_startblock, + error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, XFS_AG_RESV_NONE, ((bflags & XFS_BMAPI_NODISCARD) || del->br_state == XFS_EXT_UNWRITTEN)); if (error) - goto done; + return error; } } @@ -5239,9 +5245,7 @@ xfs_bmap_del_extent_real( if (qfield && !(bflags & XFS_BMAPI_REMAP)) xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); -done: - *logflagsp = flags; - return error; + return 0; } /* @@ -5250,7 +5254,7 @@ done: * that value. If not all extents in the block range can be removed then * *done is set. */ -int /* error */ +static int __xfs_bunmapi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ @@ -6102,7 +6106,7 @@ __xfs_bmap_add( bi->bi_bmap = *bmap; xfs_bmap_update_get_group(tp->t_mountp, bi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); + xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); return 0; } @@ -6179,19 +6183,18 @@ xfs_bmap_finish_one( return error; } -/* Check that an inode's extent does not have invalid flags or bad ranges. */ +/* Check that an extent does not have invalid flags or bad ranges. 
*/ xfs_failaddr_t -xfs_bmap_validate_extent( - struct xfs_inode *ip, +xfs_bmap_validate_extent_raw( + struct xfs_mount *mp, + bool rtfile, int whichfork, struct xfs_bmbt_irec *irec) { - struct xfs_mount *mp = ip->i_mount; - if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) return __this_address; - if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) { + if (rtfile && whichfork == XFS_DATA_FORK) { if (!xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount)) return __this_address; @@ -6221,3 +6224,53 @@ xfs_bmap_intent_destroy_cache(void) kmem_cache_destroy(xfs_bmap_intent_cache); xfs_bmap_intent_cache = NULL; } + +/* Check that an inode's extent does not have invalid flags or bad ranges. */ +xfs_failaddr_t +xfs_bmap_validate_extent( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *irec) +{ + return xfs_bmap_validate_extent_raw(ip->i_mount, + XFS_IS_REALTIME_INODE(ip), whichfork, irec); +} + +/* + * Used in xfs_itruncate_extents(). This is the maximum number of extents + * freed from a file in a single transaction. + */ +#define XFS_ITRUNC_MAX_EXTENTS 2 + +/* + * Unmap every extent in part of an inode's fork. We don't do any higher level + * invalidation work at all. + */ +int +xfs_bunmapi_range( + struct xfs_trans **tpp, + struct xfs_inode *ip, + uint32_t flags, + xfs_fileoff_t startoff, + xfs_fileoff_t endoff) +{ + xfs_filblks_t unmap_len = endoff - startoff + 1; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + while (unmap_len > 0) { + ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); + error = __xfs_bunmapi(*tpp, ip, startoff, &unmap_len, flags, + XFS_ITRUNC_MAX_EXTENTS); + if (error) + goto out; + + /* free the just unmapped extents */ + error = xfs_defer_finish(tpp); + if (error) + goto out; + } +out: + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e33470e39728..f6b73f1bad5f 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -116,6 +116,8 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) return XFS_DATA_FORK; } +void xfs_bmap_alloc_account(struct xfs_bmalloca *ap); + /* * Special values for xfs_bmbt_irec_t br_startblock field. 
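The new xfs_bunmapi_range() helper above unmaps a fork range in passes of at most XFS_ITRUNC_MAX_EXTENTS extents, finishing the deferred frees between passes so no single transaction accumulates unbounded work. A minimal caller sketch follows; it assumes the inode is already ILOCKed and joined to a permanent-reservation transaction, the example_ name is invented, and it is not meant to compile standalone.

/*
 * Sketch only: remove every data-fork mapping in [startoff, endoff].
 * xfs_bunmapi_range() rolls *tpp internally via xfs_defer_finish(), so the
 * caller keeps working with whatever transaction comes back in *tpp.
 */
static int
example_punch_range(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	xfs_fileoff_t		startoff,
	xfs_fileoff_t		endoff)
{
	int			error;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));

	error = xfs_bunmapi_range(tpp, ip, 0, startoff, endoff);
	if (error)
		return error;

	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
	return 0;
}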
*/ @@ -190,9 +192,6 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); -int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags, - xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); @@ -263,6 +262,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork) } } +xfs_failaddr_t xfs_bmap_validate_extent_raw(struct xfs_mount *mp, bool rtfile, + int whichfork, struct xfs_bmbt_irec *irec); xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, @@ -271,6 +272,8 @@ int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, uint32_t flags); +int xfs_bunmapi_range(struct xfs_trans **tpp, struct xfs_inode *ip, + uint32_t flags, xfs_fileoff_t startoff, xfs_fileoff_t endoff); extern struct kmem_cache *xfs_bmap_intent_cache; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index bf3f1b36fdd2..71f2d50f7823 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -15,6 +15,7 @@ #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" @@ -272,7 +273,7 @@ xfs_bmbt_free_block( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; @@ -288,10 +289,7 @@ xfs_bmbt_get_minrecs( int level) { if (level == cur->bc_nlevels - 1) { - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(cur->bc_ino.ip, - cur->bc_ino.whichfork); + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0) / 2; @@ -306,10 +304,7 @@ xfs_bmbt_get_maxrecs( int level) { if (level == cur->bc_nlevels - 1) { - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(cur->bc_ino.ip, - cur->bc_ino.whichfork); + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0); @@ -543,23 +538,19 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .keys_contiguous = xfs_bmbt_keys_contiguous, }; -/* - * Allocate a new bmap btree cursor. 
- */ -struct xfs_btree_cur * /* new bmap btree cursor */ -xfs_bmbt_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* inode owning the btree */ - int whichfork) /* data or attr fork */ +static struct xfs_btree_cur * +xfs_bmbt_init_common( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; + ASSERT(whichfork != XFS_COW_FORK); cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); - cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; @@ -567,10 +558,30 @@ xfs_bmbt_init_cursor( if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.ip = ip; cur->bc_ino.allocated = 0; cur->bc_ino.flags = 0; + + return cur; +} + +/* + * Allocate a new bmap btree cursor. + */ +struct xfs_btree_cur * +xfs_bmbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + struct xfs_btree_cur *cur; + + cur = xfs_bmbt_init_common(mp, tp, ip, whichfork); + + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.whichfork = whichfork; return cur; @@ -588,6 +599,76 @@ xfs_bmbt_block_maxrecs( } /* + * Allocate a new bmap btree cursor for reloading an inode block mapping data + * structure. Note that callers can use the staged cursor to reload extents + * format inode forks if they rebuild the iext tree and commit the staged + * cursor immediately. + */ +struct xfs_btree_cur * +xfs_bmbt_stage_cursor( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake) +{ + struct xfs_btree_cur *cur; + struct xfs_btree_ops *ops; + + /* data fork always has larger maxheight */ + cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK); + cur->bc_nlevels = ifake->if_levels; + cur->bc_ino.forksize = ifake->if_fork_size; + + /* Don't let anyone think we're attached to the real fork yet. */ + cur->bc_ino.whichfork = -1; + xfs_btree_stage_ifakeroot(cur, ifake, &ops); + ops->update_cursor = NULL; + return cur; +} + +/* + * Swap in the new inode fork root. Once we pass this point the newly rebuilt + * mappings are in place and we have to kill off any old btree blocks. + */ +void +xfs_bmbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + int whichfork) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + static const short brootflag[2] = {XFS_ILOG_DBROOT, XFS_ILOG_ABROOT}; + static const short extflag[2] = {XFS_ILOG_DEXT, XFS_ILOG_AEXT}; + int flags = XFS_ILOG_CORE; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(whichfork != XFS_COW_FORK); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. 
+ */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, whichfork); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + switch (ifp->if_format) { + case XFS_DINODE_FMT_EXTENTS: + flags |= extflag[whichfork]; + break; + case XFS_DINODE_FMT_BTREE: + flags |= brootflag[whichfork]; + break; + default: + ASSERT(0); + break; + } + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_bmbt_ops); +} + +/* * Calculate number of records in a bmap btree block. */ int diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 3e7a40a83835..151b8491f60e 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -11,6 +11,7 @@ struct xfs_btree_block; struct xfs_mount; struct xfs_inode; struct xfs_trans; +struct xbtree_ifakeroot; /* * Btree block header size depends on a superblock flag. @@ -106,6 +107,10 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +struct xfs_btree_cur *xfs_bmbt_stage_cursor(struct xfs_mount *mp, + struct xfs_inode *ip, struct xbtree_ifakeroot *ifake); +void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, int whichfork); extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp, unsigned long long len); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 6a6503ab0cd7..ea8d3659df20 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1330,7 +1330,7 @@ xfs_btree_get_buf_block( * Read in the buffer at the given ptr and return the buffer and * the block pointer within the buffer. */ -STATIC int +int xfs_btree_read_buf_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, @@ -5212,3 +5212,29 @@ xfs_btree_destroy_cur_caches(void) xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); } + +/* Move the btree cursor before the first record. 
*/ +int +xfs_btree_goto_left_edge( + struct xfs_btree_cur *cur) +{ + int stat = 0; + int error; + + memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) + return 0; + + error = xfs_btree_decrement(cur, 0, &stat); + if (error) + return error; + if (stat != 0) { + ASSERT(0); + return -EFSCORRUPTED; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 4d68a58be160..d906324e25c8 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -700,6 +700,9 @@ void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, struct xfs_buf **bpp); +int xfs_btree_read_buf_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int flags, + struct xfs_btree_block **block, struct xfs_buf **bpp); void xfs_btree_set_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr); @@ -735,4 +738,6 @@ xfs_btree_alloc_cursor( int __init xfs_btree_init_cur_caches(void); void xfs_btree_destroy_cur_caches(void); +int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index dd75e208b543..e276eba87cb1 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -333,20 +333,41 @@ xfs_btree_commit_ifakeroot( /* * Put a btree block that we're loading onto the ordered list and release it. * The btree blocks will be written to disk when bulk loading is finished. + * If we reach the dirty buffer threshold, flush them to disk before + * continuing. */ -static void +static int xfs_btree_bload_drop_buf( - struct list_head *buffers_list, - struct xfs_buf **bpp) + struct xfs_btree_bload *bbl, + struct list_head *buffers_list, + struct xfs_buf **bpp) { - if (*bpp == NULL) - return; + struct xfs_buf *bp = *bpp; + int error; + + if (!bp) + return 0; - if (!xfs_buf_delwri_queue(*bpp, buffers_list)) - ASSERT(0); + /* + * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent + * xfs_buf_read will not pointlessly reread the contents from the disk. + */ + bp->b_flags |= XBF_DONE; - xfs_buf_relse(*bpp); + xfs_buf_delwri_queue_here(bp, buffers_list); + xfs_buf_relse(bp); *bpp = NULL; + bbl->nr_dirty++; + + if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty) + return 0; + + error = xfs_buf_delwri_submit(buffers_list); + if (error) + return error; + + bbl->nr_dirty = 0; + return 0; } /* @@ -384,7 +405,7 @@ xfs_btree_bload_prep_block( ASSERT(*bpp == NULL); /* Allocate a new incore btree root block. */ - new_size = bbl->iroot_size(cur, nr_this_block, priv); + new_size = bbl->iroot_size(cur, level, nr_this_block, priv); ifp->if_broot = kmem_zalloc(new_size, 0); ifp->if_broot_bytes = (int)new_size; @@ -418,7 +439,10 @@ xfs_btree_bload_prep_block( */ if (*blockp) xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB); - xfs_btree_bload_drop_buf(buffers_list, bpp); + + ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp); + if (ret) + return ret; /* Initialize the new btree block. 
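xfs_btree_goto_left_edge() above parks the cursor just before the first record by doing an XFS_LOOKUP_LE lookup on an all-zero key and then decrementing once. Below is a hedged sketch of the intended caller pattern, walking every record from left to right; decoding the record under the cursor is btree-type specific (for example xfs_inobt_get_rec() for the inode btrees) and is left out, and example_walk_all_records() is an invented name.

/*
 * Sketch only: visit every record reachable from an initialized cursor.
 */
static int
example_walk_all_records(
	struct xfs_btree_cur	*cur)
{
	int			stat;
	int			error;

	error = xfs_btree_goto_left_edge(cur);
	if (error)
		return error;

	for (;;) {
		error = xfs_btree_increment(cur, 0, &stat);
		if (error)
			return error;
		if (!stat)
			break;	/* ran off the right edge of the btree */

		/* ...examine the record now under the cursor... */
	}
	return 0;
}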
*/ xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block); @@ -436,22 +460,19 @@ STATIC int xfs_btree_bload_leaf( struct xfs_btree_cur *cur, unsigned int recs_this_block, - xfs_btree_bload_get_record_fn get_record, + xfs_btree_bload_get_records_fn get_records, struct xfs_btree_block *block, void *priv) { - unsigned int j; + unsigned int j = 1; int ret; /* Fill the leaf block with records. */ - for (j = 1; j <= recs_this_block; j++) { - union xfs_btree_rec *block_rec; - - ret = get_record(cur, priv); - if (ret) + while (j <= recs_this_block) { + ret = get_records(cur, j, block, recs_this_block - j + 1, priv); + if (ret < 0) return ret; - block_rec = xfs_btree_rec_addr(cur, j, block); - cur->bc_ops->init_rec_from_cur(cur, block_rec); + j += ret; } return 0; @@ -485,7 +506,12 @@ xfs_btree_bload_node( ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); - ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block, + /* + * Read the lower-level block in case the buffer for it has + * been reclaimed. LRU refs will be set on the block, which is + * desirable if the new btree commits. + */ + ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block, &child_bp); if (ret) return ret; @@ -570,7 +596,14 @@ xfs_btree_bload_level_geometry( unsigned int desired_npb; unsigned int maxnr; - maxnr = cur->bc_ops->get_maxrecs(cur, level); + /* + * Compute the absolute maximum number of records that we can store in + * the ondisk block or inode root. + */ + if (cur->bc_ops->get_dmaxrecs) + maxnr = cur->bc_ops->get_dmaxrecs(cur, level); + else + maxnr = cur->bc_ops->get_maxrecs(cur, level); /* * Compute the number of blocks we need to fill each block with the @@ -764,6 +797,7 @@ xfs_btree_bload( cur->bc_nlevels = bbl->btree_height; xfs_btree_set_ptr_null(cur, &child_ptr); xfs_btree_set_ptr_null(cur, &ptr); + bbl->nr_dirty = 0; xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &blocks, &blocks_with_extra); @@ -789,7 +823,7 @@ xfs_btree_bload( trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, nr_this_block); - ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record, + ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records, block, priv); if (ret) goto out; @@ -802,7 +836,10 @@ xfs_btree_bload( xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); } total_blocks += blocks; - xfs_btree_bload_drop_buf(&buffers_list, &bp); + + ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp); + if (ret) + goto out; /* Populate the internal btree nodes. */ for (level = 1; level < cur->bc_nlevels; level++) { @@ -844,7 +881,11 @@ xfs_btree_bload( xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); } total_blocks += blocks; - xfs_btree_bload_drop_buf(&buffers_list, &bp); + + ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp); + if (ret) + goto out; + xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1); } diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index f0d2976050ae..055ea43b1e18 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -37,12 +37,6 @@ struct xbtree_ifakeroot { /* Number of bytes available for this fork in the inode. */ unsigned int if_fork_size; - - /* Fork format. */ - unsigned int if_format; - - /* Number of records. */ - unsigned int if_extents; }; /* Cursor interactions with fake roots for inode-rooted btrees. 
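The leaf loader above now hands its callback a whole block and asks it to fill up to nr_wanted record slots starting at idx, advancing by however many records the callback reports. Here is a hedged sketch of such a callback that loads one inode-btree record per call from an in-memory array; struct example_reload and example_get_records() are invented, and it assumes xfs_btree_rec_addr() is usable by callers in the way the in-kernel loaders use it. A caller would point the bulk-load control structure's get_records hook at this before calling xfs_btree_bload().

/*
 * Sketch only: stage one incore record in the cursor, format it into the
 * block, and tell the loader we consumed one of the @nr_wanted slots.
 */
struct example_reload {
	struct xfs_inobt_rec_incore	*recs;
	unsigned int			next;
};

static int
example_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct example_reload	*rl = priv;
	union xfs_btree_rec	*rec;

	cur->bc_rec.i = rl->recs[rl->next++];
	rec = xfs_btree_rec_addr(cur, idx, block);
	cur->bc_ops->init_rec_from_cur(cur, rec);
	return 1;
}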
*/ @@ -53,19 +47,24 @@ void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, int whichfork, const struct xfs_btree_ops *ops); /* Bulk loading of staged btrees. */ -typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv); +typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur, + unsigned int idx, struct xfs_btree_block *block, + unsigned int nr_wanted, void *priv); typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, void *priv); typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, - unsigned int nr_this_level, void *priv); + unsigned int level, unsigned int nr_this_level, void *priv); struct xfs_btree_bload { /* - * This function will be called nr_records times to load records into - * the btree. The function does this by setting the cursor's bc_rec - * field in in-core format. Records must be returned in sort order. + * This function will be called to load @nr_wanted records into the + * btree. The implementation does this by setting the cursor's bc_rec + * field in in-core format and using init_rec_from_cur to set the + * records in the btree block. Records must be returned in sort order. + * The function must return the number of records loaded or the usual + * negative errno. */ - xfs_btree_bload_get_record_fn get_record; + xfs_btree_bload_get_records_fn get_records; /* * This function will be called nr_blocks times to obtain a pointer @@ -113,6 +112,16 @@ struct xfs_btree_bload { * height of the new btree. */ unsigned int btree_height; + + /* + * Flush the new btree block buffer list to disk after this many blocks + * have been formatted. Zero prohibits writing any buffers until all + * blocks have been formatted. + */ + uint16_t max_dirty; + + /* Number of dirty buffers. */ + uint16_t nr_dirty; }; int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur, diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e576560b46e9..5457188bb4de 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -421,6 +421,25 @@ xfs_da3_node_read_mapped( return xfs_da3_node_set_type(tp, *bpp); } +/* + * Copy src directory/attr leaf/node buffer to the dst. + * For v5 file systems make sure the right blkno is stamped in. + */ +void +xfs_da_buf_copy( + struct xfs_buf *dst, + struct xfs_buf *src, + size_t size) +{ + struct xfs_da3_blkinfo *da3 = dst->b_addr; + + memcpy(dst->b_addr, src->b_addr, size); + dst->b_ops = src->b_ops; + xfs_trans_buf_copy_type(dst, src); + if (xfs_has_crc(dst->b_mount)) + da3->blkno = cpu_to_be64(xfs_buf_daddr(dst)); +} + /*======================================================================== * Routines used for growing the Btree. *========================================================================*/ @@ -690,12 +709,6 @@ xfs_da3_root_split( btree = icnodehdr.btree; size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); level = icnodehdr.level; - - /* - * we are about to copy oldroot to bp, so set up the type - * of bp while we know exactly what it will be. - */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { struct xfs_dir3_icleaf_hdr leafhdr; @@ -707,31 +720,17 @@ xfs_da3_root_split( size = (int)((char *)&leafhdr.ents[leafhdr.count] - (char *)leaf); level = 0; - - /* - * we are about to copy oldroot to bp, so set up the type - * of bp while we know exactly what it will be. 
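The max_dirty/nr_dirty pair above throttles the bulk loader: each freshly formatted block is queued on a delwri list, and once nr_dirty reaches the caller-chosen max_dirty the list is written out and the counter reset, so dirty metadata does not pile up without bound. The standalone toy program below illustrates the same throttling pattern; none of the names in it are kernel code.

#include <stdio.h>

/* Toy stand-in for a delwri buffer list: just count queued buffers. */
struct toy_loader {
	unsigned int	nr_dirty;	/* buffers queued since the last flush */
	unsigned int	max_dirty;	/* 0 means "never flush early" */
};

/* Pretend to write everything queued so far back to disk. */
static void toy_flush(struct toy_loader *bl)
{
	printf("flushing %u dirty buffers\n", bl->nr_dirty);
	bl->nr_dirty = 0;
}

/* Queue one freshly formatted block, flushing if we hit the threshold. */
static void toy_drop_buf(struct toy_loader *bl)
{
	bl->nr_dirty++;
	if (bl->max_dirty && bl->nr_dirty >= bl->max_dirty)
		toy_flush(bl);
}

int main(void)
{
	struct toy_loader bl = { .nr_dirty = 0, .max_dirty = 4 };

	for (int block = 0; block < 10; block++)
		toy_drop_buf(&bl);
	if (bl.nr_dirty)
		toy_flush(&bl);		/* final flush after formatting finishes */
	return 0;
}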
- */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); } /* - * we can copy most of the information in the node from one block to - * another, but for CRC enabled headers we have to make sure that the - * block specific identifiers are kept intact. We update the buffer - * directly for this. + * Copy old root to new buffer and log it. */ - memcpy(node, oldroot, size); - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || - oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { - struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; - - node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - } + xfs_da_buf_copy(bp, blk1->bp, size); xfs_trans_log_buf(tp, bp, 0, size - 1); - bp->b_ops = blk1->bp->b_ops; - xfs_trans_buf_copy_type(bp, blk1->bp); + /* + * Update blk1 to point to new buffer. + */ blk1->bp = bp; blk1->blkno = blkno; @@ -1220,21 +1219,14 @@ xfs_da3_root_join( xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* - * This could be copying a leaf back into the root block in the case of - * there only being a single leaf block left in the tree. Hence we have - * to update the b_ops pointer as well to match the buffer type change - * that could occur. For dir3 blocks we also need to update the block - * number in the buffer header. + * Copy child to root buffer and log it. */ - memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize); - root_blk->bp->b_ops = bp->b_ops; - xfs_trans_buf_copy_type(root_blk->bp, bp); - if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { - struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; - da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp)); - } + xfs_da_buf_copy(root_blk->bp, bp, args->geo->blksize); xfs_trans_log_buf(args->trans, root_blk->bp, 0, args->geo->blksize - 1); + /* + * Now we can drop the child buffer. + */ error = xfs_da_shrink_inode(args, child, bp); return error; } @@ -2317,9 +2309,10 @@ xfs_da3_swap_lastblock( /* * Copy the last block into the dead buffer and log it. */ - memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); + xfs_da_buf_copy(dead_buf, last_buf, args->geo->blksize); xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); dead_info = dead_buf->b_addr; + /* * Get values from the moved block. */ diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ffa3df5b2893..706baf36e175 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -219,6 +219,8 @@ int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); +void xfs_da_buf_copy(struct xfs_buf *dst, struct xfs_buf *src, + size_t size); uint xfs_da_hashname(const uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index f9015f88eca7..24f9d1461f9a 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -578,20 +578,25 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) #define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ /* - * Entries are packed toward the top as tight as possible. 
- */ -struct xfs_attr_shortform { - struct xfs_attr_sf_hdr { /* constant-structure header block */ - __be16 totsize; /* total bytes in shortform list */ - __u8 count; /* count of active entries */ - __u8 padding; - } hdr; - struct xfs_attr_sf_entry { - uint8_t namelen; /* actual length of name (no NULL) */ - uint8_t valuelen; /* actual length of value (no NULL) */ - uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ - uint8_t nameval[]; /* name & value bytes concatenated */ - } list[]; /* variable sized array */ + * Attribute storage when stored inside the inode. + * + * Small attribute lists are packed as tightly as possible so as to fit into the + * literal area of the inode. + * + * These "shortform" attribute forks consist of a single xfs_attr_sf_hdr header + * followed by zero or more xfs_attr_sf_entry structures. + */ +struct xfs_attr_sf_hdr { /* constant-structure header block */ + __be16 totsize; /* total bytes in shortform list */ + __u8 count; /* count of active entries */ + __u8 padding; +}; + +struct xfs_attr_sf_entry { + __u8 namelen; /* actual length of name (no NULL) */ + __u8 valuelen; /* actual length of value (no NULL) */ + __u8 flags; /* flags bits (XFS_ATTR_*) */ + __u8 nameval[]; /* name & value bytes concatenated */ }; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index f71679ce23b9..66a17910d021 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -26,6 +26,7 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" +#include "xfs_trans_priv.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -181,16 +182,89 @@ static struct kmem_cache *xfs_defer_pending_cache; * Note that the continuation requested between t2 and t3 is likely to * reoccur. */ +STATIC struct xfs_log_item * +xfs_defer_barrier_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return NULL; +} -static const struct xfs_defer_op_type *defer_op_types[] = { - [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type, - [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type, - [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, - [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, - [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, - [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, +STATIC void +xfs_defer_barrier_abort_intent( + struct xfs_log_item *intent) +{ + /* empty */ +} + +STATIC struct xfs_log_item * +xfs_defer_barrier_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + return NULL; +} + +STATIC int +xfs_defer_barrier_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +STATIC void +xfs_defer_barrier_cancel_item( + struct list_head *item) +{ + ASSERT(0); +} + +static const struct xfs_defer_op_type xfs_barrier_defer_type = { + .max_items = 1, + .create_intent = xfs_defer_barrier_create_intent, + .abort_intent = xfs_defer_barrier_abort_intent, + .create_done = xfs_defer_barrier_create_done, + .finish_item = xfs_defer_barrier_finish_item, + .cancel_item = xfs_defer_barrier_cancel_item, }; +/* Create a log intent done item for a log intent item. 
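With the shortform attribute fork now described as one fixed xfs_attr_sf_hdr followed by variable-length xfs_attr_sf_entry records, each entry occupies sizeof(entry) + namelen + valuelen bytes and the next entry starts immediately after it. The standalone program below walks such a packed list; the structures are local copies of the layout shown above, and the big-endian encoding of totsize is ignored for simplicity.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct sf_hdr {
	uint16_t	totsize;	/* total bytes in the list (host endian here) */
	uint8_t		count;		/* number of entries */
	uint8_t		padding;
};

struct sf_entry {
	uint8_t		namelen;
	uint8_t		valuelen;
	uint8_t		flags;
	uint8_t		nameval[];	/* namelen name bytes, then valuelen value bytes */
};

/* Bytes occupied by one packed entry. */
static size_t sf_entsize(const struct sf_entry *sfe)
{
	return sizeof(*sfe) + sfe->namelen + sfe->valuelen;
}

/* The entry immediately after @sfe in the packed list. */
static const struct sf_entry *sf_nextentry(const struct sf_entry *sfe)
{
	return (const struct sf_entry *)((const uint8_t *)sfe + sf_entsize(sfe));
}

int main(void)
{
	uint8_t buf[64] = { 0 };
	struct sf_hdr *hdr = (struct sf_hdr *)buf;
	struct sf_entry *sfe = (struct sf_entry *)(hdr + 1);

	/* Hand-build two entries: "pi" = "314" and "e" = "27". */
	sfe->namelen = 2; sfe->valuelen = 3;
	memcpy(sfe->nameval, "pi314", 5);
	sfe = (struct sf_entry *)((uint8_t *)sfe + sf_entsize(sfe));
	sfe->namelen = 1; sfe->valuelen = 2;
	memcpy(sfe->nameval, "e27", 3);
	hdr->count = 2;
	hdr->totsize = (uint16_t)((uint8_t *)sfe + sf_entsize(sfe) - buf);

	const struct sf_entry *p = (const struct sf_entry *)(hdr + 1);
	for (unsigned int i = 0; i < hdr->count; i++, p = sf_nextentry(p))
		printf("attr %.*s = %.*s\n",
		       p->namelen, (const char *)p->nameval,
		       p->valuelen, (const char *)p->nameval + p->namelen);
	return 0;
}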
*/ +static inline void +xfs_defer_create_done( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + struct xfs_log_item *lip; + + /* If there is no log intent item, there can be no log done item. */ + if (!dfp->dfp_intent) + return; + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the log intent item and frees the log done item + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + if (!lip) + return; + + tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); + dfp->dfp_done = lip; +} + /* * Ensure there's a log intent item associated with this deferred work item if * the operation must be restarted on crash. Returns 1 if there's a log item; @@ -202,18 +276,21 @@ xfs_defer_create_intent( struct xfs_defer_pending *dfp, bool sort) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; struct xfs_log_item *lip; if (dfp->dfp_intent) return 1; - lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, + sort); if (!lip) return 0; if (IS_ERR(lip)) return PTR_ERR(lip); + tp->t_flags |= XFS_TRANS_DIRTY; + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_intent = lip; return 1; } @@ -245,23 +322,50 @@ xfs_defer_create_intents( return ret; } -STATIC void +static inline void xfs_defer_pending_abort( struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + trace_xfs_defer_pending_abort(mp, dfp); + + if (dfp->dfp_intent && !dfp->dfp_done) { + dfp->dfp_ops->abort_intent(dfp->dfp_intent); + dfp->dfp_intent = NULL; + } +} + +static inline void +xfs_defer_pending_cancel_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + struct list_head *pwi; + struct list_head *n; + + trace_xfs_defer_cancel_list(mp, dfp); + + list_del(&dfp->dfp_list); + list_for_each_safe(pwi, n, &dfp->dfp_work) { + list_del(pwi); + dfp->dfp_count--; + trace_xfs_defer_cancel_item(mp, dfp, pwi); + dfp->dfp_ops->cancel_item(pwi); + } + ASSERT(dfp->dfp_count == 0); + kmem_cache_free(xfs_defer_pending_cache, dfp); +} + +STATIC void +xfs_defer_pending_abort_list( + struct xfs_mount *mp, struct list_head *dop_list) { struct xfs_defer_pending *dfp; - const struct xfs_defer_op_type *ops; /* Abort intent items that don't have a done item. */ - list_for_each_entry(dfp, dop_list, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(mp, dfp); - if (dfp->dfp_intent && !dfp->dfp_done) { - ops->abort_intent(dfp->dfp_intent); - dfp->dfp_intent = NULL; - } - } + list_for_each_entry(dfp, dop_list, dfp_list) + xfs_defer_pending_abort(mp, dfp); } /* Abort all the intents that were committed. */ @@ -271,7 +375,7 @@ xfs_defer_trans_abort( struct list_head *dop_pending) { trace_xfs_defer_trans_abort(tp, _RET_IP_); - xfs_defer_pending_abort(tp->t_mountp, dop_pending); + xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); } /* @@ -389,27 +493,31 @@ xfs_defer_cancel_list( { struct xfs_defer_pending *dfp; struct xfs_defer_pending *pli; - struct list_head *pwi; - struct list_head *n; - const struct xfs_defer_op_type *ops; /* * Free the pending items. Caller should already have arranged * for the intent items to be released. 
*/ - list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_cancel_list(mp, dfp); - list_del(&dfp->dfp_list); - list_for_each_safe(pwi, n, &dfp->dfp_work) { - list_del(pwi); - dfp->dfp_count--; - trace_xfs_defer_cancel_item(mp, dfp, pwi); - ops->cancel_item(pwi); - } - ASSERT(dfp->dfp_count == 0); - kmem_cache_free(xfs_defer_pending_cache, dfp); + list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) + xfs_defer_pending_cancel_work(mp, dfp); +} + +static inline void +xfs_defer_relog_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + struct xfs_log_item *lip; + + xfs_defer_create_done(tp, dfp); + + lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); + if (lip) { + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); } + dfp->dfp_done = NULL; + dfp->dfp_intent = lip; } /* @@ -417,7 +525,7 @@ xfs_defer_cancel_list( * done item to release the intent item; and then log a new intent item. * The caller should provide a fresh transaction and roll it after we're done. */ -static int +static void xfs_defer_relog( struct xfs_trans **tpp, struct list_head *dfops) @@ -456,31 +564,28 @@ xfs_defer_relog( trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); XFS_STATS_INC((*tpp)->t_mountp, defer_relog); - dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); - } - if ((*tpp)->t_flags & XFS_TRANS_DIRTY) - return xfs_defer_trans_roll(tpp); - return 0; + xfs_defer_relog_intent(*tpp, dfp); + } } /* * Log an intent-done item for the first pending intent, and finish the work * items. */ -static int +int xfs_defer_finish_one( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + const struct xfs_defer_op_type *ops = dfp->dfp_ops; struct xfs_btree_cur *state = NULL; struct list_head *li, *n; int error; trace_xfs_defer_pending_finish(tp->t_mountp, dfp); - dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + xfs_defer_create_done(tp, dfp); list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; @@ -517,6 +622,24 @@ out: return error; } +/* Move all paused deferred work from @tp to @paused_list. */ +static void +xfs_defer_isolate_paused( + struct xfs_trans *tp, + struct list_head *paused_list) +{ + struct xfs_defer_pending *dfp; + struct xfs_defer_pending *pli; + + list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { + if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) + continue; + + list_move_tail(&dfp->dfp_list, paused_list); + trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); + } +} + /* * Finish all the pending work. This involves logging intent items for * any work items that wandered in since the last transaction roll (if @@ -532,6 +655,7 @@ xfs_defer_finish_noroll( struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); + LIST_HEAD(dop_paused); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -550,6 +674,8 @@ xfs_defer_finish_noroll( */ int has_intents = xfs_defer_create_intents(*tp); + xfs_defer_isolate_paused(*tp, &dop_paused); + list_splice_init(&(*tp)->t_dfops, &dop_pending); if (has_intents < 0) { @@ -562,22 +688,33 @@ xfs_defer_finish_noroll( goto out_shutdown; /* Relog intent items to keep the log moving. 
*/ - error = xfs_defer_relog(tp, &dop_pending); - if (error) - goto out_shutdown; + xfs_defer_relog(tp, &dop_pending); + xfs_defer_relog(tp, &dop_paused); + + if ((*tp)->t_flags & XFS_TRANS_DIRTY) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; + } } - dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, - dfp_list); + dfp = list_first_entry_or_null(&dop_pending, + struct xfs_defer_pending, dfp_list); + if (!dfp) + break; error = xfs_defer_finish_one(*tp, dfp); if (error && error != -EAGAIN) goto out_shutdown; } + /* Requeue the paused items in the outgoing transaction. */ + list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); + trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; out_shutdown: + list_splice_tail_init(&dop_paused, &dop_pending); xfs_defer_trans_abort(*tp, &dop_pending); xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); trace_xfs_defer_finish_error(*tp, error); @@ -590,6 +727,9 @@ int xfs_defer_finish( struct xfs_trans **tp) { +#ifdef DEBUG + struct xfs_defer_pending *dfp; +#endif int error; /* @@ -609,7 +749,10 @@ xfs_defer_finish( } /* Reset LOWMODE now that we've finished all the dfops. */ - ASSERT(list_empty(&(*tp)->t_dfops)); +#ifdef DEBUG + list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); +#endif (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; return 0; } @@ -621,48 +764,165 @@ xfs_defer_cancel( struct xfs_mount *mp = tp->t_mountp; trace_xfs_defer_cancel(tp, _RET_IP_); + xfs_defer_trans_abort(tp, &tp->t_dfops); xfs_defer_cancel_list(mp, &tp->t_dfops); } +/* + * Return the last pending work item attached to this transaction if it matches + * the deferred op type. + */ +static inline struct xfs_defer_pending * +xfs_defer_find_last( + struct xfs_trans *tp, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp = NULL; + + /* No dfops at all? */ + if (list_empty(&tp->t_dfops)) + return NULL; + + dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, + dfp_list); + + /* Wrong type? */ + if (dfp->dfp_ops != ops) + return NULL; + return dfp; +} + +/* + * Decide if we can add a deferred work item to the last dfops item attached + * to the transaction. + */ +static inline bool +xfs_defer_can_append( + struct xfs_defer_pending *dfp, + const struct xfs_defer_op_type *ops) +{ + /* Already logged? */ + if (dfp->dfp_intent) + return false; + + /* Paused items cannot absorb more work */ + if (dfp->dfp_flags & XFS_DEFER_PAUSED) + return NULL; + + /* Already full? */ + if (ops->max_items && dfp->dfp_count >= ops->max_items) + return false; + + return true; +} + +/* Create a new pending item at the end of the transaction list. */ +static inline struct xfs_defer_pending * +xfs_defer_alloc( + struct xfs_trans *tp, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp; + + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_NOFS | __GFP_NOFAIL); + dfp->dfp_ops = ops; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, &tp->t_dfops); + + return dfp; +} + /* Add an item for later deferred processing. 
*/ -void +struct xfs_defer_pending * xfs_defer_add( struct xfs_trans *tp, - enum xfs_defer_ops_type type, - struct list_head *li) + struct list_head *li, + const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = NULL; - const struct xfs_defer_op_type *ops = defer_op_types[type]; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); - /* - * Add the item to a pending item at the end of the intake list. - * If the last pending item has the same type, reuse it. Else, - * create a new pending item at the end of the intake list. - */ - if (!list_empty(&tp->t_dfops)) { - dfp = list_last_entry(&tp->t_dfops, - struct xfs_defer_pending, dfp_list); - if (dfp->dfp_type != type || - (ops->max_items && dfp->dfp_count >= ops->max_items)) - dfp = NULL; - } - if (!dfp) { - dfp = kmem_cache_zalloc(xfs_defer_pending_cache, - GFP_NOFS | __GFP_NOFAIL); - dfp->dfp_type = type; - dfp->dfp_intent = NULL; - dfp->dfp_done = NULL; - dfp->dfp_count = 0; - INIT_LIST_HEAD(&dfp->dfp_work); - list_add_tail(&dfp->dfp_list, &tp->t_dfops); - } + dfp = xfs_defer_find_last(tp, ops); + if (!dfp || !xfs_defer_can_append(dfp, ops)) + dfp = xfs_defer_alloc(tp, ops); - list_add_tail(li, &dfp->dfp_work); + xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); - dfp->dfp_count++; + return dfp; +} + +/* + * Add a defer ops barrier to force two otherwise adjacent deferred work items + * to be tracked separately and have separate log items. + */ +void +xfs_defer_add_barrier( + struct xfs_trans *tp) +{ + struct xfs_defer_pending *dfp; + + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* If the last defer op added was a barrier, we're done. */ + dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type); + if (dfp) + return; + + xfs_defer_alloc(tp, &xfs_barrier_defer_type); + + trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); +} + +/* + * Create a pending deferred work item to replay the recovered intent item + * and add it to the list. + */ +void +xfs_defer_start_recovery( + struct xfs_log_item *lip, + struct list_head *r_dfops, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp; + + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_NOFS | __GFP_NOFAIL); + dfp->dfp_ops = ops; + dfp->dfp_intent = lip; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, r_dfops); +} + +/* + * Cancel a deferred work item created to recover a log intent item. @dfp + * will be freed after this function returns. + */ +void +xfs_defer_cancel_recovery( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + xfs_defer_pending_abort(mp, dfp); + xfs_defer_pending_cancel_work(mp, dfp); +} + +/* Replay the deferred work item created from a recovered log intent item. 
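xfs_defer_add_barrier() above queues a pending item of the no-op xfs_barrier_defer_type, so deferred work added before and after it can never be merged into one xfs_defer_pending and therefore never shares a log intent item. A hedged usage sketch follows; struct example_work and example_defer_type stand in for a real work item and its defer op type.

/*
 * Sketch only: force two deferred work items of the same type into separate
 * pending items (and hence separate log intents).  struct example_work is
 * assumed to embed a struct list_head named "list".
 */
static void
example_add_with_barrier(
	struct xfs_trans	*tp,
	struct example_work	*a,
	struct example_work	*b)
{
	xfs_defer_add(tp, &a->list, &example_defer_type);

	/* Nothing queued after this point can be appended to a's pending item. */
	xfs_defer_add_barrier(tp);

	xfs_defer_add(tp, &b->list, &example_defer_type);
}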
*/ +int +xfs_defer_finish_recovery( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + const struct xfs_defer_op_type *ops = dfp->dfp_ops; + int error; + + /* dfp is freed by recover_work and must not be accessed afterwards */ + error = ops->recover_work(dfp, capture_list); + if (error) + trace_xlog_intent_recovery_failed(mp, ops, error); + return error; } /* @@ -769,7 +1029,7 @@ xfs_defer_ops_capture_abort( { unsigned short i; - xfs_defer_pending_abort(mp, &dfc->dfc_dfops); + xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) @@ -938,3 +1198,36 @@ xfs_defer_destroy_item_caches(void) xfs_rmap_intent_destroy_cache(); xfs_defer_destroy_cache(); } + +/* + * Mark a deferred work item so that it will be requeued indefinitely without + * being finished. Caller must ensure there are no data dependencies on this + * work item in the meantime. + */ +void +xfs_defer_item_pause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); + + dfp->dfp_flags |= XFS_DEFER_PAUSED; + + trace_xfs_defer_item_pause(tp->t_mountp, dfp); +} + +/* + * Release a paused deferred work item so that it will be finished during the + * next transaction roll. + */ +void +xfs_defer_item_unpause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); + + dfp->dfp_flags &= ~XFS_DEFER_PAUSED; + + trace_xfs_defer_item_unpause(tp->t_mountp, dfp); +} diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 8788ad5f6a73..18a9fb92dde8 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -11,19 +11,6 @@ struct xfs_defer_op_type; struct xfs_defer_capture; /* - * Header for deferred operation list. - */ -enum xfs_defer_ops_type { - XFS_DEFER_OPS_TYPE_BMAP, - XFS_DEFER_OPS_TYPE_REFCOUNT, - XFS_DEFER_OPS_TYPE_RMAP, - XFS_DEFER_OPS_TYPE_FREE, - XFS_DEFER_OPS_TYPE_AGFL_FREE, - XFS_DEFER_OPS_TYPE_ATTR, - XFS_DEFER_OPS_TYPE_MAX, -}; - -/* * Save a log intent item and a list of extents, so that we can replay * whatever action had to happen to the extent list and file the log done * item. @@ -33,19 +20,35 @@ struct xfs_defer_pending { struct list_head dfp_work; /* work items */ struct xfs_log_item *dfp_intent; /* log intent item */ struct xfs_log_item *dfp_done; /* log done item */ + const struct xfs_defer_op_type *dfp_ops; unsigned int dfp_count; /* # extent items */ - enum xfs_defer_ops_type dfp_type; + unsigned int dfp_flags; }; -void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type, - struct list_head *h); +/* + * Create a log intent item for this deferred item, but don't actually finish + * the work. Caller must clear this before the final transaction commit. 
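A paused pending item keeps being relogged across transaction rolls but is skipped by xfs_defer_finish() until the caller clears XFS_DEFER_PAUSED. The hedged sketch below shows the intended lifecycle; the work item and defer type are placeholders and error handling is trimmed.

/*
 * Sketch only: queue a work item but keep it pinned across intermediate
 * rolls, then release it so the final xfs_defer_finish() processes it.
 * struct example_work and example_defer_type are placeholders.
 */
static int
example_paused_update(
	struct xfs_trans	**tpp,
	struct example_work	*work)
{
	struct xfs_defer_pending	*dfp;

	dfp = xfs_defer_add(*tpp, &work->list, &example_defer_type);
	xfs_defer_item_pause(*tpp, dfp);

	/*
	 * ...stage whatever must complete before this work may run; any
	 * transaction rolls in here only relog the paused item's intent...
	 */

	xfs_defer_item_unpause(*tpp, dfp);
	return xfs_defer_finish(tpp);
}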
+ */ +#define XFS_DEFER_PAUSED (1U << 0) + +#define XFS_DEFER_PENDING_STRINGS \ + { XFS_DEFER_PAUSED, "paused" } + +void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); +void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); + +struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, struct list_head *h, + const struct xfs_defer_op_type *ops); int xfs_defer_finish_noroll(struct xfs_trans **tp); int xfs_defer_finish(struct xfs_trans **tp); +int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp); void xfs_defer_cancel(struct xfs_trans *); void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. */ struct xfs_defer_op_type { + const char *name; + unsigned int max_items; struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort); void (*abort_intent)(struct xfs_log_item *intent); @@ -56,7 +59,11 @@ struct xfs_defer_op_type { void (*finish_cleanup)(struct xfs_trans *tp, struct xfs_btree_cur *state, int error); void (*cancel_item)(struct list_head *item); - unsigned int max_items; + int (*recover_work)(struct xfs_defer_pending *dfp, + struct list_head *capture_list); + struct xfs_log_item *(*relog_intent)(struct xfs_trans *tp, + struct xfs_log_item *intent, + struct xfs_log_item *done_item); }; extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; @@ -125,7 +132,25 @@ void xfs_defer_ops_capture_abort(struct xfs_mount *mp, struct xfs_defer_capture *d); void xfs_defer_resources_rele(struct xfs_defer_resources *dres); +void xfs_defer_start_recovery(struct xfs_log_item *lip, + struct list_head *r_dfops, const struct xfs_defer_op_type *ops); +void xfs_defer_cancel_recovery(struct xfs_mount *mp, + struct xfs_defer_pending *dfp); +int xfs_defer_finish_recovery(struct xfs_mount *mp, + struct xfs_defer_pending *dfp, struct list_head *capture_list); + +static inline void +xfs_defer_add_item( + struct xfs_defer_pending *dfp, + struct list_head *work) +{ + list_add_tail(work, &dfp->dfp_work); + dfp->dfp_count++; +} + int __init xfs_defer_init_item_caches(void); void xfs_defer_destroy_item_caches(void); +void xfs_defer_add_barrier(struct xfs_trans *tp); + #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index f5462fd582d5..a76673281514 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -196,7 +196,7 @@ xfs_dir_isempty( return 1; if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) return 0; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; return !sfp->count; } diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 00f960a703b2..3c256d4cc40b 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1089,7 +1089,7 @@ xfs_dir2_sf_to_block( int newoffset; /* offset from current entry */ unsigned int offset = geo->data_entry_offset; xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ - xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */ + struct xfs_dir2_sf_hdr *oldsfp = ifp->if_data; xfs_dir2_sf_hdr_t *sfp; /* shortform header */ __be16 *tagp; /* end of data entry */ struct xfs_name name; @@ -1099,10 +1099,8 @@ xfs_dir2_sf_to_block( ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); - oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; - ASSERT(ifp->if_bytes == dp->i_disk_size); - ASSERT(ifp->if_u1.if_data != 
NULL); + ASSERT(oldsfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); ASSERT(dp->i_df.if_nextents == 0); diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 7404a9ff1a92..1db2e60ba827 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -175,7 +175,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); -extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, int64_t size); int xfs_dir2_sf_entsize(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, int len); void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 8cd37e6e9d38..e1f83fc7b6ad 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -364,25 +364,23 @@ int /* error */ xfs_dir2_sf_addname( xfs_da_args_t *args) /* operation arguments */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int error; /* error return value */ int incr_isize; /* total change in size */ int new_isize; /* size after adding name */ int objchange; /* changing to 8-byte inodes */ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ int pick; /* which algorithm to use */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ trace_xfs_dir2_sf_addname(args); ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT); - dp = args->dp; ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Compute entry (and change in) size. @@ -462,20 +460,17 @@ xfs_dir2_sf_addname_easy( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - int byteoff; /* byte offset in sf dir */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; + int byteoff = (int)((char *)sfep - (char *)sfp); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - byteoff = (int)((char *)sfep - (char *)sfp); /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen), + sfp = xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen), XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); /* * Fill in the new entry. @@ -528,11 +523,10 @@ xfs_dir2_sf_addname_hard( /* * Copy the old directory to the stack buffer. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_disk_size; buf = kmem_alloc(old_isize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)buf; - memcpy(oldsfp, sfp, old_isize); + memcpy(oldsfp, dp->i_df.if_data, old_isize); /* * Loop over the old directory finding the place we're going * to insert the new entry. @@ -556,11 +550,8 @@ xfs_dir2_sf_addname_hard( * the data. 
*/ xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK); - xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); - /* - * Reset the pointer since the buffer was reallocated. - */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); + /* * Copy the first part of the directory, including the header. */ @@ -610,11 +601,10 @@ xfs_dir2_sf_addname_pick( int i; /* entry number */ xfs_dir2_data_aoff_t offset; /* data block offset */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int size; /* entry's data size */ int used; /* data bytes used */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = xfs_dir2_data_entsize(mp, args->namelen); offset = args->geo->data_first_offset; sfep = xfs_dir2_sf_firstentry(sfp); @@ -673,14 +663,13 @@ xfs_dir2_sf_check( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry number */ int i8count; /* number of big inode#s */ xfs_ino_t ino; /* entry inode number */ int offset; /* data offset */ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; offset = args->geo->data_first_offset; ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; @@ -707,11 +696,10 @@ xfs_dir2_sf_check( /* Verify the consistency of an inline directory. */ xfs_failaddr_t xfs_dir2_sf_verify( - struct xfs_inode *ip) + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, + int64_t size) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; @@ -719,15 +707,9 @@ xfs_dir2_sf_verify( int i; int i8count; int offset; - int64_t size; int error; uint8_t filetype; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - - sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; - size = ifp->if_bytes; - /* * Give up if the directory is way too short. */ @@ -834,15 +816,13 @@ xfs_dir2_sf_create( ASSERT(dp->i_df.if_bytes == 0); i8count = pino > XFS_DIR2_MAX_SHORT_INUM; size = xfs_dir2_sf_hdr_size(i8count); + /* - * Make a buffer for the data. - */ - xfs_idata_realloc(dp, size, XFS_DATA_FORK); - /* - * Fill in the header, + * Make a buffer for the data and fill in the header. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = xfs_idata_realloc(dp, size, XFS_DATA_FORK); sfp->i8count = i8count; + /* * Now can put in the inode number, since i8count is set. */ @@ -864,9 +844,9 @@ xfs_dir2_sf_lookup( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry index */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ enum xfs_dacmp cmp; /* comparison result */ xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ @@ -877,8 +857,7 @@ xfs_dir2_sf_lookup( ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Special case for . 
@@ -940,13 +919,13 @@ xfs_dir2_sf_removename( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int byteoff; /* offset of removed entry */ int entsize; /* this entry's size */ int i; /* shortform entry index */ int newsize; /* new inode size */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ trace_xfs_dir2_sf_removename(args); @@ -954,8 +933,7 @@ xfs_dir2_sf_removename( oldsize = (int)dp->i_disk_size; ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == oldsize); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Loop over the old directory entries. @@ -992,11 +970,12 @@ xfs_dir2_sf_removename( */ sfp->count--; dp->i_disk_size = newsize; + /* * Reallocate, making it smaller. */ - xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); + /* * Are we changing inode number size? */ @@ -1019,13 +998,12 @@ xfs_dir2_sf_replace_needblock( struct xfs_inode *dp, xfs_ino_t inum) { + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int newsize; - struct xfs_dir2_sf_hdr *sfp; if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL) return false; - sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; return inum > XFS_DIR2_MAX_SHORT_INUM && @@ -1041,19 +1019,18 @@ xfs_dir2_sf_replace( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry index */ xfs_ino_t ino=0; /* entry old inode number */ int i8elevated; /* sf_toino8 set i8count=1 */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ trace_xfs_dir2_sf_replace(args); ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* @@ -1076,7 +1053,7 @@ xfs_dir2_sf_replace( */ xfs_dir2_sf_toino8(args); i8elevated = 1; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; } else i8elevated = 0; @@ -1157,11 +1134,11 @@ xfs_dir2_sf_toino4( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data; char *buf; /* old dir's buffer */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ @@ -1175,7 +1152,6 @@ xfs_dir2_sf_toino4( */ oldsize = dp->i_df.if_bytes; buf = kmem_alloc(oldsize, 0); - oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 1); memcpy(buf, oldsfp, oldsize); /* @@ -1188,7 +1164,7 @@ xfs_dir2_sf_toino4( * Reset our pointers, the data has moved. 
*/ oldsfp = (xfs_dir2_sf_hdr_t *)buf; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; /* * Fill in the new header. */ @@ -1230,11 +1206,11 @@ xfs_dir2_sf_toino8( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data; char *buf; /* old dir's buffer */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ @@ -1248,7 +1224,6 @@ xfs_dir2_sf_toino8( */ oldsize = dp->i_df.if_bytes; buf = kmem_alloc(oldsize, 0); - oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); /* @@ -1261,7 +1236,7 @@ xfs_dir2_sf_toino8( * Reset our pointers, the data has moved. */ oldsfp = (xfs_dir2_sf_hdr_t *)buf; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; /* * Fill in the new header. */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9a88aba1589f..382ab1e71c0b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1008,7 +1008,7 @@ enum xfs_dinode_fmt { * Return pointers to the data or attribute forks. */ #define XFS_DFORK_DPTR(dip) \ - ((char *)dip + xfs_dinode_size(dip->di_version)) + ((void *)dip + xfs_dinode_size(dip->di_version)) #define XFS_DFORK_APTR(dip) \ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ @@ -1156,20 +1156,6 @@ static inline bool xfs_dinode_has_large_extent_counts( #define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ #define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ -#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) -#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) - -/* - * RT bit manipulation macros. - */ -#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) -#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) - -#define XFS_RTLOBIT(w) xfs_lowbit32(w) -#define XFS_RTHIBIT(w) xfs_highbit32(w) - -#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) - /* * Dquot and dquot block format definitions */ @@ -1272,6 +1258,9 @@ static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds) #define XFS_DQ_GRACE_MIN ((int64_t)0) #define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX) +/* Maximum id value for a quota record */ +#define XFS_DQ_ID_MAX (U32_MAX) + /* * This is the main portion of the on-disk representation of quota information * for a user. We pad this with some more expansion room to construct the on diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 99e796256c5d..6296993ff8f3 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -68,6 +68,11 @@ struct xfs_fsop_geom; #define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */ #define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */ +#define XFS_SICK_INO_BMBTD_ZAPPED (1 << 8) /* data fork erased */ +#define XFS_SICK_INO_BMBTA_ZAPPED (1 << 9) /* attr fork erased */ +#define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */ +#define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */ + /* Primary evidence of health problems in a given group. 
*/ #define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ XFS_SICK_FS_UQUOTA | \ @@ -97,6 +102,11 @@ struct xfs_fsop_geom; XFS_SICK_INO_SYMLINK | \ XFS_SICK_INO_PARENT) +#define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \ + XFS_SICK_INO_BMBTA_ZAPPED | \ + XFS_SICK_INO_DIR_ZAPPED | \ + XFS_SICK_INO_SYMLINK_ZAPPED) + /* These functions must be provided by the xfs implementation. */ void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask); diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b83e54c70906..2361a22035b0 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -95,18 +95,28 @@ xfs_inobt_btrec_to_irec( irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } +/* Compute the freecount of an incore inode record. */ +uint8_t +xfs_inobt_rec_freecount( + const struct xfs_inobt_rec_incore *irec) +{ + uint64_t realfree = irec->ir_free; + + if (xfs_inobt_issparse(irec->ir_holemask)) + realfree &= xfs_inobt_irec_to_allocmask(irec); + return hweight64(realfree); +} + /* Simple checks for inode records. */ xfs_failaddr_t xfs_inobt_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec) { - uint64_t realfree; - /* Record has to be properly aligned within the AG. */ - if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) + if (!xfs_verify_agino(pag, irec->ir_startino)) return __this_address; - if (!xfs_verify_agino(cur->bc_ag.pag, + if (!xfs_verify_agino(pag, irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) return __this_address; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || @@ -115,12 +125,7 @@ xfs_inobt_check_irec( if (irec->ir_freecount > XFS_INODES_PER_CHUNK) return __this_address; - /* if there are no holes, return the first available offset */ - if (!xfs_inobt_issparse(irec->ir_holemask)) - realfree = irec->ir_free; - else - realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec); - if (hweight64(realfree) != irec->ir_freecount) + if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount) return __this_address; return NULL; @@ -164,7 +169,7 @@ xfs_inobt_get_rec( return error; xfs_inobt_btrec_to_irec(mp, rec, irec); - fa = xfs_inobt_check_irec(cur, irec); + fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); @@ -1854,7 +1859,7 @@ xfs_difree_inode_chunk( return xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1900,7 +1905,8 @@ xfs_difree_inode_chunk( ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); error = xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE); + &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, + false); if (error) return error; @@ -2739,7 +2745,7 @@ xfs_ialloc_count_inodes_rec( xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); - fa = xfs_inobt_check_irec(cur, &irec); + fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index fe824bb04a09..f1412183bb44 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -79,6 +79,7 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, */ int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +uint8_t xfs_inobt_rec_freecount(const struct 
xfs_inobt_rec_incore *irec); /* * Inode chunk initialisation routine @@ -93,7 +94,7 @@ union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); -xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_inobt_check_irec(struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec); int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9258f01c0015..42a5e1f227a0 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -161,7 +161,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_INOBT, resv); + &XFS_RMAP_OINFO_INOBT, resv, false); } STATIC int diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 773cf4349428..f4e6b200cdf8 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -158,7 +158,7 @@ static void * xfs_iext_find_first_leaf( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height; if (!ifp->if_height) @@ -176,7 +176,7 @@ static void * xfs_iext_find_last_leaf( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; if (!ifp->if_height) @@ -306,7 +306,7 @@ xfs_iext_find_level( xfs_fileoff_t offset, int level) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; if (!ifp->if_height) @@ -402,12 +402,12 @@ xfs_iext_grow( int i; if (ifp->if_height == 1) { - struct xfs_iext_leaf *prev = ifp->if_u1.if_root; + struct xfs_iext_leaf *prev = ifp->if_data; node->keys[0] = xfs_iext_leaf_key(prev, 0); node->ptrs[0] = prev; } else { - struct xfs_iext_node *prev = ifp->if_u1.if_root; + struct xfs_iext_node *prev = ifp->if_data; ASSERT(ifp->if_height > 1); @@ -418,7 +418,7 @@ xfs_iext_grow( for (i = 1; i < KEYS_PER_NODE; i++) node->keys[i] = XFS_IEXT_KEY_INVALID; - ifp->if_u1.if_root = node; + ifp->if_data = node; ifp->if_height++; } @@ -430,7 +430,7 @@ xfs_iext_update_node( int level, void *ptr) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; for (height = ifp->if_height; height > level; height--) { @@ -583,11 +583,11 @@ xfs_iext_alloc_root( { ASSERT(ifp->if_bytes == 0); - ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); + ifp->if_data = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); ifp->if_height = 1; /* now that we have a node step into it */ - cur->leaf = ifp->if_u1.if_root; + cur->leaf = ifp->if_data; cur->pos = 0; } @@ -603,9 +603,9 @@ xfs_iext_realloc_root( if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) new_size = NODE_SIZE; - new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL); + new = krealloc(ifp->if_data, new_size, GFP_NOFS | __GFP_NOFAIL); memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); - ifp->if_u1.if_root = new; + ifp->if_data = new; cur->leaf = new; } @@ -622,13 +622,11 @@ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp) } void -xfs_iext_insert( - struct xfs_inode *ip, +xfs_iext_insert_raw( + struct xfs_ifork *ifp, struct xfs_iext_cursor *cur, - struct xfs_bmbt_irec *irec, 
- int state) + struct xfs_bmbt_irec *irec) { - struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); xfs_fileoff_t offset = irec->br_startoff; struct xfs_iext_leaf *new = NULL; int nr_entries, i; @@ -662,12 +660,23 @@ xfs_iext_insert( xfs_iext_set(cur_rec(cur), irec); ifp->if_bytes += sizeof(struct xfs_iext_rec); - trace_xfs_iext_insert(ip, cur, state, _RET_IP_); - if (new) xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2); } +void +xfs_iext_insert( + struct xfs_inode *ip, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec, + int state) +{ + struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); + + xfs_iext_insert_raw(ifp, cur, irec); + trace_xfs_iext_insert(ip, cur, state, _RET_IP_); +} + static struct xfs_iext_node * xfs_iext_rebalance_node( struct xfs_iext_node *parent, @@ -777,8 +786,8 @@ again: * If we are at the root and only one entry is left we can just * free this node and update the root pointer. */ - ASSERT(node == ifp->if_u1.if_root); - ifp->if_u1.if_root = node->ptrs[0]; + ASSERT(node == ifp->if_data); + ifp->if_data = node->ptrs[0]; ifp->if_height--; kmem_free(node); } @@ -854,8 +863,8 @@ xfs_iext_free_last_leaf( struct xfs_ifork *ifp) { ifp->if_height--; - kmem_free(ifp->if_u1.if_root); - ifp->if_u1.if_root = NULL; + kmem_free(ifp->if_data); + ifp->if_data = NULL; } void @@ -872,7 +881,7 @@ xfs_iext_remove( trace_xfs_iext_remove(ip, cur, state, _RET_IP_); ASSERT(ifp->if_height > 0); - ASSERT(ifp->if_u1.if_root != NULL); + ASSERT(ifp->if_data != NULL); ASSERT(xfs_iext_valid(ifp, cur)); xfs_iext_inc_seq(ifp); @@ -1042,9 +1051,9 @@ void xfs_iext_destroy( struct xfs_ifork *ifp) { - xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height); + xfs_iext_destroy_node(ifp->if_data, ifp->if_height); ifp->if_bytes = 0; ifp->if_height = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 5a2e7ddfa76d..f4569e18a8d0 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -50,12 +50,15 @@ xfs_init_local_fork( mem_size++; if (size) { - ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS); - memcpy(ifp->if_u1.if_data, data, size); + char *new_data = kmem_alloc(mem_size, KM_NOFS); + + memcpy(new_data, data, size); if (zero_terminate) - ifp->if_u1.if_data[size] = '\0'; + new_data[size] = '\0'; + + ifp->if_data = new_data; } else { - ifp->if_u1.if_data = NULL; + ifp->if_data = NULL; } ifp->if_bytes = size; @@ -125,7 +128,7 @@ xfs_iformat_extents( } ifp->if_bytes = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; if (size) { dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); @@ -212,7 +215,7 @@ xfs_iformat_btree( ifp->if_broot, size); ifp->if_bytes = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; return 0; } @@ -276,10 +279,9 @@ static uint16_t xfs_dfork_attr_shortform_size( struct xfs_dinode *dip) { - struct xfs_attr_shortform *atp = - (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip); + struct xfs_attr_sf_hdr *sf = XFS_DFORK_APTR(dip); - return be16_to_cpu(atp->hdr.totsize); + return be16_to_cpu(sf->totsize); } void @@ -493,7 +495,7 @@ xfs_iroot_realloc( * byte_diff -- the change in the number of bytes, positive or negative, * requested for the if_data array. 
*/ -void +void * xfs_idata_realloc( struct xfs_inode *ip, int64_t byte_diff, @@ -505,21 +507,18 @@ xfs_idata_realloc( ASSERT(new_size >= 0); ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork)); - if (byte_diff == 0) - return; - - if (new_size == 0) { - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; - ifp->if_bytes = 0; - return; + if (byte_diff) { + ifp->if_data = krealloc(ifp->if_data, new_size, + GFP_NOFS | __GFP_NOFAIL); + if (new_size == 0) + ifp->if_data = NULL; + ifp->if_bytes = new_size; } - ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size, - GFP_NOFS | __GFP_NOFAIL); - ifp->if_bytes = new_size; + return ifp->if_data; } +/* Free all memory and reset a fork back to its initial state. */ void xfs_idestroy_fork( struct xfs_ifork *ifp) @@ -531,8 +530,8 @@ xfs_idestroy_fork( switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; + kmem_free(ifp->if_data); + ifp->if_data = NULL; break; case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -625,9 +624,9 @@ xfs_iflush_fork( case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(ifp->if_data != NULL); ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork)); - memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(cp, ifp->if_data, ifp->if_bytes); } break; @@ -702,19 +701,27 @@ xfs_ifork_verify_local_data( xfs_failaddr_t fa = NULL; switch (VFS_I(ip)->i_mode & S_IFMT) { - case S_IFDIR: - fa = xfs_dir2_sf_verify(ip); + case S_IFDIR: { + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_dir2_sf_hdr *sfp = ifp->if_data; + + fa = xfs_dir2_sf_verify(mp, sfp, ifp->if_bytes); break; - case S_IFLNK: - fa = xfs_symlink_shortform_verify(ip); + } + case S_IFLNK: { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + + fa = xfs_symlink_shortform_verify(ifp->if_data, ifp->if_bytes); break; + } default: break; } if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", - ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa); + ip->i_df.if_data, ip->i_df.if_bytes, fa); return -EFSCORRUPTED; } @@ -729,14 +736,17 @@ xfs_ifork_verify_local_attr( struct xfs_ifork *ifp = &ip->i_af; xfs_failaddr_t fa; - if (!xfs_inode_has_attr_fork(ip)) + if (!xfs_inode_has_attr_fork(ip)) { fa = __this_address; - else - fa = xfs_attr_shortform_verify(ip); + } else { + struct xfs_ifork *ifp = &ip->i_af; + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + fa = xfs_attr_shortform_verify(ifp->if_data, ifp->if_bytes); + } if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp->if_u1.if_data, ifp->if_bytes, fa); + ifp->if_data, ifp->if_bytes, fa); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 96d307784c85..96303249d28a 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -13,14 +13,12 @@ struct xfs_dinode; * File incore extent information, present for each of data & attr forks. 
*/ struct xfs_ifork { - int64_t if_bytes; /* bytes in if_u1 */ + int64_t if_bytes; /* bytes in if_data */ struct xfs_btree_block *if_broot; /* file's incore btree root */ unsigned int if_seq; /* fork mod counter */ int if_height; /* height of the extent tree */ - union { - void *if_root; /* extent tree root */ - char *if_data; /* inline file data */ - } if_u1; + void *if_data; /* extent tree root or + inline data */ xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ @@ -170,7 +168,7 @@ int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_ifork *ifp); -void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, +void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); void xfs_iroot_realloc(struct xfs_inode *, int, int); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); @@ -180,6 +178,9 @@ void xfs_init_local_fork(struct xfs_inode *ip, int whichfork, const void *data, int64_t size); xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp); +void xfs_iext_insert_raw(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec); void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *, int); void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *, diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index a5100a11faf9..9fe7a9564bca 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -11,6 +11,7 @@ * define how recovery should work for that type of log item. */ struct xlog_recover_item; +struct xfs_defer_op_type; /* Sorting hat for log items as they're read in. 
*/ enum xlog_recover_reorder { @@ -153,4 +154,11 @@ xlog_recover_resv(const struct xfs_trans_res *r) return ret; } +struct xfs_defer_pending; + +void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, + xfs_lsn_t lsn, const struct xfs_defer_op_type *ops); +int xlog_recover_finish_intent(struct xfs_trans *tp, + struct xfs_defer_pending *dfp); + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 21a7e350b4c5..81885a6a028e 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -7,16 +7,16 @@ #define __XFS_ONDISK_H #define XFS_CHECK_STRUCT_SIZE(structname, size) \ - BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ - #structname ") is wrong, expected " #size) + static_assert(sizeof(structname) == (size), \ + "XFS: sizeof(" #structname ") is wrong, expected " #size) #define XFS_CHECK_OFFSET(structname, member, off) \ - BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \ + static_assert(offsetof(structname, member) == (off), \ "XFS: offsetof(" #structname ", " #member ") is wrong, " \ "expected " #off) #define XFS_CHECK_VALUE(value, expected) \ - BUILD_BUG_ON_MSG((value) != (expected), \ + static_assert((value) == (expected), \ "XFS: value of " #value " is wrong, expected " #expected) static inline void __init @@ -93,13 +93,13 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); - XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6); - XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4); + XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0); + XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, namelen, 0); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2); + XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 646b3fa362ad..6709a7f8bad5 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -123,11 +123,9 @@ xfs_refcount_btrec_to_irec( /* Simple checks for refcount records. 
*/ xfs_failaddr_t xfs_refcount_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_refcount_irec *irec) { - struct xfs_perag *pag = cur->bc_ag.pag; - if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) return __this_address; @@ -179,7 +177,7 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - fa = xfs_refcount_check_irec(cur, irec); + fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); @@ -1153,7 +1151,7 @@ xfs_refcount_adjust_extents( tmp.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1215,7 +1213,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1458,7 +1456,7 @@ __xfs_refcount_add( ri->ri_blockcount = blockcount; xfs_refcount_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); + xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); } /* @@ -1899,7 +1897,7 @@ xfs_refcount_recover_extent( INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL || + if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { kfree(rr); @@ -1985,7 +1983,7 @@ xfs_refcount_recover_cow_leftovers( /* Free the block. */ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_trans; @@ -2033,6 +2031,47 @@ xfs_refcount_has_records( return xfs_btree_has_records(cur, &low, &high, NULL, outcome); } +struct xfs_refcount_query_range_info { + xfs_refcount_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_refcount_query_range_helper( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_refcount_query_range_info *query = priv; + struct xfs_refcount_irec irec; + xfs_failaddr_t fa; + + xfs_refcount_btrec_to_irec(rec, &irec); + fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec); + if (fa) + return xfs_refcount_complain_bad_rec(cur, fa, &irec); + + return query->fn(cur, &irec, query->priv); +} + +/* Find all refcount records between two keys. 
*/ +int +xfs_refcount_query_range( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *low_rec, + const struct xfs_refcount_irec *high_rec, + xfs_refcount_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_brec = { .rc = *low_rec }; + union xfs_btree_irec high_brec = { .rc = *high_rec }; + struct xfs_refcount_query_range_info query = { .priv = priv, .fn = fn }; + + return xfs_btree_query_range(cur, &low_brec, &high_brec, + xfs_refcount_query_range_helper, &query); +} + int __init xfs_refcount_intent_init_cache(void) { diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 783cd89ca195..9b56768a590c 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -117,7 +117,7 @@ extern int xfs_refcount_has_records(struct xfs_btree_cur *cur, union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); -xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag, const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); @@ -127,4 +127,14 @@ extern struct kmem_cache *xfs_refcount_intent_cache; int __init xfs_refcount_intent_init_cache(void); void xfs_refcount_intent_destroy_cache(void); +typedef int (*xfs_refcount_query_range_fn)( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv); + +int xfs_refcount_query_range(struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *low_rec, + const struct xfs_refcount_irec *high_rec, + xfs_refcount_query_range_fn fn, void *priv); + #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 5c3987d8dc24..0d80bd99147c 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -112,7 +112,7 @@ xfs_refcountbt_free_block( be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false); } STATIC int @@ -226,7 +226,18 @@ xfs_refcountbt_verify( level = be16_to_cpu(block->bb_level); if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_refcount_level) + unsigned int maxlevel = pag->pagf_refcount_level; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the refcount btree, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_refcount_level); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_refc_maxlevels) return __this_address; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index fbb0b2637463..76bf7f48cb5a 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2567,7 +2567,7 @@ __xfs_rmap_add( ri->ri_bmap = *bmap; xfs_rmap_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); + xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); } /* Map an extent into a file. 
*/ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index c269d704314d..31100120b2c5 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -184,7 +184,7 @@ xfs_rtfind_back( * Calculate first (leftmost) bit number to look at, * and mask for all the relevant bits in this word. */ - firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + firstbit = max_t(xfs_srtblock_t, bit - len + 1, 0); mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << firstbit; /* @@ -195,7 +195,7 @@ xfs_rtfind_back( /* * Different. Mark where we are and return. */ - i = bit - XFS_RTHIBIT(wdiff); + i = bit - xfs_highbit32(wdiff); *rtx = start - i + 1; return 0; } @@ -233,7 +233,7 @@ xfs_rtfind_back( /* * Different, mark where we are and return. */ - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + i += XFS_NBWORD - 1 - xfs_highbit32(wdiff); *rtx = start - i + 1; return 0; } @@ -272,7 +272,7 @@ xfs_rtfind_back( /* * Different, mark where we are and return. */ - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + i += XFS_NBWORD - 1 - xfs_highbit32(wdiff); *rtx = start - i + 1; return 0; } else @@ -338,7 +338,7 @@ xfs_rtfind_forw( * Calculate last (rightmost) bit number to look at, * and mask for all the relevant bits in this word. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; /* * Calculate the difference between the value there @@ -348,7 +348,7 @@ xfs_rtfind_forw( /* * Different. Mark where we are and return. */ - i = XFS_RTLOBIT(wdiff) - bit; + i = xfs_lowbit32(wdiff) - bit; *rtx = start + i - 1; return 0; } @@ -386,7 +386,7 @@ xfs_rtfind_forw( /* * Different, mark where we are and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *rtx = start + i - 1; return 0; } @@ -423,7 +423,7 @@ xfs_rtfind_forw( /* * Different, mark where we are and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *rtx = start + i - 1; return 0; } else @@ -452,71 +452,59 @@ xfs_trans_log_rtsummary( } /* - * Read and/or modify the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - * - * Summary information is returned in *sum if specified. - * If no delta is specified, returns summary only. + * Modify the summary information for a given extent size, bitmap block + * combination. */ int -xfs_rtmodify_summary_int( +xfs_rtmodify_summary( struct xfs_rtalloc_args *args, int log, /* log2 of extent size */ xfs_fileoff_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - xfs_suminfo_t *sum) /* out: summary info for this block */ + int delta) /* in/out: summary block number */ { struct xfs_mount *mp = args->mp; - int error; - xfs_fileoff_t sb; /* summary fsblock */ - xfs_rtsumoff_t so; /* index into the summary file */ + xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); unsigned int infoword; + xfs_suminfo_t val; + int error; - /* - * Compute entry number in the summary file. - */ - so = xfs_rtsumoffs(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = xfs_rtsumoffs_to_block(mp, so); - - error = xfs_rtsummary_read_buf(args, sb); + error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so)); if (error) return error; - /* - * Point to the summary information, modify/log it, and/or copy it out. 
- */ infoword = xfs_rtsumoffs_to_infoword(mp, so); - if (delta) { - xfs_suminfo_t val = xfs_suminfo_add(args, infoword, delta); - - if (mp->m_rsum_cache) { - if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log; - if (val != 0 && log >= mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log + 1; - } - xfs_trans_log_rtsummary(args, infoword); - if (sum) - *sum = val; - } else if (sum) { - *sum = xfs_suminfo_get(args, infoword); + val = xfs_suminfo_add(args, infoword, delta); + + if (mp->m_rsum_cache) { + if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log; + if (val != 0 && log >= mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log + 1; } + + xfs_trans_log_rtsummary(args, infoword); return 0; } +/* + * Read and return the summary information for a given extent size, bitmap block + * combination. + */ int -xfs_rtmodify_summary( +xfs_rtget_summary( struct xfs_rtalloc_args *args, int log, /* log2 of extent size */ xfs_fileoff_t bbno, /* bitmap block number */ - int delta) /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ { - return xfs_rtmodify_summary_int(args, log, bbno, delta, NULL); + struct xfs_mount *mp = args->mp; + xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); + int error; + + error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so)); + if (!error) + *sum = xfs_suminfo_get(args, xfs_rtsumoffs_to_infoword(mp, so)); + return error; } /* Log rtbitmap block from the word @from to the byte before @next. */ @@ -585,7 +573,7 @@ xfs_rtmodify_range( /* * Compute first bit not changed and mask of relevant bits. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; /* * Set/clear the active bits. @@ -720,7 +708,7 @@ xfs_rtfree_range( */ if (preblock < start) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(start - preblock), + xfs_highbit64(start - preblock), xfs_rtx_to_rbmblock(mp, preblock), -1); if (error) { return error; @@ -732,7 +720,7 @@ xfs_rtfree_range( */ if (postblock > end) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock - end), + xfs_highbit64(postblock - end), xfs_rtx_to_rbmblock(mp, end + 1), -1); if (error) { return error; @@ -743,7 +731,7 @@ xfs_rtfree_range( * (new) free extent. */ return xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock + 1 - preblock), + xfs_highbit64(postblock + 1 - preblock), xfs_rtx_to_rbmblock(mp, preblock), 1); } @@ -799,7 +787,7 @@ xfs_rtcheck_range( /* * Compute first bit not examined. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); /* * Mask of relevant bits. */ @@ -812,7 +800,7 @@ xfs_rtcheck_range( /* * Different, compute first wrong bit and return. */ - i = XFS_RTLOBIT(wdiff) - bit; + i = xfs_lowbit32(wdiff) - bit; *new = start + i; *stat = 0; return 0; @@ -851,7 +839,7 @@ xfs_rtcheck_range( /* * Different, compute first wrong bit and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *new = start + i; *stat = 0; return 0; @@ -889,7 +877,7 @@ xfs_rtcheck_range( /* * Different, compute first wrong bit and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *new = start + i; *stat = 0; return 0; @@ -1131,6 +1119,20 @@ xfs_rtbitmap_blockcount( } /* + * Compute the maximum level number of the realtime summary file, as defined by + * mkfs. 
The historic use of highbit32 on a 64-bit quantity prohibited correct + * use of rt volumes with more than 2^32 extents. + */ +uint8_t +xfs_compute_rextslog( + xfs_rtbxlen_t rtextents) +{ + if (!rtextents) + return 0; + return xfs_highbit64(rtextents); +} + +/* * Compute the number of rtbitmap words needed to populate every block of a * bitmap that is large enough to track the given number of rt extents. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index c0637057d69c..274dc7dae1fa 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -321,8 +321,8 @@ int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len, int val); -int xfs_rtmodify_summary_int(struct xfs_rtalloc_args *args, int log, - xfs_fileoff_t bbno, int delta, xfs_suminfo_t *sum); +int xfs_rtget_summary(struct xfs_rtalloc_args *args, int log, + xfs_fileoff_t bbno, xfs_suminfo_t *sum); int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, xfs_fileoff_t bbno, int delta); int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, @@ -351,6 +351,20 @@ xfs_rtfree_extent( int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, xfs_filblks_t rtlen); +uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); + +/* Do we support an rt volume having this number of rtextents? */ +static inline bool +xfs_validate_rtextents( + xfs_rtbxlen_t rtextents) +{ + /* No runt rt volumes */ + if (rtextents == 0) + return false; + + return true; +} + xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, @@ -369,6 +383,8 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, # define xfs_rtsummary_read_buf(a,b) (-ENOSYS) # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) +# define xfs_compute_rextslog(rtx) (0) +# define xfs_validate_rtextents(rtx) (false) static inline xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 1f74d0cd1618..4a9e8588f4c9 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -25,6 +25,7 @@ #include "xfs_da_format.h" #include "xfs_health.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. 
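The xfs_rtbitmap.{c,h} hunks above add xfs_compute_rextslog() and xfs_validate_rtextents() so the rextslog value is derived from the full 64-bit extent count; the xfs_sb.c hunk that follows switches the superblock verifier over to these helpers. A rough illustration of the overflow the new helper avoids (values are hypothetical, not taken from the patch):

	xfs_rtbxlen_t	rtextents = 1ULL << 33;	/* an rt volume with more than 2^32 extents */

	/* Old check truncated the 64-bit count to 32 bits before taking the log. */
	int old_log = xfs_highbit32((uint32_t)rtextents);	/* sees 0, a bogus result */

	/* New helper operates on the full 64-bit quantity. */
	uint8_t new_log = xfs_compute_rextslog(rtextents);	/* returns 33 */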
@@ -508,8 +509,9 @@ xfs_validate_sb_common( rbmblocks = howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize); - if (sbp->sb_rextents != rexts || - sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + if (!xfs_validate_rtextents(rexts) || + sbp->sb_rextents != rexts || + sbp->sb_rextslog != xfs_compute_rextslog(rexts) || sbp->sb_rbmblocks != rbmblocks) { xfs_notice(mp, "realtime geometry sanity check failed"); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c4381388c0c1..4220d3584c1b 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -139,7 +139,7 @@ bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp); void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp); -xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); /* Computed inode geometry for the filesystem. */ struct xfs_ino_geometry { diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index bdc777b9ec4a..160aa20aa441 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -175,7 +175,7 @@ xfs_symlink_local_to_remote( if (!xfs_has_crc(mp)) { bp->b_ops = NULL; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(bp->b_addr, ifp->if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); return; } @@ -191,7 +191,7 @@ xfs_symlink_local_to_remote( buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); - memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(buf, ifp->if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1); } @@ -202,15 +202,11 @@ xfs_symlink_local_to_remote( */ xfs_failaddr_t xfs_symlink_shortform_verify( - struct xfs_inode *ip) + void *sfp, + int64_t size) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - char *sfp = (char *)ifp->if_u1.if_data; - int size = ifp->if_bytes; char *endp = sfp + size; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - /* * Zero length symlinks should never occur in memory as they are * never allowed to exist on disk. diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 533200c4ccc2..20b5375f2d9c 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -51,7 +51,6 @@ typedef void * xfs_failaddr_t; #define NULLRFSBLOCK ((xfs_rfsblock_t)-1) #define NULLRTBLOCK ((xfs_rtblock_t)-1) #define NULLFILEOFF ((xfs_fileoff_t)-1) -#define NULLRTEXTNO ((xfs_rtxnum_t)-1) #define NULLAGBLOCK ((xfs_agblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) @@ -208,6 +207,13 @@ enum xfs_ag_resv_type { XFS_AG_RESV_AGFL, XFS_AG_RESV_METADATA, XFS_AG_RESV_RMAPBT, + + /* + * Don't increase fdblocks when freeing extent. This is a pony for + * the bnobt repair functions to re-free the free space without + * altering fdblocks. If you think you need this you're wrong. + */ + XFS_AG_RESV_IGNORE, }; /* Results of scanning a btree keyspace to check occupancy. */ diff --git a/fs/xfs/scrub/agb_bitmap.c b/fs/xfs/scrub/agb_bitmap.c new file mode 100644 index 000000000000..573e4e062754 --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "bitmap.h" +#include "scrub/agb_bitmap.h" + +/* + * Record all btree blocks seen while iterating all records of a btree. + * + * We know that the btree query_all function starts at the left edge and walks + * towards the right edge of the tree. Therefore, we know that we can walk up + * the btree cursor towards the root; if the pointer for a given level points + * to the first record/key in that block, we haven't seen this block before; + * and therefore we need to remember that we saw this block in the btree. + * + * So if our btree is: + * + * 4 + * / | \ + * 1 2 3 + * + * Pretend for this example that each leaf block has 100 btree records. For + * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we + * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so + * we record block 4. The list is [1, 4]. + * + * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit + * the loop. The list remains [1, 4]. + * + * For the 101st btree record, we've moved onto leaf block 2. Now + * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that + * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. + * + * For the 102nd record, bc_levels[0].ptr == 2, so we continue. + * + * For the 201st record, we've moved on to leaf block 3. + * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. + * + * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + */ + +/* Mark a btree block to the agblock bitmap. */ +STATIC int +xagb_bitmap_visit_btblock( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +int +xagb_bitmap_set_btblocks( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, + XFS_BTREE_VISIT_ALL, bitmap); +} + +/* + * Record all the buffers pointed to by the btree cursor. Callers already + * engaged in a btree walk should call this function to capture the list of + * blocks going from the leaf towards the root. + */ +int +xagb_bitmap_set_btcur_path( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + int i; + int error; + + for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { + error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (error) + return error; + } + + return 0; +} diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h new file mode 100644 index 000000000000..ed08f76ff4f3 --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.h @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_AGB_BITMAP_H__ +#define __XFS_SCRUB_AGB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_agblock_t */ + +struct xagb_bitmap { + struct xbitmap32 agbitmap; +}; + +static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->agbitmap); +} + +static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_clear(&bitmap->agbitmap, start, len); +} +static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->agbitmap, start, len); +} + +static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t *len) +{ + return xbitmap32_test(&bitmap->agbitmap, start, len); +} + +static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, + struct xagb_bitmap *sub) +{ + return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap); +} + +static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) +{ + return xbitmap32_hweight(&bitmap->agbitmap); +} +static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) +{ + return xbitmap32_empty(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->agbitmap, fn, priv); +} + +int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); +int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); + +#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 876a2f41b063..26bd1ff68f1b 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -26,6 +26,7 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" #include "scrub/reap.h" /* Superblock */ @@ -72,7 +73,7 @@ xrep_superblock( /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); - return error; + return 0; } /* AGF */ @@ -341,7 +342,7 @@ xrep_agf_commit_new( pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGF. v5 filesystems only. */ @@ -494,12 +495,11 @@ xrep_agfl_walk_rmap( /* Strike out the blocks that are cross-linked according to the rmapbt. */ STATIC int xrep_agfl_check_extent( - uint64_t start, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xrep_agfl *ra = priv; - xfs_agblock_t agbno = start; xfs_agblock_t last_agbno = agbno + len - 1; int error; @@ -647,8 +647,8 @@ struct xrep_agfl_fill { /* Fill the AGFL with whatever blocks are in this extent. */ static int xrep_agfl_fill( - uint64_t start, - uint64_t len, + uint32_t start, + uint32_t len, void *priv) { struct xrep_agfl_fill *af = priv; @@ -789,6 +789,9 @@ xrep_agfl( /* Dump any AGFL overflow. 
*/ error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); + if (error) + goto err; + err: xagb_bitmap_destroy(&agfl_extents); return error; @@ -962,7 +965,7 @@ xrep_agi_commit_new( pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGI. */ diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 279af72b1671..d1b8a4997dd2 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -9,13 +9,16 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_btree.h" #include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_ag.h" +#include "scrub/repair.h" /* * Set us up to scrub free space btrees. @@ -24,10 +27,19 @@ int xchk_setup_ag_allocbt( struct xfs_scrub *sc) { + int error; + if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); - return xchk_setup_ag_btree(sc, false); + error = xchk_setup_ag_btree(sc, false); + if (error) + return error; + + if (xchk_could_repair(sc)) + return xrep_setup_ag_allocbt(sc); + + return 0; } /* Free space btree scrubber. */ @@ -127,7 +139,7 @@ xchk_allocbt_rec( struct xchk_alloc *ca = bs->private; xfs_alloc_btrec_to_irec(rec, &irec); - if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) { + if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -138,31 +150,27 @@ xchk_allocbt_rec( return 0; } -/* Scrub the freespace btrees for some AG. */ -STATIC int +/* Scrub one of the freespace btrees for some AG. */ +int xchk_allocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xchk_alloc ca = { }; struct xfs_btree_cur *cur; - cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur; - return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); -} - -int -xchk_bnobt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_BNO); -} + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_BNOBT: + cur = sc->sa.bno_cur; + break; + case XFS_SCRUB_TYPE_CNTBT: + cur = sc->sa.cnt_cur; + break; + default: + ASSERT(0); + return -EIO; + } -int -xchk_cntbt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_CNT); + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); } /* xref check that the extent is not free */ diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c new file mode 100644 index 000000000000..45edda096869 --- /dev/null +++ b/fs/xfs/scrub/alloc_repair.c @@ -0,0 +1,934 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_refcount.h" +#include "xfs_extent_busy.h" +#include "xfs_health.h" +#include "xfs_bmap.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Free Space Btree Repair + * ======================= + * + * The reverse mappings are supposed to record all space usage for the entire + * AG. Therefore, we can recreate the free extent records in an AG by looking + * for gaps in the physical extents recorded in the rmapbt. These records are + * staged in @free_records. Identifying the gaps is more difficult on a + * reflink filesystem because rmap records are allowed to overlap. + * + * Because the final step of building a new index is to free the space used by + * the old index, repair needs to find that space. Unfortunately, all + * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share + * the same rmapbt owner code (OWN_AG), so this is not straightforward. + * + * The scan of the reverse mapping information records the space used by OWN_AG + * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While + * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to + * record all visited rmap btree blocks and all blocks owned by the AGFL. + * + * After that is where the definitions of old_allocbt_blocks shifts. This + * expression identifies possible former bnobt/cntbt blocks: + * + * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks); + * + * Substituting from above definitions, that becomes: + * + * old_allocbt_blocks & ~not_allocbt_blocks + * + * The OWN_AG bitmap itself isn't needed after this point, so what we really do + * instead is: + * + * old_allocbt_blocks &= ~not_allocbt_blocks; + * + * After this point, @old_allocbt_blocks is a bitmap of alleged former + * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first + * parameter in place to avoid copying records around. + * + * Next, some of the space described by @free_records are diverted to the newbt + * reservation and used to format new btree blocks. The remaining records are + * written to the new btree indices. We reconstruct both bnobt and cntbt at + * the same time since we've already done all the work. + * + * We use the prefix 'xrep_abt' here because we regenerate both free space + * allocation btrees at the same time. + */ + +struct xrep_abt { + /* Blocks owned by the rmapbt or the agfl. */ + struct xagb_bitmap not_allocbt_blocks; + + /* All OWN_AG blocks. */ + struct xagb_bitmap old_allocbt_blocks; + + /* + * New bnobt information. All btree block reservations are added to + * the reservation list in new_bnobt. + */ + struct xrep_newbt new_bnobt; + + /* new cntbt information */ + struct xrep_newbt new_cntbt; + + /* Free space extents. 
*/ + struct xfarray *free_records; + + struct xfs_scrub *sc; + + /* Number of non-null records in @free_records. */ + uint64_t nr_real_records; + + /* get_records()'s position in the free space record array. */ + xfarray_idx_t array_cur; + + /* + * Next block we anticipate seeing in the rmap records. If the next + * rmap record is greater than next_agbno, we have found unused space. + */ + xfs_agblock_t next_agbno; + + /* Number of free blocks in this AG. */ + xfs_agblock_t nr_blocks; + + /* Longest free extent we found in the AG. */ + xfs_agblock_t longest; +}; + +/* Set up to repair AG free space btrees. */ +int +xrep_setup_ag_allocbt( + struct xfs_scrub *sc) +{ + unsigned int busy_gen; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. + */ + busy_gen = READ_ONCE(sc->sa.pag->pagb_gen); + if (xfs_extent_busy_list_empty(sc->sa.pag)) + return 0; + + return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0); +} + +/* Check for any obvious conflicts in the free extent. */ +STATIC int +xrep_abt_check_free_ext( + struct xfs_scrub *sc, + const struct xfs_alloc_rec_incore *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->ar_startblock, rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be shared or CoW staging. */ + if (sc->sa.refc_cur) { + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_SHARED, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_COW, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Stash a free space record for all the space since the last bno we found + * all the way up to @end. + */ +static int +xrep_abt_stash( + struct xrep_abt *ra, + xfs_agblock_t end) +{ + struct xfs_alloc_rec_incore arec = { + .ar_startblock = ra->next_agbno, + .ar_blockcount = end - ra->next_agbno, + }; + struct xfs_scrub *sc = ra->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xrep_abt_check_free_ext(ra->sc, &arec); + if (error) + return error; + + trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec); + + error = xfarray_append(ra->free_records, &arec); + if (error) + return error; + + ra->nr_blocks += arec.ar_blockcount; + return 0; +} + +/* Record extents that aren't in use from gaps in the rmap records. */ +STATIC int +xrep_abt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_abt *ra = priv; + int error; + + /* Record all the OWN_AG blocks... */ + if (rec->rm_owner == XFS_RMAP_OWN_AG) { + error = xagb_bitmap_set(&ra->old_allocbt_blocks, + rec->rm_startblock, rec->rm_blockcount); + if (error) + return error; + } + + /* ...and all the rmapbt blocks... */ + error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur); + if (error) + return error; + + /* ...and all the free space. 
*/ + if (rec->rm_startblock > ra->next_agbno) { + error = xrep_abt_stash(ra, rec->rm_startblock); + if (error) + return error; + } + + /* + * rmap records can overlap on reflink filesystems, so project + * next_agbno as far out into the AG space as we currently know about. + */ + ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* Collect an AGFL block for the not-to-release list. */ +static int +xrep_abt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1); +} + +/* + * Compare two free space extents by block number. We want to sort in order of + * increasing block number. + */ +static int +xrep_bnobt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_startblock > bp->ar_startblock) + return 1; + else if (ap->ar_startblock < bp->ar_startblock) + return -1; + return 0; +} + +/* + * Re-sort the free extents by block number so that we can put the records into + * the bnobt in the correct order. Make sure the records do not overlap in + * physical space. + */ +STATIC int +xrep_bnobt_sort_records( + struct xrep_abt *ra) +{ + struct xfs_alloc_rec_incore arec; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0); + if (error) + return error; + + while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) { + if (arec.ar_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = arec.ar_startblock + arec.ar_blockcount; + } + + return error; +} + +/* + * Compare two free space extents by length and then block number. We want + * to sort first in order of increasing length and then in order of increasing + * block number. + */ +static int +xrep_cntbt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_blockcount > bp->ar_blockcount) + return 1; + else if (ap->ar_blockcount < bp->ar_blockcount) + return -1; + return xrep_bnobt_extent_cmp(a, b); +} + +/* + * Sort the free extents by length so so that we can put the records into the + * cntbt in the correct order. Don't let userspace kill us if we're resorting + * after allocating btree blocks. + */ +STATIC int +xrep_cntbt_sort_records( + struct xrep_abt *ra, + bool is_resort) +{ + return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp, + is_resort ? 0 : XFARRAY_SORT_KILLABLE); +} + +/* + * Iterate all reverse mappings to find (1) the gaps between rmap records (all + * unowned space), (2) the OWN_AG extents (which encompass the free space + * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL + * blocks. The free space is (1) + (2) - (3) - (4). + */ +STATIC int +xrep_abt_find_freespace( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_buf *agfl_bp; + xfs_agblock_t agend; + int error; + + xagb_bitmap_init(&ra->not_allocbt_blocks); + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Iterate all the reverse mappings to find gaps in the physical + * mappings, all the OWN_AG blocks, and all the rmapbt extents. 
+ */ + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra); + if (error) + goto err; + + /* Insert a record for space between the last rmap and EOAG. */ + agend = be32_to_cpu(agf->agf_length); + if (ra->next_agbno < agend) { + error = xrep_abt_stash(ra, agend); + if (error) + goto err; + } + + /* Collect all the AGFL blocks. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + goto err; + + error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra); + if (error) + goto err_agfl; + + /* Compute the old bnobt/cntbt blocks. */ + error = xagb_bitmap_disunion(&ra->old_allocbt_blocks, + &ra->not_allocbt_blocks); + if (error) + goto err_agfl; + + ra->nr_real_records = xfarray_length(ra->free_records); +err_agfl: + xfs_trans_brelse(sc->tp, agfl_bp); +err: + xchk_ag_btcur_free(&sc->sa); + xagb_bitmap_destroy(&ra->not_allocbt_blocks); + return error; +} + +/* + * We're going to use the observed free space records to reserve blocks for the + * new free space btrees, so we play an iterative game where we try to converge + * on the number of blocks we need: + * + * 1. Estimate how many blocks we'll need to store the records. + * 2. If the first free record has more blocks than we need, we're done. + * We will have to re-sort the records prior to building the cntbt. + * 3. If that record has exactly the number of blocks we need, null out the + * record. We're done. + * 4. Otherwise, we still need more blocks. Null out the record, subtract its + * length from the number of blocks we need, and go back to step 1. + * + * Fortunately, we don't have to do any transaction work to play this game, so + * we don't have to tear down the staging cursors. + */ +STATIC int +xrep_abt_reserve_space( + struct xrep_abt *ra, + struct xfs_btree_cur *bno_cur, + struct xfs_btree_cur *cnt_cur, + bool *needs_resort) +{ + struct xfs_scrub *sc = ra->sc; + xfarray_idx_t record_nr; + unsigned int allocated = 0; + int error = 0; + + record_nr = xfarray_length(ra->free_records) - 1; + do { + struct xfs_alloc_rec_incore arec; + uint64_t required; + unsigned int desired; + unsigned int len; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(cnt_cur, + &ra->new_cntbt.bload, ra->nr_real_records); + if (error) + break; + + error = xfs_btree_bload_compute_geometry(bno_cur, + &ra->new_bnobt.bload, ra->nr_real_records); + if (error) + break; + + /* How many btree blocks do we need to store all records? */ + required = ra->new_bnobt.bload.nr_blocks + + ra->new_cntbt.bload.nr_blocks; + ASSERT(required < INT_MAX); + + /* If we've reserved enough blocks, we're done. */ + if (allocated >= required) + break; + + desired = required - allocated; + + /* We need space but there's none left; bye! */ + if (ra->nr_real_records == 0) { + error = -ENOSPC; + break; + } + + /* Grab the first record from the list. */ + error = xfarray_load(ra->free_records, record_nr, &arec); + if (error) + break; + + ASSERT(arec.ar_blockcount <= UINT_MAX); + len = min_t(unsigned int, arec.ar_blockcount, desired); + + trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno, + arec.ar_startblock, len, XFS_RMAP_OWN_AG); + + error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag, + arec.ar_startblock, len); + if (error) + break; + allocated += len; + ra->nr_blocks -= len; + + if (arec.ar_blockcount > desired) { + /* + * Record has more space than we need. 
The number of + * free records doesn't change, so shrink the free + * record, inform the caller that the records are no + * longer sorted by length, and exit. + */ + arec.ar_startblock += desired; + arec.ar_blockcount -= desired; + error = xfarray_store(ra->free_records, record_nr, + &arec); + if (error) + break; + + *needs_resort = true; + return 0; + } + + /* + * We're going to use up the entire record, so unset it and + * move on to the next one. This changes the number of free + * records (but doesn't break the sorting order), so we must + * go around the loop once more to re-run _bload_init. + */ + error = xfarray_unset(ra->free_records, record_nr); + if (error) + break; + ra->nr_real_records--; + record_nr--; + } while (1); + + return error; +} + +STATIC int +xrep_abt_dispose_one( + struct xrep_abt *ra, + struct xrep_newbt_resv *resv) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + xfs_agblock_t free_agbno = resv->agbno + resv->used; + xfs_extlen_t free_aglen = resv->len - resv->used; + int error; + + ASSERT(pag == resv->pag); + + /* Add a deferred rmap for each extent we used. */ + if (resv->used > 0) + xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno, + resv->used, XFS_RMAP_OWN_AG); + + /* + * For each reserved btree block we didn't use, add it to the free + * space btree. We didn't touch fdblocks when we reserved them, so + * we don't touch it now. + */ + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, ra->new_bnobt.oinfo.oi_owner); + + error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen, + &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true); + if (error) + return error; + + return xrep_defer_finish(sc); +} + +/* + * Deal with all the space we reserved. Blocks that were allocated for the + * free space btrees need to have a (deferred) rmap added for the OWN_AG + * allocation, and blocks that didn't get used can be freed via the usual + * (deferred) means. + */ +STATIC void +xrep_abt_dispose_reservations( + struct xrep_abt *ra, + int error) +{ + struct xrep_newbt_resv *resv, *n; + + if (error) + goto junkit; + + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + error = xrep_abt_dispose_one(ra, resv); + if (error) + goto junkit; + } + +junkit: + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + xfs_perag_put(resv->pag); + list_del(&resv->list); + kfree(resv); + } + + xrep_newbt_cancel(&ra->new_bnobt); + xrep_newbt_cancel(&ra->new_cntbt); +} + +/* Retrieve free space data for bulk load. */ +STATIC int +xrep_abt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a; + struct xrep_abt *ra = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load_next(ra->free_records, &ra->array_cur, + arec); + if (error) + return error; + + ra->longest = max(ra->longest, arec->ar_blockcount); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. 
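The block-reservation game played by xrep_abt_reserve_space() can be modelled in isolation: re-estimate the btree size, consume the longest free records until enough blocks are reserved, and flag a re-sort when a record is only partially used. The sketch below makes those steps concrete; estimate_blocks() is a crude stand-in for xfs_btree_bload_compute_geometry() and all the other names are invented:

/* Userspace sketch of the iterative reservation loop. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct free_ext {
	uint32_t start;
	uint32_t len;		/* zero means "consumed" */
};

/* Crude estimate: ~250 records per leaf block, plus one root block. */
static uint32_t estimate_blocks(uint64_t nrecs)
{
	uint64_t leaves = (nrecs + 249) / 250;

	return (uint32_t)(leaves + 1);
}

/* Returns true once enough blocks are reserved, false if we run out. */
static bool reserve_space(struct free_ext *longest_first, size_t n,
			  uint64_t *nr_records, bool *needs_resort)
{
	uint32_t allocated = 0;
	size_t i = 0;

	while (i < n) {
		/* Both new trees (bnobt and cntbt) must hold every record. */
		uint32_t required = 2 * estimate_blocks(*nr_records);

		if (allocated >= required)
			return true;
		if (*nr_records == 0)
			return false;		/* out of space entirely */

		uint32_t desired = required - allocated;
		struct free_ext *fe = &longest_first[i];
		uint32_t used = fe->len < desired ? fe->len : desired;

		allocated += used;
		if (fe->len > desired) {
			/* Partially used: shrink it; length order is now stale. */
			fe->start += used;
			fe->len -= used;
			*needs_resort = true;
			return true;
		}
		/* Fully used: one fewer record to store, so re-estimate. */
		fe->len = 0;
		(*nr_records)--;
		i++;
	}
	return allocated >= 2 * estimate_blocks(*nr_records);
}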
*/ +STATIC int +xrep_abt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr); +} + +/* + * Reset the AGF counters to reflect the free space btrees that we just + * rebuilt, then reinitialize the per-AG data. + */ +STATIC int +xrep_abt_reset_counters( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + unsigned int freesp_btreeblks = 0; + + /* + * Compute the contribution to agf_btreeblks for the new free space + * btrees. This is the computed btree size minus anything we didn't + * use. + */ + freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1; + freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1; + + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt); + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt); + + /* + * The AGF header contains extra information related to the free space + * btrees, so we must update those fields here. + */ + agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks + + (be32_to_cpu(agf->agf_rmap_blocks) - 1)); + agf->agf_freeblks = cpu_to_be32(ra->nr_blocks); + agf->agf_longest = cpu_to_be32(ra->longest); + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS | + XFS_AGF_LONGEST | + XFS_AGF_FREEBLKS); + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the allocbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi]; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi]; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected free space information to stage new free space btrees. + * If this is successful we'll return with the new btree root + * information logged to the repair transaction but not yet committed. + */ +STATIC int +xrep_abt_build_new_trees( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + struct xfs_perag *pag = sc->sa.pag; + bool needs_resort = false; + int error; + + /* + * Sort the free extents by length so that we can set up the free space + * btrees in as few extents as possible. This reduces the amount of + * deferred rmap / free work we have to do at the end. + */ + error = xrep_cntbt_sort_records(ra, false); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + xrep_newbt_init_bare(&ra->new_bnobt, sc); + xrep_newbt_init_bare(&ra->new_cntbt, sc); + + ra->new_bnobt.bload.get_records = xrep_abt_get_records; + ra->new_cntbt.bload.get_records = xrep_abt_get_records; + + ra->new_bnobt.bload.claim_block = xrep_abt_claim_block; + ra->new_cntbt.bload.claim_block = xrep_abt_claim_block; + + /* Allocate cursors for the staged btrees. 
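The agf_btreeblks arithmetic in xrep_abt_reset_counters() is easy to check by hand: each new btree contributes its block count minus the root, less any reserved-but-unused blocks, and the rmapbt contribution is carried over from the AGF. A toy calculation with made-up numbers:

/* Userspace sketch: the agf_btreeblks computation with sample values. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t bnobt_blocks = 5, bnobt_unused = 1;	/* from the bulk loader */
	uint32_t cntbt_blocks = 5, cntbt_unused = 0;
	uint32_t rmap_blocks = 7;			/* from the ondisk AGF */

	/* Each tree counts its blocks except the root, minus unused spares. */
	uint32_t freesp_btreeblks = (bnobt_blocks - 1) + (cntbt_blocks - 1)
				  - bnobt_unused - cntbt_unused;

	/* agf_btreeblks covers all AGF btree blocks except the roots. */
	uint32_t agf_btreeblks = freesp_btreeblks + (rmap_blocks - 1);

	printf("agf_btreeblks = %u\n", (unsigned)agf_btreeblks);	/* 13 */
	return 0;
}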
*/ + bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake, + pag, XFS_BTNUM_BNO); + cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake, + pag, XFS_BTNUM_CNT); + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btrees. */ + error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort); + if (error) + goto err_cur; + + /* + * If we need to re-sort the free extents by length, do so so that we + * can put the records into the cntbt in the correct order. + */ + if (needs_resort) { + error = xrep_cntbt_sort_records(ra, needs_resort); + if (error) + goto err_cur; + } + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the alternate incore btree + * height so that we don't trip the verifiers when writing the new + * btree blocks to disk. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = + ra->new_bnobt.bload.btree_height; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = + ra->new_cntbt.bload.btree_height; + + /* Load the free space by length tree. */ + ra->array_cur = XFARRAY_CURSOR_INIT; + ra->longest = 0; + error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra); + if (error) + goto err_levels; + + error = xrep_bnobt_sort_records(ra); + if (error) + return error; + + /* Load the free space by block number tree. */ + ra->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra); + if (error) + goto err_levels; + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(bno_cur, 0); + xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(cnt_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_abt_reset_counters(ra); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + xrep_abt_dispose_reservations(ra, error); + + return xrep_roll_ag_trans(sc); + +err_levels: + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; +err_cur: + xfs_btree_del_cursor(cnt_cur, error); + xfs_btree_del_cursor(bno_cur, error); +err_newbt: + xrep_abt_dispose_reservations(ra, error); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_abt_remove_old_trees( + struct xrep_abt *ra) +{ + struct xfs_perag *pag = ra->sc->sa.pag; + int error; + + /* Free the old btree blocks if they're not in use. */ + error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks, + &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE); + if (error) + return error; + + /* + * Now that we've zapped all the old allocbt blocks we can turn off + * the alternate height mechanism. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; + return 0; +} + +/* Repair the freespace btrees for some AG. */ +int +xrep_allocbt( + struct xfs_scrub *sc) +{ + struct xrep_abt *ra; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. 
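The build order above depends on two different sort keys: the cntbt is loaded from records ordered by (length, startblock), then the same records are re-sorted by startblock for the bnobt, which also gives a convenient place to detect overlaps. A plain qsort() sketch of the two comparators and the overlap check, using a simplified record type rather than struct xfs_alloc_rec_incore:

/* Userspace sketch of the bnobt/cntbt sort orders. */
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

struct arec {
	uint32_t startblock;
	uint32_t blockcount;
};

static int cmp_bno(const void *a, const void *b)
{
	const struct arec *ap = a, *bp = b;

	if (ap->startblock > bp->startblock)
		return 1;
	if (ap->startblock < bp->startblock)
		return -1;
	return 0;
}

static int cmp_cnt(const void *a, const void *b)
{
	const struct arec *ap = a, *bp = b;

	if (ap->blockcount > bp->blockcount)
		return 1;
	if (ap->blockcount < bp->blockcount)
		return -1;
	return cmp_bno(a, b);		/* break length ties by block number */
}

/* The cntbt load happens first, on the length-sorted records. */
static void sort_for_cntbt(struct arec *recs, size_t n)
{
	qsort(recs, n, sizeof(*recs), cmp_cnt);
}

/* Re-sort by block number for the bnobt and reject overlapping extents. */
static bool sort_for_bnobt(struct arec *recs, size_t n)
{
	uint32_t next = 0;

	qsort(recs, n, sizeof(*recs), cmp_bno);
	for (size_t i = 0; i < n; i++) {
		if (recs[i].startblock < next)
			return false;	/* overlap: treat as corruption */
		next = recs[i].startblock + recs[i].blockcount;
	}
	return true;
}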
*/ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS); + if (!ra) + return -ENOMEM; + ra->sc = sc; + + /* We rebuild both data structures. */ + sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. In theory we cleared this before we started, but + * let's not risk the filesystem. + */ + if (!xfs_extent_busy_list_empty(sc->sa.pag)) { + error = -EDEADLOCK; + goto out_ra; + } + + /* Set up enough storage to handle maximally fragmented free space. */ + descr = xchk_xfile_ag_descr(sc, "free space records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2, + sizeof(struct xfs_alloc_rec_incore), + &ra->free_records); + kfree(descr); + if (error) + goto out_ra; + + /* Collect the free space data and find the old btree blocks. */ + xagb_bitmap_init(&ra->old_allocbt_blocks); + error = xrep_abt_find_freespace(ra); + if (error) + goto out_bitmap; + + /* Rebuild the free space information. */ + error = xrep_abt_build_new_trees(ra); + if (error) + goto out_bitmap; + + /* Kill the old trees. */ + error = xrep_abt_remove_old_trees(ra); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ra->old_allocbt_blocks); + xfarray_destroy(ra->free_records); +out_ra: + kfree(ra); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_allocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; + error = xchk_allocbt(sc); + if (error) + goto out; + + sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT; + error = xchk_allocbt(sc); +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 6c16d9530cca..83c7feb38714 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -527,28 +527,23 @@ xchk_xattr_check_sf( struct xfs_scrub *sc) { struct xchk_xattr_buf *ab = sc->buf; - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; + struct xfs_ifork *ifp = &sc->ip->i_af; + struct xfs_attr_sf_hdr *sf = ifp->if_data; + struct xfs_attr_sf_entry *sfe = xfs_attr_sf_firstentry(sf); struct xfs_attr_sf_entry *next; - struct xfs_ifork *ifp; - unsigned char *end; + unsigned char *end = ifp->if_data + ifp->if_bytes; int i; int error = 0; - ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); - bitmap_zero(ab->usedmap, ifp->if_bytes); - sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data; - end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes; - xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr)); + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*sf)); - sfe = &sf->list[0]; if ((unsigned char *)sfe > end) { xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); return 0; } - for (i = 0; i < sf->hdr.count; i++) { + for (i = 0; i < sf->count; i++) { unsigned char *name = sfe->nameval; unsigned char *value = &sfe->nameval[sfe->namelen]; diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index e0c89a9a0ca0..1449bb5262d9 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -16,7 +16,9 @@ #include <linux/interval_tree_generic.h> -struct xbitmap_node { +/* u64 bitmap */ + +struct xbitmap64_node { struct rb_node bn_rbnode; /* First set bit of this interval 
and subtree. */ @@ -39,72 +41,72 @@ struct xbitmap_node { * forward-declare them anyway for clarity. */ static inline void -xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root); +xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root); static inline void -xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root); +xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline struct xbitmap_node * -xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start, +static inline struct xbitmap64_node * +xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start, uint64_t last); -static inline struct xbitmap_node * -xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start, +static inline struct xbitmap64_node * +xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start, uint64_t last); -INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t, - __bn_subtree_last, START, LAST, static inline, xbitmap_tree) +INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t, + __bn_subtree_last, START, LAST, static inline, xbitmap64_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. */ -#define for_each_xbitmap_extent(bn, bitmap) \ +#define for_each_xbitmap64_extent(bn, bitmap) \ for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ - struct xbitmap_node, bn_rbnode); \ + struct xbitmap64_node, bn_rbnode); \ (bn) != NULL; \ (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ - struct xbitmap_node, bn_rbnode)) + struct xbitmap64_node, bn_rbnode)) /* Clear a range of this bitmap. */ int -xbitmap_clear( - struct xbitmap *bitmap, +xbitmap64_clear( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *bn; - struct xbitmap_node *new_bn; + struct xbitmap64_node *bn; + struct xbitmap64_node *new_bn; uint64_t last = start + len - 1; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) { + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last))) { if (bn->bn_start < start && bn->bn_last > last) { uint64_t old_last = bn->bn_last; /* overlaps with the entire clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); /* add an extent */ - new_bn = kmalloc(sizeof(struct xbitmap_node), + new_bn = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!new_bn) return -ENOMEM; new_bn->bn_start = last + 1; new_bn->bn_last = old_last; - xbitmap_tree_insert(new_bn, &bitmap->xb_root); + xbitmap64_tree_insert(new_bn, &bitmap->xb_root); } else if (bn->bn_start < start) { /* overlaps with the left side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); } else if (bn->bn_last > last) { /* overlaps with the right side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_start = last + 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); break; } else { /* in the middle of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } @@ -114,59 +116,59 @@ xbitmap_clear( /* 
Set a range of this bitmap. */ int -xbitmap_set( - struct xbitmap *bitmap, +xbitmap64_set( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *left; - struct xbitmap_node *right; + struct xbitmap64_node *left; + struct xbitmap64_node *right; uint64_t last = start + len - 1; int error; /* Is this whole range already set? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); if (left && left->bn_start <= start && left->bn_last >= last) return 0; /* Clear out everything in the range we want to set. */ - error = xbitmap_clear(bitmap, start, len); + error = xbitmap64_clear(bitmap, start, len); if (error) return error; /* Do we have a left-adjacent extent? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); ASSERT(!left || left->bn_last + 1 == start); /* Do we have a right-adjacent extent? */ - right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + right = xbitmap64_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); ASSERT(!right || right->bn_start == last + 1); if (left && right) { /* combine left and right adjacent extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); left->bn_last = right->bn_last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); kfree(right); } else if (left) { /* combine with left extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } else if (right) { /* combine with right extent */ - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); right->bn_start = start; - xbitmap_tree_insert(right, &bitmap->xb_root); + xbitmap64_tree_insert(right, &bitmap->xb_root); } else { /* add an extent */ - left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS); + left = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!left) return -ENOMEM; left->bn_start = start; left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } return 0; @@ -174,21 +176,21 @@ xbitmap_set( /* Free everything related to this bitmap. */ void -xbitmap_destroy( - struct xbitmap *bitmap) +xbitmap64_destroy( + struct xbitmap64 *bitmap) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { - xbitmap_tree_remove(bn, &bitmap->xb_root); + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } /* Set up a per-AG block bitmap. */ void -xbitmap_init( - struct xbitmap *bitmap) +xbitmap64_init( + struct xbitmap64 *bitmap) { bitmap->xb_root = RB_ROOT_CACHED; } @@ -208,18 +210,18 @@ xbitmap_init( * This is the logical equivalent of bitmap &= ~sub. 
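The disunion operation described here is just per-extent subtraction: every range in @sub is punched out of @bitmap, and whatever survives is presumed to be the old structure's dead blocks. A small sketch of the single-extent case, which can leave zero, one, or two surviving pieces (types and names invented):

/* Userspace sketch: subtract one range from one extent, "bitmap &= ~sub". */
#include <stddef.h>
#include <stdint.h>

struct extent {
	uint64_t start;
	uint64_t len;
};

/*
 * Remove [sub_start, sub_start+sub_len) from *e.  Returns how many surviving
 * pieces were written to out[0..1]; zero means the extent was fully covered.
 */
static size_t extent_subtract(const struct extent *e,
			      uint64_t sub_start, uint64_t sub_len,
			      struct extent out[2])
{
	uint64_t e_end = e->start + e->len;
	uint64_t s_end = sub_start + sub_len;
	size_t n = 0;

	if (s_end <= e->start || sub_start >= e_end) {
		out[n++] = *e;			/* no overlap at all */
		return n;
	}
	if (e->start < sub_start) {		/* piece survives on the left */
		out[n].start = e->start;
		out[n].len = sub_start - e->start;
		n++;
	}
	if (e_end > s_end) {			/* piece survives on the right */
		out[n].start = s_end;
		out[n].len = e_end - s_end;
		n++;
	}
	return n;
}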
*/ int -xbitmap_disunion( - struct xbitmap *bitmap, - struct xbitmap *sub) +xbitmap64_disunion( + struct xbitmap64 *bitmap, + struct xbitmap64 *sub) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; int error; - if (xbitmap_empty(bitmap) || xbitmap_empty(sub)) + if (xbitmap64_empty(bitmap) || xbitmap64_empty(sub)) return 0; - for_each_xbitmap_extent(bn, sub) { - error = xbitmap_clear(bitmap, bn->bn_start, + for_each_xbitmap64_extent(bn, sub) { + error = xbitmap64_clear(bitmap, bn->bn_start, bn->bn_last - bn->bn_start + 1); if (error) return error; @@ -228,88 +230,273 @@ xbitmap_disunion( return 0; } +/* How many bits are set in this bitmap? */ +uint64_t +xbitmap64_hweight( + struct xbitmap64 *bitmap) +{ + struct xbitmap64_node *bn; + uint64_t ret = 0; + + for_each_xbitmap64_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; + + return ret; +} + +/* Call a function for every run of set bits in this bitmap. */ +int +xbitmap64_walk( + struct xbitmap64 *bitmap, + xbitmap64_walk_fn fn, + void *priv) +{ + struct xbitmap64_node *bn; + int error = 0; + + for_each_xbitmap64_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +/* Does this bitmap have no bits set at all? */ +bool +xbitmap64_empty( + struct xbitmap64 *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap64_test( + struct xbitmap64 *bitmap, + uint64_t start, + uint64_t *len) +{ + struct xbitmap64_node *bn; + uint64_t last = start + *len - 1; + + bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} + +/* u32 bitmap */ + +struct xbitmap32_node { + struct rb_node bn_rbnode; + + /* First set bit of this interval and subtree. */ + uint32_t bn_start; + + /* Last set bit of this interval. */ + uint32_t bn_last; + + /* Last set bit of this subtree. Do not touch this. */ + uint32_t __bn_subtree_last; +}; + +/* Define our own interval tree type with uint32_t parameters. */ + /* - * Record all btree blocks seen while iterating all records of a btree. - * - * We know that the btree query_all function starts at the left edge and walks - * towards the right edge of the tree. Therefore, we know that we can walk up - * the btree cursor towards the root; if the pointer for a given level points - * to the first record/key in that block, we haven't seen this block before; - * and therefore we need to remember that we saw this block in the btree. - * - * So if our btree is: - * - * 4 - * / | \ - * 1 2 3 - * - * Pretend for this example that each leaf block has 100 btree records. For - * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we - * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so - * we record block 4. The list is [1, 4]. - * - * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit - * the loop. The list remains [1, 4]. - * - * For the 101st btree record, we've moved onto leaf block 2. Now - * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that - * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. - * - * For the 102nd record, bc_levels[0].ptr == 2, so we continue. - * - * For the 201st record, we've moved on to leaf block 3. 
- * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. - * - * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll + * forward-declare them anyway for clarity. */ +static inline void +xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root); -/* Mark a btree block to the agblock bitmap. */ -STATIC int -xagb_bitmap_visit_btblock( - struct xfs_btree_cur *cur, - int level, - void *priv) +static inline void +xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root); + +static inline struct xbitmap32_node * +xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start, + uint32_t last); + +static inline struct xbitmap32_node * +xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start, + uint32_t last); + +INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t, + __bn_subtree_last, START, LAST, static inline, xbitmap32_tree) + +/* Iterate each interval of a bitmap. Do not change the bitmap. */ +#define for_each_xbitmap32_extent(bn, bitmap) \ + for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ + struct xbitmap32_node, bn_rbnode); \ + (bn) != NULL; \ + (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ + struct xbitmap32_node, bn_rbnode)) + +/* Clear a range of this bitmap. */ +int +xbitmap32_clear( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) { - struct xagb_bitmap *bitmap = priv; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - xfs_agblock_t agbno; + struct xbitmap32_node *bn; + struct xbitmap32_node *new_bn; + uint32_t last = start + len - 1; - xfs_btree_get_block(cur, level, &bp); - if (!bp) - return 0; + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last))) { + if (bn->bn_start < start && bn->bn_last > last) { + uint32_t old_last = bn->bn_last; - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + /* overlaps with the entire clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); - return xagb_bitmap_set(bitmap, agbno, 1); + /* add an extent */ + new_bn = kmalloc(sizeof(struct xbitmap32_node), + XCHK_GFP_FLAGS); + if (!new_bn) + return -ENOMEM; + new_bn->bn_start = last + 1; + new_bn->bn_last = old_last; + xbitmap32_tree_insert(new_bn, &bitmap->xb_root); + } else if (bn->bn_start < start) { + /* overlaps with the left side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + } else if (bn->bn_last > last) { + /* overlaps with the right side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_start = last + 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + break; + } else { + /* in the middle of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } + } + + return 0; } -/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +/* Set a range of this bitmap. */ int -xagb_bitmap_set_btblocks( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) +xbitmap32_set( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) { - return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, - XFS_BTREE_VISIT_ALL, bitmap); + struct xbitmap32_node *left; + struct xbitmap32_node *right; + uint32_t last = start + len - 1; + int error; + + /* Is this whole range already set? 
*/ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); + if (left && left->bn_start <= start && left->bn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + error = xbitmap32_clear(bitmap, start, len); + if (error) + return error; + + /* Do we have a left-adjacent extent? */ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + ASSERT(!left || left->bn_last + 1 == start); + + /* Do we have a right-adjacent extent? */ + right = xbitmap32_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + ASSERT(!right || right->bn_start == last + 1); + + if (left && right) { + /* combine left and right adjacent extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + xbitmap32_tree_remove(right, &bitmap->xb_root); + left->bn_last = right->bn_last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + kfree(right); + } else if (left) { + /* combine with left extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } else if (right) { + /* combine with right extent */ + xbitmap32_tree_remove(right, &bitmap->xb_root); + right->bn_start = start; + xbitmap32_tree_insert(right, &bitmap->xb_root); + } else { + /* add an extent */ + left = kmalloc(sizeof(struct xbitmap32_node), XCHK_GFP_FLAGS); + if (!left) + return -ENOMEM; + left->bn_start = start; + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } + + return 0; +} + +/* Free everything related to this bitmap. */ +void +xbitmap32_destroy( + struct xbitmap32 *bitmap) +{ + struct xbitmap32_node *bn; + + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, 0, -1U))) { + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } +} + +/* Set up a per-AG block bitmap. */ +void +xbitmap32_init( + struct xbitmap32 *bitmap) +{ + bitmap->xb_root = RB_ROOT_CACHED; } /* - * Record all the buffers pointed to by the btree cursor. Callers already - * engaged in a btree walk should call this function to capture the list of - * blocks going from the leaf towards the root. + * Remove all the blocks mentioned in @sub from the extents in @bitmap. + * + * The intent is that callers will iterate the rmapbt for all of its records + * for a given owner to generate @bitmap; and iterate all the blocks of the + * metadata structures that are not being rebuilt and have the same rmapbt + * owner to generate @sub. This routine subtracts all the extents + * mentioned in sub from all the extents linked in @bitmap, which leaves + * @bitmap as the list of blocks that are not accounted for, which we assume + * are the dead blocks of the old metadata structure. The blocks mentioned in + * @bitmap can be reaped. + * + * This is the logical equivalent of bitmap &= ~sub. */ int -xagb_bitmap_set_btcur_path( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) +xbitmap32_disunion( + struct xbitmap32 *bitmap, + struct xbitmap32 *sub) { - int i; + struct xbitmap32_node *bn; int error; - for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { - error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (xbitmap32_empty(bitmap) || xbitmap32_empty(sub)) + return 0; + + for_each_xbitmap32_extent(bn, sub) { + error = xbitmap32_clear(bitmap, bn->bn_start, + bn->bn_last - bn->bn_start + 1); if (error) return error; } @@ -318,14 +505,14 @@ xagb_bitmap_set_btcur_path( } /* How many bits are set in this bitmap? 
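The interesting part of xbitmap32_set()/xbitmap32_clear() is the merge with adjacent extents once the target range has been cleared out of the structure. The sketch below models just that merge step on a small flat array (kept unordered for simplicity) instead of an interval tree; all of it is illustrative, not kernel code:

/* Userspace sketch: set a range by fusing it with adjacent extents. */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define MAX_EXTENTS 64

struct xbm {
	uint32_t start[MAX_EXTENTS];
	uint32_t last[MAX_EXTENTS];	/* inclusive, like bn_last */
	size_t nr;
};

/* Insert [start, last], assuming no existing extent overlaps that range. */
static bool xbm_set(struct xbm *bm, uint32_t start, uint32_t last)
{
	size_t i, left = bm->nr, right = bm->nr;

	for (i = 0; i < bm->nr; i++) {
		if (start > 0 && bm->last[i] == start - 1)
			left = i;	/* extent ends just before our range */
		if (last < UINT32_MAX && bm->start[i] == last + 1)
			right = i;	/* extent begins just after our range */
	}

	if (left < bm->nr && right < bm->nr) {
		/* Bridge both neighbours into one extent, drop the right one. */
		bm->last[left] = bm->last[right];
		memmove(&bm->start[right], &bm->start[right + 1],
			(bm->nr - right - 1) * sizeof(uint32_t));
		memmove(&bm->last[right], &bm->last[right + 1],
			(bm->nr - right - 1) * sizeof(uint32_t));
		bm->nr--;
	} else if (left < bm->nr) {
		bm->last[left] = last;		/* extend the left neighbour */
	} else if (right < bm->nr) {
		bm->start[right] = start;	/* extend the right neighbour */
	} else {
		if (bm->nr == MAX_EXTENTS)
			return false;		/* stand-in for -ENOMEM */
		bm->start[bm->nr] = start;	/* brand new extent */
		bm->last[bm->nr] = last;
		bm->nr++;
	}
	return true;
}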
*/ -uint64_t -xbitmap_hweight( - struct xbitmap *bitmap) +uint32_t +xbitmap32_hweight( + struct xbitmap32 *bitmap) { - struct xbitmap_node *bn; - uint64_t ret = 0; + struct xbitmap32_node *bn; + uint32_t ret = 0; - for_each_xbitmap_extent(bn, bitmap) + for_each_xbitmap32_extent(bn, bitmap) ret += bn->bn_last - bn->bn_start + 1; return ret; @@ -333,15 +520,15 @@ xbitmap_hweight( /* Call a function for every run of set bits in this bitmap. */ int -xbitmap_walk( - struct xbitmap *bitmap, - xbitmap_walk_fn fn, +xbitmap32_walk( + struct xbitmap32 *bitmap, + xbitmap32_walk_fn fn, void *priv) { - struct xbitmap_node *bn; + struct xbitmap32_node *bn; int error = 0; - for_each_xbitmap_extent(bn, bitmap) { + for_each_xbitmap32_extent(bn, bitmap) { error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); if (error) break; @@ -352,23 +539,23 @@ xbitmap_walk( /* Does this bitmap have no bits set at all? */ bool -xbitmap_empty( - struct xbitmap *bitmap) +xbitmap32_empty( + struct xbitmap32 *bitmap) { return bitmap->xb_root.rb_root.rb_node == NULL; } /* Is the start of the range set or clear? And for how long? */ bool -xbitmap_test( - struct xbitmap *bitmap, - uint64_t start, - uint64_t *len) +xbitmap32_test( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t *len) { - struct xbitmap_node *bn; - uint64_t last = start + *len - 1; + struct xbitmap32_node *bn; + uint32_t last = start + *len - 1; - bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); if (!bn) return false; if (bn->bn_start <= start) { diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 4fe58bad6734..2df8911606d6 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -6,17 +6,19 @@ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xbitmap { +/* u64 bitmap */ + +struct xbitmap64 { struct rb_root_cached xb_root; }; -void xbitmap_init(struct xbitmap *bitmap); -void xbitmap_destroy(struct xbitmap *bitmap); +void xbitmap64_init(struct xbitmap64 *bitmap); +void xbitmap64_destroy(struct xbitmap64 *bitmap); -int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); -uint64_t xbitmap_hweight(struct xbitmap *bitmap); +int xbitmap64_clear(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_set(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_disunion(struct xbitmap64 *bitmap, struct xbitmap64 *sub); +uint64_t xbitmap64_hweight(struct xbitmap64 *bitmap); /* * Return codes for the bitmap iterator functions are 0 to continue iterating, @@ -25,84 +27,39 @@ uint64_t xbitmap_hweight(struct xbitmap *bitmap); * iteration, because neither bitmap iterator ever generates that error code on * its own. Callers must not modify the bitmap while walking it. 
*/ -typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); -int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, +typedef int (*xbitmap64_walk_fn)(uint64_t start, uint64_t len, void *priv); +int xbitmap64_walk(struct xbitmap64 *bitmap, xbitmap64_walk_fn fn, void *priv); -bool xbitmap_empty(struct xbitmap *bitmap); -bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); +bool xbitmap64_empty(struct xbitmap64 *bitmap); +bool xbitmap64_test(struct xbitmap64 *bitmap, uint64_t start, uint64_t *len); -/* Bitmaps, but for type-checked for xfs_agblock_t */ +/* u32 bitmap */ -struct xagb_bitmap { - struct xbitmap agbitmap; +struct xbitmap32 { + struct rb_root_cached xb_root; }; -static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) -{ - xbitmap_init(&bitmap->agbitmap); -} - -static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) -{ - xbitmap_destroy(&bitmap->agbitmap); -} - -static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t len) -{ - return xbitmap_clear(&bitmap->agbitmap, start, len); -} -static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, - xfs_agblock_t start, xfs_extlen_t len) -{ - return xbitmap_set(&bitmap->agbitmap, start, len); -} - -static inline bool -xagb_bitmap_test( - struct xagb_bitmap *bitmap, - xfs_agblock_t start, - xfs_extlen_t *len) -{ - uint64_t biglen = *len; - bool ret; - - ret = xbitmap_test(&bitmap->agbitmap, start, &biglen); - - if (start + biglen >= UINT_MAX) { - ASSERT(0); - biglen = UINT_MAX - start; - } - - *len = biglen; - return ret; -} - -static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, - struct xagb_bitmap *sub) -{ - return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap); -} +void xbitmap32_init(struct xbitmap32 *bitmap); +void xbitmap32_destroy(struct xbitmap32 *bitmap); -static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) -{ - return xbitmap_hweight(&bitmap->agbitmap); -} -static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) -{ - return xbitmap_empty(&bitmap->agbitmap); -} +int xbitmap32_clear(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_set(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_disunion(struct xbitmap32 *bitmap, struct xbitmap32 *sub); +uint32_t xbitmap32_hweight(struct xbitmap32 *bitmap); -static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, - xbitmap_walk_fn fn, void *priv) -{ - return xbitmap_walk(&bitmap->agbitmap, fn, priv); -} +/* + * Return codes for the bitmap iterator functions are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iteration caller. The special value -ECANCELED can be used to stop + * iteration, because neither bitmap iterator ever generates that error code on + * its own. Callers must not modify the bitmap while walking it. 
+ */ +typedef int (*xbitmap32_walk_fn)(uint32_t start, uint32_t len, void *priv); +int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn, + void *priv); -int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur); -int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur); +bool xbitmap32_empty(struct xbitmap32 *bitmap); +bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len); #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 06d8c1996a33..b169cddde6da 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -19,9 +19,11 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "scrub/health.h" #include "xfs_ag.h" /* Set us up with an inode's bmap. */ @@ -48,9 +50,18 @@ xchk_setup_inode_bmap( if (S_ISREG(VFS_I(sc->ip)->i_mode) && sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + bool is_repair = xchk_could_repair(sc); xchk_ilock(sc, XFS_MMAPLOCK_EXCL); + /* Break all our leases, we're going to mess with things. */ + if (is_repair) { + error = xfs_break_layouts(VFS_I(sc->ip), + &sc->ilock_flags, BREAK_WRITE); + if (error) + goto out; + } + inode_dio_wait(VFS_I(sc->ip)); /* @@ -71,6 +82,15 @@ xchk_setup_inode_bmap( error = filemap_fdatawait_keep_errors(mapping); if (error && (error != -ENOSPC && error != -EIO)) goto out; + + /* Drop the page cache if we're repairing block mappings. */ + if (is_repair) { + error = invalidate_inode_pages2( + VFS_I(sc->ip)->i_mapping); + if (error) + goto out; + } + } /* Got the inode, lock it and we're ready to go. */ @@ -78,6 +98,10 @@ xchk_setup_inode_bmap( if (error) goto out; + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode */ @@ -633,6 +657,82 @@ xchk_bmap_check_ag_rmaps( } /* + * Decide if we want to scan the reverse mappings to determine if the attr + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_attrfork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_af; + + /* + * If the dinode repair found a bad attr fork, it will reset the fork + * to extents format with zero records and wait for the this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * Files can have an attr fork in EXTENTS format with zero records for + * several reasons: + * + * a) an attr set created a fork but ran out of space + * b) attr replace deleted an old attr but failed during the set step + * c) the data fork was in btree format when all attrs were deleted, so + * the fork was left in place + * d) the inode repair code zapped the fork + * + * Only in case (d) do we want to scan the rmapbt to see if we need to + * rebuild the attr fork. The fork zap code clears all DAC permission + * bits and zeroes the uid and gid, so avoid the scan if any of those + * three conditions are not met. 
+ */ + if ((VFS_I(ip)->i_mode & 0777) != 0) + return false; + if (!uid_eq(VFS_I(ip)->i_uid, GLOBAL_ROOT_UID)) + return false; + if (!gid_eq(VFS_I(ip)->i_gid, GLOBAL_ROOT_GID)) + return false; + + return true; +} + +/* + * Decide if we want to scan the reverse mappings to determine if the data + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_datafork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_df; + + /* Don't support realtime rmap checks yet. */ + if (XFS_IS_REALTIME_INODE(ip)) + return false; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for the this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * If we encounter an empty data fork along with evidence that the fork + * might not really be empty, we need to scan the reverse mappings to + * decide if we're going to rebuild the fork. Data forks with nonzero + * file size are scanned. + */ + return i_size_read(VFS_I(ip)) != 0; +} + +/* * Decide if we want to walk every rmap btree in the fs to make sure that each * rmap for this file fork has corresponding bmbt entries. */ @@ -641,7 +741,6 @@ xchk_bmap_want_check_rmaps( struct xchk_bmap_info *info) { struct xfs_scrub *sc = info->sc; - struct xfs_ifork *ifp; if (!xfs_has_rmapbt(sc->mp)) return false; @@ -650,28 +749,10 @@ xchk_bmap_want_check_rmaps( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return false; - /* Don't support realtime rmap checks yet. */ - if (info->is_rt) - return false; - - /* - * The inode repair code zaps broken inode forks by resetting them back - * to EXTENTS format and zero extent records. If we encounter a fork - * in this state along with evidence that the fork isn't supposed to be - * empty, we need to scan the reverse mappings to decide if we're going - * to rebuild the fork. Data forks with nonzero file size are scanned. - * xattr forks are never empty of content, so they are always scanned. - */ - ifp = xfs_ifork_ptr(sc->ip, info->whichfork); - if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) { - if (info->whichfork == XFS_DATA_FORK && - i_size_read(VFS_I(sc->ip)) == 0) - return false; - - return true; - } + if (info->whichfork == XFS_ATTR_FORK) + return xchk_bmap_check_empty_attrfork(sc->ip); - return false; + return xchk_bmap_check_empty_datafork(sc->ip); } /* Make sure each rmap has a corresponding bmbt entry. */ @@ -939,7 +1020,20 @@ int xchk_bmap_data( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_DATA_FORK); + int error; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTD_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_DATA_FORK); + if (error) + return error; + + /* If the data fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTD_ZAPPED); + return 0; } /* Scrub an inode's attr fork. */ @@ -947,7 +1041,27 @@ int xchk_bmap_attr( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_ATTR_FORK); + int error; + + /* + * If the attr fork has been zapped, it's possible that forkoff was + * reset to zero and hence sc->ip->i_afp is NULL. We don't want the + * NULL ifp check in xchk_bmap to conclude that the attr fork is ok, + * so short circuit that logic by setting the corruption flag and + * returning immediately. 
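The two predicates above reduce to a compact heuristic: only a fork that is in extents format with zero records and shows the fingerprints of an inode-repair zap (cleared mode bits, root uid/gid, and for the data fork a nonzero size) is worth an rmapbt scan. A standalone sketch of that decision, using an invented structure in place of the incore inode:

/* Userspace sketch of the "does this fork look zapped?" heuristic. */
#include <stdbool.h>
#include <stdint.h>

enum fork_format { FMT_EXTENTS, FMT_BTREE, FMT_LOCAL };

struct zap_check {
	enum fork_format format;
	uint64_t nextents;
	uint32_t mode_bits;	/* permission bits only */
	uint32_t uid, gid;
	uint64_t size;		/* i_size, relevant to the data fork only */
};

static bool attr_fork_maybe_zapped(const struct zap_check *c)
{
	if (c->format != FMT_EXTENTS || c->nextents != 0)
		return false;	/* fork was not reset by inode repair */
	/* The zap clears all permission bits and resets uid/gid to root. */
	return c->mode_bits == 0 && c->uid == 0 && c->gid == 0;
}

static bool data_fork_maybe_zapped(const struct zap_check *c)
{
	if (c->format != FMT_EXTENTS || c->nextents != 0)
		return false;
	/* Nonzero size with no mappings at all is the suspicious case. */
	return c->size != 0;
}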
+ */ + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTA_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_ATTR_FORK); + if (error) + return error; + + /* If the attr fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTA_ZAPPED); + return 0; } /* Scrub an inode's CoW fork. */ diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c new file mode 100644 index 000000000000..a4bb89fdd510 --- /dev/null +++ b/fs/xfs/scrub/bmap_repair.c @@ -0,0 +1,867 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_reflink.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Fork Block Mapping (BMBT) Repair + * ====================================== + * + * Gather all the rmap records for the inode and fork we're fixing, reset the + * incore fork, then recreate the btree. + */ + +enum reflink_scan_state { + RLS_IRRELEVANT = -1, /* not applicable to this file */ + RLS_UNKNOWN, /* shared extent scans required */ + RLS_SET_IFLAG, /* iflag must be set */ +}; + +struct xrep_bmap { + /* Old bmbt blocks */ + struct xfsb_bitmap old_bmbt_blocks; + + /* New fork. */ + struct xrep_newbt new_bmapbt; + + /* List of new bmap records. */ + struct xfarray *bmap_records; + + struct xfs_scrub *sc; + + /* How many blocks did we find allocated to this file? */ + xfs_rfsblock_t nblocks; + + /* How many bmbt blocks did we find for this fork? */ + xfs_rfsblock_t old_bmbt_block_count; + + /* get_records()'s position in the free space record array. */ + xfarray_idx_t array_cur; + + /* How many real (non-hole, non-delalloc) mappings do we have? */ + uint64_t real_mappings; + + /* Which fork are we fixing? */ + int whichfork; + + /* What d the REFLINK flag be set when the repair is over? */ + enum reflink_scan_state reflink_scan; + + /* Do we allow unwritten extents? */ + bool allow_unwritten; +}; + +/* Is this space extent shared? Flag the inode if it is. 
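The reflink_scan field above behaves like a tiny state machine: repair either knows the question is irrelevant, is still looking, or has latched the answer that the REFLINK iflag must be set. A minimal model of that progression, where is_shared() stands in for the refcount btree lookup:

/* Userspace sketch of the reflink scan state progression. */
#include <stdbool.h>
#include <stdint.h>

enum reflink_state {
	SCAN_IRRELEVANT = -1,	/* attr fork, rt file, or no reflink support */
	SCAN_UNKNOWN,		/* keep checking data extents */
	SCAN_SET_IFLAG,		/* at least one shared extent was seen */
};

static bool is_shared(uint64_t startblock, uint64_t blockcount)
{
	(void)startblock;
	(void)blockcount;
	return false;		/* placeholder for the refcountbt query */
}

static enum reflink_state scan_extent(enum reflink_state state,
				      uint64_t startblock,
				      uint64_t blockcount, bool unwritten)
{
	/* Only real, written data extents can contribute shared blocks. */
	if (state == SCAN_UNKNOWN && !unwritten &&
	    is_shared(startblock, blockcount))
		return SCAN_SET_IFLAG;
	return state;		/* the answer never un-latches */
}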
*/ +STATIC int +xrep_bmap_discover_shared( + struct xrep_bmap *rb, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + struct xfs_scrub *sc = rb->sc; + xfs_agblock_t agbno; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error; + + agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); + error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount, + &fbno, &flen, false); + if (error) + return error; + + if (fbno != NULLAGBLOCK) + rb->reflink_scan = RLS_SET_IFLAG; + + return 0; +} + +/* Remember this reverse-mapping as a series of bmap records. */ +STATIC int +xrep_bmap_from_rmap( + struct xrep_bmap *rb, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + bool unwritten) +{ + struct xfs_bmbt_irec irec = { + .br_startoff = startoff, + .br_startblock = startblock, + .br_state = unwritten ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM, + }; + struct xfs_bmbt_rec rbe; + struct xfs_scrub *sc = rb->sc; + int error = 0; + + /* + * If we're repairing the data fork of a non-reflinked regular file on + * a reflink filesystem, we need to figure out if this space extent is + * shared. + */ + if (rb->reflink_scan == RLS_UNKNOWN && !unwritten) { + error = xrep_bmap_discover_shared(rb, startblock, blockcount); + if (error) + return error; + } + + do { + xfs_failaddr_t fa; + + irec.br_blockcount = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + fa = xfs_bmap_validate_extent(sc->ip, rb->whichfork, &irec); + if (fa) + return -EFSCORRUPTED; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(sc->ip, rb->whichfork, &irec); + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + + rb->real_mappings++; + + irec.br_startblock += irec.br_blockcount; + irec.br_startoff += irec.br_blockcount; + blockcount -= irec.br_blockcount; + } while (blockcount > 0); + + return 0; +} + +/* Check for any obvious errors or conflicts in the file mapping. */ +STATIC int +xrep_bmap_check_fork_rmap( + struct xrep_bmap *rb, + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec) +{ + struct xfs_scrub *sc = rb->sc; + enum xbtree_recpacking outcome; + int error; + + /* + * Data extents for rt files are never stored on the data device, but + * everything else (xattrs, bmbt blocks) can be. + */ + if (XFS_IS_REALTIME_INODE(sc->ip) && + !(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) + return -EFSCORRUPTED; + + /* Check that this is within the AG. */ + if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Check the file offset range. */ + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + !xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* No contradictory flags. */ + if ((rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) && + (rec->rm_flags & XFS_RMAP_UNWRITTEN)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rm_startblock, rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record extents that belong to this inode's fork. 
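xrep_bmap_from_rmap() above has to chop a long reverse mapping into bmbt-sized pieces because a single bmbt record can only describe a bounded number of blocks. A standalone sketch of that splitting loop; MAX_EXTLEN and the output format are simplified stand-ins for the on-disk limits:

/* Userspace sketch: split one long mapping into maximum-length records. */
#include <stdint.h>
#include <stdio.h>

#define MAX_EXTLEN	((uint64_t)((1 << 21) - 1))	/* stand-in for the 21-bit limit */

static void emit_mapping(uint64_t off, uint64_t block, uint64_t count)
{
	printf("mapping: off %llu block %llu count %llu\n",
	       (unsigned long long)off, (unsigned long long)block,
	       (unsigned long long)count);
}

static void split_rmap(uint64_t startoff, uint64_t startblock,
		       uint64_t blockcount)
{
	do {
		uint64_t len = blockcount < MAX_EXTLEN ? blockcount
						       : MAX_EXTLEN;

		emit_mapping(startoff, startblock, len);
		startoff += len;
		startblock += len;
		blockcount -= len;
	} while (blockcount > 0);
}

int main(void)
{
	split_rmap(0, 1000, (3 * MAX_EXTLEN) / 2);	/* emits two records */
	return 0;
}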
*/ +STATIC int +xrep_bmap_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_bmap *rb = priv; + struct xfs_mount *mp = cur->bc_mp; + xfs_fsblock_t fsbno; + int error = 0; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + if (rec->rm_owner != rb->sc->ip->i_ino) + return 0; + + error = xrep_bmap_check_fork_rmap(rb, cur, rec); + if (error) + return error; + + /* + * Record all blocks allocated to this file even if the extent isn't + * for the fork we're rebuilding so that we can reset di_nblocks later. + */ + rb->nblocks += rec->rm_blockcount; + + /* If this rmap isn't for the fork we want, we're done. */ + if (rb->whichfork == XFS_DATA_FORK && + (rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rb->whichfork == XFS_ATTR_FORK && + !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + + /* Reject unwritten extents if we don't allow those. */ + if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten) + return -EFSCORRUPTED; + + fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, + rec->rm_startblock); + + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) { + rb->old_bmbt_block_count += rec->rm_blockcount; + return xfsb_bitmap_set(&rb->old_bmbt_blocks, fsbno, + rec->rm_blockcount); + } + + return xrep_bmap_from_rmap(rb, rec->rm_offset, fsbno, + rec->rm_blockcount, + rec->rm_flags & XFS_RMAP_UNWRITTEN); +} + +/* + * Compare two block mapping records. We want to sort in order of increasing + * file offset. + */ +static int +xrep_bmap_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_bmbt_rec *ba = a; + const struct xfs_bmbt_rec *bb = b; + xfs_fileoff_t ao = xfs_bmbt_disk_get_startoff(ba); + xfs_fileoff_t bo = xfs_bmbt_disk_get_startoff(bb); + + if (ao > bo) + return 1; + else if (ao < bo) + return -1; + return 0; +} + +/* + * Sort the bmap extents by fork offset or else the records will be in the + * wrong order. Ensure there are no overlaps in the file offset ranges. + */ +STATIC int +xrep_bmap_sort_records( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + xfs_fileoff_t next_off = 0; + xfarray_idx_t array_cur; + int error; + + error = xfarray_sort(rb->bmap_records, xrep_bmap_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + if (irec.br_startoff < next_off) + return -EFSCORRUPTED; + + next_off = irec.br_startoff + irec.br_blockcount; + } + + return 0; +} + +/* Scan one AG for reverse mappings that we can turn into extent maps. */ +STATIC int +xrep_bmap_scan_ag( + struct xrep_bmap *rb, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_bmap_walk_rmap, rb); + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* Find the delalloc extents from the old incore extent tree. */ +STATIC int +xrep_bmap_find_delalloc( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + struct xfs_iext_cursor icur; + struct xfs_bmbt_rec rbe; + struct xfs_inode *ip = rb->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, rb->whichfork); + int error = 0; + + /* + * Skip this scan if we don't expect to find delayed allocation + * reservations in this fork. 
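The filter at the top of xrep_bmap_walk_rmap() decides what each rmap record means for the fork being rebuilt: other owners are ignored, the attr-fork flag must match, unwritten extents may be rejected, and old bmbt blocks are routed to the reap list rather than becoming mappings. A compact classifier with invented flag bits that mirrors the gist of that ordering:

/* Userspace sketch: classify one rmap record for the fork being rebuilt. */
#include <stdbool.h>
#include <stdint.h>

#define RM_ATTR_FORK	(1u << 0)
#define RM_BMBT_BLOCK	(1u << 1)
#define RM_UNWRITTEN	(1u << 2)

enum rmap_class {
	RMAP_IGNORE,		/* some other owner, or the other fork */
	RMAP_OLD_BMBT_BLOCK,	/* goes into the old-blocks reap list */
	RMAP_FILE_MAPPING,	/* becomes a new bmap record */
	RMAP_CORRUPT,		/* unwritten extent where none are allowed */
};

static enum rmap_class classify_rmap(uint64_t rm_owner, unsigned int rm_flags,
				     uint64_t target_ino, bool attr_fork,
				     bool allow_unwritten)
{
	if (rm_owner != target_ino)
		return RMAP_IGNORE;
	if (attr_fork != !!(rm_flags & RM_ATTR_FORK))
		return RMAP_IGNORE;	/* record belongs to the other fork */
	if ((rm_flags & RM_UNWRITTEN) && !allow_unwritten)
		return RMAP_CORRUPT;
	if (rm_flags & RM_BMBT_BLOCK)
		return RMAP_OLD_BMBT_BLOCK;
	return RMAP_FILE_MAPPING;
}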
+ */ + if (rb->whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0) + return 0; + + for_each_xfs_iext(ifp, &icur, &irec) { + if (!isnullstartblock(irec.br_startblock)) + continue; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(ip, rb->whichfork, &irec); + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + } + + return 0; +} + +/* + * Collect block mappings for this fork of this inode and decide if we have + * enough space to rebuild. Caller is responsible for cleaning up the list if + * anything goes wrong. + */ +STATIC int +xrep_bmap_find_mappings( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error = 0; + + /* Iterate the rmaps for extents. */ + for_each_perag(sc->mp, agno, pag) { + error = xrep_bmap_scan_ag(rb, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + return xrep_bmap_find_delalloc(rb); +} + +/* Retrieve real extent mappings for bulk loading the bmap btree. */ +STATIC int +xrep_bmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_bmbt_rec rec; + struct xfs_bmbt_irec *irec = &cur->bc_rec.b; + struct xrep_bmap *rb = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(rb->bmap_records, rb->array_cur++, + &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, irec); + } while (isnullstartblock(irec->br_startblock)); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_bmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_bmap *rb = priv; + + return xrep_newbt_claim_block(cur, &rb->new_bmapbt, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_bmap_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + ASSERT(level > 0); + + return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level); +} + +/* Update the inode counters. */ +STATIC int +xrep_bmap_reset_counters( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int64_t delta; + + if (rb->reflink_scan == RLS_SET_IFLAG) + sc->ip->i_diflags2 |= XFS_DIFLAG2_REFLINK; + + /* + * Update the inode block counts to reflect the extents we found in the + * rmapbt. + */ + delta = ifake->if_blocks - rb->old_bmbt_block_count; + sc->ip->i_nblocks = rb->nblocks + delta; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + + /* + * Adjust the quota counts by the difference in size between the old + * and new bmbt. + */ + xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, delta); + return 0; +} + +/* + * Create a new iext tree and load it with block mappings. If the inode is + * in extents format, that's all we need to do to commit the new mappings. + * If it is in btree format, this takes care of preloading the incore tree. 
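+ * + * Note that delalloc reservations are inserted into the incore tree as well, + * but they do not count towards if_nextents because they have no real + * startblock.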
+ */ +STATIC int +xrep_bmap_extents_load( + struct xrep_bmap *rb) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp = rb->new_bmapbt.ifake.if_fork; + xfarray_idx_t array_cur; + int error; + + ASSERT(ifp->if_bytes == 0); + + /* Add all the mappings (incl. delalloc) to the incore extent tree. */ + xfs_iext_first(ifp, &icur); + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + xfs_iext_insert_raw(ifp, &icur, &irec); + if (!isnullstartblock(irec.br_startblock)) + ifp->if_nextents++; + + xfs_iext_next(ifp, &icur); + } + + return xrep_ino_ensure_extent_count(rb->sc, rb->whichfork, + ifp->if_nextents); +} + +/* + * Reserve new btree blocks, bulk load the bmap records into the ondisk btree, + * and load the incore extent tree. + */ +STATIC int +xrep_bmap_btree_load( + struct xrep_bmap *rb, + struct xfs_btree_cur *bmap_cur) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(bmap_cur, + &rb->new_bmapbt.bload, rb->real_mappings); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Guess how many blocks we're going to need to rebuild an entire bmap + * from the number of extents we found, and pump up our transaction to + * have sufficient block reservation. We're allowed to exceed file + * quota to repair inconsistent metadata. + */ + error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, + rb->new_bmapbt.bload.nr_blocks, 0, true); + if (error) + return error; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rb->new_bmapbt, + rb->new_bmapbt.bload.nr_blocks); + if (error) + return error; + + /* Add all observed bmap records. */ + rb->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bmap_cur, &rb->new_bmapbt.bload, rb); + if (error) + return error; + + /* + * Load the new bmap records into the new incore extent tree to + * preserve delalloc reservations for regular files. The directory + * code loads the extent tree during xfs_dir_open and assumes + * thereafter that it remains loaded, so we must not violate that + * assumption. + */ + return xrep_bmap_extents_load(rb); +} + +/* + * Use the collected bmap information to stage a new bmap fork. If this is + * successful we'll return with the new fork information logged to the repair + * transaction but not yet committed. The caller must ensure that the inode + * is joined to the transaction; the inode will be joined to a clean + * transaction when the function returns. + */ +STATIC int +xrep_bmap_build_new_fork( + struct xrep_bmap *rb) +{ + struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rb->sc; + struct xfs_btree_cur *bmap_cur; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int error; + + error = xrep_bmap_sort_records(rb); + if (error) + return error; + + /* + * Prepare to construct the new fork by initializing the new btree + * structure and creating a fake ifork in the ifakeroot structure. 
+ */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + error = xrep_newbt_init_inode(&rb->new_bmapbt, sc, rb->whichfork, + &oinfo); + if (error) + return error; + + rb->new_bmapbt.bload.get_records = xrep_bmap_get_records; + rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block; + rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size; + bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake); + + /* + * Figure out the size and format of the new fork, then fill it with + * all the bmap records we've found. Join the inode to the transaction + * so that we can roll the transaction while holding the inode locked. + */ + if (rb->real_mappings <= XFS_IFORK_MAXEXT(sc->ip, rb->whichfork)) { + ifake->if_fork->if_format = XFS_DINODE_FMT_EXTENTS; + error = xrep_bmap_extents_load(rb); + } else { + ifake->if_fork->if_format = XFS_DINODE_FMT_BTREE; + error = xrep_bmap_btree_load(rb, bmap_cur); + } + if (error) + goto err_cur; + + /* + * Install the new fork in the inode. After this point the old mapping + * data are no longer accessible and the new tree is live. We delete + * the cursor immediately after committing the staged root because the + * staged fork might be in extents format. + */ + xfs_bmbt_commit_staged_btree(bmap_cur, sc->tp, rb->whichfork); + xfs_btree_del_cursor(bmap_cur, 0); + + /* Reset the inode counters now that we've changed the fork. */ + error = xrep_bmap_reset_counters(rb); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rb->new_bmapbt); + if (error) + return error; + + return xrep_roll_trans(sc); + +err_cur: + if (bmap_cur) + xfs_btree_del_cursor(bmap_cur, error); +err_newbt: + xrep_newbt_cancel(&rb->new_bmapbt); + return error; +} + +/* + * Now that we've logged the new inode btree, invalidate all of the old blocks + * and free them, if there were any. + */ +STATIC int +xrep_bmap_remove_old_tree( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_owner_info oinfo; + + /* Free the old bmbt blocks if they're not in use. */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + return xrep_reap_fsblocks(sc, &rb->old_bmbt_blocks, &oinfo); +} + +/* Check for garbage inputs. Returns -ECANCELED if there's nothing to do. */ +STATIC int +xrep_bmap_check_inputs( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); + + ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK); + + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + + /* No fork means nothing to rebuild. */ + if (!ifp) + return -ECANCELED; + + /* + * We only know how to repair extent mappings, which is to say that we + * only support extents and btree fork format. Repairs to a local + * format fork require a higher level repair function, so we do not + * have any work to do here. + */ + switch (ifp->if_format) { + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + return -ECANCELED; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return -EFSCORRUPTED; + } + + if (whichfork == XFS_ATTR_FORK) + return 0; + + /* Only files, symlinks, and directories get to have data forks. */ + switch (VFS_I(sc->ip)->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + /* ok */ + break; + default: + return -EINVAL; + } + + /* Don't know how to rebuild realtime data forks. 
*/ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + return 0; +} + +/* Set up the initial state of the reflink scan. */ +static inline enum reflink_scan_state +xrep_bmap_init_reflink_scan( + struct xfs_scrub *sc, + int whichfork) +{ + /* cannot share on non-reflink filesystem */ + if (!xfs_has_reflink(sc->mp)) + return RLS_IRRELEVANT; + + /* preserve flag if it's already set */ + if (xfs_is_reflink_inode(sc->ip)) + return RLS_SET_IFLAG; + + /* can only share regular files */ + if (!S_ISREG(VFS_I(sc->ip)->i_mode)) + return RLS_IRRELEVANT; + + /* cannot share attr fork extents */ + if (whichfork != XFS_DATA_FORK) + return RLS_IRRELEVANT; + + /* cannot share realtime extents */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return RLS_IRRELEVANT; + + return RLS_UNKNOWN; +} + +/* Repair an inode fork. */ +int +xrep_bmap( + struct xfs_scrub *sc, + int whichfork, + bool allow_unwritten) +{ + struct xrep_bmap *rb; + char *descr; + unsigned int max_bmbt_recs; + bool large_extcount; + int error = 0; + + error = xrep_bmap_check_inputs(sc, whichfork); + if (error == -ECANCELED) + return 0; + if (error) + return error; + + rb = kzalloc(sizeof(struct xrep_bmap), XCHK_GFP_FLAGS); + if (!rb) + return -ENOMEM; + rb->sc = sc; + rb->whichfork = whichfork; + rb->reflink_scan = xrep_bmap_init_reflink_scan(sc, whichfork); + rb->allow_unwritten = allow_unwritten; + + /* Set up enough storage to handle the max records for this fork. */ + large_extcount = xfs_has_large_extent_counts(sc->mp); + max_bmbt_recs = xfs_iext_max_nextents(large_extcount, whichfork); + descr = xchk_xfile_ino_descr(sc, "%s fork mapping records", + whichfork == XFS_DATA_FORK ? "data" : "attr"); + error = xfarray_create(descr, max_bmbt_recs, + sizeof(struct xfs_bmbt_rec), &rb->bmap_records); + kfree(descr); + if (error) + goto out_rb; + + /* Collect all reverse mappings for this fork's extents. */ + xfsb_bitmap_init(&rb->old_bmbt_blocks); + error = xrep_bmap_find_mappings(rb); + if (error) + goto out_bitmap; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the bmap information. */ + error = xrep_bmap_build_new_fork(rb); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_bmap_remove_old_tree(rb); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&rb->old_bmbt_blocks); + xfarray_destroy(rb->bmap_records); +out_rb: + kfree(rb); + return error; +} + +/* Repair an inode's data fork. */ +int +xrep_bmap_data( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_DATA_FORK, true); +} + +/* Repair an inode's attr fork. */ +int +xrep_bmap_attr( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_ATTR_FORK, false); +} diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index de24532fe083..81f2b96bb5a7 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -25,6 +25,7 @@ #include "xfs_trans_priv.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" @@ -604,6 +605,7 @@ xchk_ag_free( struct xchk_ag *sa) { xchk_ag_btcur_free(sa); + xrep_reset_perag_resv(sc); if (sa->agf_bp) { xfs_trans_brelse(sc->tp, sa->agf_bp); sa->agf_bp = NULL; @@ -733,6 +735,8 @@ xchk_iget( xfs_ino_t inum, struct xfs_inode **ipp) { + ASSERT(sc->tp != NULL); + return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); } @@ -816,6 +820,26 @@ again: return 0; } +#ifdef CONFIG_XFS_QUOTA +/* + * Try to attach dquots to this inode if we think we might want to repair it. + * Callers must not hold any ILOCKs. 
If the dquots are broken and cannot be + * attached, a quotacheck will be scheduled. + */ +int +xchk_ino_dqattach( + struct xfs_scrub *sc) +{ + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + if (!xchk_could_repair(sc)) + return 0; + + return xrep_ino_dqattach(sc); +} +#endif + /* Install an inode that we opened by handle for scrubbing. */ int xchk_install_handle_inode( @@ -882,8 +906,8 @@ xchk_iget_for_scrubbing( if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; - /* Try a regular untrusted iget. */ - error = xchk_iget(sc, sc->sm->sm_ino, &ip); + /* Try a safe untrusted iget. */ + error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); if (!error) return xchk_install_handle_inode(sc, ip); if (error == -ENOENT) @@ -1027,6 +1051,11 @@ xchk_setup_inode_contents( error = xchk_trans_alloc(sc, resblks); if (error) goto out; + + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode for us */ @@ -1132,6 +1161,7 @@ xchk_metadata_inode_subtype( unsigned int scrub_type) { __u32 smtype = sc->sm->sm_type; + unsigned int sick_mask = sc->sick_mask; int error; sc->sm->sm_type = scrub_type; @@ -1149,6 +1179,7 @@ xchk_metadata_inode_subtype( break; } + sc->sick_mask = sick_mask; sc->sm->sm_type = smtype; return error; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index cabdc0e16838..da09580b454a 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -103,9 +103,15 @@ xchk_setup_rtsummary(struct xfs_scrub *sc) } #endif #ifdef CONFIG_XFS_QUOTA +int xchk_ino_dqattach(struct xfs_scrub *sc); int xchk_setup_quota(struct xfs_scrub *sc); #else static inline int +xchk_ino_dqattach(struct xfs_scrub *sc) +{ + return 0; +} +static inline int xchk_setup_quota(struct xfs_scrub *sc) { return -ENOENT; @@ -151,6 +157,11 @@ void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); +/* + * Grab the inode at @inum. The caller must have created a scrub transaction + * so that we can confirm the inumber by walking the inobt and not deadlock on + * a loop in the inobt. + */ int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_buf **agi_bpp, struct xfs_inode **ipp); @@ -158,6 +169,26 @@ void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip); int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip); /* + * Safe version of (untrusted) xchk_iget that uses an empty transaction to + * avoid deadlocking on loops in the inobt. This should only be used in a + * scrub or repair setup routine, and only prior to grabbing a transaction. + */ +static inline int +xchk_iget_safe(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp) +{ + int error; + + ASSERT(sc->tp == NULL); + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + error = xchk_iget(sc, inum, ipp); + xchk_trans_cancel(sc); + return error; +} + +/* * Don't bother cross-referencing if we already found corruption or cross * referencing discrepancies. */ @@ -167,6 +198,8 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT); } +bool xchk_dir_looks_zapped(struct xfs_inode *dp); + #ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. 
*/ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) @@ -175,8 +208,21 @@ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT | XFS_SCRUB_OFLAG_PREEN); } + +/* + * "Should we prepare for a repair?" + * + * Return true if the caller permits us to repair metadata and we're not + * setting up for a post-repair evaluation. + */ +static inline bool xchk_could_repair(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + !(sc->flags & XREP_ALREADY_FIXED); +} #else # define xchk_needs_repair(sc) (false) +# define xchk_could_repair(sc) (false) #endif /* CONFIG_XFS_ONLINE_REPAIR */ int xchk_metadata_inode_forks(struct xfs_scrub *sc); @@ -188,6 +234,16 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); #define xchk_xfile_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \ (sc)->mp->m_super->s_id, ##__VA_ARGS__) +#define xchk_xfile_ag_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \ + ##__VA_ARGS__) +#define xchk_xfile_ino_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \ + ##__VA_ARGS__) /* * Setting up a hook to wait for intents to drain is costly -- we have to take diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c new file mode 100644 index 000000000000..1e82c727af8e --- /dev/null +++ b/fs/xfs/scrub/cow_repair.c @@ -0,0 +1,614 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_icache.h" +#include "xfs_refcount_btree.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/off_bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/reap.h" + +/* + * CoW Fork Mapping Repair + * ======================= + * + * Although CoW staging extents are owned by incore CoW inode forks, on disk + * they are owned by the refcount btree. The ondisk metadata does not record + * any ownership information, which limits what we can do to repair the + * mappings in the CoW fork. At most, we can replace ifork mappings that lack + * an entry in the refcount btree or are described by a reverse mapping record + * whose owner is not OWN_COW. + * + * Replacing extents is also tricky -- we can't touch written CoW fork extents + * since they are undergoing writeback, and delalloc extents do not require + * repair since they only exist incore. Hence the most we can do is find the + * bad parts of unwritten mappings, allocate a replacement set of blocks, and + * replace the incore mapping. We use the regular reaping process to unmap + * or free the discarded blocks, as appropriate. 
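+ * + * Concretely, the code below makes two passes: xrep_cow_find_bad() marks the + * file ranges backed by suspect staging extents in a bitmap, and + * xrep_cow_replace() then walks that bitmap, allocates replacement space for + * each marked range, swaps it into the CoW fork, and leaves the old blocks + * to be reaped afterwards.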
+ */ +struct xrep_cow { + struct xfs_scrub *sc; + + /* Bitmap of file offset ranges that need replacing. */ + struct xoff_bitmap bad_fileoffs; + + /* Bitmap of fsblocks that were removed from the CoW fork. */ + struct xfsb_bitmap old_cowfork_fsblocks; + + /* CoW fork mappings used to scan for bad CoW staging extents. */ + struct xfs_bmbt_irec irec; + + /* refcount btree block number of irec.br_startblock */ + unsigned int irec_startbno; + + /* refcount btree block number of the next refcount record we expect */ + unsigned int next_bno; +}; + +/* CoW staging extent. */ +struct xrep_cow_extent { + xfs_fsblock_t fsbno; + xfs_extlen_t len; +}; + +/* + * Mark the part of the file range that corresponds to the given physical + * space. Caller must ensure that the physical range is within xc->irec. + */ +STATIC int +xrep_cow_mark_file_range( + struct xrep_cow *xc, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + xfs_fileoff_t startoff; + + startoff = xc->irec.br_startoff + + (startblock - xc->irec.br_startblock); + + trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff, + blockcount); + + return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount); +} + +/* + * Trim @src to fit within the CoW fork mapping being examined, and put the + * result in @dst. + */ +static inline void +xrep_cow_trim_refcount( + struct xrep_cow *xc, + struct xfs_refcount_irec *dst, + const struct xfs_refcount_irec *src) +{ + unsigned int adj; + + memcpy(dst, src, sizeof(*dst)); + + if (dst->rc_startblock < xc->irec_startbno) { + adj = xc->irec_startbno - dst->rc_startblock; + dst->rc_blockcount -= adj; + dst->rc_startblock += adj; + } + + if (dst->rc_startblock + dst->rc_blockcount > + xc->irec_startbno + xc->irec.br_blockcount) { + adj = (dst->rc_startblock + dst->rc_blockcount) - + (xc->irec_startbno + xc->irec.br_blockcount); + dst->rc_blockcount -= adj; + } +} + +/* Mark any shared CoW staging extents. */ +STATIC int +xrep_cow_mark_shared_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + xfs_fsblock_t fsbno; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_SHARED) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + rrec.rc_startblock); + return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount); +} + +/* + * Mark any portion of the CoW fork file offset range where there is not a CoW + * staging extent record in the refcountbt, and keep a record of where we did + * find correct refcountbt records. Staging records are always cleaned out at + * mount time, so any two inodes trying to map the same staging area would have + * already taken the fs down due to refcount btree verifier errors. Hence this + * inode should be the sole creator of the staging extent records ondisk. 
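+ * + * The xc->next_bno cursor tracks where the previous staging record ended, so + * the gap before the current record (and the tail gap after the last record, + * which the caller fills in) gets marked as missing.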
+ */ +STATIC int +xrep_cow_mark_missing_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + int error; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + if (xc->next_bno >= rrec.rc_startblock) + goto next; + + error = xrep_cow_mark_file_range(xc, + XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + xc->next_bno), + rrec.rc_startblock - xc->next_bno); + if (error) + return error; + +next: + xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount; + return 0; +} + +/* + * Mark any area that does not correspond to a CoW staging rmap. These are + * cross-linked areas that must be avoided. + */ +STATIC int +xrep_cow_mark_missing_staging_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + xfs_fsblock_t fsbno; + xfs_agblock_t rec_bno; + xfs_extlen_t rec_len; + unsigned int adj; + + if (rec->rm_owner == XFS_RMAP_OWN_COW) + return 0; + + rec_bno = rec->rm_startblock; + rec_len = rec->rm_blockcount; + if (rec_bno < xc->irec_startbno) { + adj = xc->irec_startbno - rec_bno; + rec_len -= adj; + rec_bno += adj; + } + + if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) { + adj = (rec_bno + rec_len) - + (xc->irec_startbno + xc->irec.br_blockcount); + rec_len -= adj; + } + + fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno); + return xrep_cow_mark_file_range(xc, fsbno, rec_len); +} + +/* + * Find any part of the CoW fork mapping that isn't a single-owner CoW staging + * extent and mark the corresponding part of the file range in the bitmap. + */ +STATIC int +xrep_cow_find_bad( + struct xrep_cow *xc) +{ + struct xfs_refcount_irec rc_low = { 0 }; + struct xfs_refcount_irec rc_high = { 0 }; + struct xfs_rmap_irec rm_low = { 0 }; + struct xfs_rmap_irec rm_high = { 0 }; + struct xfs_perag *pag; + struct xfs_scrub *sc = xc->sc; + xfs_agnumber_t agno; + int error; + + agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock); + xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock); + + pag = xfs_perag_get(sc->mp, agno); + if (!pag) + return -EFSCORRUPTED; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + goto out_pag; + + /* Mark any CoW fork extents that are shared. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED; + error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_shared_staging, xc); + if (error) + goto out_sa; + + /* Make sure there are CoW staging extents for the whole mapping. */ + rc_low.rc_startblock = xc->irec_startbno; + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW; + xc->next_bno = xc->irec_startbno; + error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, + xrep_cow_mark_missing_staging, xc); + if (error) + goto out_sa; + + if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { + error = xrep_cow_mark_file_range(xc, + XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, + xc->next_bno), + xc->irec_startbno + xc->irec.br_blockcount - + xc->next_bno); + if (error) + goto out_sa; + } + + /* Mark any area that has an rmap that isn't a COW staging extent.
*/ + rm_low.rm_startblock = xc->irec_startbno; + memset(&rm_high, 0xFF, sizeof(rm_high)); + rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; + error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high, + xrep_cow_mark_missing_staging_rmap, xc); + if (error) + goto out_sa; + + /* + * If userspace is forcing us to rebuild the CoW fork or someone turned + * on the debugging knob, replace everything in the CoW fork. + */ + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, + xc->irec.br_blockcount); + if (error) + return error; + } + +out_sa: + xchk_ag_free(sc, &sc->sa); +out_pag: + xfs_perag_put(pag); + return error; +} + +/* + * Allocate a replacement CoW staging extent of up to the given number of + * blocks, and fill out the mapping. + */ +STATIC int +xrep_cow_alloc( + struct xfs_scrub *sc, + xfs_extlen_t maxlen, + struct xrep_cow_extent *repl) +{ + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = sc->mp, + .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, + .minlen = 1, + .maxlen = maxlen, + .prod = 1, + .resv = XFS_AG_RESV_NONE, + .datatype = XFS_ALLOC_USERDATA, + }; + int error; + + error = xfs_trans_reserve_more(sc->tp, maxlen, 0); + if (error) + return error; + + error = xfs_alloc_vextent_start_ag(&args, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino)); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len); + + repl->fsbno = args.fsbno; + repl->len = args.len; + return 0; +} + +/* + * Look up the current CoW fork mapping so that we only allocate enough to + * replace a single mapping. If we don't find a mapping that covers the start + * of the file range, or we find a delalloc or written extent, something is + * seriously wrong, since we didn't drop the ILOCK. + */ +static inline int +xrep_cow_find_mapping( + struct xrep_cow *xc, + struct xfs_iext_cursor *icur, + xfs_fileoff_t startoff, + struct xfs_bmbt_irec *got) +{ + struct xfs_inode *ip = xc->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); + + if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got)) + goto bad; + + if (got->br_startoff > startoff) + goto bad; + + if (got->br_blockcount == 0) + goto bad; + + if (isnullstartblock(got->br_startblock)) + goto bad; + + if (xfs_bmap_is_written_extent(got)) + goto bad; + + return 0; +bad: + ASSERT(0); + return -EFSCORRUPTED; +} + +#define REPLACE_LEFT_SIDE (1U << 0) +#define REPLACE_RIGHT_SIDE (1U << 1) + +/* + * Given a CoW fork mapping @got and a replacement mapping @repl, remap the + * beginning of @got with the space described by @repl. + */ +static inline void +xrep_cow_replace_mapping( + struct xfs_inode *ip, + struct xfs_iext_cursor *icur, + const struct xfs_bmbt_irec *got, + const struct xrep_cow_extent *repl) +{ + struct xfs_bmbt_irec new = *got; /* struct copy */ + + ASSERT(repl->len > 0); + ASSERT(!isnullstartblock(got->br_startblock)); + + trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len); + + if (got->br_blockcount == repl->len) { + /* + * The new extent is a complete replacement for the existing + * extent. Update the COW fork record. + */ + new.br_startblock = repl->fsbno; + xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); + return; + } + + /* + * The new extent can replace the beginning of the COW fork record. + * Move the left side of @got upwards, then insert the new record.
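+ * For example, if @got maps four blocks starting at file offset 10 and + * @repl covers one block, @got is trimmed to a three-block mapping at offset + * 11 and a new one-block record at offset 10 pointing at @repl->fsbno is + * inserted in front of it.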
+ */ + new.br_startoff += repl->len; + new.br_startblock += repl->len; + new.br_blockcount -= repl->len; + xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); + + new.br_startoff = got->br_startoff; + new.br_startblock = repl->fsbno; + new.br_blockcount = repl->len; + xfs_iext_insert(ip, icur, &new, BMAP_COWFORK); +} + +/* + * Replace the unwritten CoW staging extent backing the given file range with a + * new space extent that isn't as problematic. + */ +STATIC int +xrep_cow_replace_range( + struct xrep_cow *xc, + xfs_fileoff_t startoff, + xfs_extlen_t *blockcount) +{ + struct xfs_iext_cursor icur; + struct xrep_cow_extent repl; + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = xc->sc; + xfs_fileoff_t nextoff; + xfs_extlen_t alloc_len; + int error; + + /* + * Put the existing CoW fork mapping in @got. If @got ends before + * @rep, truncate @rep so we only replace one extent mapping at a time. + */ + error = xrep_cow_find_mapping(xc, &icur, startoff, &got); + if (error) + return error; + nextoff = min(startoff + *blockcount, + got.br_startoff + got.br_blockcount); + + /* + * Allocate a replacement extent. If we don't fill all the blocks, + * shorten the quantity that will be deleted in this step. + */ + alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, + nextoff - startoff); + error = xrep_cow_alloc(sc, alloc_len, &repl); + if (error) + return error; + + /* + * Replace the old mapping with the new one, and commit the metadata + * changes made so far. + */ + xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl); + + xfs_inode_set_cowblocks_tag(sc->ip); + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* Note the old CoW staging extents; we'll reap them all later. */ + error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, + repl.len); + if (error) + return error; + + *blockcount = repl.len; + return 0; +} + +/* + * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc + * reservation. + */ +STATIC int +xrep_cow_replace( + uint64_t startoff, + uint64_t blockcount, + void *priv) +{ + struct xrep_cow *xc = priv; + int error = 0; + + while (blockcount > 0) { + xfs_extlen_t len = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + error = xrep_cow_replace_range(xc, startoff, &len); + if (error) + break; + + blockcount -= len; + startoff += len; + } + + return error; +} + +/* + * Repair an inode's CoW fork. The CoW fork is an in-core structure, so + * there's no btree to rebuild. Instead, we replace any mappings that are + * cross-linked or lack ondisk CoW fork records in the refcount btree. + */ +int +xrep_bmap_cow( + struct xfs_scrub *sc) +{ + struct xrep_cow *xc; + struct xfs_iext_cursor icur; + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK); + int error; + + if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp)) + return -EOPNOTSUPP; + + if (!ifp) + return 0; + + /* realtime files aren't supported yet */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + /* + * If we're somehow not in extents format, then reinitialize it to + * an empty extent mapping fork and exit.
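+ * The CoW fork only ever exists incore and is created in extents format, so + * finding any other format here is itself a sign of damage.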
+ */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { + ifp->if_format = XFS_DINODE_FMT_EXTENTS; + ifp->if_nextents = 0; + return 0; + } + + xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS); + if (!xc) + return -ENOMEM; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + xc->sc = sc; + xoff_bitmap_init(&xc->bad_fileoffs); + xfsb_bitmap_init(&xc->old_cowfork_fsblocks); + + for_each_xfs_iext(ifp, &icur, &xc->irec) { + if (xchk_should_terminate(sc, &error)) + goto out_bitmap; + + /* + * delalloc reservations only exist incore, so there is no + * ondisk metadata that we can examine. Hence we leave them + * alone. + */ + if (isnullstartblock(xc->irec.br_startblock)) + continue; + + /* + * COW fork extents are only in the written state if writeback + * is actively writing to disk. We cannot restart the write + * at a different disk address since we've already issued the + * IO, so we leave these alone and hope for the best. + */ + if (xfs_bmap_is_written_extent(&xc->irec)) + continue; + + error = xrep_cow_find_bad(xc); + if (error) + goto out_bitmap; + } + + /* Replace any bad unwritten mappings with fresh reservations. */ + error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc); + if (error) + goto out_bitmap; + + /* + * Reap as many of the old CoW blocks as we can. They are owned ondisk + * by the refcount btree, not the inode, so it is correct to treat them + * like inode metadata. + */ + error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, + &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); + xoff_bitmap_destroy(&xc->bad_fileoffs); + kmem_free(xc); + return error; +} diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 0b491784b759..d86ab51af928 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -15,10 +15,12 @@ #include "xfs_icache.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/readdir.h" +#include "scrub/health.h" /* Set us up to scrub directories. */ int @@ -760,6 +762,11 @@ xchk_directory( if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) return -ENOENT; + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_DIR_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + /* Plausible size? */ if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -784,7 +791,36 @@ xchk_directory( /* Look up every name in this directory by hash. */ error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); - if (error == -ECANCELED) - error = 0; - return error; + if (error && error != -ECANCELED) + return error; + + /* If the dir is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED); + return 0; +} + +/* + * Decide if this directory has been zapped to satisfy the inode and ifork + * verifiers. Checking and repairing should be postponed until the directory + * is fixed. + */ +bool +xchk_dir_looks_zapped( + struct xfs_inode *dp) +{ + /* Repair zapped this dir's data fork a short time ago */ + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return true; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for the bmapbtd + * scrubber to reconstruct the block mappings. Directories always + * contain some content, so this is a clear sign of a zapped directory. 
+ * The state checked by xfs_ifork_zapped is not persisted, so this is + * the secondary strategy if repairs are interrupted by a crash or an + * unmount. + */ + return dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS && + dp->i_df.if_nextents == 0; } diff --git a/fs/xfs/scrub/dqiterate.c b/fs/xfs/scrub/dqiterate.c new file mode 100644 index 000000000000..20c4daedd48d --- /dev/null +++ b/fs/xfs/scrub/dqiterate.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" + +/* Initialize a dquot iteration cursor. */ +void +xchk_dqiter_init( + struct xchk_dqiter *cursor, + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + cursor->sc = sc; + cursor->bmap.br_startoff = NULLFILEOFF; + cursor->dqtype = dqtype & XFS_DQTYPE_REC_MASK; + cursor->quota_ip = xfs_quota_inode(sc->mp, cursor->dqtype); + cursor->id = 0; +} + +/* + * Ensure that the cached data fork mapping for the dqiter cursor is fresh and + * covers the dquot pointed to by the scan cursor. + */ +STATIC int +xchk_dquot_iter_revalidate_bmap( + struct xchk_dqiter *cursor) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + xfs_dqid_t this_id = cursor->id; + int nmaps = 1; + int error; + + fileoff = this_id / qi->qi_dqperchunk; + + /* + * If we have a mapping for cursor->id and it's still fresh, there's + * no need to reread the bmbt. + */ + if (cursor->bmap.br_startoff != NULLFILEOFF && + cursor->if_seq == ifp->if_seq && + cursor->bmap.br_startoff + cursor->bmap.br_blockcount > fileoff) + return 0; + + /* Look up the data fork mapping for the dquot id of interest. */ + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, &nmaps, 0); + if (error) + return error; + if (!nmaps) { + ASSERT(nmaps > 0); + return -EFSCORRUPTED; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_revalidate_bmap(cursor, cursor->id); + return 0; +} + +/* Advance the dqiter cursor to the next non-sparse region of the quota file. */ +STATIC int +xchk_dquot_iter_advance_bmap( + struct xchk_dqiter *cursor, + uint64_t *next_ondisk_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + uint64_t next_id; + int nmaps = 1; + int error; + + /* Find the dquot id for the next non-hole mapping. */ + do { + fileoff = cursor->bmap.br_startoff + cursor->bmap.br_blockcount; + if (fileoff > XFS_DQ_ID_MAX / qi->qi_dqperchunk) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, + &nmaps, 0); + if (error) + return error; + if (!nmaps) { + /* Must have reached the end of the mappings. 
*/ + *next_ondisk_id = -1ULL; + return 0; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + } while (!xfs_bmap_is_real_extent(&cursor->bmap)); + + next_id = cursor->bmap.br_startoff * qi->qi_dqperchunk; + if (next_id > XFS_DQ_ID_MAX) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + /* Propose jumping forward to the dquot in the next allocated block. */ + *next_ondisk_id = next_id; + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_advance_bmap(cursor, *next_ondisk_id); + return 0; +} + +/* + * Find the id of the next highest incore dquot. Normally this will correspond + * exactly with the quota file block mappings, but repair might have erased a + * mapping because it was crosslinked; in that case, we need to re-allocate the + * space so that we can reset q_blkno. + */ +STATIC void +xchk_dquot_iter_advance_incore( + struct xchk_dqiter *cursor, + uint64_t *next_incore_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, cursor->dqtype); + struct xfs_dquot *dq; + unsigned int nr_found; + + *next_incore_id = -1ULL; + + mutex_lock(&qi->qi_tree_lock); + nr_found = radix_tree_gang_lookup(tree, (void **)&dq, cursor->id, 1); + if (nr_found) + *next_incore_id = dq->q_id; + mutex_unlock(&qi->qi_tree_lock); + + trace_xchk_dquot_iter_advance_incore(cursor, *next_incore_id); +} + +/* + * Walk all incore dquots of this filesystem. Caller must set *@cursorp to + * zero before the first call, and must not hold the quota file ILOCK. + * Returns 1 and a valid *@dqpp; 0 and *@dqpp == NULL when there are no more + * dquots to iterate; or a negative errno. + */ +int +xchk_dquot_iter( + struct xchk_dqiter *cursor, + struct xfs_dquot **dqpp) +{ + struct xfs_mount *mp = cursor->sc->mp; + struct xfs_dquot *dq = NULL; + uint64_t next_ondisk, next_incore = -1ULL; + unsigned int lock_mode; + int error = 0; + + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + next_ondisk = cursor->id; + + /* Revalidate and/or advance the cursor. */ + lock_mode = xfs_ilock_data_map_shared(cursor->quota_ip); + error = xchk_dquot_iter_revalidate_bmap(cursor); + if (!error && !xfs_bmap_is_real_extent(&cursor->bmap)) + error = xchk_dquot_iter_advance_bmap(cursor, &next_ondisk); + xfs_iunlock(cursor->quota_ip, lock_mode); + if (error) + return error; + + if (next_ondisk > cursor->id) + xchk_dquot_iter_advance_incore(cursor, &next_incore); + + /* Pick the next dquot in the sequence and return it. */ + cursor->id = min(next_ondisk, next_incore); + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + + trace_xchk_dquot_iter(cursor, cursor->id); + + error = xfs_qm_dqget(mp, cursor->id, cursor->dqtype, false, &dq); + if (error) + return error; + + cursor->id = dq->q_id + 1; + *dqpp = dq; + return 1; +} diff --git a/fs/xfs/scrub/fsb_bitmap.h b/fs/xfs/scrub/fsb_bitmap.h new file mode 100644 index 000000000000..40b462c1dd0d --- /dev/null +++ b/fs/xfs/scrub/fsb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_FSB_BITMAP_H__ +#define __XFS_SCRUB_FSB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_fsblock_t */ + +struct xfsb_bitmap { + struct xbitmap64 fsbitmap; +}; + +static inline void xfsb_bitmap_init(struct xfsb_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->fsbitmap); +} + +static inline void xfsb_bitmap_destroy(struct xfsb_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->fsbitmap); +} + +static inline int xfsb_bitmap_set(struct xfsb_bitmap *bitmap, + xfs_fsblock_t start, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->fsbitmap, start, len); +} + +static inline int xfsb_bitmap_walk(struct xfsb_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->fsbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_FSB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 5e2b09ed6e29..531006910ca9 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -10,8 +10,6 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" #include "xfs_ag.h" #include "xfs_health.h" #include "scrub/scrub.h" @@ -118,6 +116,38 @@ xchk_health_mask_for_scrub_type( } /* + * If the scrub state is clean, add @mask to the scrub sick mask to clear + * additional sick flags from the metadata object's sick state. + */ +void +xchk_mark_healthy_if_clean( + struct xfs_scrub *sc, + unsigned int mask) +{ + if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT))) + sc->sick_mask |= mask; +} + +/* + * If we're scrubbing a piece of file metadata for the first time, does it look + * like it has been zapped? Skip the check if we just repaired the metadata + * and are revalidating it. + */ +bool +xchk_file_looks_zapped( + struct xfs_scrub *sc, + unsigned int mask) +{ + ASSERT((mask & ~XFS_SICK_INO_ZAPPED) == 0); + + if (sc->flags & XREP_ALREADY_FIXED) + return false; + + return xfs_inode_has_sickness(sc->ip, mask); +} + +/* * Update filesystem health assessments based on what we found and did. * * If the scrubber finds errors, we mark sick whatever's mentioned in diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index 66a273f8585b..a731b2467399 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -10,5 +10,7 @@ unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); void xchk_update_health(struct xfs_scrub *sc); bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, xfs_btnum_t btnum); +void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask); +bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask); #endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index fb7bbf47ae5d..a720fc62262a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -585,7 +585,7 @@ xchk_iallocbt_rec( uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) { + if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -708,11 +708,10 @@ xchk_iallocbt_xref_rmap_inodes( xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } -/* Scrub the inode btrees for some AG. */ -STATIC int +/* Scrub one of the inode btrees for some AG. 
*/ +int xchk_iallocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xfs_btree_cur *cur; struct xchk_iallocbt iabt = { @@ -720,9 +719,23 @@ xchk_iallocbt( .next_startino = NULLAGINO, .next_cluster_ino = NULLAGINO, }; + xfs_btnum_t which; int error; - cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_INOBT: + cur = sc->sa.ino_cur; + which = XFS_BTNUM_INO; + break; + case XFS_SCRUB_TYPE_FINOBT: + cur = sc->sa.fino_cur; + which = XFS_BTNUM_FINO; + break; + default: + ASSERT(0); + return -EIO; + } + error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT, &iabt); if (error) @@ -743,20 +756,6 @@ xchk_iallocbt( return error; } -int -xchk_inobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_INO); -} - -int -xchk_finobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_FINO); -} - /* See if an inode btree has (or doesn't have) an inode chunk record. */ static inline void xchk_xref_inode_check( diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c new file mode 100644 index 000000000000..b3f7182dd2f5 --- /dev/null +++ b/fs/xfs/scrub/ialloc_repair.c @@ -0,0 +1,884 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Btree Repair + * ================== + * + * A quick refresher of inode btrees on a v5 filesystem: + * + * - Inode records are read into memory in units of 'inode clusters'. However + * many inodes fit in a cluster buffer is the smallest number of inodes that + * can be allocated or freed. Clusters are never smaller than one fs block + * though they can span multiple blocks. The size (in fs blocks) is + * computed with xfs_icluster_size_fsb(). The fs block alignment of a + * cluster is computed with xfs_ialloc_cluster_alignment(). + * + * - Each inode btree record can describe a single 'inode chunk'. The chunk + * size is defined to be 64 inodes. If sparse inodes are enabled, every + * inobt record must be aligned to the chunk size; if not, every record must + * be aligned to the start of a cluster. It is possible to construct an XFS + * geometry where one inobt record maps to multiple inode clusters; it is + * also possible to construct a geometry where multiple inobt records map to + * different parts of one inode cluster. + * + * - If sparse inodes are not enabled, the smallest unit of allocation for + * inode records is enough to contain one inode chunk's worth of inodes. 
+ * + * - If sparse inodes are enabled, the holemask field will be active. Each + * bit of the holemask represents 4 potential inodes; if set, the + * corresponding space does *not* contain inodes and must be left alone. + * Clusters cannot be smaller than 4 inodes. The smallest unit of allocation + * of inode records is one inode cluster. + * + * So what's the rebuild algorithm? + * + * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT + * records. The OWN_INOBT records are the old inode btree blocks and will be + * cleared out after we've rebuilt the tree. Each possible inode cluster + * within an OWN_INODES record will be read in; for each possible inobt record + * associated with that cluster, compute the freemask calculated from the + * i_mode data in the inode chunk. For sparse inodes the holemask will be + * calculated by creating the properly aligned inobt record and punching out + * any chunk that's missing. Inode allocations and frees grab the AGI first, + * so repair protects itself from concurrent access by locking the AGI. + * + * Once we've reconstructed all the inode records, we can create new inode + * btree roots and reload the btrees. We rebuild both inode trees at the same + * time because they have the same rmap owner and it would be more complex to + * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT + * blocks it owns. We have all the data we need to build both, so dump + * everything and start over. + * + * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once. + */ + +struct xrep_ibt { + /* Record under construction. */ + struct xfs_inobt_rec_incore rie; + + /* new inobt information */ + struct xrep_newbt new_inobt; + + /* new finobt information */ + struct xrep_newbt new_finobt; + + /* Old inode btree blocks we found in the rmap. */ + struct xagb_bitmap old_iallocbt_blocks; + + /* Reconstructed inode records. */ + struct xfarray *inode_records; + + struct xfs_scrub *sc; + + /* Number of inodes assigned disk space. */ + unsigned int icount; + + /* Number of inodes in use. */ + unsigned int iused; + + /* Number of finobt records needed. */ + unsigned int finobt_recs; + + /* get_records()'s position in the inode record array. */ + xfarray_idx_t array_cur; +}; + +/* + * Is this inode in use? If the inode is in memory we can tell from i_mode, + * otherwise we have to check di_mode in the on-disk buffer. We only care + * that the high (i.e. non-permission) bits of _mode are zero. This should be + * safe because repair keeps all AG headers locked until the end, and process + * trying to perform an inode allocation/free must lock the AGI. + * + * @cluster_ag_base is the inode offset of the cluster within the AG. + * @cluster_bp is the cluster buffer. + * @cluster_index is the inode offset within the inode cluster. 
+ */ +STATIC int +xrep_ibt_check_ifree( + struct xrep_ibt *ri, + xfs_agino_t cluster_ag_base, + struct xfs_buf *cluster_bp, + unsigned int cluster_index, + bool *inuse) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dinode *dip; + xfs_ino_t fsino; + xfs_agino_t agino; + xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno; + unsigned int cluster_buf_base; + unsigned int offset; + int error; + + agino = cluster_ag_base + cluster_index; + fsino = XFS_AGINO_TO_INO(mp, agno, agino); + + /* Inode uncached or half assembled, read disk buffer */ + cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base); + offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize; + if (offset >= BBTOB(cluster_bp->b_length)) + return -EFSCORRUPTED; + dip = xfs_buf_offset(cluster_bp, offset); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) + return -EFSCORRUPTED; + + if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino) + return -EFSCORRUPTED; + + /* Will the in-core inode tell us if it's in use? */ + error = xchk_inode_is_allocated(sc, agino, inuse); + if (!error) + return 0; + + *inuse = dip->di_mode != 0; + return 0; +} + +/* Stash the accumulated inobt record for rebuilding. */ +STATIC int +xrep_ibt_stash( + struct xrep_ibt *ri) +{ + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie); + if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL) + return -EFSCORRUPTED; + + if (ri->rie.ir_freecount > 0) + ri->finobt_recs++; + + trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie); + + error = xfarray_append(ri->inode_records, &ri->rie); + if (error) + return error; + + ri->rie.ir_startino = NULLAGINO; + return 0; +} + +/* + * Given an extent of inodes and an inode cluster buffer, calculate the + * location of the corresponding inobt record (creating it if necessary), + * then update the parts of the holemask and freemask of that record that + * correspond to the inode extent we were given. + * + * @cluster_ir_startino is the AG inode number of an inobt record that we're + * proposing to create for this inode cluster. If sparse inodes are enabled, + * we must round down to a chunk boundary to find the actual sparse record. + * @cluster_bp is the buffer of the inode cluster. + * @nr_inodes is the number of inodes to check from the cluster. + */ +STATIC int +xrep_ibt_cluster_record( + struct xrep_ibt *ri, + xfs_agino_t cluster_ir_startino, + struct xfs_buf *cluster_bp, + unsigned int nr_inodes) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + xfs_agino_t ir_startino; + unsigned int cluster_base; + unsigned int cluster_index; + int error = 0; + + ir_startino = cluster_ir_startino; + if (xfs_has_sparseinodes(mp)) + ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK); + cluster_base = cluster_ir_startino - ir_startino; + + /* + * If the accumulated inobt record doesn't map this cluster, add it to + * the list and reset it. + */ + if (ri->rie.ir_startino != NULLAGINO && + ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) { + error = xrep_ibt_stash(ri); + if (error) + return error; + } + + if (ri->rie.ir_startino == NULLAGINO) { + ri->rie.ir_startino = ir_startino; + ri->rie.ir_free = XFS_INOBT_ALL_FREE; + ri->rie.ir_holemask = 0xFFFF; + ri->rie.ir_count = 0; + } + + /* Record the whole cluster. 
*/ + ri->icount += nr_inodes; + ri->rie.ir_count += nr_inodes; + ri->rie.ir_holemask &= ~xfs_inobt_maskn( + cluster_base / XFS_INODES_PER_HOLEMASK_BIT, + nr_inodes / XFS_INODES_PER_HOLEMASK_BIT); + + /* Which inodes within this cluster are free? */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + bool inuse = false; + + error = xrep_ibt_check_ifree(ri, cluster_ir_startino, + cluster_bp, cluster_index, &inuse); + if (error) + return error; + if (!inuse) + continue; + ri->iused++; + ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base + + cluster_index); + } + return 0; +} + +/* + * For each inode cluster covering the physical extent recorded by the rmapbt, + * we must calculate the properly aligned startino of that cluster, then + * iterate each cluster to fill in used and filled masks appropriately. We + * then use the (startino, used, filled) information to construct the + * appropriate inode records. + */ +STATIC int +xrep_ibt_process_cluster( + struct xrep_ibt *ri, + xfs_agblock_t cluster_bno) +{ + struct xfs_imap imap; + struct xfs_buf *cluster_bp; + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t cluster_ag_base; + xfs_agino_t irec_index; + unsigned int nr_inodes; + int error; + + nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster, + XFS_INODES_PER_CHUNK); + + /* + * Grab the inode cluster buffer. This is safe to do with a broken + * inobt because imap_to_bp directly maps the buffer without touching + * either inode btree. + */ + imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno); + imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); + imap.im_boffset = 0; + error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp); + if (error) + return error; + + /* + * Record the contents of each possible inobt record mapping this + * cluster. + */ + cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno); + for (irec_index = 0; + irec_index < igeo->inodes_per_cluster; + irec_index += XFS_INODES_PER_CHUNK) { + error = xrep_ibt_cluster_record(ri, + cluster_ag_base + irec_index, cluster_bp, + nr_inodes); + if (error) + break; + + } + + xfs_trans_brelse(sc->tp, cluster_bp); + return error; +} + +/* Check for any obvious conflicts in the inode chunk extent. */ +STATIC int +xrep_ibt_check_inode_ext( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t agino; + enum xbtree_recpacking outcome; + int error; + + /* Inode records must be within the AG. */ + if (!xfs_verify_agbext(sc->sa.pag, agbno, len)) + return -EFSCORRUPTED; + + /* The entire record must align to the inode cluster size. */ + if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) || + !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster)) + return -EFSCORRUPTED; + + /* + * The entire record must also adhere to the inode cluster alignment + * size if sparse inodes are not enabled. + */ + if (!xfs_has_sparseinodes(mp) && + (!IS_ALIGNED(agbno, igeo->cluster_align) || + !IS_ALIGNED(agbno + len, igeo->cluster_align))) + return -EFSCORRUPTED; + + /* + * On a sparse inode fs, this cluster could be part of a sparse chunk. + * Sparse clusters must be aligned to sparse chunk alignment. + */ + if (xfs_has_sparseinodes(mp) && + (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) || + !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align))) + return -EFSCORRUPTED; + + /* Make sure the entire range of blocks are valid AG inodes. 
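The next statements convert an AG block number into the first AG inode number of that block. A minimal sketch of that conversion, assuming 4096-byte blocks and 512-byte inodes (8 inodes per block, so sb_inopblog == 3):

    #include <stdint.h>

    #define EX_INOPBLOG 3   /* assumed log2(inodes per block) */

    /* Mirrors the AG-block to AG-inode conversion used just below. */
    static inline uint32_t ex_agb_to_agino(uint32_t agbno)
    {
        return agbno << EX_INOPBLOG;
    }

    /* ex_agb_to_agino(100) == 800: AG block 100 holds AG inodes 800..807. */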
*/ + agino = XFS_AGB_TO_AGINO(mp, agbno); + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1; + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Found a fragment of the old inode btrees; dispose of them later. */ +STATIC int +xrep_ibt_record_old_btree_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Record extents that belong to inode cluster blocks. */ +STATIC int +xrep_ibt_record_inode_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + struct xfs_mount *mp = ri->sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t cluster_base; + int error; + + error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock, + rec->rm_blockcount); + if (error) + return error; + + trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno, + rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, + rec->rm_offset, rec->rm_flags); + + /* + * Record the free/hole masks for each inode cluster that could be + * mapped by this rmap record. + */ + for (cluster_base = 0; + cluster_base < rec->rm_blockcount; + cluster_base += igeo->blocks_per_cluster) { + error = xrep_ibt_process_cluster(ri, + rec->rm_startblock + cluster_base); + if (error) + return error; + } + + return 0; +} + +STATIC int +xrep_ibt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_ibt *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + switch (rec->rm_owner) { + case XFS_RMAP_OWN_INOBT: + return xrep_ibt_record_old_btree_blocks(ri, rec); + case XFS_RMAP_OWN_INODES: + return xrep_ibt_record_inode_blocks(ri, rec); + } + return 0; +} + +/* + * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode + * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct + * the inode btrees. The caller must clean up the lists if anything goes + * wrong. + */ +STATIC int +xrep_ibt_find_inodes( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + ri->rie.ir_startino = NULLAGINO; + + /* Collect all reverse mappings for inode blocks. */ + xrep_ag_btcur_init(sc, &sc->sa); + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri); + xchk_ag_btcur_free(&sc->sa); + if (error) + return error; + + /* If we have a record ready to go, add it to the array. */ + if (ri->rie.ir_startino != NULLAGINO) + return xrep_ibt_stash(ri); + + return 0; +} + +/* Update the AGI counters. */ +STATIC int +xrep_ibt_reset_counters( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + unsigned int freecount = ri->icount - ri->iused; + + /* Trigger inode count recalculation */ + xfs_force_summary_recalc(sc->mp); + + /* + * The AGI header contains extra information related to the inode + * btrees, so we must update those fields here. 
+ */ + agi->agi_count = cpu_to_be32(ri->icount); + agi->agi_freecount = cpu_to_be32(freecount); + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, + XFS_AGI_COUNT | XFS_AGI_FREECOUNT); + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagi(sc); +} + +/* Retrieve finobt data for bulk load. */ +STATIC int +xrep_fibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(ri->inode_records, + ri->array_cur++, irec); + } while (error == 0 && xfs_inobt_rec_freecount(irec) == 0); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Retrieve inobt data for bulk load. */ +STATIC int +xrep_ibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(ri->inode_records, ri->array_cur++, irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new inobt blocks to the bulk loader. */ +STATIC int +xrep_ibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr); +} + +/* Feed one of the new finobt blocks to the bulk loader. */ +STATIC int +xrep_fibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr); +} + +/* Make sure the records do not overlap in inumber address space. */ +STATIC int +xrep_ibt_check_overlap( + struct xrep_ibt *ri) +{ + struct xfs_inobt_rec_incore irec; + xfarray_idx_t cur; + xfs_agino_t next_agino = 0; + int error = 0; + + foreach_xfarray_idx(ri->inode_records, cur) { + if (xchk_should_terminate(ri->sc, &error)) + return error; + + error = xfarray_load(ri->inode_records, cur, &irec); + if (error) + return error; + + if (irec.ir_startino < next_agino) + return -EFSCORRUPTED; + + next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK; + } + + return error; +} + +/* Build new inode btrees and dispose of the old one. */ +STATIC int +xrep_ibt_build_new_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_btree_cur *ino_cur; + struct xfs_btree_cur *fino_cur = NULL; + xfs_fsblock_t fsbno; + bool need_finobt; + int error; + + need_finobt = xfs_has_finobt(sc->mp); + + /* + * Create new btrees for staging all the inobt records we collected + * earlier. The records were collected in order of increasing agino, + * so we do not have to sort them. Ensure there are no overlapping + * records. + */ + error = xrep_ibt_check_overlap(ri); + if (error) + return error; + + /* + * The new inode btrees will not be rooted in the AGI until we've + * successfully rebuilt the tree. + * + * Start by setting up the inobt staging cursor. 
+ */ + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_IBT_BLOCK(sc->mp)), + xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno, + XFS_AG_RESV_NONE); + ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; + ri->new_inobt.bload.get_records = xrep_ibt_get_records; + + ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake, + XFS_BTNUM_INO); + error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload, + xfarray_length(ri->inode_records)); + if (error) + goto err_inocur; + + /* Set up finobt staging cursor. */ + if (need_finobt) { + enum xfs_ag_resv_type resv = XFS_AG_RESV_METADATA; + + if (sc->mp->m_finobt_nores) + resv = XFS_AG_RESV_NONE; + + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_FIBT_BLOCK(sc->mp)), + xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT, + fsbno, resv); + ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; + ri->new_finobt.bload.get_records = xrep_fibt_get_records; + + fino_cur = xfs_inobt_stage_cursor(sc->sa.pag, + &ri->new_finobt.afake, XFS_BTNUM_FINO); + error = xfs_btree_bload_compute_geometry(fino_cur, + &ri->new_finobt.bload, ri->finobt_recs); + if (error) + goto err_finocur; + } + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_finocur; + + /* Reserve all the space we need to build the new btrees. */ + error = xrep_newbt_alloc_blocks(&ri->new_inobt, + ri->new_inobt.bload.nr_blocks); + if (error) + goto err_finocur; + + if (need_finobt) { + error = xrep_newbt_alloc_blocks(&ri->new_finobt, + ri->new_finobt.bload.nr_blocks); + if (error) + goto err_finocur; + } + + /* Add all inobt records. */ + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri); + if (error) + goto err_finocur; + + /* Add all finobt records. */ + if (need_finobt) { + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri); + if (error) + goto err_finocur; + } + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(ino_cur, 0); + + if (fino_cur) { + xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(fino_cur, 0); + } + + /* Reset the AGI counters now that we've changed the inode roots. */ + error = xrep_ibt_reset_counters(ri); + if (error) + goto err_finobt; + + /* Free unused blocks and bitmap. */ + if (need_finobt) { + error = xrep_newbt_commit(&ri->new_finobt); + if (error) + goto err_inobt; + } + error = xrep_newbt_commit(&ri->new_inobt); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_finocur: + if (need_finobt) + xfs_btree_del_cursor(fino_cur, error); +err_inocur: + xfs_btree_del_cursor(ino_cur, error); +err_finobt: + if (need_finobt) + xrep_newbt_cancel(&ri->new_finobt); +err_inobt: + xrep_newbt_cancel(&ri->new_inobt); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_ibt_remove_old_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + /* + * Free the old inode btree blocks if they're not in use. It's ok to + * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG + * reservation because we reset the reservation before releasing the + * AGI and AGF header buffer locks. 
+ */ + error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks, + &XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE); + if (error) + return error; + + /* + * If the finobt is enabled and has a per-AG reservation, make sure we + * reinitialize the per-AG reservations. + */ + if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores) + sc->flags |= XREP_RESET_PERAG_RESV; + + return 0; +} + +/* Repair both inode btrees. */ +int +xrep_iallocbt( + struct xfs_scrub *sc) +{ + struct xrep_ibt *ri; + struct xfs_mount *mp = sc->mp; + char *descr; + xfs_agino_t first_agino, last_agino; + int error = 0; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS); + if (!ri) + return -ENOMEM; + ri->sc = sc; + + /* We rebuild both inode btrees. */ + sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT; + + /* Set up enough storage to handle an AG with nothing but inodes. */ + xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino); + last_agino /= XFS_INODES_PER_CHUNK; + descr = xchk_xfile_ag_descr(sc, "inode index records"); + error = xfarray_create(descr, last_agino, + sizeof(struct xfs_inobt_rec_incore), + &ri->inode_records); + kfree(descr); + if (error) + goto out_ri; + + /* Collect the inode data and find the old btree blocks. */ + xagb_bitmap_init(&ri->old_iallocbt_blocks); + error = xrep_ibt_find_inodes(ri); + if (error) + goto out_bitmap; + + /* Rebuild the inode indexes. */ + error = xrep_ibt_build_new_trees(ri); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_ibt_remove_old_trees(ri); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ri->old_iallocbt_blocks); + xfarray_destroy(ri->inode_records); +out_ri: + kfree(ri); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_iallocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT; + error = xchk_iallocbt(sc); + if (error) + goto out; + + if (xfs_has_finobt(sc->mp)) { + sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT; + error = xchk_iallocbt(sc); + } + +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 889f556bc98f..6e2fe2d6250b 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -25,6 +25,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "scrub/repair.h" /* Prepare the attached inode for scrubbing. */ static inline int @@ -39,6 +40,10 @@ xchk_prepare_iscrub( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } @@ -95,8 +100,8 @@ xchk_setup_inode( if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; - /* Try a regular untrusted iget. */ - error = xchk_iget(sc, sc->sm->sm_ino, &ip); + /* Try a safe untrusted iget. */ + error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); if (!error) return xchk_install_handle_iscrub(sc, ip); if (error == -ENOENT) @@ -181,8 +186,11 @@ xchk_setup_inode( * saying the inode is allocated and the icache being unable to load * the inode until we can flag the corruption in xchk_inode. 
The * scrub function has to note the corruption, since we're not really - * supposed to do that from the setup function. + * supposed to do that from the setup function. Save the mapping to + * make repairs to the ondisk inode buffer. */ + if (xchk_could_repair(sc)) + xrep_setup_inode(sc, &imap); return 0; out_cancel: @@ -338,6 +346,10 @@ xchk_inode_flags2( if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) goto bad; + /* no large extent counts without the filesystem feature */ + if ((flags2 & XFS_DIFLAG2_NREXT64) && !xfs_has_large_extent_counts(mp)) + goto bad; + return; bad: xchk_ino_set_corrupt(sc, ino); @@ -548,7 +560,7 @@ xchk_dinode( } /* di_forkoff */ - if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) + if (XFS_DFORK_BOFF(dip) >= mp->m_sb.sb_inodesize) xchk_ino_set_corrupt(sc, ino); if (naextents != 0 && dip->di_forkoff == 0) xchk_ino_set_corrupt(sc, ino); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c new file mode 100644 index 000000000000..0ca62d59f84a --- /dev/null +++ b/fs/xfs/scrub/inode_repair.c @@ -0,0 +1,1525 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" +#include "xfs_ialloc.h" +#include "xfs_da_format.h" +#include "xfs_reflink.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_quota_defs.h" +#include "xfs_quota.h" +#include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_attr_leaf.h" +#include "xfs_log_priv.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Inode Record Repair + * =================== + * + * Roughly speaking, inode problems can be classified based on whether or not + * they trip the dinode verifiers. If those trip, then we won't be able to + * xfs_iget ourselves the inode. + * + * Therefore, the xrep_dinode_* functions fix anything that will cause the + * inode buffer verifier or the dinode verifier. The xrep_inode_* functions + * fix things on live incore inodes. The inode repair functions make decisions + * with security and usability implications when reviving a file: + * + * - Files with zero di_mode or a garbage di_mode are converted to regular file + * that only root can read. This file may not actually contain user data, + * if the file was not previously a regular file. Setuid and setgid bits + * are cleared. + * + * - Zero-size directories can be truncated to look empty. It is necessary to + * run the bmapbtd and directory repair functions to fully rebuild the + * directory. + * + * - Zero-size symbolic link targets can be truncated to '?'. It is necessary + * to run the bmapbtd and symlink repair functions to salvage the symlink. + * + * - Invalid extent size hints will be removed. 
+ * + * - Quotacheck will be scheduled if we repaired an inode that was so badly + * damaged that the ondisk inode had to be rebuilt. + * + * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. + * Setuid and setgid bits are cleared. + * + * - Data and attr forks are reset to extents format with zero extents if the + * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta + * repair functions to recover the space mapping. + * + * - ACLs will not be recovered if the attr fork is zapped or the extended + * attribute structure itself requires salvaging. + * + * - If the attr fork is zapped, the user and group ids are reset to root and + * the setuid and setgid bits are removed. + */ + +/* + * All the information we need to repair the ondisk inode if we can't iget the + * incore inode. We don't allocate this buffer unless we're going to perform + * a repair to the ondisk inode cluster buffer. + */ +struct xrep_inode { + /* Inode mapping that we saved from the initial lookup attempt. */ + struct xfs_imap imap; + + struct xfs_scrub *sc; + + /* Blocks in use on the data device by data extents or bmbt blocks. */ + xfs_rfsblock_t data_blocks; + + /* Blocks in use on the rt device. */ + xfs_rfsblock_t rt_blocks; + + /* Blocks in use by the attr fork. */ + xfs_rfsblock_t attr_blocks; + + /* Number of data device extents for the data fork. */ + xfs_extnum_t data_extents; + + /* + * Number of realtime device extents for the data fork. If + * data_extents and rt_extents indicate that the data fork has extents + * on both devices, we'll just back away slowly. + */ + xfs_extnum_t rt_extents; + + /* Number of (data device) extents for the attr fork. */ + xfs_aextnum_t attr_extents; + + /* Sick state to set after zapping parts of the inode. */ + unsigned int ino_sick_mask; + + /* Must we remove all access from this file? */ + bool zap_acls; +}; + +/* + * Setup function for inode repair. @imap contains the ondisk inode mapping + * information so that we can correct the ondisk inode cluster buffer if + * necessary to make iget work. + */ +int +xrep_setup_inode( + struct xfs_scrub *sc, + const struct xfs_imap *imap) +{ + struct xrep_inode *ri; + + sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + ri = sc->buf; + memcpy(&ri->imap, imap, sizeof(struct xfs_imap)); + ri->sc = sc; + return 0; +} + +/* + * Make sure this ondisk inode can pass the inode buffer verifier. This is + * not the same as the dinode verifier. 
+ */ +STATIC void +xrep_dinode_buf_core( + struct xfs_scrub *sc, + struct xfs_buf *bp, + unsigned int ioffset) +{ + struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset); + struct xfs_trans *tp = sc->tp; + struct xfs_mount *mp = sc->mp; + xfs_agino_t agino; + bool crc_ok = false; + bool magic_ok = false; + bool unlinked_ok = false; + + agino = be32_to_cpu(dip->di_next_unlinked); + + if (xfs_verify_agino_or_null(bp->b_pag, agino)) + unlinked_ok = true; + + if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + xfs_dinode_good_version(mp, dip->di_version)) + magic_ok = true; + + if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF)) + crc_ok = true; + + if (magic_ok && unlinked_ok && crc_ok) + return; + + if (!magic_ok) { + dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + dip->di_version = 3; + } + if (!unlinked_ok) + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + xfs_dinode_calc_crc(mp, dip); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); + xfs_trans_log_buf(tp, bp, ioffset, + ioffset + sizeof(struct xfs_dinode) - 1); +} + +/* Make sure this inode cluster buffer can pass the inode buffer verifier. */ +STATIC void +xrep_dinode_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = sc->mp; + int i; + int ni; + + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) + xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog); +} + +/* Reinitialize things that never change in an inode. */ +STATIC void +xrep_dinode_header( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + trace_xrep_dinode_header(sc, dip); + + dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + if (!xfs_dinode_good_version(sc->mp, dip->di_version)) + dip->di_version = 3; + dip->di_ino = cpu_to_be64(sc->sm->sm_ino); + uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid); + dip->di_gen = cpu_to_be32(sc->sm->sm_gen); +} + +/* Turn di_mode into /something/ recognizable. */ +STATIC void +xrep_dinode_mode( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_mode(sc, dip); + + if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) + return; + + /* bad mode, so we set it to a file that only root can read */ + mode = S_IFREG; + dip->di_mode = cpu_to_be16(mode); + dip->di_uid = 0; + dip->di_gid = 0; + ri->zap_acls = true; +} + +/* Fix any conflicting flags that the verifiers complain about. */ +STATIC void +xrep_dinode_flags( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + bool isrt) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_flags(sc, dip); + + if (isrt) + flags |= XFS_DIFLAG_REALTIME; + else + flags &= ~XFS_DIFLAG_REALTIME; + + /* + * For regular files on a reflink filesystem, set the REFLINK flag to + * protect shared extents. A later stage will actually check those + * extents and clear the flag if possible. 
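Stepping back to xrep_dinode_mode() above, a minimal sketch of its policy for unrecognizable modes; the helper below is illustrative only (the real function also clears the owner IDs and marks the ACLs for removal):

    #include <stdint.h>
    #include <sys/stat.h>

    /* Illustrative only: collapse an unrecognizable di_mode to S_IFREG. */
    static inline uint16_t ex_fix_mode(uint16_t mode)
    {
        if (mode == 0)
            return 0;           /* a freed inode keeps its zero mode */

        switch (mode & S_IFMT) {
        case S_IFIFO:
        case S_IFCHR:
        case S_IFDIR:
        case S_IFBLK:
        case S_IFREG:
        case S_IFLNK:
        case S_IFSOCK:
            return mode;        /* known file type, leave it alone */
        default:
            return S_IFREG;     /* e.g. 0xf1ff becomes 0100000: only root can read it */
        }
    }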
+ */ + if (xfs_has_reflink(mp) && S_ISREG(mode)) + flags2 |= XFS_DIFLAG2_REFLINK; + else + flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); + if (flags & XFS_DIFLAG_REALTIME) + flags2 &= ~XFS_DIFLAG2_REFLINK; + if (!xfs_has_bigtime(mp)) + flags2 &= ~XFS_DIFLAG2_BIGTIME; + if (!xfs_has_large_extent_counts(mp)) + flags2 &= ~XFS_DIFLAG2_NREXT64; + if (flags2 & XFS_DIFLAG2_NREXT64) + dip->di_nrext64_pad = 0; + else if (dip->di_version >= 3) + dip->di_v3_pad = 0; + dip->di_flags = cpu_to_be16(flags); + dip->di_flags2 = cpu_to_be64(flags2); +} + +/* + * Blow out symlink; now it points nowhere. We don't have to worry about + * incore state because this inode is failing the verifiers. + */ +STATIC void +xrep_dinode_zap_symlink( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + char *p; + + trace_xrep_dinode_zap_symlink(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + dip->di_size = cpu_to_be64(1); + p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + *p = '?'; + ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; +} + +/* + * Blow out dir, make the parent point to the root. In the future repair will + * reconstruct this directory for us. Note that there's no in-core directory + * inode because the sf verifier tripped, so we don't have to worry about the + * dentry cache. + */ +STATIC void +xrep_dinode_zap_dir( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dir2_sf_hdr *sfp; + int i8count; + + trace_xrep_dinode_zap_dir(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; + sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + sfp->count = 0; + sfp->i8count = i8count; + xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); + dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); + ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; +} + +/* Make sure we don't have a garbage file size. */ +STATIC void +xrep_dinode_size( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + uint64_t size = be64_to_cpu(dip->di_size); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_size(sc, dip); + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + /* di_size can't be nonzero for special files */ + dip->di_size = 0; + break; + case S_IFREG: + /* Regular files can't be larger than 2^63-1 bytes. */ + dip->di_size = cpu_to_be64(size & ~(1ULL << 63)); + break; + case S_IFLNK: + /* + * Truncate ridiculously oversized symlinks. If the size is + * zero, reset it to point to the current directory. Both of + * these conditions trigger dinode verifier errors, so there + * is no in-core state to reset. + */ + if (size > XFS_SYMLINK_MAXLEN) + dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN); + else if (size == 0) + xrep_dinode_zap_symlink(ri, dip); + break; + case S_IFDIR: + /* + * Directories can't have a size larger than 32G. If the size + * is zero, reset it to an empty directory. Both of these + * conditions trigger dinode verifier errors, so there is no + * in-core state to reset. + */ + if (size > XFS_DIR2_SPACE_SIZE) + dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE); + else if (size == 0) + xrep_dinode_zap_dir(ri, dip); + break; + } +} + +/* Fix extent size hints. 
*/ +STATIC void +xrep_dinode_extsize_hints( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + xfs_failaddr_t fa; + + trace_xrep_dinode_extsize_hints(sc, dip); + + fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), + mode, flags); + if (fa) { + dip->di_extsize = 0; + dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + } + + if (dip->di_version < 3) + return; + + fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) { + dip->di_cowextsize = 0; + dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE); + } +} + +/* Count extents and blocks for an inode given an rmap. */ +STATIC int +xrep_dinode_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + /* We only care about this inode. */ + if (rec->rm_owner != ri->sc->sm->sm_ino) + return 0; + + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) { + ri->attr_blocks += rec->rm_blockcount; + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + ri->attr_extents++; + + return 0; + } + + ri->data_blocks += rec->rm_blockcount; + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + ri->data_extents++; + + return 0; +} + +/* Count extents and blocks for an inode from all AG rmap data. */ +STATIC int +xrep_dinode_count_ag_rmaps( + struct xrep_inode *ri, + struct xfs_perag *pag) +{ + struct xfs_btree_cur *cur; + struct xfs_buf *agf; + int error; + + error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf); + if (error) + return error; + + cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag); + error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri); + xfs_btree_del_cursor(cur, error); + xfs_trans_brelse(ri->sc->tp, agf); + return error; +} + +/* Count extents and blocks for a given inode from all rmap data. */ +STATIC int +xrep_dinode_count_rmaps( + struct xrep_inode *ri) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error; + + if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) + return -EOPNOTSUPP; + + for_each_perag(ri->sc->mp, agno, pag) { + error = xrep_dinode_count_ag_rmaps(ri, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + /* Can't have extents on both the rt and the data device. */ + if (ri->data_extents && ri->rt_extents) + return -EFSCORRUPTED; + + trace_xrep_dinode_count_rmaps(ri->sc, + ri->data_blocks, ri->rt_blocks, ri->attr_blocks, + ri->data_extents, ri->rt_extents, ri->attr_extents); + return 0; +} + +/* Return true if this extents-format ifork looks like garbage. 
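A minimal sketch of the capacity check that opens the function below: an extents-format fork is garbage unless all of its records fit in the fork area. The 16-byte record size and the roughly 336-byte literal area of a default 512-byte V5 inode are assumptions used only for the example:

    #include <stdbool.h>
    #include <stdint.h>

    #define EX_BMBT_REC_BYTES 16    /* assumed sizeof(struct xfs_bmbt_rec) */

    /* An extents-format fork can hold at most fork_bytes / 16 records. */
    static inline bool ex_nextents_fit(uint64_t nextents, unsigned int fork_bytes)
    {
        return nextents <= fork_bytes / EX_BMBT_REC_BYTES;
    }

    /*
     * With ~336 bytes of literal area, at most 21 records fit:
     * ex_nextents_fit(20, 336) is true, ex_nextents_fit(5000, 336) is false.
     */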
*/ +STATIC bool +xrep_dinode_bad_extents_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmbt_irec new; + struct xfs_bmbt_rec *dp; + xfs_extnum_t nex; + bool isrt; + unsigned int i; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + dp = XFS_DFORK_PTR(dip, whichfork); + + isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); + for (i = 0; i < nex; i++, dp++) { + xfs_failaddr_t fa; + + xfs_bmbt_disk_get_all(dp, &new); + fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, + &new); + if (fa) + return true; + } + + return false; +} + +/* Return true if this btree-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_bmbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmdr_block *dfp; + xfs_extnum_t nex; + unsigned int i; + unsigned int dmxr; + unsigned int nrecs; + unsigned int level; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + if (dfork_size < sizeof(struct xfs_bmdr_block)) + return true; + + dfp = XFS_DFORK_PTR(dip, whichfork); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size) + return true; + if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) + return true; + + dmxr = xfs_bmdr_maxrecs(dfork_size, 0); + for (i = 1; i <= nrecs; i++) { + struct xfs_bmbt_key *fkp; + xfs_bmbt_ptr_t *fpp; + xfs_fileoff_t fileoff; + xfs_fsblock_t fsbno; + + fkp = XFS_BMDR_KEY_ADDR(dfp, i); + fileoff = be64_to_cpu(fkp->br_startoff); + if (!xfs_verify_fileoff(sc->mp, fileoff)) + return true; + + fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr); + fsbno = be64_to_cpu(*fpp); + if (!xfs_verify_fsbno(sc->mp, fsbno)) + return true; + } + + return false; +} + +/* + * Check the data fork for things that will fail the ifork verifiers or the + * ifork formatters. + */ +STATIC bool +xrep_dinode_check_dfork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + uint16_t mode) +{ + void *dfork_ptr; + int64_t data_size; + unsigned int fmt; + unsigned int dfork_size; + + /* + * Verifier functions take signed int64_t, so check for bogus negative + * values first. + */ + data_size = be64_to_cpu(dip->di_size); + if (data_size < 0) + return true; + + fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK); + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (fmt != XFS_DINODE_FMT_DEV) + return true; + break; + case S_IFREG: + if (fmt == XFS_DINODE_FMT_LOCAL) + return true; + fallthrough; + case S_IFLNK: + case S_IFDIR: + switch (fmt) { + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return true; + } + break; + default: + return true; + } + + dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK); + dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + switch (fmt) { + case XFS_DINODE_FMT_DEV: + break; + case XFS_DINODE_FMT_LOCAL: + /* dir/symlink structure cannot be larger than the fork */ + if (data_size > dfork_size) + return true; + /* directory structure must pass verification. */ + if (S_ISDIR(mode) && + xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL) + return true; + /* symlink structure must pass verification. 
*/ + if (S_ISLNK(mode) && + xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL) + return true; + break; + case XFS_DINODE_FMT_EXTENTS: + if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; + case XFS_DINODE_FMT_BTREE: + if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; + default: + return true; + } + + return false; +} + +static void +xrep_dinode_set_data_nextents( + struct xfs_dinode *dip, + xfs_extnum_t nextents) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + dip->di_big_nextents = cpu_to_be64(nextents); + else + dip->di_nextents = cpu_to_be32(nextents); +} + +static void +xrep_dinode_set_attr_nextents( + struct xfs_dinode *dip, + xfs_extnum_t nextents) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + dip->di_big_anextents = cpu_to_be32(nextents); + else + dip->di_anextents = cpu_to_be16(nextents); +} + +/* Reset the data fork to something sane. */ +STATIC void +xrep_dinode_zap_dfork( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_scrub *sc = ri->sc; + + trace_xrep_dinode_zap_dfork(sc, dip); + + ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED; + + xrep_dinode_set_data_nextents(dip, 0); + ri->data_blocks = 0; + ri->rt_blocks = 0; + + /* Special files always get reset to DEV */ + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + dip->di_format = XFS_DINODE_FMT_DEV; + dip->di_size = 0; + return; + } + + /* + * If we have data extents, reset to an empty map and hope the user + * will run the bmapbtd checker next. + */ + if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) { + dip->di_format = XFS_DINODE_FMT_EXTENTS; + return; + } + + /* Otherwise, reset the local format to the minimum. */ + switch (mode & S_IFMT) { + case S_IFLNK: + xrep_dinode_zap_symlink(ri, dip); + break; + case S_IFDIR: + xrep_dinode_zap_dir(ri, dip); + break; + } +} + +/* + * Check the attr fork for things that will fail the ifork verifiers or the + * ifork formatters. + */ +STATIC bool +xrep_dinode_check_afork( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_attr_sf_hdr *afork_ptr; + size_t attr_size; + unsigned int afork_size; + + if (XFS_DFORK_BOFF(dip) == 0) + return dip->di_aformat != XFS_DINODE_FMT_EXTENTS || + xfs_dfork_attr_extents(dip) != 0; + + afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); + afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); + + switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) { + case XFS_DINODE_FMT_LOCAL: + /* Fork has to be large enough to extract the xattr size. */ + if (afork_size < sizeof(struct xfs_attr_sf_hdr)) + return true; + + /* xattr structure cannot be larger than the fork */ + attr_size = be16_to_cpu(afork_ptr->totsize); + if (attr_size > afork_size) + return true; + + /* xattr structure must pass verification. */ + return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL; + case XFS_DINODE_FMT_EXTENTS: + if (xrep_dinode_bad_extents_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; + case XFS_DINODE_FMT_BTREE: + if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; + default: + return true; + } + + return false; +} + +/* + * Reset the attr fork to empty. Since the attr fork could have contained + * ACLs, make the file readable only by root. 
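A minimal sketch of the permission stripping applied below when the attr fork is zapped; "& ~0777" keeps the file type bits and drops every owner/group/other permission bit, so only root can open the file afterwards:

    /* Mirrors "mode & ~0777": 0100644 (rw-r--r-- regular file) becomes 0100000. */
    static inline unsigned short ex_strip_perms(unsigned short mode)
    {
        return mode & ~0777;
    }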
+ */ +STATIC void +xrep_dinode_zap_afork( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_scrub *sc = ri->sc; + + trace_xrep_dinode_zap_afork(sc, dip); + + ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED; + + dip->di_aformat = XFS_DINODE_FMT_EXTENTS; + xrep_dinode_set_attr_nextents(dip, 0); + ri->attr_blocks = 0; + + /* + * If the data fork is in btree format, removing the attr fork entirely + * might cause verifier failures if the next level down in the bmbt + * could now fit in the data fork area. + */ + if (dip->di_format != XFS_DINODE_FMT_BTREE) + dip->di_forkoff = 0; + dip->di_mode = cpu_to_be16(mode & ~0777); + dip->di_uid = 0; + dip->di_gid = 0; +} + +/* Make sure the fork offset is a sensible value. */ +STATIC void +xrep_dinode_ensure_forkoff( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_bmdr_block *bmdr; + struct xfs_scrub *sc = ri->sc; + xfs_extnum_t attr_extents, data_extents; + size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1); + unsigned int lit_sz = XFS_LITINO(sc->mp); + unsigned int afork_min, dfork_min; + + trace_xrep_dinode_ensure_forkoff(sc, dip); + + /* + * Before calling this function, xrep_dinode_core ensured that both + * forks actually fit inside their respective literal areas. If this + * was not the case, the fork was reset to FMT_EXTENTS with zero + * records. If the rmapbt scan found attr or data fork blocks, this + * will be noted in the dinode_stats, and we must leave enough room + * for the bmap repair code to reconstruct the mapping structure. + * + * First, compute the minimum space required for the attr fork. + */ + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + /* + * If we still have a shortform xattr structure at all, that + * means the attr fork area was exactly large enough to fit + * the sf structure. + */ + afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_EXTENTS: + attr_extents = xfs_dfork_attr_extents(dip); + if (attr_extents) { + /* + * We must maintain sufficient space to hold the entire + * extent map array in the data fork. Note that we + * previously zapped the fork if it had no chance of + * fitting in the inode. + */ + afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents; + } else if (ri->attr_extents > 0) { + /* + * The attr fork thinks it has zero extents, but we + * found some xattr extents. We need to leave enough + * empty space here so that the incore attr fork will + * get created (and hence trigger the attr fork bmap + * repairer). + */ + afork_min = bmdr_minsz; + } else { + /* No extents on disk or found in rmapbt. */ + afork_min = 0; + } + break; + case XFS_DINODE_FMT_BTREE: + /* Must have space for btree header and key/pointers. */ + bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); + afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + break; + default: + /* We should never see any other formats. */ + afork_min = 0; + break; + } + + /* Compute the minimum space required for the data fork. */ + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + dfork_min = sizeof(__be32); + break; + case XFS_DINODE_FMT_UUID: + dfork_min = sizeof(uuid_t); + break; + case XFS_DINODE_FMT_LOCAL: + /* + * If we still have a shortform data fork at all, that means + * the data fork area was large enough to fit whatever was in + * there. 
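The roundups to 8 bytes and the ">> 3" shifts in the rest of this function exist because di_forkoff is stored in 8-byte units. A minimal sketch of the conversion, with the 20-byte one-record bmbt stub size given only as an illustrative assumption:

    #include <stdint.h>

    /* di_forkoff counts 8-byte units from the start of the literal area. */
    static inline uint8_t ex_bytes_to_forkoff(unsigned int bytes)
    {
        return (uint8_t)(((bytes + 7) & ~7u) >> 3);
    }

    /*
     * ex_bytes_to_forkoff(20) == 3: reserving a rounded-up 24 bytes for the
     * data fork puts the attr fork 24 bytes into the literal area.
     */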
+ */ + dfork_min = be64_to_cpu(dip->di_size); + break; + case XFS_DINODE_FMT_EXTENTS: + data_extents = xfs_dfork_data_extents(dip); + if (data_extents) { + /* + * We must maintain sufficient space to hold the entire + * extent map array in the data fork. Note that we + * previously zapped the fork if it had no chance of + * fitting in the inode. + */ + dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents; + } else if (ri->data_extents > 0 || ri->rt_extents > 0) { + /* + * The data fork thinks it has zero extents, but we + * found some data extents. We need to leave enough + * empty space here so that the data fork bmap repair + * will recover the mappings. + */ + dfork_min = bmdr_minsz; + } else { + /* No extents on disk or found in rmapbt. */ + dfork_min = 0; + } + break; + case XFS_DINODE_FMT_BTREE: + /* Must have space for btree header and key/pointers. */ + bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + break; + default: + dfork_min = 0; + break; + } + + /* + * Round all values up to the nearest 8 bytes, because that is the + * precision of di_forkoff. + */ + afork_min = roundup(afork_min, 8); + dfork_min = roundup(dfork_min, 8); + bmdr_minsz = roundup(bmdr_minsz, 8); + + ASSERT(dfork_min <= lit_sz); + ASSERT(afork_min <= lit_sz); + + /* + * If the data fork was zapped and we don't have enough space for the + * recovery fork, move the attr fork up. + */ + if (dip->di_format == XFS_DINODE_FMT_EXTENTS && + xfs_dfork_data_extents(dip) == 0 && + (ri->data_extents > 0 || ri->rt_extents > 0) && + bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) { + if (bmdr_minsz + afork_min > lit_sz) { + /* + * The attr for and the stub fork we need to recover + * the data fork won't both fit. Zap the attr fork. + */ + xrep_dinode_zap_afork(ri, dip, mode); + afork_min = bmdr_minsz; + } else { + void *before, *after; + + /* Otherwise, just slide the attr fork up. */ + before = XFS_DFORK_APTR(dip); + dip->di_forkoff = bmdr_minsz >> 3; + after = XFS_DFORK_APTR(dip); + memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp)); + } + } + + /* + * If the attr fork was zapped and we don't have enough space for the + * recovery fork, move the attr fork down. + */ + if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS && + xfs_dfork_attr_extents(dip) == 0 && + ri->attr_extents > 0 && + bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) { + if (dip->di_format == XFS_DINODE_FMT_BTREE) { + /* + * If the data fork is in btree format then we can't + * adjust forkoff because that runs the risk of + * violating the extents/btree format transition rules. + */ + } else if (bmdr_minsz + dfork_min > lit_sz) { + /* + * If we can't move the attr fork, too bad, we lose the + * attr fork and leak its blocks. + */ + xrep_dinode_zap_afork(ri, dip, mode); + } else { + /* + * Otherwise, just slide the attr fork down. The attr + * fork is empty, so we don't have any old contents to + * move here. + */ + dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3; + } + } +} + +/* + * Zap the data/attr forks if we spot anything that isn't going to pass the + * ifork verifiers or the ifork formatters, because we need to get the inode + * into good enough shape that the higher level repair functions can run. 
+ */ +STATIC void +xrep_dinode_zap_forks( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + xfs_extnum_t data_extents; + xfs_extnum_t attr_extents; + xfs_filblks_t nblocks; + uint16_t mode; + bool zap_datafork = false; + bool zap_attrfork = ri->zap_acls; + + trace_xrep_dinode_zap_forks(sc, dip); + + mode = be16_to_cpu(dip->di_mode); + + data_extents = xfs_dfork_data_extents(dip); + attr_extents = xfs_dfork_attr_extents(dip); + nblocks = be64_to_cpu(dip->di_nblocks); + + /* Inode counters don't make sense? */ + if (data_extents > nblocks) + zap_datafork = true; + if (attr_extents > nblocks) + zap_attrfork = true; + if (data_extents + attr_extents > nblocks) + zap_datafork = zap_attrfork = true; + + if (!zap_datafork) + zap_datafork = xrep_dinode_check_dfork(sc, dip, mode); + if (!zap_attrfork) + zap_attrfork = xrep_dinode_check_afork(sc, dip); + + /* Zap whatever's bad. */ + if (zap_attrfork) + xrep_dinode_zap_afork(ri, dip, mode); + if (zap_datafork) + xrep_dinode_zap_dfork(ri, dip, mode); + xrep_dinode_ensure_forkoff(ri, dip, mode); + + /* + * Zero di_nblocks if we don't have any extents at all to satisfy the + * buffer verifier. + */ + data_extents = xfs_dfork_data_extents(dip); + attr_extents = xfs_dfork_attr_extents(dip); + if (data_extents + attr_extents == 0) + dip->di_nblocks = 0; +} + +/* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ +STATIC int +xrep_dinode_core( + struct xrep_inode *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_buf *bp; + struct xfs_dinode *dip; + xfs_ino_t ino = sc->sm->sm_ino; + int error; + int iget_error; + + /* Figure out what this inode had mapped in both forks. */ + error = xrep_dinode_count_rmaps(ri); + if (error) + return error; + + /* Read the inode cluster buffer. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, + ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, + NULL); + if (error) + return error; + + /* Make sure we can pass the inode buffer verifier. */ + xrep_dinode_buf(sc, bp); + bp->b_ops = &xfs_inode_buf_ops; + + /* Fix everything the verifier will complain about. */ + dip = xfs_buf_offset(bp, ri->imap.im_boffset); + xrep_dinode_header(sc, dip); + xrep_dinode_mode(ri, dip); + xrep_dinode_flags(sc, dip, ri->rt_extents > 0); + xrep_dinode_size(ri, dip); + xrep_dinode_extsize_hints(sc, dip); + xrep_dinode_zap_forks(ri, dip); + + /* Write out the inode. */ + trace_xrep_dinode_fixed(sc, dip); + xfs_dinode_calc_crc(sc->mp, dip); + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF); + xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset, + ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1); + + /* + * In theory, we've fixed the ondisk inode record enough that we should + * be able to load the inode into the cache. Try to iget that inode + * now while we hold the AGI and the inode cluster buffer and take the + * IOLOCK so that we can continue with repairs without anyone else + * accessing the inode. If iget fails, we still need to commit the + * changes. + */ + iget_error = xchk_iget(sc, ino, &sc->ip); + if (!iget_error) + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + /* + * Commit the inode cluster buffer updates and drop the AGI buffer that + * we've been holding since scrub setup. From here on out, repairs + * deal only with the cached inode. 
+ */ + error = xrep_trans_commit(sc); + if (error) + return error; + + if (iget_error) + return iget_error; + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + error = xrep_ino_dqattach(sc); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (ri->ino_sick_mask) + xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask); + return 0; +} + +/* Fix everything xfs_dinode_verify cares about. */ +STATIC int +xrep_dinode_problems( + struct xrep_inode *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + error = xrep_dinode_core(ri); + if (error) + return error; + + /* We had to fix a totally busted inode, schedule quotacheck. */ + if (XFS_IS_UQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + if (XFS_IS_GQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + if (XFS_IS_PQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + + return 0; +} + +/* + * Fix problems that the verifiers don't care about. In general these are + * errors that don't cause problems elsewhere in the kernel that we can easily + * detect, so we don't check them all that rigorously. + */ + +/* Make sure block and extent counts are ok. */ +STATIC int +xrep_inode_blockcounts( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + xfs_filblks_t count; + xfs_filblks_t acount; + xfs_extnum_t nextents; + int error; + + trace_xrep_inode_blockcounts(sc); + + /* Set data fork counters from the data fork mappings. */ + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, + &nextents, &count); + if (error) + return error; + if (xfs_is_reflink_inode(sc->ip)) { + /* + * data fork blockcount can exceed physical storage if a user + * reflinks the same block over and over again. + */ + ; + } else if (XFS_IS_REALTIME_INODE(sc->ip)) { + if (count >= sc->mp->m_sb.sb_rblocks) + return -EFSCORRUPTED; + } else { + if (count >= sc->mp->m_sb.sb_dblocks) + return -EFSCORRUPTED; + } + error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents); + if (error) + return error; + sc->ip->i_df.if_nextents = nextents; + + /* Set attr fork counters from the attr fork mappings. */ + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + if (ifp) { + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, + &nextents, &acount); + if (error) + return error; + if (count >= sc->mp->m_sb.sb_dblocks) + return -EFSCORRUPTED; + error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK, + nextents); + if (error) + return error; + ifp->if_nextents = nextents; + } else { + acount = 0; + } + + sc->ip->i_nblocks = count + acount; + return 0; +} + +/* Check for invalid uid/gid/prid. 
*/ +STATIC void +xrep_inode_ids( + struct xfs_scrub *sc) +{ + bool dirty = false; + + trace_xrep_inode_ids(sc); + + if (!uid_valid(VFS_I(sc->ip)->i_uid)) { + i_uid_write(VFS_I(sc->ip), 0); + dirty = true; + if (XFS_IS_UQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + } + + if (!gid_valid(VFS_I(sc->ip)->i_gid)) { + i_gid_write(VFS_I(sc->ip), 0); + dirty = true; + if (XFS_IS_GQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + } + + if (sc->ip->i_projid == -1U) { + sc->ip->i_projid = 0; + dirty = true; + if (XFS_IS_PQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + } + + /* strip setuid/setgid if we touched any of the ids */ + if (dirty) + VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID); +} + +static inline void +xrep_clamp_timestamp( + struct xfs_inode *ip, + struct timespec64 *ts) +{ + ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC); + *ts = timestamp_truncate(*ts, VFS_I(ip)); +} + +/* Nanosecond counters can't have more than 1 billion. */ +STATIC void +xrep_inode_timestamps( + struct xfs_inode *ip) +{ + struct timespec64 tstamp; + struct inode *inode = VFS_I(ip); + + tstamp = inode_get_atime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_atime_to_ts(inode, tstamp); + + tstamp = inode_get_mtime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_mtime_to_ts(inode, tstamp); + + tstamp = inode_get_ctime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_ctime_to_ts(inode, tstamp); + + xrep_clamp_timestamp(ip, &ip->i_crtime); +} + +/* Fix inode flags that don't make sense together. */ +STATIC void +xrep_inode_flags( + struct xfs_scrub *sc) +{ + uint16_t mode; + + trace_xrep_inode_flags(sc); + + mode = VFS_I(sc->ip)->i_mode; + + /* Clear junk flags */ + if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY) + sc->ip->i_diflags &= ~XFS_DIFLAG_ANY; + + /* NEWRTBM only applies to realtime bitmaps */ + if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino) + sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM; + else + sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM; + + /* These only make sense for directories. */ + if (!S_ISDIR(mode)) + sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT | + XFS_DIFLAG_EXTSZINHERIT | + XFS_DIFLAG_PROJINHERIT | + XFS_DIFLAG_NOSYMLINKS); + + /* These only make sense for files. */ + if (!S_ISREG(mode)) + sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME | + XFS_DIFLAG_EXTSIZE); + + /* These only make sense for non-rt files. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) + sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM; + + /* Immutable and append only? Drop the append. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) && + (sc->ip->i_diflags & XFS_DIFLAG_APPEND)) + sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND; + + /* Clear junk flags. */ + if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY; + + /* No reflink flag unless we support it and it's a file. */ + if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode)) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* DAX only applies to files and dirs. */ + if (!(S_ISREG(mode) || S_ISDIR(mode))) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; + + /* No reflink files on the realtime device. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; +} + +/* + * Fix size problems with block/node format directories. If we fail to find + * the extent list, just bail out and let the bmapbtd repair functions clean + * up that mess. 
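A minimal sketch of the size computation performed by the function below: the directory size becomes the byte offset of the end of the last extent, clamped to the 32 GiB directory address space. The 4096-byte block size is an assumption for the example:

    #include <stdint.h>

    #define EX_BLOCKSIZE       4096ULL        /* assumed fs block size */
    #define EX_DIR_SPACE_LIMIT (1ULL << 35)   /* the 32 GiB directory limit */

    static inline uint64_t ex_dir_size(uint64_t last_startoff, uint64_t last_blockcount)
    {
        uint64_t end = (last_startoff + last_blockcount) * EX_BLOCKSIZE;

        return end < EX_DIR_SPACE_LIMIT ? end : EX_DIR_SPACE_LIMIT;
    }

    /* ex_dir_size(5, 3) == 32768: the last extent covers file blocks 5-7. */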
+ */ +STATIC void +xrep_inode_blockdir_size( + struct xfs_scrub *sc) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + xfs_fileoff_t off; + int error; + + trace_xrep_inode_blockdir_size(sc); + + error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK); + if (error) + return; + + /* Find the last block before 32G; this is the dir size. */ + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE); + if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) { + /* zero-extents directory? */ + return; + } + + off = got.br_startoff + got.br_blockcount; + sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE, + XFS_FSB_TO_B(sc->mp, off)); +} + +/* Fix size problems with short format directories. */ +STATIC void +xrep_inode_sfdir_size( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + + trace_xrep_inode_sfdir_size(sc); + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + sc->ip->i_disk_size = ifp->if_bytes; +} + +/* + * Fix any irregularities in a directory inode's size now that we can iterate + * extent maps and access other regular inode data. + */ +STATIC void +xrep_inode_dir_size( + struct xfs_scrub *sc) +{ + trace_xrep_inode_dir_size(sc); + + switch (sc->ip->i_df.if_format) { + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + xrep_inode_blockdir_size(sc); + break; + case XFS_DINODE_FMT_LOCAL: + xrep_inode_sfdir_size(sc); + break; + } +} + +/* Fix extent size hint problems. */ +STATIC void +xrep_inode_extsize( + struct xfs_scrub *sc) +{ + /* Fix misaligned extent size hints on a directory. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) { + sc->ip->i_extsize = 0; + sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT; + } +} + +/* Fix any irregularities in an inode that the verifiers don't catch. */ +STATIC int +xrep_inode_problems( + struct xfs_scrub *sc) +{ + int error; + + error = xrep_inode_blockcounts(sc); + if (error) + return error; + xrep_inode_timestamps(sc->ip); + xrep_inode_flags(sc); + xrep_inode_ids(sc); + /* + * We can now do a better job fixing the size of a directory now that + * we can scan the data fork extents than we could in xrep_dinode_size. + */ + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + xrep_inode_dir_size(sc); + xrep_inode_extsize(sc); + + trace_xrep_inode_fixed(sc); + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* Repair an inode's fields. */ +int +xrep_inode( + struct xfs_scrub *sc) +{ + int error = 0; + + /* + * No inode? That means we failed the _iget verifiers. Repair all + * the things that the inode verifiers care about, then retry _iget. + */ + if (!sc->ip) { + struct xrep_inode *ri = sc->buf; + + ASSERT(ri != NULL); + + error = xrep_dinode_problems(ri); + if (error) + return error; + + /* By this point we had better have a working incore inode. */ + if (!sc->ip) + return -EFSCORRUPTED; + } + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* If we found corruption of any kind, try to fix it. */ + if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) { + error = xrep_inode_problems(sc); + if (error) + return error; + } + + /* See if we can clear the reflink flag. 
*/ + if (xfs_is_reflink_inode(sc->ip)) { + error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); + if (error) + return error; + } + + return xrep_defer_finish(sc); +} diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c new file mode 100644 index 000000000000..bb6d980b4fcd --- /dev/null +++ b/fs/xfs/scrub/newbt.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" +#include "xfs_defer.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/newbt.h" + +/* + * Estimate proper slack values for a btree that's being reloaded. + * + * Under most circumstances, we'll take whatever default loading value the + * btree bulk loading code calculates for us. However, there are some + * exceptions to this rule: + * + * (0) If someone turned one of the debug knobs. + * (1) If this is a per-AG btree and the AG has less than 10% space free. + * (2) If this is an inode btree and the FS has less than 10% space free. + + * In either case, format the new btree blocks almost completely full to + * minimize space usage. + */ +static void +xrep_newbt_estimate_slack( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_btree_bload *bload = &xnr->bload; + uint64_t free; + uint64_t sz; + + /* + * The xfs_globals values are set to -1 (i.e. take the bload defaults) + * unless someone has set them otherwise, so we just pull the values + * here. + */ + bload->leaf_slack = xfs_globals.bload_leaf_slack; + bload->node_slack = xfs_globals.bload_node_slack; + + if (sc->ops->type == ST_PERAG) { + free = sc->sa.pag->pagf_freeblks; + sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); + } else { + free = percpu_counter_sum(&sc->mp->m_fdblocks); + sz = sc->mp->m_sb.sb_dblocks; + } + + /* No further changes if there's more than 10% free space left. */ + if (free >= div_u64(sz, 10)) + return; + + /* + * We're low on space; load the btrees as tightly as possible. Leave + * a couple of open slots in each btree block so that we don't end up + * splitting the btrees like crazy after a mount. + */ + if (bload->leaf_slack < 0) + bload->leaf_slack = 2; + if (bload->node_slack < 0) + bload->node_slack = 2; +} + +/* Initialize accounting resources for staging a new AG btree. */ +void +xrep_newbt_init_ag( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, + xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv) +{ + memset(xnr, 0, sizeof(struct xrep_newbt)); + xnr->sc = sc; + xnr->oinfo = *oinfo; /* structure copy */ + xnr->alloc_hint = alloc_hint; + xnr->resv = resv; + INIT_LIST_HEAD(&xnr->resv_list); + xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */ + xrep_newbt_estimate_slack(xnr); +} + +/* Initialize accounting resources for staging a new inode fork btree. 
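The slack heuristic in xrep_newbt_estimate_slack() above can be summarized with a short standalone sketch: keep the knob values (where -1 means "use the bulk loader default") unless less than 10% of the relevant space is free, in which case pack blocks nearly full, leaving two open slots. The function and variable names below are illustrative only.

    #include <stdint.h>
    #include <stdio.h>

    /* leaf/node slack of -1 means "let the bulk loader pick a default". */
    static void estimate_slack(int64_t *leaf_slack, int64_t *node_slack,
                               uint64_t free_blocks, uint64_t total_blocks)
    {
        /* Plenty of space (>= 10% free): keep the defaults. */
        if (free_blocks >= total_blocks / 10)
            return;

        /* Low on space: load blocks almost full, with two open slots each. */
        if (*leaf_slack < 0)
            *leaf_slack = 2;
        if (*node_slack < 0)
            *node_slack = 2;
    }

    int main(void)
    {
        int64_t leaf = -1, node = -1;

        estimate_slack(&leaf, &node, 50, 1000);     /* only 5% free */
        printf("leaf=%lld node=%lld\n", (long long)leaf, (long long)node);
        return 0;
    }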
*/ +int +xrep_newbt_init_inode( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + int whichfork, + const struct xfs_owner_info *oinfo) +{ + struct xfs_ifork *ifp; + + ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); + if (!ifp) + return -ENOMEM; + + xrep_newbt_init_ag(xnr, sc, oinfo, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), + XFS_AG_RESV_NONE); + xnr->ifake.if_fork = ifp; + xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); + return 0; +} + +/* + * Initialize accounting resources for staging a new btree. Callers are + * expected to add their own reservations (and clean them up) manually. + */ +void +xrep_newbt_init_bare( + struct xrep_newbt *xnr, + struct xfs_scrub *sc) +{ + xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK, + XFS_AG_RESV_NONE); +} + +/* + * Designate specific blocks to be used to build our new btree. @pag must be + * a passive reference. + */ +STATIC int +xrep_newbt_add_blocks( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + const struct xfs_alloc_arg *args) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xrep_newbt_resv *resv; + int error; + + resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); + if (!resv) + return -ENOMEM; + + INIT_LIST_HEAD(&resv->list); + resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + resv->len = args->len; + resv->used = 0; + resv->pag = xfs_perag_hold(pag); + + if (args->tp) { + ASSERT(xnr->oinfo.oi_offset == 0); + + error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); + if (error) + goto out_pag; + } + + list_add_tail(&resv->list, &xnr->resv_list); + return 0; +out_pag: + xfs_perag_put(resv->pag); + kfree(resv); + return error; +} + +/* + * Add an extent to the new btree reservation pool. Callers are required to + * reap this reservation manually if the repair is cancelled. @pag must be a + * passive reference. + */ +int +xrep_newbt_add_extent( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xfs_alloc_arg args = { + .tp = NULL, /* no autoreap */ + .oinfo = xnr->oinfo, + .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), + .len = len, + .resv = xnr->resv, + }; + + return xrep_newbt_add_blocks(xnr, pag, &args); +} + +/* Don't let our allocation hint take us beyond this AG */ +static inline void +xrep_newbt_validate_ag_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); + + if (agno == sc->sa.pag->pag_agno && + xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for a new per-AG btree. 
*/ +STATIC int +xrep_newbt_alloc_ag_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + ASSERT(sc->sa.pag != NULL); + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + xfs_agnumber_t agno; + + xrep_newbt_validate_ag_alloc_hint(xnr); + + error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_ag_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + if (agno != sc->sa.pag->pag_agno) { + ASSERT(agno == sc->sa.pag->pag_agno); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Don't let our allocation hint take us beyond EOFS */ +static inline void +xrep_newbt_validate_file_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + + if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for our new file-based btree. */ +STATIC int +xrep_newbt_alloc_file_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + struct xfs_perag *pag; + xfs_agnumber_t agno; + + xrep_newbt_validate_file_alloc_hint(xnr); + + error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_file_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + pag = xfs_perag_get(mp, agno); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, pag, &args); + xfs_perag_put(pag); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Allocate disk space for our new btree. */ +int +xrep_newbt_alloc_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + if (xnr->sc->ip) + return xrep_newbt_alloc_file_blocks(xnr, nr_blocks); + return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks); +} + +/* + * Free the unused part of a space extent that was reserved for a new ondisk + * structure. Returns the number of EFIs logged or a negative errno. + */ +STATIC int +xrep_newbt_free_extent( + struct xrep_newbt *xnr, + struct xrep_newbt_resv *resv, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agblock_t free_agbno = resv->agbno; + xfs_extlen_t free_aglen = resv->len; + xfs_fsblock_t fsbno; + int error; + + if (!btree_committed || resv->used == 0) { + /* + * If we're not committing a new btree or we didn't use the + * space reservation, let the existing EFI free the entire + * space extent. 
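Both xrep_newbt_alloc_ag_blocks() and xrep_newbt_alloc_file_blocks() above follow the same chunked-allocation pattern: ask for up to the remaining block count, subtract whatever the allocator actually grants, and move the hint just past the new extent. A minimal userspace model of that loop, with a fake allocator standing in for the xfs_alloc_vextent_* calls:

    #include <stdint.h>
    #include <stdio.h>

    struct extent { uint64_t start, len; };

    /* Pretend allocator: returns an extent of at most 8 blocks near the hint. */
    static struct extent fake_alloc_near(uint64_t hint, uint64_t want)
    {
        struct extent e = { .start = hint, .len = want < 8 ? want : 8 };

        return e;
    }

    static int alloc_blocks(uint64_t nr_blocks, uint64_t *hint)
    {
        while (nr_blocks > 0) {
            struct extent got = fake_alloc_near(*hint, nr_blocks);

            if (got.len == 0)
                return -1;                  /* -ENOSPC in the real code */
            nr_blocks -= got.len;
            *hint = got.start + got.len;    /* next request starts past it */
        }
        return 0;
    }

    int main(void)
    {
        uint64_t hint = 100;

        if (alloc_blocks(20, &hint) == 0)
            printf("done, next hint = %llu\n", (unsigned long long)hint);
        return 0;
    }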
+ */ + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, + free_agbno, free_aglen, xnr->oinfo.oi_owner); + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); + return 1; + } + + /* + * We used space and committed the btree. Cancel the autoreap, remove + * the written blocks from the reservation, and possibly log a new EFI + * to free any unused reservation space. + */ + xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap); + free_agbno += resv->used; + free_aglen -= resv->used; + + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, xnr->oinfo.oi_owner); + + ASSERT(xnr->resv != XFS_AG_RESV_AGFL); + ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); + + /* + * Use EFIs to free the reservations. This reduces the chance + * that we leak blocks if the system goes down. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); + error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, + xnr->resv, true); + if (error) + return error; + + return 1; +} + +/* Free all the accounting info and disk space we reserved for a new btree. */ +STATIC int +xrep_newbt_free( + struct xrep_newbt *xnr, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + struct xrep_newbt_resv *resv, *n; + unsigned int freed = 0; + int error = 0; + + /* + * If the filesystem already went down, we can't free the blocks. Skip + * ahead to freeing the incore metadata because we can't fix anything. + */ + if (xfs_is_shutdown(sc->mp)) + goto junkit; + + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + int ret; + + ret = xrep_newbt_free_extent(xnr, resv, btree_committed); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + if (ret < 0) { + error = ret; + goto junkit; + } + + freed += ret; + if (freed >= XREP_MAX_ITRUNCATE_EFIS) { + error = xrep_defer_finish(sc); + if (error) + goto junkit; + freed = 0; + } + } + + if (freed) + error = xrep_defer_finish(sc); + +junkit: + /* + * If we still have reservations attached to @newbt, cleanup must have + * failed and the filesystem is about to go down. Clean up the incore + * reservations and try to commit to freeing the space we used. + */ + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + } + + if (sc->ip) { + kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork); + xnr->ifake.if_fork = NULL; + } + + return error; +} + +/* + * Free all the accounting info and unused disk space allocations after + * committing a new btree. + */ +int +xrep_newbt_commit( + struct xrep_newbt *xnr) +{ + return xrep_newbt_free(xnr, true); +} + +/* + * Free all the accounting info and all of the disk space we reserved for a new + * btree that we're not going to commit. We want to try to roll things back + * cleanly for things like ENOSPC midway through allocation. + */ +void +xrep_newbt_cancel( + struct xrep_newbt *xnr) +{ + xrep_newbt_free(xnr, false); +} + +/* Feed one of the reserved btree blocks to the bulk loader. */ +int +xrep_newbt_claim_block( + struct xfs_btree_cur *cur, + struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr) +{ + struct xrep_newbt_resv *resv; + struct xfs_mount *mp = cur->bc_mp; + xfs_agblock_t agbno; + + /* + * The first item in the list should always have a free block unless + * we're completely out. 
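The arithmetic in xrep_newbt_free_extent() above amounts to freeing only the unused tail of each reservation once the new btree has committed: advance the start past the blocks that were consumed and shrink the length to match. A tiny standalone sketch (names are illustrative):

    #include <stdio.h>

    struct resv { unsigned int agbno, len, used; };

    /* Return the unused tail of a reservation, or 0 if it was fully used. */
    static unsigned int unused_tail(const struct resv *r, unsigned int *start)
    {
        *start = r->agbno + r->used;
        return r->len - r->used;
    }

    int main(void)
    {
        struct resv r = { .agbno = 1000, .len = 16, .used = 10 };
        unsigned int start, len;

        len = unused_tail(&r, &start);
        if (len)
            printf("free agbno %u, %u blocks\n", start, len);   /* 1010, 6 */
        return 0;
    }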
+ */ + resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list); + if (resv->used == resv->len) + return -ENOSPC; + + /* + * Peel off a block from the start of the reservation. We allocate + * blocks in order to place blocks on disk in increasing record or key + * order. The block reservations tend to end up on the list in + * decreasing order, which hopefully results in leaf blocks ending up + * together. + */ + agbno = resv->agbno + resv->used; + resv->used++; + + /* If we used all the blocks in this reservation, move it to the end. */ + if (resv->used == resv->len) + list_move_tail(&resv->list, &xnr->resv_list); + + trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, + xnr->oinfo.oi_owner); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, + agbno)); + else + ptr->s = cpu_to_be32(agbno); + + /* Relog all the EFIs. */ + return xrep_defer_finish(xnr->sc); +} + +/* How many reserved blocks are unused? */ +unsigned int +xrep_newbt_unused_blocks( + struct xrep_newbt *xnr) +{ + struct xrep_newbt_resv *resv; + unsigned int unused = 0; + + list_for_each_entry(resv, &xnr->resv_list, list) + unused += resv->len - resv->used; + return unused; +} diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h new file mode 100644 index 000000000000..89f8e3970b1f --- /dev/null +++ b/fs/xfs/scrub/newbt.h @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_NEWBT_H__ +#define __XFS_SCRUB_NEWBT_H__ + +struct xrep_newbt_resv { + /* Link to list of extents that we've reserved. */ + struct list_head list; + + struct xfs_perag *pag; + + /* Auto-freeing this reservation if we don't commit. */ + struct xfs_alloc_autoreap autoreap; + + /* AG block of the extent we reserved. */ + xfs_agblock_t agbno; + + /* Length of the reservation. */ + xfs_extlen_t len; + + /* How much of this reservation has been used. */ + xfs_extlen_t used; +}; + +struct xrep_newbt { + struct xfs_scrub *sc; + + /* List of extents that we've reserved. */ + struct list_head resv_list; + + /* Fake root for new btree. 
*/ + union { + struct xbtree_afakeroot afake; + struct xbtree_ifakeroot ifake; + }; + + /* rmap owner of these blocks */ + struct xfs_owner_info oinfo; + + /* btree geometry for the bulk loader */ + struct xfs_btree_bload bload; + + /* Allocation hint */ + xfs_fsblock_t alloc_hint; + + /* per-ag reservation type */ + enum xfs_ag_resv_type resv; +}; + +void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc); +void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv); +int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc, + int whichfork, const struct xfs_owner_info *oinfo); +int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks); +int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len); +void xrep_newbt_cancel(struct xrep_newbt *xnr); +int xrep_newbt_commit(struct xrep_newbt *xnr); +int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr); +unsigned int xrep_newbt_unused_blocks(struct xrep_newbt *xnr); + +#endif /* __XFS_SCRUB_NEWBT_H__ */ diff --git a/fs/xfs/scrub/off_bitmap.h b/fs/xfs/scrub/off_bitmap.h new file mode 100644 index 000000000000..0d3f9e6c1aad --- /dev/null +++ b/fs/xfs/scrub/off_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_OFF_BITMAP_H__ +#define __XFS_SCRUB_OFF_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_fileoff_t */ + +struct xoff_bitmap { + struct xbitmap64 offbitmap; +}; + +static inline void xoff_bitmap_init(struct xoff_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->offbitmap); +} + +static inline void xoff_bitmap_destroy(struct xoff_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->offbitmap); +} + +static inline int xoff_bitmap_set(struct xoff_bitmap *bitmap, + xfs_fileoff_t off, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->offbitmap, off, len); +} + +static inline int xoff_bitmap_walk(struct xoff_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->offbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_OFF_BITMAP_H__ */ diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index e6155d86f791..7db873672146 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -156,6 +156,16 @@ xchk_parent_validate( goto out_rele; } + /* + * We cannot yet validate this parent pointer if the directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + xchk_set_incomplete(sc); + goto out_unlock; + } + /* Look for a directory entry in the parent pointing to the child. */ error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) @@ -217,6 +227,13 @@ xchk_parent( */ error = xchk_parent_validate(sc, parent_ino); } while (error == -EAGAIN); + if (error == -EBUSY) { + /* + * We could not scan a directory, so we marked the check + * incomplete. No further error return is necessary. 
+ */ + return 0; + } return error; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 5671c8153433..183d531875ea 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -6,6 +6,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" @@ -17,9 +18,10 @@ #include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/quota.h" /* Convert a scrub type code to a DQ flag, or return 0 if error. */ -static inline xfs_dqtype_t +xfs_dqtype_t xchk_quota_to_dqtype( struct xfs_scrub *sc) { @@ -75,14 +77,70 @@ struct xchk_quota_info { xfs_dqid_t last_id; }; +/* There's a written block backing this dquot, right? */ +STATIC int +xchk_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + xfs_fileoff_t offset) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + if (!xfs_verify_fileoff(mp, offset)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (dq->q_fileoffset != offset) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps != 1) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (!xfs_verify_fsbno(mp, irec.br_startblock)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (XFS_FSB_TO_DADDR(mp, irec.br_startblock) != dq->q_blkno) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (!xfs_bmap_is_written_extent(&irec)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + return 0; +} + +/* Complain if a quota timer is incorrectly set. */ +static inline void +xchk_quota_item_timer( + struct xfs_scrub *sc, + xfs_fileoff_t offset, + const struct xfs_dquot_res *res) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } else { + if (res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } +} + /* Scrub the fields in an individual quota item. */ STATIC int xchk_quota_item( - struct xfs_dquot *dq, - xfs_dqtype_t dqtype, - void *priv) + struct xchk_quota_info *sqi, + struct xfs_dquot *dq) { - struct xchk_quota_info *sqi = priv; struct xfs_scrub *sc = sqi->sc; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -94,6 +152,17 @@ xchk_quota_item( return error; /* + * We want to validate the bmap record for the storage backing this + * dquot, so we need to lock the dquot and the quota file. For quota + * operations, the locking order is first the ILOCK and then the dquot. + * However, dqiterate gave us a locked dquot, so drop the dquot lock to + * get the ILOCK. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_SHARED); + xfs_dqlock(dq); + + /* * Except for the root dquot, the actual dquot we got must either have * the same or higher id as we saw before. */ @@ -103,6 +172,11 @@ xchk_quota_item( sqi->last_id = dq->q_id; + error = xchk_quota_item_bmap(sc, dq, offset); + xchk_iunlock(sc, XFS_ILOCK_SHARED); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) + return error; + /* * Warn if the hard limits are larger than the fs. 
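The rule enforced by xchk_quota_item_timer() above is that a grace-period timer must be running if and only if usage exceeds a nonzero soft or hard limit. A standalone predicate makes the rule explicit; the types here are illustrative stand-ins, not the kernel's xfs_dquot_res:

    #include <stdint.h>
    #include <stdio.h>

    /* True if the timer state is consistent with the limits and the usage. */
    static int timer_consistent(uint64_t softlimit, uint64_t hardlimit,
                                uint64_t count, uint32_t timer)
    {
        int over = (softlimit && count > softlimit) ||
                   (hardlimit && count > hardlimit);

        return over ? timer != 0 : timer == 0;
    }

    int main(void)
    {
        /* Over the soft limit but no timer running: the scrubber flags this. */
        printf("%d\n", timer_consistent(100, 200, 150, 0));    /* prints 0 */
        return 0;
    }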
* Administrators can do this, though in production this seems @@ -166,6 +240,10 @@ xchk_quota_item( dq->q_rtb.count > dq->q_rtb.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + xchk_quota_item_timer(sc, offset, &dq->q_blk); + xchk_quota_item_timer(sc, offset, &dq->q_ino); + xchk_quota_item_timer(sc, offset, &dq->q_rtb); + out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -ECANCELED; @@ -191,7 +269,7 @@ xchk_quota_data_fork( return error; /* Check for data fork problems that apply only to quota files. */ - max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) @@ -218,9 +296,11 @@ int xchk_quota( struct xfs_scrub *sc) { - struct xchk_quota_info sqi; + struct xchk_dqiter cursor = { }; + struct xchk_quota_info sqi = { .sc = sc }; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_dquot *dq; xfs_dqtype_t dqtype; int error = 0; @@ -239,10 +319,15 @@ xchk_quota( * functions. */ xchk_iunlock(sc, sc->ilock_flags); - sqi.sc = sc; - sqi.last_id = 0; - error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi); - xchk_ilock(sc, XFS_ILOCK_EXCL); + + /* Now look for things that the quota verifiers won't complain about. */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xchk_quota_item(&sqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } if (error == -ECANCELED) error = 0; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, diff --git a/fs/xfs/scrub/quota.h b/fs/xfs/scrub/quota.h new file mode 100644 index 000000000000..6c7134ce2385 --- /dev/null +++ b/fs/xfs/scrub/quota.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_QUOTA_H__ +#define __XFS_SCRUB_QUOTA_H__ + +xfs_dqtype_t xchk_quota_to_dqtype(struct xfs_scrub *sc); + +/* dquot iteration code */ + +struct xchk_dqiter { + struct xfs_scrub *sc; + + /* Quota file that we're walking. */ + struct xfs_inode *quota_ip; + + /* Cached data fork mapping for the dquot. */ + struct xfs_bmbt_irec bmap; + + /* The next dquot to scan. */ + uint64_t id; + + /* Quota type (user/group/project). */ + xfs_dqtype_t dqtype; + + /* Data fork sequence number to detect stale mappings. */ + unsigned int if_seq; +}; + +void xchk_dqiter_init(struct xchk_dqiter *cursor, struct xfs_scrub *sc, + xfs_dqtype_t dqtype); +int xchk_dquot_iter(struct xchk_dqiter *cursor, struct xfs_dquot **dqpp); + +#endif /* __XFS_SCRUB_QUOTA_H__ */ diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c new file mode 100644 index 000000000000..0bab4c30cb85 --- /dev/null +++ b/fs/xfs/scrub/quota_repair.c @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_dquot.h" +#include "xfs_dquot_item.h" +#include "xfs_reflink.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Quota Repair + * ============ + * + * Quota repairs are fairly simplistic; we fix everything that the dquot + * verifiers complain about, cap any counters or limits that make no sense, + * and schedule a quotacheck if we had to fix anything. We also repair any + * data fork extent records that don't apply to metadata files. + */ + +struct xrep_quota_info { + struct xfs_scrub *sc; + bool need_quotacheck; +}; + +/* + * Allocate a new block into a sparse hole in the quota file backing this + * dquot, initialize the block, and commit the whole mess. + */ +STATIC int +xrep_quota_item_fill_bmap_hole( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + struct xfs_bmbt_irec *irec) +{ + struct xfs_buf *bp; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Map a block into the file. */ + error = xfs_trans_reserve_more(sc->tp, XFS_QM_DQALLOC_SPACE_RES(mp), + 0); + if (error) + return error; + + error = xfs_bmapi_write(sc->tp, sc->ip, dq->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, + irec, &nmaps); + if (error) + return error; + if (nmaps != 1) + return -ENOSPC; + + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock); + + trace_xrep_dquot_item_fill_bmap_hole(sc->mp, dq->q_type, dq->q_id); + + /* Initialize the new block. */ + error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, dq->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + return error; + bp->b_ops = &xfs_dquot_buf_ops; + + xfs_qm_init_dquot_blk(sc->tp, dq->q_id, dq->q_type, bp); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + + /* + * Finish the mapping transactions and roll one more time to + * disconnect sc->ip from sc->tp. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + return xfs_trans_roll(&sc->tp); +} + +/* Make sure there's a written block backing this dquot */ +STATIC int +xrep_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + bool *dirty) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + struct xfs_quotainfo *qi = mp->m_quotainfo; + xfs_fileoff_t offset = dq->q_id / qi->qi_dqperchunk; + int nmaps = 1; + int error; + + /* The computed file offset should always be valid. */ + if (!xfs_verify_fileoff(mp, offset)) { + ASSERT(xfs_verify_fileoff(mp, offset)); + return -EFSCORRUPTED; + } + dq->q_fileoffset = offset; + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps < 1 || !xfs_bmap_is_real_extent(&irec)) { + /* Hole/delalloc extent; allocate a real block. */ + error = xrep_quota_item_fill_bmap_hole(sc, dq, &irec); + if (error) + return error; + } else if (irec.br_state != XFS_EXT_NORM) { + /* Unwritten extent, which we already took care of? 
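The offset computations in xrep_quota_item_bmap() above, and in the xchk_quota_data_fork() hunk earlier, follow directly from the quota file layout: each block holds qi_dqperchunk dquot records, so the record for dquot id N lives at file offset N / qi_dqperchunk, and no valid id can map past XFS_DQ_ID_MAX / qi_dqperchunk. A minimal sketch, using UINT32_MAX as a stand-in for XFS_DQ_ID_MAX since the dquot id is a 32-bit value:

    #include <stdint.h>
    #include <stdio.h>

    /* File block that holds the record for dquot id "id". */
    static uint64_t dqid_to_fileoff(uint32_t id, unsigned int dqperchunk)
    {
        return (uint64_t)id / dqperchunk;
    }

    /* Highest file offset any valid dquot id can map to; extents beyond this
       are the ones the repair code truncates away. */
    static uint64_t max_dqid_fileoff(unsigned int dqperchunk)
    {
        return (uint64_t)UINT32_MAX / dqperchunk;
    }

    int main(void)
    {
        unsigned int dqperchunk = 30;   /* illustrative records per block */

        printf("id 12345 -> block %llu, max block %llu\n",
               (unsigned long long)dqid_to_fileoff(12345, dqperchunk),
               (unsigned long long)max_dqid_fileoff(dqperchunk));
        return 0;
    }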
*/ + ASSERT(irec.br_state == XFS_EXT_NORM); + return -EFSCORRUPTED; + } else if (dq->q_blkno != XFS_FSB_TO_DADDR(mp, irec.br_startblock)) { + /* + * If the cached daddr is incorrect, repair probably punched a + * hole out of the quota file and filled it back in with a new + * block. Update the block mapping in the dquot. + */ + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec.br_startblock); + } + + *dirty = true; + return 0; +} + +/* Reset quota timers if incorrectly set. */ +static inline void +xrep_quota_item_timer( + struct xfs_scrub *sc, + const struct xfs_dquot_res *res, + bool *dirty) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + *dirty = true; + } else { + if (res->timer) + *dirty = true; + } +} + +/* Scrub the fields in an individual quota item. */ +STATIC int +xrep_quota_item( + struct xrep_quota_info *rqi, + struct xfs_dquot *dq) +{ + struct xfs_scrub *sc = rqi->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t fs_icount; + bool dirty = false; + int error = 0; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * We might need to fix holes in the bmap record for the storage + * backing this dquot, so we need to lock the dquot and the quota file. + * dqiterate gave us a locked dquot, so drop the dquot lock to get the + * ILOCK_EXCL. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_dqlock(dq); + + error = xrep_quota_item_bmap(sc, dq, &dirty); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + if (error) + return error; + + /* Check the limits. */ + if (dq->q_blk.softlimit > dq->q_blk.hardlimit) { + dq->q_blk.softlimit = dq->q_blk.hardlimit; + dirty = true; + } + + if (dq->q_ino.softlimit > dq->q_ino.hardlimit) { + dq->q_ino.softlimit = dq->q_ino.hardlimit; + dirty = true; + } + + if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) { + dq->q_rtb.softlimit = dq->q_rtb.hardlimit; + dirty = true; + } + + /* + * Check that usage doesn't exceed physical limits. However, on + * a reflink filesystem we're allowed to exceed physical space + * if there are no quota limits. We don't know what the real number + * is, but we can make quotacheck find out for us. + */ + if (!xfs_has_reflink(mp) && dq->q_blk.count > mp->m_sb.sb_dblocks) { + dq->q_blk.reserved -= dq->q_blk.count; + dq->q_blk.reserved += mp->m_sb.sb_dblocks; + dq->q_blk.count = mp->m_sb.sb_dblocks; + rqi->need_quotacheck = true; + dirty = true; + } + fs_icount = percpu_counter_sum(&mp->m_icount); + if (dq->q_ino.count > fs_icount) { + dq->q_ino.reserved -= dq->q_ino.count; + dq->q_ino.reserved += fs_icount; + dq->q_ino.count = fs_icount; + rqi->need_quotacheck = true; + dirty = true; + } + if (dq->q_rtb.count > mp->m_sb.sb_rblocks) { + dq->q_rtb.reserved -= dq->q_rtb.count; + dq->q_rtb.reserved += mp->m_sb.sb_rblocks; + dq->q_rtb.count = mp->m_sb.sb_rblocks; + rqi->need_quotacheck = true; + dirty = true; + } + + xrep_quota_item_timer(sc, &dq->q_blk, &dirty); + xrep_quota_item_timer(sc, &dq->q_ino, &dirty); + xrep_quota_item_timer(sc, &dq->q_rtb, &dirty); + + if (!dirty) + return 0; + + trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id); + + dq->q_flags |= XFS_DQFLAG_DIRTY; + xfs_trans_dqjoin(sc->tp, dq); + if (dq->q_id) { + xfs_qm_adjust_dqlimits(dq); + xfs_qm_adjust_dqtimers(dq); + } + xfs_trans_log_dquot(sc->tp, dq); + error = xfs_trans_roll(&sc->tp); + xfs_dqlock(dq); + return error; +} + +/* Fix a quota timer so that we can pass the verifier. 
*/ +STATIC void +xrep_quota_fix_timer( + struct xfs_mount *mp, + const struct xfs_disk_dquot *ddq, + __be64 softlimit, + __be64 countnow, + __be32 *timer, + time64_t timelimit) +{ + uint64_t soft = be64_to_cpu(softlimit); + uint64_t count = be64_to_cpu(countnow); + time64_t new_timer; + uint32_t t; + + if (!soft || count <= soft || *timer != 0) + return; + + new_timer = xfs_dquot_set_timeout(mp, + ktime_get_real_seconds() + timelimit); + if (ddq->d_type & XFS_DQTYPE_BIGTIME) + t = xfs_dq_unix_to_bigtime(new_timer); + else + t = new_timer; + + *timer = cpu_to_be32(t); +} + +/* Fix anything the verifiers complain about. */ +STATIC int +xrep_quota_block( + struct xfs_scrub *sc, + xfs_daddr_t daddr, + xfs_dqtype_t dqtype, + xfs_dqid_t id) +{ + struct xfs_dqblk *dqblk; + struct xfs_disk_dquot *ddq; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_def_quota *defq = xfs_get_defquota(qi, dqtype); + struct xfs_buf *bp = NULL; + enum xfs_blft buftype = 0; + int i; + int error; + + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, daddr, + qi->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); + switch (error) { + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Failed verifier, retry read with no ops. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, + sc->mp->m_ddev_targp, daddr, qi->qi_dqchunklen, + 0, &bp, NULL); + if (error) + return error; + break; + case 0: + dqblk = bp->b_addr; + ddq = &dqblk[0].dd_diskdq; + + /* + * If there's nothing that would impede a dqiterate, we're + * done. + */ + if ((ddq->d_type & XFS_DQTYPE_REC_MASK) != dqtype || + id == be32_to_cpu(ddq->d_id)) { + xfs_trans_brelse(sc->tp, bp); + return 0; + } + break; + default: + return error; + } + + /* Something's wrong with the block, fix the whole thing. */ + dqblk = bp->b_addr; + bp->b_ops = &xfs_dquot_buf_ops; + for (i = 0; i < qi->qi_dqperchunk; i++, dqblk++) { + ddq = &dqblk->dd_diskdq; + + trace_xrep_disk_dquot(sc->mp, dqtype, id + i); + + ddq->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + ddq->d_version = XFS_DQUOT_VERSION; + ddq->d_type = dqtype; + ddq->d_id = cpu_to_be32(id + i); + + if (xfs_has_bigtime(sc->mp) && ddq->d_id) + ddq->d_type |= XFS_DQTYPE_BIGTIME; + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_blk_softlimit, + ddq->d_bcount, &ddq->d_btimer, + defq->blk.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_ino_softlimit, + ddq->d_icount, &ddq->d_itimer, + defq->ino.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_rtb_softlimit, + ddq->d_rtbcount, &ddq->d_rtbtimer, + defq->rtb.time); + + /* We only support v5 filesystems so always set these. */ + uuid_copy(&dqblk->dd_uuid, &sc->mp->m_sb.sb_meta_uuid); + xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + dqblk->dd_lsn = 0; + } + switch (dqtype) { + case XFS_DQTYPE_USER: + buftype = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_DQTYPE_GROUP: + buftype = XFS_BLFT_GDQUOT_BUF; + break; + case XFS_DQTYPE_PROJ: + buftype = XFS_BLFT_PDQUOT_BUF; + break; + } + xfs_trans_buf_set_type(sc->tp, bp, buftype); + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); + return xrep_roll_trans(sc); +} + +/* + * Repair a quota file's data fork. The function returns with the inode + * joined. 
+ */ +STATIC int +xrep_quota_data_fork( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xfs_bmbt_irec irec = { 0 }; + struct xfs_iext_cursor icur; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_ifork *ifp; + xfs_fileoff_t max_dqid_off; + xfs_fileoff_t off; + xfs_fsblock_t fsbno; + bool truncate = false; + bool joined = false; + int error = 0; + + error = xrep_metadata_inode_forks(sc); + if (error) + goto out; + + /* Check for data fork problems that apply only to quota files. */ + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + for_each_xfs_iext(ifp, &icur, &irec) { + if (isnullstartblock(irec.br_startblock)) { + error = -EFSCORRUPTED; + goto out; + } + + if (irec.br_startoff > max_dqid_off || + irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { + truncate = true; + break; + } + + /* Convert unwritten extents to real ones. */ + if (irec.br_state == XFS_EXT_UNWRITTEN) { + struct xfs_bmbt_irec nrec; + int nmap = 1; + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + error = xfs_bmapi_write(sc->tp, sc->ip, + irec.br_startoff, irec.br_blockcount, + XFS_BMAPI_CONVERT, 0, &nrec, &nmap); + if (error) + goto out; + if (nmap != 1) { + error = -ENOSPC; + goto out; + } + ASSERT(nrec.br_startoff == irec.br_startoff); + ASSERT(nrec.br_blockcount == irec.br_blockcount); + + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + } + } + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + if (truncate) { + /* Erase everything after the block containing the max dquot */ + error = xfs_bunmapi_range(&sc->tp, sc->ip, 0, + max_dqid_off * sc->mp->m_sb.sb_blocksize, + XFS_MAX_FILEOFF); + if (error) + goto out; + + /* Remove all CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(sc->ip, &sc->tp, 0, + XFS_MAX_FILEOFF, true); + if (error) + goto out; + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* + * Always re-log the inode so that our permanent transaction + * can keep on rolling it forward in the log. + */ + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + } + + /* Now go fix anything that fails the verifiers. */ + for_each_xfs_iext(ifp, &icur, &irec) { + for (fsbno = irec.br_startblock, off = irec.br_startoff; + fsbno < irec.br_startblock + irec.br_blockcount; + fsbno += XFS_DQUOT_CLUSTER_SIZE_FSB, + off += XFS_DQUOT_CLUSTER_SIZE_FSB) { + error = xrep_quota_block(sc, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + dqtype, off * qi->qi_dqperchunk); + if (error) + goto out; + } + } + +out: + return error; +} + +/* + * Go fix anything in the quota items that we could have been mad about. Now + * that we've checked the quota inode data fork we have to drop ILOCK_EXCL to + * use the regular dquot functions. + */ +STATIC int +xrep_quota_problems( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xrep_quota_info rqi = { .sc = sc }; + struct xfs_dquot *dq; + int error; + + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xrep_quota_item(&rqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* Make a quotacheck happen. */ + if (rqi.need_quotacheck) + xrep_force_quotacheck(sc, dqtype); + return 0; +} + +/* Repair all of a quota type's items. 
*/ +int +xrep_quota( + struct xfs_scrub *sc) +{ + xfs_dqtype_t dqtype; + int error; + + dqtype = xchk_quota_to_dqtype(sc); + + /* + * Re-take the ILOCK so that we can fix any problems that we found + * with the data fork mappings, or with the dquot bufs themselves. + */ + if (!(sc->ilock_flags & XFS_ILOCK_EXCL)) + xchk_ilock(sc, XFS_ILOCK_EXCL); + error = xrep_quota_data_fork(sc, dqtype); + if (error) + return error; + + /* + * Finish deferred items and roll the transaction to unjoin the quota + * inode from transaction so that we can unlock the quota inode; we + * play only with dquots from now on. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + xchk_iunlock(sc, sc->ilock_flags); + + /* Fix anything the dquot verifiers don't complain about. */ + error = xrep_quota_problems(sc, dqtype); + if (error) + return error; + + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c index e51c1544be63..16462332c897 100644 --- a/fs/xfs/scrub/readdir.c +++ b/fs/xfs/scrub/readdir.c @@ -36,16 +36,14 @@ xchk_dir_walk_sf( struct xfs_mount *mp = dp->i_mount; struct xfs_da_geometry *geo = mp->m_dir_geo; struct xfs_dir2_sf_entry *sfep; - struct xfs_dir2_sf_hdr *sfp; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_ino_t ino; xfs_dir2_dataptr_t dapos; unsigned int i; int error; ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); /* dot entry */ dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 86a62420e02c..f99eca799809 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -20,6 +20,7 @@ #include "xfs_ialloc_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_extent_busy.h" #include "xfs_ag.h" @@ -31,11 +32,14 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_remote.h" +#include "xfs_defer.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/fsb_bitmap.h" #include "scrub/reap.h" /* @@ -73,10 +77,10 @@ * with only the same rmap owner but the block is not owned by something with * the same rmap owner, the block will be freed. * - * The caller is responsible for locking the AG headers for the entire rebuild - * operation so that nothing else can sneak in and change the AG state while - * we're not looking. We must also invalidate any buffers associated with - * @bitmap. + * The caller is responsible for locking the AG headers/inode for the entire + * rebuild operation so that nothing else can sneak in and change the incore + * state while we're not looking. We must also invalidate any buffers + * associated with @bitmap. */ /* Information about reaping extents after a repair. 
*/ @@ -247,7 +251,7 @@ xreap_agextent_binval( max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); - for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) { + for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) { struct xfs_buf *bp = NULL; xfs_daddr_t daddr; int error; @@ -377,6 +381,17 @@ xreap_agextent_iter( trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); rs->force_roll = true; + + if (rs->oinfo == &XFS_RMAP_OINFO_COW) { + /* + * If we're unmapping CoW staging extents, remove the + * records from the refcountbt, which will remove the + * rmap record as well. + */ + xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + return 0; + } + return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, *aglenp, rs->oinfo); } @@ -395,6 +410,26 @@ xreap_agextent_iter( return 0; } + /* + * If we're getting rid of CoW staging extents, use deferred work items + * to remove the refcountbt records (which removes the rmap records) + * and free the extent. We're not worried about the system going down + * here because log recovery walks the refcount btree to clean out the + * CoW staging extents. + */ + if (rs->oinfo == &XFS_RMAP_OINFO_COW) { + ASSERT(rs->resv == XFS_AG_RESV_NONE); + + xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL, + rs->resv, true); + if (error) + return error; + + rs->force_roll = true; + return 0; + } + /* Put blocks back on the AGFL one at a time. */ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); @@ -409,13 +444,17 @@ xreap_agextent_iter( /* * Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. + * Add a defer ops barrier every other extent to avoid stressing the + * system with large EFIs. */ - error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, rs->resv, true); if (error) return error; rs->deferred++; + if (rs->deferred % 2 == 0) + xfs_defer_add_barrier(sc->tp); return 0; } @@ -425,13 +464,12 @@ xreap_agextent_iter( */ STATIC int xreap_agmeta_extent( - uint64_t fsbno, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xreap_state *rs = priv; struct xfs_scrub *sc = rs->sc; - xfs_agblock_t agbno = fsbno; xfs_agblock_t agbno_next = agbno + len; int error = 0; @@ -496,3 +534,115 @@ xrep_reap_agblocks( return 0; } + +/* + * Break a file metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. The extent must + * not cross an AG boundary. + */ +STATIC int +xreap_fsmeta_extent( + uint64_t fsbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + xfs_agblock_t agbno_next = agbno + len; + int error = 0; + + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); + ASSERT(sc->ip != NULL); + ASSERT(!sc->sa.pag); + + /* + * We're reaping blocks after repairing file metadata, which means that + * we have to init the xchk_ag structure ourselves. 
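One small but important change in the xreap_agextent_iter() hunk above is the batching of deferred frees: a defer-ops barrier is added after every second extent so that no single transaction accumulates an oversized EFI chain. The placeholder calls below are illustrative, not kernel APIs; only the every-other-extent cadence is the point:

    #include <stdio.h>

    static void queue_deferred_free(int i) { printf("defer free of extent %d\n", i); }
    static void add_defer_barrier(void)    { printf("-- barrier --\n"); }

    static void reap_extents(int nr)
    {
        int deferred = 0;

        for (int i = 0; i < nr; i++) {
            queue_deferred_free(i);
            if (++deferred % 2 == 0)
                add_defer_barrier();
        }
    }

    int main(void)
    {
        reap_extents(5);
        return 0;
    }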
+ */ + sc->sa.pag = xfs_perag_get(sc->mp, agno); + if (!sc->sa.pag) + return -EFSCORRUPTED; + + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp); + if (error) + goto out_pag; + + while (agbno < agbno_next) { + xfs_extlen_t aglen; + bool crosslinked; + + error = xreap_agextent_select(rs, agbno, agbno_next, + &crosslinked, &aglen); + if (error) + goto out_agf; + + error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked); + if (error) + goto out_agf; + + if (xreap_want_defer_finish(rs)) { + /* + * Holds the AGF buffer across the deferred chain + * processing. + */ + error = xrep_defer_finish(sc); + if (error) + goto out_agf; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + /* + * Hold the AGF buffer across the transaction roll so + * that we don't have to reattach it to the scrub + * context. + */ + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + error = xfs_trans_roll_inode(&sc->tp, sc->ip); + xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); + if (error) + goto out_agf; + xreap_reset(rs); + } + + agbno += aglen; + } + +out_agf: + xfs_trans_brelse(sc->tp, sc->sa.agf_bp); + sc->sa.agf_bp = NULL; +out_pag: + xfs_perag_put(sc->sa.pag); + sc->sa.pag = NULL; + return error; +} + +/* + * Dispose of every block of every fs metadata extent in the bitmap. + * Do not use this to dispose of the mappings in an ondisk inode fork. + */ +int +xrep_reap_fsblocks( + struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + + error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index fe24626af164..0b69f16dd98f 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -6,7 +6,12 @@ #ifndef __XFS_SCRUB_REAP_H__ #define __XFS_SCRUB_REAP_H__ +struct xagb_bitmap; +struct xfsb_bitmap; + int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); +int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo); #endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 304ea1e1bfb0..bf22f245bbfa 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -441,7 +441,7 @@ xchk_refcountbt_rec( struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) { + if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c new file mode 100644 index 000000000000..f38fccc42a20 --- /dev/null +++ b/fs/xfs/scrub/refcount_repair.c @@ -0,0 +1,794 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_error.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Rebuilding the Reference Count Btree + * ==================================== + * + * This algorithm is "borrowed" from xfs_repair. Imagine the rmap + * entries as rectangles representing extents of physical blocks, and + * that the rectangles can be laid down to allow them to overlap each + * other; then we know that we must emit a refcnt btree entry wherever + * the amount of overlap changes, i.e. the emission stimulus is + * level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). + * + * Note that in the actual refcnt btree we don't store the refcount < 2 + * cases because the bnobt tells us which blocks are free; single-use + * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt + * supports storing multiple entries covering a given block we could + * theoretically dispense with the refcntbt and simply count rmaps, but + * that's inefficient in the (hot) write path, so we'll take the cost of + * the extra tree to save time. Also there's no guarantee that rmap + * will be enabled. + * + * Given an array of rmaps sorted by physical block number, a starting + * physical block (sp), a bag to hold rmaps that cover sp, and the next + * physical block where the level changes (np), we can reconstruct the + * refcount btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. This + * is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. + * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. + * This is the minimum of (the pblk of the next unprocessed rmap) + * and (startblock + len of each rmap in the bag). 
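The level-triggered sweep described above can be modelled in a few dozen lines of standalone C: given rmaps sorted by start block, emit a refcount record wherever the overlap level changes and is at least 2. This toy recomputes the level per span instead of keeping a bag, and it does not merge adjacent spans with equal counts, so it illustrates the idea rather than reproducing the kernel's xfarray-based implementation:

    #include <stdio.h>
    #include <stdlib.h>

    struct rmap { unsigned int start, len; };

    static int cmp_uint(const void *a, const void *b)
    {
        unsigned int x = *(const unsigned int *)a;
        unsigned int y = *(const unsigned int *)b;

        return (x > y) - (x < y);
    }

    static void emit_refcounts(const struct rmap *r, int nr)
    {
        /* Every start or end block is a point where the level can change. */
        unsigned int *edges = malloc(2 * nr * sizeof(*edges));
        int nedges = 0;

        if (!edges)
            return;
        for (int i = 0; i < nr; i++) {
            edges[nedges++] = r[i].start;
            edges[nedges++] = r[i].start + r[i].len;
        }
        qsort(edges, nedges, sizeof(*edges), cmp_uint);

        /* Count the rmaps covering each span between adjacent edges. */
        for (int i = 0; i < nedges - 1; i++) {
            unsigned int lo = edges[i], hi = edges[i + 1], level = 0;

            if (lo == hi)
                continue;
            for (int j = 0; j < nr; j++)
                if (r[j].start <= lo && lo < r[j].start + r[j].len)
                    level++;
            if (level >= 2)
                printf("refcount: start=%u len=%u count=%u\n",
                       lo, hi - lo, level);
        }
        free(edges);
    }

    int main(void)
    {
        /* Blocks 10-19 mapped twice; blocks 15-19 mapped a third time. */
        struct rmap r[] = { { 10, 10 }, { 10, 10 }, { 15, 5 } };

        emit_refcounts(r, 3);
        return 0;
    }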
+ * + * Like all the other repairers, we make a list of all the refcount + * records we need, then reinitialize the refcount btree root and + * insert all the records. + */ + +/* The only parts of the rmap that we care about for computing refcounts. */ +struct xrep_refc_rmap { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; +} __packed; + +struct xrep_refc { + /* refcount extents */ + struct xfarray *refcount_records; + + /* new refcountbt information */ + struct xrep_newbt new_btree; + + /* old refcountbt blocks */ + struct xagb_bitmap old_refcountbt_blocks; + + struct xfs_scrub *sc; + + /* get_records()'s position in the refcount record array. */ + xfarray_idx_t array_cur; + + /* # of refcountbt blocks */ + xfs_extlen_t btblocks; +}; + +/* Check for any obvious conflicts with this shared/CoW staging extent. */ +STATIC int +xrep_refc_check_ext( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock, + rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rc_startblock, rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record a reference count extent. */ +STATIC int +xrep_refc_stash( + struct xrep_refc *rr, + enum xfs_refc_domain domain, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t refcount) +{ + struct xfs_refcount_irec irec = { + .rc_startblock = agbno, + .rc_blockcount = len, + .rc_domain = domain, + }; + struct xfs_scrub *sc = rr->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); + + error = xrep_refc_check_ext(rr->sc, &irec); + if (error) + return error; + + trace_xrep_refc_found(sc->sa.pag, &irec); + + return xfarray_append(rr->refcount_records, &irec); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_refc_stash_cow( + struct xrep_refc *rr, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1); +} + +/* Decide if an rmap could describe a shared extent. */ +static inline bool +xrep_refc_rmap_shareable( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rmap) +{ + /* AG metadata are never sharable */ + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + + /* Metadata in files are never shareable */ + if (xfs_internal_inum(mp, rmap->rm_owner)) + return false; + + /* Metadata and unwritten file blocks are not shareable. */ + if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) + return false; + + return true; +} + +/* + * Walk along the reverse mapping records until we find one that could describe + * a shared extent. + */ +STATIC int +xrep_refc_walk_rmaps( + struct xrep_refc *rr, + struct xrep_refc_rmap *rrm, + bool *have_rec) +{ + struct xfs_rmap_irec rmap; + struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur; + struct xfs_mount *mp = cur->bc_mp; + int have_gt; + int error = 0; + + *have_rec = false; + + /* + * Loop through the remaining rmaps. Remember CoW staging + * extents and the refcountbt blocks from the old tree for later + * disposal. 
We can only share written data fork extents, so + * keep looping until we find an rmap for one. + */ + do { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + return error; + if (!have_gt) + return 0; + + error = xfs_rmap_get_rec(cur, &rmap, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(mp, !have_gt)) + return -EFSCORRUPTED; + + if (rmap.rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_refc_stash_cow(rr, rmap.rm_startblock, + rmap.rm_blockcount); + if (error) + return error; + } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) { + /* refcountbt block, dump it when we're done. */ + rr->btblocks += rmap.rm_blockcount; + error = xagb_bitmap_set(&rr->old_refcountbt_blocks, + rmap.rm_startblock, rmap.rm_blockcount); + if (error) + return error; + } + } while (!xrep_refc_rmap_shareable(mp, &rmap)); + + rrm->startblock = rmap.rm_startblock; + rrm->blockcount = rmap.rm_blockcount; + *have_rec = true; + return 0; +} + +static inline uint32_t +xrep_refc_encode_startblock( + const struct xfs_refcount_irec *irec) +{ + uint32_t start; + + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) + start |= XFS_REFC_COWFLAG; + + return start; +} + +/* Sort in the same order as the ondisk records. */ +static int +xrep_refc_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_refcount_irec *ap = a; + const struct xfs_refcount_irec *bp = b; + uint32_t sa, sb; + + sa = xrep_refc_encode_startblock(ap); + sb = xrep_refc_encode_startblock(bp); + + if (sa > sb) + return 1; + if (sa < sb) + return -1; + return 0; +} + +/* + * Sort the refcount extents by startblock or else the btree records will be in + * the wrong order. Make sure the records do not overlap in physical space. + */ +STATIC int +xrep_refc_sort_records( + struct xrep_refc *rr) +{ + struct xfs_refcount_irec irec; + xfarray_idx_t cur; + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rr->refcount_records, cur) { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfarray_load(rr->refcount_records, cur, &irec); + if (error) + return error; + + if (dom == XFS_REFC_DOMAIN_SHARED && + irec.rc_domain == XFS_REFC_DOMAIN_COW) { + dom = irec.rc_domain; + next_agbno = 0; + } + + if (dom != irec.rc_domain) + return -EFSCORRUPTED; + if (irec.rc_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = irec.rc_startblock + irec.rc_blockcount; + } + + return error; +} + +#define RRM_NEXT(r) ((r).startblock + (r).blockcount) +/* + * Find the next block where the refcount changes, given the next rmap we + * looked at and the ones we're already tracking. 
+ */ +static inline int +xrep_refc_next_edge( + struct xfarray *rmap_bag, + struct xrep_refc_rmap *next_rrm, + bool next_valid, + xfs_agblock_t *nbnop) +{ + struct xrep_refc_rmap rrm; + xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; + xfs_agblock_t nbno = NULLAGBLOCK; + int error; + + if (next_valid) + nbno = next_rrm->startblock; + + while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1) + nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm)); + + if (error) + return error; + + /* + * We should have found /something/ because either next_rrm is the next + * interesting rmap to look at after emitting this refcount extent, or + * there are other rmaps in rmap_bag contributing to the current + * sharing count. But if something is seriously wrong, bail out. + */ + if (nbno == NULLAGBLOCK) + return -EFSCORRUPTED; + + *nbnop = nbno; + return 0; +} + +/* + * Walk forward through the rmap btree to collect all rmaps starting at + * @bno in @rmap_bag. These represent the file(s) that share ownership of + * the current block. Upon return, the rmap cursor points to the last record + * satisfying the startblock constraint. + */ +static int +xrep_refc_push_rmaps_at( + struct xrep_refc *rr, + struct xfarray *rmap_bag, + xfs_agblock_t bno, + struct xrep_refc_rmap *rrm, + bool *have, + uint64_t *stack_sz) +{ + struct xfs_scrub *sc = rr->sc; + int have_gt; + int error; + + while (*have && rrm->startblock == bno) { + error = xfarray_store_anywhere(rmap_bag, rrm); + if (error) + return error; + (*stack_sz)++; + error = xrep_refc_walk_rmaps(rr, rrm, have); + if (error) + return error; + } + + error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) + return -EFSCORRUPTED; + + return 0; +} + +/* Iterate all the rmap records to generate reference count data. */ +STATIC int +xrep_refc_find_refcounts( + struct xrep_refc *rr) +{ + struct xrep_refc_rmap rrm; + struct xfs_scrub *sc = rr->sc; + struct xfarray *rmap_bag; + char *descr; + uint64_t old_stack_sz; + uint64_t stack_sz = 0; + xfs_agblock_t sbno; + xfs_agblock_t cbno; + xfs_agblock_t nbno; + bool have; + int error; + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Set up a sparse array to store all the rmap records that we're + * tracking to generate a reference count record. If this exceeds + * MAXREFCOUNT, we clamp rc_refcount. + */ + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap), + &rmap_bag); + kfree(descr); + if (error) + goto out_cur; + + /* Start the rmapbt cursor to the left of all records. */ + error = xfs_btree_goto_left_edge(sc->sa.rmap_cur); + if (error) + goto out_bag; + + /* Process reverse mappings into refcount data. */ + while (xfs_btree_has_more_records(sc->sa.rmap_cur)) { + /* Push all rmaps with pblk == sbno onto the stack */ + error = xrep_refc_walk_rmaps(rr, &rrm, &have); + if (error) + goto out_bag; + if (!have) + break; + sbno = cbno = rrm.startblock; + error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno, + &rrm, &have, &stack_sz); + if (error) + goto out_bag; + + /* Set nbno to the bno of the next refcount change */ + error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + old_stack_sz = stack_sz; + + /* While stack isn't empty... 
*/ + while (stack_sz) { + xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; + + /* Pop all rmaps that end at nbno */ + while ((error = xfarray_iter(rmap_bag, &array_cur, + &rrm)) == 1) { + if (RRM_NEXT(rrm) != nbno) + continue; + error = xfarray_unset(rmap_bag, array_cur - 1); + if (error) + goto out_bag; + stack_sz--; + } + if (error) + goto out_bag; + + /* Push array items that start at nbno */ + error = xrep_refc_walk_rmaps(rr, &rrm, &have); + if (error) + goto out_bag; + if (have) { + error = xrep_refc_push_rmaps_at(rr, rmap_bag, + nbno, &rrm, &have, &stack_sz); + if (error) + goto out_bag; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (stack_sz != old_stack_sz) { + if (old_stack_sz > 1) { + error = xrep_refc_stash(rr, + XFS_REFC_DOMAIN_SHARED, + cbno, nbno - cbno, + old_stack_sz); + if (error) + goto out_bag; + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (stack_sz == 0) + break; + old_stack_sz = stack_sz; + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + error = xrep_refc_next_edge(rmap_bag, &rrm, have, + &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + } + } + + ASSERT(stack_sz == 0); +out_bag: + xfarray_destroy(rmap_bag); +out_cur: + xchk_ag_btcur_free(&sc->sa); + return error; +} +#undef RRM_NEXT + +/* Retrieve refcountbt data for bulk load. */ +STATIC int +xrep_refc_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + struct xrep_refc *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(rr->refcount_records, rr->array_cur++, + irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_refc_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_refc *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Update the AGF counters. */ +STATIC int +xrep_refc_reset_counters( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the refcountbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_refcount_level = pag->pagf_refcount_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected refcount information to stage a new refcount btree. If + * this is successful we'll return with the new btree root information logged + * to the repair transaction but not yet committed. 
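+ *
+ * The steps are: sort the staged refcount records, reserve blocks for the
+ * new btree, bulk load the records into the staged btree, and only then
+ * commit the new root into the AGF.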
+ */ +STATIC int +xrep_refc_build_new_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *refc_cur; + struct xfs_perag *pag = sc->sa.pag; + xfs_fsblock_t fsbno; + int error; + + error = xrep_refc_sort_records(rr); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp)); + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno, + XFS_AG_RESV_METADATA); + rr->new_btree.bload.get_records = xrep_refc_get_records; + rr->new_btree.bload.claim_block = xrep_refc_claim_block; + + /* Compute how many blocks we'll need. */ + refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake, + pag); + error = xfs_btree_bload_compute_geometry(refc_cur, + &rr->new_btree.bload, + xfarray_length(rr->refcount_records)); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the incore btree height so + * that we don't trip the verifiers when writing the new btree blocks + * to disk. + */ + pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height; + + /* Add all observed refcount records. */ + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); + if (error) + goto err_level; + + /* + * Install the new btree in the AG header. After this point the old + * btree is no longer accessible and the new tree is live. + */ + xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(refc_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_refc_reset_counters(rr); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_level: + pag->pagf_repair_refcount_level = 0; +err_cur: + xfs_btree_del_cursor(refc_cur, error); +err_newbt: + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_refc_remove_old_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + int error; + + /* Free the old refcountbt blocks if they're not in use. */ + error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks, + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + if (error) + return error; + + /* + * Now that we've zapped all the old refcountbt blocks we can turn off + * the alternate height mechanism and reset the per-AG space + * reservations. + */ + pag->pagf_repair_refcount_level = 0; + sc->flags |= XREP_RESET_PERAG_RESV; + return 0; +} + +/* Rebuild the refcount btree. 
*/ +int +xrep_refcountbt( + struct xfs_scrub *sc) +{ + struct xrep_refc *rr; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + rr->sc = sc; + + /* Set up enough storage to handle one refcount record per block. */ + descr = xchk_xfile_ag_descr(sc, "reference count records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks, + sizeof(struct xfs_refcount_irec), + &rr->refcount_records); + kfree(descr); + if (error) + goto out_rr; + + /* Collect all reference counts. */ + xagb_bitmap_init(&rr->old_refcountbt_blocks); + error = xrep_refc_find_refcounts(rr); + if (error) + goto out_bitmap; + + /* Rebuild the refcount information. */ + error = xrep_refc_build_new_tree(rr); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_refc_remove_old_tree(rr); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&rr->old_refcountbt_blocks); + xfarray_destroy(rr->refcount_records); +out_rr: + kfree(rr); + return error; +} diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1b8b5439f2d7..745d5b8f405a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -27,6 +27,9 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_defer.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_reflink.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -176,6 +179,16 @@ xrep_roll_ag_trans( return 0; } +/* Roll the scrub transaction, holding the primary metadata locked. */ +int +xrep_roll_trans( + struct xfs_scrub *sc) +{ + if (!sc->ip) + return xrep_roll_ag_trans(sc); + return xfs_trans_roll_inode(&sc->tp, sc->ip); +} + /* Finish all deferred work attached to the repair transaction. */ int xrep_defer_finish( @@ -673,6 +686,7 @@ xrep_find_ag_btree_roots( return error; } +#ifdef CONFIG_XFS_QUOTA /* Force a quotacheck the next time we mount. */ void xrep_force_quotacheck( @@ -699,10 +713,10 @@ xrep_force_quotacheck( * * This function ensures that the appropriate dquots are attached to an inode. * We cannot allow the dquot code to allocate an on-disk dquot block here - * because we're already in transaction context with the inode locked. The - * on-disk dquot should already exist anyway. If the quota code signals - * corruption or missing quota information, schedule quotacheck, which will - * repair corruptions in the quota metadata. + * because we're already in transaction context. The on-disk dquot should + * already exist anyway. If the quota code signals corruption or missing quota + * information, schedule quotacheck, which will repair corruptions in the quota + * metadata. */ int xrep_ino_dqattach( @@ -710,7 +724,10 @@ xrep_ino_dqattach( { int error; - error = xfs_qm_dqattach_locked(sc->ip, false); + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + error = xfs_qm_dqattach(sc->ip); switch (error) { case -EFSBADCRC: case -EFSCORRUPTED: @@ -734,3 +751,367 @@ xrep_ino_dqattach( return error; } +#endif /* CONFIG_XFS_QUOTA */ + +/* + * Ensure that the inode being repaired is ready to handle a certain number of + * extents, or return EFSCORRUPTED. Caller must hold the ILOCK of the inode + * being repaired and have joined it to the scrub transaction. 
+ */ +int +xrep_ino_ensure_extent_count( + struct xfs_scrub *sc, + int whichfork, + xfs_extnum_t nextents) +{ + xfs_extnum_t max_extents; + bool inode_has_nrext64; + + inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip); + max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork); + if (nextents <= max_extents) + return 0; + if (inode_has_nrext64) + return -EFSCORRUPTED; + if (!xfs_has_large_extent_counts(sc->mp)) + return -EFSCORRUPTED; + + max_extents = xfs_iext_max_nextents(true, whichfork); + if (nextents > max_extents) + return -EFSCORRUPTED; + + sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return 0; +} + +/* + * Initialize all the btree cursors for an AG repair except for the btree that + * we're rebuilding. + */ +void +xrep_ag_btcur_init( + struct xfs_scrub *sc, + struct xchk_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + + /* Set up a bnobt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT && + sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) { + sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag, XFS_BTNUM_BNO); + sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag, XFS_BTNUM_CNT); + } + + /* Set up a inobt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT && + sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) { + sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, + sa->agi_bp, XFS_BTNUM_INO); + if (xfs_has_finobt(mp)) + sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag, + sc->tp, sa->agi_bp, XFS_BTNUM_FINO); + } + + /* Set up a rmapbt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT && + xfs_has_rmapbt(mp)) + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); + + /* Set up a refcountbt cursor for cross-referencing. */ + if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT && + xfs_has_reflink(mp)) + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, sc->sa.pag); +} + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGF + * buffer. We had better get the same AGF buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagf( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agf(pag)); + + clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp); + if (error) + return error; + + if (bp != sc->sa.agf_bp) { + ASSERT(bp == sc->sa.agf_bp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGI + * buffer. We had better get the same AGI buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagi( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agi(pag)); + + clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); + error = xfs_ialloc_read_agi(pag, sc->tp, &bp); + if (error) + return error; + + if (bp != sc->sa.agi_bp) { + ASSERT(bp == sc->sa.agi_bp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Given an active reference to a perag structure, load AG headers and cursors. + * This should only be called to scan an AG while repairing file-based metadata. 
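+ *
+ * Both the AGI and the AGF buffers are read, and the usual set of
+ * cross-referencing btree cursors is attached to @sa.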
+ */ +int +xrep_ag_init( + struct xfs_scrub *sc, + struct xfs_perag *pag, + struct xchk_ag *sa) +{ + int error; + + ASSERT(!sa->pag); + + error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp); + if (error) + return error; + + error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp); + if (error) + return error; + + /* Grab our own passive reference from the caller's ref. */ + sa->pag = xfs_perag_hold(pag); + xrep_ag_btcur_init(sc, sa); + return 0; +} + +/* Reinitialize the per-AG block reservation for the AG we just fixed. */ +int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + int error; + + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(sc->sa.pag != NULL); + ASSERT(sc->ops->type == ST_PERAG); + ASSERT(sc->tp); + + sc->flags &= ~XREP_RESET_PERAG_RESV; + error = xfs_ag_resv_free(sc->sa.pag); + if (error) + goto out; + error = xfs_ag_resv_init(sc->sa.pag, sc->tp); + if (error == -ENOSPC) { + xfs_err(sc->mp, +"Insufficient free space to reset per-AG reservation for AG %u after repair.", + sc->sa.pag->pag_agno); + error = 0; + } + +out: + return error; +} + +/* Decide if we are going to call the repair function for a scrub type. */ +bool +xrep_will_attempt( + struct xfs_scrub *sc) +{ + /* Userspace asked us to rebuild the structure regardless. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) + return true; + + /* Let debug users force us into the repair routines. */ + if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + return true; + + /* Metadata is corrupt or failed cross-referencing. */ + if (xchk_needs_repair(sc->sm)) + return true; + + return false; +} + +/* Try to fix some part of a metadata inode by calling another scrubber. */ +STATIC int +xrep_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + __u32 smtype = sc->sm->sm_type; + __u32 smflags = sc->sm->sm_flags; + unsigned int sick_mask = sc->sick_mask; + int error; + + /* + * Let's see if the inode needs repair. We're going to open-code calls + * to the scrub and repair functions so that we can hang on to the + * resources that we already acquired instead of using the standard + * setup/teardown routines. + */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + sc->sm->sm_type = scrub_type; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xchk_bmap_attr(sc); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + if (error) + goto out; + + if (!xrep_will_attempt(sc)) + goto out; + + /* + * Repair some part of the inode. This will potentially join the inode + * to the transaction. + */ + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xrep_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xrep_bmap(sc, XFS_DATA_FORK, false); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xrep_bmap(sc, XFS_ATTR_FORK, false); + break; + } + if (error) + goto out; + + /* + * Finish all deferred intent items and then roll the transaction so + * that the inode will not be joined to the transaction when we exit + * the function. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + error = xfs_trans_roll(&sc->tp); + if (error) + goto out; + + /* + * Clear the corruption flags and re-check the metadata that we just + * repaired. 
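+ * Any corruption flags that are still set after this second pass mean
+ * that the repair did not succeed.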
+ */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xchk_bmap_attr(sc); + break; + } + if (error) + goto out; + + /* If corruption persists, the repair has failed. */ + if (xchk_needs_repair(sc->sm)) { + error = -EFSCORRUPTED; + goto out; + } +out: + sc->sick_mask = sick_mask; + sc->sm->sm_type = smtype; + sc->sm->sm_flags = smflags; + return error; +} + +/* + * Repair the ondisk forks of a metadata inode. The caller must ensure that + * sc->ip points to the metadata inode and the ILOCK is held on that inode. + * The inode must not be joined to the transaction before the call, and will + * not be afterwards. + */ +int +xrep_metadata_inode_forks( + struct xfs_scrub *sc) +{ + bool dirty = false; + int error; + + /* Repair the inode record and the data fork. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error) + return error; + + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); + if (error) + return error; + + /* Make sure the attr fork looks ok before we delete it. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error) + return error; + + /* Clear the reflink flag since metadata never shares. */ + if (xfs_is_reflink_inode(sc->ip)) { + dirty = true; + xfs_trans_ijoin(sc->tp, sc->ip, 0); + error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); + if (error) + return error; + } + + /* + * If we modified the inode, roll the transaction but don't rejoin the + * inode to the new transaction because xrep_bmap_data can do that. + */ + if (dirty) { + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + dirty = false; + } + + return 0; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 60d2a9ae5f2e..17114327e6fa 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -28,15 +28,28 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); +bool xrep_will_attempt(struct xfs_scrub *sc); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); +int xrep_roll_trans(struct xfs_scrub *sc); int xrep_defer_finish(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, enum xfs_ag_resv_type type); xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); +static inline int +xrep_trans_commit( + struct xfs_scrub *sc) +{ + int error = xfs_trans_commit(sc->tp); + + sc->tp = NULL; + return error; +} + struct xbitmap; struct xagb_bitmap; +struct xfsb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); @@ -57,8 +70,35 @@ struct xrep_find_ag_btree { int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); + +#ifdef CONFIG_XFS_QUOTA void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); +#else +# define xrep_force_quotacheck(sc, type) ((void)0) +# define xrep_ino_dqattach(sc) (0) +#endif /* CONFIG_XFS_QUOTA */ + +int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork, + xfs_extnum_t nextents); +int xrep_reset_perag_resv(struct xfs_scrub *sc); +int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); +int xrep_metadata_inode_forks(struct xfs_scrub *sc); + +/* Repair setup functions */ 
+int xrep_setup_ag_allocbt(struct xfs_scrub *sc); + +struct xfs_imap; +int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap); + +void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); +int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag, + struct xchk_ag *sa); + +/* Metadata revalidators */ + +int xrep_revalidate_allocbt(struct xfs_scrub *sc); +int xrep_revalidate_iallocbt(struct xfs_scrub *sc); /* Metadata repairers */ @@ -67,9 +107,34 @@ int xrep_superblock(struct xfs_scrub *sc); int xrep_agf(struct xfs_scrub *sc); int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); +int xrep_allocbt(struct xfs_scrub *sc); +int xrep_iallocbt(struct xfs_scrub *sc); +int xrep_refcountbt(struct xfs_scrub *sc); +int xrep_inode(struct xfs_scrub *sc); +int xrep_bmap_data(struct xfs_scrub *sc); +int xrep_bmap_attr(struct xfs_scrub *sc); +int xrep_bmap_cow(struct xfs_scrub *sc); + +#ifdef CONFIG_XFS_RT +int xrep_rtbitmap(struct xfs_scrub *sc); +#else +# define xrep_rtbitmap xrep_notsupported +#endif /* CONFIG_XFS_RT */ + +#ifdef CONFIG_XFS_QUOTA +int xrep_quota(struct xfs_scrub *sc); +#else +# define xrep_quota xrep_notsupported +#endif /* CONFIG_XFS_QUOTA */ + +int xrep_reinit_pagf(struct xfs_scrub *sc); +int xrep_reinit_pagi(struct xfs_scrub *sc); #else +#define xrep_ino_dqattach(sc) (0) +#define xrep_will_attempt(sc) (false) + static inline int xrep_attempt( struct xfs_scrub *sc, @@ -87,11 +152,45 @@ xrep_calc_ag_resblks( return 0; } +static inline int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(0); + return -EOPNOTSUPP; +} + +/* repair setup functions for no-repair */ +static inline int +xrep_setup_nothing( + struct xfs_scrub *sc) +{ + return 0; +} +#define xrep_setup_ag_allocbt xrep_setup_nothing + +#define xrep_setup_inode(sc, imap) ((void)0) + +#define xrep_revalidate_allocbt (NULL) +#define xrep_revalidate_iallocbt (NULL) + #define xrep_probe xrep_notsupported #define xrep_superblock xrep_notsupported #define xrep_agf xrep_notsupported #define xrep_agfl xrep_notsupported #define xrep_agi xrep_notsupported +#define xrep_allocbt xrep_notsupported +#define xrep_iallocbt xrep_notsupported +#define xrep_refcountbt xrep_notsupported +#define xrep_inode xrep_notsupported +#define xrep_bmap_data xrep_notsupported +#define xrep_bmap_attr xrep_notsupported +#define xrep_bmap_cow xrep_notsupported +#define xrep_rtbitmap xrep_notsupported +#define xrep_quota xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index d29a26ecddd6..c99d1714f283 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -24,6 +24,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" /* * Set us up to scrub reverse mapping btrees. diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 41a1d89ae8e6..441ca9977652 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -14,17 +14,33 @@ #include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bit.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/rtbitmap.h" /* Set us up with the realtime metadata locked. 
*/ int xchk_setup_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb; int error; - error = xchk_trans_alloc(sc, 0); + rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS); + if (!rtb) + return -ENOMEM; + sc->buf = rtb; + + if (xchk_could_repair(sc)) { + error = xrep_setup_rtbitmap(sc, rtb); + if (error) + return error; + } + + error = xchk_trans_alloc(sc, rtb->resblks); if (error) return error; @@ -32,7 +48,22 @@ xchk_setup_rtbitmap( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + + /* + * Now that we've locked the rtbitmap, we can't race with growfsrt + * trying to expand the bitmap or change the size of the rt volume. + * Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rtb->rextslog = xfs_compute_rextslog(rtb->rextents); + rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents); + } return 0; } @@ -63,21 +94,30 @@ STATIC int xchk_rtbitmap_check_extents( struct xfs_scrub *sc) { - struct xfs_mount *mp = sc->mp; struct xfs_bmbt_irec map; - xfs_rtblock_t off; - int nmap; + struct xfs_iext_cursor icur; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; int error = 0; - for (off = 0; off < mp->m_sb.sb_rbmblocks;) { + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } + + while (off < endoff) { + int nmap = 1; + if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) break; /* Make sure we have a written extent. */ - nmap = 1; - error = xfs_bmapi_read(mp->m_rbmip, off, - mp->m_sb.sb_rbmblocks - off, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) break; @@ -98,12 +138,48 @@ int xchk_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb = sc->buf; int error; - /* Is the size of the rtbitmap correct? */ - if (sc->mp->m_rbmip->i_disk_size != - XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) { - xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino); + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rtb->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* Is sb_rextslog correct? */ + if (mp->m_sb.sb_rextslog != rtb->rextslog) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is sb_rbmblocks large enough to handle the current rt volume? In no + * case can we exceed 4bn bitmap blocks since the super field is a u32. + */ + if (rtb->rbmblocks > U32_MAX) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* The bitmap file length must be aligned to an fsblock. */ + if (mp->m_rbmip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is the bitmap file itself large enough to handle the rt volume? + * growfsrt expands the bitmap file before updating sb_rextents, so the + * file can be larger than sb_rbmblocks. 
+ */ + if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); return 0; } @@ -116,12 +192,11 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; -out: - return error; + return 0; } /* xref check that the extent is not free in the rtbitmap */ diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h new file mode 100644 index 000000000000..85304ff019e1 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RTBITMAP_H__ +#define __XFS_SCRUB_RTBITMAP_H__ + +struct xchk_rtbitmap { + uint64_t rextents; + uint64_t rbmblocks; + unsigned int rextslog; + unsigned int resblks; +}; + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb); +#else +# define xrep_setup_rtbitmap(sc, rtb) (0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_RTBITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c new file mode 100644 index 000000000000..46f5d5f605c9 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/rtbitmap.h" + +/* Set up to repair the realtime bitmap file metadata. */ +int +xrep_setup_rtbitmap( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + + /* + * Reserve enough blocks to write out a completely new bmbt for a + * maximally fragmented bitmap file. We do not hold the rtbitmap + * ILOCK yet, so this is entirely speculative. + */ + blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + rtb->resblks += blocks; + return 0; +} + +/* + * Make sure that the given range of the data fork of the realtime file is + * mapped to written blocks. The caller must ensure that the inode is joined + * to the transaction. + */ +STATIC int +xrep_rtbitmap_data_mappings( + struct xfs_scrub *sc, + xfs_filblks_t len) +{ + struct xfs_bmbt_irec map; + xfs_fileoff_t off = 0; + int error; + + ASSERT(sc->ip != NULL); + + while (off < len) { + int nmaps = 1; + + /* + * If we have a real extent mapping this block then we're + * in ok shape. + */ + error = xfs_bmapi_read(sc->ip, off, len - off, &map, &nmaps, + XFS_DATA_FORK); + if (error) + return error; + if (nmaps == 0) { + ASSERT(nmaps != 0); + return -EFSCORRUPTED; + } + + /* + * Written extents are ok. Holes are not filled because we + * do not know the freespace information. 
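+ * Anything else that reaches the code below should be an unwritten
+ * extent, which is converted to a written, zeroed extent.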
+ */ + if (xfs_bmap_is_written_extent(&map) || + map.br_startblock == HOLESTARTBLOCK) { + off = map.br_startoff + map.br_blockcount; + continue; + } + + /* + * If we find a delalloc reservation then something is very + * very wrong. Bail out. + */ + if (map.br_startblock == DELAYSTARTBLOCK) + return -EFSCORRUPTED; + + /* Make sure we're really converting an unwritten extent. */ + if (map.br_state != XFS_EXT_UNWRITTEN) { + ASSERT(map.br_state == XFS_EXT_UNWRITTEN); + return -EFSCORRUPTED; + } + + /* Make sure this block has a real zeroed extent mapped. */ + nmaps = 1; + error = xfs_bmapi_write(sc->tp, sc->ip, map.br_startoff, + map.br_blockcount, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, + 0, &map, &nmaps); + if (error) + return error; + if (nmaps != 1) + return -EFSCORRUPTED; + + /* Commit new extent and all deferred work. */ + error = xrep_defer_finish(sc); + if (error) + return error; + + off = map.br_startoff + map.br_blockcount; + } + + return 0; +} + +/* Fix broken rt volume geometry. */ +STATIC int +xrep_rtbitmap_geometry( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + + /* Superblock fields */ + if (mp->m_sb.sb_rextents != rtb->rextents) + xfs_trans_mod_sb(sc->tp, XFS_TRANS_SB_REXTENTS, + rtb->rextents - mp->m_sb.sb_rextents); + + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + rtb->rbmblocks - mp->m_sb.sb_rbmblocks); + + if (mp->m_sb.sb_rextslog != rtb->rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + rtb->rextslog - mp->m_sb.sb_rextslog); + + /* Fix broken isize */ + sc->ip->i_disk_size = roundup_64(sc->ip->i_disk_size, + mp->m_sb.sb_blocksize); + + if (sc->ip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) + sc->ip->i_disk_size = XFS_FSB_TO_B(mp, rtb->rbmblocks); + + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* Repair the realtime bitmap file metadata. */ +int +xrep_rtbitmap( + struct xfs_scrub *sc) +{ + struct xchk_rtbitmap *rtb = sc->buf; + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + int error; + + /* Impossibly large rtbitmap means we can't touch the filesystem. */ + if (rtb->rbmblocks > U32_MAX) + return 0; + + /* + * If the size of the rt bitmap file is larger than what we reserved, + * figure out if we need to adjust the block reservation in the + * transaction. + */ + blocks = xfs_bmbt_calc_size(mp, rtb->rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + if (blocks > rtb->resblks) { + error = xfs_trans_reserve_more(sc->tp, blocks, 0); + if (error) + return error; + + rtb->resblks += blocks; + } + + /* Fix inode core and forks. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Ensure no unwritten extents. */ + error = xrep_rtbitmap_data_mappings(sc, rtb->rbmblocks); + if (error) + return error; + + /* Fix inconsistent bitmap geometry */ + return xrep_rtbitmap_geometry(sc, rtb); +} diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 8b15c47408d0..fabd0ed9dfa6 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -31,6 +31,18 @@ * (potentially large) amount of data in pageable memory. */ +struct xchk_rtsummary { + struct xfs_rtalloc_args args; + + uint64_t rextents; + uint64_t rbmblocks; + uint64_t rsumsize; + unsigned int rsumlevels; + + /* Memory buffer for the summary comparison. */ + union xfs_suminfo_raw words[]; +}; + /* Set us up to check the rtsummary file. 
*/ int xchk_setup_rtsummary( @@ -38,8 +50,15 @@ xchk_setup_rtsummary( { struct xfs_mount *mp = sc->mp; char *descr; + struct xchk_rtsummary *rts; int error; + rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize), + XCHK_GFP_FLAGS); + if (!rts) + return -ENOMEM; + sc->buf = rts; + /* * Create an xfile to construct a new rtsummary file. The xfile allows * us to avoid pinning kernel memory for this purpose. @@ -54,15 +73,14 @@ xchk_setup_rtsummary( if (error) return error; - /* Allocate a memory buffer for the summary comparison. */ - sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS); - if (!sc->buf) - return -ENOMEM; - error = xchk_install_live_inode(sc, mp->m_rsumip); if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + /* * Locking order requires us to take the rtbitmap first. We must be * careful to unlock it ourselves when we are done with the rtbitmap @@ -71,13 +89,29 @@ xchk_setup_rtsummary( */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + + /* + * Now that we've locked the rtbitmap and rtsummary, we can't race with + * growfsrt trying to expand the summary or change the size of the rt + * volume. Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + xfs_filblks_t rsumblocks; + int rextslog; + + rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rextslog = xfs_compute_rextslog(rts->rextents); + rts->rsumlevels = rextslog + 1; + rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents); + rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels, + rts->rbmblocks); + rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks); + } return 0; } /* Helper functions to record suminfo words in an xfile. */ -typedef unsigned int xchk_rtsumoff_t; - static inline int xfsum_load( struct xfs_scrub *sc, @@ -143,7 +177,7 @@ xchk_rtsum_record_free( /* Compute the relevant location in the rtsum file. */ rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext); - lenlog = XFS_RTBLOCKLOG(rec->ar_extcount); + lenlog = xfs_highbit64(rec->ar_extcount); offs = xfs_rtsumoffs(mp, lenlog, rbmoff); rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); @@ -188,19 +222,29 @@ STATIC int xchk_rtsum_compare( struct xfs_scrub *sc) { - struct xfs_rtalloc_args args = { - .mp = sc->mp, - .tp = sc->tp, - }; - struct xfs_mount *mp = sc->mp; struct xfs_bmbt_irec map; - xfs_fileoff_t off; - xchk_rtsumoff_t sumoff = 0; - int nmap; + struct xfs_iext_cursor icur; + + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xchk_rtsummary *rts = sc->buf; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; + xfs_rtsumoff_t sumoff = 0; + int error = 0; + + rts->args.mp = sc->mp; + rts->args.tp = sc->tp; + + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } - for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) { - union xfs_suminfo_raw *ondisk_info; - int error = 0; + while (off < endoff) { + int nmap = 1; if (xchk_should_terminate(sc, &error)) return error; @@ -208,8 +252,7 @@ xchk_rtsum_compare( return 0; /* Make sure we have a written extent. 
*/ - nmap = 1; - error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; @@ -219,24 +262,33 @@ xchk_rtsum_compare( return 0; } + off += map.br_blockcount; + } + + for (off = 0; off < endoff; off++) { + union xfs_suminfo_raw *ondisk_info; + /* Read a block's worth of ondisk rtsummary file. */ - error = xfs_rtsummary_read_buf(&args, off); + error = xfs_rtsummary_read_buf(&rts->args, off); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; /* Read a block's worth of computed rtsummary file. */ - error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize); + error = xfsum_copyout(sc, sumoff, rts->words, mp->m_blockwsize); if (error) { - xfs_rtbuf_cache_relse(&args); + xfs_rtbuf_cache_relse(&rts->args); return error; } - ondisk_info = xfs_rsumblock_infoptr(&args, 0); - if (memcmp(ondisk_info, sc->buf, - mp->m_blockwsize << XFS_WORDLOG) != 0) + ondisk_info = xfs_rsumblock_infoptr(&rts->args, 0); + if (memcmp(ondisk_info, rts->words, + mp->m_blockwsize << XFS_WORDLOG) != 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + xfs_rtbuf_cache_relse(&rts->args); + return error; + } - xfs_rtbuf_cache_relse(&args); + xfs_rtbuf_cache_relse(&rts->args); sumoff += mp->m_blockwsize; } @@ -249,8 +301,43 @@ xchk_rtsummary( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xchk_rtsummary *rts = sc->buf; int error = 0; + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rts->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + goto out_rbm; + } + + /* Is m_rsumlevels correct? */ + if (mp->m_rsumlevels != rts->rsumlevels) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* Is m_rsumsize correct? */ + if (mp->m_rsumsize != rts->rsumsize) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* The summary file length must be aligned to an fsblock. */ + if (mp->m_rsumip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* + * Is the summary file itself large enough to handle the rt volume? + * growfsrt expands the summary file before updating sb_rextents, so + * the file can be larger than rsumsize. + */ + if (mp->m_rsumip->i_disk_size < rts->rsumsize) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + /* Invoke the fork scrubber. 
*/ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 4849efcaa33a..caf324c2b991 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -14,8 +14,6 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_qm.h" -#include "xfs_errortag.h" -#include "xfs_error.h" #include "xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -238,27 +236,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_bnobt, - .repair = xrep_notsupported, + .scrub = xchk_allocbt, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_cntbt, - .repair = xrep_notsupported, + .scrub = xchk_allocbt, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_inobt, - .repair = xrep_notsupported, + .scrub = xchk_iallocbt, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_finobt, + .scrub = xchk_iallocbt, .has = xfs_has_finobt, - .repair = xrep_notsupported, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ .type = ST_PERAG, @@ -272,31 +274,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .setup = xchk_setup_ag_refcountbt, .scrub = xchk_refcountbt, .has = xfs_has_reflink, - .repair = xrep_notsupported, + .repair = xrep_refcountbt, }, [XFS_SCRUB_TYPE_INODE] = { /* inode record */ .type = ST_INODE, .setup = xchk_setup_inode, .scrub = xchk_inode, - .repair = xrep_notsupported, + .repair = xrep_inode, }, [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_data, - .repair = xrep_notsupported, + .repair = xrep_bmap_data, }, [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_attr, - .repair = xrep_notsupported, + .repair = xrep_bmap_attr, }, [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_cow, - .repair = xrep_notsupported, + .repair = xrep_bmap_cow, }, [XFS_SCRUB_TYPE_DIR] = { /* directory */ .type = ST_INODE, @@ -326,33 +328,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, - .has = xfs_has_realtime, - .repair = xrep_notsupported, + .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, - .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs 
summary counters */ .type = ST_FS, @@ -531,7 +531,10 @@ retry_op: /* Scrub for errors. */ check_start = xchk_stats_now(); - error = sc->ops->scrub(sc); + if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL) + error = sc->ops->repair_eval(sc); + else + error = sc->ops->scrub(sc); run.scrub_ns += xchk_stats_elapsed_ns(check_start); if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) goto try_harder; @@ -542,23 +545,12 @@ retry_op: xchk_update_health(sc); - if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && - !(sc->flags & XREP_ALREADY_FIXED)) { - bool needs_fix = xchk_needs_repair(sc->sm); - - /* Userspace asked us to rebuild the structure regardless. */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) - needs_fix = true; - - /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) - needs_fix = true; - + if (xchk_could_repair(sc)) { /* * If userspace asked for a repair but it wasn't necessary, * report that back to userspace. */ - if (!needs_fix) { + if (!xrep_will_attempt(sc)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; goto out_nofix; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 1ef9c6b4842a..7fc50654c4fe 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -35,6 +35,14 @@ struct xchk_meta_ops { /* Repair or optimize the metadata. */ int (*repair)(struct xfs_scrub *); + /* + * Re-scrub the metadata we repaired, in case there's extra work that + * we need to do to check our repair work. If this is NULL, we'll use + * the ->scrub function pointer, assuming that the regular scrub is + * sufficient. + */ + int (*repair_eval)(struct xfs_scrub *sc); + /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_mount *); @@ -113,6 +121,7 @@ struct xfs_scrub { #define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ /* @@ -129,10 +138,8 @@ int xchk_superblock(struct xfs_scrub *sc); int xchk_agf(struct xfs_scrub *sc); int xchk_agfl(struct xfs_scrub *sc); int xchk_agi(struct xfs_scrub *sc); -int xchk_bnobt(struct xfs_scrub *sc); -int xchk_cntbt(struct xfs_scrub *sc); -int xchk_inobt(struct xfs_scrub *sc); -int xchk_finobt(struct xfs_scrub *sc); +int xchk_allocbt(struct xfs_scrub *sc); +int xchk_iallocbt(struct xfs_scrub *sc); int xchk_rmapbt(struct xfs_scrub *sc); int xchk_refcountbt(struct xfs_scrub *sc); int xchk_inode(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 38708fb9a5d7..ddff86713df3 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -12,8 +12,10 @@ #include "xfs_log_format.h" #include "xfs_inode.h" #include "xfs_symlink.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/health.h" /* Set us up to scrub a symbolic link. */ int @@ -41,29 +43,37 @@ xchk_symlink( if (!S_ISLNK(VFS_I(ip)->i_mode)) return -ENOENT; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_SYMLINK_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); len = ip->i_disk_size; /* Plausible size? 
*/ if (len > XFS_SYMLINK_MAXLEN || len <= 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Inline symlink? */ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { if (len > xfs_inode_data_fork_size(ip) || - len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip))) + len > strnlen(ifp->if_data, xfs_inode_data_fork_size(ip))) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Remote symlink; must read the contents. */ error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); -out: - return error; + + /* If a remote symlink is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); + return 0; } diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 29afa4851235..d0e24ffaf754 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -14,9 +14,12 @@ #include "xfs_btree.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_quota.h" +#include "xfs_quota_defs.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" +#include "scrub/quota.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 4a8bc6f3c8f2..6bbb4e8639dc 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -19,6 +19,7 @@ struct xfile; struct xfarray; struct xfarray_sortinfo; +struct xchk_dqiter; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -106,6 +107,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ + { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } DECLARE_EVENT_CLASS(xchk_class, @@ -347,6 +349,54 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \ DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error); DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning); +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xchk_dqiter_class, + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), + TP_ARGS(cursor, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, dqtype) + __field(xfs_ino_t, ino) + __field(unsigned long long, cur_id) + __field(unsigned long long, id) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + ), + TP_fast_assign( + __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev; + __entry->dqtype = cursor->dqtype; + __entry->ino = cursor->quota_ip->i_ino; + __entry->cur_id = cursor->id; + __entry->startoff = cursor->bmap.br_startoff; + __entry->startblock = cursor->bmap.br_startblock; + __entry->blockcount = cursor->bmap.br_blockcount; + __entry->state = cursor->bmap.br_state; + __entry->id = id; + ), + TP_printk("dev %d:%d dquot type %s ino 0x%llx cursor_id 0x%llx startoff 0x%llx startblock 0x%llx blockcount 0x%llx state %u id 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS), + __entry->ino, + __entry->cur_id, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->id) +); + +#define DEFINE_SCRUB_DQITER_EVENT(name) \ +DEFINE_EVENT(xchk_dqiter_class, name, \ + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), \ 
+ TP_ARGS(cursor, id)) +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter); +#endif /* CONFIG_XFS_QUOTA */ + TRACE_EVENT(xchk_incomplete, TP_PROTO(struct xfs_scrub *sc, void *ret_ip), TP_ARGS(sc, ret_ip), @@ -1172,37 +1222,125 @@ DEFINE_EVENT(xrep_rmap_class, name, \ xfs_agblock_t agbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap); -TRACE_EVENT(xrep_refcount_extent_fn, +TRACE_EVENT(xrep_abt_found, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_refcount_irec *irec), - TP_ARGS(mp, agno, irec), + const struct xfs_alloc_rec_incore *rec), + TP_ARGS(mp, agno, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) - __field(xfs_nlink_t, refcount) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->startblock = irec->rc_startblock; - __entry->blockcount = irec->rc_blockcount; - __entry->refcount = irec->rc_refcount; + __entry->startblock = rec->ar_startblock; + __entry->blockcount = rec->ar_blockcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount) +) + +TRACE_EVENT(xrep_ibt_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const struct xfs_inobt_rec_incore *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint16_t, holemask) + __field(uint8_t, count) + __field(uint8_t, freecount) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = rec->ir_startino; + __entry->holemask = rec->ir_holemask; + __entry->count = rec->ir_count; + __entry->freecount = rec->ir_freecount; + __entry->freemask = rec->ir_free; + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x count 0x%x freecount 0x%x freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->holemask, + __entry->count, + __entry->freecount, + __entry->freemask) +) + +TRACE_EVENT(xrep_refc_found, + TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec), + TP_ARGS(pag, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->domain = rec->rc_domain; + __entry->startblock = rec->rc_startblock; + __entry->blockcount = rec->rc_blockcount; + __entry->refcount = rec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + 
__print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) ) +TRACE_EVENT(xrep_bmap_found, + TP_PROTO(struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *irec), + TP_ARGS(ip, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, lblk) + __field(xfs_filblks_t, len) + __field(xfs_fsblock_t, pblk) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->whichfork = whichfork; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->lblk, + __entry->len, + __entry->pblk, + __entry->state) +); + TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, uint32_t magic, uint16_t level), @@ -1299,39 +1437,327 @@ TRACE_EVENT(xrep_reset_counters, MAJOR(__entry->dev), MINOR(__entry->dev)) ) -TRACE_EVENT(xrep_ialloc_insert, +DECLARE_EVENT_CLASS(xrep_newbt_extent_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t startino, uint16_t holemask, uint8_t count, - uint8_t freecount, uint64_t freemask), - TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask), + xfs_agblock_t agbno, xfs_extlen_t len, + int64_t owner), + TP_ARGS(mp, agno, agbno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(xfs_agino_t, startino) - __field(uint16_t, holemask) - __field(uint8_t, count) - __field(uint8_t, freecount) - __field(uint64_t, freemask) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int64_t, owner) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->startino = startino; - __entry->holemask = holemask; - __entry->count = count; - __entry->freecount = freecount; - __entry->freemask = freemask; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; ), - TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->startino, - __entry->holemask, - __entry->count, - __entry->freecount, - __entry->freemask) + __entry->agbno, + __entry->len, + __entry->owner) +); +#define DEFINE_NEWBT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xrep_newbt_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + int64_t owner), \ + TP_ARGS(mp, agno, agbno, len, owner)) +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block); + +DECLARE_EVENT_CLASS(xrep_dinode_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), + TP_ARGS(sc, dip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(uint16_t, mode) + __field(uint8_t, version) + __field(uint8_t, format) + __field(uint32_t, uid) + __field(uint32_t, gid) + __field(uint64_t, size) + __field(uint64_t, nblocks) + 
__field(uint32_t, extsize) + __field(uint32_t, nextents) + __field(uint16_t, anextents) + __field(uint8_t, forkoff) + __field(uint8_t, aformat) + __field(uint16_t, flags) + __field(uint32_t, gen) + __field(uint64_t, flags2) + __field(uint32_t, cowextsize) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->mode = be16_to_cpu(dip->di_mode); + __entry->version = dip->di_version; + __entry->format = dip->di_format; + __entry->uid = be32_to_cpu(dip->di_uid); + __entry->gid = be32_to_cpu(dip->di_gid); + __entry->size = be64_to_cpu(dip->di_size); + __entry->nblocks = be64_to_cpu(dip->di_nblocks); + __entry->extsize = be32_to_cpu(dip->di_extsize); + __entry->nextents = be32_to_cpu(dip->di_nextents); + __entry->anextents = be16_to_cpu(dip->di_anextents); + __entry->forkoff = dip->di_forkoff; + __entry->aformat = dip->di_aformat; + __entry->flags = be16_to_cpu(dip->di_flags); + __entry->gen = be32_to_cpu(dip->di_gen); + __entry->flags2 = be64_to_cpu(dip->di_flags2); + __entry->cowextsize = be32_to_cpu(dip->di_cowextsize); + ), + TP_printk("dev %d:%d ino 0x%llx mode 0x%x version %u format %u uid %u gid %u disize 0x%llx nblocks 0x%llx extsize %u nextents %u anextents %u forkoff 0x%x aformat %u flags 0x%x gen 0x%x flags2 0x%llx cowextsize %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->mode, + __entry->version, + __entry->format, + __entry->uid, + __entry->gid, + __entry->size, + __entry->nblocks, + __entry->extsize, + __entry->nextents, + __entry->anextents, + __entry->forkoff, + __entry->aformat, + __entry->flags, + __entry->gen, + __entry->flags2, + __entry->cowextsize) ) +#define DEFINE_REPAIR_DINODE_EVENT(name) \ +DEFINE_EVENT(xrep_dinode_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), \ + TP_ARGS(sc, dip)) +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_header); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_mode); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_flags); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_size); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_forks); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dfork); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_afork); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_ensure_forkoff); + +DECLARE_EVENT_CLASS(xrep_inode_class, + TP_PROTO(struct xfs_scrub *sc), + TP_ARGS(sc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_rfsblock_t, nblocks) + __field(uint16_t, flags) + __field(uint64_t, flags2) + __field(uint32_t, nextents) + __field(uint8_t, format) + __field(uint32_t, anextents) + __field(uint8_t, aformat) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->size = sc->ip->i_disk_size; + __entry->nblocks = sc->ip->i_nblocks; + __entry->flags = sc->ip->i_diflags; + __entry->flags2 = sc->ip->i_diflags2; + __entry->nextents = sc->ip->i_df.if_nextents; + __entry->format = sc->ip->i_df.if_format; + __entry->anextents = sc->ip->i_af.if_nextents; + __entry->aformat = sc->ip->i_af.if_format; + ), + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx nblocks 0x%llx flags 0x%x flags2 0x%llx nextents %u format %u anextents %u aformat %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->nblocks, + __entry->flags, + 
__entry->flags2, + __entry->nextents, + __entry->format, + __entry->anextents, + __entry->aformat) +) + +#define DEFINE_REPAIR_INODE_EVENT(name) \ +DEFINE_EVENT(xrep_inode_class, name, \ + TP_PROTO(struct xfs_scrub *sc), \ + TP_ARGS(sc)) +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockcounts); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_ids); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_flags); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_dir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed); + +TRACE_EVENT(xrep_dinode_count_rmaps, + TP_PROTO(struct xfs_scrub *sc, xfs_rfsblock_t data_blocks, + xfs_rfsblock_t rt_blocks, xfs_rfsblock_t attr_blocks, + xfs_extnum_t data_extents, xfs_extnum_t rt_extents, + xfs_aextnum_t attr_extents), + TP_ARGS(sc, data_blocks, rt_blocks, attr_blocks, data_extents, + rt_extents, attr_extents), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_rfsblock_t, data_blocks) + __field(xfs_rfsblock_t, rt_blocks) + __field(xfs_rfsblock_t, attr_blocks) + __field(xfs_extnum_t, data_extents) + __field(xfs_extnum_t, rt_extents) + __field(xfs_aextnum_t, attr_extents) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->data_blocks = data_blocks; + __entry->rt_blocks = rt_blocks; + __entry->attr_blocks = attr_blocks; + __entry->data_extents = data_extents; + __entry->rt_extents = rt_extents; + __entry->attr_extents = attr_extents; + ), + TP_printk("dev %d:%d ino 0x%llx dblocks 0x%llx rtblocks 0x%llx ablocks 0x%llx dextents %llu rtextents %llu aextents %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->data_blocks, + __entry->rt_blocks, + __entry->attr_blocks, + __entry->data_extents, + __entry->rt_extents, + __entry->attr_extents) +); + +TRACE_EVENT(xrep_cow_mark_file_range, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock, + xfs_fileoff_t startoff, xfs_filblks_t blockcount), + TP_ARGS(ip, startblock, startoff, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = startoff; + __entry->startblock = startblock; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount) +); + +TRACE_EVENT(xrep_cow_replace_mapping, + TP_PROTO(struct xfs_inode *ip, const struct xfs_bmbt_irec *irec, + xfs_fsblock_t new_startblock, xfs_extlen_t new_blockcount), + TP_ARGS(ip, irec, new_startblock, new_blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(xfs_fsblock_t, new_startblock) + __field(xfs_extlen_t, new_blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = irec->br_startoff; + __entry->startblock = irec->br_startblock; + __entry->blockcount = irec->br_blockcount; + __entry->state = irec->br_state; + __entry->new_startblock = new_startblock; + __entry->new_blockcount = new_blockcount; + ), + 
TP_printk("dev %d:%d ino 0x%llx startoff 0x%llx startblock 0x%llx fsbcount 0x%llx state 0x%x new_startblock 0x%llx new_fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->new_startblock, + __entry->new_blockcount) +); + +TRACE_EVENT(xrep_cow_free_staging, + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, + xfs_extlen_t blockcount), + TP_ARGS(pag, agbno, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, blockcount) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agbno = agbno; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->blockcount) +); + +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xrep_dquot_class, + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), + TP_ARGS(mp, type, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint8_t, type) + __field(uint32_t, id) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->id = id; + __entry->type = type; + ), + TP_printk("dev %d:%d type %s id 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __entry->id) +); + +#define DEFINE_XREP_DQUOT_EVENT(name) \ +DEFINE_EVENT(xrep_dquot_class, name, \ + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), \ + TP_ARGS(mp, type, id)) +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item); +DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot); +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole); +#endif /* CONFIG_XFS_QUOTA */ + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index 4ecac01363d9..62b9c506fdd1 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -54,6 +54,28 @@ static inline int xfarray_append(struct xfarray *array, const void *ptr) uint64_t xfarray_length(struct xfarray *array); int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec); +/* + * Iterate the non-null elements in a sparse xfarray. Callers should + * initialize *idx to XFARRAY_CURSOR_INIT before the first call; on return, it + * will be set to one more than the index of the record that was retrieved. + * Returns 1 if a record was retrieved, 0 if there weren't any more records, or + * a negative errno. + */ +static inline int +xfarray_iter( + struct xfarray *array, + xfarray_idx_t *idx, + void *rec) +{ + int ret = xfarray_load_next(array, idx, rec); + + if (ret == -ENODATA) + return 0; + if (ret == 0) + return 1; + return ret; +} + /* Declarations for xfile array sort functionality. 
*/ typedef cmp_func_t xfarray_cmp_fn; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 465d7630bb21..813f85156b0c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -584,7 +584,7 @@ const struct address_space_operations xfs_address_space_operations = { .bmap = xfs_vm_bmap, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 36fe2abb16e6..9e02111bd890 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -33,8 +33,6 @@ struct kmem_cache *xfs_attrd_cache; static const struct xfs_item_ops xfs_attri_item_ops; static const struct xfs_item_ops xfs_attrd_item_ops; -static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, - struct xfs_attri_log_item *attrip); static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) { @@ -310,47 +308,6 @@ xfs_attrd_item_intent( return &ATTRD_ITEM(lip)->attrd_attrip->attri_item; } -/* - * Performs one step of an attribute update intent and marks the attrd item - * dirty.. An attr operation may be a set or a remove. Note that the - * transaction is marked dirty regardless of whether the operation succeeds or - * fails to support the ATTRI/ATTRD lifecycle rules. - */ -STATIC int -xfs_xattri_finish_update( - struct xfs_attr_intent *attr, - struct xfs_attrd_log_item *attrdp) -{ - struct xfs_da_args *args = attr->xattri_da_args; - int error; - - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { - error = -EIO; - goto out; - } - - error = xfs_attr_set_iter(attr); - if (!error && attr->xattri_dela_state != XFS_DAS_DONE) - error = -EAGAIN; -out: - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the ATTRI and frees the ATTRD - * 2.) shuts down the filesystem - */ - args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - - /* - * attr intent/done items are null when logged attributes are disabled - */ - if (attrdp) - set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - - return error; -} - /* Log an attr to the intent item. */ STATIC void xfs_attr_log_item( @@ -360,9 +317,6 @@ xfs_attr_log_item( { struct xfs_attri_log_format *attrp; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags); - /* * At this point the xfs_attr_intent has been constructed, and we've * created the log intent. 
Fill in the attri log item and log format @@ -419,7 +373,6 @@ xfs_attr_create_intent( } attrip = xfs_attri_init(mp, attr->xattri_nameval); - xfs_trans_add_item(tp, &attrip->attri_item); xfs_attr_log_item(tp, attrip, attr); return &attrip->attri_item; @@ -447,23 +400,33 @@ xfs_attr_finish_item( struct xfs_btree_cur **state) { struct xfs_attr_intent *attr; - struct xfs_attrd_log_item *done_item = NULL; + struct xfs_da_args *args; int error; attr = container_of(item, struct xfs_attr_intent, xattri_list); - if (done) - done_item = ATTRD_ITEM(done); + args = attr->xattri_da_args; - /* - * Always reset trans after EAGAIN cycle - * since the transaction is new - */ - attr->xattri_da_args->trans = tp; + /* Reset trans after EAGAIN cycle since the transaction is new */ + args->trans = tp; - error = xfs_xattri_finish_update(attr, done_item); - if (error != -EAGAIN) - xfs_attr_free_item(attr); + if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + error = -EIO; + goto out; + } + /* If an attr removal is trivially complete, we're done. */ + if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && + !xfs_inode_hasattr(args->dp)) { + error = 0; + goto out; + } + + error = xfs_attr_set_iter(attr); + if (!error && attr->xattri_dela_state != XFS_DAS_DONE) + return -EAGAIN; + +out: + xfs_attr_free_item(attr); return error; } @@ -532,41 +495,22 @@ xfs_attri_validate( return xfs_verify_ino(mp, attrp->alfi_ino); } -/* - * Process an attr intent item that was recovered from the log. We need to - * delete the attr that it describes. - */ -STATIC int -xfs_attri_item_recover( - struct xfs_log_item *lip, - struct list_head *capture_list) +static inline struct xfs_attr_intent * +xfs_attri_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_attri_log_format *attrp, + struct xfs_inode **ipp, + struct xfs_attri_log_nameval *nv) { - struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_attr_intent *attr; - struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_inode *ip; struct xfs_da_args *args; - struct xfs_trans *tp; - struct xfs_trans_res resv; - struct xfs_attri_log_format *attrp; - struct xfs_attri_log_nameval *nv = attrip->attri_nameval; - int error; - int total; int local; - struct xfs_attrd_log_item *done_item = NULL; - - /* - * First check the validity of the attr described by the ATTRI. If any - * are bad, then assume that all are bad and just toss the ATTRI. 
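The reworked xfs_attr_finish_item() above returns -EAGAIN while the attr state machine still has steps left and only frees the work item once it is done. A minimal standalone sketch of that "one bounded step per call, -EAGAIN means call me again" shape, using invented names (work_finish_item, STEP_*):

#include <errno.h>
#include <stdio.h>

/* Hypothetical multi-step work item; the states are illustrative only. */
enum step { STEP_ALLOC, STEP_WRITE, STEP_DONE };

struct work {
	enum step state;
};

/* Do one bounded piece of work; -EAGAIN means "call me again". */
static int work_finish_item(struct work *w)
{
	switch (w->state) {
	case STEP_ALLOC:
		printf("allocated\n");
		w->state = STEP_WRITE;
		return -EAGAIN;
	case STEP_WRITE:
		printf("wrote\n");
		w->state = STEP_DONE;
		return 0;
	case STEP_DONE:
		return 0;
	}
	return -EINVAL;
}

int main(void)
{
	struct work w = { .state = STEP_ALLOC };
	int error;

	/* The driver keeps re-invoking the item until it stops asking for more. */
	do {
		error = work_finish_item(&w);
	} while (error == -EAGAIN);

	return error ? 1 : 0;
}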
- */ - attrp = &attrip->attri_format; - if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) - return -EFSCORRUPTED; + int error; - error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); + error = xlog_recover_iget(mp, attrp->alfi_ino, ipp); if (error) - return error; + return ERR_PTR(error); attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), KM_NOFS); @@ -584,7 +528,7 @@ xfs_attri_item_recover( attr->xattri_nameval = xfs_attri_log_nameval_get(nv); ASSERT(attr->xattri_nameval); - args->dp = ip; + args->dp = *ipp; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; args->name = nv->name.i_addr; @@ -608,43 +552,65 @@ xfs_attri_item_recover( attr->xattri_dela_state = xfs_attr_init_add_state(args); break; case XFS_ATTRI_OP_FLAGS_REMOVE: - if (!xfs_inode_hasattr(args->dp)) - goto out; attr->xattri_dela_state = xfs_attr_init_remove_state(args); break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - goto out; } + xfs_defer_add_item(dfp, &attr->xattri_list); + return attr; +} + +/* + * Process an attr intent item that was recovered from the log. We need to + * delete the attr that it describes. + */ +STATIC int +xfs_attr_recover_work( + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + struct xfs_log_item *lip = dfp->dfp_intent; + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attr_intent *attr; + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_inode *ip; + struct xfs_da_args *args; + struct xfs_trans *tp; + struct xfs_trans_res resv; + struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; + int error; + int total; + + /* + * First check the validity of the attr described by the ATTRI. If any + * are bad, then assume that all are bad and just toss the ATTRI. + */ + attrp = &attrip->attri_format; + if (!xfs_attri_validate(mp, attrp) || + !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) + return -EFSCORRUPTED; + + attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv); + if (IS_ERR(attr)) + return PTR_ERR(attr); + args = attr->xattri_da_args; + xfs_init_attr_trans(args, &resv, &total); resv = xlog_recover_resv(&resv); error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) - goto out; - + return error; args->trans = tp; - done_item = xfs_trans_get_attrd(tp, attrip); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_xattri_finish_update(attr, done_item); - if (error == -EAGAIN) { - /* - * There's more work to do, so add the intent item to this - * transaction so that we can continue it later. - */ - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); - error = xfs_defer_ops_capture_and_commit(tp, capture_list); - if (error) - goto out_unlock; - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); - return 0; - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &attrip->attri_format, + sizeof(attrip->attri_format)); if (error) { xfs_trans_cancel(tp); goto out_unlock; @@ -654,18 +620,16 @@ xfs_attri_item_recover( out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); -out: - xfs_attr_free_item(attr); return error; } /* Re-log an intent item to push the log tail forward. 
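The new xfs_attri_recover_work() helper above hands back either a work item or an errno encoded with ERR_PTR(), which the caller unpacks with IS_ERR()/PTR_ERR(). A rough userspace sketch of that error-pointer convention follows; err_ptr/is_err/ptr_err are simplified stand-ins for <linux/err.h>, and make_value() is invented:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

/* Simplified take on the kernel's ERR_PTR/IS_ERR/PTR_ERR helpers. */
static inline void *err_ptr(long error) { return (void *)error; }
static inline int is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
static inline long ptr_err(const void *ptr) { return (long)ptr; }

/* Hypothetical constructor that can fail: returns an object or an encoded errno. */
static int *make_value(int want_failure)
{
	int *p;

	if (want_failure)
		return err_ptr(-ENOMEM);
	p = malloc(sizeof(*p));
	if (!p)
		return err_ptr(-ENOMEM);
	*p = 42;
	return p;
}

int main(void)
{
	int *v = make_value(0);

	if (is_err(v)) {
		fprintf(stderr, "failed: %ld\n", ptr_err(v));
		return 1;
	}
	printf("got %d\n", *v);
	free(v);
	return 0;
}

Packing the errno into the returned pointer keeps the helper to a single return value, which is why the caller can simply do "if (IS_ERR(attr)) return PTR_ERR(attr);".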
*/ static struct xfs_log_item * -xfs_attri_item_relog( +xfs_attr_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_attrd_log_item *attrdp; struct xfs_attri_log_item *old_attrip; struct xfs_attri_log_item *new_attrip; struct xfs_attri_log_format *new_attrp; @@ -674,10 +638,6 @@ xfs_attri_item_relog( old_attrip = ATTRI_ITEM(intent); old_attrp = &old_attrip->attri_format; - tp->t_flags |= XFS_TRANS_DIRTY; - attrdp = xfs_trans_get_attrd(tp, old_attrip); - set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - /* * Create a new log item that shares the same name/value buffer as the * old log item. @@ -691,12 +651,43 @@ xfs_attri_item_relog( new_attrp->alfi_name_len = old_attrp->alfi_name_len; new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; - xfs_trans_add_item(tp, &new_attrip->attri_item); - set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); - return &new_attrip->attri_item; } +/* Get an ATTRD so we can process all the attrs. */ +static struct xfs_log_item * +xfs_attr_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + struct xfs_attri_log_item *attrip; + struct xfs_attrd_log_item *attrdp; + + attrip = ATTRI_ITEM(intent); + + attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); + + xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, + &xfs_attrd_item_ops); + attrdp->attrd_attrip = attrip; + attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; + + return &attrdp->attrd_item; +} + +const struct xfs_defer_op_type xfs_attr_defer_type = { + .name = "attr", + .max_items = 1, + .create_intent = xfs_attr_create_intent, + .abort_intent = xfs_attr_abort_intent, + .create_done = xfs_attr_create_done, + .finish_item = xfs_attr_finish_item, + .cancel_item = xfs_attr_cancel_item, + .recover_work = xfs_attr_recover_work, + .relog_intent = xfs_attr_relog_intent, +}; + STATIC int xlog_recover_attri_commit_pass2( struct xlog *log, @@ -767,63 +758,13 @@ xlog_recover_attri_commit_pass2( attrip = xfs_attri_init(mp, nv); memcpy(&attrip->attri_format, attri_formatp, len); - /* - * The ATTRI has two references. One for the ATTRD and one for ATTRI to - * ensure it makes it into the AIL. Insert the ATTRI into the AIL - * directly and drop the ATTRI reference. Note that - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); - xfs_attri_release(attrip); + xlog_recover_intent_item(log, &attrip->attri_item, lsn, + &xfs_attr_defer_type); xfs_attri_log_nameval_put(nv); return 0; } /* - * This routine is called to allocate an "attr free done" log item. - */ -static struct xfs_attrd_log_item * -xfs_trans_get_attrd(struct xfs_trans *tp, - struct xfs_attri_log_item *attrip) -{ - struct xfs_attrd_log_item *attrdp; - - ASSERT(tp != NULL); - - attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); - - xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, - &xfs_attrd_item_ops); - attrdp->attrd_attrip = attrip; - attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; - - xfs_trans_add_item(tp, &attrdp->attrd_item); - return attrdp; -} - -/* Get an ATTRD so we can process all the attrs. 
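The attr code above now routes creation, recovery and relogging through the const xfs_defer_op_type table of function pointers. A standalone sketch of that ops-table style, with made-up names (op_type, demo_finish, run_one):

#include <stdio.h>

/* Hypothetical "deferred op type": a named vtable of work callbacks. */
struct op_type {
	const char *name;
	unsigned int max_items;
	int (*finish_item)(int item);
	void (*cancel_item)(int item);
};

static int demo_finish(int item)
{
	printf("finished item %d\n", item);
	return 0;
}

static void demo_cancel(int item)
{
	printf("cancelled item %d\n", item);
}

static const struct op_type demo_op_type = {
	.name		= "demo",
	.max_items	= 1,
	.finish_item	= demo_finish,
	.cancel_item	= demo_cancel,
};

/* Generic driver: it only ever talks to the ops table, never the backend. */
static int run_one(const struct op_type *ops, int item)
{
	int error = ops->finish_item(item);

	if (error)
		ops->cancel_item(item);
	return error;
}

int main(void)
{
	return run_one(&demo_op_type, 7);
}

Keeping the driver generic over the ops table is what lets normal processing, log recovery and relogging share one code path instead of per-item hooks.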
*/ -static struct xfs_log_item * -xfs_attr_create_done( - struct xfs_trans *tp, - struct xfs_log_item *intent, - unsigned int count) -{ - if (!intent) - return NULL; - - return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item; -} - -const struct xfs_defer_op_type xfs_attr_defer_type = { - .max_items = 1, - .create_intent = xfs_attr_create_intent, - .abort_intent = xfs_attr_abort_intent, - .create_done = xfs_attr_create_done, - .finish_item = xfs_attr_finish_item, - .cancel_item = xfs_attr_cancel_item, -}; - -/* * This routine is called when an ATTRD format structure is found in a committed * transaction in the log. Its purpose is to cancel the corresponding ATTRI if * it was still in the log. To do this it searches the AIL for the ATTRI with @@ -857,9 +798,7 @@ static const struct xfs_item_ops xfs_attri_item_ops = { .iop_format = xfs_attri_item_format, .iop_unpin = xfs_attri_item_unpin, .iop_release = xfs_attri_item_release, - .iop_recover = xfs_attri_item_recover, .iop_match = xfs_attri_item_match, - .iop_relog = xfs_attri_item_relog, }; const struct xlog_recover_item_ops xlog_attri_item_ops = { diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 99bbbe1a0e44..e368ad671e26 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -56,14 +56,13 @@ xfs_attr_shortform_list( struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_inode *dp = context->dp; struct xfs_attr_sf_sort *sbuf, *sbp; - struct xfs_attr_shortform *sf; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; int sbsize, nsbuf, count, i; int error = 0; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; ASSERT(sf != NULL); - if (!sf->hdr.count) + if (!sf->count) return 0; trace_xfs_attr_list_sf(context); @@ -79,8 +78,8 @@ xfs_attr_shortform_list( */ if (context->bufsize == 0 || (XFS_ISRESET_CURSOR(cursor) && - (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) { - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + (dp->i_af.if_bytes + sf->count * 16) < context->bufsize)) { + for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (XFS_IS_CORRUPT(context->dp->i_mount, !xfs_attr_namecheck(sfe->nameval, sfe->namelen))) @@ -109,7 +108,7 @@ xfs_attr_shortform_list( /* * It didn't all fit, so we have to sort everything on hashval. */ - sbsize = sf->hdr.count * sizeof(*sbuf); + sbsize = sf->count * sizeof(*sbuf); sbp = sbuf = kmem_alloc(sbsize, KM_NOFS); /* @@ -117,7 +116,7 @@ xfs_attr_shortform_list( * the relevant info from only those that match into a buffer. 
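xfs_attr_shortform_list() above now starts from a typed header pointer and walks entries via xfs_attr_sf_firstentry(). As a standalone illustration of that "count header followed by packed variable-length entries" walk, using an invented sf_hdr/sf_entry layout that is not the on-disk XFS format:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical shortform layout: a count header followed by packed entries. */
struct sf_hdr {
	uint8_t count;
};

struct sf_entry {
	uint8_t namelen;
	char	name[];			/* namelen bytes, not NUL terminated */
};

/* The first entry sits immediately after the header. */
static struct sf_entry *sf_firstentry(struct sf_hdr *hdr)
{
	return (struct sf_entry *)(hdr + 1);
}

/* Entries are variable sized, so "next" is computed from the current length. */
static struct sf_entry *sf_nextentry(struct sf_entry *sfe)
{
	return (struct sf_entry *)((char *)sfe + sizeof(*sfe) + sfe->namelen);
}

int main(void)
{
	/* Hand-packed buffer: count=2, then ("ab"), ("xyz"). */
	uint8_t buf[] = { 2, 2, 'a', 'b', 3, 'x', 'y', 'z' };
	struct sf_hdr *hdr = (struct sf_hdr *)buf;
	struct sf_entry *sfe = sf_firstentry(hdr);
	int i;

	for (i = 0; i < hdr->count; i++, sfe = sf_nextentry(sfe))
		printf("entry %d: %.*s\n", i, sfe->namelen, sfe->name);
	return 0;
}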
*/ nsbuf = 0; - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (unlikely( ((char *)sfe < (char *)sf) || ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) { diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e736a0844c89..52fb8a148b7d 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -221,51 +221,6 @@ static const struct xfs_item_ops xfs_bud_item_ops = { .iop_intent = xfs_bud_item_intent, }; -static struct xfs_bud_log_item * -xfs_trans_get_bud( - struct xfs_trans *tp, - struct xfs_bui_log_item *buip) -{ - struct xfs_bud_log_item *budp; - - budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, - &xfs_bud_item_ops); - budp->bud_buip = buip; - budp->bud_format.bud_bui_id = buip->bui_format.bui_id; - - xfs_trans_add_item(tp, &budp->bud_item); - return budp; -} - -/* - * Finish an bmap update and log it to the BUD. Note that the - * transaction is marked dirty regardless of whether the bmap update - * succeeds or fails to support the BUI/BUD lifecycle rules. - */ -static int -xfs_trans_log_finish_bmap_update( - struct xfs_trans *tp, - struct xfs_bud_log_item *budp, - struct xfs_bmap_intent *bi) -{ - int error; - - error = xfs_bmap_finish_one(tp, bi); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the BUI and frees the BUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - - return error; -} - /* Sort bmap intents by inode. */ static int xfs_bmap_update_diff_items( @@ -314,9 +269,6 @@ xfs_bmap_update_log_item( uint next_extent; struct xfs_map_extent *map; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -346,7 +298,6 @@ xfs_bmap_update_create_intent( ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - xfs_trans_add_item(tp, &buip->bui_item); if (sort) list_sort(mp, items, xfs_bmap_update_diff_items); list_for_each_entry(bi, items, bi_list) @@ -354,14 +305,23 @@ xfs_bmap_update_create_intent( return &buip->bui_item; } -/* Get an BUD so we can process all the deferred rmap updates. */ +/* Get an BUD so we can process all the deferred bmap updates. */ static struct xfs_log_item * xfs_bmap_update_create_done( struct xfs_trans *tp, struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item; + struct xfs_bui_log_item *buip = BUI_ITEM(intent); + struct xfs_bud_log_item *budp; + + budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, + &xfs_bud_item_ops); + budp->bud_buip = buip; + budp->bud_format.bud_bui_id = buip->bui_format.bui_id; + + return &budp->bud_item; } /* Take a passive ref to the AG containing the space we're mapping. */ @@ -392,7 +352,7 @@ xfs_bmap_update_put_group( xfs_perag_intent_put(bi->bi_pag); } -/* Process a deferred rmap update. */ +/* Process a deferred bmap update. 
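xfs_bmap_update_create_intent() above list_sort()s the pending intents with xfs_bmap_update_diff_items() so that updates against the same inode are processed back to back. A userspace stand-in that uses qsort() in place of list_sort(), with a hypothetical pending_update record:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Hypothetical pending update, keyed by the inode number it touches. */
struct pending_update {
	uint64_t ino;
	int	 delta;
};

/* Comparator in the same spirit as xfs_bmap_update_diff_items: order by inode. */
static int diff_items(const void *a, const void *b)
{
	const struct pending_update *ua = a;
	const struct pending_update *ub = b;

	if (ua->ino < ub->ino)
		return -1;
	if (ua->ino > ub->ino)
		return 1;
	return 0;
}

int main(void)
{
	struct pending_update items[] = {
		{ .ino = 131, .delta = 1 },
		{ .ino = 42,  .delta = -1 },
		{ .ino = 131, .delta = 2 },
	};
	size_t i, n = sizeof(items) / sizeof(items[0]);

	qsort(items, n, sizeof(items[0]), diff_items);
	for (i = 0; i < n; i++)
		printf("ino %llu delta %d\n",
		       (unsigned long long)items[i].ino, items[i].delta);
	return 0;
}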
*/ STATIC int xfs_bmap_update_finish_item( struct xfs_trans *tp, @@ -405,7 +365,7 @@ xfs_bmap_update_finish_item( bi = container_of(item, struct xfs_bmap_intent, bi_list); - error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); + error = xfs_bmap_finish_one(tp, bi); if (!error && bi->bi_bmap.br_blockcount > 0) { ASSERT(bi->bi_type == XFS_BMAP_UNMAP); return -EAGAIN; @@ -437,15 +397,6 @@ xfs_bmap_update_cancel_item( kmem_cache_free(xfs_bmap_intent_cache, bi); } -const struct xfs_defer_op_type xfs_bmap_update_defer_type = { - .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .create_intent = xfs_bmap_update_create_intent, - .abort_intent = xfs_bmap_update_abort_intent, - .create_done = xfs_bmap_update_create_done, - .finish_item = xfs_bmap_update_finish_item, - .cancel_item = xfs_bmap_update_cancel_item, -}; - /* Is this recovered BUI ok? */ static inline bool xfs_bui_validate( @@ -480,23 +431,53 @@ xfs_bui_validate( return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } +static inline struct xfs_bmap_intent * +xfs_bui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_inode **ipp, + struct xfs_map_extent *map) +{ + struct xfs_bmap_intent *bi; + int error; + + error = xlog_recover_iget(mp, map->me_owner, ipp); + if (error) + return ERR_PTR(error); + + bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + bi->bi_bmap.br_startblock = map->me_startblock; + bi->bi_bmap.br_startoff = map->me_startoff; + bi->bi_bmap.br_blockcount = map->me_len; + bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + bi->bi_owner = *ipp; + xfs_bmap_update_get_group(mp, bi); + + xfs_defer_add_item(dfp, &bi->bi_list); + return bi; +} + /* * Process a bmap update intent item that was recovered from the log. * We need to update some inode's bmbt. */ STATIC int -xfs_bui_item_recover( - struct xfs_log_item *lip, +xfs_bmap_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { - struct xfs_bmap_intent fake = { }; struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_map_extent *map; - struct xfs_bud_log_item *budp; + struct xfs_bmap_intent *work; int iext_delta; int error = 0; @@ -507,13 +488,9 @@ xfs_bui_item_recover( } map = &buip->bui_format.bui_extents[0]; - fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - - error = xlog_recover_iget(mp, map->me_owner, &ip); - if (error) - return error; + work = xfs_bui_recover_work(mp, dfp, &ip, map); + if (IS_ERR(work)) + return PTR_ERR(work); /* Allocate transaction and do the work. 
*/ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -522,42 +499,27 @@ xfs_bui_item_recover( if (error) goto err_rele; - budp = xfs_trans_get_bud(tp, buip); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - if (fake.bi_type == XFS_BMAP_MAP) + if (work->bi_type == XFS_BMAP_MAP) iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); + error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; - fake.bi_owner = ip; - fake.bi_bmap.br_startblock = map->me_startblock; - fake.bi_bmap.br_startoff = map->me_startoff; - fake.bi_bmap.br_blockcount = map->me_len; - fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - - xfs_bmap_update_get_group(mp, &fake); - error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); + error = xlog_recover_finish_intent(tp, dfp); if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, - sizeof(*map)); - xfs_bmap_update_put_group(&fake); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &buip->bui_format, sizeof(buip->bui_format)); if (error) goto err_cancel; - if (fake.bi_bmap.br_blockcount > 0) { - ASSERT(fake.bi_type == XFS_BMAP_UNMAP); - xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); - } - /* * Commit transaction, which frees the transaction and saves the inode * for later replay activities. @@ -579,21 +541,13 @@ err_rele: return error; } -STATIC bool -xfs_bui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return BUI_ITEM(lip)->bui_format.bui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_bui_item_relog( +xfs_bmap_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_bud_log_item *budp; struct xfs_bui_log_item *buip; struct xfs_map_extent *map; unsigned int count; @@ -601,27 +555,40 @@ xfs_bui_item_relog( count = BUI_ITEM(intent)->bui_format.bui_nextents; map = BUI_ITEM(intent)->bui_format.bui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - buip = xfs_bui_init(tp->t_mountp); memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); - xfs_trans_add_item(tp, &buip->bui_item); - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + return &buip->bui_item; } +const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .name = "bmap", + .max_items = XFS_BUI_MAX_FAST_EXTENTS, + .create_intent = xfs_bmap_update_create_intent, + .abort_intent = xfs_bmap_update_abort_intent, + .create_done = xfs_bmap_update_create_done, + .finish_item = xfs_bmap_update_finish_item, + .cancel_item = xfs_bmap_update_cancel_item, + .recover_work = xfs_bmap_recover_work, + .relog_intent = xfs_bmap_relog_intent, +}; + +STATIC bool +xfs_bui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return BUI_ITEM(lip)->bui_format.bui_id == intent_id; +} + static const struct xfs_item_ops xfs_bui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_bui_item_size, .iop_format = xfs_bui_item_format, .iop_unpin = xfs_bui_item_unpin, .iop_release = xfs_bui_item_release, - .iop_recover = xfs_bui_item_recover, .iop_match = xfs_bui_item_match, - .iop_relog = xfs_bui_item_relog, }; static inline void @@ -681,12 +648,9 @@ xlog_recover_bui_commit_pass2( buip = xfs_bui_init(mp); xfs_bui_copy_format(&buip->bui_format, bui_formatp); atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
- */ - xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn); - xfs_bui_release(buip); + + xlog_recover_intent_item(log, &buip->bui_item, lsn, + &xfs_bmap_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 731260a5af6d..c2531c28905c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -69,147 +69,6 @@ xfs_zero_extent( GFP_NOFS, 0); } -#ifdef CONFIG_XFS_RT -int -xfs_bmap_rtalloc( - struct xfs_bmalloca *ap) -{ - struct xfs_mount *mp = ap->ip->i_mount; - xfs_fileoff_t orig_offset = ap->offset; - xfs_rtxnum_t rtx; - xfs_rtxlen_t prod = 0; /* product factor for allocators */ - xfs_extlen_t mod = 0; /* product factor for allocators */ - xfs_rtxlen_t ralen = 0; /* realtime allocation length */ - xfs_extlen_t align; /* minimum allocation alignment */ - xfs_extlen_t orig_length = ap->length; - xfs_extlen_t minlen = mp->m_sb.sb_rextsize; - xfs_rtxlen_t raminlen; - bool rtlocked = false; - bool ignore_locality = false; - int error; - - align = xfs_get_extsz_hint(ap->ip); -retry: - prod = xfs_extlen_to_rtxlen(mp, align); - error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, - align, 1, ap->eof, 0, - ap->conv, &ap->offset, &ap->length); - if (error) - return error; - ASSERT(ap->length); - ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0); - - /* - * If we shifted the file offset downward to satisfy an extent size - * hint, increase minlen by that amount so that the allocator won't - * give us an allocation that's too short to cover at least one of the - * blocks that the caller asked for. - */ - if (ap->offset != orig_offset) - minlen += orig_offset - ap->offset; - - /* - * If the offset & length are not perfectly aligned - * then kill prod, it will just get us in trouble. - */ - div_u64_rem(ap->offset, align, &mod); - if (mod || ap->length % align) - prod = 1; - /* - * Set ralen to be the actual requested length in rtextents. - * - * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that - * we rounded up to it, cut it back so it's valid again. - * Note that if it's a really large request (bigger than - * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't - * adjust the starting point to match it. - */ - ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); - - /* - * Lock out modifications to both the RT bitmap and summary inodes - */ - if (!rtlocked) { - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); - rtlocked = true; - } - - /* - * If it's an allocation to an empty file at offset 0, - * pick an extent that will space things out in the rt area. - */ - if (ap->eof && ap->offset == 0) { - error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); - if (error) - return error; - ap->blkno = xfs_rtx_to_rtb(mp, rtx); - } else { - ap->blkno = 0; - } - - xfs_bmap_adjacent(ap); - - /* - * Realtime allocation, done through xfs_rtallocate_extent. 
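The xfs_bmap_rtalloc() code removed from this file above aligns the request to the extent size hint and then bumps minlen by however far the start moved, so the allocator cannot return something too short to cover the caller's first block. A plain-arithmetic sketch of that adjustment only; align_request() is invented and is not xfs_bmap_extsize_align():

#include <stdio.h>
#include <stdint.h>

/* Round the start down and the end up to the alignment, then grow the
 * minimum acceptable length by however far the start shifted. */
static void align_request(uint64_t align, uint64_t *off, uint64_t *len,
			  uint64_t *minlen)
{
	uint64_t orig_off = *off;

	*off = (*off / align) * align;
	*len = ((orig_off + *len + align - 1) / align) * align - *off;
	*minlen += orig_off - *off;	/* still cover the caller's first block */
}

int main(void)
{
	uint64_t off = 13, len = 5, minlen = 1;

	align_request(8, &off, &len, &minlen);
	printf("off %llu len %llu minlen %llu\n",
	       (unsigned long long)off, (unsigned long long)len,
	       (unsigned long long)minlen);
	return 0;
}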
- */ - if (ignore_locality) - rtx = 0; - else - rtx = xfs_rtb_to_rtx(mp, ap->blkno); - raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); - error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen, - ap->wasdel, prod, &rtx); - if (error) - return error; - - if (rtx != NULLRTEXTNO) { - ap->blkno = xfs_rtx_to_rtb(mp, rtx); - ap->length = xfs_rtxlen_to_extlen(mp, ralen); - ap->ip->i_nblocks += ap->length; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - if (ap->wasdel) - ap->ip->i_delayed_blks -= ap->length; - /* - * Adjust the disk quota also. This was reserved - * earlier. - */ - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : - XFS_TRANS_DQ_RTBCOUNT, ap->length); - return 0; - } - - if (align > mp->m_sb.sb_rextsize) { - /* - * We previously enlarged the request length to try to satisfy - * an extent size hint. The allocator didn't return anything, - * so reset the parameters to the original values and try again - * without alignment criteria. - */ - ap->offset = orig_offset; - ap->length = orig_length; - minlen = align = mp->m_sb.sb_rextsize; - goto retry; - } - - if (!ignore_locality && ap->blkno != 0) { - /* - * If we can't allocate near a specific rt extent, try again - * without locality criteria. - */ - ignore_locality = true; - goto retry; - } - - ap->blkno = NULLFSBLOCK; - ap->length = 0; - return 0; -} -#endif /* CONFIG_XFS_RT */ - /* * Extent tree block counting routines. */ diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 6888078f5c31..77ecbb753ef2 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -47,7 +47,7 @@ int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, int rt, int eof, int delay, int convert, xfs_fileoff_t *offp, xfs_extlen_t *lenp); -void xfs_bmap_adjacent(struct xfs_bmalloca *ap); +bool xfs_bmap_adjacent(struct xfs_bmalloca *ap); int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *rec, int *is_empty); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 545c7991b9b5..8e5bd50d29fe 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -169,7 +169,7 @@ xfs_buf_stale( atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && - (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) atomic_dec(&bp->b_hold); ASSERT(atomic_read(&bp->b_hold) >= 1); @@ -1047,7 +1047,7 @@ xfs_buf_rele( * buffer for the LRU and clear the (now stale) dispose list * state flag */ - if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) { bp->b_state &= ~XFS_BSTATE_DISPOSE; atomic_inc(&bp->b_hold); } @@ -1060,7 +1060,7 @@ xfs_buf_rele( * was on was the disposal list */ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); } else { ASSERT(list_empty(&bp->b_lru)); } @@ -2049,6 +2049,14 @@ error_free: return NULL; } +static inline void +xfs_buf_list_del( + struct xfs_buf *bp) +{ + list_del_init(&bp->b_list); + wake_up_var(&bp->b_list); +} + /* * Cancel a delayed write list. 
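The new xfs_buf_list_del() helper above pairs list_del_init() with wake_up_var() so that anyone sleeping in wait_var_event() on b_list learns the buffer has left its delwri list. A userspace analogue of that wait/wake pairing built on a pthread mutex and condition variable; buf_list_del() and buf_wait_off_list() are illustrative names only, not the kernel primitives:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Userspace stand-in for a buffer that may sit on someone else's list. */
struct buf {
	pthread_mutex_t lock;
	pthread_cond_t	cond;
	bool		on_list;
};

/* Analogue of xfs_buf_list_del(): take it off the list, wake any waiters. */
static void buf_list_del(struct buf *bp)
{
	pthread_mutex_lock(&bp->lock);
	bp->on_list = false;
	pthread_cond_broadcast(&bp->cond);
	pthread_mutex_unlock(&bp->lock);
}

/* Analogue of the wait_var_event() loop: block until the buffer is free. */
static void buf_wait_off_list(struct buf *bp)
{
	pthread_mutex_lock(&bp->lock);
	while (bp->on_list)
		pthread_cond_wait(&bp->cond, &bp->lock);
	pthread_mutex_unlock(&bp->lock);
}

static void *remover(void *arg)
{
	sleep(1);		/* pretend the other list owner finishes I/O */
	buf_list_del(arg);
	return NULL;
}

int main(void)
{
	struct buf bp = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.cond = PTHREAD_COND_INITIALIZER,
		.on_list = true,
	};
	pthread_t t;

	pthread_create(&t, NULL, remover, &bp);
	buf_wait_off_list(&bp);	/* now safe to queue it on our own list */
	printf("buffer is off the other list\n");
	pthread_join(t, NULL);
	return 0;
}

This builds with cc -pthread; in the patch above, the equivalent behaviour comes from the wait_var_event()/wake_up_var() pair on &bp->b_list.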
* @@ -2066,7 +2074,7 @@ xfs_buf_delwri_cancel( xfs_buf_lock(bp); bp->b_flags &= ~_XBF_DELWRI_Q; - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); xfs_buf_relse(bp); } } @@ -2120,6 +2128,34 @@ xfs_buf_delwri_queue( } /* + * Queue a buffer to this delwri list as part of a data integrity operation. + * If the buffer is on any other delwri list, we'll wait for that to clear + * so that the caller can submit the buffer for IO and wait for the result. + * Callers must ensure the buffer is not already on the list. + */ +void +xfs_buf_delwri_queue_here( + struct xfs_buf *bp, + struct list_head *buffer_list) +{ + /* + * We need this buffer to end up on the /caller's/ delwri list, not any + * old list. This can happen if the buffer is marked stale (which + * clears DELWRI_Q) after the AIL queues the buffer to its list but + * before the AIL has a chance to submit the list. + */ + while (!list_empty(&bp->b_list)) { + xfs_buf_unlock(bp); + wait_var_event(&bp->b_list, list_empty(&bp->b_list)); + xfs_buf_lock(bp); + } + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + + xfs_buf_delwri_queue(bp, buffer_list); +} + +/* * Compare function is more complex than it needs to be because * the return value is only 32 bits and we are doing comparisons * on 64 bit values @@ -2181,7 +2217,7 @@ xfs_buf_delwri_submit_buffers( * reference and remove it from the list here. */ if (!(bp->b_flags & _XBF_DELWRI_Q)) { - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); xfs_buf_relse(bp); continue; } @@ -2201,7 +2237,7 @@ xfs_buf_delwri_submit_buffers( list_move_tail(&bp->b_list, wait_list); } else { bp->b_flags |= XBF_ASYNC; - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); } __xfs_buf_submit(bp, false); } @@ -2255,7 +2291,7 @@ xfs_buf_delwri_submit( while (!list_empty(&wait_list)) { bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); /* * Wait on the locked buffer, check for errors and unlock and diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c86e16419656..b470de08a46c 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -319,6 +319,7 @@ extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 9f3ceb461515..cc6dc56f455d 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -18,6 +18,7 @@ #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_error.h" +#include "xfs_health.h" /* * Directory file type support functions @@ -51,7 +52,7 @@ xfs_dir2_sf_getdents( struct xfs_mount *mp = dp->i_mount; xfs_dir2_dataptr_t off; /* current entry's offset */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_dir2_dataptr_t dot_offset; xfs_dir2_dataptr_t dotdot_offset; xfs_ino_t ino; @@ -59,9 +60,7 @@ xfs_dir2_sf_getdents( ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); /* * If the 
block number in the offset is out of range, we're done. @@ -519,6 +518,8 @@ xfs_readdir( if (xfs_is_shutdown(dp->i_mount)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a013b87ab8d5..b4f20d9c8f98 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -172,14 +172,14 @@ xfs_qm_adjust_dqtimers( /* * initialize a buffer full of dquots and log the whole thing */ -STATIC void +void xfs_qm_init_dquot_blk( struct xfs_trans *tp, - struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_buf *bp) { + struct xfs_mount *mp = tp->t_mountp; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dqblk *d; xfs_dqid_t curid; @@ -353,7 +353,7 @@ xfs_dquot_disk_alloc( * Make a chunk of dquots out of this buffer and log * the entire thing. */ - xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp); + xfs_qm_init_dquot_blk(tp, dqp->q_id, qtype, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* @@ -1065,7 +1065,7 @@ xfs_qm_dqput( struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; trace_xfs_dqput_free(dqp); - if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) + if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } xfs_dqunlock(dqp); @@ -1362,34 +1362,3 @@ xfs_qm_exit(void) kmem_cache_destroy(xfs_dqtrx_cache); kmem_cache_destroy(xfs_dquot_cache); } - -/* - * Iterate every dquot of a particular type. The caller must ensure that the - * particular quota type is active. iter_fn can return negative error codes, - * or -ECANCELED to indicate that it wants to stop iterating. - */ -int -xfs_qm_dqiterate( - struct xfs_mount *mp, - xfs_dqtype_t type, - xfs_qm_dqiterate_fn iter_fn, - void *priv) -{ - struct xfs_dquot *dq; - xfs_dqid_t id = 0; - int error; - - do { - error = xfs_qm_dqget_next(mp, id, type, &dq); - if (error == -ENOENT) - return 0; - if (error) - return error; - - error = iter_fn(dq, type, priv); - id = dq->q_id + 1; - xfs_qm_dqput(dq); - } while (error == 0 && id != 0); - - return error; -} diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 80c8f851a2f3..956272d9b302 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -234,12 +234,10 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } -typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, - xfs_dqtype_t type, void *priv); -int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, - xfs_qm_dqiterate_fn iter_fn, void *priv); - time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); time64_t xfs_dquot_set_grace_period(time64_t grace); +void xfs_qm_init_dquot_blk(struct xfs_trans *tp, xfs_dqid_t id, xfs_dqtype_t + type, struct xfs_buf *bp); + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 9ecfdcdc752f..2ccde32c9a9e 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -678,3 +678,16 @@ xfs_extent_busy_ag_cmp( diff = b1->bno - b2->bno; return diff; } + +/* Are there any busy extents in this AG? 
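The removed xfs_qm_dqiterate() above followed a common callback-iterator contract in which the callback may return a negative errno, or -ECANCELED when it simply wants to stop early. A small standalone sketch of that contract with toy record types; iterate_records() and find_big() are invented:

#include <errno.h>
#include <stdio.h>

typedef int (*iter_fn)(int rec, void *priv);

/* Walk the records until the callback reports an error or cancels. */
static int iterate_records(const int *recs, int nr, iter_fn fn, void *priv)
{
	int i, error = 0;

	for (i = 0; i < nr && !error; i++)
		error = fn(recs[i], priv);
	return error;
}

/* Stop after the first record larger than the cutoff stored in *priv. */
static int find_big(int rec, void *priv)
{
	int *cutoff = priv;

	printf("rec %d\n", rec);
	if (rec > *cutoff)
		return -ECANCELED;
	return 0;
}

int main(void)
{
	int recs[] = { 3, 7, 42, 9, 11 };
	int cutoff = 10;
	int error = iterate_records(recs, 5, find_big, &cutoff);

	/* -ECANCELED means "stopped on purpose", not a failure. */
	return (error && error != -ECANCELED) ? 1 : 0;
}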
*/ +bool +xfs_extent_busy_list_empty( + struct xfs_perag *pag) +{ + bool res; + + spin_lock(&pag->pagb_lock); + res = RB_EMPTY_ROOT(&pag->pagb_tree); + spin_unlock(&pag->pagb_lock); + return res; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 0639aab336f3..470032de3139 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -85,4 +85,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) list_sort(NULL, list, xfs_extent_busy_ag_cmp); } +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); + #endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3fa8789820ad..1d1185fca6a5 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -304,39 +304,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = { }; /* - * Allocate an "extent free done" log item that will hold nextents worth of - * extents. The caller must use all nextents extents, because we are not - * flexible about this at all. - */ -static struct xfs_efd_log_item * -xfs_trans_get_efd( - struct xfs_trans *tp, - struct xfs_efi_log_item *efip, - unsigned int nextents) -{ - struct xfs_efd_log_item *efdp; - - ASSERT(nextents > 0); - - if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { - efdp = kzalloc(xfs_efd_log_item_sizeof(nextents), - GFP_KERNEL | __GFP_NOFAIL); - } else { - efdp = kmem_cache_zalloc(xfs_efd_cache, - GFP_KERNEL | __GFP_NOFAIL); - } - - xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, - &xfs_efd_item_ops); - efdp->efd_efip = efip; - efdp->efd_format.efd_nextents = nextents; - efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; - - xfs_trans_add_item(tp, &efdp->efd_item); - return efdp; -} - -/* * Fill the EFD with all extents from the EFI when we need to roll the * transaction and continue with a new EFI. * @@ -364,69 +331,6 @@ xfs_efd_from_efi( efdp->efd_next_extent = efip->efi_format.efi_nextents; } -/* - * Free an extent and log it to the EFD. Note that the transaction is marked - * dirty regardless of whether the extent free succeeds or fails to support the - * EFI/EFD lifecycle rules. - */ -static int -xfs_trans_free_extent( - struct xfs_trans *tp, - struct xfs_efd_log_item *efdp, - struct xfs_extent_free_item *xefi) -{ - struct xfs_owner_info oinfo = { }; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent *extp; - uint next_extent; - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, - xefi->xefi_startblock); - int error; - - oinfo.oi_owner = xefi->xefi_owner; - if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) - oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; - if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) - oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - - trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, - agbno, xefi->xefi_blockcount); - - error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, - xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, - xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - - /* - * If we need a new transaction to make progress, the caller will log a - * new EFI with the current contents. It will also log an EFD to cancel - * the existing EFI, and so we need to copy all the unprocessed extents - * in this EFI to the EFD so this works correctly. 
- */ - if (error == -EAGAIN) { - xfs_efd_from_efi(efdp); - return error; - } - - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = xefi->xefi_startblock; - extp->ext_len = xefi->xefi_blockcount; - efdp->efd_next_extent++; - - return error; -} - /* Sort bmap items by AG. */ static int xfs_extent_free_diff_items( @@ -453,9 +357,6 @@ xfs_extent_free_log_item( uint next_extent; struct xfs_extent *extp; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -481,7 +382,6 @@ xfs_extent_free_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &efip->efi_item); if (sort) list_sort(mp, items, xfs_extent_free_diff_items); list_for_each_entry(xefi, items, xefi_list) @@ -496,7 +396,26 @@ xfs_extent_free_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item; + struct xfs_efi_log_item *efip = EFI_ITEM(intent); + struct xfs_efd_log_item *efdp; + + ASSERT(count > 0); + + if (count > XFS_EFD_MAX_FAST_EXTENTS) { + efdp = kzalloc(xfs_efd_log_item_sizeof(count), + GFP_KERNEL | __GFP_NOFAIL); + } else { + efdp = kmem_cache_zalloc(xfs_efd_cache, + GFP_KERNEL | __GFP_NOFAIL); + } + + xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, + &xfs_efd_item_ops); + efdp->efd_efip = efip; + efdp->efd_format.efd_nextents = count; + efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + + return &efdp->efd_item; } /* Take a passive ref to the AG containing the space we're freeing. */ @@ -527,19 +446,49 @@ xfs_extent_free_finish_item( struct list_head *item, struct xfs_btree_cur **state) { + struct xfs_owner_info oinfo = { }; struct xfs_extent_free_item *xefi; - int error; + struct xfs_efd_log_item *efdp = EFD_ITEM(done); + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent *extp; + uint next_extent; + xfs_agblock_t agbno; + int error = 0; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); + + oinfo.oi_owner = xefi->xefi_owner; + if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, + agbno, xefi->xefi_blockcount); /* - * Don't free the XEFI if we need a new transaction to complete - * processing of it. + * If we need a new transaction to make progress, the caller will log a + * new EFI with the current contents. It will also log an EFD to cancel + * the existing EFI, and so we need to copy all the unprocessed extents + * in this EFI to the EFD so this works correctly. 
*/ - if (error == -EAGAIN) + if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) + error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, + xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, + xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + if (error == -EAGAIN) { + xfs_efd_from_efi(efdp); return error; + } + + /* Add the work we finished to the EFD, even though nobody uses that */ + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; + efdp->efd_next_extent++; xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); @@ -567,15 +516,6 @@ xfs_extent_free_cancel_item( kmem_cache_free(xfs_extfree_item_cache, xefi); } -const struct xfs_defer_op_type xfs_extent_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_extent_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - /* * AGFL blocks are accounted differently in the reserve pools and are not * inserted into the busy extent list. @@ -610,16 +550,6 @@ xfs_agfl_free_finish_item( error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno, agbno, agbp, &oinfo); - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); @@ -632,16 +562,6 @@ xfs_agfl_free_finish_item( return error; } -/* sub-type with special handling for AGFL deferred frees */ -const struct xfs_defer_op_type xfs_agfl_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_agfl_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - /* Is this recovered EFI ok? */ static inline bool xfs_efi_validate_ext( @@ -651,23 +571,41 @@ xfs_efi_validate_ext( return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len); } +static inline void +xfs_efi_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_extent *extp) +{ + struct xfs_extent_free_item *xefi; + + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); + xefi->xefi_startblock = extp->ext_start; + xefi->xefi_blockcount = extp->ext_len; + xefi->xefi_agresv = XFS_AG_RESV_NONE; + xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN; + xfs_extent_free_get_group(mp, xefi); + + xfs_defer_add_item(dfp, &xefi->xefi_list); +} + /* * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. 
*/ STATIC int -xfs_efi_item_recover( - struct xfs_log_item *lip, +xfs_extent_free_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_efd_log_item *efdp; struct xfs_trans *tp; int i; int error = 0; - bool requeue_only = false; /* * First check the validity of the extents described by the @@ -682,55 +620,22 @@ xfs_efi_item_recover( sizeof(efip->efi_format)); return -EFSCORRUPTED; } + + xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp); if (error) return error; - efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - - for (i = 0; i < efip->efi_format.efi_nextents; i++) { - struct xfs_extent_free_item fake = { - .xefi_owner = XFS_RMAP_OWN_UNKNOWN, - .xefi_agresv = XFS_AG_RESV_NONE, - }; - struct xfs_extent *extp; - extp = &efip->efi_format.efi_extents[i]; - - fake.xefi_startblock = extp->ext_start; - fake.xefi_blockcount = extp->ext_len; - - if (!requeue_only) { - xfs_extent_free_get_group(mp, &fake); - error = xfs_trans_free_extent(tp, efdp, &fake); - xfs_extent_free_put_group(&fake); - } - - /* - * If we can't free the extent without potentially deadlocking, - * requeue the rest of the extents to a new so that they get - * run again later with a new transaction context. - */ - if (error == -EAGAIN || requeue_only) { - error = xfs_free_extent_later(tp, fake.xefi_startblock, - fake.xefi_blockcount, - &XFS_RMAP_OINFO_ANY_OWNER, - fake.xefi_agresv); - if (!error) { - requeue_only = true; - continue; - } - } - - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - extp, sizeof(*extp)); - if (error) - goto abort_error; - - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &efip->efi_format, + sizeof(efip->efi_format)); + if (error) + goto abort_error; return xfs_defer_ops_capture_and_commit(tp, capture_list); @@ -739,21 +644,14 @@ abort_error: return error; } -STATIC bool -xfs_efi_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return EFI_ITEM(lip)->efi_format.efi_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_efi_item_relog( +xfs_extent_free_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_efd_log_item *efdp; + struct xfs_efd_log_item *efdp = EFD_ITEM(done_item); struct xfs_efi_log_item *efip; struct xfs_extent *extp; unsigned int count; @@ -761,29 +659,56 @@ xfs_efi_item_relog( count = EFI_ITEM(intent)->efi_format.efi_nextents; extp = EFI_ITEM(intent)->efi_format.efi_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count); efdp->efd_next_extent = count; memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); efip = xfs_efi_init(tp->t_mountp, count); memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); atomic_set(&efip->efi_next_extent, count); - xfs_trans_add_item(tp, &efip->efi_item); - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); + return &efip->efi_item; } +const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .name = "extent_free", + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_extent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; + +/* sub-type with special handling for AGFL deferred frees */ +const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .name = "agfl_free", + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; + +STATIC bool +xfs_efi_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return EFI_ITEM(lip)->efi_format.efi_id == intent_id; +} + static const struct xfs_item_ops xfs_efi_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_unpin = xfs_efi_item_unpin, .iop_release = xfs_efi_item_release, - .iop_recover = xfs_efi_item_recover, .iop_match = xfs_efi_item_match, - .iop_relog = xfs_efi_item_relog, }; /* @@ -820,12 +745,9 @@ xlog_recover_efi_commit_pass2( return error; } atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
- */ - xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn); - xfs_efi_release(efip); + + xlog_recover_intent_item(log, &efip->efi_item, lsn, + &xfs_extent_free_defer_type); return 0; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7cb75cb6b8e9..83f708f62ed9 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -134,6 +134,10 @@ xfs_growfs_data_private( if (delta < 0 && nagcount < 2) return -EINVAL; + /* No work to do */ + if (delta == 0) + return 0; + oagcount = mp->m_sb.sb_agcount; /* allocate the new per-ag structures */ if (nagcount > oagcount) { @@ -153,7 +157,7 @@ xfs_growfs_data_private( error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0, 0, &tp); if (error) - return error; + goto out_free_unused_perag; last_pag = xfs_perag_get(mp, oagcount - 1); if (delta > 0) { @@ -227,6 +231,9 @@ xfs_growfs_data_private( out_trans_cancel: xfs_trans_cancel(tp); +out_free_unused_perag: + if (nagcount > oagcount) + xfs_free_unused_perag_range(mp, oagcount, nagcount); return error; } @@ -344,59 +351,20 @@ xfs_growfs_log( } /* - * exported through ioctl XFS_IOC_FSCOUNTS - */ - -void -xfs_fs_counts( - xfs_mount_t *mp, - xfs_fsop_counts_t *cnt) -{ - cnt->allocino = percpu_counter_read_positive(&mp->m_icount); - cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); - cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp); - cnt->freertx = percpu_counter_read_positive(&mp->m_frextents); -} - -/* - * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS - * - * xfs_reserve_blocks is called to set m_resblks - * in the in-core mount table. The number of unused reserved blocks - * is kept in m_resblks_avail. - * * Reserve the requested number of blocks if available. Otherwise return * as many as possible to satisfy the request. The actual number - * reserved are returned in outval - * - * A null inval pointer indicates that only the current reserved blocks - * available should be returned no settings are changed. + * reserved are returned in outval. */ - int xfs_reserve_blocks( - xfs_mount_t *mp, - uint64_t *inval, - xfs_fsop_resblks_t *outval) + struct xfs_mount *mp, + uint64_t request) { int64_t lcounter, delta; int64_t fdblks_delta = 0; - uint64_t request; int64_t free; int error = 0; - /* If inval is null, report current values and return */ - if (inval == (uint64_t *)NULL) { - if (!outval) - return -EINVAL; - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - return 0; - } - - request = *inval; - /* * With per-cpu counters, this becomes an interesting problem. 
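With xfs_reserve_blocks() reduced to taking a plain request, reading the pool counters back is now entirely the job of the ioctl paths reworked later in this diff. The userspace interface itself is unchanged; a hedged example of querying it, assuming the xfsprogs development headers provide <xfs/xfs_fs.h>:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>

int main(int argc, char **argv)
{
        struct xfs_fsop_resblks res = { 0 };
        int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

        /* Needs CAP_SYS_ADMIN, matching the check in the xfs_ioctl.c hunk below. */
        if (fd < 0 || ioctl(fd, XFS_IOC_GET_RESBLKS, &res) < 0) {
                perror("XFS_IOC_GET_RESBLKS");
                return 1;
        }
        printf("reserved %llu, available %llu\n",
               (unsigned long long)res.resblks,
               (unsigned long long)res.resblks_avail);
        close(fd);
        return 0;
}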
we need * to work out if we are freeing or allocation blocks first, then we can @@ -466,11 +434,6 @@ xfs_reserve_blocks( spin_lock(&mp->m_sb_lock); } out: - if (outval) { - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - } - spin_unlock(&mp->m_sb_lock); return error; } @@ -482,9 +445,9 @@ xfs_fs_goingdown( { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { - if (!freeze_bdev(mp->m_super->s_bdev)) { + if (!bdev_freeze(mp->m_super->s_bdev)) { xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); - thaw_bdev(mp->m_super->s_bdev); + bdev_thaw(mp->m_super->s_bdev); } break; } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 2cffe51a31e8..44457b0a0593 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -6,14 +6,12 @@ #ifndef __XFS_FSOPS_H__ #define __XFS_FSOPS_H__ -extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); -extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); -extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, - xfs_fsop_resblks_t *outval); -extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); +int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); +int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); +int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); +int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); -extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); -extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); +int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); +int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 9edc1f2bc939..f18fec0adf66 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -44,4 +44,16 @@ struct xfs_globals xfs_globals = { .pwork_threads = -1, /* automatic thread detection */ .larp = false, /* log attribute replay */ #endif + + /* + * Leave this many record slots empty when bulk loading btrees. By + * default we load new btree leaf blocks 75% full. + */ + .bload_leaf_slack = -1, + + /* + * Leave this many key/ptr slots empty when bulk loading btrees. By + * default we load new btree node blocks 75% full. 
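The two bload_*_slack knobs being added here default to -1, which the btree bulk loader reads as "pick a sensible default", namely leaving roughly a quarter of each new block's slots empty so blocks end up about 75% full. A rough illustration of that policy, simplified and not the kernel implementation:

static unsigned int bload_records_per_block(unsigned int maxrecs, int slack)
{
        if (slack < 0)                  /* -1 means "choose a default"   */
                slack = maxrecs / 4;    /* leave ~25% of the slots empty */
        if ((unsigned int)slack > maxrecs)
                slack = maxrecs;
        return maxrecs - slack;
}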
+ */ + .bload_node_slack = -1, }; diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 72a075bb2c10..9a57afee9338 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -222,7 +222,7 @@ xfs_inode_mark_sick( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); trace_xfs_inode_mark_sick(ip, mask); spin_lock(&ip->i_flags_lock); @@ -246,7 +246,7 @@ xfs_inode_mark_healthy( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); trace_xfs_inode_mark_healthy(ip, mask); spin_lock(&ip->i_flags_lock); @@ -369,6 +369,10 @@ static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR }, { XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK }, { XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT }, + { XFS_SICK_INO_BMBTD_ZAPPED, XFS_BS_SICK_BMBTD }, + { XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA }, + { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR }, + { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK }, { 0, 0 }, }; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c0f1c89786c2..1fd94958aa97 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -37,15 +37,10 @@ #include "xfs_reflink.h" #include "xfs_ag.h" #include "xfs_log_priv.h" +#include "xfs_health.h" struct kmem_cache *xfs_inode_cache; -/* - * Used in xfs_itruncate_extents(). This is the maximum number of extents - * freed from a file in a single transaction. - */ -#define XFS_ITRUNC_MAX_EXTENTS 2 - STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_inode *); @@ -661,6 +656,8 @@ xfs_lookup( if (xfs_is_shutdown(dp->i_mount)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); if (error) @@ -875,7 +872,7 @@ xfs_init_new_inode( case S_IFLNK: ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_bytes = 0; - ip->i_df.if_u1.if_root = NULL; + ip->i_df.if_data = NULL; break; default: ASSERT(0); @@ -978,6 +975,8 @@ xfs_create( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; prid = xfs_get_initial_prid(dp); @@ -1217,6 +1216,8 @@ xfs_link( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(tdp, XFS_DATA_FORK)) + return -EIO; error = xfs_qm_dqattach(sip); if (error) @@ -1339,7 +1340,6 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_filblks_t unmap_len; int error = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -1371,19 +1371,10 @@ xfs_itruncate_extents_flags( return 0; } - unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; - while (unmap_len > 0) { - ASSERT(tp->t_highest_agno == NULLAGNUMBER); - error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, - flags, XFS_ITRUNC_MAX_EXTENTS); - if (error) - goto out; - - /* free the just unmapped extents */ - error = xfs_defer_finish(&tp); - if (error) - goto out; - } + error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block, + XFS_MAX_FILEOFF); + if (error) + goto out; if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ @@ -2387,8 +2378,8 @@ xfs_ifree( * already been freed by xfs_attr_inactive. 
*/ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - kmem_free(ip->i_df.if_u1.if_data); - ip->i_df.if_u1.if_data = NULL; + kmem_free(ip->i_df.if_data); + ip->i_df.if_data = NULL; ip->i_df.if_bytes = 0; } @@ -2506,6 +2497,8 @@ xfs_remove( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; error = xfs_qm_dqattach(dp); if (error) @@ -3758,3 +3751,29 @@ xfs_inode_reload_unlinked( return error; } + +/* Has this inode fork been zapped by repair? */ +bool +xfs_ifork_zapped( + const struct xfs_inode *ip, + int whichfork) +{ + unsigned int datamask = 0; + + switch (whichfork) { + case XFS_DATA_FORK: + switch (ip->i_vnode.i_mode & S_IFMT) { + case S_IFDIR: + datamask = XFS_SICK_INO_DIR_ZAPPED; + break; + case S_IFLNK: + datamask = XFS_SICK_INO_SYMLINK_ZAPPED; + break; + } + return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask); + case XFS_ATTR_FORK: + return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED; + default: + return false; + } +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3beb470f1892..97f63bacd4c2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -622,4 +622,6 @@ xfs_inode_unlinked_incomplete( int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); int xfs_inode_reload_unlinked(struct xfs_inode *ip); +bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index cd7803fda8b1..0aee97ba0be8 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -352,11 +352,10 @@ xfs_inode_item_format_data_fork( ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_df.if_data != NULL); ASSERT(ip->i_disk_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, - ip->i_df.if_u1.if_data, - ip->i_df.if_bytes); + ip->i_df.if_data, ip->i_df.if_bytes); ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes; ilf->ilf_size++; } else { @@ -431,10 +430,9 @@ xfs_inode_item_format_attr_fork( if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_af.if_bytes > 0) { - ASSERT(ip->i_af.if_u1.if_data != NULL); + ASSERT(ip->i_af.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, - ip->i_af.if_u1.if_data, - ip->i_af.if_bytes); + ip->i_af.if_data, ip->i_af.if_bytes); ilf->ilf_asize = (unsigned)ip->i_af.if_bytes; ilf->ilf_size++; } else { @@ -557,6 +555,9 @@ xfs_inode_to_log_dinode( memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_v3_pad = 0; + + /* dummy value for initialisation */ + to->di_crc = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6c3919687ea6..f02b6e558af5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1872,6 +1872,63 @@ xfs_fs_eofblocks_from_user( return 0; } +static int +xfs_ioctl_getset_resblocks( + struct file *filp, + unsigned int cmd, + void __user *arg) +{ + struct xfs_mount *mp = XFS_I(file_inode(filp))->i_mount; + struct xfs_fsop_resblks fsop = { }; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (cmd == XFS_IOC_SET_RESBLKS) { + if (xfs_is_readonly(mp)) + return -EROFS; + + if (copy_from_user(&fsop, arg, sizeof(fsop))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_reserve_blocks(mp, fsop.resblks); + mnt_drop_write_file(filp); + if (error) + 
return error; + } + + spin_lock(&mp->m_sb_lock); + fsop.resblks = mp->m_resblks; + fsop.resblks_avail = mp->m_resblks_avail; + spin_unlock(&mp->m_sb_lock); + + if (copy_to_user(arg, &fsop, sizeof(fsop))) + return -EFAULT; + return 0; +} + +static int +xfs_ioctl_fs_counts( + struct xfs_mount *mp, + struct xfs_fsop_counts __user *uarg) +{ + struct xfs_fsop_counts out = { + .allocino = percpu_counter_read_positive(&mp->m_icount), + .freeino = percpu_counter_read_positive(&mp->m_ifree), + .freedata = percpu_counter_read_positive(&mp->m_fdblocks) - + xfs_fdblocks_unavailable(mp), + .freertx = percpu_counter_read_positive(&mp->m_frextents), + }; + + if (copy_to_user(uarg, &out, sizeof(out))) + return -EFAULT; + return 0; +} + /* * These long-unused ioctls were removed from the official ioctl API in 5.17, * but retain these definitions so that we can log warnings about them. @@ -2008,60 +2065,12 @@ xfs_file_ioctl( return error; } - case XFS_IOC_FSCOUNTS: { - xfs_fsop_counts_t out; + case XFS_IOC_FSCOUNTS: + return xfs_ioctl_fs_counts(mp, arg); - xfs_fs_counts(mp, &out); - - if (copy_to_user(arg, &out, sizeof(out))) - return -EFAULT; - return 0; - } - - case XFS_IOC_SET_RESBLKS: { - xfs_fsop_resblks_t inout; - uint64_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (xfs_is_readonly(mp)) - return -EROFS; - - if (copy_from_user(&inout, arg, sizeof(inout))) - return -EFAULT; - - error = mnt_want_write_file(filp); - if (error) - return error; - - /* input parameter is passed in resblks field of structure */ - in = inout.resblks; - error = xfs_reserve_blocks(mp, &in, &inout); - mnt_drop_write_file(filp); - if (error) - return error; - - if (copy_to_user(arg, &inout, sizeof(inout))) - return -EFAULT; - return 0; - } - - case XFS_IOC_GET_RESBLKS: { - xfs_fsop_resblks_t out; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - error = xfs_reserve_blocks(mp, NULL, &out); - if (error) - return error; - - if (copy_to_user(arg, &out, sizeof(out))) - return -EFAULT; - - return 0; - } + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + return xfs_ioctl_getset_resblocks(filp, cmd, arg); case XFS_IOC_FSGROWFSDATA: { struct xfs_growfs_data in; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ee206facf0dc..a1650fc81382 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1542,6 +1542,7 @@ xlog_alloc_log( log->l_covered_state = XLOG_STATE_COVER_IDLE; set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); + INIT_LIST_HEAD(&log->r_dfops); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index fa3ad1d7b31c..e30c06ec20e3 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -407,6 +407,7 @@ struct xlog { long l_opstate; /* operational state */ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; + struct list_head r_dfops; /* recovered log intent items */ int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a1e18b24971a..1251c81e55f9 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1723,30 +1723,24 @@ xlog_clear_stale_blocks( */ void xlog_recover_release_intent( - struct xlog *log, - unsigned short intent_type, - uint64_t intent_id) + struct xlog *log, + unsigned short intent_type, + uint64_t intent_id) { - struct 
xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp = log->l_ailp; + struct xfs_defer_pending *dfp, *n; + + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + struct xfs_log_item *lip = dfp->dfp_intent; - spin_lock(&ailp->ail_lock); - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { if (lip->li_type != intent_type) continue; if (!lip->li_ops->iop_match(lip, intent_id)) continue; - spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - break; - } + ASSERT(xlog_item_is_intent(lip)); - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + xfs_defer_cancel_recovery(log->l_mp, dfp); + } } int @@ -1939,6 +1933,29 @@ xlog_buf_readahead( xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops); } +/* + * Create a deferred work structure for resuming and tracking the progress of a + * log intent item that was found during recovery. + */ +void +xlog_recover_intent_item( + struct xlog *log, + struct xfs_log_item *lip, + xfs_lsn_t lsn, + const struct xfs_defer_op_type *ops) +{ + ASSERT(xlog_item_is_intent(lip)); + + xfs_defer_start_recovery(lip, &log->r_dfops, ops); + + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, lip, lsn); + lip->li_ops->iop_unpin(lip, 0); +} + STATIC int xlog_recover_items_pass2( struct xlog *log, @@ -2533,36 +2550,26 @@ xlog_abort_defer_ops( */ STATIC int xlog_recover_process_intents( - struct xlog *log) + struct xlog *log) { LIST_HEAD(capture_list); - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp; - int error = 0; + struct xfs_defer_pending *dfp, *n; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) - xfs_lsn_t last_lsn; -#endif + xfs_lsn_t last_lsn; - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); -#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { - const struct xfs_item_ops *ops; - if (!xlog_item_is_intent(lip)) - break; + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); /* * We should never see a redo item with a LSN higher than * the last transaction we found in the log at the start * of recovery. */ - ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); + ASSERT(XFS_LSN_CMP(last_lsn, dfp->dfp_intent->li_lsn) >= 0); /* * NOTE: If your intent processing routine can create more @@ -2571,21 +2578,14 @@ xlog_recover_process_intents( * replayed in the wrong order! * * The recovery function can free the log item, so we must not - * access lip after it returns. + * access dfp->dfp_intent after it returns. It must dispose of + * @dfp if it returns 0. 
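Recovered intents are now carried on the new r_dfops list as xfs_defer_pending work: xlog_recover_intent_item() queues them, xlog_recover_process_intents() finishes them through the defer op type, and anything left over is cancelled off the same list. A stripped-down userspace sketch of that loop, with invented types; the EAGAIN case mirrors the convention described above, where the item simply stays queued rather than counting as a failure:

#include <errno.h>
#include <stddef.h>

struct pending;
struct work_ops {
        int  (*finish)(struct pending *p);
        void (*cancel)(struct pending *p);
};

struct pending {
        const struct work_ops *ops;
        struct pending        *next;
};

/* Finish each recovered item in order; the caller cancels what remains on error. */
static int process_recovered(struct pending *head)
{
        struct pending *p;

        for (p = head; p; p = p->next) {
                int error = p->ops->finish(p);

                if (error == EAGAIN)    /* more work queued, not a failure */
                        continue;
                if (error)
                        return error;
        }
        return 0;
}

static void cancel_recovered(struct pending *head)
{
        struct pending *p;

        for (p = head; p; p = p->next)
                p->ops->cancel(p);
}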
*/ - spin_unlock(&ailp->ail_lock); - ops = lip->li_ops; - error = ops->iop_recover(lip, &capture_list); - spin_lock(&ailp->ail_lock); - if (error) { - trace_xlog_intent_recovery_failed(log->l_mp, error, - ops->iop_recover); + error = xfs_defer_finish_recovery(log->l_mp, dfp, + &capture_list); + if (error) break; - } } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); if (error) goto err; @@ -2606,27 +2606,34 @@ err: */ STATIC void xlog_recover_cancel_intents( - struct xlog *log) + struct xlog *log) { - struct xfs_log_item *lip; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp; - - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (!xlog_item_is_intent(lip)) - break; + struct xfs_defer_pending *dfp, *n; - spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_next(ailp, &cur); + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); + + xfs_defer_cancel_recovery(log->l_mp, dfp); } +} - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); +/* + * Transfer ownership of the recovered pending work to the recovery transaction + * and try to finish the work. If there is more work to be done, the dfp will + * remain attached to the transaction. If not, the dfp is freed. + */ +int +xlog_recover_finish_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + int error; + + list_move(&dfp->dfp_list, &tp->t_dfops); + error = xfs_defer_finish_one(tp, dfp); + if (error == -EAGAIN) + return 0; + return error; } /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index aed5be5508fe..aabb25dc3efa 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -637,7 +637,6 @@ xfs_mountfs( struct xfs_sb *sbp = &(mp->m_sb); struct xfs_inode *rip; struct xfs_ino_geometry *igeo = M_IGEO(mp); - uint64_t resblks; uint quotamount = 0; uint quotaflags = 0; int error = 0; @@ -974,8 +973,7 @@ xfs_mountfs( * we were already there on the last unmount. Warn if this occurs. */ if (!xfs_is_readonly(mp)) { - resblks = xfs_default_resblks(mp); - error = xfs_reserve_blocks(mp, &resblks, NULL); + error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); if (error) xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool."); @@ -1053,7 +1051,6 @@ void xfs_unmountfs( struct xfs_mount *mp) { - uint64_t resblks; int error; /* @@ -1090,8 +1087,7 @@ xfs_unmountfs( * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ - resblks = 0; - error = xfs_reserve_blocks(mp, &resblks, NULL); + error = xfs_reserve_blocks(mp, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index a7daa522e00f..fa50e5308292 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/dax.h> +#include <linux/fs.h> struct xfs_failure_info { xfs_agblock_t startblock; @@ -73,10 +74,16 @@ xfs_dax_failure_fn( struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip; struct xfs_failure_info *notify = data; + struct address_space *mapping; + pgoff_t pgoff; + unsigned long pgcnt; int error = 0; if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { + /* Continue the query because this isn't a failure. 
*/ + if (notify->mf_flags & MF_MEM_PRE_REMOVE) + return 0; notify->want_shutdown = true; return 0; } @@ -92,15 +99,61 @@ xfs_dax_failure_fn( return 0; } - error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, - xfs_failure_pgoff(mp, rec, notify), - xfs_failure_pgcnt(mp, rec, notify), - notify->mf_flags); + mapping = VFS_I(ip)->i_mapping; + pgoff = xfs_failure_pgoff(mp, rec, notify); + pgcnt = xfs_failure_pgcnt(mp, rec, notify); + + /* Continue the rmap query if the inode isn't a dax file. */ + if (dax_mapping(mapping)) + error = mf_dax_kill_procs(mapping, pgoff, pgcnt, + notify->mf_flags); + + /* Invalidate the cache in dax pages. */ + if (notify->mf_flags & MF_MEM_PRE_REMOVE) + invalidate_inode_pages2_range(mapping, pgoff, + pgoff + pgcnt - 1); + xfs_irele(ip); return error; } static int +xfs_dax_notify_failure_freeze( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + int error; + + error = freeze_super(sb, FREEZE_HOLDER_KERNEL); + if (error) + xfs_emerg(mp, "already frozen by kernel, err=%d", error); + + return error; +} + +static void +xfs_dax_notify_failure_thaw( + struct xfs_mount *mp, + bool kernel_frozen) +{ + struct super_block *sb = mp->m_super; + int error; + + if (kernel_frozen) { + error = thaw_super(sb, FREEZE_HOLDER_KERNEL); + if (error) + xfs_emerg(mp, "still frozen after notify failure, err=%d", + error); + } + + /* + * Also thaw userspace call anyway because the device is about to be + * removed immediately. + */ + thaw_super(sb, FREEZE_HOLDER_USERSPACE); +} + +static int xfs_dax_notify_ddev_failure( struct xfs_mount *mp, xfs_daddr_t daddr, @@ -112,15 +165,29 @@ xfs_dax_notify_ddev_failure( struct xfs_btree_cur *cur = NULL; struct xfs_buf *agf_bp = NULL; int error = 0; + bool kernel_frozen = false; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1); xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); + if (mf_flags & MF_MEM_PRE_REMOVE) { + xfs_info(mp, "Device is about to be removed!"); + /* + * Freeze fs to prevent new mappings from being created. + * - Keep going on if others already hold the kernel forzen. + * - Keep going on if other errors too because this device is + * starting to fail. + * - If kernel frozen state is hold successfully here, thaw it + * here as well at the end. + */ + kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0; + } + error = xfs_trans_alloc_empty(mp, &tp); if (error) - return error; + goto out; for (; agno <= end_agno; agno++) { struct xfs_rmap_irec ri_low = { }; @@ -165,11 +232,26 @@ xfs_dax_notify_ddev_failure( } xfs_trans_cancel(tp); - if (error || notify.want_shutdown) { + + /* + * Shutdown fs from a force umount in pre-remove case which won't fail, + * so errors can be ignored. Otherwise, shutdown the filesystem with + * CORRUPT flag if error occured or notify.want_shutdown was set during + * RMAP querying. + */ + if (mf_flags & MF_MEM_PRE_REMOVE) + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + else if (error || notify.want_shutdown) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); if (!error) error = -EFSCORRUPTED; } + +out: + /* Thaw the fs if it has been frozen before. 
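The pre-remove ordering spelled out in the comments above (freeze if possible, run the notification while no new mappings can appear, force the shutdown, then release only the freeze this path took) reduces to a small shape. A sketch with stub helpers standing in for freeze_super()/thaw_super() and the shutdown call; none of these are the kernel interfaces:

#include <stdbool.h>
#include <stdio.h>

static int  freeze_fs(void)      { puts("freeze");   return 0; }
static void thaw_fs(void)        { puts("thaw"); }
static void force_shutdown(void) { puts("shutdown"); }
static int  notify_failure(void) { puts("notify");   return 0; }

static int handle_pre_remove(void)
{
        bool frozen = (freeze_fs() == 0);   /* keep going even if this fails   */
        int error = notify_failure();       /* no new mappings while frozen    */

        force_shutdown();                   /* the device is going away anyway */
        if (frozen)
                thaw_fs();                  /* drop only the freeze we took    */
        return error;
}

int main(void) { return handle_pre_remove(); }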
*/ + if (mf_flags & MF_MEM_PRE_REMOVE) + xfs_dax_notify_failure_thaw(mp, kernel_frozen); + return error; } @@ -197,6 +279,14 @@ xfs_dax_notify_failure( if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && mp->m_logdev_targp != mp->m_ddev_targp) { + /* + * In the pre-remove case the failure notification is attempting + * to trigger a force unmount. The expectation is that the + * device is still present, but its removal is in progress and + * can not be cancelled, proceed with accessing the log device. + */ + if (mf_flags & MF_MEM_PRE_REMOVE) + return 0; xfs_err(mp, "ondisk log corrupt, shutting down fs!"); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); return -EFSCORRUPTED; @@ -210,6 +300,12 @@ xfs_dax_notify_failure( ddev_start = mp->m_ddev_targp->bt_dax_part_off; ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; + /* Notify failure on the whole device. */ + if (offset == 0 && len == U64_MAX) { + offset = ddev_start; + len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev); + } + /* Ignore the range out of filesystem area */ if (offset + len - 1 < ddev_start) return -ENXIO; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 94a7932ac570..67d0a8564ff3 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -171,7 +171,7 @@ xfs_qm_dqpurge( * hits zero, so it really should be on the freelist here. */ ASSERT(!list_empty(&dqp->q_lru)); - list_lru_del(&qi->qi_lru, &dqp->q_lru); + list_lru_del_obj(&qi->qi_lru, &dqp->q_lru); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index dcc785fdd345..e0d56489f3b2 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -127,7 +127,10 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, } #define xfs_trans_dup_dqinfo(tp, tp2) #define xfs_trans_free_dqinfo(tp) -#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0) +static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp, + struct xfs_inode *ip, uint field, int64_t delta) +{ +} #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 2d4444d61e98..20ad8086da60 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -227,52 +227,6 @@ static const struct xfs_item_ops xfs_cud_item_ops = { .iop_intent = xfs_cud_item_intent, }; -static struct xfs_cud_log_item * -xfs_trans_get_cud( - struct xfs_trans *tp, - struct xfs_cui_log_item *cuip) -{ - struct xfs_cud_log_item *cudp; - - cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, - &xfs_cud_item_ops); - cudp->cud_cuip = cuip; - cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; - - xfs_trans_add_item(tp, &cudp->cud_item); - return cudp; -} - -/* - * Finish an refcount update and log it to the CUD. Note that the - * transaction is marked dirty regardless of whether the refcount - * update succeeds or fails to support the CUI/CUD lifecycle rules. - */ -static int -xfs_trans_log_finish_refcount_update( - struct xfs_trans *tp, - struct xfs_cud_log_item *cudp, - struct xfs_refcount_intent *ri, - struct xfs_btree_cur **pcur) -{ - int error; - - error = xfs_refcount_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) 
releases the CUI and frees the CUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - - return error; -} - /* Sort refcount intents by AG. */ static int xfs_refcount_update_diff_items( @@ -318,9 +272,6 @@ xfs_refcount_update_log_item( uint next_extent; struct xfs_phys_extent *pmap; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -347,7 +298,6 @@ xfs_refcount_update_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &cuip->cui_item); if (sort) list_sort(mp, items, xfs_refcount_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -362,7 +312,16 @@ xfs_refcount_update_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item; + struct xfs_cui_log_item *cuip = CUI_ITEM(intent); + struct xfs_cud_log_item *cudp; + + cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, + &xfs_cud_item_ops); + cudp->cud_cuip = cuip; + cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; + + return &cudp->cud_item; } /* Take a passive ref to the AG containing the space we're refcounting. */ @@ -397,10 +356,9 @@ xfs_refcount_update_finish_item( int error; ri = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, - state); /* Did we run out of reservation? Requeue what we didn't finish. */ + error = xfs_refcount_finish_one(tp, ri, state); if (!error && ri->ri_blockcount > 0) { ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || ri->ri_type == XFS_REFCOUNT_DECREASE); @@ -433,16 +391,6 @@ xfs_refcount_update_cancel_item( kmem_cache_free(xfs_refcount_intent_cache, ri); } -const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .create_intent = xfs_refcount_update_create_intent, - .abort_intent = xfs_refcount_update_abort_intent, - .create_done = xfs_refcount_update_create_done, - .finish_item = xfs_refcount_update_finish_item, - .finish_cleanup = xfs_refcount_finish_one_cleanup, - .cancel_item = xfs_refcount_update_cancel_item, -}; - /* Is this recovered CUI ok? */ static inline bool xfs_cui_validate_phys( @@ -468,23 +416,38 @@ xfs_cui_validate_phys( return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); } +static inline void +xfs_cui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_phys_extent *pmap) +{ + struct xfs_refcount_intent *ri; + + ri = kmem_cache_alloc(xfs_refcount_intent_cache, + GFP_NOFS | __GFP_NOFAIL); + ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + ri->ri_startblock = pmap->pe_startblock; + ri->ri_blockcount = pmap->pe_len; + xfs_refcount_update_get_group(mp, ri); + + xfs_defer_add_item(dfp, &ri->ri_list); +} + /* * Process a refcount update intent item that was recovered from the log. * We need to update the refcountbt. 
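Looking back at the xfs_quota.h hunk a little earlier: swapping the do-nothing macro for a typed static inline means !CONFIG_XFS_QUOTA builds still type-check the arguments callers pass, while generating no code. The same idiom in a generic, self-contained form (names invented):

struct widget;

#ifdef CONFIG_FEATURE
void feature_account(struct widget *w, int delta);
#else
/* Compiled-out stub: produces no code, but w and delta are still type-checked. */
static inline void feature_account(struct widget *w, int delta)
{
}
#endif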
*/ STATIC int -xfs_cui_item_recover( - struct xfs_log_item *lip, +xfs_refcount_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); - struct xfs_cud_log_item *cudp; struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - unsigned int refc_type; - bool requeue_only = false; int i; int error = 0; @@ -501,6 +464,8 @@ xfs_cui_item_recover( sizeof(cuip->cui_format)); return -EFSCORRUPTED; } + + xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]); } /* @@ -521,100 +486,28 @@ xfs_cui_item_recover( if (error) return error; - cudp = xfs_trans_get_cud(tp, cuip); - - for (i = 0; i < cuip->cui_format.cui_nextents; i++) { - struct xfs_refcount_intent fake = { }; - struct xfs_phys_extent *pmap; - - pmap = &cuip->cui_format.cui_extents[i]; - refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; - switch (refc_type) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: - case XFS_REFCOUNT_FREE_COW: - fake.ri_type = refc_type; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &cuip->cui_format, - sizeof(cuip->cui_format)); - error = -EFSCORRUPTED; - goto abort_error; - } - - fake.ri_startblock = pmap->pe_startblock; - fake.ri_blockcount = pmap->pe_len; - - if (!requeue_only) { - xfs_refcount_update_get_group(mp, &fake); - error = xfs_trans_log_finish_refcount_update(tp, cudp, - &fake, &rcur); - xfs_refcount_update_put_group(&fake); - } - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &cuip->cui_format, - sizeof(cuip->cui_format)); - if (error) - goto abort_error; - - /* Requeue what we didn't finish. */ - if (fake.ri_blockcount > 0) { - struct xfs_bmbt_irec irec = { - .br_startblock = fake.ri_startblock, - .br_blockcount = fake.ri_blockcount, - }; - - switch (fake.ri_type) { - case XFS_REFCOUNT_INCREASE: - xfs_refcount_increase_extent(tp, &irec); - break; - case XFS_REFCOUNT_DECREASE: - xfs_refcount_decrease_extent(tp, &irec); - break; - case XFS_REFCOUNT_ALLOC_COW: - xfs_refcount_alloc_cow_extent(tp, - irec.br_startblock, - irec.br_blockcount); - break; - case XFS_REFCOUNT_FREE_COW: - xfs_refcount_free_cow_extent(tp, - irec.br_startblock, - irec.br_blockcount); - break; - default: - ASSERT(0); - } - requeue_only = true; - } - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &cuip->cui_format, + sizeof(cuip->cui_format)); + if (error) + goto abort_error; - xfs_refcount_finish_one_cleanup(tp, rcur, error); return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: - xfs_refcount_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } -STATIC bool -xfs_cui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return CUI_ITEM(lip)->cui_format.cui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_cui_item_relog( +xfs_refcount_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_cud_log_item *cudp; struct xfs_cui_log_item *cuip; struct xfs_phys_extent *pmap; unsigned int count; @@ -622,27 +515,41 @@ xfs_cui_item_relog( count = CUI_ITEM(intent)->cui_format.cui_nextents; pmap = CUI_ITEM(intent)->cui_format.cui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - cuip = xfs_cui_init(tp->t_mountp, count); memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); - xfs_trans_add_item(tp, &cuip->cui_item); - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); + return &cuip->cui_item; } +const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .name = "refcount", + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_refcount_update_finish_item, + .finish_cleanup = xfs_refcount_finish_one_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, + .recover_work = xfs_refcount_recover_work, + .relog_intent = xfs_refcount_relog_intent, +}; + +STATIC bool +xfs_cui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return CUI_ITEM(lip)->cui_format.cui_id == intent_id; +} + static const struct xfs_item_ops xfs_cui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_cui_item_size, .iop_format = xfs_cui_item_format, .iop_unpin = xfs_cui_item_unpin, .iop_release = xfs_cui_item_release, - .iop_recover = xfs_cui_item_recover, .iop_match = xfs_cui_item_match, - .iop_relog = xfs_cui_item_relog, }; static inline void @@ -696,12 +603,9 @@ xlog_recover_cui_commit_pass2( cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
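The relog_intent hooks now collected in the defer op type tables all follow the same shape: note in the old intent's done item that everything it covered has been handled, then emit a fresh, identical intent so the pinned work moves forward past the old log tail. A type-level sketch of that shape with invented structures; the caller is assumed to have sized the done item for old->count extents:

#include <stdlib.h>
#include <string.h>

struct ext { unsigned long long start; unsigned int len; };

struct intent_item { unsigned int count;    struct ext ext[]; };
struct done_item   { unsigned int next_ext; struct ext ext[]; };

static struct intent_item *relog(const struct intent_item *old,
                                 struct done_item *done)
{
        size_t bytes = old->count * sizeof(struct ext);
        struct intent_item *fresh = malloc(sizeof(*fresh) + bytes);

        /* The done item covers everything the old intent still tracked. */
        done->next_ext = old->count;
        memcpy(done->ext, old->ext, bytes);

        /* Re-emit an identical intent for the caller to log in the new
         * transaction; that is what pushes the log tail forward. */
        if (fresh) {
                fresh->count = old->count;
                memcpy(fresh->ext, old->ext, bytes);
        }
        return fresh;
}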
- */ - xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn); - xfs_cui_release(cuip); + + xlog_recover_intent_item(log, &cuip->cui_item, lsn, + &xfs_refcount_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e5b62dc28466..d5ca8bcae65b 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -618,7 +618,7 @@ xfs_reflink_cancel_cow_blocks( error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) break; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 0e0e747028da..79ad0087aeca 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -225,23 +225,6 @@ static const struct xfs_item_ops xfs_rud_item_ops = { .iop_intent = xfs_rud_item_intent, }; -static struct xfs_rud_log_item * -xfs_trans_get_rud( - struct xfs_trans *tp, - struct xfs_rui_log_item *ruip) -{ - struct xfs_rud_log_item *rudp; - - rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, - &xfs_rud_item_ops); - rudp->rud_ruip = ruip; - rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; - - xfs_trans_add_item(tp, &rudp->rud_item); - return rudp; -} - /* Set the map extent flags for this reverse mapping. */ static void xfs_trans_set_rmap_flags( @@ -285,35 +268,6 @@ xfs_trans_set_rmap_flags( } } -/* - * Finish an rmap update and log it to the RUD. Note that the transaction is - * marked dirty regardless of whether the rmap update succeeds or fails to - * support the RUI/RUD lifecycle rules. - */ -static int -xfs_trans_log_finish_rmap_update( - struct xfs_trans *tp, - struct xfs_rud_log_item *rudp, - struct xfs_rmap_intent *ri, - struct xfs_btree_cur **pcur) -{ - int error; - - error = xfs_rmap_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the RUI and frees the RUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - - return error; -} - /* Sort rmap intents by AG. */ static int xfs_rmap_update_diff_items( @@ -340,9 +294,6 @@ xfs_rmap_update_log_item( uint next_extent; struct xfs_map_extent *map; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from @@ -372,7 +323,6 @@ xfs_rmap_update_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &ruip->rui_item); if (sort) list_sort(mp, items, xfs_rmap_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -387,7 +337,16 @@ xfs_rmap_update_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item; + struct xfs_rui_log_item *ruip = RUI_ITEM(intent); + struct xfs_rud_log_item *rudp; + + rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, + &xfs_rud_item_ops); + rudp->rud_ruip = ruip; + rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; + + return &rudp->rud_item; } /* Take a passive ref to the AG containing the space we're rmapping. 
*/ @@ -423,8 +382,7 @@ xfs_rmap_update_finish_item( ri = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, - state); + error = xfs_rmap_finish_one(tp, ri, state); xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); @@ -452,16 +410,6 @@ xfs_rmap_update_cancel_item( kmem_cache_free(xfs_rmap_intent_cache, ri); } -const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .create_intent = xfs_rmap_update_create_intent, - .abort_intent = xfs_rmap_update_abort_intent, - .create_done = xfs_rmap_update_create_done, - .finish_item = xfs_rmap_update_finish_item, - .finish_cleanup = xfs_rmap_finish_one_cleanup, - .cancel_item = xfs_rmap_update_cancel_item, -}; - /* Is this recovered RUI ok? */ static inline bool xfs_rui_validate_map( @@ -498,20 +446,72 @@ xfs_rui_validate_map( return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } +static inline void +xfs_rui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + const struct xfs_map_extent *map) +{ + struct xfs_rmap_intent *ri; + + ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + + switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: + ri->ri_type = XFS_RMAP_MAP; + break; + case XFS_RMAP_EXTENT_MAP_SHARED: + ri->ri_type = XFS_RMAP_MAP_SHARED; + break; + case XFS_RMAP_EXTENT_UNMAP: + ri->ri_type = XFS_RMAP_UNMAP; + break; + case XFS_RMAP_EXTENT_UNMAP_SHARED: + ri->ri_type = XFS_RMAP_UNMAP_SHARED; + break; + case XFS_RMAP_EXTENT_CONVERT: + ri->ri_type = XFS_RMAP_CONVERT; + break; + case XFS_RMAP_EXTENT_CONVERT_SHARED: + ri->ri_type = XFS_RMAP_CONVERT_SHARED; + break; + case XFS_RMAP_EXTENT_ALLOC: + ri->ri_type = XFS_RMAP_ALLOC; + break; + case XFS_RMAP_EXTENT_FREE: + ri->ri_type = XFS_RMAP_FREE; + break; + default: + ASSERT(0); + return; + } + + ri->ri_owner = map->me_owner; + ri->ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + ri->ri_bmap.br_startblock = map->me_startblock; + ri->ri_bmap.br_startoff = map->me_startoff; + ri->ri_bmap.br_blockcount = map->me_len; + ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_rmap_update_get_group(mp, ri); + + xfs_defer_add_item(dfp, &ri->ri_list); +} + /* * Process an rmap update intent item that was recovered from the log. * We need to update the rmapbt. 
*/ STATIC int -xfs_rui_item_recover( - struct xfs_log_item *lip, +xfs_rmap_recover_work( + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); - struct xfs_rud_log_item *rudp; struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; int i; int error = 0; @@ -529,6 +529,8 @@ xfs_rui_item_recover( sizeof(ruip->rui_format)); return -EFSCORRUPTED; } + + xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -536,91 +538,29 @@ xfs_rui_item_recover( XFS_TRANS_RESERVE, &tp); if (error) return error; - rudp = xfs_trans_get_rud(tp, ruip); - for (i = 0; i < ruip->rui_format.rui_nextents; i++) { - struct xfs_rmap_intent fake = { }; - struct xfs_map_extent *map; - - map = &ruip->rui_format.rui_extents[i]; - switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { - case XFS_RMAP_EXTENT_MAP: - fake.ri_type = XFS_RMAP_MAP; - break; - case XFS_RMAP_EXTENT_MAP_SHARED: - fake.ri_type = XFS_RMAP_MAP_SHARED; - break; - case XFS_RMAP_EXTENT_UNMAP: - fake.ri_type = XFS_RMAP_UNMAP; - break; - case XFS_RMAP_EXTENT_UNMAP_SHARED: - fake.ri_type = XFS_RMAP_UNMAP_SHARED; - break; - case XFS_RMAP_EXTENT_CONVERT: - fake.ri_type = XFS_RMAP_CONVERT; - break; - case XFS_RMAP_EXTENT_CONVERT_SHARED: - fake.ri_type = XFS_RMAP_CONVERT_SHARED; - break; - case XFS_RMAP_EXTENT_ALLOC: - fake.ri_type = XFS_RMAP_ALLOC; - break; - case XFS_RMAP_EXTENT_FREE: - fake.ri_type = XFS_RMAP_FREE; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &ruip->rui_format, - sizeof(ruip->rui_format)); - error = -EFSCORRUPTED; - goto abort_error; - } - - fake.ri_owner = map->me_owner; - fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - fake.ri_bmap.br_startblock = map->me_startblock; - fake.ri_bmap.br_startoff = map->me_startoff; - fake.ri_bmap.br_blockcount = map->me_len; - fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - - xfs_rmap_update_get_group(mp, &fake); - error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, - &rcur); - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - map, sizeof(*map)); - xfs_rmap_update_put_group(&fake); - if (error) - goto abort_error; - - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &ruip->rui_format, + sizeof(ruip->rui_format)); + if (error) + goto abort_error; - xfs_rmap_finish_one_cleanup(tp, rcur, error); return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: - xfs_rmap_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } -STATIC bool -xfs_rui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return RUI_ITEM(lip)->rui_format.rui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_rui_item_relog( +xfs_rmap_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { - struct xfs_rud_log_item *rudp; struct xfs_rui_log_item *ruip; struct xfs_map_extent *map; unsigned int count; @@ -628,27 +568,41 @@ xfs_rui_item_relog( count = RUI_ITEM(intent)->rui_format.rui_nextents; map = RUI_ITEM(intent)->rui_format.rui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - ruip = xfs_rui_init(tp->t_mountp, count); memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); - xfs_trans_add_item(tp, &ruip->rui_item); - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + return &ruip->rui_item; } +const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .name = "rmap", + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rmap_finish_one_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, + .recover_work = xfs_rmap_recover_work, + .relog_intent = xfs_rmap_relog_intent, +}; + +STATIC bool +xfs_rui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return RUI_ITEM(lip)->rui_format.rui_id == intent_id; +} + static const struct xfs_item_ops xfs_rui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, .iop_unpin = xfs_rui_item_unpin, .iop_release = xfs_rui_item_release, - .iop_recover = xfs_rui_item_recover, .iop_match = xfs_rui_item_match, - .iop_relog = xfs_rui_item_relog, }; static inline void @@ -702,12 +656,9 @@ xlog_recover_rui_commit_pass2( ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. - */ - xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn); - xfs_rui_release(ruip); + + xlog_recover_intent_item(log, &ruip->rui_item, lsn, + &xfs_rmap_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 88c48de5c9c8..8649d981a097 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -14,28 +14,14 @@ #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_icache.h" #include "xfs_rtalloc.h" #include "xfs_sb.h" #include "xfs_rtbitmap.h" - -/* - * Read and return the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - */ -static int -xfs_rtget_summary( - struct xfs_rtalloc_args *args, - int log, /* log2 of extent size */ - xfs_fileoff_t bbno, /* bitmap block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ -{ - return xfs_rtmodify_summary_int(args, log, bbno, 0, sum); -} +#include "xfs_quota.h" /* * Return whether there are any free extents in the size range given @@ -154,56 +140,55 @@ xfs_rtallocate_range( * properly update the summary. 
*/ error = xfs_rtfind_back(args, start, 0, &preblock); - if (error) { + if (error) return error; - } + /* * Find the next allocated block (end of free extent). */ error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, &postblock); - if (error) { + if (error) return error; - } + /* * Decrement the summary information corresponding to the entire * (old) free extent. */ error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock + 1 - preblock), + xfs_highbit64(postblock + 1 - preblock), xfs_rtx_to_rbmblock(mp, preblock), -1); - if (error) { + if (error) return error; - } + /* * If there are blocks not being allocated at the front of the * old extent, add summary data for them to be free. */ if (preblock < start) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(start - preblock), + xfs_highbit64(start - preblock), xfs_rtx_to_rbmblock(mp, preblock), 1); - if (error) { + if (error) return error; - } } + /* * If there are blocks not being allocated at the end of the * old extent, add summary data for them to be free. */ if (postblock > end) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock - end), + xfs_highbit64(postblock - end), xfs_rtx_to_rbmblock(mp, end + 1), 1); - if (error) { + if (error) return error; - } } + /* * Modify the bitmap to mark this extent allocated. */ - error = xfs_rtmodify_range(args, start, len, 0); - return error; + return xfs_rtmodify_range(args, start, len, 0); } /* @@ -265,21 +250,17 @@ xfs_rtallocate_extent_block( * If it's not so then next will contain the first non-free. */ error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat); - if (error) { + if (error) return error; - } if (stat) { /* * i for maxlen is all free, allocate and return that. */ - error = xfs_rtallocate_range(args, i, maxlen); - if (error) { - return error; - } - *len = maxlen; - *rtx = i; - return 0; + bestlen = maxlen; + besti = i; + goto allocate; } + /* * In the case where we have a variable-sized allocation * request, figure out how big this free piece is, @@ -298,45 +279,44 @@ xfs_rtallocate_extent_block( /* * If not done yet, find the start of the next free space. */ - if (next < end) { - error = xfs_rtfind_forw(args, next, end, &i); - if (error) { - return error; - } - } else + if (next >= end) break; + error = xfs_rtfind_forw(args, next, end, &i); + if (error) + return error; } + /* * Searched the whole thing & didn't find a maxlen free extent. */ - if (minlen < maxlen && besti != -1) { - xfs_rtxlen_t p; /* amount to trim length by */ - + if (minlen > maxlen || besti == -1) { /* - * If size should be a multiple of prod, make that so. + * Allocation failed. Set *nextp to the next block to try. */ - if (prod > 1) { - div_u64_rem(bestlen, prod, &p); - if (p) - bestlen -= p; - } + *nextp = next; + return -ENOSPC; + } - /* - * Allocate besti for bestlen & return that. - */ - error = xfs_rtallocate_range(args, besti, bestlen); - if (error) { - return error; - } - *len = bestlen; - *rtx = besti; - return 0; + /* + * If size should be a multiple of prod, make that so. + */ + if (prod > 1) { + xfs_rtxlen_t p; /* amount to trim length by */ + + div_u64_rem(bestlen, prod, &p); + if (p) + bestlen -= p; } + /* - * Allocation failed. Set *nextp to the next block to try. + * Allocate besti for bestlen & return that. 
*/ - *nextp = next; - *rtx = NULLRTEXTNO; +allocate: + error = xfs_rtallocate_range(args, besti, bestlen); + if (error) + return error; + *len = bestlen; + *rtx = besti; return 0; } @@ -367,52 +347,33 @@ xfs_rtallocate_extent_exact( * Check if the range in question (for maxlen) is free. */ error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree); - if (error) { + if (error) return error; - } - if (isfree) { + + if (!isfree) { /* - * If it is, allocate it and return success. + * If not, allocate what there is, if it's at least minlen. */ - error = xfs_rtallocate_range(args, start, maxlen); - if (error) { - return error; - } - *len = maxlen; - *rtx = start; - return 0; - } - /* - * If not, allocate what there is, if it's at least minlen. - */ - maxlen = next - start; - if (maxlen < minlen) { + maxlen = next - start; + if (maxlen < minlen) + return -ENOSPC; + /* - * Failed, return failure status. + * Trim off tail of extent, if prod is specified. */ - *rtx = NULLRTEXTNO; - return 0; - } - /* - * Trim off tail of extent, if prod is specified. - */ - if (prod > 1 && (i = maxlen % prod)) { - maxlen -= i; - if (maxlen < minlen) { - /* - * Now we can't do it, return failure status. - */ - *rtx = NULLRTEXTNO; - return 0; + if (prod > 1 && (i = maxlen % prod)) { + maxlen -= i; + if (maxlen < minlen) + return -ENOSPC; } } + /* * Allocate what we can and return it. */ error = xfs_rtallocate_range(args, start, maxlen); - if (error) { + if (error) return error; - } *len = maxlen; *rtx = start; return 0; @@ -441,7 +402,6 @@ xfs_rtallocate_extent_near( int j; /* secondary loop control */ int log2len; /* log2 of minlen */ xfs_rtxnum_t n; /* next rtext to try */ - xfs_rtxnum_t r; /* result rtext */ ASSERT(minlen % prod == 0); ASSERT(maxlen % prod == 0); @@ -455,26 +415,18 @@ xfs_rtallocate_extent_near( /* Make sure we don't run off the end of the rt volume. */ maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod); - if (maxlen < minlen) { - *rtx = NULLRTEXTNO; - return 0; - } + if (maxlen < minlen) + return -ENOSPC; /* * Try the exact allocation first. */ error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len, - prod, &r); - if (error) { + prod, rtx); + if (error != -ENOSPC) return error; - } - /* - * If the exact allocation worked, return that. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } + + bbno = xfs_rtx_to_rbmblock(mp, start); i = 0; j = -1; @@ -490,9 +442,9 @@ xfs_rtallocate_extent_near( */ error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1, bbno + i, &maxlog); - if (error) { + if (error) return error; - } + /* * If there are any useful extents starting here, try * allocating one. @@ -511,17 +463,9 @@ xfs_rtallocate_extent_near( */ error = xfs_rtallocate_extent_block(args, bbno + i, minlen, maxavail, len, - &n, prod, &r); - if (error) { + &n, prod, rtx); + if (error != -ENOSPC) return error; - } - /* - * If it worked, return it. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } } /* * On the negative side of the starting location. @@ -555,17 +499,9 @@ xfs_rtallocate_extent_near( error = xfs_rtallocate_extent_block(args, bbno + j, minlen, maxavail, len, &n, prod, - &r); - if (error) { + rtx); + if (error != -ENOSPC) return error; - } - /* - * If it works, return the extent. 
- */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } } } } @@ -599,8 +535,53 @@ xfs_rtallocate_extent_near( else break; } - *rtx = NULLRTEXTNO; - return 0; + return -ENOSPC; +} + +static int +xfs_rtalloc_sumlevel( + struct xfs_rtalloc_args *args, + int l, /* level number */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ +{ + xfs_fileoff_t i; /* bitmap block number */ + + for (i = 0; i < args->mp->m_sb.sb_rbmblocks; i++) { + xfs_suminfo_t sum; /* summary information for extents */ + xfs_rtxnum_t n; /* next rtext to be tried */ + int error; + + error = xfs_rtget_summary(args, l, i, &sum); + if (error) + return error; + + /* + * Nothing there, on to the next block. + */ + if (!sum) + continue; + + /* + * Try allocating the extent. + */ + error = xfs_rtallocate_extent_block(args, i, minlen, maxlen, + len, &n, prod, rtx); + if (error != -ENOSPC) + return error; + + /* + * If the "next block to try" returned from the allocator is + * beyond the next bitmap block, skip to that bitmap block. + */ + if (xfs_rtx_to_rbmblock(args->mp, n) > i + 1) + i = xfs_rtx_to_rbmblock(args->mp, n) - 1; + } + + return -ENOSPC; } /* @@ -617,13 +598,8 @@ xfs_rtallocate_extent_size( xfs_rtxlen_t prod, /* extent product factor */ xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - struct xfs_mount *mp = args->mp; int error; - xfs_fileoff_t i; /* bitmap block number */ int l; /* level number (loop control) */ - xfs_rtxnum_t n; /* next rtext to be tried */ - xfs_rtxnum_t r; /* result rtext number */ - xfs_suminfo_t sum; /* summary information for extents */ ASSERT(minlen % prod == 0); ASSERT(maxlen % prod == 0); @@ -631,119 +607,46 @@ xfs_rtallocate_extent_size( /* * Loop over all the levels starting with maxlen. - * At each level, look at all the bitmap blocks, to see if there - * are extents starting there that are long enough (>= maxlen). - * Note, only on the initial level can the allocation fail if - * the summary says there's an extent. + * + * At each level, look at all the bitmap blocks, to see if there are + * extents starting there that are long enough (>= maxlen). + * + * Note, only on the initial level can the allocation fail if the + * summary says there's an extent. */ - for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) { - /* - * Loop over all the bitmap blocks. - */ - for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - /* - * Get the summary for this level/block. - */ - error = xfs_rtget_summary(args, l, i, &sum); - if (error) { - return error; - } - /* - * Nothing there, on to the next block. - */ - if (!sum) - continue; - /* - * Try allocating the extent. - */ - error = xfs_rtallocate_extent_block(args, i, maxlen, - maxlen, len, &n, prod, &r); - if (error) { - return error; - } - /* - * If it worked, return that. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } - /* - * If the "next block to try" returned from the - * allocator is beyond the next bitmap block, - * skip to that bitmap block. - */ - if (xfs_rtx_to_rbmblock(mp, n) > i + 1) - i = xfs_rtx_to_rbmblock(mp, n) - 1; - } + for (l = xfs_highbit32(maxlen); l < args->mp->m_rsumlevels; l++) { + error = xfs_rtalloc_sumlevel(args, l, minlen, maxlen, prod, len, + rtx); + if (error != -ENOSPC) + return error; } + /* - * Didn't find any maxlen blocks. 
Try smaller ones, unless - * we're asking for a fixed size extent. + * Didn't find any maxlen blocks. Try smaller ones, unless we are + * looking for a fixed size extent. */ - if (minlen > --maxlen) { - *rtx = NULLRTEXTNO; - return 0; - } + if (minlen > --maxlen) + return -ENOSPC; ASSERT(minlen != 0); ASSERT(maxlen != 0); /* * Loop over sizes, from maxlen down to minlen. - * This time, when we do the allocations, allow smaller ones - * to succeed. + * + * This time, when we do the allocations, allow smaller ones to succeed, + * but make sure the specified minlen/maxlen are in the possible range + * for this summary level. */ for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) { - /* - * Loop over all the bitmap blocks, try an allocation - * starting in that block. - */ - for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - /* - * Get the summary information for this level/block. - */ - error = xfs_rtget_summary(args, l, i, &sum); - if (error) { - return error; - } - /* - * If nothing there, go on to next. - */ - if (!sum) - continue; - /* - * Try the allocation. Make sure the specified - * minlen/maxlen are in the possible range for - * this summary level. - */ - error = xfs_rtallocate_extent_block(args, i, - XFS_RTMAX(minlen, 1 << l), - XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), - len, &n, prod, &r); - if (error) { - return error; - } - /* - * If it worked, return that extent. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } - /* - * If the "next block to try" returned from the - * allocator is beyond the next bitmap block, - * skip to that bitmap block. - */ - if (xfs_rtx_to_rbmblock(mp, n) > i + 1) - i = xfs_rtx_to_rbmblock(mp, n) - 1; - } + error = xfs_rtalloc_sumlevel(args, l, + max_t(xfs_rtxlen_t, minlen, 1 << l), + min_t(xfs_rtxlen_t, maxlen, (1 << (l + 1)) - 1), + prod, len, rtx); + if (error != -ENOSPC) + return error; } - /* - * Got nothing, return failure. - */ - *rtx = NULLRTEXTNO; - return 0; + + return -ENOSPC; } /* @@ -963,8 +866,10 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); + if (!xfs_validate_rtextents(nrextents)) + return -EINVAL; nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); - nrextslog = xfs_highbit32(nrextents); + nrextslog = xfs_compute_rextslog(nrextents); nrsumlevels = nrextslog + 1; nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks); nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); @@ -1031,11 +936,14 @@ xfs_growfs_rt( nsbp->sb_rblocks = min(nrblocks, nrblocks_step); nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks); ASSERT(nsbp->sb_rextents != 0); - nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); + nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents); nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nsbp->sb_rbmblocks); nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(nmp, &nmp->m_resv); + /* * Start a transaction, get the log reservation. */ @@ -1122,6 +1030,8 @@ error_cancel: */ mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(mp, &mp->m_resv); error = xfs_trans_commit(tp); if (error) @@ -1160,81 +1070,6 @@ out_free: } /* - * Allocate an extent in the realtime subvolume, with the usual allocation - * parameters. The length units are all in realtime extents, as is the - * result block number. 
- */ -int -xfs_rtallocate_extent( - struct xfs_trans *tp, - xfs_rtxnum_t start, /* starting rtext number to allocate */ - xfs_rtxlen_t minlen, /* minimum length to allocate */ - xfs_rtxlen_t maxlen, /* maximum length to allocate */ - xfs_rtxlen_t *len, /* out: actual length allocated */ - int wasdel, /* was a delayed allocation extent */ - xfs_rtxlen_t prod, /* extent product factor */ - xfs_rtxnum_t *rtblock) /* out: start rtext allocated */ -{ - struct xfs_rtalloc_args args = { - .mp = tp->t_mountp, - .tp = tp, - }; - int error; /* error value */ - xfs_rtxnum_t r; /* result allocated rtext */ - - ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL)); - ASSERT(minlen > 0 && minlen <= maxlen); - - /* - * If prod is set then figure out what to do to minlen and maxlen. - */ - if (prod > 1) { - xfs_rtxlen_t i; - - if ((i = maxlen % prod)) - maxlen -= i; - if ((i = minlen % prod)) - minlen += prod - i; - if (maxlen < minlen) { - *rtblock = NULLRTEXTNO; - return 0; - } - } - -retry: - if (start == 0) { - error = xfs_rtallocate_extent_size(&args, minlen, - maxlen, len, prod, &r); - } else { - error = xfs_rtallocate_extent_near(&args, start, minlen, - maxlen, len, prod, &r); - } - - xfs_rtbuf_cache_relse(&args); - if (error) - return error; - - /* - * If it worked, update the superblock. - */ - if (r != NULLRTEXTNO) { - long slen = (long)*len; - - ASSERT(*len >= minlen && *len <= maxlen); - if (wasdel) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); - else - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); - } else if (prod > 1) { - prod = 1; - goto retry; - } - - *rtblock = r; - return 0; -} - -/* * Initialize realtime fields in the mount structure. */ int /* error */ @@ -1412,7 +1247,7 @@ xfs_rtunmount_inodes( * of rtextents and the fraction. * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... */ -int /* error */ +static int xfs_rtpick_extent( xfs_mount_t *mp, /* file system mount point */ xfs_trans_t *tp, /* transaction pointer */ @@ -1451,3 +1286,177 @@ xfs_rtpick_extent( *pick = b; return 0; } + +static void +xfs_rtalloc_align_minmax( + xfs_rtxlen_t *raminlen, + xfs_rtxlen_t *ramaxlen, + xfs_rtxlen_t *prod) +{ + xfs_rtxlen_t newmaxlen = *ramaxlen; + xfs_rtxlen_t newminlen = *raminlen; + xfs_rtxlen_t slack; + + slack = newmaxlen % *prod; + if (slack) + newmaxlen -= slack; + slack = newminlen % *prod; + if (slack) + newminlen += *prod - slack; + + /* + * If adjusting for extent size hint alignment produces an invalid + * min/max len combination, go ahead without it. 
+ */ + if (newmaxlen < newminlen) { + *prod = 1; + return; + } + *ramaxlen = newmaxlen; + *raminlen = newminlen; +} + +int +xfs_bmap_rtalloc( + struct xfs_bmalloca *ap) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_fileoff_t orig_offset = ap->offset; + xfs_rtxnum_t start; /* allocation hint rtextent no */ + xfs_rtxnum_t rtx; /* actually allocated rtextent no */ + xfs_rtxlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t mod = 0; /* product factor for allocators */ + xfs_rtxlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_extlen_t orig_length = ap->length; + xfs_extlen_t minlen = mp->m_sb.sb_rextsize; + xfs_rtxlen_t raminlen; + bool rtlocked = false; + bool ignore_locality = false; + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = ap->tp, + }; + int error; + + align = xfs_get_extsz_hint(ap->ip); +retry: + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, + align, 1, ap->eof, 0, + ap->conv, &ap->offset, &ap->length); + if (error) + return error; + ASSERT(ap->length); + ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0); + + /* + * If we shifted the file offset downward to satisfy an extent size + * hint, increase minlen by that amount so that the allocator won't + * give us an allocation that's too short to cover at least one of the + * blocks that the caller asked for. + */ + if (ap->offset != orig_offset) + minlen += orig_offset - ap->offset; + + /* + * Set ralen to be the actual requested length in rtextents. + * + * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that + * we rounded up to it, cut it back so it's valid again. + * Note that if it's a really large request (bigger than + * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. + */ + ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); + raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); + ASSERT(raminlen > 0); + ASSERT(raminlen <= ralen); + + /* + * Lock out modifications to both the RT bitmap and summary inodes + */ + if (!rtlocked) { + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); + xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); + rtlocked = true; + } + + if (ignore_locality) { + start = 0; + } else if (xfs_bmap_adjacent(ap)) { + start = xfs_rtb_to_rtx(mp, ap->blkno); + } else if (ap->eof && ap->offset == 0) { + /* + * If it's an allocation to an empty file at offset 0, pick an + * extent that will space things out in the rt area. + */ + error = xfs_rtpick_extent(mp, ap->tp, ralen, &start); + if (error) + return error; + } else { + start = 0; + } + + /* + * Only bother calculating a real prod factor if offset & length are + * perfectly aligned, otherwise it will just get us in trouble. + */ + div_u64_rem(ap->offset, align, &mod); + if (mod || ap->length % align) { + prod = 1; + } else { + prod = xfs_extlen_to_rtxlen(mp, align); + if (prod > 1) + xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod); + } + + if (start) { + error = xfs_rtallocate_extent_near(&args, start, raminlen, + ralen, &ralen, prod, &rtx); + } else { + error = xfs_rtallocate_extent_size(&args, raminlen, + ralen, &ralen, prod, &rtx); + } + xfs_rtbuf_cache_relse(&args); + + if (error == -ENOSPC) { + if (align > mp->m_sb.sb_rextsize) { + /* + * We previously enlarged the request length to try to + * satisfy an extent size hint. 
The allocator didn't + * return anything, so reset the parameters to the + * original values and try again without alignment + * criteria. + */ + ap->offset = orig_offset; + ap->length = orig_length; + minlen = align = mp->m_sb.sb_rextsize; + goto retry; + } + + if (!ignore_locality && start != 0) { + /* + * If we can't allocate near a specific rt extent, try + * again without locality criteria. + */ + ignore_locality = true; + goto retry; + } + + ap->blkno = NULLFSBLOCK; + ap->length = 0; + return 0; + } + if (error) + return error; + + xfs_trans_mod_sb(ap->tp, ap->wasdel ? + XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS, + -(long)ralen); + ap->blkno = xfs_rtx_to_rtb(mp, rtx); + ap->length = xfs_rtxlen_to_extlen(mp, ralen); + xfs_bmap_alloc_account(ap); + return 0; +} diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index f7cb9ffe51ca..a6836da9bebe 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -13,27 +13,6 @@ struct xfs_trans; #ifdef CONFIG_XFS_RT /* - * Function prototypes for exported functions. - */ - -/* - * Allocate an extent in the realtime subvolume, with the usual allocation - * parameters. The length units are all in realtime extents, as is the - * result block number. - */ -int /* error */ -xfs_rtallocate_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtxnum_t start, /* starting rtext number to allocate */ - xfs_rtxlen_t minlen, /* minimum length to allocate */ - xfs_rtxlen_t maxlen, /* maximum length to allocate */ - xfs_rtxlen_t *len, /* out: actual length allocated */ - int wasdel, /* was a delayed allocation extent */ - xfs_rtxlen_t prod, /* extent product factor */ - xfs_rtxnum_t *rtblock); /* out: start rtext allocated */ - - -/* * Initialize realtime fields in the mount structure. */ int /* error */ @@ -52,20 +31,6 @@ xfs_rtmount_inodes( struct xfs_mount *mp); /* file system mount structure */ /* - * Pick an extent for allocation at the start of a new realtime file. - * Use the sequence number stored in the atime field of the bitmap inode. - * Translate this to a fraction of the rtextents, and return the product - * of rtextents and the fraction. - * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... - */ -int /* error */ -xfs_rtpick_extent( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtxlen_t len, /* allocation length (rtextents) */ - xfs_rtxnum_t *pick); /* result rt extent */ - -/* * Grow the realtime area of the filesystem. 
*/ int @@ -75,8 +40,6 @@ xfs_growfs_rt( int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else -# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS) -# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS) # define xfs_growfs_rt(mp,in) (-ENOSYS) # define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 764304595e8b..aff20ddd4a9f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -366,8 +366,9 @@ xfs_blkdev_get( { int error = 0; - *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, - mp->m_super, &fs_holder_ops); + *handlep = bdev_open_by_path(name, + BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, + mp->m_super, &fs_holder_ops); if (IS_ERR(*handlep)) { error = PTR_ERR(*handlep); *handlep = NULL; @@ -439,18 +440,12 @@ xfs_open_devices( int error; /* - * blkdev_put() can't be called under s_umount, see the comment - * in get_tree_bdev() for more details - */ - up_write(&sb->s_umount); - - /* * Open real time and log devices - order is important. */ if (mp->m_logname) { error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle); if (error) - goto out_relock; + return error; } if (mp->m_rtname) { @@ -493,10 +488,7 @@ xfs_open_devices( bdev_release(logdev_handle); } - error = 0; -out_relock: - down_write(&sb->s_umount); - return error; + return 0; out_free_rtdev_targ: if (mp->m_rtdev_targp) @@ -509,7 +501,7 @@ out_relock: out_close_logdev: if (logdev_handle) bdev_release(logdev_handle); - goto out_relock; + return error; } /* @@ -759,10 +751,6 @@ static void xfs_mount_free( struct xfs_mount *mp) { - /* - * Free the buftargs here because blkdev_put needs to be called outside - * of sb->s_umount, which is held around the call to ->put_super. - */ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_logdev_targp); if (mp->m_rtdev_targp) @@ -906,10 +894,8 @@ xfs_fs_statfs( STATIC void xfs_save_resvblks(struct xfs_mount *mp) { - uint64_t resblks = 0; - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, &resblks, NULL); + xfs_reserve_blocks(mp, 0); } STATIC void @@ -923,7 +909,7 @@ xfs_restore_resvblks(struct xfs_mount *mp) } else resblks = xfs_default_resblks(mp); - xfs_reserve_blocks(mp, &resblks, NULL); + xfs_reserve_blocks(mp, resblks); } /* diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 85e433df6a3f..92974a4414c8 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -23,6 +23,7 @@ #include "xfs_trans.h" #include "xfs_ialloc.h" #include "xfs_error.h" +#include "xfs_health.h" /* ----- Kernel only functions below ----- */ int @@ -108,6 +109,8 @@ xfs_readlink( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(ip, XFS_DATA_FORK)) + return -EIO; xfs_ilock(ip, XFS_ILOCK_SHARED); @@ -128,10 +131,10 @@ xfs_readlink( * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED * if if_data is junk. 
*/ - if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data)) + if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data)) goto out; - memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1); + memcpy(link, ip->i_df.if_data, pathlen + 1); error = 0; } else { error = xfs_readlink_bmap_ilocked(ip, link); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index fade33735393..a191f6560f98 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -206,8 +206,6 @@ static struct ctl_table xfs_table[] = { .extra2 = &xfs_params.stats_clear.max }, #endif /* CONFIG_PROC_FS */ - - {} }; int diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index f78ad6b10ea5..276696a07040 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -85,6 +85,8 @@ struct xfs_globals { int pwork_threads; /* parallel workqueue threads */ bool larp; /* log attribute replay */ #endif + int bload_leaf_slack; /* btree bulk load leaf slack */ + int bload_node_slack; /* btree bulk load node slack */ int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index a3c6b1548723..17485666b672 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -229,6 +229,15 @@ pwork_threads_show( } XFS_SYSFS_ATTR_RW(pwork_threads); +/* + * The "LARP" (Logged extended Attribute Recovery Persistence) debugging knob + * sets the XFS_DA_OP_LOGGED flag on all xfs_attr_set operations performed on + * V5 filesystems. As a result, the intermediate progress of all setxattr and + * removexattr operations are tracked via the log and can be restarted during + * recovery. This is useful for testing xattr recovery prior to merging of the + * parent pointer feature which requires it to maintain consistency, and may be + * enabled for userspace xattrs in the future. 
+ */ static ssize_t larp_store( struct kobject *kobject, @@ -253,6 +262,58 @@ larp_show( XFS_SYSFS_ATTR_RW(larp); #endif /* DEBUG */ +STATIC ssize_t +bload_leaf_slack_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + xfs_globals.bload_leaf_slack = val; + return count; +} + +STATIC ssize_t +bload_leaf_slack_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack); +} +XFS_SYSFS_ATTR_RW(bload_leaf_slack); + +STATIC ssize_t +bload_node_slack_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + xfs_globals.bload_node_slack = val; + return count; +} + +STATIC ssize_t +bload_node_slack_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack); +} +XFS_SYSFS_ATTR_RW(bload_node_slack); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), @@ -262,6 +323,8 @@ static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(pwork_threads), ATTR_LIST(larp), #endif + ATTR_LIST(bload_leaf_slack), + ATTR_LIST(bload_node_slack), NULL, }; ATTRIBUTE_GROUPS(xfs_dbg); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3926cf7f2a6e..0984a1c884c7 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -67,6 +67,7 @@ struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; struct xfs_btree_cur; +struct xfs_defer_op_type; struct xfs_refcount_irec; struct xfs_fsmap; struct xfs_rmap_irec; @@ -145,21 +146,23 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); TRACE_EVENT(xlog_intent_recovery_failed, - TP_PROTO(struct xfs_mount *mp, int error, void *function), - TP_ARGS(mp, error, function), + TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops, + int error), + TP_ARGS(mp, ops, error), TP_STRUCT__entry( __field(dev_t, dev) + __string(name, ops->name) __field(int, error) - __field(void *, function) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; + __assign_str(name, ops->name); __entry->error = error; - __entry->function = function; ), - TP_printk("dev %d:%d error %d function %pS", + TP_printk("dev %d:%d optype %s error %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->error, __entry->function) + __get_str(name), + __entry->error) ); DECLARE_EVENT_CLASS(xfs_perag_class, @@ -2549,22 +2552,25 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, TP_ARGS(mp, dfp), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, type) + __string(name, dfp->dfp_ops->name) __field(void *, intent) + __field(unsigned int, flags) __field(char, committed) __field(int, nr) ), TP_fast_assign( __entry->dev = mp ? 
mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type; + __assign_str(name, dfp->dfp_ops->name); __entry->intent = dfp->dfp_intent; + __entry->flags = dfp->dfp_flags; __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p committed %d nr %d", + TP_printk("dev %d:%d optype %s intent %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __get_str(name), __entry->intent, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) @@ -2675,6 +2681,9 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); @@ -2688,25 +2697,28 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, TP_ARGS(mp, dfp, item), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, type) + __string(name, dfp->dfp_ops->name) __field(void *, intent) __field(void *, item) __field(char, committed) + __field(unsigned int, flags) __field(int, nr) ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type; + __assign_str(name, dfp->dfp_ops->name); __entry->intent = dfp->dfp_intent; __entry->item = item; __entry->committed = dfp->dfp_done != NULL; + __entry->flags = dfp->dfp_flags; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d", + TP_printk("dev %d:%d optype %s intent %p item %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __get_str(name), __entry->intent, __entry->item, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) @@ -4399,8 +4411,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); -DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace); -DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove); TRACE_EVENT(xfs_force_shutdown, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 305c9d07bf1b..12d45e93f07d 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1237,6 +1237,68 @@ out_cancel: } /* + * Try to reserve more blocks for a transaction. + * + * This is for callers that need to attach resources to a transaction, scan + * those resources to determine the space reservation requirements, and then + * modify the attached resources. In other words, online repair. This can + * fail due to ENOSPC, so the caller must be able to cancel the transaction + * without shutting down the fs. + */ +int +xfs_trans_reserve_more( + struct xfs_trans *tp, + unsigned int blocks, + unsigned int rtextents) +{ + struct xfs_trans_res resv = { }; + + return xfs_trans_reserve(tp, &resv, blocks, rtextents); +} + +/* + * Try to reserve more blocks and file quota for a transaction. Same + * conditions of usage as xfs_trans_reserve_more. 
+ */ +int +xfs_trans_reserve_more_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + unsigned int dblocks, + unsigned int rblocks, + bool force_quota) +{ + struct xfs_trans_res resv = { }; + struct xfs_mount *mp = ip->i_mount; + unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks); + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + error = xfs_trans_reserve(tp, &resv, dblocks, rtx); + if (error) + return error; + + if (!XFS_IS_QUOTA_ON(mp) || xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + return 0; + + if (tp->t_flags & XFS_TRANS_RESERVE) + force_quota = true; + + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, + force_quota); + if (!error) + return 0; + + /* Quota failed, give back the new reservation. */ + xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE); + tp->t_blk_res -= dblocks; + xfs_mod_frextents(mp, rtx); + tp->t_rtx_res -= rtx; + return error; +} + +/* * Allocate an transaction in preparation for inode creation by reserving quota * against the given dquots. Callers are not required to hold any inode locks. */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 6e3646d524ce..08ce757c7454 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -78,11 +78,7 @@ struct xfs_item_ops { xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_release)(struct xfs_log_item *); - int (*iop_recover)(struct xfs_log_item *lip, - struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); - struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, - struct xfs_trans *tp); struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done); }; @@ -168,6 +164,8 @@ typedef struct xfs_trans { int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, uint blocks, uint rtextents, uint flags, struct xfs_trans **tpp); +int xfs_trans_reserve_more(struct xfs_trans *tp, + unsigned int blocks, unsigned int rtextents); int xfs_trans_alloc_empty(struct xfs_mount *mp, struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -247,19 +245,13 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern struct kmem_cache *xfs_trans_cache; -static inline struct xfs_log_item * -xfs_trans_item_relog( - struct xfs_log_item *lip, - struct xfs_trans *tp) -{ - return lip->li_ops->iop_relog(lip, tp); -} - struct xfs_dquot; int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, unsigned int dblocks, unsigned int rblocks, bool force, struct xfs_trans **tpp); +int xfs_trans_reserve_more_inode(struct xfs_trans *tp, struct xfs_inode *ip, + unsigned int dblocks, unsigned int rblocks, bool force_quota); int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int dblocks, diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 987843f84d03..364104e1b38a 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -136,6 +136,9 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, }; int error; + if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) + return -EIO; + error = xfs_attr_get(&args); if (error) return error; @@ -294,6 +297,9 @@ xfs_vn_listxattr( struct inode *inode = d_inode(dentry); int error; + if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) + return -EIO; + /* * First read the regular on-disk attributes. 
*/ diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index b2c9b35df8f7..6ab2318a9c8e 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -180,7 +180,7 @@ const struct address_space_operations zonefs_file_aops = { .invalidate_folio = iomap_invalidate_folio, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = zonefs_swap_activate, }; diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index e6a75401677d..93971742613a 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -747,8 +747,6 @@ static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry, inode = zonefs_get_dir_inode(dir, dentry); else inode = zonefs_get_file_inode(dir, dentry); - if (IS_ERR(inode)) - return ERR_CAST(inode); return d_splice_alias(inode, dentry); } |
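Editorial note on the final zonefs hunk above: dropping the explicit IS_ERR() check before d_splice_alias() is safe because d_splice_alias() itself returns ERR_CAST(inode) when it is handed an ERR_PTR, so the caller no longer needs its own error branch. A minimal sketch of the resulting pattern follows; foo_lookup() and foo_iget() are hypothetical names used only for illustration, not functions from this series.

/*
 * Illustrative only: foo_iget() stands in for any iget-style helper that
 * may return either a valid inode or an ERR_PTR. d_splice_alias() checks
 * IS_ERR(inode) internally and propagates the error, so the lookup method
 * can simply pass the result straight through.
 */
static struct dentry *foo_lookup(struct inode *dir, struct dentry *dentry,
				 unsigned int flags)
{
	struct inode *inode;

	inode = foo_iget(dir->i_sb, dentry);	/* may be an ERR_PTR */

	/* No IS_ERR()/ERR_CAST() needed here. */
	return d_splice_alias(inode, dentry);
}

Centralizing the ERR_PTR handling in d_splice_alias() (and likewise in d_obtain_alias()) is what lets lookup and export helpers shed these repeated checks, which is the pattern the zonefs_lookup() change applies.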